imw 0.2.18 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (172) hide show
  1. data/Gemfile +7 -26
  2. data/Gemfile.lock +13 -38
  3. data/{LICENSE → LICENSE.txt} +1 -1
  4. data/README.textile +35 -0
  5. data/Rakefile +45 -22
  6. data/VERSION +1 -1
  7. data/examples/foo.rb +19 -0
  8. data/examples/html_selector.rb +22 -0
  9. data/examples/nes_game_list.csv +625 -0
  10. data/examples/nes_gamespot.csv +1371 -0
  11. data/examples/nes_nintendo.csv +624 -0
  12. data/examples/nes_unlicensed.csv +89 -0
  13. data/examples/nes_wikipedia.csv +710 -0
  14. data/examples/nibbler_test.rb +24 -0
  15. data/examples/script.rb +19 -0
  16. data/lib/imw.rb +28 -140
  17. data/lib/imw/error.rb +9 -0
  18. data/lib/imw/recordizer.rb +8 -0
  19. data/lib/imw/recordizer/html_selector_recordizer.rb +86 -0
  20. data/lib/imw/recordizer/string_slice_recordizer.rb +39 -0
  21. data/lib/imw/resource.rb +3 -119
  22. data/lib/imw/serializer.rb +7 -0
  23. data/lib/imw/serializer/json_serializer.rb +17 -0
  24. data/lib/imw/uri.rb +41 -0
  25. data/spec/resource_spec.rb +78 -0
  26. data/spec/uri_spec.rb +55 -0
  27. metadata +81 -232
  28. data/README.rdoc +0 -371
  29. data/bin/imw +0 -5
  30. data/bin/tsv_to_json.rb +0 -29
  31. data/etc/imwrc.rb +0 -26
  32. data/examples/dataset.rb +0 -12
  33. data/examples/metadata.yml +0 -10
  34. data/lib/imw/archives.rb +0 -120
  35. data/lib/imw/archives/rar.rb +0 -19
  36. data/lib/imw/archives/tar.rb +0 -19
  37. data/lib/imw/archives/tarbz2.rb +0 -73
  38. data/lib/imw/archives/targz.rb +0 -73
  39. data/lib/imw/archives/zip.rb +0 -51
  40. data/lib/imw/boot.rb +0 -87
  41. data/lib/imw/compressed_files.rb +0 -94
  42. data/lib/imw/compressed_files/bz2.rb +0 -16
  43. data/lib/imw/compressed_files/compressible.rb +0 -75
  44. data/lib/imw/compressed_files/gz.rb +0 -16
  45. data/lib/imw/dataset.rb +0 -125
  46. data/lib/imw/dataset/paths.rb +0 -29
  47. data/lib/imw/dataset/workflow.rb +0 -195
  48. data/lib/imw/formats.rb +0 -33
  49. data/lib/imw/formats/delimited.rb +0 -170
  50. data/lib/imw/formats/excel.rb +0 -100
  51. data/lib/imw/formats/json.rb +0 -41
  52. data/lib/imw/formats/pdf.rb +0 -71
  53. data/lib/imw/formats/sgml.rb +0 -69
  54. data/lib/imw/formats/yaml.rb +0 -41
  55. data/lib/imw/metadata.rb +0 -83
  56. data/lib/imw/metadata/contains_metadata.rb +0 -54
  57. data/lib/imw/metadata/dsl.rb +0 -111
  58. data/lib/imw/metadata/field.rb +0 -37
  59. data/lib/imw/metadata/has_metadata.rb +0 -98
  60. data/lib/imw/metadata/has_summary.rb +0 -57
  61. data/lib/imw/metadata/schema.rb +0 -17
  62. data/lib/imw/parsers.rb +0 -8
  63. data/lib/imw/parsers/flat.rb +0 -44
  64. data/lib/imw/parsers/html_parser.rb +0 -387
  65. data/lib/imw/parsers/html_parser/matchers.rb +0 -289
  66. data/lib/imw/parsers/line_parser.rb +0 -87
  67. data/lib/imw/parsers/regexp_parser.rb +0 -72
  68. data/lib/imw/repository.rb +0 -12
  69. data/lib/imw/runner.rb +0 -118
  70. data/lib/imw/schemes.rb +0 -23
  71. data/lib/imw/schemes/ftp.rb +0 -142
  72. data/lib/imw/schemes/hdfs.rb +0 -251
  73. data/lib/imw/schemes/http.rb +0 -165
  74. data/lib/imw/schemes/local.rb +0 -409
  75. data/lib/imw/schemes/remote.rb +0 -119
  76. data/lib/imw/schemes/s3.rb +0 -143
  77. data/lib/imw/schemes/sql.rb +0 -129
  78. data/lib/imw/tools.rb +0 -12
  79. data/lib/imw/tools/aggregator.rb +0 -148
  80. data/lib/imw/tools/archiver.rb +0 -220
  81. data/lib/imw/tools/downloader.rb +0 -63
  82. data/lib/imw/tools/extension_analyzer.rb +0 -114
  83. data/lib/imw/tools/summarizer.rb +0 -83
  84. data/lib/imw/tools/transferer.rb +0 -167
  85. data/lib/imw/utils.rb +0 -74
  86. data/lib/imw/utils/dynamically_extendable.rb +0 -137
  87. data/lib/imw/utils/error.rb +0 -59
  88. data/lib/imw/utils/extensions/hpricot.rb +0 -34
  89. data/lib/imw/utils/has_uri.rb +0 -131
  90. data/lib/imw/utils/log.rb +0 -92
  91. data/lib/imw/utils/misc.rb +0 -57
  92. data/lib/imw/utils/paths.rb +0 -146
  93. data/lib/imw/utils/uri.rb +0 -59
  94. data/lib/imw/utils/uuid.rb +0 -33
  95. data/lib/imw/utils/validate.rb +0 -38
  96. data/lib/imw/utils/version.rb +0 -11
  97. data/spec/data/formats/delimited/sample.csv +0 -131
  98. data/spec/data/formats/delimited/sample.tsv +0 -131
  99. data/spec/data/formats/delimited/with_schema/ace-hardware-locations.tsv +0 -11
  100. data/spec/data/formats/delimited/with_schema/all-countries-ip-address-to-geolocation-data.tsv +0 -16
  101. data/spec/data/formats/delimited/with_schema/complete-list-of-starbucks-locations.tsv +0 -11
  102. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +0 -22
  103. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +0 -22
  104. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-counts.tsv +0 -12
  105. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +0 -13
  106. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +0 -22
  107. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +0 -22
  108. data/spec/data/formats/delimited/without_schema/ace-hardware-locations.tsv +0 -10
  109. data/spec/data/formats/delimited/without_schema/all-countries-ip-address-to-geolocation-data.tsv +0 -15
  110. data/spec/data/formats/delimited/without_schema/complete-list-of-starbucks-locations.tsv +0 -10
  111. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +0 -21
  112. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +0 -21
  113. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-counts.tsv +0 -11
  114. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +0 -12
  115. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +0 -21
  116. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +0 -21
  117. data/spec/data/formats/excel/sample.xls +0 -0
  118. data/spec/data/formats/json/sample.json +0 -1
  119. data/spec/data/formats/none/sample +0 -650
  120. data/spec/data/formats/sgml/sample.xml +0 -617
  121. data/spec/data/formats/text/sample.txt +0 -650
  122. data/spec/data/formats/yaml/sample.yaml +0 -410
  123. data/spec/data/schema-tabular.yaml +0 -11
  124. data/spec/imw/archives/rar_spec.rb +0 -16
  125. data/spec/imw/archives/tar_spec.rb +0 -16
  126. data/spec/imw/archives/tarbz2_spec.rb +0 -24
  127. data/spec/imw/archives/targz_spec.rb +0 -21
  128. data/spec/imw/archives/zip_spec.rb +0 -16
  129. data/spec/imw/archives_spec.rb +0 -77
  130. data/spec/imw/compressed_files/bz2_spec.rb +0 -15
  131. data/spec/imw/compressed_files/compressible_spec.rb +0 -36
  132. data/spec/imw/compressed_files/gz_spec.rb +0 -15
  133. data/spec/imw/compressed_files_spec.rb +0 -47
  134. data/spec/imw/dataset/paths_spec.rb +0 -32
  135. data/spec/imw/dataset/workflow_spec.rb +0 -41
  136. data/spec/imw/formats/delimited_spec.rb +0 -44
  137. data/spec/imw/formats/excel_spec.rb +0 -55
  138. data/spec/imw/formats/json_spec.rb +0 -18
  139. data/spec/imw/formats/sgml_spec.rb +0 -24
  140. data/spec/imw/formats/yaml_spec.rb +0 -19
  141. data/spec/imw/metadata/contains_metadata_spec.rb +0 -56
  142. data/spec/imw/metadata/field_spec.rb +0 -25
  143. data/spec/imw/metadata/has_metadata_spec.rb +0 -58
  144. data/spec/imw/metadata/has_summary_spec.rb +0 -32
  145. data/spec/imw/metadata/schema_spec.rb +0 -24
  146. data/spec/imw/metadata_spec.rb +0 -86
  147. data/spec/imw/parsers/line_parser_spec.rb +0 -96
  148. data/spec/imw/parsers/regexp_parser_spec.rb +0 -42
  149. data/spec/imw/resource_spec.rb +0 -32
  150. data/spec/imw/schemes/hdfs_spec.rb +0 -67
  151. data/spec/imw/schemes/http_spec.rb +0 -19
  152. data/spec/imw/schemes/local_spec.rb +0 -165
  153. data/spec/imw/schemes/remote_spec.rb +0 -38
  154. data/spec/imw/schemes/s3_spec.rb +0 -31
  155. data/spec/imw/schemes/sql_spec.rb +0 -3
  156. data/spec/imw/tools/aggregator_spec.rb +0 -71
  157. data/spec/imw/tools/archiver_spec.rb +0 -120
  158. data/spec/imw/tools/extension_analyzer_spec.rb +0 -153
  159. data/spec/imw/tools/summarizer_spec.rb +0 -8
  160. data/spec/imw/tools/transferer_spec.rb +0 -195
  161. data/spec/imw/utils/dynamically_extendable_spec.rb +0 -69
  162. data/spec/imw/utils/has_uri_spec.rb +0 -61
  163. data/spec/imw/utils/paths_spec.rb +0 -10
  164. data/spec/imw/utils/shared_paths_spec.rb +0 -29
  165. data/spec/imw_spec.rb +0 -14
  166. data/spec/rcov.opts +0 -1
  167. data/spec/spec_helper.rb +0 -31
  168. data/spec/support/custom_matchers.rb +0 -28
  169. data/spec/support/file_contents_matcher.rb +0 -30
  170. data/spec/support/paths_matcher.rb +0 -66
  171. data/spec/support/random.rb +0 -213
  172. data/spec/support/without_regard_to_order_matcher.rb +0 -41
@@ -1,94 +0,0 @@
1
- module IMW
2
-
3
- # Contains modules which define the behavior of compressed files.
4
- module CompressedFiles
5
- autoload :Bz2, 'imw/compressed_files/bz2'
6
- autoload :Gz, 'imw/compressed_files/gz'
7
- autoload :Compressible, 'imw/compressed_files/compressible'
8
-
9
- # Handlers which include modules for compressed file formats as
10
- # well as the IMW::CompressedFiles::Compressible module for
11
- # compressing regular files.
12
- HANDLERS = [
13
- ["CompressedFiles::Compressible", Proc.new { |r| r.is_local? && r.is_file? && r.path != /\.(bz2|gz|tgz|tbz2)$/i } ],
14
- ["CompressedFiles::Gz", Proc.new { |r| r.is_local? && r.path =~ /\.gz$/i && r.path !~ /\.tar\.gz$/i && r.path !~ /\.tgz$/i } ],
15
- ["CompressedFiles::Bz2", Proc.new { |r| r.is_local? && r.path =~ /\.bz2$/i && r.path !~ /\.tar\.bz2$/i && r.path !~ /\.tbz2$/i } ]
16
- ]
17
-
18
- # Defines methods for decompressing a compressed file. This
19
- # module isn't used to directly extend an IMW::Resource --
20
- # instead, format specific modules (e.g. -
21
- # IMW::CompressedFiles::Bz2) include this module and
22
- # further define the command-line flags &c. needed to make
23
- # everything work.
24
- module Base
25
-
26
- attr_accessor :compression_settings
27
-
28
- # Is this file compressed?
29
- #
30
- # @return [true, false]
31
- def is_compressed?
32
- true
33
- end
34
-
35
- # Can this file be compressed?
36
- #
37
- # @return [true, false]
38
- def is_compressible?
39
- false
40
- end
41
-
42
- # The basename of this resource after it is decompressed
43
- #
44
- # IMW::Resource.new('/path/to/my_file.txt.bz2').decompressed_basename
45
- # => 'my_file.txt'
46
- #
47
- # @return [String] the decompressed basename
48
- def decompressed_basename
49
- basename[0..-(extname.size + 1)]
50
- end
51
-
52
- # The path of this resource after it is decompressed
53
- #
54
- # IMW::Resource.new('/path/to/my_file.txt.bz2').decompressed_path
55
- # => '/path/to/my_file.txt'
56
- #
57
- # @return [String] the decompressed path
58
- def decompressed_path
59
- File.join(dirname, decompressed_basename)
60
- end
61
-
62
- # Decompress this file in its present directory overwriting any
63
- # existing files and without saving the original compressed
64
- # file.
65
- #
66
- # @return [IMW::Resource] the decompressed resource
67
- def decompress!
68
- should_exist!("Cannot decompress.")
69
- program = compression_settings[:decompression_program] || compression_settings[:program]
70
- FileUtils.cd(dirname) { IMW.system(program, compression_settings[:decompress], path) }
71
- IMW.open(decompressed_path)
72
- end
73
-
74
- # Decompress this file in its present directory, overwriting any
75
- # existing files while keeping the original compressed file.
76
- #
77
- # FIXME The implementation is a little stupid as the file is
78
- # needlessly copied.
79
- #
80
- # @return [IMW::Resource] the decompressed resource
81
- def decompress
82
- should_exist!("Cannot decompress.")
83
- begin
84
- copy = cp(path + '.imw_copy')
85
- regular_file = decompress!
86
- copy.mv(path)
87
- regular_file
88
- ensure
89
- copy.mv(path) if copy && copy.exist?
90
- end
91
- end
92
- end
93
- end
94
- end
@@ -1,16 +0,0 @@
1
- module IMW
2
- module CompressedFiles
3
- module Bz2
4
-
5
- include IMW::CompressedFiles::Base
6
-
7
- def compression_settings
8
- @compression_settings ||= {
9
- :decompression_program => :bzip2,
10
- :decompress => '-fd'
11
- }
12
- end
13
-
14
- end
15
- end
16
- end
@@ -1,75 +0,0 @@
1
- module IMW
2
-
3
- # Default settings used when compressing files. <tt>:program</tt>
4
- # defines the name of the command-line program to use,
5
- # <tt>:compress</tt> gives the flags to use when compressing, and
6
- # <tt>:extension</tt> gives the extension (_without_ the `.') added
7
- # by the program after compressing.
8
- COMPRESSION_SETTINGS = {
9
- :program => 'bzip2',
10
- :compress => '',
11
- :extension => 'bz2'
12
- } unless defined?(COMPRESSION_SETTINGS)
13
-
14
- module CompressedFiles
15
-
16
- # Defines methods for compressing a file. The default compression
17
- # program is defined in IMW::COMPRESSION_SETTINGS though a
18
- # particular resource can change the values in its
19
- # +compression_settings+ hash.
20
- module Compressible
21
-
22
- # Compression settings.
23
- attr_accessor :compression_settings
24
-
25
- # Is this file compressible?
26
- #
27
- # @return [true]
28
- def is_compressible?
29
- true
30
- end
31
-
32
- # Defines the compression settings used for this
33
- # resource. <tt>:program</tt> defines the name of the
34
- # command-line program to use, <tt>:compress</tt> gives the
35
- # flags to use when compressing, and <tt>:extension</tt> gives
36
- # the extension (_without_ the `.') added by the program after
37
- # compressing.
38
- #
39
- # @return [Hash]
40
- def compression_settings
41
- @compression_settings ||= COMPRESSION_SETTINGS
42
- end
43
-
44
- # Compress this resource in place, overwriting it.
45
- #
46
- # This resource's +compression_settings+ method is used to
47
- # determine the method of compression.
48
- #
49
- # @return [IMW::Resource] the compressed file
50
- def compress!
51
- should_exist!("Cannot compress.")
52
- IMW.system(*[compression_settings[:program], compression_settings[:compress], path])
53
- IMW.open(File.join(dirname,basename + "." + compression_settings[:extension]))
54
- end
55
-
56
- # Compress this resource without overwriting it.
57
- #
58
- # FIXME The implementation is a little stupid as the file is
59
- # needlessly copied.
60
- #
61
- # @return [IMW::Resource] the compressed file
62
- def compress options={}
63
- should_exist!("Cannot compress.")
64
- begin
65
- copy = cp(path + '.imw_copy')
66
- compressed_file = compress!
67
- copy.mv(path)
68
- compressed_file
69
- ensure
70
- copy.mv(path) if copy.exist?
71
- end
72
- end
73
- end
74
- end
75
- end
@@ -1,16 +0,0 @@
1
- module IMW
2
- module CompressedFiles
3
- module Gz
4
-
5
- include IMW::CompressedFiles::Base
6
-
7
- def compression_settings
8
- @compression_settings ||= {
9
- :decompression_program => :gunzip,
10
- :decompress => '-fd'
11
- }
12
- end
13
-
14
- end
15
- end
16
- end
@@ -1,125 +0,0 @@
1
- require 'imw/dataset/workflow'
2
- require 'imw/dataset/paths'
3
-
4
- module IMW
5
-
6
- # The IMW::Dataset represents a common object in which paths, data
7
- # resources, and various tasks can be intermingled to define a
8
- # complex transformation of data.
9
- #
10
- # == Organizing Paths
11
- #
12
- # IMW encourages you to work within the following directory
13
- # structure for a dataset +my_dataset+:
14
- #
15
- # my_dataset/
16
- # |-- my_dataset.rb
17
- # |-- ripd
18
- # | `-- ...
19
- # |-- rawd
20
- # | `-- ...
21
- # |-- fixd
22
- # | `-- ...
23
- # `-- pkgd
24
- # `-- ...
25
- #
26
- # Just like IMW itself, a dataset can manage a collection of paths.
27
- # If <tt>my_dataset.rb</tt> defines a dataset:
28
- #
29
- # # my_dataset/my_dataset.rb
30
- # dataset = IMW::Dataset.new(:my_dataset)
31
- #
32
- # then the following paths will be defined:
33
- #
34
- # dataset.path_to(:root) #=> my_dataset
35
- # dataset.path_to(:script) #=> my_dataset/my_dataset.rb
36
- # dataset.path_to(:ripd) #=> my_dataset/ripd
37
- # dataset.path_to(:rawd) #=> my_dataset/rawd
38
- # dataset.path_to(:fixd) #=> my_dataset/fixd
39
- # dataset.path_to(:pkgd) #=> my_dataset/pkgd
40
- #
41
- # Just like IMW itself, the +dataset+ supports adding path
42
- # references
43
- #
44
- # dataset.add_path(:raw_data, :ripd, 'raw_data.xml')
45
- # dataset.path_to(:raw_data) #=> my_dataset/ripd/raw_data.xml
46
- #
47
- # as well as removing them (via <tt>dataset.remove_path</tt>).
48
- #
49
- # A subclass of IMW::Dataset can customize these paths by overriding
50
- # IMW::Dataset#set_default_paths as well as define new ones by
51
- # overriding IMW::Dataset#set_paths.
52
- #
53
- # Setting paths can be skipped altogether by passing the
54
- # <tt>:skip_paths</tt> option when instantiating a dataset:
55
- #
56
- # dataset = IMW::Dataset.new :my_dataset, :skip_paths => true
57
- #
58
- # == Utilizing Tasks
59
- #
60
- # An IMW::Dataset utilizes Rake to manage tasks needed to transform
61
- # data. See IMW::Workflow for a description of the pre-defined
62
- # tasks (+rip+, +parse+, +fix+, +package+).
63
- #
64
- # New tasks can be defined
65
- #
66
- # dataset.task :get_authorization do
67
- # # ... get an authorization token
68
- # end
69
- #
70
- # and hooked into the default tasks in the usual Rake manner
71
- #
72
- # dataset.task :rip => [:get_authorization]
73
- #
74
- # A dataset also has methods for the workflow step tasks to make
75
- # this easier
76
- #
77
- # dataset.rip [:get_authorization]
78
- #
79
- # Tasks for a dataset can be accessed and invoked as follows
80
- #
81
- # dataset[:rip].invoke
82
- #
83
- # as well as by using the command line +imw+ tool.
84
- #
85
- # Defining tasks can be skipped altogether by passing the
86
- # <tt>:skip_workflow</tt> option when instantiating a dataset
87
- #
88
- # dataset = IMW::Dataset.new :my_dataset, :skip_workflow => true
89
- #
90
- # == Working with Repositories
91
- #
92
- # A dataset can be added to a repository by passing the
93
- # <tt>:repository</tt> option
94
- #
95
- # repo = IMW::Repository.new
96
- # dataset = IMW::Dataset.new :my_dataset, :repository => repo
97
- class Dataset
98
-
99
- # The handle this dataset goes by. Used for identifying it within
100
- # a repository.
101
- attr_accessor :handle
102
-
103
- # Options for this dataset.
104
- attr_accessor :options
105
-
106
- def initialize handle, options = {}
107
- @options = options
108
- @handle = handle
109
- set_default_paths unless options[:skip_paths]
110
- set_paths unless options[:skip_paths]
111
- initialize_workflow unless options[:skip_workflow]
112
- if options[:repository]
113
- options[:repository][handle] = self
114
- end
115
- end
116
-
117
- # Provides this dataset with a workflow of tasks managed by Rake.
118
- include IMW::Workflow
119
-
120
- # Provides this dataset with DSL like methods to construct a
121
- # schema in an IMW file.
122
- # include IMW::Metadata::DSL
123
-
124
- end
125
- end
@@ -1,29 +0,0 @@
1
- module IMW
2
- class Dataset
3
- include IMW::Paths
4
-
5
- protected
6
- # Sets paths to the workflow directories for this dataset (+ripd+,
7
- # +rawd+, +fixd+, +pkgd+) as well as the following paths:
8
- #
9
- # script::
10
- # The path to the file the dataset was initialized in.
11
- #
12
- # root::
13
- # The parent directory of the file the dataset was initialized
14
- # in or the value of the <tt>:root</tt> key in
15
- # IMW::Dataset#options
16
- #
17
- def set_default_paths
18
- add_path :script, File.expand_path(eval('__FILE__'))
19
- add_path :root, options[:root] || File.dirname(path_to(:script))
20
- workflow_dirs.each do |dir|
21
- add_path dir, :root, dir.to_s
22
- end
23
- end
24
-
25
- # Overwrite this method to set additional paths for the dataset.
26
- def set_paths
27
- end
28
- end
29
- end
@@ -1,195 +0,0 @@
1
- require 'ostruct'
2
- require 'rake'
3
-
4
- module IMW
5
-
6
- # An IMW version of Rake::Task
7
- Task = Class.new(Rake::Task)
8
-
9
- # An IMW subclass of Rake::FileTask
10
- FileTask = Class.new(Rake::FileTask)
11
-
12
- # An IMW subclass of Rake::FileCreationTask
13
- FileCreationTask = Class.new(Rake::FileCreationTask)
14
-
15
- # IMW encourages you to view a data transformation as a series of
16
- # interdependent steps.
17
- #
18
- # By default, IMW defines four main steps in such a transformation:
19
- # +rip+, +parse+, +fix+, and +package+.
20
- #
21
- # Each step is associated with a directory on disk in which it keeps
22
- # its files: +ripd+, +rawd+, +fixd+, and +pkgd+.
23
- #
24
- # The steps are:
25
- #
26
- # rip::
27
- # Obtain data via HTTP, FTP, SCP, RSYNC, database query, &c and
28
- # store the results in +ripd+.
29
- #
30
- # parse::
31
- # Parse data into a structured form using a library (JSON, YAML,
32
- # &c.) or using your own parser (XML, flat files, &c.) and store
33
- # the results in +rawd+.
34
- #
35
- # fix::
36
- # Combine, filter, reconcile, and transform already structured
37
- # data into a desired form and store the results in +fixd+.
38
- #
39
- # package::
40
- # Archive, compress, and deliver data in its final form to some
41
- # location (HTTP, FTP, SCP, RSYNC, S3, EBS, &c.), optionally
42
- # storing the output in +pkgd+.
43
- #
44
- # Each step depends upon the one before it. The steps are blank by
45
- # default so there's no need to write code for steps you don't need
46
- # to use. You can also define your own steps (using +task+ just
47
- # like in Rake) and hook them into these pre-defined steps (or
48
- # not...).
49
- #
50
- # A dataset also has an <tt>:initialize</tt> task (which by default
51
- # just creates the directories for these steps) which you can use to
52
- # hook in your own initialization tasks by making it depend on them.
53
- #
54
- # A subclass of IMW::Dataset can customize how tasks are defined by
55
- # overriding +define_workflow_tasks+, among other methods, and
56
- # introduce new tasks by overriding +define_tasks+.
57
- module Workflow
58
-
59
- include Rake::TaskManager
60
- # Default options passed to <tt>Rake</tt>. Any class including
61
- # the <tt>Rake::TaskManager</tt> module must define a constant by
62
- # this name.
63
- DEFAULT_OPTIONS = {
64
- :dry_run => false,
65
- :trace => false,
66
- :verbose => false
67
- }
68
-
69
- # Return a new (or existing) <tt>IMW::Task</tt> with the given
70
- # +name+. Dependencies can be declared and a block passed in just
71
- # as in Rake.
72
- #
73
- # @param [Hash, Symbol, String] deps the name of the task (if a
74
- # Symbol or String) or the name of the task mapped to an Array of
75
- # dependencies (if a Hash)
76
- #
77
- # @return [IMW::Task] the task
78
- def task deps, &block
79
- self.define_task IMW::Task, deps, &block
80
- end
81
-
82
- # Return a new (or existing) <tt>IMW::FileTask</tt> with the given
83
- # +path+. Dependencies can be declared and a block passed in just
84
- # as in Rake.
85
- #
86
- # @param [String, IMW::Resource] path the path to the file
87
- # @return [IMW::FileTask] the task
88
- def file path, &block
89
- path = path.respond_to?(:path) ? path.path : path
90
- self.define_task IMW::FileTask, path, &block
91
- end
92
-
93
- # Return a new (or existing) <tt>IMW::FileCreationTask</tt> with the given
94
- # +path+. Dependencies can be declared and a block passed in just
95
- # as in Rake.
96
- #
97
- # @param [String, IMW::Resource] path the path to the file
98
- # @return [IMW::FileCreationTask] the task
99
- def file_create path, &block
100
- path = path.respond_to?(:path) ? path.path : path
101
- self.define_task IMW::FileCreationTask, path, &block
102
- end
103
-
104
- # Override this method to define default tasks for a subclass of
105
- # IMW::Dataset.
106
- def define_tasks
107
- end
108
-
109
- # The standard IMW workflow steps.
110
- #
111
- # @return [Array] the workflow step names
112
- def workflow_steps
113
- [:rip, :parse, :fix, :package]
114
- end
115
-
116
- # The steps of the IMW workflow each correspond to a directory in
117
- # which it is customary that they deposit their files <em>once
118
- # they are finished processing</em> (so ripped files wind up in
119
- # the +ripd+ directory, packaged files in the +pkgd+ directory,
120
- # and so on).
121
- #
122
- # @return [Array] the workflow directory names
123
- def workflow_dirs
124
- [:ripd, :rawd, :fixd, :pkgd]
125
- end
126
-
127
- protected
128
-
129
- # Convenience method for defining tasks for this workflow.
130
- #
131
- # @param [Hash, Symbol, String] deps the name of the task (if a
132
- # Symbol or String) or the name of the task mapped to an Array of
133
- # dependencies (if a Hash)
134
- # @param [String] comment the comment to associate to the task
135
- # @return [IMW::Task] the task
136
- def define_workflow_task deps, comment, &block
137
- @last_description = comment
138
- define_task(IMW::Task, deps, &block)
139
- end
140
-
141
- # Create all the instance variables required by Rake::TaskManager
142
- # and define default tasks for this dataset.
143
- def initialize_workflow
144
- @tasks = Hash.new
145
- @rules = Array.new
146
- @scope = Array.new
147
- @last_description = nil
148
- @options = OpenStruct.new(DEFAULT_OPTIONS)
149
- define_initialize_task
150
- define_workflow_tasks
151
- define_clean_task
152
- define_tasks
153
- end
154
-
155
- # Defines the <tt>:initialize</tt> task. The only other task
156
- # hooked into <tt>:initialize</tt> is the
157
- # <tt>:create_directories</tt> task which creates the workflow
158
- # directories for this dataset.
159
- def define_initialize_task
160
- define_workflow_task({:create_directories => []}, "Creates workflow directories for this dataset.") do
161
- workflow_dirs.each do |dir|
162
- FileUtils.mkdir_p(path_to(dir)) unless File.exist?(path_to(dir))
163
- end
164
- end
165
- define_workflow_task({ :initialize => [:create_directories] }, "Initialize this dataset.")
166
- end
167
-
168
- # Creates a task <tt>:clean</tt> which removes dataset's
169
- # workflow directories.
170
- def define_clean_task
171
- define_workflow_task :clean, "Remove the workflow directories for this dataset." do
172
- workflow_dirs.each do |dir|
173
- FileUtils.rm_rf(path_to(dir)) if File.exist?(path_to(dir))
174
- end
175
- end
176
- end
177
-
178
- # Creates the task dependency chain <tt>:package => :fix =>
179
- # :parse => :rip => :create_directories</tt> of the
180
- # IMW::Workflow.
181
- def define_workflow_tasks
182
- define_workflow_task({:rip => [:create_directories]}, "Obtain data from some source." )
183
- define_workflow_task({:parse => [:rip]}, "Parse data into a structured form." )
184
- define_workflow_task({:fix => [:parse]}, "Munge parsed data into desired form." )
185
- define_workflow_task({:package => [:fix]}, "Package dataset in final form." )
186
- end
187
-
188
-
189
- def rip(deps=nil, &block); self[:rip].enhance(deps, &block); end
190
- def parse(deps=nil, &block); self[:parse].enhance(deps, &block); end
191
- def fix(deps=nil, &block); self[:fix].enhance(deps, &block); end
192
- def package(deps=nil, &block); self[:package].enhance(deps, &block); end
193
-
194
- end
195
- end