imw 0.2.18 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (172)
  1. data/Gemfile +7 -26
  2. data/Gemfile.lock +13 -38
  3. data/{LICENSE → LICENSE.txt} +1 -1
  4. data/README.textile +35 -0
  5. data/Rakefile +45 -22
  6. data/VERSION +1 -1
  7. data/examples/foo.rb +19 -0
  8. data/examples/html_selector.rb +22 -0
  9. data/examples/nes_game_list.csv +625 -0
  10. data/examples/nes_gamespot.csv +1371 -0
  11. data/examples/nes_nintendo.csv +624 -0
  12. data/examples/nes_unlicensed.csv +89 -0
  13. data/examples/nes_wikipedia.csv +710 -0
  14. data/examples/nibbler_test.rb +24 -0
  15. data/examples/script.rb +19 -0
  16. data/lib/imw.rb +28 -140
  17. data/lib/imw/error.rb +9 -0
  18. data/lib/imw/recordizer.rb +8 -0
  19. data/lib/imw/recordizer/html_selector_recordizer.rb +86 -0
  20. data/lib/imw/recordizer/string_slice_recordizer.rb +39 -0
  21. data/lib/imw/resource.rb +3 -119
  22. data/lib/imw/serializer.rb +7 -0
  23. data/lib/imw/serializer/json_serializer.rb +17 -0
  24. data/lib/imw/uri.rb +41 -0
  25. data/spec/resource_spec.rb +78 -0
  26. data/spec/uri_spec.rb +55 -0
  27. metadata +81 -232
  28. data/README.rdoc +0 -371
  29. data/bin/imw +0 -5
  30. data/bin/tsv_to_json.rb +0 -29
  31. data/etc/imwrc.rb +0 -26
  32. data/examples/dataset.rb +0 -12
  33. data/examples/metadata.yml +0 -10
  34. data/lib/imw/archives.rb +0 -120
  35. data/lib/imw/archives/rar.rb +0 -19
  36. data/lib/imw/archives/tar.rb +0 -19
  37. data/lib/imw/archives/tarbz2.rb +0 -73
  38. data/lib/imw/archives/targz.rb +0 -73
  39. data/lib/imw/archives/zip.rb +0 -51
  40. data/lib/imw/boot.rb +0 -87
  41. data/lib/imw/compressed_files.rb +0 -94
  42. data/lib/imw/compressed_files/bz2.rb +0 -16
  43. data/lib/imw/compressed_files/compressible.rb +0 -75
  44. data/lib/imw/compressed_files/gz.rb +0 -16
  45. data/lib/imw/dataset.rb +0 -125
  46. data/lib/imw/dataset/paths.rb +0 -29
  47. data/lib/imw/dataset/workflow.rb +0 -195
  48. data/lib/imw/formats.rb +0 -33
  49. data/lib/imw/formats/delimited.rb +0 -170
  50. data/lib/imw/formats/excel.rb +0 -100
  51. data/lib/imw/formats/json.rb +0 -41
  52. data/lib/imw/formats/pdf.rb +0 -71
  53. data/lib/imw/formats/sgml.rb +0 -69
  54. data/lib/imw/formats/yaml.rb +0 -41
  55. data/lib/imw/metadata.rb +0 -83
  56. data/lib/imw/metadata/contains_metadata.rb +0 -54
  57. data/lib/imw/metadata/dsl.rb +0 -111
  58. data/lib/imw/metadata/field.rb +0 -37
  59. data/lib/imw/metadata/has_metadata.rb +0 -98
  60. data/lib/imw/metadata/has_summary.rb +0 -57
  61. data/lib/imw/metadata/schema.rb +0 -17
  62. data/lib/imw/parsers.rb +0 -8
  63. data/lib/imw/parsers/flat.rb +0 -44
  64. data/lib/imw/parsers/html_parser.rb +0 -387
  65. data/lib/imw/parsers/html_parser/matchers.rb +0 -289
  66. data/lib/imw/parsers/line_parser.rb +0 -87
  67. data/lib/imw/parsers/regexp_parser.rb +0 -72
  68. data/lib/imw/repository.rb +0 -12
  69. data/lib/imw/runner.rb +0 -118
  70. data/lib/imw/schemes.rb +0 -23
  71. data/lib/imw/schemes/ftp.rb +0 -142
  72. data/lib/imw/schemes/hdfs.rb +0 -251
  73. data/lib/imw/schemes/http.rb +0 -165
  74. data/lib/imw/schemes/local.rb +0 -409
  75. data/lib/imw/schemes/remote.rb +0 -119
  76. data/lib/imw/schemes/s3.rb +0 -143
  77. data/lib/imw/schemes/sql.rb +0 -129
  78. data/lib/imw/tools.rb +0 -12
  79. data/lib/imw/tools/aggregator.rb +0 -148
  80. data/lib/imw/tools/archiver.rb +0 -220
  81. data/lib/imw/tools/downloader.rb +0 -63
  82. data/lib/imw/tools/extension_analyzer.rb +0 -114
  83. data/lib/imw/tools/summarizer.rb +0 -83
  84. data/lib/imw/tools/transferer.rb +0 -167
  85. data/lib/imw/utils.rb +0 -74
  86. data/lib/imw/utils/dynamically_extendable.rb +0 -137
  87. data/lib/imw/utils/error.rb +0 -59
  88. data/lib/imw/utils/extensions/hpricot.rb +0 -34
  89. data/lib/imw/utils/has_uri.rb +0 -131
  90. data/lib/imw/utils/log.rb +0 -92
  91. data/lib/imw/utils/misc.rb +0 -57
  92. data/lib/imw/utils/paths.rb +0 -146
  93. data/lib/imw/utils/uri.rb +0 -59
  94. data/lib/imw/utils/uuid.rb +0 -33
  95. data/lib/imw/utils/validate.rb +0 -38
  96. data/lib/imw/utils/version.rb +0 -11
  97. data/spec/data/formats/delimited/sample.csv +0 -131
  98. data/spec/data/formats/delimited/sample.tsv +0 -131
  99. data/spec/data/formats/delimited/with_schema/ace-hardware-locations.tsv +0 -11
  100. data/spec/data/formats/delimited/with_schema/all-countries-ip-address-to-geolocation-data.tsv +0 -16
  101. data/spec/data/formats/delimited/with_schema/complete-list-of-starbucks-locations.tsv +0 -11
  102. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +0 -22
  103. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +0 -22
  104. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-counts.tsv +0 -12
  105. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +0 -13
  106. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +0 -22
  107. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +0 -22
  108. data/spec/data/formats/delimited/without_schema/ace-hardware-locations.tsv +0 -10
  109. data/spec/data/formats/delimited/without_schema/all-countries-ip-address-to-geolocation-data.tsv +0 -15
  110. data/spec/data/formats/delimited/without_schema/complete-list-of-starbucks-locations.tsv +0 -10
  111. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +0 -21
  112. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +0 -21
  113. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-counts.tsv +0 -11
  114. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +0 -12
  115. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +0 -21
  116. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +0 -21
  117. data/spec/data/formats/excel/sample.xls +0 -0
  118. data/spec/data/formats/json/sample.json +0 -1
  119. data/spec/data/formats/none/sample +0 -650
  120. data/spec/data/formats/sgml/sample.xml +0 -617
  121. data/spec/data/formats/text/sample.txt +0 -650
  122. data/spec/data/formats/yaml/sample.yaml +0 -410
  123. data/spec/data/schema-tabular.yaml +0 -11
  124. data/spec/imw/archives/rar_spec.rb +0 -16
  125. data/spec/imw/archives/tar_spec.rb +0 -16
  126. data/spec/imw/archives/tarbz2_spec.rb +0 -24
  127. data/spec/imw/archives/targz_spec.rb +0 -21
  128. data/spec/imw/archives/zip_spec.rb +0 -16
  129. data/spec/imw/archives_spec.rb +0 -77
  130. data/spec/imw/compressed_files/bz2_spec.rb +0 -15
  131. data/spec/imw/compressed_files/compressible_spec.rb +0 -36
  132. data/spec/imw/compressed_files/gz_spec.rb +0 -15
  133. data/spec/imw/compressed_files_spec.rb +0 -47
  134. data/spec/imw/dataset/paths_spec.rb +0 -32
  135. data/spec/imw/dataset/workflow_spec.rb +0 -41
  136. data/spec/imw/formats/delimited_spec.rb +0 -44
  137. data/spec/imw/formats/excel_spec.rb +0 -55
  138. data/spec/imw/formats/json_spec.rb +0 -18
  139. data/spec/imw/formats/sgml_spec.rb +0 -24
  140. data/spec/imw/formats/yaml_spec.rb +0 -19
  141. data/spec/imw/metadata/contains_metadata_spec.rb +0 -56
  142. data/spec/imw/metadata/field_spec.rb +0 -25
  143. data/spec/imw/metadata/has_metadata_spec.rb +0 -58
  144. data/spec/imw/metadata/has_summary_spec.rb +0 -32
  145. data/spec/imw/metadata/schema_spec.rb +0 -24
  146. data/spec/imw/metadata_spec.rb +0 -86
  147. data/spec/imw/parsers/line_parser_spec.rb +0 -96
  148. data/spec/imw/parsers/regexp_parser_spec.rb +0 -42
  149. data/spec/imw/resource_spec.rb +0 -32
  150. data/spec/imw/schemes/hdfs_spec.rb +0 -67
  151. data/spec/imw/schemes/http_spec.rb +0 -19
  152. data/spec/imw/schemes/local_spec.rb +0 -165
  153. data/spec/imw/schemes/remote_spec.rb +0 -38
  154. data/spec/imw/schemes/s3_spec.rb +0 -31
  155. data/spec/imw/schemes/sql_spec.rb +0 -3
  156. data/spec/imw/tools/aggregator_spec.rb +0 -71
  157. data/spec/imw/tools/archiver_spec.rb +0 -120
  158. data/spec/imw/tools/extension_analyzer_spec.rb +0 -153
  159. data/spec/imw/tools/summarizer_spec.rb +0 -8
  160. data/spec/imw/tools/transferer_spec.rb +0 -195
  161. data/spec/imw/utils/dynamically_extendable_spec.rb +0 -69
  162. data/spec/imw/utils/has_uri_spec.rb +0 -61
  163. data/spec/imw/utils/paths_spec.rb +0 -10
  164. data/spec/imw/utils/shared_paths_spec.rb +0 -29
  165. data/spec/imw_spec.rb +0 -14
  166. data/spec/rcov.opts +0 -1
  167. data/spec/spec_helper.rb +0 -31
  168. data/spec/support/custom_matchers.rb +0 -28
  169. data/spec/support/file_contents_matcher.rb +0 -30
  170. data/spec/support/paths_matcher.rb +0 -66
  171. data/spec/support/random.rb +0 -213
  172. data/spec/support/without_regard_to_order_matcher.rb +0 -41
@@ -1,94 +0,0 @@
1
- module IMW
2
-
3
- # Contains modules which define the behavior of compressed files.
4
- module CompressedFiles
5
- autoload :Bz2, 'imw/compressed_files/bz2'
6
- autoload :Gz, 'imw/compressed_files/gz'
7
- autoload :Compressible, 'imw/compressed_files/compressible'
8
-
9
- # Handlers which include modules for compressed file formats as
10
- # well as the IMW::CompressedFiles::Compressible module for
11
- # compressing regular files.
12
- HANDLERS = [
13
- ["CompressedFiles::Compressible", Proc.new { |r| r.is_local? && r.is_file? && r.path != /\.(bz2|gz|tgz|tbz2)$/i } ],
14
- ["CompressedFiles::Gz", Proc.new { |r| r.is_local? && r.path =~ /\.gz$/i && r.path !~ /\.tar\.gz$/i && r.path !~ /\.tgz$/i } ],
15
- ["CompressedFiles::Bz2", Proc.new { |r| r.is_local? && r.path =~ /\.bz2$/i && r.path !~ /\.tar\.bz2$/i && r.path !~ /\.tbz2$/i } ]
16
- ]
17
-
18
- # Defines methods for decompressing a compressed file. This
19
- # module isn't used to directly extend an IMW::Resource --
20
- # instead, format specific modules (e.g. -
21
- # IMW::CompressedFiles::Bz2) include this module and
22
- # further define the command-line flags &c. needed to make
23
- # everything work.
24
- module Base
25
-
26
- attr_accessor :compression_settings
27
-
28
- # Is this file compressed?
29
- #
30
- # @return [true, false]
31
- def is_compressed?
32
- true
33
- end
34
-
35
- # Can this file be compressed?
36
- #
37
- # @return [true, false]
38
- def is_compressible?
39
- false
40
- end
41
-
42
- # The basename of this resource after it is decompressed
43
- #
44
- # IMW::Resource.new('/path/to/my_file.txt.bz2').decompressed_basename
45
- # => 'my_file.txt'
46
- #
47
- # @return [String] the decompressed basename
48
- def decompressed_basename
49
- basename[0..-(extname.size + 1)]
50
- end
51
-
52
- # The path of this resource after it is decompressed
53
- #
54
- # IMW::Resource.new('/path/to/my_file.txt.bz2').decompressed_path
55
- # => '/path/to/my_file.txt'
56
- #
57
- # @return [String] the decompressed path
58
- def decompressed_path
59
- File.join(dirname, decompressed_basename)
60
- end
61
-
62
- # Decompress this file in its present directory overwriting any
63
- # existing files and without saving the original compressed
64
- # file.
65
- #
66
- # @return [IMW::Resource] the decompressed resource
67
- def decompress!
68
- should_exist!("Cannot decompress.")
69
- program = compression_settings[:decompression_program] || compression_settings[:program]
70
- FileUtils.cd(dirname) { IMW.system(program, compression_settings[:decompress], path) }
71
- IMW.open(decompressed_path)
72
- end
73
-
74
- # Decompress this file in its present directory, overwriting any
75
- # existing files while keeping the original compressed file.
76
- #
77
- # FIXME The implementation is a little stupid as the file is
78
- # needlessly copied.
79
- #
80
- # @return [IMW::Resource] the decompressed resource
81
- def decompress
82
- should_exist!("Cannot decompress.")
83
- begin
84
- copy = cp(path + '.imw_copy')
85
- regular_file = decompress!
86
- copy.mv(path)
87
- regular_file
88
- ensure
89
- copy.mv(path) if copy && copy.exist?
90
- end
91
- end
92
- end
93
- end
94
- end
@@ -1,16 +0,0 @@
1
- module IMW
2
- module CompressedFiles
3
- module Bz2
4
-
5
- include IMW::CompressedFiles::Base
6
-
7
- def compression_settings
8
- @compression_settings ||= {
9
- :decompression_program => :bzip2,
10
- :decompress => '-fd'
11
- }
12
- end
13
-
14
- end
15
- end
16
- end
@@ -1,75 +0,0 @@
1
- module IMW
2
-
3
- # Default settings used when compressing files. <tt>:program</tt>
4
- # defines the name of the command-line program to use,
5
- # <tt>:compress</tt> gives the flags to use when compressing, and
6
- # <tt>:extension</tt> gives the extension (_without_ the `.') added
7
- # by the program after compressing.
8
- COMPRESSION_SETTINGS = {
9
- :program => 'bzip2',
10
- :compress => '',
11
- :extension => 'bz2'
12
- } unless defined?(COMPRESSION_SETTINGS)
13
-
14
- module CompressedFiles
15
-
16
- # Defines methods for compressing a file. The default compression
17
- # program is defined in IMW::COMPRESSION_SETTINGS though a
18
- # particular resource can change the values in its
19
- # +compression_settings+ hash.
20
- module Compressible
21
-
22
- # Compression settings.
23
- attr_accessor :compression_settings
24
-
25
- # Is this file compressible?
26
- #
27
- # @return [true]
28
- def is_compressible?
29
- true
30
- end
31
-
32
- # Defines the compression settings used for this
33
- # resource. <tt>:program</tt> defines the name of the
34
- # command-line program to use, <tt>:compress</tt> gives the
35
- # flags to use when compressing, and <tt>:extension</tt> gives
36
- # the extension (_without_ the `.') added by the program after
37
- # compressing.
38
- #
39
- # @return [Hash]
40
- def compression_settings
41
- @compression_settings ||= COMPRESSION_SETTINGS
42
- end
43
-
44
- # Compress this resource in place, overwriting it.
45
- #
46
- # This resource's +compression_settings+ method is used to
47
- # determine the method of compression.
48
- #
49
- # @return [IMW::Resource] the compressed file
50
- def compress!
51
- should_exist!("Cannot compress.")
52
- IMW.system(*[compression_settings[:program], compression_settings[:compress], path])
53
- IMW.open(File.join(dirname,basename + "." + compression_settings[:extension]))
54
- end
55
-
56
- # Compress this resource without overwriting it.
57
- #
58
- # FIXME The implementation is a little stupid as the file is
59
- # needlessly copied.
60
- #
61
- # @return [IMW::Resource] the compressed file
62
- def compress options={}
63
- should_exist!("Cannot compress.")
64
- begin
65
- copy = cp(path + '.imw_copy')
66
- compressed_file = compress!
67
- copy.mv(path)
68
- compressed_file
69
- ensure
70
- copy.mv(path) if copy.exist?
71
- end
72
- end
73
- end
74
- end
75
- end
@@ -1,16 +0,0 @@
1
- module IMW
2
- module CompressedFiles
3
- module Gz
4
-
5
- include IMW::CompressedFiles::Base
6
-
7
- def compression_settings
8
- @compression_settings ||= {
9
- :decompression_program => :gunzip,
10
- :decompress => '-fd'
11
- }
12
- end
13
-
14
- end
15
- end
16
- end
@@ -1,125 +0,0 @@
1
- require 'imw/dataset/workflow'
2
- require 'imw/dataset/paths'
3
-
4
- module IMW
5
-
6
- # The IMW::Dataset represents a common object in which paths, data
7
- # resources, and various tasks can be intermingled to define a
8
- # complex transformation of data.
9
- #
10
- # == Organizing Paths
11
- #
12
- # IMW encourages you to work within the following directory
13
- # structure for a dataset +my_dataset+:
14
- #
15
- # my_dataset/
16
- # |-- my_dataset.rb
17
- # |-- ripd
18
- # | `-- ...
19
- # |-- rawd
20
- # | `-- ...
21
- # |-- fixd
22
- # | `-- ...
23
- # `-- pkgd
24
- # `-- ...
25
- #
26
- # Just like IMW itself, a dataset can manage a collection of paths.
27
- # If <tt>my_dataset.rb</tt> defines a dataset:
28
- #
29
- # # my_dataset/my_dataset.rb
30
- # dataset = IMW::Dataset.new(:my_dataset)
31
- #
32
- # then the following paths will be defined:
33
- #
34
- # dataset.path_to(:root) #=> my_dataset
35
- # dataset.path_to(:script) #=> my_dataset/my_dataset.rb
36
- # dataset.path_to(:ripd) #=> my_dataset/ripd
37
- # dataset.path_to(:rawd) #=> my_dataset/rawd
38
- # dataset.path_to(:fixd) #=> my_dataset/fixd
39
- # dataset.path_to(:pkgd) #=> my_dataset/pkgd
40
- #
41
- # Just like IMW itself, the +dataset+ supports adding path
42
- # references
43
- #
44
- # dataset.add_path(:raw_data, :ripd, 'raw_data.xml')
45
- # dataset.path_to(:raw_data) #=> my_dataset/ripd/raw_data.xml
46
- #
47
- # as well as removing them (via <tt>dataset.remove_path</tt>).
48
- #
49
- # A subclass of IMW::Dataset can customize these paths by overriding
50
- # IMW::Dataset#set_default_paths as well as define new ones by
51
- # overriding IMW::Dataset#set_paths.
52
- #
53
- # Setting paths can be skipped altogether by passing the
54
- # <tt>:skip_paths</tt> option when instantiating a dataset:
55
- #
56
- # dataset = IMW::Dataset.new :my_dataset, :skip_paths => true
57
- #
58
- # == Utilizing Tasks
59
- #
60
- # An IMW::Dataset utilizes Rake to manage tasks needed to transform
61
- # data. See IMW::Workflow for a description of the pre-defined
62
- # tasks (+rip+, +parse+, +fix+, +package+).
63
- #
64
- # New tasks can be defined
65
- #
66
- # dataset.task :get_authorization do
67
- # # ... get an authorization token
68
- # end
69
- #
70
- # and hooked into the default tasks in the usual Rake manner
71
- #
72
- # dataset.task :rip => [:get_authorization]
73
- #
74
- # A dataset also has methods for the workflow step tasks to make
75
- # this easier
76
- #
77
- # dataset.rip [:get_authorization]
78
- #
79
- # Tasks for a dataset can be accessed and invoked as follows
80
- #
81
- # dataset[:rip].invoke
82
- #
83
- # as well as by using the command line +imw+ tool.
84
- #
85
- # Defining tasks can be skipped altogether by passing the
86
- # <tt>:skip_workflow</tt> option when instantiating a dataset
87
- #
88
- # dataset = IMW::Dataset.new :my_dataset, :skip_workflow => true
89
- #
90
- # == Working with Repositories
91
- #
92
- # A dataset can be added to a repository by passing the
93
- # <tt>:repository</tt> option
94
- #
95
- # repo = IMW::Repository.new
96
- # dataset = IMW::Dataset.new :my_dataset, :repository => repo
97
- class Dataset
98
-
99
- # The handle this dataset goes by. Used for identifying it within
100
- # a repository.
101
- attr_accessor :handle
102
-
103
- # Options for this dataset.
104
- attr_accessor :options
105
-
106
- def initialize handle, options = {}
107
- @options = options
108
- @handle = handle
109
- set_default_paths unless options[:skip_paths]
110
- set_paths unless options[:skip_paths]
111
- initialize_workflow unless options[:skip_workflow]
112
- if options[:repository]
113
- options[:repository][handle] = self
114
- end
115
- end
116
-
117
- # Provides this dataset with a workflow of tasks managed by Rake.
118
- include IMW::Workflow
119
-
120
- # Provides this dataset with DSL like methods to construct a
121
- # schema in an IMW file.
122
- # include IMW::Metadata::DSL
123
-
124
- end
125
- end
@@ -1,29 +0,0 @@
1
- module IMW
2
- class Dataset
3
- include IMW::Paths
4
-
5
- protected
6
- # Sets paths to the workflow directories for this dataset (+ripd+,
7
- # +rawd+, +fixd+, +pkgd+) as well as the following paths:
8
- #
9
- # script::
10
- # The path to the file the dataset was initialized in.
11
- #
12
- # root::
13
- # The parent directory of the file the dataset was initialized
14
- # in or the value of the <tt>:root</tt> key in
15
- # IMW::Dataset#options
16
- #
17
- def set_default_paths
18
- add_path :script, File.expand_path(eval('__FILE__'))
19
- add_path :root, options[:root] || File.dirname(path_to(:script))
20
- workflow_dirs.each do |dir|
21
- add_path dir, :root, dir.to_s
22
- end
23
- end
24
-
25
- # Overwrite this method to set additional paths for the dataset.
26
- def set_paths
27
- end
28
- end
29
- end
@@ -1,195 +0,0 @@
1
- require 'ostruct'
2
- require 'rake'
3
-
4
- module IMW
5
-
6
- # An IMW version of Rake::Task
7
- Task = Class.new(Rake::Task)
8
-
9
- # An IMW subclass of Rake::FileTask
10
- FileTask = Class.new(Rake::FileTask)
11
-
12
- # An IMW subclass of Rake::FileCreationTask
13
- FileCreationTask = Class.new(Rake::FileCreationTask)
14
-
15
- # IMW encourages you to view a data transformation as a series of
16
- # interdependent steps.
17
- #
18
- # By default, IMW defines four main steps in such a transformation:
19
- # +rip+, +parse+, +fix+, and +package+.
20
- #
21
- # Each step is associated with a directory on disk in which it keeps
22
- # its files: +ripd+, +rawd+, +fixd+, and +pkgd+.
23
- #
24
- # The steps are:
25
- #
26
- # rip::
27
- # Obtain data via HTTP, FTP, SCP, RSYNC, database query, &c and
28
- # store the results in +ripd+.
29
- #
30
- # parse::
31
- # Parse data into a structured form using a library (JSON, YAML,
32
- # &c.) or using your own parser (XML, flat files, &c.) and store
33
- # the results in +rawd+.
34
- #
35
- # fix::
36
- # Combine, filter, reconcile, and transform already structured
37
- # data into a desired form and store the results in +fixd+.
38
- #
39
- # package::
40
- # Archive, compress, and deliver data in its final form to some
41
- # location (HTTP, FTP, SCP, RSYNC, S3, EBS, &c.), optionally
42
- # storing the output in +pkgd+.
43
- #
44
- # Each step depends upon the one before it. The steps are blank by
45
- # default so there's no need to write code for steps you don't need
46
- # to use. You can also define your own steps (using +task+ just
47
- # like in Rake) and hook them into these pre-defined steps (or
48
- # not...).
49
- #
50
- # A dataset also has an <tt>:initialize</tt> task (which by default
51
- # just creates the directories for these steps) which you can use to
52
- # hook in your own initialization tasks by making it depend on them.
53
- #
54
- # A subclass of IMW::Dataset can customize how tasks are defined by
55
- # overriding +define_workflow_tasks+, among other methods, and
56
- # introduce new tasks by overriding +define_tasks+.
57
- module Workflow
58
-
59
- include Rake::TaskManager
60
- # Default options passed to <tt>Rake</tt>. Any class including
61
- # the <tt>Rake::TaskManager</tt> module must define a constant by
62
- # this name.
63
- DEFAULT_OPTIONS = {
64
- :dry_run => false,
65
- :trace => false,
66
- :verbose => false
67
- }
68
-
69
- # Return a new (or existing) <tt>IMW::Task</tt> with the given
70
- # +name+. Dependencies can be declared and a block passed in just
71
- # as in Rake.
72
- #
73
- # @param [Hash, Symbol, String] deps the name of the task (if a
74
- # Symbol or String) or the name of the task mapped to an Array of
75
- # dependencies (if a Hash)
76
- #
77
- # @return [IMW::Task] the task
78
- def task deps, &block
79
- self.define_task IMW::Task, deps, &block
80
- end
81
-
82
- # Return a new (or existing) <tt>IMW::FileTask</tt> with the given
83
- # +path+. Dependencies can be declared and a block passed in just
84
- # as in Rake.
85
- #
86
- # @param [String, IMW::Resource] path the path to the file
87
- # @return [IMW::FileTask] the task
88
- def file path, &block
89
- path = path.respond_to?(:path) ? path.path : path
90
- self.define_task IMW::FileTask, path, &block
91
- end
92
-
93
- # Return a new (or existing) <tt>IMW::FileCreationTask</tt> with the given
94
- # +path+. Dependencies can be declared and a block passed in just
95
- # as in Rake.
96
- #
97
- # @param [String, IMW::Resource] path the path to the file
98
- # @return [IMW::FileCreationTask] the task
99
- def file_create path, &block
100
- path = path.respond_to?(:path) ? path.path : path
101
- self.define_task IMW::FileCreationTask, path, &block
102
- end
103
-
104
- # Override this method to define default tasks for a subclass of
105
- # IMW::Dataset.
106
- def define_tasks
107
- end
108
-
109
- # The standard IMW workflow steps.
110
- #
111
- # @return [Array] the workflow step names
112
- def workflow_steps
113
- [:rip, :parse, :fix, :package]
114
- end
115
-
116
- # The steps of the IMW workflow each correspond to a directory in
117
- # which it is customary that they deposit their files <em>once
118
- # they are finished processing</em> (so ripped files wind up in
119
- # the +ripd+ directory, packaged files in the +pkgd+ directory,
120
- # and so on).
121
- #
122
- # @return [Array] the workflow directory names
123
- def workflow_dirs
124
- [:ripd, :rawd, :fixd, :pkgd]
125
- end
126
-
127
- protected
128
-
129
- # Convenience method for defining tasks for this workflow.
130
- #
131
- # @param [Hash, Symbol, String] deps the name of the task (if a
132
- # Symbol or String) or the name of the task mapped to an Array of
133
- # dependencies (if a Hash)
134
- # @param [String] comment the comment to associate to the task
135
- # @return [IMW::Task] the task
136
- def define_workflow_task deps, comment, &block
137
- @last_description = comment
138
- define_task(IMW::Task, deps, &block)
139
- end
140
-
141
- # Create all the instance variables required by Rake::TaskManager
142
- # and define default tasks for this dataset.
143
- def initialize_workflow
144
- @tasks = Hash.new
145
- @rules = Array.new
146
- @scope = Array.new
147
- @last_description = nil
148
- @options = OpenStruct.new(DEFAULT_OPTIONS)
149
- define_initialize_task
150
- define_workflow_tasks
151
- define_clean_task
152
- define_tasks
153
- end
154
-
155
- # Defines the <tt>:initialize</tt> task. The only other task
156
- # hooked into <tt>:initialize</tt> is the
157
- # <tt>:create_directories</tt> task which creates the workflow
158
- # directories for this dataset.
159
- def define_initialize_task
160
- define_workflow_task({:create_directories => []}, "Creates workflow directories for this dataset.") do
161
- workflow_dirs.each do |dir|
162
- FileUtils.mkdir_p(path_to(dir)) unless File.exist?(path_to(dir))
163
- end
164
- end
165
- define_workflow_task({ :initialize => [:create_directories] }, "Initialize this dataset.")
166
- end
167
-
168
- # Creates a task <tt>:clean</tt> which removes dataset's
169
- # workflow directories.
170
- def define_clean_task
171
- define_workflow_task :clean, "Remove the workflow directories for this dataset." do
172
- workflow_dirs.each do |dir|
173
- FileUtils.rm_rf(path_to(dir)) if File.exist?(path_to(dir))
174
- end
175
- end
176
- end
177
-
178
- # Creates the task dependency chain <tt>:package => :fix =>
179
- # :parse => :rip => :create_directories</tt> of the
180
- # IMW::Workflow.
181
- def define_workflow_tasks
182
- define_workflow_task({:rip => [:create_directories]}, "Obtain data from some source." )
183
- define_workflow_task({:parse => [:rip]}, "Parse data into a structured form." )
184
- define_workflow_task({:fix => [:parse]}, "Munge parsed data into desired form." )
185
- define_workflow_task({:package => [:fix]}, "Package dataset in final form." )
186
- end
187
-
188
-
189
- def rip(deps=nil, &block); self[:rip].enhance(deps, &block); end
190
- def parse(deps=nil, &block); self[:parse].enhance(deps, &block); end
191
- def fix(deps=nil, &block); self[:fix].enhance(deps, &block); end
192
- def package(deps=nil, &block); self[:package].enhance(deps, &block); end
193
-
194
- end
195
- end