imw 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143) hide show
  1. data/.gitignore +4 -1
  2. data/Rakefile +10 -0
  3. data/TODO +18 -0
  4. data/VERSION +1 -1
  5. data/bin/imw +1 -1
  6. data/etc/imwrc.rb +0 -50
  7. data/examples/dataset.rb +12 -0
  8. data/lib/imw/boot.rb +55 -9
  9. data/lib/imw/dataset/paths.rb +15 -24
  10. data/lib/imw/dataset/workflow.rb +131 -72
  11. data/lib/imw/dataset.rb +94 -186
  12. data/lib/imw/parsers/html_parser.rb +1 -1
  13. data/lib/imw/parsers.rb +1 -1
  14. data/lib/imw/repository.rb +3 -27
  15. data/lib/imw/resource.rb +190 -0
  16. data/lib/imw/resources/archive.rb +97 -0
  17. data/lib/imw/resources/archives_and_compressed/bz2.rb +18 -0
  18. data/lib/imw/resources/archives_and_compressed/gz.rb +18 -0
  19. data/lib/imw/resources/archives_and_compressed/rar.rb +23 -0
  20. data/lib/imw/resources/archives_and_compressed/tar.rb +23 -0
  21. data/lib/imw/resources/archives_and_compressed/tarbz2.rb +78 -0
  22. data/lib/imw/resources/archives_and_compressed/targz.rb +78 -0
  23. data/lib/imw/resources/archives_and_compressed/zip.rb +57 -0
  24. data/lib/imw/resources/archives_and_compressed.rb +32 -0
  25. data/lib/imw/resources/compressed_file.rb +89 -0
  26. data/lib/imw/resources/compressible.rb +77 -0
  27. data/lib/imw/resources/formats/delimited.rb +92 -0
  28. data/lib/imw/resources/formats/excel.rb +125 -0
  29. data/lib/imw/resources/formats/json.rb +53 -0
  30. data/lib/imw/resources/formats/sgml.rb +72 -0
  31. data/lib/imw/resources/formats/yaml.rb +53 -0
  32. data/lib/imw/resources/formats.rb +32 -0
  33. data/lib/imw/resources/local.rb +198 -0
  34. data/lib/imw/resources/remote.rb +110 -0
  35. data/lib/imw/resources/schemes/hdfs.rb +242 -0
  36. data/lib/imw/resources/schemes/http.rb +161 -0
  37. data/lib/imw/resources/schemes/s3.rb +137 -0
  38. data/lib/imw/resources/schemes.rb +19 -0
  39. data/lib/imw/resources.rb +118 -0
  40. data/lib/imw/runner.rb +5 -4
  41. data/lib/imw/transforms/archiver.rb +215 -0
  42. data/lib/imw/transforms/transferer.rb +103 -0
  43. data/lib/imw/transforms.rb +8 -0
  44. data/lib/imw/utils/error.rb +26 -30
  45. data/lib/imw/utils/extensions/array.rb +5 -15
  46. data/lib/imw/utils/extensions/hash.rb +6 -16
  47. data/lib/imw/utils/extensions/hpricot.rb +0 -14
  48. data/lib/imw/utils/extensions/string.rb +5 -15
  49. data/lib/imw/utils/extensions/symbol.rb +0 -13
  50. data/lib/imw/utils/extensions.rb +65 -0
  51. data/lib/imw/utils/log.rb +14 -13
  52. data/lib/imw/utils/misc.rb +0 -6
  53. data/lib/imw/utils/paths.rb +101 -42
  54. data/lib/imw/utils/version.rb +8 -9
  55. data/lib/imw/utils.rb +2 -18
  56. data/lib/imw.rb +92 -17
  57. data/spec/data/sample.csv +1 -1
  58. data/spec/data/sample.json +1 -0
  59. data/spec/data/sample.tsv +1 -1
  60. data/spec/data/sample.txt +1 -1
  61. data/spec/data/sample.xml +1 -1
  62. data/spec/data/sample.yaml +1 -1
  63. data/spec/imw/dataset/paths_spec.rb +32 -0
  64. data/spec/imw/dataset/workflow_spec.rb +41 -0
  65. data/spec/imw/resource_spec.rb +79 -0
  66. data/spec/imw/resources/archive_spec.rb +69 -0
  67. data/spec/imw/resources/archives_and_compressed/bz2_spec.rb +15 -0
  68. data/spec/imw/resources/archives_and_compressed/gz_spec.rb +15 -0
  69. data/spec/imw/resources/archives_and_compressed/rar_spec.rb +16 -0
  70. data/spec/imw/resources/archives_and_compressed/tar_spec.rb +16 -0
  71. data/spec/imw/resources/archives_and_compressed/tarbz2_spec.rb +24 -0
  72. data/spec/imw/resources/archives_and_compressed/targz_spec.rb +21 -0
  73. data/spec/imw/resources/archives_and_compressed/zip_spec.rb +16 -0
  74. data/spec/imw/resources/compressed_file_spec.rb +48 -0
  75. data/spec/imw/resources/compressible_spec.rb +36 -0
  76. data/spec/imw/resources/formats/delimited_spec.rb +33 -0
  77. data/spec/imw/resources/formats/json_spec.rb +32 -0
  78. data/spec/imw/resources/formats/sgml_spec.rb +24 -0
  79. data/spec/imw/resources/formats/yaml_spec.rb +41 -0
  80. data/spec/imw/resources/local_spec.rb +98 -0
  81. data/spec/imw/resources/remote_spec.rb +35 -0
  82. data/spec/imw/resources/schemes/hdfs_spec.rb +61 -0
  83. data/spec/imw/resources/schemes/http_spec.rb +19 -0
  84. data/spec/imw/resources/schemes/s3_spec.rb +19 -0
  85. data/spec/imw/transforms/archiver_spec.rb +120 -0
  86. data/spec/imw/transforms/transferer_spec.rb +113 -0
  87. data/spec/imw/utils/paths_spec.rb +5 -33
  88. data/spec/imw/utils/shared_paths_spec.rb +29 -0
  89. data/spec/spec_helper.rb +5 -5
  90. data/spec/support/paths_matcher.rb +67 -0
  91. data/spec/support/random.rb +39 -36
  92. metadata +88 -75
  93. data/lib/imw/dataset/task.rb +0 -41
  94. data/lib/imw/files/archive.rb +0 -113
  95. data/lib/imw/files/basicfile.rb +0 -122
  96. data/lib/imw/files/binary.rb +0 -28
  97. data/lib/imw/files/compressed_file.rb +0 -93
  98. data/lib/imw/files/compressed_files_and_archives.rb +0 -334
  99. data/lib/imw/files/compressible.rb +0 -103
  100. data/lib/imw/files/csv.rb +0 -113
  101. data/lib/imw/files/directory.rb +0 -62
  102. data/lib/imw/files/excel.rb +0 -84
  103. data/lib/imw/files/json.rb +0 -41
  104. data/lib/imw/files/sgml.rb +0 -46
  105. data/lib/imw/files/text.rb +0 -68
  106. data/lib/imw/files/yaml.rb +0 -46
  107. data/lib/imw/files.rb +0 -125
  108. data/lib/imw/packagers/archiver.rb +0 -126
  109. data/lib/imw/packagers/s3_mover.rb +0 -36
  110. data/lib/imw/packagers.rb +0 -8
  111. data/lib/imw/utils/components.rb +0 -61
  112. data/lib/imw/utils/config.rb +0 -46
  113. data/lib/imw/utils/extensions/class/attribute_accessors.rb +0 -8
  114. data/lib/imw/utils/extensions/core.rb +0 -27
  115. data/lib/imw/utils/extensions/dir.rb +0 -24
  116. data/lib/imw/utils/extensions/file_core.rb +0 -64
  117. data/lib/imw/utils/extensions/typed_struct.rb +0 -22
  118. data/lib/imw/utils/extensions/uri.rb +0 -59
  119. data/lib/imw/utils/view/dump_csv.rb +0 -112
  120. data/lib/imw/utils/view/dump_csv_older.rb +0 -117
  121. data/lib/imw/utils/view.rb +0 -113
  122. data/spec/imw/dataset/datamapper/uri_spec.rb +0 -43
  123. data/spec/imw/dataset/datamapper_spec_helper.rb +0 -11
  124. data/spec/imw/files/archive_spec.rb +0 -118
  125. data/spec/imw/files/basicfile_spec.rb +0 -121
  126. data/spec/imw/files/bz2_spec.rb +0 -32
  127. data/spec/imw/files/compressed_file_spec.rb +0 -96
  128. data/spec/imw/files/compressible_spec.rb +0 -100
  129. data/spec/imw/files/file_spec.rb +0 -144
  130. data/spec/imw/files/gz_spec.rb +0 -32
  131. data/spec/imw/files/rar_spec.rb +0 -33
  132. data/spec/imw/files/tar_spec.rb +0 -31
  133. data/spec/imw/files/text_spec.rb +0 -23
  134. data/spec/imw/files/zip_spec.rb +0 -31
  135. data/spec/imw/files_spec.rb +0 -38
  136. data/spec/imw/packagers/archiver_spec.rb +0 -125
  137. data/spec/imw/packagers/s3_mover_spec.rb +0 -7
  138. data/spec/imw/utils/extensions/file_core_spec.rb +0 -72
  139. data/spec/imw/utils/extensions/find_spec.rb +0 -113
  140. data/spec/imw/workflow/rip/local_spec.rb +0 -89
  141. data/spec/imw/workflow/rip_spec.rb +0 -27
  142. data/spec/support/archive_contents_matcher.rb +0 -94
  143. data/spec/support/directory_contents_matcher.rb +0 -61
data/.gitignore CHANGED
@@ -12,4 +12,7 @@ TAGS
12
12
  tmp/*
13
13
  *.tmproj
14
14
  pkg/*
15
- imw.gemspec
15
+ *gemspec
16
+ tags
17
+ .yardoc/*
18
+ */.yardoc/*
data/Rakefile CHANGED
@@ -18,3 +18,13 @@ begin
18
18
  rescue LoadError
19
19
  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
20
20
  end
21
+
22
+ desc "Build tags"
23
+ task :tags do
24
+ system "etags -R bin etc examples lib spec"
25
+ end
26
+
27
+ desc "Build docs"
28
+ task :docs do
29
+ system "yardoc"
30
+ end
data/TODO ADDED
@@ -0,0 +1,18 @@
1
+ lookup basic yarddoc style (@params, etc) -- do a high-level description
2
+
3
+ learn how to run specs
4
+ write a spec that fails on the old code and passes on the new
5
+
6
+ convert all references to URI to be Addressable::URI
7
+ don't use URI.parse, use Addressable::URI.heuristic_parse (eg in files/*)
8
+ make basicfile methods delegate to its uri
9
+
10
+ tmpdir should use the actual system tmpdir libs (eg in archiver)
11
+ move config over to configliere
12
+
13
+
14
+
15
+
16
+ ------ WANT PONY -----
17
+
18
+ might be nice to learn the delegate pattern
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.1
1
+ 0.2.0
data/bin/imw CHANGED
@@ -1,5 +1,5 @@
1
1
  #!/usr/bin/env ruby
2
2
  $:.unshift File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib'))
3
3
  require 'imw/runner'
4
- exit IMW::Runner.new(*ARGV).run!
4
+ exit IMW::Runner.new(*ARGV.dup).run!
5
5
 
data/etc/imwrc.rb CHANGED
@@ -21,56 +21,6 @@
21
21
  # Copyright:: Copyright (c) 2008 infochimps.org
22
22
  # License:: GPL 3.0
23
23
  # Website:: http://infinitemonkeywrench.org/
24
- #
25
24
 
26
25
  module IMW
27
- PATHS = {
28
- :home => ENV['HOME'],
29
- :data_root => "/var/lib/imw",
30
- :log_root => "/var/log/imw",
31
- :scripts_root => "/usr/share/imw",
32
- :tmp_root => "/tmp/imw",
33
-
34
- # the imw library
35
- :imw_root => File.expand_path(File.dirname(__FILE__) + "/.."),
36
- :imw_bin => [:imw_root, 'bin'],
37
- :imw_etc => [:imw_root, 'etc'],
38
- :imw_lib => [:imw_root, 'lib'],
39
-
40
- # workflow
41
- :ripd_root => [:data_root, 'ripd'],
42
- :peeld_root => [:data_root, 'peeld'],
43
- :mungd_root => [:data_root, 'mungd'],
44
- :temp_root => [:data_root, 'temp'],
45
- :fixd_root => [:data_root, 'fixd'],
46
- :pkgd_root => [:data_root, 'pkgd']
47
- }
48
-
49
- # Default time format.
50
- STRFTIME_FORMAT = "%Y%m%d-%H%M%S" unless defined? STRFTIME_FORMAT
51
-
52
- # Paths to external programs used by IMW.
53
- EXTERNAL_PROGRAMS = {
54
- :tar => "tar",
55
- :rar => "rar",
56
- :zip => "zip",
57
- :unzip => "unzip",
58
- :gzip => "gzip",
59
- :bzip2 => "bzip2",
60
- :wget => "wget"
61
- } unless defined? ::IMW::EXTERNAL_PROGRAMS
62
-
63
- module Files
64
- # Regular expressions which match pathnames to the name of the
65
- # appropriate IMW::Files class.
66
- #
67
- # File class names should be stripped of the leading
68
- # <tt>IMW::Files</tt> prefix, i.e. - the file object
69
- # <tt>IMW::Files::Bz2</tt> should be referenced by the string
70
- # <tt>"Bz2"</tt>.
71
- FILE_REGEXPS = [] unless defined? ::IMW::Files::FILE_REGEXPS
72
- end
73
-
74
26
  end
75
-
76
-
@@ -0,0 +1,12 @@
1
+ require 'imw'
2
+ dataset = IMW::Dataset.new :handle => 'test'
3
+
4
+ dataset.rip do
5
+ IMW.open("http://path/to/somre/resource.html").cp(dataset.path_to(:ripd), 'original_data.html')
6
+ end
7
+
8
+ dataset.parse do
9
+ #...
10
+ end
11
+
12
+
data/lib/imw/boot.rb CHANGED
@@ -1,4 +1,36 @@
1
+ require 'imw/utils/extensions/hash'
2
+
1
3
  module IMW
4
+
5
+ # IMW looks for configuration settings in the following places, in
6
+ # order of increasing precedence:
7
+ #
8
+ # 1. Settings defined directly in this file.
9
+ #
10
+ # 2. From the <tt>etc/imwrc</tt> file in the IMW root directory.
11
+ #
12
+ # 3. From the <tt>.imwrc</tt> file in the user's home directory (the
13
+ # filename can be changed; see
14
+ # <tt>IMW::Config::USER_CONFIG_FILE_BASENAME</tt>).
15
+ #
16
+ # 4. From the file defined by the environment variable +IMWRC+ (the
17
+ # value can be changed; see
18
+ # <tt>IMW::Config::USER_CONFIG_FILE_ENV_VARIABLE</tt>).
19
+ #
20
+ # Settings not found in one configuration location will be searched
21
+ # for in locations of lesser precedence.
22
+ #
23
+ # *Note:* configuration files are plain Ruby code that will be directly
24
+ # evaluated.
25
+ #
26
+ # Relevant settings include
27
+ #
28
+ # * interfaces with external programs (+tar+, +wget+, &c.)
29
+ # * paths to directories where IMW reads/writes files
30
+ # * correspondences between file extensions and IMW file classes
31
+ #
32
+ # For more detailed information, see the default configuration file,
33
+ # <tt>etc/imwrc</tt>.
2
34
  module Config
3
35
 
4
36
  # Root of the IMW source base.
@@ -9,11 +41,12 @@ module IMW
9
41
  #
10
42
  # User configuration file
11
43
  #
12
- # By default, the file ~/.imwrc (.imwrc, in your home directory -- note no .rb extension)
13
- # is sourced at top level. If the $IMWRC environment variable is set,
14
- # that file will be sourced instead.
44
+ # By default, the file ~/.imwrc (.imwrc, in your home directory --
45
+ # note no .rb extension) is sourced at top level. If the $IMWRC
46
+ # environment variable is set, that file will be sourced instead.
15
47
  #
16
- # Any code within this file will override settings in IMW_ROOT/etc/imwrc.rb
48
+ # Any code within this file will override settings in
49
+ # /etc/imwrc.rb which itself overrides IMW_ROOT/etc/imwrc.rb
17
50
  #
18
51
  USER_CONFIG_FILE = File.join(ENV['HOME'] || '', '.imwrc')
19
52
  # Environment variable to override user configuration file location.
@@ -22,16 +55,29 @@ module IMW
22
55
  File.expand_path(ENV[ENV_CONFIG_FILE] || USER_CONFIG_FILE)
23
56
  end
24
57
 
25
- # System-level config file
26
- SITE_CONFIG_FILE = "etc/imwrc.rb"
58
+ # Path to site-wide config file (overwrites IMW defaults but
59
+ # overridden by user defaults).
60
+ SITE_CONFIG_FILE = "/etc/imwrc.rb"
27
61
  def self.site_config_file # :nodoc:
28
- File.join(imw_root, SITE_CONFIG_FILE)
62
+ SITE_CONFIG_FILE
29
63
  end
30
64
 
65
+ def self.default_config_file # :nodoc:
66
+ File.join(imw_root, "etc/imwrc.rb")
67
+ end
68
+
31
69
  # Source the config files
32
70
  def self.load_config
33
- require site_config_file
34
- load user_config_file if File.exist? user_config_file
71
+ if File.exist?(user_config_file)
72
+ load user_config_file
73
+ end
74
+
75
+ if File.exist?(site_config_file)
76
+ load site_config_file
77
+ end
78
+
79
+ load default_config_file
80
+
35
81
  end
36
82
  end
37
83
  end
@@ -1,32 +1,24 @@
1
1
  module IMW
2
-
3
2
  class Dataset
4
3
  include IMW::Paths
5
4
 
6
- # A dataset keeps track of its own collection of paths just like
7
- # IMW itself. When an IMW::Dataset is instantiated in a script,
8
- # that script's directory becomes the dataset's +self+ path and
9
- # the default workflow directories (see IMW::Workflow) are created
10
- # within this directory.
5
+ protected
6
+ # Sets paths to the workflow directories for this dataset (+ripd+,
7
+ # +rawd+, +fixd+, +pkgd+) as well as the following paths:
11
8
  #
12
- # You can change a dataset's paths the same way you can change
13
- # IMW's paths; calling +add_path+ and +remove_path+ on the
14
- # dataset.
9
+ # script::
10
+ # The path to the file the dataset was initialized in.
15
11
  #
16
- # To customize this behavior for all future datasets, created a
17
- # subclass of IMW::Dataset and override the +set_paths+ method.
18
- def paths
19
- @paths
20
- end
21
-
22
- protected
23
- # Sets the roots of various paths relative to this dataset.
24
- def set_root_paths
25
- @paths = {}
26
- add_path :script, File.expand_path(eval('__FILE__'))
27
- add_path :self, File.dirname(path_to(:script))
28
- IMW::Workflow::DIRS.each do |dir|
29
- add_path dir, :self, dir.to_s
12
+ # root::
13
+ # The parent directory of the file the dataset was initialized
14
+ # in or the value of the <tt>:root</tt> key in
15
+ # IMW::Dataset#options
16
+ #
17
+ def set_default_paths
18
+ add_path :script, File.expand_path(eval('__FILE__'))
19
+ add_path :root, options[:root] || File.dirname(path_to(:script))
20
+ workflow_dirs.each do |dir|
21
+ add_path dir, :root, dir.to_s
30
22
  end
31
23
  end
32
24
 
@@ -34,5 +26,4 @@ module IMW
34
26
  def set_paths
35
27
  end
36
28
  end
37
-
38
29
  end
@@ -1,42 +1,62 @@
1
- require 'imw/dataset/task'
2
1
  require 'ostruct'
2
+ require 'rake'
3
3
 
4
4
  module IMW
5
5
 
6
- # IMW encourages you to view a data transformation as a network of
7
- # dependencies. By default, IMW defines five main steps:
6
+ # An IMW version of Rake::Task
7
+ Task = Class.new(Rake::Task)
8
+
9
+ # An IMW subclass of Rake::FileTask
10
+ FileTask = Class.new(Rake::FileTask)
11
+
12
+ # An IMW subclass of Rake::FileCreationTask
13
+ FileCreationTask = Class.new(Rake::FileCreationTask)
14
+
15
+ # IMW encourages you to view a data transformation as a series of
16
+ # interdependent steps.
8
17
  #
9
- # rip::
10
- # Obtain data via HTTP, FTP, SCP, RSYNC, database query, &c.
18
+ # By default, IMW defines four main steps in such a transformation:
19
+ # +rip+, +parse+, +fix+, and +package+.
20
+ #
21
+ # Each step is associated with a directory on disk in which it keeps
22
+ # its files: +ripd+, +rawd+, +fixd+, and +pkgd+.
11
23
  #
12
- # extract::
13
- # Extract data from its ripped form to a form which can be
14
- # parsed.
24
+ # The steps are:
25
+ #
26
+ # rip::
27
+ # Obtain data via HTTP, FTP, SCP, RSYNC, database query, &c and
28
+ # store the results in +ripd+.
15
29
  #
16
30
  # parse::
17
- # Parse data into a structured form.
31
+ # Parse data into a structured form using a library (JSON, YAML,
32
+ # &c.) or using your own parser (XML, flat files, &c.) and store
33
+ # the results in +rawd+.
18
34
  #
19
- # munge::
35
+ # fix::
20
36
  # Combine, filter, reconcile, and transform already structured
21
- # data into a desired form.
37
+ # data into a desired form and store the results in +fixd+.
22
38
  #
23
39
  # package::
24
40
  # Archive, compress, and deliver data in its final form to some
25
- # location (HTTP, FTP, SCP, RSYNC, S3, EBS, &c.).
41
+ # location (HTTP, FTP, SCP, RSYNC, S3, EBS, &c.), optionally
42
+ # storing the output in +pkgd+.
26
43
  #
27
44
  # Each step depends upon the one before it. The steps are blank by
28
45
  # default so there's no need to write code for steps you don't need
29
- # to use.
46
+ # to use. You can also define your own steps (using +task+ just
47
+ # like in Rake) and hook them into these pre-defined steps (or
48
+ # not...).
30
49
  #
31
- # Each step corresponds to a named directory in IMW::Workflow::DIRS.
50
+ # A dataset also has an <tt>:initialize</tt> task (which by default
51
+ # just creates the directories for these steps) which you can use to
52
+ # hook in your own initialization tasks by making it depend on them.
53
+ #
54
+ # A subclass of IMW::Dataset can customize how tasks are defined by
55
+ # overriding +define_workflow_tasks+, among other methods, and
56
+ # introduce new tasks by overriding +define_tasks+.
32
57
  module Workflow
33
58
 
34
- # The <tt>Rake::TaskManager</tt> module allows the
35
- # <tt>IMW::Dataset</tt> class to leverage the functionality of the
36
- # Rake[http://rake.rubyforge.org/] library to manage tasks
37
- # associated with the processing of this dataset.
38
59
  include Rake::TaskManager
39
-
40
60
  # Default options passed to <tt>Rake</tt>. Any class including
41
61
  # the <tt>Rake::TaskManager</tt> module must define a constant by
42
62
  # this name.
@@ -45,51 +65,77 @@ module IMW
45
65
  :trace => false,
46
66
  :verbose => false
47
67
  }
68
+
69
+ # Return a new (or existing) <tt>IMW::Task</tt> with the given
70
+ # +name+. Dependencies can be declared and a block passed in just
71
+ # as in Rake.
72
+ #
73
+ # @param [Hash, Symbol, String] deps the name of the task (if a
74
+ # Symbol or String) or the name of the task mapped to an Array of
75
+ # dependencies (if a Hash)
76
+ #
77
+ # @return [IMW::Task] the task
78
+ def task deps, &block
79
+ self.define_task IMW::Task, deps, &block
80
+ end
81
+
82
+ # Return a new (or existing) <tt>IMW::FileTask</tt> with the given
83
+ # +path+. Dependencies can be declared and a block passed in just
84
+ # as in Rake.
85
+ #
86
+ # @param [String, IMW::Resource] path the path to the file
87
+ # @return [IMW::FileTask] the task
88
+ def file path, &block
89
+ path = path.respond_to?(:path) ? path.path : path
90
+ self.define_task IMW::FileTask, path, &block
91
+ end
92
+
93
+ # Return a new (or existing) <tt>IMW::FileCreationTask</tt> with the given
94
+ # +path+. Dependencies can be declared and a block passed in just
95
+ # as in Rake.
96
+ #
97
+ # @param [String, IMW::Resource] path the path to the file
98
+ # @return [IMW::FileCreationTask] the task
99
+ def file_create path, &block
100
+ path = path.respond_to?(:path) ? path.path : path
101
+ self.define_task IMW::FileCreationTask, path, &block
102
+ end
103
+
104
+ # Override this method to define default tasks for a subclass of
105
+ # IMW::Dataset.
106
+ def define_tasks
107
+ end
48
108
 
49
109
  # The standard IMW workflow steps.
50
- STEPS = [:rip, :extract, :parse, :munge, :package]
110
+ #
111
+ # @return [Array] the workflow step names
112
+ def workflow_steps
113
+ [:rip, :parse, :fix, :package]
114
+ end
51
115
 
52
116
  # The steps of the IMW workflow each correspond to a directory in
53
117
  # which it is customary that they deposit their files <em>once
54
118
  # they are finished processing</em> (so ripped files wind up in
55
119
  # the +ripd+ directory, packaged files in the +pkgd+ directory,
56
120
  # and so on).
57
- DIRS = [:ripd, :xtrd, :prsd, :mungd, :pkgd ]
58
-
59
- # Each workflow step can be configured to take default actions,
60
- # each action being a proc in the array for the step in this hash.
61
121
  #
62
- # This allows classes which include IMW::Workflow to use class
63
- # methods named after each step (+rip+, +parse+, &c.) to directly
64
- # define tasks.
65
- STEPS_TASKS = returning({}) do |steps_procs|
66
- STEPS.each do |step|
67
- steps_procs[step] = []
68
- end
122
+ # @return [Array] the workflow directory names
123
+ def workflow_dirs
124
+ [:ripd, :rawd, :fixd, :pkgd]
69
125
  end
70
126
 
71
127
  protected
72
- def self.included klass
73
- STEPS.each do |step|
74
- klass.class_eval <<EOF
75
- def self.#{step}(deps=nil, &block)
76
- STEPS_TASKS[:#{step}] << [deps, block]
77
- end
78
- EOF
79
- end
80
-
81
-
82
- end
83
-
84
- def define_workflow_task deps, comment
128
+
129
+ # Convenience method for defining tasks for this workflow.
130
+ #
131
+ # @param [Hash, Symbol, String] deps the name of the task (if a
132
+ # Symbol or String) or the name of the task mapped to an Array of
133
+ # dependencies (if a Hash)
134
+ # @param [String] comment the comment to associate to the task
135
+ # @return [IMW::Task] the task
136
+ def define_workflow_task deps, comment, &block
85
137
  @last_description = comment
86
- define_task(IMW::Task, deps)
87
- step = deps.respond_to?(:keys) ? deps.keys.first : deps
88
- STEPS_TASKS[step].each do |deps, block|
89
- self[step].enhance(deps) do
90
- self.instance_eval(&block)
91
- end
92
- end
138
+ define_task(IMW::Task, deps, &block)
93
139
  end
94
140
 
95
141
  # Create all the instance variables required by Rake::TaskManager
@@ -100,43 +146,56 @@ EOF
100
146
  @scope = Array.new
101
147
  @last_description = nil
102
148
  @options = OpenStruct.new(DEFAULT_OPTIONS)
103
- define_create_directories_task
104
- define_workflow_tasks
105
- define_destroy_task
149
+ define_initialize_task
150
+ define_workflow_tasks
151
+ define_workflow_task_methods
152
+ define_clean_task
153
+ define_tasks
106
154
  end
107
155
 
108
- # Creates a task <tt>:create_directories</tt> to create the
109
- # directory structure for this dataset.
110
- def define_create_directories_task
111
- @last_description = "Creates workflow directories for this dataset."
112
- define_task(IMW::Task, {:create_directories => []}) do
113
- DIRS.each do |dir|
156
+ # Defines the <tt>:initialize</tt> task. The only other task
157
+ # hooked into <tt>:initialize</tt> is the
158
+ # <tt>:create_directories</tt> task which creates the workflow
159
+ # directories for this dataset.
160
+ def define_initialize_task
161
+ define_workflow_task({:create_directories => []}, "Creates workflow directories for this dataset.") do
162
+ workflow_dirs.each do |dir|
114
163
  FileUtils.mkdir_p(path_to(dir)) unless File.exist?(path_to(dir))
115
164
  end
116
165
  end
166
+ define_workflow_task({ :initialize => [:create_directories] }, "Initialize this dataset.")
117
167
  end
118
168
 
119
- # Creates a task <tt>:destroy</tt> which removes dataset's
169
+ # Creates a task <tt>:clean</tt> which removes dataset's
120
170
  # workflow directories.
121
- def define_destroy_task
122
- @last_description = "Get rid of all traces of this dataset."
123
- define_task(IMW::Task, :destroy => [:create_directories]) do
124
- DIRS.each do |dir|
125
- FileUtils.rm_rf(path_to(dir))
171
+ def define_clean_task
172
+ define_workflow_task :clean, "Remove the workflow directories for this dataset." do
173
+ workflow_dirs.each do |dir|
174
+ FileUtils.rm_rf(path_to(dir)) if File.exist?(path_to(dir))
126
175
  end
127
176
  end
128
177
  end
129
178
 
130
- # Creates the task dependency chain <tt>:package => :munge =>
131
- # :parse => :extract => :rip => :initialize</tt> of the
179
+ # Creates the task dependency chain <tt>:package => :fix =>
180
+ # :parse => :rip => :create_directories</tt> of the
132
181
  # IMW::Workflow.
133
182
  def define_workflow_tasks
134
183
  define_workflow_task({:rip => [:create_directories]}, "Obtain data from some source." )
135
- define_workflow_task({:extract => [:rip]}, "Extract data so it's ready to parse." )
136
- define_workflow_task({:parse => [:extract]}, "Parse data into a structured form." )
137
- define_workflow_task({:munge => [:parse]}, "Munge structured data into desired form.")
138
- define_workflow_task({:package => [:munge]}, "Package dataset in final form." )
184
+ define_workflow_task({:parse => [:rip]}, "Parse data into a structured form." )
185
+ define_workflow_task({:fix => [:parse]}, "Munge parsed data into desired form." )
186
+ define_workflow_task({:package => [:fix]}, "Package dataset in final form." )
139
187
  end
140
188
 
189
+ # Dynamically define methods for each of the workflow steps which
190
+ # act as shortcuts for accessing the corresponding tasks.
191
+ def define_workflow_task_methods
192
+ workflow_steps.each do |step|
193
+ self.class.class_eval <<RUBY
194
+ def #{step} deps, &block
195
+ self[step].enhance(step => deps, &block)
196
+ end
197
+ RUBY
198
+ end
199
+ end
141
200
  end
142
201
  end