imw 0.1.1 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (143) hide show
  1. data/.gitignore +4 -1
  2. data/Rakefile +10 -0
  3. data/TODO +18 -0
  4. data/VERSION +1 -1
  5. data/bin/imw +1 -1
  6. data/etc/imwrc.rb +0 -50
  7. data/examples/dataset.rb +12 -0
  8. data/lib/imw/boot.rb +55 -9
  9. data/lib/imw/dataset/paths.rb +15 -24
  10. data/lib/imw/dataset/workflow.rb +131 -72
  11. data/lib/imw/dataset.rb +94 -186
  12. data/lib/imw/parsers/html_parser.rb +1 -1
  13. data/lib/imw/parsers.rb +1 -1
  14. data/lib/imw/repository.rb +3 -27
  15. data/lib/imw/resource.rb +190 -0
  16. data/lib/imw/resources/archive.rb +97 -0
  17. data/lib/imw/resources/archives_and_compressed/bz2.rb +18 -0
  18. data/lib/imw/resources/archives_and_compressed/gz.rb +18 -0
  19. data/lib/imw/resources/archives_and_compressed/rar.rb +23 -0
  20. data/lib/imw/resources/archives_and_compressed/tar.rb +23 -0
  21. data/lib/imw/resources/archives_and_compressed/tarbz2.rb +78 -0
  22. data/lib/imw/resources/archives_and_compressed/targz.rb +78 -0
  23. data/lib/imw/resources/archives_and_compressed/zip.rb +57 -0
  24. data/lib/imw/resources/archives_and_compressed.rb +32 -0
  25. data/lib/imw/resources/compressed_file.rb +89 -0
  26. data/lib/imw/resources/compressible.rb +77 -0
  27. data/lib/imw/resources/formats/delimited.rb +92 -0
  28. data/lib/imw/resources/formats/excel.rb +125 -0
  29. data/lib/imw/resources/formats/json.rb +53 -0
  30. data/lib/imw/resources/formats/sgml.rb +72 -0
  31. data/lib/imw/resources/formats/yaml.rb +53 -0
  32. data/lib/imw/resources/formats.rb +32 -0
  33. data/lib/imw/resources/local.rb +198 -0
  34. data/lib/imw/resources/remote.rb +110 -0
  35. data/lib/imw/resources/schemes/hdfs.rb +242 -0
  36. data/lib/imw/resources/schemes/http.rb +161 -0
  37. data/lib/imw/resources/schemes/s3.rb +137 -0
  38. data/lib/imw/resources/schemes.rb +19 -0
  39. data/lib/imw/resources.rb +118 -0
  40. data/lib/imw/runner.rb +5 -4
  41. data/lib/imw/transforms/archiver.rb +215 -0
  42. data/lib/imw/transforms/transferer.rb +103 -0
  43. data/lib/imw/transforms.rb +8 -0
  44. data/lib/imw/utils/error.rb +26 -30
  45. data/lib/imw/utils/extensions/array.rb +5 -15
  46. data/lib/imw/utils/extensions/hash.rb +6 -16
  47. data/lib/imw/utils/extensions/hpricot.rb +0 -14
  48. data/lib/imw/utils/extensions/string.rb +5 -15
  49. data/lib/imw/utils/extensions/symbol.rb +0 -13
  50. data/lib/imw/utils/extensions.rb +65 -0
  51. data/lib/imw/utils/log.rb +14 -13
  52. data/lib/imw/utils/misc.rb +0 -6
  53. data/lib/imw/utils/paths.rb +101 -42
  54. data/lib/imw/utils/version.rb +8 -9
  55. data/lib/imw/utils.rb +2 -18
  56. data/lib/imw.rb +92 -17
  57. data/spec/data/sample.csv +1 -1
  58. data/spec/data/sample.json +1 -0
  59. data/spec/data/sample.tsv +1 -1
  60. data/spec/data/sample.txt +1 -1
  61. data/spec/data/sample.xml +1 -1
  62. data/spec/data/sample.yaml +1 -1
  63. data/spec/imw/dataset/paths_spec.rb +32 -0
  64. data/spec/imw/dataset/workflow_spec.rb +41 -0
  65. data/spec/imw/resource_spec.rb +79 -0
  66. data/spec/imw/resources/archive_spec.rb +69 -0
  67. data/spec/imw/resources/archives_and_compressed/bz2_spec.rb +15 -0
  68. data/spec/imw/resources/archives_and_compressed/gz_spec.rb +15 -0
  69. data/spec/imw/resources/archives_and_compressed/rar_spec.rb +16 -0
  70. data/spec/imw/resources/archives_and_compressed/tar_spec.rb +16 -0
  71. data/spec/imw/resources/archives_and_compressed/tarbz2_spec.rb +24 -0
  72. data/spec/imw/resources/archives_and_compressed/targz_spec.rb +21 -0
  73. data/spec/imw/resources/archives_and_compressed/zip_spec.rb +16 -0
  74. data/spec/imw/resources/compressed_file_spec.rb +48 -0
  75. data/spec/imw/resources/compressible_spec.rb +36 -0
  76. data/spec/imw/resources/formats/delimited_spec.rb +33 -0
  77. data/spec/imw/resources/formats/json_spec.rb +32 -0
  78. data/spec/imw/resources/formats/sgml_spec.rb +24 -0
  79. data/spec/imw/resources/formats/yaml_spec.rb +41 -0
  80. data/spec/imw/resources/local_spec.rb +98 -0
  81. data/spec/imw/resources/remote_spec.rb +35 -0
  82. data/spec/imw/resources/schemes/hdfs_spec.rb +61 -0
  83. data/spec/imw/resources/schemes/http_spec.rb +19 -0
  84. data/spec/imw/resources/schemes/s3_spec.rb +19 -0
  85. data/spec/imw/transforms/archiver_spec.rb +120 -0
  86. data/spec/imw/transforms/transferer_spec.rb +113 -0
  87. data/spec/imw/utils/paths_spec.rb +5 -33
  88. data/spec/imw/utils/shared_paths_spec.rb +29 -0
  89. data/spec/spec_helper.rb +5 -5
  90. data/spec/support/paths_matcher.rb +67 -0
  91. data/spec/support/random.rb +39 -36
  92. metadata +88 -75
  93. data/lib/imw/dataset/task.rb +0 -41
  94. data/lib/imw/files/archive.rb +0 -113
  95. data/lib/imw/files/basicfile.rb +0 -122
  96. data/lib/imw/files/binary.rb +0 -28
  97. data/lib/imw/files/compressed_file.rb +0 -93
  98. data/lib/imw/files/compressed_files_and_archives.rb +0 -334
  99. data/lib/imw/files/compressible.rb +0 -103
  100. data/lib/imw/files/csv.rb +0 -113
  101. data/lib/imw/files/directory.rb +0 -62
  102. data/lib/imw/files/excel.rb +0 -84
  103. data/lib/imw/files/json.rb +0 -41
  104. data/lib/imw/files/sgml.rb +0 -46
  105. data/lib/imw/files/text.rb +0 -68
  106. data/lib/imw/files/yaml.rb +0 -46
  107. data/lib/imw/files.rb +0 -125
  108. data/lib/imw/packagers/archiver.rb +0 -126
  109. data/lib/imw/packagers/s3_mover.rb +0 -36
  110. data/lib/imw/packagers.rb +0 -8
  111. data/lib/imw/utils/components.rb +0 -61
  112. data/lib/imw/utils/config.rb +0 -46
  113. data/lib/imw/utils/extensions/class/attribute_accessors.rb +0 -8
  114. data/lib/imw/utils/extensions/core.rb +0 -27
  115. data/lib/imw/utils/extensions/dir.rb +0 -24
  116. data/lib/imw/utils/extensions/file_core.rb +0 -64
  117. data/lib/imw/utils/extensions/typed_struct.rb +0 -22
  118. data/lib/imw/utils/extensions/uri.rb +0 -59
  119. data/lib/imw/utils/view/dump_csv.rb +0 -112
  120. data/lib/imw/utils/view/dump_csv_older.rb +0 -117
  121. data/lib/imw/utils/view.rb +0 -113
  122. data/spec/imw/dataset/datamapper/uri_spec.rb +0 -43
  123. data/spec/imw/dataset/datamapper_spec_helper.rb +0 -11
  124. data/spec/imw/files/archive_spec.rb +0 -118
  125. data/spec/imw/files/basicfile_spec.rb +0 -121
  126. data/spec/imw/files/bz2_spec.rb +0 -32
  127. data/spec/imw/files/compressed_file_spec.rb +0 -96
  128. data/spec/imw/files/compressible_spec.rb +0 -100
  129. data/spec/imw/files/file_spec.rb +0 -144
  130. data/spec/imw/files/gz_spec.rb +0 -32
  131. data/spec/imw/files/rar_spec.rb +0 -33
  132. data/spec/imw/files/tar_spec.rb +0 -31
  133. data/spec/imw/files/text_spec.rb +0 -23
  134. data/spec/imw/files/zip_spec.rb +0 -31
  135. data/spec/imw/files_spec.rb +0 -38
  136. data/spec/imw/packagers/archiver_spec.rb +0 -125
  137. data/spec/imw/packagers/s3_mover_spec.rb +0 -7
  138. data/spec/imw/utils/extensions/file_core_spec.rb +0 -72
  139. data/spec/imw/utils/extensions/find_spec.rb +0 -113
  140. data/spec/imw/workflow/rip/local_spec.rb +0 -89
  141. data/spec/imw/workflow/rip_spec.rb +0 -27
  142. data/spec/support/archive_contents_matcher.rb +0 -94
  143. data/spec/support/directory_contents_matcher.rb +0 -61
data/.gitignore CHANGED
@@ -12,4 +12,7 @@ TAGS
12
12
  tmp/*
13
13
  *.tmproj
14
14
  pkg/*
15
- imw.gemspec
15
+ *gemspec
16
+ tags
17
+ .yardoc/*
18
+ */.yardoc/*
data/Rakefile CHANGED
@@ -18,3 +18,13 @@ begin
18
18
  rescue LoadError
19
19
  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
20
20
  end
21
+
22
+ desc "Build tags"
23
+ task :tags do
24
+ system "etags -R bin etc examples lib spec"
25
+ end
26
+
27
+ desc "Build docs"
28
+ task :docs do
29
+ system "yardoc"
30
+ end
data/TODO ADDED
@@ -0,0 +1,18 @@
1
+ lookup basic yarddoc style (@params, etc) -- do a high-level description
2
+
3
+ learn how to run specs
4
+ write a spec that fails on the old code and passes on the new
5
+
6
+ convert all references to URI to be Addressable::URI
7
+ don't use URI.parse, use Addressable::URI.heuristic_parse (eg in files/*)
8
+ make basicfile methods delegate to its uri
9
+
10
+ tmpdir should use the actual system tmpdir libs (eg in archiver)
11
+ move config over to configliere
12
+
13
+
14
+
15
+
16
+ ------ WANT PONY -----
17
+
18
+ might be nice to learn the delegate pattern
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.1
1
+ 0.2.0
data/bin/imw CHANGED
@@ -1,5 +1,5 @@
1
1
  #!/usr/bin/env ruby
2
2
  $:.unshift File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib'))
3
3
  require 'imw/runner'
4
- exit IMW::Runner.new(*ARGV).run!
4
+ exit IMW::Runner.new(*ARGV.dup).run!
5
5
 
data/etc/imwrc.rb CHANGED
@@ -21,56 +21,6 @@
21
21
  # Copyright:: Copyright (c) 2008 infochimps.org
22
22
  # License:: GPL 3.0
23
23
  # Website:: http://infinitemonkeywrench.org/
24
- #
25
24
 
26
25
  module IMW
27
- PATHS = {
28
- :home => ENV['HOME'],
29
- :data_root => "/var/lib/imw",
30
- :log_root => "/var/log/imw",
31
- :scripts_root => "/usr/share/imw",
32
- :tmp_root => "/tmp/imw",
33
-
34
- # the imw library
35
- :imw_root => File.expand_path(File.dirname(__FILE__) + "/.."),
36
- :imw_bin => [:imw_root, 'bin'],
37
- :imw_etc => [:imw_root, 'etc'],
38
- :imw_lib => [:imw_root, 'lib'],
39
-
40
- # workflow
41
- :ripd_root => [:data_root, 'ripd'],
42
- :peeld_root => [:data_root, 'peeld'],
43
- :mungd_root => [:data_root, 'mungd'],
44
- :temp_root => [:data_root, 'temp'],
45
- :fixd_root => [:data_root, 'fixd'],
46
- :pkgd_root => [:data_root, 'pkgd']
47
- }
48
-
49
- # Default time format.
50
- STRFTIME_FORMAT = "%Y%m%d-%H%M%S" unless defined? STRFTIME_FORMAT
51
-
52
- # Paths to external programs used by IMW.
53
- EXTERNAL_PROGRAMS = {
54
- :tar => "tar",
55
- :rar => "rar",
56
- :zip => "zip",
57
- :unzip => "unzip",
58
- :gzip => "gzip",
59
- :bzip2 => "bzip2",
60
- :wget => "wget"
61
- } unless defined? ::IMW::EXTERNAL_PROGRAMS
62
-
63
- module Files
64
- # Regular expressions which match pathnames to the name of the
65
- # appropriate IMW::Files class.
66
- #
67
- # File class names should be stripped of the leading
68
- # <tt>IMW::Files</tt> prefix, i.e. - the file object
69
- # <tt>IMW::Files::Bz2</tt> should be referenced by the string
70
- # <tt>"Bz2"</tt>.
71
- FILE_REGEXPS = [] unless defined? ::IMW::Files::FILE_REGEXPS
72
- end
73
-
74
26
  end
75
-
76
-
@@ -0,0 +1,12 @@
1
+ require 'imw'
2
+ dataset = IMW::Dataset.new :handle => 'test'
3
+
4
+ dataset.rip do
5
+ IMW.open("http://path/to/somre/resource.html").cp(dataset.path_to(:ripd), 'original_data.html')
6
+ end
7
+
8
+ dataset.parse do
9
+ #...
10
+ end
11
+
12
+
data/lib/imw/boot.rb CHANGED
@@ -1,4 +1,36 @@
1
+ require 'imw/utils/extensions/hash'
2
+
1
3
  module IMW
4
+
5
+ # IMW looks for configuration settings in the following places, in
6
+ # order of increasing precedence:
7
+ #
8
+ # 1. Settings defined directly in this file.
9
+ #
10
+ # 2. From the <tt>etc/imwrc</tt> file in the IMW root directory.
11
+ #
12
+ # 3. From the <tt>.imwrc</tt> file in the user's home directory (the
13
+ # filename can be changed; see
14
+ # <tt>IMW::Config::USER_CONFIG_FILE_BASENAME</tt>).
15
+ #
16
+ # 4. From the file defined by the environment variable +IMWRC+ (the
17
+ # value can be changed; see
18
+ # <tt>IMW::Config::USER_CONFIG_FILE_ENV_VARIABLE</tt>).
19
+ #
20
+ # Settings not found in one configuration location will be searched
21
+ # for in locations of lesser precedence.
22
+ #
23
+ # *Note:* configuration files are plain Ruby code that will be directly
24
+ # evaluated.
25
+ #
26
+ # Relevant settings include
27
+ #
28
+ # * interfaces with external programs (+tar+, +wget+, &c.)
29
+ # * paths to directories where IMW reads/writes files
30
+ # * correspondences between file extensions and IMW file classes
31
+ #
32
+ # For more detailed information, see the default configuration file,
33
+ # <tt>etc/imwrc</tt>.
2
34
  module Config
3
35
 
4
36
  # Root of the IMW source base.
@@ -9,11 +41,12 @@ module IMW
9
41
  #
10
42
  # User configuration file
11
43
  #
12
- # By default, the file ~/.imwrc (.imwrc, in your home directory -- note no .rb extension)
13
- # is sourced at top level. If the $IMWRC environment variable is set,
14
- # that file will be sourced instead.
44
+ # By default, the file ~/.imwrc (.imwrc, in your home directory --
45
+ # note no .rb extension) is sourced at top level. If the $IMWRC
46
+ # environment variable is set, that file will be sourced instead.
15
47
  #
16
- # Any code within this file will override settings in IMW_ROOT/etc/imwrc.rb
48
+ # Any code within this file will override settings in
49
+ # /etc/imwrc.rb which itself overrides IMW_ROOT/etc/imwrc.rb
17
50
  #
18
51
  USER_CONFIG_FILE = File.join(ENV['HOME'] || '', '.imwrc')
19
52
  # Environment variable to override user configuration file location.
@@ -22,16 +55,29 @@ module IMW
22
55
  File.expand_path(ENV[ENV_CONFIG_FILE] || USER_CONFIG_FILE)
23
56
  end
24
57
 
25
- # System-level config file
26
- SITE_CONFIG_FILE = "etc/imwrc.rb"
58
+ # Path to site-wide config file (overrides IMW defaults but
59
+ # overridden by user defaults).
60
+ SITE_CONFIG_FILE = "/etc/imwrc.rb"
27
61
  def self.site_config_file # :nodoc:
28
- File.join(imw_root, SITE_CONFIG_FILE)
62
+ SITE_CONFIG_FILE
29
63
  end
30
64
 
65
+ def self.default_config_file # :nodoc:
66
+ File.join(imw_root, "etc/imwrc.rb")
67
+ end
68
+
31
69
  # Source the config files
32
70
  def self.load_config
33
- require site_config_file
34
- load user_config_file if File.exist? user_config_file
71
+ if File.exist?(user_config_file)
72
+ load user_config_file
73
+ end
74
+
75
+ if File.exist?(site_config_file)
76
+ load site_config_file
77
+ end
78
+
79
+ load default_config_file
80
+
35
81
  end
36
82
  end
37
83
  end
@@ -1,32 +1,24 @@
1
1
  module IMW
2
-
3
2
  class Dataset
4
3
  include IMW::Paths
5
4
 
6
- # A dataset keeps track of its own collection of paths just like
7
- # IMW itself. When an IMW::Dataset is instantiated in a script,
8
- # that script's directory becomes the dataset's +self+ path and
9
- # the default workflow directories (see IMW::Workflow) are created
10
- # within this directory.
5
+ protected
6
+ # Sets paths to the workflow directories for this dataset (+ripd+,
7
+ # +rawd+, +fixd+, +pkgd+) as well as the following paths:
11
8
  #
12
- # You can change a dataset's paths the same way you can change
13
- # IMW's paths; calling +add_path+ and +remove_path+ on the
14
- # dataset.
9
+ # script::
10
+ # The path to the file the dataset was initialized in.
15
11
  #
16
- # To customize this behavior for all future datasets, created a
17
- # subclass of IMW::Dataset and override the +set_paths+ method.
18
- def paths
19
- @paths
20
- end
21
-
22
- protected
23
- # Sets the roots of various paths relative to this dataset.
24
- def set_root_paths
25
- @paths = {}
26
- add_path :script, File.expand_path(eval('__FILE__'))
27
- add_path :self, File.dirname(path_to(:script))
28
- IMW::Workflow::DIRS.each do |dir|
29
- add_path dir, :self, dir.to_s
12
+ # root::
13
+ # The parent directory of the file the dataset was initialized
14
+ # in or the value of the <tt>:root</tt> key in
15
+ # IMW::Dataset#options
16
+ #
17
+ def set_default_paths
18
+ add_path :script, File.expand_path(eval('__FILE__'))
19
+ add_path :root, options[:root] || File.dirname(path_to(:script))
20
+ workflow_dirs.each do |dir|
21
+ add_path dir, :root, dir.to_s
30
22
  end
31
23
  end
32
24
 
@@ -34,5 +26,4 @@ module IMW
34
26
  def set_paths
35
27
  end
36
28
  end
37
-
38
29
  end
@@ -1,42 +1,62 @@
1
- require 'imw/dataset/task'
2
1
  require 'ostruct'
2
+ require 'rake'
3
3
 
4
4
  module IMW
5
5
 
6
- # IMW encourages you to view a data transformation as a network of
7
- # dependencies. By default, IMW defines five main steps:
6
+ # An IMW version of Rake::Task
7
+ Task = Class.new(Rake::Task)
8
+
9
+ # An IMW subclass of Rake::FileTask
10
+ FileTask = Class.new(Rake::FileTask)
11
+
12
+ # An IMW subclass of Rake::FileCreationTask
13
+ FileCreationTask = Class.new(Rake::FileCreationTask)
14
+
15
+ # IMW encourages you to view a data transformation as a series of
16
+ # interdependent steps.
8
17
  #
9
- # rip::
10
- # Obtain data via HTTP, FTP, SCP, RSYNC, database query, &c.
18
+ # By default, IMW defines four main steps in such a transformation:
19
+ # +rip+, +parse+, +fix+, and +package+.
20
+ #
21
+ # Each step is associated with a directory on disk in which it keeps
22
+ # its files: +ripd+, +rawd+, +fixd+, and +pkgd+.
11
23
  #
12
- # extract::
13
- # Extract data from its ripped form to a form which can be
14
- # parsed.
24
+ # The steps are:
25
+ #
26
+ # rip::
27
+ # Obtain data via HTTP, FTP, SCP, RSYNC, database query, &c and
28
+ # store the results in +ripd+.
15
29
  #
16
30
  # parse::
17
- # Parse data into a structured form.
31
+ # Parse data into a structured form using a library (JSON, YAML,
32
+ # &c.) or using your own parser (XML, flat files, &c.) and store
33
+ # the results in +rawd+.
18
34
  #
19
- # munge::
35
+ # fix::
20
36
  # Combine, filter, reconcile, and transform already structured
21
- # data into a desired form.
37
+ # data into a desired form and store the results in +fixd+.
22
38
  #
23
39
  # package::
24
40
  # Archive, compress, and deliver data in its final form to some
25
- # location (HTTP, FTP, SCP, RSYNC, S3, EBS, &c.).
41
+ # location (HTTP, FTP, SCP, RSYNC, S3, EBS, &c.), optionally
42
+ # storing the output in +pkgd+.
26
43
  #
27
44
  # Each step depends upon the one before it. The steps are blank by
28
45
  # default so there's no need to write code for steps you don't need
29
- # to use.
46
+ # to use. You can also define your own steps (using +task+ just
47
+ # like in Rake) and hook them into these pre-defined steps (or
48
+ # not...).
30
49
  #
31
- # Each step corresponds to a named directory in IMW::Workflow::DIRS.
50
+ # A dataset also has an <tt>:initialize</tt> task (which by default
51
+ # just creates the directories for these steps) which you can use to
52
+ # hook in your own initialization tasks by making it depend on them.
53
+ #
54
+ # A subclass of IMW::Dataset can customize how tasks are defined by
55
+ # overriding +define_workflow_tasks+, among other methods, and
56
+ # introduce new tasks by overriding +define_tasks+.
32
57
  module Workflow
33
58
 
34
- # The <tt>Rake::TaskManager</tt> module allows the
35
- # <tt>IMW::Dataset</tt> class to leverage the functionality of the
36
- # Rake[http://rake.rubyforge.org/] library to manage tasks
37
- # associated with the processing of this dataset.
38
59
  include Rake::TaskManager
39
-
40
60
  # Default options passed to <tt>Rake</tt>. Any class including
41
61
  # the <tt>Rake::TaskManager</tt> module must define a constant by
42
62
  # this name.
@@ -45,51 +65,77 @@ module IMW
45
65
  :trace => false,
46
66
  :verbose => false
47
67
  }
68
+
69
+ # Return a new (or existing) <tt>IMW::Task</tt> with the given
70
+ # +name+. Dependencies can be declared and a block passed in just
71
+ # as in Rake.
72
+ #
73
+ # @param [Hash, Symbol, String] deps the name of the task (if a
74
+ # Symbol or String) or the name of the task mapped to an Array of
75
+ # dependencies (if a Hash)
76
+ #
77
+ # @return [IMW::Task] the task
78
+ def task deps, &block
79
+ self.define_task IMW::Task, deps, &block
80
+ end
81
+
82
+ # Return a new (or existing) <tt>IMW::FileTask</tt> with the given
83
+ # +path+. Dependencies can be declared and a block passed in just
84
+ # as in Rake.
85
+ #
86
+ # @param [String, IMW::Resource] path the path to the file
87
+ # @return [IMW::FileTask] the task
88
+ def file path, &block
89
+ path = path.respond_to?(:path) ? path.path : path
90
+ self.define_task IMW::FileTask, path, &block
91
+ end
92
+
93
+ # Return a new (or existing) <tt>IMW::FileCreationTask</tt> with the given
94
+ # +path+. Dependencies can be declared and a block passed in just
95
+ # as in Rake.
96
+ #
97
+ # @param [String, IMW::Resource] path the path to the file
98
+ # @return [IMW::FileCreationTask] the task
99
+ def file_create path, &block
100
+ path = path.respond_to?(:path) ? path.path : path
101
+ self.define_task IMW::FileCreationTask, path, &block
102
+ end
103
+
104
+ # Override this method to define default tasks for a subclass of
105
+ # IMW::Dataset.
106
+ def define_tasks
107
+ end
48
108
 
49
109
  # The standard IMW workflow steps.
50
- STEPS = [:rip, :extract, :parse, :munge, :package]
110
+ #
111
+ # @return [Array] the workflow step names
112
+ def workflow_steps
113
+ [:rip, :parse, :fix, :package]
114
+ end
51
115
 
52
116
  # The steps of the IMW workflow each correspond to a directory in
53
117
  # which it is customary that they deposit their files <em>once
54
118
  # they are finished processing</em> (so ripped files wind up in
55
119
  # the +ripd+ directory, packaged files in the +pkgd+ directory,
56
120
  # and so on).
57
- DIRS = [:ripd, :xtrd, :prsd, :mungd, :pkgd ]
58
-
59
- # Each workflow step can be configured to take default actions,
60
- # each action being a proc in the array for the step in this hash.
61
121
  #
62
- # This allows classes which include IMW::Workflow to use class
63
- # methods named after each step (+rip+, +parse+, &c.) to directly
64
- # define tasks.
65
- STEPS_TASKS = returning({}) do |steps_procs|
66
- STEPS.each do |step|
67
- steps_procs[step] = []
68
- end
122
+ # @return [Array] the workflow directory names
123
+ def workflow_dirs
124
+ [:ripd, :rawd, :fixd, :pkgd]
69
125
  end
70
126
 
71
127
  protected
72
- def self.included klass
73
- STEPS.each do |step|
74
- klass.class_eval <<EOF
75
- def self.#{step}(deps=nil, &block)
76
- STEPS_TASKS[:#{step}] << [deps, block]
77
- end
78
- EOF
79
- end
80
-
81
-
82
- end
83
-
84
- def define_workflow_task deps, comment
128
+
129
+ # Convenience method for defining tasks for this workflow.
130
+ #
131
+ # @param [Hash, Symbol, String] deps the name of the task (if a
132
+ # Symbol or String) or the name of the task mapped to an Array of
133
+ # dependencies (if a Hash)
134
+ # @param [String] comment the comment to associate to the task
135
+ # @return [IMW::Task] the task
136
+ def define_workflow_task deps, comment, &block
85
137
  @last_description = comment
86
- define_task(IMW::Task, deps)
87
- step = deps.respond_to?(:keys) ? deps.keys.first : deps
88
- STEPS_TASKS[step].each do |deps, block|
89
- self[step].enhance(deps) do
90
- self.instance_eval(&block)
91
- end
92
- end
138
+ define_task(IMW::Task, deps, &block)
93
139
  end
94
140
 
95
141
  # Create all the instance variables required by Rake::TaskManager
@@ -100,43 +146,56 @@ EOF
100
146
  @scope = Array.new
101
147
  @last_description = nil
102
148
  @options = OpenStruct.new(DEFAULT_OPTIONS)
103
- define_create_directories_task
104
- define_workflow_tasks
105
- define_destroy_task
149
+ define_initialize_task
150
+ define_workflow_tasks
151
+ define_workflow_task_methods
152
+ define_clean_task
153
+ define_tasks
106
154
  end
107
155
 
108
- # Creates a task <tt>:create_directories</tt> to create the
109
- # directory structure for this dataset.
110
- def define_create_directories_task
111
- @last_description = "Creates workflow directories for this dataset."
112
- define_task(IMW::Task, {:create_directories => []}) do
113
- DIRS.each do |dir|
156
+ # Defines the <tt>:initialize</tt> task. The only other task
157
+ # hooked into <tt>:initialize</tt> is the
158
+ # <tt>:create_workflow_dirs</tt> task which creates the workflow
159
+ # directories for this dataset.
160
+ def define_initialize_task
161
+ define_workflow_task({:create_directories => []}, "Creates workflow directories for this dataset.") do
162
+ workflow_dirs.each do |dir|
114
163
  FileUtils.mkdir_p(path_to(dir)) unless File.exist?(path_to(dir))
115
164
  end
116
165
  end
166
+ define_workflow_task({ :initialize => [:create_directories] }, "Initialize this dataset.")
117
167
  end
118
168
 
119
- # Creates a task <tt>:destroy</tt> which removes dataset's
169
+ # Creates a task <tt>:clean</tt> which removes dataset's
120
170
  # workflow directories.
121
- def define_destroy_task
122
- @last_description = "Get rid of all traces of this dataset."
123
- define_task(IMW::Task, :destroy => [:create_directories]) do
124
- DIRS.each do |dir|
125
- FileUtils.rm_rf(path_to(dir))
171
+ def define_clean_task
172
+ define_workflow_task :clean, "Remove the workflow directories for this dataset." do
173
+ workflow_dirs.each do |dir|
174
+ FileUtils.rm_rf(path_to(dir)) if File.exist?(path_to(dir))
126
175
  end
127
176
  end
128
177
  end
129
178
 
130
- # Creates the task dependency chain <tt>:package => :munge =>
131
- # :parse => :extract => :rip => :initialize</tt> of the
179
+ # Creates the task dependency chain <tt>:package => :fix =>
180
+ # :parse => :rip => :create_directories</tt> of the
132
181
  # IMW::Workflow.
133
182
  def define_workflow_tasks
134
183
  define_workflow_task({:rip => [:create_directories]}, "Obtain data from some source." )
135
- define_workflow_task({:extract => [:rip]}, "Extract data so it's ready to parse." )
136
- define_workflow_task({:parse => [:extract]}, "Parse data into a structured form." )
137
- define_workflow_task({:munge => [:parse]}, "Munge structured data into desired form.")
138
- define_workflow_task({:package => [:munge]}, "Package dataset in final form." )
184
+ define_workflow_task({:parse => [:rip]}, "Parse data into a structured form." )
185
+ define_workflow_task({:fix => [:parse]}, "Munge parsed data into desired form." )
186
+ define_workflow_task({:package => [:fix]}, "Package dataset in final form." )
139
187
  end
140
188
 
189
+ # Dynamically define methods for each of the workflow steps which
190
+ # act as shortcuts for accessing the corresponding tasks.
191
+ def define_workflow_task_methods
192
+ workflow_steps.each do |step|
193
+ self.class.class_eval <<RUBY
194
+ def #{step} deps, &block
195
+ self[step].enhance(step => deps, &block)
196
+ end
197
+ RUBY
198
+ end
199
+ end
141
200
  end
142
201
  end