imw 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +4 -1
- data/Rakefile +10 -0
- data/TODO +18 -0
- data/VERSION +1 -1
- data/bin/imw +1 -1
- data/etc/imwrc.rb +0 -50
- data/examples/dataset.rb +12 -0
- data/lib/imw/boot.rb +55 -9
- data/lib/imw/dataset/paths.rb +15 -24
- data/lib/imw/dataset/workflow.rb +131 -72
- data/lib/imw/dataset.rb +94 -186
- data/lib/imw/parsers/html_parser.rb +1 -1
- data/lib/imw/parsers.rb +1 -1
- data/lib/imw/repository.rb +3 -27
- data/lib/imw/resource.rb +190 -0
- data/lib/imw/resources/archive.rb +97 -0
- data/lib/imw/resources/archives_and_compressed/bz2.rb +18 -0
- data/lib/imw/resources/archives_and_compressed/gz.rb +18 -0
- data/lib/imw/resources/archives_and_compressed/rar.rb +23 -0
- data/lib/imw/resources/archives_and_compressed/tar.rb +23 -0
- data/lib/imw/resources/archives_and_compressed/tarbz2.rb +78 -0
- data/lib/imw/resources/archives_and_compressed/targz.rb +78 -0
- data/lib/imw/resources/archives_and_compressed/zip.rb +57 -0
- data/lib/imw/resources/archives_and_compressed.rb +32 -0
- data/lib/imw/resources/compressed_file.rb +89 -0
- data/lib/imw/resources/compressible.rb +77 -0
- data/lib/imw/resources/formats/delimited.rb +92 -0
- data/lib/imw/resources/formats/excel.rb +125 -0
- data/lib/imw/resources/formats/json.rb +53 -0
- data/lib/imw/resources/formats/sgml.rb +72 -0
- data/lib/imw/resources/formats/yaml.rb +53 -0
- data/lib/imw/resources/formats.rb +32 -0
- data/lib/imw/resources/local.rb +198 -0
- data/lib/imw/resources/remote.rb +110 -0
- data/lib/imw/resources/schemes/hdfs.rb +242 -0
- data/lib/imw/resources/schemes/http.rb +161 -0
- data/lib/imw/resources/schemes/s3.rb +137 -0
- data/lib/imw/resources/schemes.rb +19 -0
- data/lib/imw/resources.rb +118 -0
- data/lib/imw/runner.rb +5 -4
- data/lib/imw/transforms/archiver.rb +215 -0
- data/lib/imw/transforms/transferer.rb +103 -0
- data/lib/imw/transforms.rb +8 -0
- data/lib/imw/utils/error.rb +26 -30
- data/lib/imw/utils/extensions/array.rb +5 -15
- data/lib/imw/utils/extensions/hash.rb +6 -16
- data/lib/imw/utils/extensions/hpricot.rb +0 -14
- data/lib/imw/utils/extensions/string.rb +5 -15
- data/lib/imw/utils/extensions/symbol.rb +0 -13
- data/lib/imw/utils/extensions.rb +65 -0
- data/lib/imw/utils/log.rb +14 -13
- data/lib/imw/utils/misc.rb +0 -6
- data/lib/imw/utils/paths.rb +101 -42
- data/lib/imw/utils/version.rb +8 -9
- data/lib/imw/utils.rb +2 -18
- data/lib/imw.rb +92 -17
- data/spec/data/sample.csv +1 -1
- data/spec/data/sample.json +1 -0
- data/spec/data/sample.tsv +1 -1
- data/spec/data/sample.txt +1 -1
- data/spec/data/sample.xml +1 -1
- data/spec/data/sample.yaml +1 -1
- data/spec/imw/dataset/paths_spec.rb +32 -0
- data/spec/imw/dataset/workflow_spec.rb +41 -0
- data/spec/imw/resource_spec.rb +79 -0
- data/spec/imw/resources/archive_spec.rb +69 -0
- data/spec/imw/resources/archives_and_compressed/bz2_spec.rb +15 -0
- data/spec/imw/resources/archives_and_compressed/gz_spec.rb +15 -0
- data/spec/imw/resources/archives_and_compressed/rar_spec.rb +16 -0
- data/spec/imw/resources/archives_and_compressed/tar_spec.rb +16 -0
- data/spec/imw/resources/archives_and_compressed/tarbz2_spec.rb +24 -0
- data/spec/imw/resources/archives_and_compressed/targz_spec.rb +21 -0
- data/spec/imw/resources/archives_and_compressed/zip_spec.rb +16 -0
- data/spec/imw/resources/compressed_file_spec.rb +48 -0
- data/spec/imw/resources/compressible_spec.rb +36 -0
- data/spec/imw/resources/formats/delimited_spec.rb +33 -0
- data/spec/imw/resources/formats/json_spec.rb +32 -0
- data/spec/imw/resources/formats/sgml_spec.rb +24 -0
- data/spec/imw/resources/formats/yaml_spec.rb +41 -0
- data/spec/imw/resources/local_spec.rb +98 -0
- data/spec/imw/resources/remote_spec.rb +35 -0
- data/spec/imw/resources/schemes/hdfs_spec.rb +61 -0
- data/spec/imw/resources/schemes/http_spec.rb +19 -0
- data/spec/imw/resources/schemes/s3_spec.rb +19 -0
- data/spec/imw/transforms/archiver_spec.rb +120 -0
- data/spec/imw/transforms/transferer_spec.rb +113 -0
- data/spec/imw/utils/paths_spec.rb +5 -33
- data/spec/imw/utils/shared_paths_spec.rb +29 -0
- data/spec/spec_helper.rb +5 -5
- data/spec/support/paths_matcher.rb +67 -0
- data/spec/support/random.rb +39 -36
- metadata +88 -75
- data/lib/imw/dataset/task.rb +0 -41
- data/lib/imw/files/archive.rb +0 -113
- data/lib/imw/files/basicfile.rb +0 -122
- data/lib/imw/files/binary.rb +0 -28
- data/lib/imw/files/compressed_file.rb +0 -93
- data/lib/imw/files/compressed_files_and_archives.rb +0 -334
- data/lib/imw/files/compressible.rb +0 -103
- data/lib/imw/files/csv.rb +0 -113
- data/lib/imw/files/directory.rb +0 -62
- data/lib/imw/files/excel.rb +0 -84
- data/lib/imw/files/json.rb +0 -41
- data/lib/imw/files/sgml.rb +0 -46
- data/lib/imw/files/text.rb +0 -68
- data/lib/imw/files/yaml.rb +0 -46
- data/lib/imw/files.rb +0 -125
- data/lib/imw/packagers/archiver.rb +0 -126
- data/lib/imw/packagers/s3_mover.rb +0 -36
- data/lib/imw/packagers.rb +0 -8
- data/lib/imw/utils/components.rb +0 -61
- data/lib/imw/utils/config.rb +0 -46
- data/lib/imw/utils/extensions/class/attribute_accessors.rb +0 -8
- data/lib/imw/utils/extensions/core.rb +0 -27
- data/lib/imw/utils/extensions/dir.rb +0 -24
- data/lib/imw/utils/extensions/file_core.rb +0 -64
- data/lib/imw/utils/extensions/typed_struct.rb +0 -22
- data/lib/imw/utils/extensions/uri.rb +0 -59
- data/lib/imw/utils/view/dump_csv.rb +0 -112
- data/lib/imw/utils/view/dump_csv_older.rb +0 -117
- data/lib/imw/utils/view.rb +0 -113
- data/spec/imw/dataset/datamapper/uri_spec.rb +0 -43
- data/spec/imw/dataset/datamapper_spec_helper.rb +0 -11
- data/spec/imw/files/archive_spec.rb +0 -118
- data/spec/imw/files/basicfile_spec.rb +0 -121
- data/spec/imw/files/bz2_spec.rb +0 -32
- data/spec/imw/files/compressed_file_spec.rb +0 -96
- data/spec/imw/files/compressible_spec.rb +0 -100
- data/spec/imw/files/file_spec.rb +0 -144
- data/spec/imw/files/gz_spec.rb +0 -32
- data/spec/imw/files/rar_spec.rb +0 -33
- data/spec/imw/files/tar_spec.rb +0 -31
- data/spec/imw/files/text_spec.rb +0 -23
- data/spec/imw/files/zip_spec.rb +0 -31
- data/spec/imw/files_spec.rb +0 -38
- data/spec/imw/packagers/archiver_spec.rb +0 -125
- data/spec/imw/packagers/s3_mover_spec.rb +0 -7
- data/spec/imw/utils/extensions/file_core_spec.rb +0 -72
- data/spec/imw/utils/extensions/find_spec.rb +0 -113
- data/spec/imw/workflow/rip/local_spec.rb +0 -89
- data/spec/imw/workflow/rip_spec.rb +0 -27
- data/spec/support/archive_contents_matcher.rb +0 -94
- data/spec/support/directory_contents_matcher.rb +0 -61
data/.gitignore
CHANGED
data/Rakefile
CHANGED
|
@@ -18,3 +18,13 @@ begin
|
|
|
18
18
|
rescue LoadError
|
|
19
19
|
puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
|
|
20
20
|
end
|
|
21
|
+
|
|
22
|
+
desc "Build tags"
|
|
23
|
+
task :tags do
|
|
24
|
+
system "etags -R bin etc examples lib spec"
|
|
25
|
+
end
|
|
26
|
+
|
|
27
|
+
desc "Build docs"
|
|
28
|
+
task :docs do
|
|
29
|
+
system "yardoc"
|
|
30
|
+
end
|
data/TODO
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
lookup basic yarddoc style (@params, etc) -- do a high-level description
|
|
2
|
+
|
|
3
|
+
learn how to run specs
|
|
4
|
+
write a spec that fails on the old code and passes on the new
|
|
5
|
+
|
|
6
|
+
convert all references to URI to be Addressable::URI
|
|
7
|
+
don't use URI.parse, use Addressable::URI.heuristic_parse (eg in files/*)
|
|
8
|
+
make basicfile methods delegate to its uri
|
|
9
|
+
|
|
10
|
+
tmpdir should use the actual system tmpdir libs (eg in archiver)
|
|
11
|
+
move config over to configliere
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
------ WANT PONY -----
|
|
17
|
+
|
|
18
|
+
might be nice to learn the delegate pattern
|
data/VERSION
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
0.
|
|
1
|
+
0.2.0
|
data/bin/imw
CHANGED
data/etc/imwrc.rb
CHANGED
|
@@ -21,56 +21,6 @@
|
|
|
21
21
|
# Copyright:: Copyright (c) 2008 infochimps.org
|
|
22
22
|
# License:: GPL 3.0
|
|
23
23
|
# Website:: http://infinitemonkeywrench.org/
|
|
24
|
-
#
|
|
25
24
|
|
|
26
25
|
module IMW
|
|
27
|
-
PATHS = {
|
|
28
|
-
:home => ENV['HOME'],
|
|
29
|
-
:data_root => "/var/lib/imw",
|
|
30
|
-
:log_root => "/var/log/imw",
|
|
31
|
-
:scripts_root => "/usr/share/imw",
|
|
32
|
-
:tmp_root => "/tmp/imw",
|
|
33
|
-
|
|
34
|
-
# the imw library
|
|
35
|
-
:imw_root => File.expand_path(File.dirname(__FILE__) + "/.."),
|
|
36
|
-
:imw_bin => [:imw_root, 'bin'],
|
|
37
|
-
:imw_etc => [:imw_root, 'etc'],
|
|
38
|
-
:imw_lib => [:imw_root, 'lib'],
|
|
39
|
-
|
|
40
|
-
# workflow
|
|
41
|
-
:ripd_root => [:data_root, 'ripd'],
|
|
42
|
-
:peeld_root => [:data_root, 'peeld'],
|
|
43
|
-
:mungd_root => [:data_root, 'mungd'],
|
|
44
|
-
:temp_root => [:data_root, 'temp'],
|
|
45
|
-
:fixd_root => [:data_root, 'fixd'],
|
|
46
|
-
:pkgd_root => [:data_root, 'pkgd']
|
|
47
|
-
}
|
|
48
|
-
|
|
49
|
-
# Default time format.
|
|
50
|
-
STRFTIME_FORMAT = "%Y%m%d-%H%M%S" unless defined? STRFTIME_FORMAT
|
|
51
|
-
|
|
52
|
-
# Paths to external programs used by IMW.
|
|
53
|
-
EXTERNAL_PROGRAMS = {
|
|
54
|
-
:tar => "tar",
|
|
55
|
-
:rar => "rar",
|
|
56
|
-
:zip => "zip",
|
|
57
|
-
:unzip => "unzip",
|
|
58
|
-
:gzip => "gzip",
|
|
59
|
-
:bzip2 => "bzip2",
|
|
60
|
-
:wget => "wget"
|
|
61
|
-
} unless defined? ::IMW::EXTERNAL_PROGRAMS
|
|
62
|
-
|
|
63
|
-
module Files
|
|
64
|
-
# Regular expressions which match pathnames to the name of the
|
|
65
|
-
# appropriate IMW::Files class.
|
|
66
|
-
#
|
|
67
|
-
# File class names should be stripped of the leading
|
|
68
|
-
# <tt>IMW::Files</tt> prefix, i.e. - the file object
|
|
69
|
-
# <tt>IMW::Files::Bz2</tt> should be referenced by the string
|
|
70
|
-
# <tt>"Bz2"</tt>.
|
|
71
|
-
FILE_REGEXPS = [] unless defined? ::IMW::Files::FILE_REGEXPS
|
|
72
|
-
end
|
|
73
|
-
|
|
74
26
|
end
|
|
75
|
-
|
|
76
|
-
|
data/examples/dataset.rb
ADDED
data/lib/imw/boot.rb
CHANGED
|
@@ -1,4 +1,36 @@
|
|
|
1
|
+
require 'imw/utils/extensions/hash'
|
|
2
|
+
|
|
1
3
|
module IMW
|
|
4
|
+
|
|
5
|
+
# IMW looks for configuration settings in the following places, in
|
|
6
|
+
# order of increasing precedence:
|
|
7
|
+
#
|
|
8
|
+
# 1. Settings defined directly in this file.
|
|
9
|
+
#
|
|
10
|
+
# 2. From the <tt>etc/imwrc</tt> file in the IMW root directory.
|
|
11
|
+
#
|
|
12
|
+
# 3. From the <tt>.imwrc</tt> file in the user's home directory (the
|
|
13
|
+
# filename can be changed; see
|
|
14
|
+
# <tt>IMW::Config::USER_CONFIG_FILE_BASENAME</tt>).
|
|
15
|
+
#
|
|
16
|
+
# 4. From the file defined by the environment variable +IMWRC+ (the
|
|
17
|
+
# value can be changed; see
|
|
18
|
+
# <tt>IMW::Config::USER_CONFIG_FILE_ENV_VARIABLE</tt>).
|
|
19
|
+
#
|
|
20
|
+
# Settings not found in one configuration location will be searched
|
|
21
|
+
# for in locations of lesser precedence.
|
|
22
|
+
#
|
|
23
|
+
# *Note:* configuration files are plain Ruby code that will be directly
|
|
24
|
+
# evaluated.
|
|
25
|
+
#
|
|
26
|
+
# Relevant settings include
|
|
27
|
+
#
|
|
28
|
+
# * interfaces with external programs (+tar+, +wget+, &c.)
|
|
29
|
+
# * paths to directories where IMW reads/writes files
|
|
30
|
+
# * correspondences between file extensions and IMW file classes
|
|
31
|
+
#
|
|
32
|
+
# For more detailed information, see the default configuration file,
|
|
33
|
+
# <tt>etc/imwrc</tt>.
|
|
2
34
|
module Config
|
|
3
35
|
|
|
4
36
|
# Root of the IMW source base.
|
|
@@ -9,11 +41,12 @@ module IMW
|
|
|
9
41
|
#
|
|
10
42
|
# User configuration file
|
|
11
43
|
#
|
|
12
|
-
# By default, the file ~/.imwrc (.imwrc, in your home directory --
|
|
13
|
-
# is sourced at top level. If the $IMWRC
|
|
14
|
-
# that file will be sourced instead.
|
|
44
|
+
# By default, the file ~/.imwrc (.imwrc, in your home directory --
|
|
45
|
+
# note no .rb extension) is sourced at top level. If the $IMWRC
|
|
46
|
+
# environment variable is set, that file will be sourced instead.
|
|
15
47
|
#
|
|
16
|
-
# Any code within this file will override settings in
|
|
48
|
+
# Any code within this file will override settings in
|
|
49
|
+
# /etc/imwrc.rb which itself overrides IMW_ROOT/etc/imwrc.rb
|
|
17
50
|
#
|
|
18
51
|
USER_CONFIG_FILE = File.join(ENV['HOME'] || '', '.imwrc')
|
|
19
52
|
# Environment variable to override user configuration file location.
|
|
@@ -22,16 +55,29 @@ module IMW
|
|
|
22
55
|
File.expand_path(ENV[ENV_CONFIG_FILE] || USER_CONFIG_FILE)
|
|
23
56
|
end
|
|
24
57
|
|
|
25
|
-
#
|
|
26
|
-
|
|
58
|
+
# Path to site-wide config file (overwrites IMW defaults but
|
|
59
|
+
# overridden by user defaults).
|
|
60
|
+
SITE_CONFIG_FILE = "/etc/imwrc.rb"
|
|
27
61
|
def self.site_config_file # :nodoc:
|
|
28
|
-
|
|
62
|
+
SITE_CONFIG_FILE
|
|
29
63
|
end
|
|
30
64
|
|
|
65
|
+
def self.default_config_file # :nodoc:
|
|
66
|
+
File.join(imw_root, "etc/imwrc.rb")
|
|
67
|
+
end
|
|
68
|
+
|
|
31
69
|
# Source the config files
|
|
32
70
|
def self.load_config
|
|
33
|
-
|
|
34
|
-
|
|
71
|
+
if File.exist?(user_config_file)
|
|
72
|
+
load user_config_file
|
|
73
|
+
end
|
|
74
|
+
|
|
75
|
+
if File.exist?(site_config_file)
|
|
76
|
+
load site_config_file
|
|
77
|
+
end
|
|
78
|
+
|
|
79
|
+
load default_config_file
|
|
80
|
+
|
|
35
81
|
end
|
|
36
82
|
end
|
|
37
83
|
end
|
data/lib/imw/dataset/paths.rb
CHANGED
|
@@ -1,32 +1,24 @@
|
|
|
1
1
|
module IMW
|
|
2
|
-
|
|
3
2
|
class Dataset
|
|
4
3
|
include IMW::Paths
|
|
5
4
|
|
|
6
|
-
|
|
7
|
-
#
|
|
8
|
-
#
|
|
9
|
-
# the default workflow directories (see IMW::Workflow) are created
|
|
10
|
-
# within this directory.
|
|
5
|
+
protected
|
|
6
|
+
# Sets paths to the workflow directories for this dataset (+ripd+,
|
|
7
|
+
# +rawd+, +fixd+, +pkgd+) as well as the following paths:
|
|
11
8
|
#
|
|
12
|
-
#
|
|
13
|
-
#
|
|
14
|
-
# dataset.
|
|
9
|
+
# script::
|
|
10
|
+
# The path to the file the dataset was initialized in.
|
|
15
11
|
#
|
|
16
|
-
#
|
|
17
|
-
#
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
add_path :script, File.expand_path(eval('__FILE__'))
|
|
27
|
-
add_path :self, File.dirname(path_to(:script))
|
|
28
|
-
IMW::Workflow::DIRS.each do |dir|
|
|
29
|
-
add_path dir, :self, dir.to_s
|
|
12
|
+
# root::
|
|
13
|
+
# The parent directory of the file the dataset was initialized
|
|
14
|
+
# in or the value of the <tt>:root</tt> key in
|
|
15
|
+
# IMW::Dataset#options
|
|
16
|
+
#
|
|
17
|
+
def set_default_paths
|
|
18
|
+
add_path :script, File.expand_path(eval('__FILE__'))
|
|
19
|
+
add_path :root, options[:root] || File.dirname(path_to(:script))
|
|
20
|
+
workflow_dirs.each do |dir|
|
|
21
|
+
add_path dir, :root, dir.to_s
|
|
30
22
|
end
|
|
31
23
|
end
|
|
32
24
|
|
|
@@ -34,5 +26,4 @@ module IMW
|
|
|
34
26
|
def set_paths
|
|
35
27
|
end
|
|
36
28
|
end
|
|
37
|
-
|
|
38
29
|
end
|
data/lib/imw/dataset/workflow.rb
CHANGED
|
@@ -1,42 +1,62 @@
|
|
|
1
|
-
require 'imw/dataset/task'
|
|
2
1
|
require 'ostruct'
|
|
2
|
+
require 'rake'
|
|
3
3
|
|
|
4
4
|
module IMW
|
|
5
5
|
|
|
6
|
-
# IMW
|
|
7
|
-
|
|
6
|
+
# An IMW version of Rake::Task
|
|
7
|
+
Task = Class.new(Rake::Task)
|
|
8
|
+
|
|
9
|
+
# An IMW subclass of Rake::FileTask
|
|
10
|
+
FileTask = Class.new(Rake::FileTask)
|
|
11
|
+
|
|
12
|
+
# An IMW subclass of Rake::FileCreationTask
|
|
13
|
+
FileCreationTask = Class.new(Rake::FileCreationTask)
|
|
14
|
+
|
|
15
|
+
# IMW encourages you to view a data transformation as a series of
|
|
16
|
+
# interdependent steps.
|
|
8
17
|
#
|
|
9
|
-
#
|
|
10
|
-
#
|
|
18
|
+
# By default, IMW defines four main steps in such a transformation:
|
|
19
|
+
# +rip+, +parse+, +fix+, and +package+.
|
|
20
|
+
#
|
|
21
|
+
# Each step is associated with a directory on disk in which it keeps
|
|
22
|
+
# its files: +ripd+, +rawd+, +fixd+, and +pkgd+.
|
|
11
23
|
#
|
|
12
|
-
#
|
|
13
|
-
#
|
|
14
|
-
#
|
|
24
|
+
# The steps are:
|
|
25
|
+
#
|
|
26
|
+
# rip::
|
|
27
|
+
# Obtain data via HTTP, FTP, SCP, RSYNC, database query, &c and
|
|
28
|
+
# store the results in +ripd+.
|
|
15
29
|
#
|
|
16
30
|
# parse::
|
|
17
|
-
# Parse data into a structured form
|
|
31
|
+
# Parse data into a structured form using a library (JSON, YAML,
|
|
32
|
+
# &c.) or using your own parser (XML, flat files, &c.) and store
|
|
33
|
+
# the results in +rawd+.
|
|
18
34
|
#
|
|
19
|
-
#
|
|
35
|
+
# fix::
|
|
20
36
|
# Combine, filter, reconcile, and transform already structured
|
|
21
|
-
# data into a desired form
|
|
37
|
+
# data into a desired form and store the results in +fixd+.
|
|
22
38
|
#
|
|
23
39
|
# package::
|
|
24
40
|
# Archive, compress, and deliver data in its final form to some
|
|
25
|
-
# location (HTTP, FTP, SCP, RSYNC, S3, EBS, &c.)
|
|
41
|
+
# location (HTTP, FTP, SCP, RSYNC, S3, EBS, &c.), optionally
|
|
42
|
+
# storing the output in +pkgd+.
|
|
26
43
|
#
|
|
27
44
|
# Each step depends upon the one before it. The steps are blank by
|
|
28
45
|
# default so there's no need to write code for steps you don't need
|
|
29
|
-
# to use.
|
|
46
|
+
# to use. You can also define your own steps (using +task+ just
|
|
47
|
+
# like in Rake) and hook them into these pre-defined steps (or
|
|
48
|
+
# not...).
|
|
30
49
|
#
|
|
31
|
-
#
|
|
50
|
+
# A dataset also has an <tt>:initialize</tt> task (which by default
|
|
51
|
+
# just creates the directories for these steps) which you can use to
|
|
52
|
+
# hook in your own initialization tasks by making it depend on them.
|
|
53
|
+
#
|
|
54
|
+
# A subclass of IMW::Dataset can customize how tasks are defined by
|
|
55
|
+
# overriding +define_workflow_tasks+, among other methods, and
|
|
56
|
+
# introduce new tasks by overriding +define_tasks+.
|
|
32
57
|
module Workflow
|
|
33
58
|
|
|
34
|
-
# The <tt>Rake::TaskManager</tt> module allows the
|
|
35
|
-
# <tt>IMW::Dataset</tt> class to leverage the functionality of the
|
|
36
|
-
# Rake[http://rake.rubyforge.org/] library to manage tasks
|
|
37
|
-
# associated with the processing of this dataset.
|
|
38
59
|
include Rake::TaskManager
|
|
39
|
-
|
|
40
60
|
# Default options passed to <tt>Rake</tt>. Any class including
|
|
41
61
|
# the <tt>Rake::TaskManager</tt> module must define a constant by
|
|
42
62
|
# this name.
|
|
@@ -45,51 +65,77 @@ module IMW
|
|
|
45
65
|
:trace => false,
|
|
46
66
|
:verbose => false
|
|
47
67
|
}
|
|
68
|
+
|
|
69
|
+
# Return a new (or existing) <tt>IMW::Task</tt> with the given
|
|
70
|
+
# +name+. Dependencies can be declared and a block passed in just
|
|
71
|
+
# as in Rake.
|
|
72
|
+
#
|
|
73
|
+
# @param [Hash, Symbol, String] deps the name of the task (if a
|
|
74
|
+
# Symbol or String) or the name of the task mapped to an Array of
|
|
75
|
+
# dependencies (if a Hash)
|
|
76
|
+
#
|
|
77
|
+
# @return [IMW::Task] the task
|
|
78
|
+
def task deps, &block
|
|
79
|
+
self.define_task IMW::Task, deps, &block
|
|
80
|
+
end
|
|
81
|
+
|
|
82
|
+
# Return a new (or existing) <tt>IMW::FileTask</tt> with the given
|
|
83
|
+
# +path+. Dependencies can be declared and a block passed in just
|
|
84
|
+
# as in Rake.
|
|
85
|
+
#
|
|
86
|
+
# @param [String, IMW::Resource] path the path to the file
|
|
87
|
+
# @return [IMW::FileTask] the task
|
|
88
|
+
def file path, &block
|
|
89
|
+
path = path.respond_to?(:path) ? path.path : path
|
|
90
|
+
self.define_task IMW::FileTask, path, &block
|
|
91
|
+
end
|
|
92
|
+
|
|
93
|
+
# Return a new (or existing) <tt>IMW::FileCreationTask</tt> with the given
|
|
94
|
+
# +path+. Dependencies can be declared and a block passed in just
|
|
95
|
+
# as in Rake.
|
|
96
|
+
#
|
|
97
|
+
# @param [String, IMW::Resource] path the path to the file
|
|
98
|
+
# @return [IMW::FileCreationTask] the task
|
|
99
|
+
def file_create path, &block
|
|
100
|
+
path = path.respond_to?(:path) ? path.path : path
|
|
101
|
+
self.define_task IMW::FileCreationTask, path, &block
|
|
102
|
+
end
|
|
103
|
+
|
|
104
|
+
# Override this method to define default tasks for a subclass of
|
|
105
|
+
# IMW::Dataset.
|
|
106
|
+
def define_tasks
|
|
107
|
+
end
|
|
48
108
|
|
|
49
109
|
# The standard IMW workflow steps.
|
|
50
|
-
|
|
110
|
+
#
|
|
111
|
+
# @return [Array] the workflow step names
|
|
112
|
+
def workflow_steps
|
|
113
|
+
[:rip, :parse, :fix, :package]
|
|
114
|
+
end
|
|
51
115
|
|
|
52
116
|
# The steps of the IMW workflow each correspond to a directory in
|
|
53
117
|
# which it is customary that they deposit their files <em>once
|
|
54
118
|
# they are finished processing</em> (so ripped files wind up in
|
|
55
119
|
# the +ripd+ directory, packaged files in the +pkgd+ directory,
|
|
56
120
|
# and so on).
|
|
57
|
-
DIRS = [:ripd, :xtrd, :prsd, :mungd, :pkgd ]
|
|
58
|
-
|
|
59
|
-
# Each workflow step can be configured to take default actions,
|
|
60
|
-
# each action being a proc in the array for the step in this hash.
|
|
61
121
|
#
|
|
62
|
-
#
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
STEPS_TASKS = returning({}) do |steps_procs|
|
|
66
|
-
STEPS.each do |step|
|
|
67
|
-
steps_procs[step] = []
|
|
68
|
-
end
|
|
122
|
+
# @return [Array] the workflow directory names
|
|
123
|
+
def workflow_dirs
|
|
124
|
+
[:ripd, :rawd, :fixd, :pkgd]
|
|
69
125
|
end
|
|
70
126
|
|
|
71
127
|
protected
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
end
|
|
83
|
-
|
|
84
|
-
def define_workflow_task deps, comment
|
|
128
|
+
|
|
129
|
+
# Convenience method for defining tasks for this workflow.
|
|
130
|
+
#
|
|
131
|
+
# @param [Hash, Symbol, String] deps the name of the task (if a
|
|
132
|
+
# Symbol or String) or the name of the task mapped to an Array of
|
|
133
|
+
# dependencies (if a Hash)
|
|
134
|
+
# @param [String] comment the comment to associate to the task
|
|
135
|
+
# @return [IMW::Task] the task
|
|
136
|
+
def define_workflow_task deps, comment, &block
|
|
85
137
|
@last_description = comment
|
|
86
|
-
define_task(IMW::Task, deps)
|
|
87
|
-
step = deps.respond_to?(:keys) ? deps.keys.first : deps
|
|
88
|
-
STEPS_TASKS[step].each do |deps, block|
|
|
89
|
-
self[step].enhance(deps) do
|
|
90
|
-
self.instance_eval(&block)
|
|
91
|
-
end
|
|
92
|
-
end
|
|
138
|
+
define_task(IMW::Task, deps, &block)
|
|
93
139
|
end
|
|
94
140
|
|
|
95
141
|
# Create all the instance variables required by Rake::TaskManager
|
|
@@ -100,43 +146,56 @@ EOF
|
|
|
100
146
|
@scope = Array.new
|
|
101
147
|
@last_description = nil
|
|
102
148
|
@options = OpenStruct.new(DEFAULT_OPTIONS)
|
|
103
|
-
|
|
104
|
-
define_workflow_tasks
|
|
105
|
-
|
|
149
|
+
define_initialize_task
|
|
150
|
+
define_workflow_tasks
|
|
151
|
+
define_workflow_task_methods
|
|
152
|
+
define_clean_task
|
|
153
|
+
define_tasks
|
|
106
154
|
end
|
|
107
155
|
|
|
108
|
-
#
|
|
109
|
-
#
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
156
|
+
# Defines the <tt>:initialize</tt> task. The only other task
|
|
157
|
+
# hooked into <tt>:initialize</tt> is the
|
|
158
|
+
# <tt>:create_directories</tt> task which creates the workflow
|
|
159
|
+
# directories for this dataset.
|
|
160
|
+
def define_initialize_task
|
|
161
|
+
define_workflow_task({:create_directories => []}, "Creates workflow directories for this dataset.") do
|
|
162
|
+
workflow_dirs.each do |dir|
|
|
114
163
|
FileUtils.mkdir_p(path_to(dir)) unless File.exist?(path_to(dir))
|
|
115
164
|
end
|
|
116
165
|
end
|
|
166
|
+
define_workflow_task({ :initialize => [:create_directories] }, "Initialize this dataset.")
|
|
117
167
|
end
|
|
118
168
|
|
|
119
|
-
# Creates a task <tt>:
|
|
169
|
+
# Creates a task <tt>:clean</tt> which removes dataset's
|
|
120
170
|
# workflow directories.
|
|
121
|
-
def
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
FileUtils.rm_rf(path_to(dir))
|
|
171
|
+
def define_clean_task
|
|
172
|
+
define_workflow_task :clean, "Remove the workflow directories for this dataset." do
|
|
173
|
+
workflow_dirs.each do |dir|
|
|
174
|
+
FileUtils.rm_rf(path_to(dir)) if File.exist?(path_to(dir))
|
|
126
175
|
end
|
|
127
176
|
end
|
|
128
177
|
end
|
|
129
178
|
|
|
130
|
-
# Creates the task dependency chain <tt>:package => :
|
|
131
|
-
# :parse => :
|
|
179
|
+
# Creates the task dependency chain <tt>:package => :fix =>
|
|
180
|
+
# :parse => :rip => :create_directories</tt> of the
|
|
132
181
|
# IMW::Workflow.
|
|
133
182
|
def define_workflow_tasks
|
|
134
183
|
define_workflow_task({:rip => [:create_directories]}, "Obtain data from some source." )
|
|
135
|
-
define_workflow_task({:
|
|
136
|
-
define_workflow_task({:
|
|
137
|
-
define_workflow_task({:
|
|
138
|
-
define_workflow_task({:package => [:munge]}, "Package dataset in final form." )
|
|
184
|
+
define_workflow_task({:parse => [:rip]}, "Parse data into a structured form." )
|
|
185
|
+
define_workflow_task({:fix => [:parse]}, "Munge parsed data into desired form." )
|
|
186
|
+
define_workflow_task({:package => [:fix]}, "Package dataset in final form." )
|
|
139
187
|
end
|
|
140
188
|
|
|
189
|
+
# Dynamically define methods for each of the workflow steps which
|
|
190
|
+
# act as shortcuts for accessing the corresponding tasks.
|
|
191
|
+
def define_workflow_task_methods
|
|
192
|
+
workflow_steps.each do |step|
|
|
193
|
+
self.class.class_eval <<RUBY
|
|
194
|
+
def #{step} deps, &block
|
|
195
|
+
self[step].enhance(step => deps, &block)
|
|
196
|
+
end
|
|
197
|
+
RUBY
|
|
198
|
+
end
|
|
199
|
+
end
|
|
141
200
|
end
|
|
142
201
|
end
|