imw 0.1.1 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +4 -1
- data/Rakefile +10 -0
- data/TODO +18 -0
- data/VERSION +1 -1
- data/bin/imw +1 -1
- data/etc/imwrc.rb +0 -50
- data/examples/dataset.rb +12 -0
- data/lib/imw/boot.rb +55 -9
- data/lib/imw/dataset/paths.rb +15 -24
- data/lib/imw/dataset/workflow.rb +131 -72
- data/lib/imw/dataset.rb +94 -186
- data/lib/imw/parsers/html_parser.rb +1 -1
- data/lib/imw/parsers.rb +1 -1
- data/lib/imw/repository.rb +3 -27
- data/lib/imw/resource.rb +190 -0
- data/lib/imw/resources/archive.rb +97 -0
- data/lib/imw/resources/archives_and_compressed/bz2.rb +18 -0
- data/lib/imw/resources/archives_and_compressed/gz.rb +18 -0
- data/lib/imw/resources/archives_and_compressed/rar.rb +23 -0
- data/lib/imw/resources/archives_and_compressed/tar.rb +23 -0
- data/lib/imw/resources/archives_and_compressed/tarbz2.rb +78 -0
- data/lib/imw/resources/archives_and_compressed/targz.rb +78 -0
- data/lib/imw/resources/archives_and_compressed/zip.rb +57 -0
- data/lib/imw/resources/archives_and_compressed.rb +32 -0
- data/lib/imw/resources/compressed_file.rb +89 -0
- data/lib/imw/resources/compressible.rb +77 -0
- data/lib/imw/resources/formats/delimited.rb +92 -0
- data/lib/imw/resources/formats/excel.rb +125 -0
- data/lib/imw/resources/formats/json.rb +53 -0
- data/lib/imw/resources/formats/sgml.rb +72 -0
- data/lib/imw/resources/formats/yaml.rb +53 -0
- data/lib/imw/resources/formats.rb +32 -0
- data/lib/imw/resources/local.rb +198 -0
- data/lib/imw/resources/remote.rb +110 -0
- data/lib/imw/resources/schemes/hdfs.rb +242 -0
- data/lib/imw/resources/schemes/http.rb +161 -0
- data/lib/imw/resources/schemes/s3.rb +137 -0
- data/lib/imw/resources/schemes.rb +19 -0
- data/lib/imw/resources.rb +118 -0
- data/lib/imw/runner.rb +5 -4
- data/lib/imw/transforms/archiver.rb +215 -0
- data/lib/imw/transforms/transferer.rb +103 -0
- data/lib/imw/transforms.rb +8 -0
- data/lib/imw/utils/error.rb +26 -30
- data/lib/imw/utils/extensions/array.rb +5 -15
- data/lib/imw/utils/extensions/hash.rb +6 -16
- data/lib/imw/utils/extensions/hpricot.rb +0 -14
- data/lib/imw/utils/extensions/string.rb +5 -15
- data/lib/imw/utils/extensions/symbol.rb +0 -13
- data/lib/imw/utils/extensions.rb +65 -0
- data/lib/imw/utils/log.rb +14 -13
- data/lib/imw/utils/misc.rb +0 -6
- data/lib/imw/utils/paths.rb +101 -42
- data/lib/imw/utils/version.rb +8 -9
- data/lib/imw/utils.rb +2 -18
- data/lib/imw.rb +92 -17
- data/spec/data/sample.csv +1 -1
- data/spec/data/sample.json +1 -0
- data/spec/data/sample.tsv +1 -1
- data/spec/data/sample.txt +1 -1
- data/spec/data/sample.xml +1 -1
- data/spec/data/sample.yaml +1 -1
- data/spec/imw/dataset/paths_spec.rb +32 -0
- data/spec/imw/dataset/workflow_spec.rb +41 -0
- data/spec/imw/resource_spec.rb +79 -0
- data/spec/imw/resources/archive_spec.rb +69 -0
- data/spec/imw/resources/archives_and_compressed/bz2_spec.rb +15 -0
- data/spec/imw/resources/archives_and_compressed/gz_spec.rb +15 -0
- data/spec/imw/resources/archives_and_compressed/rar_spec.rb +16 -0
- data/spec/imw/resources/archives_and_compressed/tar_spec.rb +16 -0
- data/spec/imw/resources/archives_and_compressed/tarbz2_spec.rb +24 -0
- data/spec/imw/resources/archives_and_compressed/targz_spec.rb +21 -0
- data/spec/imw/resources/archives_and_compressed/zip_spec.rb +16 -0
- data/spec/imw/resources/compressed_file_spec.rb +48 -0
- data/spec/imw/resources/compressible_spec.rb +36 -0
- data/spec/imw/resources/formats/delimited_spec.rb +33 -0
- data/spec/imw/resources/formats/json_spec.rb +32 -0
- data/spec/imw/resources/formats/sgml_spec.rb +24 -0
- data/spec/imw/resources/formats/yaml_spec.rb +41 -0
- data/spec/imw/resources/local_spec.rb +98 -0
- data/spec/imw/resources/remote_spec.rb +35 -0
- data/spec/imw/resources/schemes/hdfs_spec.rb +61 -0
- data/spec/imw/resources/schemes/http_spec.rb +19 -0
- data/spec/imw/resources/schemes/s3_spec.rb +19 -0
- data/spec/imw/transforms/archiver_spec.rb +120 -0
- data/spec/imw/transforms/transferer_spec.rb +113 -0
- data/spec/imw/utils/paths_spec.rb +5 -33
- data/spec/imw/utils/shared_paths_spec.rb +29 -0
- data/spec/spec_helper.rb +5 -5
- data/spec/support/paths_matcher.rb +67 -0
- data/spec/support/random.rb +39 -36
- metadata +88 -75
- data/lib/imw/dataset/task.rb +0 -41
- data/lib/imw/files/archive.rb +0 -113
- data/lib/imw/files/basicfile.rb +0 -122
- data/lib/imw/files/binary.rb +0 -28
- data/lib/imw/files/compressed_file.rb +0 -93
- data/lib/imw/files/compressed_files_and_archives.rb +0 -334
- data/lib/imw/files/compressible.rb +0 -103
- data/lib/imw/files/csv.rb +0 -113
- data/lib/imw/files/directory.rb +0 -62
- data/lib/imw/files/excel.rb +0 -84
- data/lib/imw/files/json.rb +0 -41
- data/lib/imw/files/sgml.rb +0 -46
- data/lib/imw/files/text.rb +0 -68
- data/lib/imw/files/yaml.rb +0 -46
- data/lib/imw/files.rb +0 -125
- data/lib/imw/packagers/archiver.rb +0 -126
- data/lib/imw/packagers/s3_mover.rb +0 -36
- data/lib/imw/packagers.rb +0 -8
- data/lib/imw/utils/components.rb +0 -61
- data/lib/imw/utils/config.rb +0 -46
- data/lib/imw/utils/extensions/class/attribute_accessors.rb +0 -8
- data/lib/imw/utils/extensions/core.rb +0 -27
- data/lib/imw/utils/extensions/dir.rb +0 -24
- data/lib/imw/utils/extensions/file_core.rb +0 -64
- data/lib/imw/utils/extensions/typed_struct.rb +0 -22
- data/lib/imw/utils/extensions/uri.rb +0 -59
- data/lib/imw/utils/view/dump_csv.rb +0 -112
- data/lib/imw/utils/view/dump_csv_older.rb +0 -117
- data/lib/imw/utils/view.rb +0 -113
- data/spec/imw/dataset/datamapper/uri_spec.rb +0 -43
- data/spec/imw/dataset/datamapper_spec_helper.rb +0 -11
- data/spec/imw/files/archive_spec.rb +0 -118
- data/spec/imw/files/basicfile_spec.rb +0 -121
- data/spec/imw/files/bz2_spec.rb +0 -32
- data/spec/imw/files/compressed_file_spec.rb +0 -96
- data/spec/imw/files/compressible_spec.rb +0 -100
- data/spec/imw/files/file_spec.rb +0 -144
- data/spec/imw/files/gz_spec.rb +0 -32
- data/spec/imw/files/rar_spec.rb +0 -33
- data/spec/imw/files/tar_spec.rb +0 -31
- data/spec/imw/files/text_spec.rb +0 -23
- data/spec/imw/files/zip_spec.rb +0 -31
- data/spec/imw/files_spec.rb +0 -38
- data/spec/imw/packagers/archiver_spec.rb +0 -125
- data/spec/imw/packagers/s3_mover_spec.rb +0 -7
- data/spec/imw/utils/extensions/file_core_spec.rb +0 -72
- data/spec/imw/utils/extensions/find_spec.rb +0 -113
- data/spec/imw/workflow/rip/local_spec.rb +0 -89
- data/spec/imw/workflow/rip_spec.rb +0 -27
- data/spec/support/archive_contents_matcher.rb +0 -94
- data/spec/support/directory_contents_matcher.rb +0 -61
data/.gitignore
CHANGED
data/Rakefile
CHANGED
@@ -18,3 +18,13 @@ begin
|
|
18
18
|
rescue LoadError
|
19
19
|
puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
|
20
20
|
end
|
21
|
+
|
22
|
+
desc "Build tags"
|
23
|
+
task :tags do
|
24
|
+
system "etags -R bin etc examples lib spec"
|
25
|
+
end
|
26
|
+
|
27
|
+
desc "Build docs"
|
28
|
+
task :docs do
|
29
|
+
system "yardoc"
|
30
|
+
end
|
data/TODO
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
lookup basic yarddoc style (@params, etc) -- do a high-level description
|
2
|
+
|
3
|
+
learn how to run specs
|
4
|
+
write a spec that fails on the old code and passes on the new
|
5
|
+
|
6
|
+
convert all references to URI to be Addressable::URI
|
7
|
+
don't use URI.parse, use Addressable::URI.heuristic_parse (eg in files/*)
|
8
|
+
make basicfile methods delegate to its uri
|
9
|
+
|
10
|
+
tmpdir should use the actual system tmpdir libs (eg in archiver)
|
11
|
+
move config over to configliere
|
12
|
+
|
13
|
+
|
14
|
+
|
15
|
+
|
16
|
+
------ WANT PONY -----
|
17
|
+
|
18
|
+
might be nice to learn the delegate pattern
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
0.2.0
|
data/bin/imw
CHANGED
data/etc/imwrc.rb
CHANGED
@@ -21,56 +21,6 @@
|
|
21
21
|
# Copyright:: Copyright (c) 2008 infochimps.org
|
22
22
|
# License:: GPL 3.0
|
23
23
|
# Website:: http://infinitemonkeywrench.org/
|
24
|
-
#
|
25
24
|
|
26
25
|
module IMW
|
27
|
-
PATHS = {
|
28
|
-
:home => ENV['HOME'],
|
29
|
-
:data_root => "/var/lib/imw",
|
30
|
-
:log_root => "/var/log/imw",
|
31
|
-
:scripts_root => "/usr/share/imw",
|
32
|
-
:tmp_root => "/tmp/imw",
|
33
|
-
|
34
|
-
# the imw library
|
35
|
-
:imw_root => File.expand_path(File.dirname(__FILE__) + "/.."),
|
36
|
-
:imw_bin => [:imw_root, 'bin'],
|
37
|
-
:imw_etc => [:imw_root, 'etc'],
|
38
|
-
:imw_lib => [:imw_root, 'lib'],
|
39
|
-
|
40
|
-
# workflow
|
41
|
-
:ripd_root => [:data_root, 'ripd'],
|
42
|
-
:peeld_root => [:data_root, 'peeld'],
|
43
|
-
:mungd_root => [:data_root, 'mungd'],
|
44
|
-
:temp_root => [:data_root, 'temp'],
|
45
|
-
:fixd_root => [:data_root, 'fixd'],
|
46
|
-
:pkgd_root => [:data_root, 'pkgd']
|
47
|
-
}
|
48
|
-
|
49
|
-
# Default time format.
|
50
|
-
STRFTIME_FORMAT = "%Y%m%d-%H%M%S" unless defined? STRFTIME_FORMAT
|
51
|
-
|
52
|
-
# Paths to external programs used by IMW.
|
53
|
-
EXTERNAL_PROGRAMS = {
|
54
|
-
:tar => "tar",
|
55
|
-
:rar => "rar",
|
56
|
-
:zip => "zip",
|
57
|
-
:unzip => "unzip",
|
58
|
-
:gzip => "gzip",
|
59
|
-
:bzip2 => "bzip2",
|
60
|
-
:wget => "wget"
|
61
|
-
} unless defined? ::IMW::EXTERNAL_PROGRAMS
|
62
|
-
|
63
|
-
module Files
|
64
|
-
# Regular expressions which match pathnames to the name of the
|
65
|
-
# appropriate IMW::Files class.
|
66
|
-
#
|
67
|
-
# File class names should be stripped of the leading
|
68
|
-
# <tt>IMW::Files</tt> prefix, i.e. - the file object
|
69
|
-
# <tt>IMW::Files::Bz2</tt> should be referenced by the string
|
70
|
-
# <tt>"Bz2"</tt>.
|
71
|
-
FILE_REGEXPS = [] unless defined? ::IMW::Files::FILE_REGEXPS
|
72
|
-
end
|
73
|
-
|
74
26
|
end
|
75
|
-
|
76
|
-
|
data/examples/dataset.rb
ADDED
data/lib/imw/boot.rb
CHANGED
@@ -1,4 +1,36 @@
|
|
1
|
+
require 'imw/utils/extensions/hash'
|
2
|
+
|
1
3
|
module IMW
|
4
|
+
|
5
|
+
# IMW looks for configuration settings in the following places, in
|
6
|
+
# order of increasing precedence:
|
7
|
+
#
|
8
|
+
# 1. Settings defined directly in this file.
|
9
|
+
#
|
10
|
+
# 2. From the <tt>etc/imwrc</tt> file in the IMW root directory.
|
11
|
+
#
|
12
|
+
# 3. From the <tt>.imwrc</tt> file in the user's home directory (the
|
13
|
+
# filename can be changed; see
|
14
|
+
# <tt>IMW::Config::USER_CONFIG_FILE_BASENAME</tt>).
|
15
|
+
#
|
16
|
+
# 4. From the file defined by the environment variable +IMWRC+ (the
|
17
|
+
# value can be changed; see
|
18
|
+
# <tt>IMW::Config::USER_CONFIG_FILE_ENV_VARIABLE</tt>).
|
19
|
+
#
|
20
|
+
# Settings not found in one configuration location will be searched
|
21
|
+
# for in locations of lesser precedence.
|
22
|
+
#
|
23
|
+
# *Note:* configuration files are plain Ruby code that will be directly
|
24
|
+
# evaluated.
|
25
|
+
#
|
26
|
+
# Relevant settings include
|
27
|
+
#
|
28
|
+
# * interfaces with external programs (+tar+, +wget+, &c.)
|
29
|
+
# * paths to directories where IMW reads/writes files
|
30
|
+
# * correspondences between file extensions and IMW file classes
|
31
|
+
#
|
32
|
+
# For more detailed information, see the default configuration file,
|
33
|
+
# <tt>etc/imwrc</tt>.
|
2
34
|
module Config
|
3
35
|
|
4
36
|
# Root of the IMW source base.
|
@@ -9,11 +41,12 @@ module IMW
|
|
9
41
|
#
|
10
42
|
# User configuration file
|
11
43
|
#
|
12
|
-
# By default, the file ~/.imwrc (.imwrc, in your home directory --
|
13
|
-
# is sourced at top level. If the $IMWRC
|
14
|
-
# that file will be sourced instead.
|
44
|
+
# By default, the file ~/.imwrc (.imwrc, in your home directory --
|
45
|
+
# note no .rb extension) is sourced at top level. If the $IMWRC
|
46
|
+
# environment variable is set, that file will be sourced instead.
|
15
47
|
#
|
16
|
-
# Any code within this file will override settings in
|
48
|
+
# Any code within this file will override settings in
|
49
|
+
# /etc/imwrc.rb which itself overrides IMW_ROOT/etc/imwrc.rb
|
17
50
|
#
|
18
51
|
USER_CONFIG_FILE = File.join(ENV['HOME'] || '', '.imwrc')
|
19
52
|
# Environment variable to override user configuration file location.
|
@@ -22,16 +55,29 @@ module IMW
|
|
22
55
|
File.expand_path(ENV[ENV_CONFIG_FILE] || USER_CONFIG_FILE)
|
23
56
|
end
|
24
57
|
|
25
|
-
#
|
26
|
-
|
58
|
+
# Path to site-wide config file (overrides IMW defaults but
|
59
|
+
# overridden by user defaults).
|
60
|
+
SITE_CONFIG_FILE = "/etc/imwrc.rb"
|
27
61
|
def self.site_config_file # :nodoc:
|
28
|
-
|
62
|
+
SITE_CONFIG_FILE
|
29
63
|
end
|
30
64
|
|
65
|
+
def self.default_config_file # :nodoc:
|
66
|
+
File.join(imw_root, "etc/imwrc.rb")
|
67
|
+
end
|
68
|
+
|
31
69
|
# Source the config files
|
32
70
|
def self.load_config
|
33
|
-
|
34
|
-
|
71
|
+
if File.exist?(user_config_file)
|
72
|
+
load user_config_file
|
73
|
+
end
|
74
|
+
|
75
|
+
if File.exist?(site_config_file)
|
76
|
+
load site_config_file
|
77
|
+
end
|
78
|
+
|
79
|
+
load default_config_file
|
80
|
+
|
35
81
|
end
|
36
82
|
end
|
37
83
|
end
|
data/lib/imw/dataset/paths.rb
CHANGED
@@ -1,32 +1,24 @@
|
|
1
1
|
module IMW
|
2
|
-
|
3
2
|
class Dataset
|
4
3
|
include IMW::Paths
|
5
4
|
|
6
|
-
|
7
|
-
#
|
8
|
-
#
|
9
|
-
# the default workflow directories (see IMW::Workflow) are created
|
10
|
-
# within this directory.
|
5
|
+
protected
|
6
|
+
# Sets paths to the workflow directories for this dataset (+ripd+,
|
7
|
+
# +rawd+, +fixd+, +pkgd+) as well as the following paths:
|
11
8
|
#
|
12
|
-
#
|
13
|
-
#
|
14
|
-
# dataset.
|
9
|
+
# script::
|
10
|
+
# The path to the file the dataset was initialized in.
|
15
11
|
#
|
16
|
-
#
|
17
|
-
#
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
add_path :script, File.expand_path(eval('__FILE__'))
|
27
|
-
add_path :self, File.dirname(path_to(:script))
|
28
|
-
IMW::Workflow::DIRS.each do |dir|
|
29
|
-
add_path dir, :self, dir.to_s
|
12
|
+
# root::
|
13
|
+
# The parent directory of the file the dataset was initialized
|
14
|
+
# in or the value of the <tt>:root</tt> key in
|
15
|
+
# IMW::Dataset#options
|
16
|
+
#
|
17
|
+
def set_default_paths
|
18
|
+
add_path :script, File.expand_path(eval('__FILE__'))
|
19
|
+
add_path :root, options[:root] || File.dirname(path_to(:script))
|
20
|
+
workflow_dirs.each do |dir|
|
21
|
+
add_path dir, :root, dir.to_s
|
30
22
|
end
|
31
23
|
end
|
32
24
|
|
@@ -34,5 +26,4 @@ module IMW
|
|
34
26
|
def set_paths
|
35
27
|
end
|
36
28
|
end
|
37
|
-
|
38
29
|
end
|
data/lib/imw/dataset/workflow.rb
CHANGED
@@ -1,42 +1,62 @@
|
|
1
|
-
require 'imw/dataset/task'
|
2
1
|
require 'ostruct'
|
2
|
+
require 'rake'
|
3
3
|
|
4
4
|
module IMW
|
5
5
|
|
6
|
-
# IMW
|
7
|
-
|
6
|
+
# An IMW version of Rake::Task
|
7
|
+
Task = Class.new(Rake::Task)
|
8
|
+
|
9
|
+
# An IMW subclass of Rake::FileTask
|
10
|
+
FileTask = Class.new(Rake::FileTask)
|
11
|
+
|
12
|
+
# An IMW subclass of Rake::FileCreationTask
|
13
|
+
FileCreationTask = Class.new(Rake::FileCreationTask)
|
14
|
+
|
15
|
+
# IMW encourages you to view a data transformation as a series of
|
16
|
+
# interdependent steps.
|
8
17
|
#
|
9
|
-
#
|
10
|
-
#
|
18
|
+
# By default, IMW defines four main steps in such a transformation:
|
19
|
+
# +rip+, +parse+, +fix+, and +package+.
|
20
|
+
#
|
21
|
+
# Each step is associated with a directory on disk in which it keeps
|
22
|
+
# its files: +ripd+, +rawd+, +fixd+, and +pkgd+.
|
11
23
|
#
|
12
|
-
#
|
13
|
-
#
|
14
|
-
#
|
24
|
+
# The steps are:
|
25
|
+
#
|
26
|
+
# rip::
|
27
|
+
# Obtain data via HTTP, FTP, SCP, RSYNC, database query, &c. and
|
28
|
+
# store the results in +ripd+.
|
15
29
|
#
|
16
30
|
# parse::
|
17
|
-
# Parse data into a structured form
|
31
|
+
# Parse data into a structured form using a library (JSON, YAML,
|
32
|
+
# &c.) or using your own parser (XML, flat files, &c.) and store
|
33
|
+
# the results in +rawd+.
|
18
34
|
#
|
19
|
-
#
|
35
|
+
# fix::
|
20
36
|
# Combine, filter, reconcile, and transform already structured
|
21
|
-
# data into a desired form
|
37
|
+
# data into a desired form and store the results in +fixd+.
|
22
38
|
#
|
23
39
|
# package::
|
24
40
|
# Archive, compress, and deliver data in its final form to some
|
25
|
-
# location (HTTP, FTP, SCP, RSYNC, S3, EBS, &c.)
|
41
|
+
# location (HTTP, FTP, SCP, RSYNC, S3, EBS, &c.), optionally
|
42
|
+
# storing the output in +pkgd+.
|
26
43
|
#
|
27
44
|
# Each step depends upon the one before it. The steps are blank by
|
28
45
|
# default so there's no need to write code for steps you don't need
|
29
|
-
# to use.
|
46
|
+
# to use. You can also define your own steps (using +task+ just
|
47
|
+
# like in Rake) and hook them into these pre-defined steps (or
|
48
|
+
# not...).
|
30
49
|
#
|
31
|
-
#
|
50
|
+
# A dataset also has an <tt>:initialize</tt> task (which by default
|
51
|
+
# just creates the directories for these steps) which you can use to
|
52
|
+
# hook in your own initialization tasks by making it depend on them.
|
53
|
+
#
|
54
|
+
# A subclass of IMW::Dataset can customize how tasks are defined by
|
55
|
+
# overriding +define_workflow_tasks+, among other methods, and
|
56
|
+
# introduce new tasks by overriding +define_tasks+.
|
32
57
|
module Workflow
|
33
58
|
|
34
|
-
# The <tt>Rake::TaskManager</tt> module allows the
|
35
|
-
# <tt>IMW::Dataset</tt> class to leverage the functionality of the
|
36
|
-
# Rake[http://rake.rubyforge.org/] library to manage tasks
|
37
|
-
# associated with the processing of this dataset.
|
38
59
|
include Rake::TaskManager
|
39
|
-
|
40
60
|
# Default options passed to <tt>Rake</tt>. Any class including
|
41
61
|
# the <tt>Rake::TaskManager</tt> module must define a constant by
|
42
62
|
# this name.
|
@@ -45,51 +65,77 @@ module IMW
|
|
45
65
|
:trace => false,
|
46
66
|
:verbose => false
|
47
67
|
}
|
68
|
+
|
69
|
+
# Return a new (or existing) <tt>IMW::Task</tt> with the given
|
70
|
+
# +name+. Dependencies can be declared and a block passed in just
|
71
|
+
# as in Rake.
|
72
|
+
#
|
73
|
+
# @param [Hash, Symbol, String] deps the name of the task (if a
|
74
|
+
# Symbol or String) or the name of the task mapped to an Array of
|
75
|
+
# dependencies (if a Hash)
|
76
|
+
#
|
77
|
+
# @return [IMW::Task] the task
|
78
|
+
def task deps, &block
|
79
|
+
self.define_task IMW::Task, deps, &block
|
80
|
+
end
|
81
|
+
|
82
|
+
# Return a new (or existing) <tt>IMW::FileTask</tt> with the given
|
83
|
+
# +path+. Dependencies can be declared and a block passed in just
|
84
|
+
# as in Rake.
|
85
|
+
#
|
86
|
+
# @param [String, IMW::Resource] path the path to the file
|
87
|
+
# @return [IMW::FileTask] the task
|
88
|
+
def file path, &block
|
89
|
+
path = path.respond_to?(:path) ? path.path : path
|
90
|
+
self.define_task IMW::FileTask, path, &block
|
91
|
+
end
|
92
|
+
|
93
|
+
# Return a new (or existing) <tt>IMW::FileCreationTask</tt> with the given
|
94
|
+
# +path+. Dependencies can be declared and a block passed in just
|
95
|
+
# as in Rake.
|
96
|
+
#
|
97
|
+
# @param [String, IMW::Resource] path the path to the file
|
98
|
+
# @return [IMW::FileCreationTask] the task
|
99
|
+
def file_create path, &block
|
100
|
+
path = path.respond_to?(:path) ? path.path : path
|
101
|
+
self.define_task IMW::FileCreationTask, path, &block
|
102
|
+
end
|
103
|
+
|
104
|
+
# Override this method to define default tasks for a subclass of
|
105
|
+
# IMW::Dataset.
|
106
|
+
def define_tasks
|
107
|
+
end
|
48
108
|
|
49
109
|
# The standard IMW workflow steps.
|
50
|
-
|
110
|
+
#
|
111
|
+
# @return [Array] the workflow step names
|
112
|
+
def workflow_steps
|
113
|
+
[:rip, :parse, :fix, :package]
|
114
|
+
end
|
51
115
|
|
52
116
|
# The steps of the IMW workflow each correspond to a directory in
|
53
117
|
# which it is customary that they deposit their files <em>once
|
54
118
|
# they are finished processing</em> (so ripped files wind up in
|
55
119
|
# the +ripd+ directory, packaged files in the +pkgd+ directory,
|
56
120
|
# and so on).
|
57
|
-
DIRS = [:ripd, :xtrd, :prsd, :mungd, :pkgd ]
|
58
|
-
|
59
|
-
# Each workflow step can be configured to take default actions,
|
60
|
-
# each action being a proc in the array for the step in this hash.
|
61
121
|
#
|
62
|
-
#
|
63
|
-
|
64
|
-
|
65
|
-
STEPS_TASKS = returning({}) do |steps_procs|
|
66
|
-
STEPS.each do |step|
|
67
|
-
steps_procs[step] = []
|
68
|
-
end
|
122
|
+
# @return [Array] the workflow directory names
|
123
|
+
def workflow_dirs
|
124
|
+
[:ripd, :rawd, :fixd, :pkgd]
|
69
125
|
end
|
70
126
|
|
71
127
|
protected
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
end
|
83
|
-
|
84
|
-
def define_workflow_task deps, comment
|
128
|
+
|
129
|
+
# Convenience method for defining tasks for this workflow.
|
130
|
+
#
|
131
|
+
# @param [Hash, Symbol, String] deps the name of the task (if a
|
132
|
+
# Symbol or String) or the name of the task mapped to an Array of
|
133
|
+
# dependencies (if a Hash)
|
134
|
+
# @param [String] comment the comment to associate to the task
|
135
|
+
# @return [IMW::Task] the task
|
136
|
+
def define_workflow_task deps, comment, &block
|
85
137
|
@last_description = comment
|
86
|
-
define_task(IMW::Task, deps)
|
87
|
-
step = deps.respond_to?(:keys) ? deps.keys.first : deps
|
88
|
-
STEPS_TASKS[step].each do |deps, block|
|
89
|
-
self[step].enhance(deps) do
|
90
|
-
self.instance_eval(&block)
|
91
|
-
end
|
92
|
-
end
|
138
|
+
define_task(IMW::Task, deps, &block)
|
93
139
|
end
|
94
140
|
|
95
141
|
# Create all the instance variables required by Rake::TaskManager
|
@@ -100,43 +146,56 @@ EOF
|
|
100
146
|
@scope = Array.new
|
101
147
|
@last_description = nil
|
102
148
|
@options = OpenStruct.new(DEFAULT_OPTIONS)
|
103
|
-
|
104
|
-
define_workflow_tasks
|
105
|
-
|
149
|
+
define_initialize_task
|
150
|
+
define_workflow_tasks
|
151
|
+
define_workflow_task_methods
|
152
|
+
define_clean_task
|
153
|
+
define_tasks
|
106
154
|
end
|
107
155
|
|
108
|
-
#
|
109
|
-
#
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
156
|
+
# Defines the <tt>:initialize</tt> task. The only other task
|
157
|
+
# hooked into <tt>:initialize</tt> is the
|
158
|
+
# <tt>:create_directories</tt> task which creates the workflow
|
159
|
+
# directories for this dataset.
|
160
|
+
def define_initialize_task
|
161
|
+
define_workflow_task({:create_directories => []}, "Creates workflow directories for this dataset.") do
|
162
|
+
workflow_dirs.each do |dir|
|
114
163
|
FileUtils.mkdir_p(path_to(dir)) unless File.exist?(path_to(dir))
|
115
164
|
end
|
116
165
|
end
|
166
|
+
define_workflow_task({ :initialize => [:create_directories] }, "Initialize this dataset.")
|
117
167
|
end
|
118
168
|
|
119
|
-
# Creates a task <tt>:
|
169
|
+
# Creates a task <tt>:clean</tt> which removes the dataset's
|
120
170
|
# workflow directories.
|
121
|
-
def
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
FileUtils.rm_rf(path_to(dir))
|
171
|
+
def define_clean_task
|
172
|
+
define_workflow_task :clean, "Remove the workflow directories for this dataset." do
|
173
|
+
workflow_dirs.each do |dir|
|
174
|
+
FileUtils.rm_rf(path_to(dir)) if File.exist?(path_to(dir))
|
126
175
|
end
|
127
176
|
end
|
128
177
|
end
|
129
178
|
|
130
|
-
# Creates the task dependency chain <tt>:package => :
|
131
|
-
# :parse => :
|
179
|
+
# Creates the task dependency chain <tt>:package => :fix =>
|
180
|
+
# :parse => :rip => :create_directories</tt> of the
|
132
181
|
# IMW::Workflow.
|
133
182
|
def define_workflow_tasks
|
134
183
|
define_workflow_task({:rip => [:create_directories]}, "Obtain data from some source." )
|
135
|
-
define_workflow_task({:
|
136
|
-
define_workflow_task({:
|
137
|
-
define_workflow_task({:
|
138
|
-
define_workflow_task({:package => [:munge]}, "Package dataset in final form." )
|
184
|
+
define_workflow_task({:parse => [:rip]}, "Parse data into a structured form." )
|
185
|
+
define_workflow_task({:fix => [:parse]}, "Munge parsed data into desired form." )
|
186
|
+
define_workflow_task({:package => [:fix]}, "Package dataset in final form." )
|
139
187
|
end
|
140
188
|
|
189
|
+
# Dynamically define methods for each of the workflow steps which
|
190
|
+
# act as shortcuts for accessing the corresponding tasks.
|
191
|
+
def define_workflow_task_methods
|
192
|
+
workflow_steps.each do |step|
|
193
|
+
self.class.class_eval <<RUBY
|
194
|
+
def #{step} deps, &block
|
195
|
+
self[step].enhance(step => deps, &block)
|
196
|
+
end
|
197
|
+
RUBY
|
198
|
+
end
|
199
|
+
end
|
141
200
|
end
|
142
201
|
end
|