imw 0.2.18 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +7 -26
- data/Gemfile.lock +13 -38
- data/{LICENSE → LICENSE.txt} +1 -1
- data/README.textile +35 -0
- data/Rakefile +45 -22
- data/VERSION +1 -1
- data/examples/foo.rb +19 -0
- data/examples/html_selector.rb +22 -0
- data/examples/nes_game_list.csv +625 -0
- data/examples/nes_gamespot.csv +1371 -0
- data/examples/nes_nintendo.csv +624 -0
- data/examples/nes_unlicensed.csv +89 -0
- data/examples/nes_wikipedia.csv +710 -0
- data/examples/nibbler_test.rb +24 -0
- data/examples/script.rb +19 -0
- data/lib/imw.rb +28 -140
- data/lib/imw/error.rb +9 -0
- data/lib/imw/recordizer.rb +8 -0
- data/lib/imw/recordizer/html_selector_recordizer.rb +86 -0
- data/lib/imw/recordizer/string_slice_recordizer.rb +39 -0
- data/lib/imw/resource.rb +3 -119
- data/lib/imw/serializer.rb +7 -0
- data/lib/imw/serializer/json_serializer.rb +17 -0
- data/lib/imw/uri.rb +41 -0
- data/spec/resource_spec.rb +78 -0
- data/spec/uri_spec.rb +55 -0
- metadata +81 -232
- data/README.rdoc +0 -371
- data/bin/imw +0 -5
- data/bin/tsv_to_json.rb +0 -29
- data/etc/imwrc.rb +0 -26
- data/examples/dataset.rb +0 -12
- data/examples/metadata.yml +0 -10
- data/lib/imw/archives.rb +0 -120
- data/lib/imw/archives/rar.rb +0 -19
- data/lib/imw/archives/tar.rb +0 -19
- data/lib/imw/archives/tarbz2.rb +0 -73
- data/lib/imw/archives/targz.rb +0 -73
- data/lib/imw/archives/zip.rb +0 -51
- data/lib/imw/boot.rb +0 -87
- data/lib/imw/compressed_files.rb +0 -94
- data/lib/imw/compressed_files/bz2.rb +0 -16
- data/lib/imw/compressed_files/compressible.rb +0 -75
- data/lib/imw/compressed_files/gz.rb +0 -16
- data/lib/imw/dataset.rb +0 -125
- data/lib/imw/dataset/paths.rb +0 -29
- data/lib/imw/dataset/workflow.rb +0 -195
- data/lib/imw/formats.rb +0 -33
- data/lib/imw/formats/delimited.rb +0 -170
- data/lib/imw/formats/excel.rb +0 -100
- data/lib/imw/formats/json.rb +0 -41
- data/lib/imw/formats/pdf.rb +0 -71
- data/lib/imw/formats/sgml.rb +0 -69
- data/lib/imw/formats/yaml.rb +0 -41
- data/lib/imw/metadata.rb +0 -83
- data/lib/imw/metadata/contains_metadata.rb +0 -54
- data/lib/imw/metadata/dsl.rb +0 -111
- data/lib/imw/metadata/field.rb +0 -37
- data/lib/imw/metadata/has_metadata.rb +0 -98
- data/lib/imw/metadata/has_summary.rb +0 -57
- data/lib/imw/metadata/schema.rb +0 -17
- data/lib/imw/parsers.rb +0 -8
- data/lib/imw/parsers/flat.rb +0 -44
- data/lib/imw/parsers/html_parser.rb +0 -387
- data/lib/imw/parsers/html_parser/matchers.rb +0 -289
- data/lib/imw/parsers/line_parser.rb +0 -87
- data/lib/imw/parsers/regexp_parser.rb +0 -72
- data/lib/imw/repository.rb +0 -12
- data/lib/imw/runner.rb +0 -118
- data/lib/imw/schemes.rb +0 -23
- data/lib/imw/schemes/ftp.rb +0 -142
- data/lib/imw/schemes/hdfs.rb +0 -251
- data/lib/imw/schemes/http.rb +0 -165
- data/lib/imw/schemes/local.rb +0 -409
- data/lib/imw/schemes/remote.rb +0 -119
- data/lib/imw/schemes/s3.rb +0 -143
- data/lib/imw/schemes/sql.rb +0 -129
- data/lib/imw/tools.rb +0 -12
- data/lib/imw/tools/aggregator.rb +0 -148
- data/lib/imw/tools/archiver.rb +0 -220
- data/lib/imw/tools/downloader.rb +0 -63
- data/lib/imw/tools/extension_analyzer.rb +0 -114
- data/lib/imw/tools/summarizer.rb +0 -83
- data/lib/imw/tools/transferer.rb +0 -167
- data/lib/imw/utils.rb +0 -74
- data/lib/imw/utils/dynamically_extendable.rb +0 -137
- data/lib/imw/utils/error.rb +0 -59
- data/lib/imw/utils/extensions/hpricot.rb +0 -34
- data/lib/imw/utils/has_uri.rb +0 -131
- data/lib/imw/utils/log.rb +0 -92
- data/lib/imw/utils/misc.rb +0 -57
- data/lib/imw/utils/paths.rb +0 -146
- data/lib/imw/utils/uri.rb +0 -59
- data/lib/imw/utils/uuid.rb +0 -33
- data/lib/imw/utils/validate.rb +0 -38
- data/lib/imw/utils/version.rb +0 -11
- data/spec/data/formats/delimited/sample.csv +0 -131
- data/spec/data/formats/delimited/sample.tsv +0 -131
- data/spec/data/formats/delimited/with_schema/ace-hardware-locations.tsv +0 -11
- data/spec/data/formats/delimited/with_schema/all-countries-ip-address-to-geolocation-data.tsv +0 -16
- data/spec/data/formats/delimited/with_schema/complete-list-of-starbucks-locations.tsv +0 -11
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +0 -22
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +0 -22
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-counts.tsv +0 -12
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +0 -13
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +0 -22
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +0 -22
- data/spec/data/formats/delimited/without_schema/ace-hardware-locations.tsv +0 -10
- data/spec/data/formats/delimited/without_schema/all-countries-ip-address-to-geolocation-data.tsv +0 -15
- data/spec/data/formats/delimited/without_schema/complete-list-of-starbucks-locations.tsv +0 -10
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +0 -21
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +0 -21
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-counts.tsv +0 -11
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +0 -12
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +0 -21
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +0 -21
- data/spec/data/formats/excel/sample.xls +0 -0
- data/spec/data/formats/json/sample.json +0 -1
- data/spec/data/formats/none/sample +0 -650
- data/spec/data/formats/sgml/sample.xml +0 -617
- data/spec/data/formats/text/sample.txt +0 -650
- data/spec/data/formats/yaml/sample.yaml +0 -410
- data/spec/data/schema-tabular.yaml +0 -11
- data/spec/imw/archives/rar_spec.rb +0 -16
- data/spec/imw/archives/tar_spec.rb +0 -16
- data/spec/imw/archives/tarbz2_spec.rb +0 -24
- data/spec/imw/archives/targz_spec.rb +0 -21
- data/spec/imw/archives/zip_spec.rb +0 -16
- data/spec/imw/archives_spec.rb +0 -77
- data/spec/imw/compressed_files/bz2_spec.rb +0 -15
- data/spec/imw/compressed_files/compressible_spec.rb +0 -36
- data/spec/imw/compressed_files/gz_spec.rb +0 -15
- data/spec/imw/compressed_files_spec.rb +0 -47
- data/spec/imw/dataset/paths_spec.rb +0 -32
- data/spec/imw/dataset/workflow_spec.rb +0 -41
- data/spec/imw/formats/delimited_spec.rb +0 -44
- data/spec/imw/formats/excel_spec.rb +0 -55
- data/spec/imw/formats/json_spec.rb +0 -18
- data/spec/imw/formats/sgml_spec.rb +0 -24
- data/spec/imw/formats/yaml_spec.rb +0 -19
- data/spec/imw/metadata/contains_metadata_spec.rb +0 -56
- data/spec/imw/metadata/field_spec.rb +0 -25
- data/spec/imw/metadata/has_metadata_spec.rb +0 -58
- data/spec/imw/metadata/has_summary_spec.rb +0 -32
- data/spec/imw/metadata/schema_spec.rb +0 -24
- data/spec/imw/metadata_spec.rb +0 -86
- data/spec/imw/parsers/line_parser_spec.rb +0 -96
- data/spec/imw/parsers/regexp_parser_spec.rb +0 -42
- data/spec/imw/resource_spec.rb +0 -32
- data/spec/imw/schemes/hdfs_spec.rb +0 -67
- data/spec/imw/schemes/http_spec.rb +0 -19
- data/spec/imw/schemes/local_spec.rb +0 -165
- data/spec/imw/schemes/remote_spec.rb +0 -38
- data/spec/imw/schemes/s3_spec.rb +0 -31
- data/spec/imw/schemes/sql_spec.rb +0 -3
- data/spec/imw/tools/aggregator_spec.rb +0 -71
- data/spec/imw/tools/archiver_spec.rb +0 -120
- data/spec/imw/tools/extension_analyzer_spec.rb +0 -153
- data/spec/imw/tools/summarizer_spec.rb +0 -8
- data/spec/imw/tools/transferer_spec.rb +0 -195
- data/spec/imw/utils/dynamically_extendable_spec.rb +0 -69
- data/spec/imw/utils/has_uri_spec.rb +0 -61
- data/spec/imw/utils/paths_spec.rb +0 -10
- data/spec/imw/utils/shared_paths_spec.rb +0 -29
- data/spec/imw_spec.rb +0 -14
- data/spec/rcov.opts +0 -1
- data/spec/spec_helper.rb +0 -31
- data/spec/support/custom_matchers.rb +0 -28
- data/spec/support/file_contents_matcher.rb +0 -30
- data/spec/support/paths_matcher.rb +0 -66
- data/spec/support/random.rb +0 -213
- data/spec/support/without_regard_to_order_matcher.rb +0 -41
data/lib/imw/compressed_files.rb
DELETED
@@ -1,94 +0,0 @@
|
|
1
|
-
module IMW
|
2
|
-
|
3
|
-
# Contains modules which define the behavior of compressed files.
|
4
|
-
module CompressedFiles
|
5
|
-
autoload :Bz2, 'imw/compressed_files/bz2'
|
6
|
-
autoload :Gz, 'imw/compressed_files/gz'
|
7
|
-
autoload :Compressible, 'imw/compressed_files/compressible'
|
8
|
-
|
9
|
-
# Handlers which include modules for compressed file formats as
|
10
|
-
# well as the IMW::CompressedFiles::Compressible module for
|
11
|
-
# compressing regular files.
|
12
|
-
HANDLERS = [
|
13
|
-
["CompressedFiles::Compressible", Proc.new { |r| r.is_local? && r.is_file? && r.path != /\.(bz2|gz|tgz|tbz2)$/i } ],
|
14
|
-
["CompressedFiles::Gz", Proc.new { |r| r.is_local? && r.path =~ /\.gz$/i && r.path !~ /\.tar\.gz$/i && r.path !~ /\.tgz$/i } ],
|
15
|
-
["CompressedFiles::Bz2", Proc.new { |r| r.is_local? && r.path =~ /\.bz2$/i && r.path !~ /\.tar\.bz2$/i && r.path !~ /\.tbz2$/i } ]
|
16
|
-
]
|
17
|
-
|
18
|
-
# Defines methods for decompressing a compressed file. This
|
19
|
-
# module isn't used to directly extend an IMW::Resource --
|
20
|
-
# instead, format-specific modules (e.g.
|
21
|
-
# IMW::CompressedFiles::Bz2) include this module and
|
22
|
-
# further define the command-line flags &c. needed to make
|
23
|
-
# everything work.
|
24
|
-
module Base
|
25
|
-
|
26
|
-
attr_accessor :compression_settings
|
27
|
-
|
28
|
-
# Is this file compressed?
|
29
|
-
#
|
30
|
-
# @return [true, false]
|
31
|
-
def is_compressed?
|
32
|
-
true
|
33
|
-
end
|
34
|
-
|
35
|
-
# Can this file be compressed?
|
36
|
-
#
|
37
|
-
# @return [true, false]
|
38
|
-
def is_compressible?
|
39
|
-
false
|
40
|
-
end
|
41
|
-
|
42
|
-
# The basename of this resource after it is decompressed
|
43
|
-
#
|
44
|
-
# IMW::Resource.new('/path/to/my_file.txt.bz2').decompressed_basename
|
45
|
-
# => 'my_file.txt'
|
46
|
-
#
|
47
|
-
# @return [String] the decompressed basename
|
48
|
-
def decompressed_basename
|
49
|
-
basename[0..-(extname.size + 1)]
|
50
|
-
end
|
51
|
-
|
52
|
-
# The path of this resource after it is decompressed
|
53
|
-
#
|
54
|
-
# IMW::Resource.new('/path/to/my_file.txt.bz2').decompressed_path
|
55
|
-
# => '/path/to/my_file.txt'
|
56
|
-
#
|
57
|
-
# @return [String] the decompressed path
|
58
|
-
def decompressed_path
|
59
|
-
File.join(dirname, decompressed_basename)
|
60
|
-
end
|
61
|
-
|
62
|
-
# Decompress this file in its present directory overwriting any
|
63
|
-
# existing files and without saving the original compressed
|
64
|
-
# file.
|
65
|
-
#
|
66
|
-
# @return [IMW::Resource] the decompressed resource
|
67
|
-
def decompress!
|
68
|
-
should_exist!("Cannot decompress.")
|
69
|
-
program = compression_settings[:decompression_program] || compression_settings[:program]
|
70
|
-
FileUtils.cd(dirname) { IMW.system(program, compression_settings[:decompress], path) }
|
71
|
-
IMW.open(decompressed_path)
|
72
|
-
end
|
73
|
-
|
74
|
-
# Decompress this file in its present directory, overwriting any
|
75
|
-
# existing files while keeping the original compressed file.
|
76
|
-
#
|
77
|
-
# FIXME The implementation is a little stupid as the file is
|
78
|
-
# needlessly copied.
|
79
|
-
#
|
80
|
-
# @return [IMW::Resource] the decompressed resource
|
81
|
-
def decompress
|
82
|
-
should_exist!("Cannot decompress.")
|
83
|
-
begin
|
84
|
-
copy = cp(path + '.imw_copy')
|
85
|
-
regular_file = decompress!
|
86
|
-
copy.mv(path)
|
87
|
-
regular_file
|
88
|
-
ensure
|
89
|
-
copy.mv(path) if copy && copy.exist?
|
90
|
-
end
|
91
|
-
end
|
92
|
-
end
|
93
|
-
end
|
94
|
-
end
|
@@ -1,75 +0,0 @@
|
|
1
|
-
module IMW
|
2
|
-
|
3
|
-
# Default settings used when compressing files. <tt>:program</tt>
|
4
|
-
# defines the name of the command-line program to use,
|
5
|
-
# <tt>:compress</tt> gives the flags to use when compressing, and
|
6
|
-
# <tt>:extension</tt> gives the extension (_without_ the `.') added
|
7
|
-
# by the program after compressing.
|
8
|
-
COMPRESSION_SETTINGS = {
|
9
|
-
:program => 'bzip2',
|
10
|
-
:compress => '',
|
11
|
-
:extension => 'bz2'
|
12
|
-
} unless defined?(COMPRESSION_SETTINGS)
|
13
|
-
|
14
|
-
module CompressedFiles
|
15
|
-
|
16
|
-
# Defines methods for compressing a file. The default compression
|
17
|
-
# program is defined in IMW::COMPRESSION_SETTINGS though a
|
18
|
-
# particular resource can change the values in its
|
19
|
-
# +compression_settings+ hash.
|
20
|
-
module Compressible
|
21
|
-
|
22
|
-
# Compression settings.
|
23
|
-
attr_accessor :compression_settings
|
24
|
-
|
25
|
-
# Is this file compressible?
|
26
|
-
#
|
27
|
-
# @return [true]
|
28
|
-
def is_compressible?
|
29
|
-
true
|
30
|
-
end
|
31
|
-
|
32
|
-
# Defines the compression settings used for this
|
33
|
-
# resource. <tt>:program</tt> defines the name of the
|
34
|
-
# command-line program to use, <tt>:compress</tt> gives the
|
35
|
-
# flags to use when compressing, and <tt>:extension</tt> gives
|
36
|
-
# the extension (_without_ the `.') added by the program after
|
37
|
-
# compressing.
|
38
|
-
#
|
39
|
-
# @return [Hash]
|
40
|
-
def compression_settings
|
41
|
-
@compression_settings ||= COMPRESSION_SETTINGS
|
42
|
-
end
|
43
|
-
|
44
|
-
# Compress this resource in place, overwriting it.
|
45
|
-
#
|
46
|
-
# This resource's +compression_settings+ method is used to
|
47
|
-
# determine the method of compression.
|
48
|
-
#
|
49
|
-
# @return [IMW::Resource] the compressed file
|
50
|
-
def compress!
|
51
|
-
should_exist!("Cannot compress.")
|
52
|
-
IMW.system(*[compression_settings[:program], compression_settings[:compress], path])
|
53
|
-
IMW.open(File.join(dirname,basename + "." + compression_settings[:extension]))
|
54
|
-
end
|
55
|
-
|
56
|
-
# Compress this resource without overwriting it.
|
57
|
-
#
|
58
|
-
# FIXME The implementation is a little stupid as the file is
|
59
|
-
# needlessly copied.
|
60
|
-
#
|
61
|
-
# @return [IMW::Resource] the compressed file
|
62
|
-
def compress options={}
|
63
|
-
should_exist!("Cannot compress.")
|
64
|
-
begin
|
65
|
-
copy = cp(path + '.imw_copy')
|
66
|
-
compressed_file = compress!
|
67
|
-
copy.mv(path)
|
68
|
-
compressed_file
|
69
|
-
ensure
|
70
|
-
copy.mv(path) if copy.exist?
|
71
|
-
end
|
72
|
-
end
|
73
|
-
end
|
74
|
-
end
|
75
|
-
end
|
data/lib/imw/dataset.rb
DELETED
@@ -1,125 +0,0 @@
|
|
1
|
-
require 'imw/dataset/workflow'
|
2
|
-
require 'imw/dataset/paths'
|
3
|
-
|
4
|
-
module IMW
|
5
|
-
|
6
|
-
# The IMW::Dataset represents a common object in which paths, data
|
7
|
-
# resources, and various tasks can be intermingled to define a
|
8
|
-
# complex transformation of data.
|
9
|
-
#
|
10
|
-
# == Organizing Paths
|
11
|
-
#
|
12
|
-
# IMW encourages you to work within the following directory
|
13
|
-
# structure for a dataset +my_dataset+:
|
14
|
-
#
|
15
|
-
# my_dataset/
|
16
|
-
# |-- my_dataset.rb
|
17
|
-
# |-- ripd
|
18
|
-
# | `-- ...
|
19
|
-
# |-- rawd
|
20
|
-
# | `-- ...
|
21
|
-
# |-- fixd
|
22
|
-
# | `-- ...
|
23
|
-
# `-- pkgd
|
24
|
-
# `-- ...
|
25
|
-
#
|
26
|
-
# Just like IMW itself, a dataset can manage a collection of paths.
|
27
|
-
# If <tt>my_dataset.rb</tt> defines a dataset:
|
28
|
-
#
|
29
|
-
# # my_dataset/my_dataset.rb
|
30
|
-
# dataset = IMW::Dataset.new(:my_dataset)
|
31
|
-
#
|
32
|
-
# then the following paths will be defined:
|
33
|
-
#
|
34
|
-
# dataset.path_to(:root) #=> my_dataset
|
35
|
-
# dataset.path_to(:script) #=> my_dataset/my_dataset.rb
|
36
|
-
# dataset.path_to(:ripd) #=> my_dataset/ripd
|
37
|
-
# dataset.path_to(:rawd) #=> my_dataset/rawd
|
38
|
-
# dataset.path_to(:fixd) #=> my_dataset/fixd
|
39
|
-
# dataset.path_to(:pkgd) #=> my_dataset/pkgd
|
40
|
-
#
|
41
|
-
# Just like IMW itself, the +dataset+ supports adding path
|
42
|
-
# references
|
43
|
-
#
|
44
|
-
# dataset.add_path(:raw_data, :ripd, 'raw_data.xml')
|
45
|
-
# dataset.path_to(:raw_data) #=> my_dataset/ripd/raw_data.xml
|
46
|
-
#
|
47
|
-
# as well as removing them (via <tt>dataset.remove_path</tt>).
|
48
|
-
#
|
49
|
-
# A subclass of IMW::Dataset can customize these paths by overriding
|
50
|
-
# IMW::Dataset#set_default_paths as well as define new ones by
|
51
|
-
# overriding IMW::Dataset#set_paths.
|
52
|
-
#
|
53
|
-
# Setting paths can be skipped altogether by passing the
|
54
|
-
# <tt>:skip_paths</tt> option when instantiating a dataset:
|
55
|
-
#
|
56
|
-
# dataset = IMW::Dataset.new :my_dataset, :skip_paths => true
|
57
|
-
#
|
58
|
-
# == Utilizing Tasks
|
59
|
-
#
|
60
|
-
# An IMW::Dataset utilizes Rake to manage tasks needed to transform
|
61
|
-
# data. See IMW::Workflow for a description of the pre-defined
|
62
|
-
# tasks (+rip+, +parse+, +fix+, +package+).
|
63
|
-
#
|
64
|
-
# New tasks can be defined
|
65
|
-
#
|
66
|
-
# dataset.task :get_authorization do
|
67
|
-
# # ... get an authorization token
|
68
|
-
# end
|
69
|
-
#
|
70
|
-
# and hooked into the default tasks in the usual Rake manner
|
71
|
-
#
|
72
|
-
# dataset.task :rip => [:get_authorization]
|
73
|
-
#
|
74
|
-
# A dataset also has methods for the workflow step tasks to make
|
75
|
-
# this easier
|
76
|
-
#
|
77
|
-
# dataset.rip [:get_authorization]
|
78
|
-
#
|
79
|
-
# Tasks for a dataset can be accessed and invoked as follows
|
80
|
-
#
|
81
|
-
# dataset[:rip].invoke
|
82
|
-
#
|
83
|
-
# as well as by using the command line +imw+ tool.
|
84
|
-
#
|
85
|
-
# Defining tasks can be skipped altogether by passing the
|
86
|
-
# <tt>:skip_workflow</tt> option when instantiating a dataset
|
87
|
-
#
|
88
|
-
# dataset = IMW::Dataset.new :my_dataset, :skip_workflow => true
|
89
|
-
#
|
90
|
-
# == Working with Repositories
|
91
|
-
#
|
92
|
-
# A dataset can be added to a repository by passing the
|
93
|
-
# <tt>:repository</tt> option
|
94
|
-
#
|
95
|
-
# repo = IMW::Repository.new
|
96
|
-
# dataset = IMW::Dataset.new :my_dataset, :repository => repo
|
97
|
-
class Dataset
|
98
|
-
|
99
|
-
# The handle this dataset goes by. Used for identifying it within
|
100
|
-
# a repository.
|
101
|
-
attr_accessor :handle
|
102
|
-
|
103
|
-
# Options for this dataset.
|
104
|
-
attr_accessor :options
|
105
|
-
|
106
|
-
def initialize handle, options = {}
|
107
|
-
@options = options
|
108
|
-
@handle = handle
|
109
|
-
set_default_paths unless options[:skip_paths]
|
110
|
-
set_paths unless options[:skip_paths]
|
111
|
-
initialize_workflow unless options[:skip_workflow]
|
112
|
-
if options[:repository]
|
113
|
-
options[:repository][handle] = self
|
114
|
-
end
|
115
|
-
end
|
116
|
-
|
117
|
-
# Provides this dataset with a workflow of tasks managed by Rake.
|
118
|
-
include IMW::Workflow
|
119
|
-
|
120
|
-
# Provides this dataset with DSL like methods to construct a
|
121
|
-
# schema in an IMW file.
|
122
|
-
# include IMW::Metadata::DSL
|
123
|
-
|
124
|
-
end
|
125
|
-
end
|
data/lib/imw/dataset/paths.rb
DELETED
@@ -1,29 +0,0 @@
|
|
1
|
-
module IMW
|
2
|
-
class Dataset
|
3
|
-
include IMW::Paths
|
4
|
-
|
5
|
-
protected
|
6
|
-
# Sets paths to the workflow directories for this dataset (+ripd+,
|
7
|
-
# +rawd+, +fixd+, +pkgd+) as well as the following paths:
|
8
|
-
#
|
9
|
-
# script::
|
10
|
-
# The path to the file the dataset was initialized in.
|
11
|
-
#
|
12
|
-
# root::
|
13
|
-
# The parent directory of the file the dataset was initialized
|
14
|
-
# in or the value of the <tt>:root</tt> key in
|
15
|
-
# IMW::Dataset#options
|
16
|
-
#
|
17
|
-
def set_default_paths
|
18
|
-
add_path :script, File.expand_path(eval('__FILE__'))
|
19
|
-
add_path :root, options[:root] || File.dirname(path_to(:script))
|
20
|
-
workflow_dirs.each do |dir|
|
21
|
-
add_path dir, :root, dir.to_s
|
22
|
-
end
|
23
|
-
end
|
24
|
-
|
25
|
-
# Overwrite this method to set additional paths for the dataset.
|
26
|
-
def set_paths
|
27
|
-
end
|
28
|
-
end
|
29
|
-
end
|
data/lib/imw/dataset/workflow.rb
DELETED
@@ -1,195 +0,0 @@
|
|
1
|
-
require 'ostruct'
|
2
|
-
require 'rake'
|
3
|
-
|
4
|
-
module IMW
|
5
|
-
|
6
|
-
# An IMW version of Rake::Task
|
7
|
-
Task = Class.new(Rake::Task)
|
8
|
-
|
9
|
-
# An IMW subclass of Rake::FileTask
|
10
|
-
FileTask = Class.new(Rake::FileTask)
|
11
|
-
|
12
|
-
# An IMW subclass of Rake::FileCreationTask
|
13
|
-
FileCreationTask = Class.new(Rake::FileCreationTask)
|
14
|
-
|
15
|
-
# IMW encourages you to view a data transformation as a series of
|
16
|
-
# interdependent steps.
|
17
|
-
#
|
18
|
-
# By default, IMW defines four main steps in such a transformation:
|
19
|
-
# +rip+, +parse+, +fix+, and +package+.
|
20
|
-
#
|
21
|
-
# Each step is associated with a directory on disk in which it keeps
|
22
|
-
# its files: +ripd+, +rawd+, +fixd+, and +pkgd+.
|
23
|
-
#
|
24
|
-
# The steps are:
|
25
|
-
#
|
26
|
-
# rip::
|
27
|
-
# Obtain data via HTTP, FTP, SCP, RSYNC, database query, &c and
|
28
|
-
# store the results in +ripd+.
|
29
|
-
#
|
30
|
-
# parse::
|
31
|
-
# Parse data into a structured form using a library (JSON, YAML,
|
32
|
-
# &c.) or using your own parser (XML, flat files, &c.) and store
|
33
|
-
# the results in +rawd+.
|
34
|
-
#
|
35
|
-
# fix::
|
36
|
-
# Combine, filter, reconcile, and transform already structured
|
37
|
-
# data into a desired form and store the results in +fixd+.
|
38
|
-
#
|
39
|
-
# package::
|
40
|
-
# Archive, compress, and deliver data in its final form to some
|
41
|
-
# location (HTTP, FTP, SCP, RSYNC, S3, EBS, &c.), optionally
|
42
|
-
# storing the output in +pkgd+.
|
43
|
-
#
|
44
|
-
# Each step depends upon the one before it. The steps are blank by
|
45
|
-
# default so there's no need to write code for steps you don't need
|
46
|
-
# to use. You can also define your own steps (using +task+ just
|
47
|
-
# like in Rake) and hook them into these pre-defined steps (or
|
48
|
-
# not...).
|
49
|
-
#
|
50
|
-
# A dataset also has an <tt>:initialize</tt> task (which by default
|
51
|
-
# just creates the directories for these steps) which you can use to
|
52
|
-
# hook in your own initialization tasks by making it depend on them.
|
53
|
-
#
|
54
|
-
# A subclass of IMW::Dataset can customize how tasks are defined by
|
55
|
-
# overriding +define_workflow_tasks+, among other methods, and
|
56
|
-
# introduce new tasks by overriding +define_tasks+.
|
57
|
-
module Workflow
|
58
|
-
|
59
|
-
include Rake::TaskManager
|
60
|
-
# Default options passed to <tt>Rake</tt>. Any class including
|
61
|
-
# the <tt>Rake::TaskManager</tt> module must define a constant by
|
62
|
-
# this name.
|
63
|
-
DEFAULT_OPTIONS = {
|
64
|
-
:dry_run => false,
|
65
|
-
:trace => false,
|
66
|
-
:verbose => false
|
67
|
-
}
|
68
|
-
|
69
|
-
# Return a new (or existing) <tt>IMW::Task</tt> with the given
|
70
|
-
# +name+. Dependencies can be declared and a block passed in just
|
71
|
-
# as in Rake.
|
72
|
-
#
|
73
|
-
# @param [Hash, Symbol, String] deps the name of the task (if a
|
74
|
-
# Symbol or String) or the name of the task mapped to an Array of
|
75
|
-
# dependencies (if a Hash)
|
76
|
-
#
|
77
|
-
# @return [IMW::Task] the task
|
78
|
-
def task deps, &block
|
79
|
-
self.define_task IMW::Task, deps, &block
|
80
|
-
end
|
81
|
-
|
82
|
-
# Return a new (or existing) <tt>IMW::FileTask</tt> with the given
|
83
|
-
# +path+. Dependencies can be declared and a block passed in just
|
84
|
-
# as in Rake.
|
85
|
-
#
|
86
|
-
# @param [String, IMW::Resource] path the path to the file
|
87
|
-
# @return [IMW::FileTask] the task
|
88
|
-
def file path, &block
|
89
|
-
path = path.respond_to?(:path) ? path.path : path
|
90
|
-
self.define_task IMW::FileTask, path, &block
|
91
|
-
end
|
92
|
-
|
93
|
-
# Return a new (or existing) <tt>IMW::FileCreationTask</tt> with the given
|
94
|
-
# +path+. Dependencies can be declared and a block passed in just
|
95
|
-
# as in Rake.
|
96
|
-
#
|
97
|
-
# @param [String, IMW::Resource] path the path to the file
|
98
|
-
# @return [IMW::FileCreationTask] the task
|
99
|
-
def file_create path, &block
|
100
|
-
path = path.respond_to?(:path) ? path.path : path
|
101
|
-
self.define_task IMW::FileCreationTask, path, &block
|
102
|
-
end
|
103
|
-
|
104
|
-
# Override this method to define default tasks for a subclass of
|
105
|
-
# IMW::Dataset.
|
106
|
-
def define_tasks
|
107
|
-
end
|
108
|
-
|
109
|
-
# The standard IMW workflow steps.
|
110
|
-
#
|
111
|
-
# @return [Array] the workflow step names
|
112
|
-
def workflow_steps
|
113
|
-
[:rip, :parse, :fix, :package]
|
114
|
-
end
|
115
|
-
|
116
|
-
# The steps of the IMW workflow each correspond to a directory in
|
117
|
-
# which it is customary that they deposit their files <em>once
|
118
|
-
# they are finished processing</em> (so ripped files wind up in
|
119
|
-
# the +ripd+ directory, packaged files in the +pkgd+ directory,
|
120
|
-
# and so on).
|
121
|
-
#
|
122
|
-
# @return [Array] the workflow directory names
|
123
|
-
def workflow_dirs
|
124
|
-
[:ripd, :rawd, :fixd, :pkgd]
|
125
|
-
end
|
126
|
-
|
127
|
-
protected
|
128
|
-
|
129
|
-
# Convenience method for defining tasks for this workflow.
|
130
|
-
#
|
131
|
-
# @param [Hash, Symbol, String] deps the name of the task (if a
|
132
|
-
# Symbol or String) or the name of the task mapped to an Array of
|
133
|
-
# dependencies (if a Hash)
|
134
|
-
# @param [String] comment the comment to associate to the task
|
135
|
-
# @return [IMW::Task] the task
|
136
|
-
def define_workflow_task deps, comment, &block
|
137
|
-
@last_description = comment
|
138
|
-
define_task(IMW::Task, deps, &block)
|
139
|
-
end
|
140
|
-
|
141
|
-
# Create all the instance variables required by Rake::TaskManager
|
142
|
-
# and define default tasks for this dataset.
|
143
|
-
def initialize_workflow
|
144
|
-
@tasks = Hash.new
|
145
|
-
@rules = Array.new
|
146
|
-
@scope = Array.new
|
147
|
-
@last_description = nil
|
148
|
-
@options = OpenStruct.new(DEFAULT_OPTIONS)
|
149
|
-
define_initialize_task
|
150
|
-
define_workflow_tasks
|
151
|
-
define_clean_task
|
152
|
-
define_tasks
|
153
|
-
end
|
154
|
-
|
155
|
-
# Defines the <tt>:initialize</tt> task. The only other task
|
156
|
-
# hooked into <tt>:initialize</tt> is the
|
157
|
-
# <tt>:create_directories</tt> task which creates the workflow
|
158
|
-
# directories for this dataset.
|
159
|
-
def define_initialize_task
|
160
|
-
define_workflow_task({:create_directories => []}, "Creates workflow directories for this dataset.") do
|
161
|
-
workflow_dirs.each do |dir|
|
162
|
-
FileUtils.mkdir_p(path_to(dir)) unless File.exist?(path_to(dir))
|
163
|
-
end
|
164
|
-
end
|
165
|
-
define_workflow_task({ :initialize => [:create_directories] }, "Initialize this dataset.")
|
166
|
-
end
|
167
|
-
|
168
|
-
# Creates a task <tt>:clean</tt> which removes dataset's
|
169
|
-
# workflow directories.
|
170
|
-
def define_clean_task
|
171
|
-
define_workflow_task :clean, "Remove the workflow directories for this dataset." do
|
172
|
-
workflow_dirs.each do |dir|
|
173
|
-
FileUtils.rm_rf(path_to(dir)) if File.exist?(path_to(dir))
|
174
|
-
end
|
175
|
-
end
|
176
|
-
end
|
177
|
-
|
178
|
-
# Creates the task dependency chain <tt>:package => :fix =>
|
179
|
-
# :parse => :rip => :create_directories</tt> of the
|
180
|
-
# IMW::Workflow.
|
181
|
-
def define_workflow_tasks
|
182
|
-
define_workflow_task({:rip => [:create_directories]}, "Obtain data from some source." )
|
183
|
-
define_workflow_task({:parse => [:rip]}, "Parse data into a structured form." )
|
184
|
-
define_workflow_task({:fix => [:parse]}, "Munge parsed data into desired form." )
|
185
|
-
define_workflow_task({:package => [:fix]}, "Package dataset in final form." )
|
186
|
-
end
|
187
|
-
|
188
|
-
|
189
|
-
def rip(deps=nil, &block); self[:rip].enhance(deps, &block); end
|
190
|
-
def parse(deps=nil, &block); self[:parse].enhance(deps, &block); end
|
191
|
-
def fix(deps=nil, &block); self[:fix].enhance(deps, &block); end
|
192
|
-
def package(deps=nil, &block); self[:package].enhance(deps, &block); end
|
193
|
-
|
194
|
-
end
|
195
|
-
end
|