imw 0.2.18 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +7 -26
- data/Gemfile.lock +13 -38
- data/{LICENSE → LICENSE.txt} +1 -1
- data/README.textile +35 -0
- data/Rakefile +45 -22
- data/VERSION +1 -1
- data/examples/foo.rb +19 -0
- data/examples/html_selector.rb +22 -0
- data/examples/nes_game_list.csv +625 -0
- data/examples/nes_gamespot.csv +1371 -0
- data/examples/nes_nintendo.csv +624 -0
- data/examples/nes_unlicensed.csv +89 -0
- data/examples/nes_wikipedia.csv +710 -0
- data/examples/nibbler_test.rb +24 -0
- data/examples/script.rb +19 -0
- data/lib/imw.rb +28 -140
- data/lib/imw/error.rb +9 -0
- data/lib/imw/recordizer.rb +8 -0
- data/lib/imw/recordizer/html_selector_recordizer.rb +86 -0
- data/lib/imw/recordizer/string_slice_recordizer.rb +39 -0
- data/lib/imw/resource.rb +3 -119
- data/lib/imw/serializer.rb +7 -0
- data/lib/imw/serializer/json_serializer.rb +17 -0
- data/lib/imw/uri.rb +41 -0
- data/spec/resource_spec.rb +78 -0
- data/spec/uri_spec.rb +55 -0
- metadata +81 -232
- data/README.rdoc +0 -371
- data/bin/imw +0 -5
- data/bin/tsv_to_json.rb +0 -29
- data/etc/imwrc.rb +0 -26
- data/examples/dataset.rb +0 -12
- data/examples/metadata.yml +0 -10
- data/lib/imw/archives.rb +0 -120
- data/lib/imw/archives/rar.rb +0 -19
- data/lib/imw/archives/tar.rb +0 -19
- data/lib/imw/archives/tarbz2.rb +0 -73
- data/lib/imw/archives/targz.rb +0 -73
- data/lib/imw/archives/zip.rb +0 -51
- data/lib/imw/boot.rb +0 -87
- data/lib/imw/compressed_files.rb +0 -94
- data/lib/imw/compressed_files/bz2.rb +0 -16
- data/lib/imw/compressed_files/compressible.rb +0 -75
- data/lib/imw/compressed_files/gz.rb +0 -16
- data/lib/imw/dataset.rb +0 -125
- data/lib/imw/dataset/paths.rb +0 -29
- data/lib/imw/dataset/workflow.rb +0 -195
- data/lib/imw/formats.rb +0 -33
- data/lib/imw/formats/delimited.rb +0 -170
- data/lib/imw/formats/excel.rb +0 -100
- data/lib/imw/formats/json.rb +0 -41
- data/lib/imw/formats/pdf.rb +0 -71
- data/lib/imw/formats/sgml.rb +0 -69
- data/lib/imw/formats/yaml.rb +0 -41
- data/lib/imw/metadata.rb +0 -83
- data/lib/imw/metadata/contains_metadata.rb +0 -54
- data/lib/imw/metadata/dsl.rb +0 -111
- data/lib/imw/metadata/field.rb +0 -37
- data/lib/imw/metadata/has_metadata.rb +0 -98
- data/lib/imw/metadata/has_summary.rb +0 -57
- data/lib/imw/metadata/schema.rb +0 -17
- data/lib/imw/parsers.rb +0 -8
- data/lib/imw/parsers/flat.rb +0 -44
- data/lib/imw/parsers/html_parser.rb +0 -387
- data/lib/imw/parsers/html_parser/matchers.rb +0 -289
- data/lib/imw/parsers/line_parser.rb +0 -87
- data/lib/imw/parsers/regexp_parser.rb +0 -72
- data/lib/imw/repository.rb +0 -12
- data/lib/imw/runner.rb +0 -118
- data/lib/imw/schemes.rb +0 -23
- data/lib/imw/schemes/ftp.rb +0 -142
- data/lib/imw/schemes/hdfs.rb +0 -251
- data/lib/imw/schemes/http.rb +0 -165
- data/lib/imw/schemes/local.rb +0 -409
- data/lib/imw/schemes/remote.rb +0 -119
- data/lib/imw/schemes/s3.rb +0 -143
- data/lib/imw/schemes/sql.rb +0 -129
- data/lib/imw/tools.rb +0 -12
- data/lib/imw/tools/aggregator.rb +0 -148
- data/lib/imw/tools/archiver.rb +0 -220
- data/lib/imw/tools/downloader.rb +0 -63
- data/lib/imw/tools/extension_analyzer.rb +0 -114
- data/lib/imw/tools/summarizer.rb +0 -83
- data/lib/imw/tools/transferer.rb +0 -167
- data/lib/imw/utils.rb +0 -74
- data/lib/imw/utils/dynamically_extendable.rb +0 -137
- data/lib/imw/utils/error.rb +0 -59
- data/lib/imw/utils/extensions/hpricot.rb +0 -34
- data/lib/imw/utils/has_uri.rb +0 -131
- data/lib/imw/utils/log.rb +0 -92
- data/lib/imw/utils/misc.rb +0 -57
- data/lib/imw/utils/paths.rb +0 -146
- data/lib/imw/utils/uri.rb +0 -59
- data/lib/imw/utils/uuid.rb +0 -33
- data/lib/imw/utils/validate.rb +0 -38
- data/lib/imw/utils/version.rb +0 -11
- data/spec/data/formats/delimited/sample.csv +0 -131
- data/spec/data/formats/delimited/sample.tsv +0 -131
- data/spec/data/formats/delimited/with_schema/ace-hardware-locations.tsv +0 -11
- data/spec/data/formats/delimited/with_schema/all-countries-ip-address-to-geolocation-data.tsv +0 -16
- data/spec/data/formats/delimited/with_schema/complete-list-of-starbucks-locations.tsv +0 -11
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +0 -22
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +0 -22
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-counts.tsv +0 -12
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +0 -13
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +0 -22
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +0 -22
- data/spec/data/formats/delimited/without_schema/ace-hardware-locations.tsv +0 -10
- data/spec/data/formats/delimited/without_schema/all-countries-ip-address-to-geolocation-data.tsv +0 -15
- data/spec/data/formats/delimited/without_schema/complete-list-of-starbucks-locations.tsv +0 -10
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +0 -21
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +0 -21
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-counts.tsv +0 -11
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +0 -12
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +0 -21
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +0 -21
- data/spec/data/formats/excel/sample.xls +0 -0
- data/spec/data/formats/json/sample.json +0 -1
- data/spec/data/formats/none/sample +0 -650
- data/spec/data/formats/sgml/sample.xml +0 -617
- data/spec/data/formats/text/sample.txt +0 -650
- data/spec/data/formats/yaml/sample.yaml +0 -410
- data/spec/data/schema-tabular.yaml +0 -11
- data/spec/imw/archives/rar_spec.rb +0 -16
- data/spec/imw/archives/tar_spec.rb +0 -16
- data/spec/imw/archives/tarbz2_spec.rb +0 -24
- data/spec/imw/archives/targz_spec.rb +0 -21
- data/spec/imw/archives/zip_spec.rb +0 -16
- data/spec/imw/archives_spec.rb +0 -77
- data/spec/imw/compressed_files/bz2_spec.rb +0 -15
- data/spec/imw/compressed_files/compressible_spec.rb +0 -36
- data/spec/imw/compressed_files/gz_spec.rb +0 -15
- data/spec/imw/compressed_files_spec.rb +0 -47
- data/spec/imw/dataset/paths_spec.rb +0 -32
- data/spec/imw/dataset/workflow_spec.rb +0 -41
- data/spec/imw/formats/delimited_spec.rb +0 -44
- data/spec/imw/formats/excel_spec.rb +0 -55
- data/spec/imw/formats/json_spec.rb +0 -18
- data/spec/imw/formats/sgml_spec.rb +0 -24
- data/spec/imw/formats/yaml_spec.rb +0 -19
- data/spec/imw/metadata/contains_metadata_spec.rb +0 -56
- data/spec/imw/metadata/field_spec.rb +0 -25
- data/spec/imw/metadata/has_metadata_spec.rb +0 -58
- data/spec/imw/metadata/has_summary_spec.rb +0 -32
- data/spec/imw/metadata/schema_spec.rb +0 -24
- data/spec/imw/metadata_spec.rb +0 -86
- data/spec/imw/parsers/line_parser_spec.rb +0 -96
- data/spec/imw/parsers/regexp_parser_spec.rb +0 -42
- data/spec/imw/resource_spec.rb +0 -32
- data/spec/imw/schemes/hdfs_spec.rb +0 -67
- data/spec/imw/schemes/http_spec.rb +0 -19
- data/spec/imw/schemes/local_spec.rb +0 -165
- data/spec/imw/schemes/remote_spec.rb +0 -38
- data/spec/imw/schemes/s3_spec.rb +0 -31
- data/spec/imw/schemes/sql_spec.rb +0 -3
- data/spec/imw/tools/aggregator_spec.rb +0 -71
- data/spec/imw/tools/archiver_spec.rb +0 -120
- data/spec/imw/tools/extension_analyzer_spec.rb +0 -153
- data/spec/imw/tools/summarizer_spec.rb +0 -8
- data/spec/imw/tools/transferer_spec.rb +0 -195
- data/spec/imw/utils/dynamically_extendable_spec.rb +0 -69
- data/spec/imw/utils/has_uri_spec.rb +0 -61
- data/spec/imw/utils/paths_spec.rb +0 -10
- data/spec/imw/utils/shared_paths_spec.rb +0 -29
- data/spec/imw_spec.rb +0 -14
- data/spec/rcov.opts +0 -1
- data/spec/spec_helper.rb +0 -31
- data/spec/support/custom_matchers.rb +0 -28
- data/spec/support/file_contents_matcher.rb +0 -30
- data/spec/support/paths_matcher.rb +0 -66
- data/spec/support/random.rb +0 -213
- data/spec/support/without_regard_to_order_matcher.rb +0 -41
data/lib/imw/compressed_files.rb
DELETED
|
@@ -1,94 +0,0 @@
|
|
|
1
|
-
module IMW
|
|
2
|
-
|
|
3
|
-
# Contains modules which define the behavior of compressed files.
|
|
4
|
-
module CompressedFiles
|
|
5
|
-
autoload :Bz2, 'imw/compressed_files/bz2'
|
|
6
|
-
autoload :Gz, 'imw/compressed_files/gz'
|
|
7
|
-
autoload :Compressible, 'imw/compressed_files/compressible'
|
|
8
|
-
|
|
9
|
-
# Handlers which include modules for compressed file formats as
|
|
10
|
-
# well as the IMW::CompressedFiles::Compressible module for
|
|
11
|
-
# compressing regular files.
|
|
12
|
-
HANDLERS = [
|
|
13
|
-
["CompressedFiles::Compressible", Proc.new { |r| r.is_local? && r.is_file? && r.path != /\.(bz2|gz|tgz|tbz2)$/i } ],
|
|
14
|
-
["CompressedFiles::Gz", Proc.new { |r| r.is_local? && r.path =~ /\.gz$/i && r.path !~ /\.tar\.gz$/i && r.path !~ /\.tgz$/i } ],
|
|
15
|
-
["CompressedFiles::Bz2", Proc.new { |r| r.is_local? && r.path =~ /\.bz2$/i && r.path !~ /\.tar\.bz2$/i && r.path !~ /\.tbz2$/i } ]
|
|
16
|
-
]
|
|
17
|
-
|
|
18
|
-
# Defines methods for decompressing a compressed file. This
|
|
19
|
-
# module isn't used to directly extend an IMW::Resource --
|
|
20
|
-
# instead, format specific modules (e.g. -
|
|
21
|
-
# IMW::CompressedFiles::Bz2) include this module and
|
|
22
|
-
# further define the command-line flags &c. needed to make
|
|
23
|
-
# everything work.
|
|
24
|
-
module Base
|
|
25
|
-
|
|
26
|
-
attr_accessor :compression_settings
|
|
27
|
-
|
|
28
|
-
# Is this file compressed?
|
|
29
|
-
#
|
|
30
|
-
# @return [true, false]
|
|
31
|
-
def is_compressed?
|
|
32
|
-
true
|
|
33
|
-
end
|
|
34
|
-
|
|
35
|
-
# Can this file be compressed?
|
|
36
|
-
#
|
|
37
|
-
# @return [true, false]
|
|
38
|
-
def is_compressible?
|
|
39
|
-
false
|
|
40
|
-
end
|
|
41
|
-
|
|
42
|
-
# The basename of this resource after it is decompressed
|
|
43
|
-
#
|
|
44
|
-
# IMW::Resource.new('/path/to/my_file.txt.bz2').decompressed_basename
|
|
45
|
-
# => 'my_file.txt'
|
|
46
|
-
#
|
|
47
|
-
# @return [String] the decompressed basename
|
|
48
|
-
def decompressed_basename
|
|
49
|
-
basename[0..-(extname.size + 1)]
|
|
50
|
-
end
|
|
51
|
-
|
|
52
|
-
# The path of this resource after it is decompressed
|
|
53
|
-
#
|
|
54
|
-
# IMW::Resource.new('/path/to/my_file.txt.bz2').decompressed_path
|
|
55
|
-
# => '/path/to/my_file.txt'
|
|
56
|
-
#
|
|
57
|
-
# @return [String] the decompressed path
|
|
58
|
-
def decompressed_path
|
|
59
|
-
File.join(dirname, decompressed_basename)
|
|
60
|
-
end
|
|
61
|
-
|
|
62
|
-
# Decompress this file in its present directory overwriting any
|
|
63
|
-
# existing files and without saving the original compressed
|
|
64
|
-
# file.
|
|
65
|
-
#
|
|
66
|
-
# @return [IMW::Resource] the decompressed resource
|
|
67
|
-
def decompress!
|
|
68
|
-
should_exist!("Cannot decompress.")
|
|
69
|
-
program = compression_settings[:decompression_program] || compression_settings[:program]
|
|
70
|
-
FileUtils.cd(dirname) { IMW.system(program, compression_settings[:decompress], path) }
|
|
71
|
-
IMW.open(decompressed_path)
|
|
72
|
-
end
|
|
73
|
-
|
|
74
|
-
# Decompress this file in its present directory, overwriting any
|
|
75
|
-
# existing files while keeping the original compressed file.
|
|
76
|
-
#
|
|
77
|
-
# FIXME The implementation is a little stupid as the file is
|
|
78
|
-
# needlessly copied.
|
|
79
|
-
#
|
|
80
|
-
# @return [IMW::Resource] the decompressed resource
|
|
81
|
-
def decompress
|
|
82
|
-
should_exist!("Cannot decompress.")
|
|
83
|
-
begin
|
|
84
|
-
copy = cp(path + '.imw_copy')
|
|
85
|
-
regular_file = decompress!
|
|
86
|
-
copy.mv(path)
|
|
87
|
-
regular_file
|
|
88
|
-
ensure
|
|
89
|
-
copy.mv(path) if copy && copy.exist?
|
|
90
|
-
end
|
|
91
|
-
end
|
|
92
|
-
end
|
|
93
|
-
end
|
|
94
|
-
end
|
|
@@ -1,75 +0,0 @@
|
|
|
1
|
-
module IMW
|
|
2
|
-
|
|
3
|
-
# Default settings used when compressing files. <tt>:program</tt>
|
|
4
|
-
# defines the name of the command-line program to use,
|
|
5
|
-
# <tt>:compress</tt> gives the flags to use when compressing, and
|
|
6
|
-
# <tt>:extension</tt> gives the extension (_without_ the `.') added
|
|
7
|
-
# by the program after compressing.
|
|
8
|
-
COMPRESSION_SETTINGS = {
|
|
9
|
-
:program => 'bzip2',
|
|
10
|
-
:compress => '',
|
|
11
|
-
:extension => 'bz2'
|
|
12
|
-
} unless defined?(COMPRESSION_SETTINGS)
|
|
13
|
-
|
|
14
|
-
module CompressedFiles
|
|
15
|
-
|
|
16
|
-
# Defines methods for compressing a file. The default compression
|
|
17
|
-
# program is defined in IMW::COMPRESSION_SETTINGS though a
|
|
18
|
-
# particular resource can change the values in its
|
|
19
|
-
# +compression_settings+ hash.
|
|
20
|
-
module Compressible
|
|
21
|
-
|
|
22
|
-
# Compression settings.
|
|
23
|
-
attr_accessor :compression_settings
|
|
24
|
-
|
|
25
|
-
# Is this file compressible?
|
|
26
|
-
#
|
|
27
|
-
# @return [true]
|
|
28
|
-
def is_compressible?
|
|
29
|
-
true
|
|
30
|
-
end
|
|
31
|
-
|
|
32
|
-
# Defines the compression settings used for this
|
|
33
|
-
# resource. <tt>:program</tt> defines the name of the
|
|
34
|
-
# command-line program to use, <tt>:compress</tt> gives the
|
|
35
|
-
# flags to use when compressing, and <tt>:extension</tt> gives
|
|
36
|
-
# the extension (_without_ the `.') added by the program after
|
|
37
|
-
# compressing.
|
|
38
|
-
#
|
|
39
|
-
# @return [Hash]
|
|
40
|
-
def compression_settings
|
|
41
|
-
@compression_settings ||= COMPRESSION_SETTINGS
|
|
42
|
-
end
|
|
43
|
-
|
|
44
|
-
# Compress this resource in place, overwriting it.
|
|
45
|
-
#
|
|
46
|
-
# This resource's +compression_settings+ method is used to
|
|
47
|
-
# determine the method of compression.
|
|
48
|
-
#
|
|
49
|
-
# @return [IMW::Resource] the compressed file
|
|
50
|
-
def compress!
|
|
51
|
-
should_exist!("Cannot compress.")
|
|
52
|
-
IMW.system(*[compression_settings[:program], compression_settings[:compress], path])
|
|
53
|
-
IMW.open(File.join(dirname,basename + "." + compression_settings[:extension]))
|
|
54
|
-
end
|
|
55
|
-
|
|
56
|
-
# Compress this resource without overwriting it.
|
|
57
|
-
#
|
|
58
|
-
# FIXME The implementation is a little stupid as the file is
|
|
59
|
-
# needlessly copied.
|
|
60
|
-
#
|
|
61
|
-
# @return [IMW::Resource] the compressed file
|
|
62
|
-
def compress options={}
|
|
63
|
-
should_exist!("Cannot compress.")
|
|
64
|
-
begin
|
|
65
|
-
copy = cp(path + '.imw_copy')
|
|
66
|
-
compressed_file = compress!
|
|
67
|
-
copy.mv(path)
|
|
68
|
-
compressed_file
|
|
69
|
-
ensure
|
|
70
|
-
copy.mv(path) if copy.exist?
|
|
71
|
-
end
|
|
72
|
-
end
|
|
73
|
-
end
|
|
74
|
-
end
|
|
75
|
-
end
|
data/lib/imw/dataset.rb
DELETED
|
@@ -1,125 +0,0 @@
|
|
|
1
|
-
require 'imw/dataset/workflow'
|
|
2
|
-
require 'imw/dataset/paths'
|
|
3
|
-
|
|
4
|
-
module IMW
|
|
5
|
-
|
|
6
|
-
# The IMW::Dataset represents a common object in which paths, data
|
|
7
|
-
# resources, and various tasks can be intermingled to define a
|
|
8
|
-
# complex transformation of data.
|
|
9
|
-
#
|
|
10
|
-
# == Organizing Paths
|
|
11
|
-
#
|
|
12
|
-
# IMW encourages you to work within the following directory
|
|
13
|
-
# structure for a dataset +my_dataset+:
|
|
14
|
-
#
|
|
15
|
-
# my_dataset/
|
|
16
|
-
# |-- my_dataset.rb
|
|
17
|
-
# |-- ripd
|
|
18
|
-
# | `-- ...
|
|
19
|
-
# |-- rawd
|
|
20
|
-
# | `-- ...
|
|
21
|
-
# |-- fixd
|
|
22
|
-
# | `-- ...
|
|
23
|
-
# `-- pkgd
|
|
24
|
-
# `-- ...
|
|
25
|
-
#
|
|
26
|
-
# Just like IMW itself, a dataset can manage a collection of paths.
|
|
27
|
-
# If <tt>my_dataset.rb</tt> defines a dataset:
|
|
28
|
-
#
|
|
29
|
-
# # my_dataset/my_dataset.rb
|
|
30
|
-
# dataset = IMW::Dataset.new(:my_dataset)
|
|
31
|
-
#
|
|
32
|
-
# then the following paths will be defined:
|
|
33
|
-
#
|
|
34
|
-
# dataset.path_to(:root) #=> my_dataset
|
|
35
|
-
# dataset.path_to(:script) #=> my_dataset/my_dataset.rb
|
|
36
|
-
# dataset.path_to(:ripd) #=> my_dataset/ripd
|
|
37
|
-
# dataset.path_to(:rawd) #=> my_dataset/rawd
|
|
38
|
-
# dataset.path_to(:fixd) #=> my_dataset/fixd
|
|
39
|
-
# dataset.path_to(:pkgd) #=> my_dataset/pkgd
|
|
40
|
-
#
|
|
41
|
-
# Just like IMW itself, the +dataset+ supports adding path
|
|
42
|
-
# references
|
|
43
|
-
#
|
|
44
|
-
# dataset.add_path(:raw_data, :ripd, 'raw_data.xml')
|
|
45
|
-
# dataset.path_to(:raw_data) #=> my_dataset/ripd/raw_data.xml
|
|
46
|
-
#
|
|
47
|
-
# as well as removing them (via <tt>dataset.remove_path</tt>).
|
|
48
|
-
#
|
|
49
|
-
# A subclass of IMW::Dataset can customize these paths by overriding
|
|
50
|
-
# IMW::Dataset#set_default_paths as well as define new ones by
|
|
51
|
-
# overriding IMW::Dataset#set_paths.
|
|
52
|
-
#
|
|
53
|
-
# Setting paths can be skipped altogether by passing the
|
|
54
|
-
# <tt>:skip_paths</tt> option when instantiating a dataset:
|
|
55
|
-
#
|
|
56
|
-
# dataset = IMW::Dataset.new :my_dataset, :skip_paths => true
|
|
57
|
-
#
|
|
58
|
-
# == Utilizing Tasks
|
|
59
|
-
#
|
|
60
|
-
# An IMW::Dataset utilizes Rake to manage tasks needed to transform
|
|
61
|
-
# data. See IMW::Workflow for a description of the pre-defined
|
|
62
|
-
# tasks (+rip+, +parse+, +fix+, +package+).
|
|
63
|
-
#
|
|
64
|
-
# New tasks can be defined
|
|
65
|
-
#
|
|
66
|
-
# dataset.task :get_authorization do
|
|
67
|
-
# # ... get an authorization token
|
|
68
|
-
# end
|
|
69
|
-
#
|
|
70
|
-
# and hooked into the default tasks in the usual Rake manner
|
|
71
|
-
#
|
|
72
|
-
# dataset.task :rip => [:get_authorization]
|
|
73
|
-
#
|
|
74
|
-
# A dataset also has methods for the workflow step tasks to make
|
|
75
|
-
# this easier
|
|
76
|
-
#
|
|
77
|
-
# dataset.rip [:get_authorization]
|
|
78
|
-
#
|
|
79
|
-
# Tasks for a dataset can be accessed and invoked as follows
|
|
80
|
-
#
|
|
81
|
-
# dataset[:rip].invoke
|
|
82
|
-
#
|
|
83
|
-
# as well as by using the command line +imw+ tool.
|
|
84
|
-
#
|
|
85
|
-
# Defining tasks can be skipped altogether by passing the
|
|
86
|
-
# <tt>:skip_workflow</tt> option when instantiating a dataset
|
|
87
|
-
#
|
|
88
|
-
# dataset = IMW::Dataset.new :my_dataset, :skip_workflow => true
|
|
89
|
-
#
|
|
90
|
-
# == Working with Repositories
|
|
91
|
-
#
|
|
92
|
-
# A dataset can be added to a repository by passing the
|
|
93
|
-
# <tt>:repository</tt> option
|
|
94
|
-
#
|
|
95
|
-
# repo = IMW::Repository.new
|
|
96
|
-
# dataset = IMW::Dataset.new :my_dataset, :repository => repo
|
|
97
|
-
class Dataset
|
|
98
|
-
|
|
99
|
-
# The handle this dataset goes by. Used for identifying it within
|
|
100
|
-
# a repository.
|
|
101
|
-
attr_accessor :handle
|
|
102
|
-
|
|
103
|
-
# Options for this dataset.
|
|
104
|
-
attr_accessor :options
|
|
105
|
-
|
|
106
|
-
def initialize handle, options = {}
|
|
107
|
-
@options = options
|
|
108
|
-
@handle = handle
|
|
109
|
-
set_default_paths unless options[:skip_paths]
|
|
110
|
-
set_paths unless options[:skip_paths]
|
|
111
|
-
initialize_workflow unless options[:skip_workflow]
|
|
112
|
-
if options[:repository]
|
|
113
|
-
options[:repository][handle] = self
|
|
114
|
-
end
|
|
115
|
-
end
|
|
116
|
-
|
|
117
|
-
# Provides this dataset with a workflow of tasks managed by Rake.
|
|
118
|
-
include IMW::Workflow
|
|
119
|
-
|
|
120
|
-
# Provides this dataset with DSL like methods to construct a
|
|
121
|
-
# schema in an IMW file.
|
|
122
|
-
# include IMW::Metadata::DSL
|
|
123
|
-
|
|
124
|
-
end
|
|
125
|
-
end
|
data/lib/imw/dataset/paths.rb
DELETED
|
@@ -1,29 +0,0 @@
|
|
|
1
|
-
module IMW
|
|
2
|
-
class Dataset
|
|
3
|
-
include IMW::Paths
|
|
4
|
-
|
|
5
|
-
protected
|
|
6
|
-
# Sets paths to the workflow directories for this dataset (+ripd+,
|
|
7
|
-
# +rawd+, +fixd+, +pkgd+) as well as the following paths:
|
|
8
|
-
#
|
|
9
|
-
# script::
|
|
10
|
-
# The path to the file the dataset was initialized in.
|
|
11
|
-
#
|
|
12
|
-
# root::
|
|
13
|
-
# The parent directory of the file the dataset was initialized
|
|
14
|
-
# in or the value of the <tt>:root</tt> key in
|
|
15
|
-
# IMW::Dataset#options
|
|
16
|
-
#
|
|
17
|
-
def set_default_paths
|
|
18
|
-
add_path :script, File.expand_path(eval('__FILE__'))
|
|
19
|
-
add_path :root, options[:root] || File.dirname(path_to(:script))
|
|
20
|
-
workflow_dirs.each do |dir|
|
|
21
|
-
add_path dir, :root, dir.to_s
|
|
22
|
-
end
|
|
23
|
-
end
|
|
24
|
-
|
|
25
|
-
# Override this method to set additional paths for the dataset.
|
|
26
|
-
def set_paths
|
|
27
|
-
end
|
|
28
|
-
end
|
|
29
|
-
end
|
data/lib/imw/dataset/workflow.rb
DELETED
|
@@ -1,195 +0,0 @@
|
|
|
1
|
-
require 'ostruct'
|
|
2
|
-
require 'rake'
|
|
3
|
-
|
|
4
|
-
module IMW
|
|
5
|
-
|
|
6
|
-
# An IMW version of Rake::Task
|
|
7
|
-
Task = Class.new(Rake::Task)
|
|
8
|
-
|
|
9
|
-
# An IMW subclass of Rake::FileTask
|
|
10
|
-
FileTask = Class.new(Rake::FileTask)
|
|
11
|
-
|
|
12
|
-
# An IMW subclass of Rake::FileCreationTask
|
|
13
|
-
FileCreationTask = Class.new(Rake::FileCreationTask)
|
|
14
|
-
|
|
15
|
-
# IMW encourages you to view a data transformation as a series of
|
|
16
|
-
# interdependent steps.
|
|
17
|
-
#
|
|
18
|
-
# By default, IMW defines four main steps in such a transformation:
|
|
19
|
-
# +rip+, +parse+, +fix+, and +package+.
|
|
20
|
-
#
|
|
21
|
-
# Each step is associated with a directory on disk in which it keeps
|
|
22
|
-
# its files: +ripd+, +rawd+, +fixd+, and +pkgd+.
|
|
23
|
-
#
|
|
24
|
-
# The steps are:
|
|
25
|
-
#
|
|
26
|
-
# rip::
|
|
27
|
-
# Obtain data via HTTP, FTP, SCP, RSYNC, database query, &c and
|
|
28
|
-
# store the results in +ripd+.
|
|
29
|
-
#
|
|
30
|
-
# parse::
|
|
31
|
-
# Parse data into a structured form using a library (JSON, YAML,
|
|
32
|
-
# &c.) or using your own parser (XML, flat files, &c.) and store
|
|
33
|
-
# the results in +rawd+.
|
|
34
|
-
#
|
|
35
|
-
# fix::
|
|
36
|
-
# Combine, filter, reconcile, and transform already structured
|
|
37
|
-
# data into a desired form and store the results in +fixd+.
|
|
38
|
-
#
|
|
39
|
-
# package::
|
|
40
|
-
# Archive, compress, and deliver data in its final form to some
|
|
41
|
-
# location (HTTP, FTP, SCP, RSYNC, S3, EBS, &c.), optionally
|
|
42
|
-
# storing the output in +pkgd+.
|
|
43
|
-
#
|
|
44
|
-
# Each step depends upon the one before it. The steps are blank by
|
|
45
|
-
# default so there's no need to write code for steps you don't need
|
|
46
|
-
# to use. You can also define your own steps (using +task+ just
|
|
47
|
-
# like in Rake) and hook them into these pre-defined steps (or
|
|
48
|
-
# not...).
|
|
49
|
-
#
|
|
50
|
-
# A dataset also has an <tt>:initialize</tt> task (which by default
|
|
51
|
-
# just creates the directories for these steps) which you can use to
|
|
52
|
-
# hook in your own initialization tasks by making it depend on them.
|
|
53
|
-
#
|
|
54
|
-
# A subclass of IMW::Dataset can customize how tasks are defined by
|
|
55
|
-
# overriding +define_workflow_tasks+, among other methods, and
|
|
56
|
-
# introduce new tasks by overriding +define_tasks+.
|
|
57
|
-
module Workflow
|
|
58
|
-
|
|
59
|
-
include Rake::TaskManager
|
|
60
|
-
# Default options passed to <tt>Rake</tt>. Any class including
|
|
61
|
-
# the <tt>Rake::TaskManager</tt> module must define a constant by
|
|
62
|
-
# this name.
|
|
63
|
-
DEFAULT_OPTIONS = {
|
|
64
|
-
:dry_run => false,
|
|
65
|
-
:trace => false,
|
|
66
|
-
:verbose => false
|
|
67
|
-
}
|
|
68
|
-
|
|
69
|
-
# Return a new (or existing) <tt>IMW::Task</tt> with the given
|
|
70
|
-
# +name+. Dependencies can be declared and a block passed in just
|
|
71
|
-
# as in Rake.
|
|
72
|
-
#
|
|
73
|
-
# @param [Hash, Symbol, String] deps the name of the task (if a
|
|
74
|
-
# Symbol or String) or the name of the task mapped to an Array of
|
|
75
|
-
# dependencies (if a Hash)
|
|
76
|
-
#
|
|
77
|
-
# @return [IMW::Task] the task
|
|
78
|
-
def task deps, &block
|
|
79
|
-
self.define_task IMW::Task, deps, &block
|
|
80
|
-
end
|
|
81
|
-
|
|
82
|
-
# Return a new (or existing) <tt>IMW::FileTask</tt> with the given
|
|
83
|
-
# +path+. Dependencies can be declared and a block passed in just
|
|
84
|
-
# as in Rake.
|
|
85
|
-
#
|
|
86
|
-
# @param [String, IMW::Resource] path the path to the file
|
|
87
|
-
# @return [IMW::FileTask] the task
|
|
88
|
-
def file path, &block
|
|
89
|
-
path = path.respond_to?(:path) ? path.path : path
|
|
90
|
-
self.define_task IMW::FileTask, path, &block
|
|
91
|
-
end
|
|
92
|
-
|
|
93
|
-
# Return a new (or existing) <tt>IMW::FileCreationTask</tt> with the given
|
|
94
|
-
# +path+. Dependencies can be declared and a block passed in just
|
|
95
|
-
# as in Rake.
|
|
96
|
-
#
|
|
97
|
-
# @param [String, IMW::Resource] path the path to the file
|
|
98
|
-
# @return [IMW::FileCreationTask] the task
|
|
99
|
-
def file_create path, &block
|
|
100
|
-
path = path.respond_to?(:path) ? path.path : path
|
|
101
|
-
self.define_task IMW::FileCreationTask, path, &block
|
|
102
|
-
end
|
|
103
|
-
|
|
104
|
-
# Override this method to define default tasks for a subclass of
|
|
105
|
-
# IMW::Dataset.
|
|
106
|
-
def define_tasks
|
|
107
|
-
end
|
|
108
|
-
|
|
109
|
-
# The standard IMW workflow steps.
|
|
110
|
-
#
|
|
111
|
-
# @return [Array] the workflow step names
|
|
112
|
-
def workflow_steps
|
|
113
|
-
[:rip, :parse, :fix, :package]
|
|
114
|
-
end
|
|
115
|
-
|
|
116
|
-
# The steps of the IMW workflow each correspond to a directory in
|
|
117
|
-
# which it is customary that they deposit their files <em>once
|
|
118
|
-
# they are finished processing</em> (so ripped files wind up in
|
|
119
|
-
# the +ripd+ directory, packaged files in the +pkgd+ directory,
|
|
120
|
-
# and so on).
|
|
121
|
-
#
|
|
122
|
-
# @return [Array] the workflow directory names
|
|
123
|
-
def workflow_dirs
|
|
124
|
-
[:ripd, :rawd, :fixd, :pkgd]
|
|
125
|
-
end
|
|
126
|
-
|
|
127
|
-
protected
|
|
128
|
-
|
|
129
|
-
# Convenience method for defining tasks for this workflow.
|
|
130
|
-
#
|
|
131
|
-
# @param [Hash, Symbol, String] deps the name of the task (if a
|
|
132
|
-
# Symbol or String) or the name of the task mapped to an Array of
|
|
133
|
-
# dependencies (if a Hash)
|
|
134
|
-
# @param [String] comment the comment to associate to the task
|
|
135
|
-
# @return [IMW::Task] the task
|
|
136
|
-
def define_workflow_task deps, comment, &block
|
|
137
|
-
@last_description = comment
|
|
138
|
-
define_task(IMW::Task, deps, &block)
|
|
139
|
-
end
|
|
140
|
-
|
|
141
|
-
# Create all the instance variables required by Rake::TaskManager
|
|
142
|
-
# and define default tasks for this dataset.
|
|
143
|
-
def initialize_workflow
|
|
144
|
-
@tasks = Hash.new
|
|
145
|
-
@rules = Array.new
|
|
146
|
-
@scope = Array.new
|
|
147
|
-
@last_description = nil
|
|
148
|
-
@options = OpenStruct.new(DEFAULT_OPTIONS)
|
|
149
|
-
define_initialize_task
|
|
150
|
-
define_workflow_tasks
|
|
151
|
-
define_clean_task
|
|
152
|
-
define_tasks
|
|
153
|
-
end
|
|
154
|
-
|
|
155
|
-
# Defines the <tt>:initialize</tt> task. The only other task
|
|
156
|
-
# hooked into <tt>:initialize</tt> is the
|
|
157
|
-
# <tt>:create_directories</tt> task which creates the workflow
|
|
158
|
-
# directories for this dataset.
|
|
159
|
-
def define_initialize_task
|
|
160
|
-
define_workflow_task({:create_directories => []}, "Creates workflow directories for this dataset.") do
|
|
161
|
-
workflow_dirs.each do |dir|
|
|
162
|
-
FileUtils.mkdir_p(path_to(dir)) unless File.exist?(path_to(dir))
|
|
163
|
-
end
|
|
164
|
-
end
|
|
165
|
-
define_workflow_task({ :initialize => [:create_directories] }, "Initialize this dataset.")
|
|
166
|
-
end
|
|
167
|
-
|
|
168
|
-
# Creates a task <tt>:clean</tt> which removes dataset's
|
|
169
|
-
# workflow directories.
|
|
170
|
-
def define_clean_task
|
|
171
|
-
define_workflow_task :clean, "Remove the workflow directories for this dataset." do
|
|
172
|
-
workflow_dirs.each do |dir|
|
|
173
|
-
FileUtils.rm_rf(path_to(dir)) if File.exist?(path_to(dir))
|
|
174
|
-
end
|
|
175
|
-
end
|
|
176
|
-
end
|
|
177
|
-
|
|
178
|
-
# Creates the task dependency chain <tt>:package => :fix =>
|
|
179
|
-
# :parse => :rip => :create_directories</tt> of the
|
|
180
|
-
# IMW::Workflow.
|
|
181
|
-
def define_workflow_tasks
|
|
182
|
-
define_workflow_task({:rip => [:create_directories]}, "Obtain data from some source." )
|
|
183
|
-
define_workflow_task({:parse => [:rip]}, "Parse data into a structured form." )
|
|
184
|
-
define_workflow_task({:fix => [:parse]}, "Munge parsed data into desired form." )
|
|
185
|
-
define_workflow_task({:package => [:fix]}, "Package dataset in final form." )
|
|
186
|
-
end
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
def rip(deps=nil, &block); self[:rip].enhance(deps, &block); end
|
|
190
|
-
def parse(deps=nil, &block); self[:parse].enhance(deps, &block); end
|
|
191
|
-
def fix(deps=nil, &block); self[:fix].enhance(deps, &block); end
|
|
192
|
-
def package(deps=nil, &block); self[:package].enhance(deps, &block); end
|
|
193
|
-
|
|
194
|
-
end
|
|
195
|
-
end
|