imw 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +4 -1
- data/Rakefile +10 -0
- data/TODO +18 -0
- data/VERSION +1 -1
- data/bin/imw +1 -1
- data/etc/imwrc.rb +0 -50
- data/examples/dataset.rb +12 -0
- data/lib/imw/boot.rb +55 -9
- data/lib/imw/dataset/paths.rb +15 -24
- data/lib/imw/dataset/workflow.rb +131 -72
- data/lib/imw/dataset.rb +94 -186
- data/lib/imw/parsers/html_parser.rb +1 -1
- data/lib/imw/parsers.rb +1 -1
- data/lib/imw/repository.rb +3 -27
- data/lib/imw/resource.rb +190 -0
- data/lib/imw/resources/archive.rb +97 -0
- data/lib/imw/resources/archives_and_compressed/bz2.rb +18 -0
- data/lib/imw/resources/archives_and_compressed/gz.rb +18 -0
- data/lib/imw/resources/archives_and_compressed/rar.rb +23 -0
- data/lib/imw/resources/archives_and_compressed/tar.rb +23 -0
- data/lib/imw/resources/archives_and_compressed/tarbz2.rb +78 -0
- data/lib/imw/resources/archives_and_compressed/targz.rb +78 -0
- data/lib/imw/resources/archives_and_compressed/zip.rb +57 -0
- data/lib/imw/resources/archives_and_compressed.rb +32 -0
- data/lib/imw/resources/compressed_file.rb +89 -0
- data/lib/imw/resources/compressible.rb +77 -0
- data/lib/imw/resources/formats/delimited.rb +92 -0
- data/lib/imw/resources/formats/excel.rb +125 -0
- data/lib/imw/resources/formats/json.rb +53 -0
- data/lib/imw/resources/formats/sgml.rb +72 -0
- data/lib/imw/resources/formats/yaml.rb +53 -0
- data/lib/imw/resources/formats.rb +32 -0
- data/lib/imw/resources/local.rb +198 -0
- data/lib/imw/resources/remote.rb +110 -0
- data/lib/imw/resources/schemes/hdfs.rb +242 -0
- data/lib/imw/resources/schemes/http.rb +161 -0
- data/lib/imw/resources/schemes/s3.rb +137 -0
- data/lib/imw/resources/schemes.rb +19 -0
- data/lib/imw/resources.rb +118 -0
- data/lib/imw/runner.rb +5 -4
- data/lib/imw/transforms/archiver.rb +215 -0
- data/lib/imw/transforms/transferer.rb +103 -0
- data/lib/imw/transforms.rb +8 -0
- data/lib/imw/utils/error.rb +26 -30
- data/lib/imw/utils/extensions/array.rb +5 -15
- data/lib/imw/utils/extensions/hash.rb +6 -16
- data/lib/imw/utils/extensions/hpricot.rb +0 -14
- data/lib/imw/utils/extensions/string.rb +5 -15
- data/lib/imw/utils/extensions/symbol.rb +0 -13
- data/lib/imw/utils/extensions.rb +65 -0
- data/lib/imw/utils/log.rb +14 -13
- data/lib/imw/utils/misc.rb +0 -6
- data/lib/imw/utils/paths.rb +101 -42
- data/lib/imw/utils/version.rb +8 -9
- data/lib/imw/utils.rb +2 -18
- data/lib/imw.rb +92 -17
- data/spec/data/sample.csv +1 -1
- data/spec/data/sample.json +1 -0
- data/spec/data/sample.tsv +1 -1
- data/spec/data/sample.txt +1 -1
- data/spec/data/sample.xml +1 -1
- data/spec/data/sample.yaml +1 -1
- data/spec/imw/dataset/paths_spec.rb +32 -0
- data/spec/imw/dataset/workflow_spec.rb +41 -0
- data/spec/imw/resource_spec.rb +79 -0
- data/spec/imw/resources/archive_spec.rb +69 -0
- data/spec/imw/resources/archives_and_compressed/bz2_spec.rb +15 -0
- data/spec/imw/resources/archives_and_compressed/gz_spec.rb +15 -0
- data/spec/imw/resources/archives_and_compressed/rar_spec.rb +16 -0
- data/spec/imw/resources/archives_and_compressed/tar_spec.rb +16 -0
- data/spec/imw/resources/archives_and_compressed/tarbz2_spec.rb +24 -0
- data/spec/imw/resources/archives_and_compressed/targz_spec.rb +21 -0
- data/spec/imw/resources/archives_and_compressed/zip_spec.rb +16 -0
- data/spec/imw/resources/compressed_file_spec.rb +48 -0
- data/spec/imw/resources/compressible_spec.rb +36 -0
- data/spec/imw/resources/formats/delimited_spec.rb +33 -0
- data/spec/imw/resources/formats/json_spec.rb +32 -0
- data/spec/imw/resources/formats/sgml_spec.rb +24 -0
- data/spec/imw/resources/formats/yaml_spec.rb +41 -0
- data/spec/imw/resources/local_spec.rb +98 -0
- data/spec/imw/resources/remote_spec.rb +35 -0
- data/spec/imw/resources/schemes/hdfs_spec.rb +61 -0
- data/spec/imw/resources/schemes/http_spec.rb +19 -0
- data/spec/imw/resources/schemes/s3_spec.rb +19 -0
- data/spec/imw/transforms/archiver_spec.rb +120 -0
- data/spec/imw/transforms/transferer_spec.rb +113 -0
- data/spec/imw/utils/paths_spec.rb +5 -33
- data/spec/imw/utils/shared_paths_spec.rb +29 -0
- data/spec/spec_helper.rb +5 -5
- data/spec/support/paths_matcher.rb +67 -0
- data/spec/support/random.rb +39 -36
- metadata +88 -75
- data/lib/imw/dataset/task.rb +0 -41
- data/lib/imw/files/archive.rb +0 -113
- data/lib/imw/files/basicfile.rb +0 -122
- data/lib/imw/files/binary.rb +0 -28
- data/lib/imw/files/compressed_file.rb +0 -93
- data/lib/imw/files/compressed_files_and_archives.rb +0 -334
- data/lib/imw/files/compressible.rb +0 -103
- data/lib/imw/files/csv.rb +0 -113
- data/lib/imw/files/directory.rb +0 -62
- data/lib/imw/files/excel.rb +0 -84
- data/lib/imw/files/json.rb +0 -41
- data/lib/imw/files/sgml.rb +0 -46
- data/lib/imw/files/text.rb +0 -68
- data/lib/imw/files/yaml.rb +0 -46
- data/lib/imw/files.rb +0 -125
- data/lib/imw/packagers/archiver.rb +0 -126
- data/lib/imw/packagers/s3_mover.rb +0 -36
- data/lib/imw/packagers.rb +0 -8
- data/lib/imw/utils/components.rb +0 -61
- data/lib/imw/utils/config.rb +0 -46
- data/lib/imw/utils/extensions/class/attribute_accessors.rb +0 -8
- data/lib/imw/utils/extensions/core.rb +0 -27
- data/lib/imw/utils/extensions/dir.rb +0 -24
- data/lib/imw/utils/extensions/file_core.rb +0 -64
- data/lib/imw/utils/extensions/typed_struct.rb +0 -22
- data/lib/imw/utils/extensions/uri.rb +0 -59
- data/lib/imw/utils/view/dump_csv.rb +0 -112
- data/lib/imw/utils/view/dump_csv_older.rb +0 -117
- data/lib/imw/utils/view.rb +0 -113
- data/spec/imw/dataset/datamapper/uri_spec.rb +0 -43
- data/spec/imw/dataset/datamapper_spec_helper.rb +0 -11
- data/spec/imw/files/archive_spec.rb +0 -118
- data/spec/imw/files/basicfile_spec.rb +0 -121
- data/spec/imw/files/bz2_spec.rb +0 -32
- data/spec/imw/files/compressed_file_spec.rb +0 -96
- data/spec/imw/files/compressible_spec.rb +0 -100
- data/spec/imw/files/file_spec.rb +0 -144
- data/spec/imw/files/gz_spec.rb +0 -32
- data/spec/imw/files/rar_spec.rb +0 -33
- data/spec/imw/files/tar_spec.rb +0 -31
- data/spec/imw/files/text_spec.rb +0 -23
- data/spec/imw/files/zip_spec.rb +0 -31
- data/spec/imw/files_spec.rb +0 -38
- data/spec/imw/packagers/archiver_spec.rb +0 -125
- data/spec/imw/packagers/s3_mover_spec.rb +0 -7
- data/spec/imw/utils/extensions/file_core_spec.rb +0 -72
- data/spec/imw/utils/extensions/find_spec.rb +0 -113
- data/spec/imw/workflow/rip/local_spec.rb +0 -89
- data/spec/imw/workflow/rip_spec.rb +0 -27
- data/spec/support/archive_contents_matcher.rb +0 -94
- data/spec/support/directory_contents_matcher.rb +0 -61
data/lib/imw/dataset.rb
CHANGED
|
@@ -1,206 +1,114 @@
|
|
|
1
|
-
require 'imw/utils'
|
|
2
1
|
require 'imw/dataset/workflow'
|
|
3
2
|
require 'imw/dataset/paths'
|
|
4
3
|
|
|
5
4
|
module IMW
|
|
6
5
|
|
|
7
|
-
# The IMW::Dataset
|
|
8
|
-
#
|
|
9
|
-
#
|
|
10
|
-
#
|
|
11
|
-
#
|
|
12
|
-
#
|
|
13
|
-
#
|
|
14
|
-
#
|
|
15
|
-
#
|
|
16
|
-
#
|
|
17
|
-
#
|
|
18
|
-
#
|
|
19
|
-
#
|
|
20
|
-
#
|
|
21
|
-
#
|
|
22
|
-
#
|
|
23
|
-
#
|
|
24
|
-
#
|
|
25
|
-
#
|
|
26
|
-
#
|
|
27
|
-
#
|
|
28
|
-
#
|
|
29
|
-
#
|
|
30
|
-
#
|
|
31
|
-
#
|
|
32
|
-
#
|
|
33
|
-
#
|
|
34
|
-
#
|
|
35
|
-
#
|
|
36
|
-
#
|
|
37
|
-
#
|
|
38
|
-
#
|
|
39
|
-
#
|
|
40
|
-
#
|
|
41
|
-
#
|
|
42
|
-
#
|
|
43
|
-
#
|
|
44
|
-
#
|
|
45
|
-
#
|
|
46
|
-
#
|
|
47
|
-
#
|
|
48
|
-
#
|
|
49
|
-
#
|
|
50
|
-
#
|
|
51
|
-
#
|
|
52
|
-
#
|
|
53
|
-
#
|
|
54
|
-
#
|
|
55
|
-
#
|
|
56
|
-
#
|
|
57
|
-
#
|
|
58
|
-
#
|
|
59
|
-
#
|
|
60
|
-
#
|
|
61
|
-
#
|
|
62
|
-
#
|
|
63
|
-
#
|
|
64
|
-
#
|
|
65
|
-
#
|
|
66
|
-
#
|
|
67
|
-
#
|
|
68
|
-
#
|
|
69
|
-
#
|
|
70
|
-
# Where <tt>[ripd]</tt> would be replaced by the IMW
|
|
71
|
-
# <tt>:ripd</tt> directory. The default <tt>:rip</tt> task is
|
|
72
|
-
# empty so If there's no need to rip data (perhaps it's already on
|
|
73
|
-
# disk?) then nothing needs to be done here.
|
|
74
|
-
#
|
|
75
|
-
# raw::
|
|
76
|
-
# Managed by the <tt>:raw</tt> task, data is uncompressed and
|
|
77
|
-
# extracted (if necessary) and stored in a subdirectory of the
|
|
78
|
-
# <tt>:data</tt> directory named by the taxon and handle of this
|
|
79
|
-
# dataset.
|
|
80
|
-
#
|
|
81
|
-
# dataset.task :raw do
|
|
82
|
-
# IMW::Raw.uncompress_and_extract File.join(dataset.path_to(:ripd),'http/_edu/chimpu_econ/datasets'),
|
|
83
|
-
# Dir[File.join(dataset.path_to(:ripd),'sql/_edu/chimpu_astro_public/**/*.tsv')].first
|
|
84
|
-
# #=> [data]/economics/alarming_trends/recent_history_of_banana_prices/rawd/001.xml
|
|
85
|
-
# [data]/economics/alarming_trends/recent_history_of_banana_prices/rawd/002.xml
|
|
86
|
-
# [data]/economics/alarming_trends/recent_history_of_banana_prices/rawd/003.xml
|
|
87
|
-
# ...
|
|
88
|
-
# [data]/economics/alarming_trends/recent_history_of_banana_prices/rawd/select_from_hurricane_frequency-2009-02-16--15:30:26.tsv
|
|
89
|
-
# end
|
|
90
|
-
#
|
|
91
|
-
# Where <tt>[data]</tt> would be replaced by the IMW
|
|
92
|
-
# <tt>:data</tt> directory.
|
|
93
|
-
#
|
|
94
|
-
# If this dataset didn't have a taxon
|
|
95
|
-
# (economics/alarming_trends) its files would be stored in a
|
|
96
|
-
# directory +recent_history_of_banana_prices+ just below the
|
|
97
|
-
# <tt>:data</tt> directory.
|
|
98
|
-
#
|
|
99
|
-
# fix::
|
|
100
|
-
# Managed by the <tt>:fix</tt> task, transformations on the data
|
|
101
|
-
# are performed. IMW's method is to read data from a source
|
|
102
|
-
# format (XML, YAML, CSV, &c.) into Ruby objects with hash
|
|
103
|
-
# semantics. These objects might be based upon structs,
|
|
104
|
-
# ActiveRecord, DataMapper::Resource, FasterCSV...anything which
|
|
105
|
-
# can be accessed as <tt>thing.property</tt> (FIXME 'and' or 'or'
|
|
106
|
-
# ) <tt>thing[:property]</tt>: the Infinite Monkeywrench fits
|
|
107
|
-
# neatly into your toobox.
|
|
108
|
-
#
|
|
109
|
-
#
|
|
110
|
-
# # Open an output file in XML for writing
|
|
111
|
-
# output = IMW.open! File.join(dataset.path_to(:fixd), 'date_bananas_hurricanes.csv')
|
|
112
|
-
# #=> FasterCSV at [fixd]/economics/alarming_trends/recent_history_of_banana_prices/fixd/data_bananas_hurricanes.csv
|
|
113
|
-
#
|
|
114
|
-
# # A place to store the combined data
|
|
115
|
-
# correlations = []
|
|
116
|
-
#
|
|
117
|
-
# dataset.task :fix do
|
|
118
|
-
#
|
|
119
|
-
# # Return the contents of the weather data which has rows like
|
|
120
|
-
# #
|
|
121
|
-
# # 1 2008-09-01 4
|
|
122
|
-
# # 2 2008-09-08 3
|
|
123
|
-
# # 3 2008-08-15 3
|
|
124
|
-
# # ...
|
|
125
|
-
# #
|
|
126
|
-
# weather_data = IMW.open(Dir[File.join(dataset.path_to(:rawd), '*.tsv')].first,
|
|
127
|
-
# :headers => ["ID","DATE","NUM_HURRICANES"]).entries
|
|
128
|
-
# #=> [#<FasterCSV::Row "ID":nil "DATE":Mon Sep 08 04:15:47 -0600 2008,"NUM_HURRICANES":4>, ... ]
|
|
129
|
-
#
|
|
130
|
-
#
|
|
131
|
-
# # Return the matching data from the produce prices XML file which looks like
|
|
132
|
-
# #
|
|
133
|
-
# # <prices>
|
|
134
|
-
# # <price type="apple">
|
|
135
|
-
# # <date>2008/09/01</date>
|
|
136
|
-
# # <amount>0.15</amount>
|
|
137
|
-
# # </price>
|
|
138
|
-
# # <price type="banana">
|
|
139
|
-
# # <date>2008/09/01</date>
|
|
140
|
-
# # <amount>0.20</amount>
|
|
141
|
-
# # </price>
|
|
142
|
-
# # ...
|
|
143
|
-
# # </prices>
|
|
144
|
-
# parser = IMW::XMLParser.new :records => [ 'prices/price[@type="banana"]',
|
|
145
|
-
# { :week => 'date',
|
|
146
|
-
# :price => 'amount' }]
|
|
147
|
-
#
|
|
148
|
-
# # Loop through the XML produce prices, mixing in the hurricane data,
|
|
149
|
-
# # and outputting new rows.
|
|
150
|
-
# Dir["#{dataset.path_to :rawd}*.xml"] each do |file|
|
|
151
|
-
# IMW.open file do |xml| #=> Hpricot::Doc
|
|
152
|
-
# parser.parse(xml).each do |record|
|
|
153
|
-
# num_hurricanes = weather_data.(lambda { nil }) {|id,week,num_hurricanes| week == record.week}
|
|
154
|
-
# output << [week,record[:price],num_hurricanes]
|
|
155
|
-
# end
|
|
156
|
-
# end
|
|
157
|
-
# end
|
|
158
|
-
# end
|
|
159
|
-
#
|
|
160
|
-
# package::
|
|
161
|
-
# Data is packaged and compressed (if necessary) into a delivery
|
|
162
|
-
# format and deposited into the <tt>:pkgd</tt> directory.
|
|
163
|
-
#
|
|
164
|
-
# dataset.task :pkg do
|
|
165
|
-
# IMW.open(File.join(dataset.path_to(:fixd), 'date_bananas_hurricanes.csv')).compress!
|
|
166
|
-
# #=> [data]/economics/alarming_trends/recent_history_of_banana_prices/pkgd/date_bananas_hurricanes.csv.bz2
|
|
6
|
+
# The IMW::Dataset represents a common object in which paths, data
|
|
7
|
+
# resources, and various tasks can be intermingled to define a
|
|
8
|
+
# complex transformation of data.
|
|
9
|
+
#
|
|
10
|
+
# == Organizing Paths
|
|
11
|
+
#
|
|
12
|
+
# IMW encourages you to work within the following directory
|
|
13
|
+
# structure for a dataset +my_dataset+:
|
|
14
|
+
#
|
|
15
|
+
# my_dataset/
|
|
16
|
+
# |-- my_dataset.rb
|
|
17
|
+
# |-- ripd
|
|
18
|
+
# | `-- ...
|
|
19
|
+
# |-- rawd
|
|
20
|
+
# | `-- ...
|
|
21
|
+
# |-- fixd
|
|
22
|
+
# | `-- ...
|
|
23
|
+
# `-- pkgd
|
|
24
|
+
# `-- ...
|
|
25
|
+
#
|
|
26
|
+
# Just like IMW itself, a dataset can manage a collection of paths.
|
|
27
|
+
# If <tt>my_dataset.rb</tt> defines a dataset:
|
|
28
|
+
#
|
|
29
|
+
# # my_dataset/my_dataset.rb
|
|
30
|
+
# dataset = IMW::Dataset.new(:my_dataset)
|
|
31
|
+
#
|
|
32
|
+
# then the following paths will be defined:
|
|
33
|
+
#
|
|
34
|
+
# dataset.path_to(:root) #=> my_dataset
|
|
35
|
+
# dataset.path_to(:script) #=> my_dataset/my_dataset.rb
|
|
36
|
+
# dataset.path_to(:ripd) #=> my_dataset/ripd
|
|
37
|
+
# dataset.path_to(:rawd) #=> my_dataset/rawd
|
|
38
|
+
# dataset.path_to(:fixd) #=> my_dataset/fixd
|
|
39
|
+
# dataset.path_to(:pkgd) #=> my_dataset/pkgd
|
|
40
|
+
#
|
|
41
|
+
# Just like IMW itself, the +dataset+ supports adding path
|
|
42
|
+
# references
|
|
43
|
+
#
|
|
44
|
+
# dataset.add_path(:raw_data, :ripd, 'raw_data.xml')
|
|
45
|
+
# dataset.path_to(:raw_data) #=> my_dataset/ripd/raw_data.xml
|
|
46
|
+
#
|
|
47
|
+
# as well as removing them (via <tt>dataset.remove_path</tt>).
|
|
48
|
+
#
|
|
49
|
+
# A subclass of IMW::Dataset can customize these paths by overriding
|
|
50
|
+
# IMW::Dataset#set_default_paths as well as define new ones by
|
|
51
|
+
# overriding IMW::Dataset#set_paths.
|
|
52
|
+
#
|
|
53
|
+
# Setting paths can be skipped altogether by passing the
|
|
54
|
+
# <tt>:skip_paths</tt> option when instantiating a dataset:
|
|
55
|
+
#
|
|
56
|
+
# dataset = IMW::Dataset.new :my_dataset, :skip_paths => true
|
|
57
|
+
#
|
|
58
|
+
# == Utilizing Tasks
|
|
59
|
+
#
|
|
60
|
+
# An IMW::Dataset utilizes Rake to manage tasks needed to transform
|
|
61
|
+
# data. See IMW::Workflow for a description of the pre-defined
|
|
62
|
+
# tasks (+rip+, +parse+, +fix+, +package+).
|
|
63
|
+
#
|
|
64
|
+
# New tasks can be defined
|
|
65
|
+
#
|
|
66
|
+
# dataset.task :get_authorization do
|
|
67
|
+
# # ... get an authorization token
|
|
167
68
|
# end
|
|
168
69
|
#
|
|
169
|
-
#
|
|
170
|
-
#
|
|
171
|
-
#
|
|
70
|
+
# and hooked into the default tasks in the usual Rake manner
|
|
71
|
+
#
|
|
72
|
+
# dataset.task :rip => [:get_authorization]
|
|
73
|
+
#
|
|
74
|
+
# A dataset also has methods for the workflow step tasks to make
|
|
75
|
+
# this easier
|
|
76
|
+
#
|
|
77
|
+
# dataset.rip [:get_authorization]
|
|
78
|
+
#
|
|
79
|
+
# Tasks for a dataset can be accessed and invoked as follows
|
|
80
|
+
#
|
|
81
|
+
# dataset[:rip].invoke
|
|
82
|
+
#
|
|
83
|
+
# as well as by using the command line +imw+ tool.
|
|
84
|
+
#
|
|
85
|
+
# Defining tasks can be skipped altogether by passing the
|
|
86
|
+
# <tt>:skip_workflow</tt> option when instantiating a dataset
|
|
172
87
|
#
|
|
173
|
-
# dataset.
|
|
88
|
+
# dataset = IMW::Dataset.new :my_dataset, :skip_workflow => true
|
|
174
89
|
#
|
|
175
|
-
#
|
|
176
|
-
# <tt>:pkg</tt> tasks depend upon each other, invoking <tt>:pkg</tt>
|
|
177
|
-
# will first cause <tt>:rip</tt> to run.
|
|
90
|
+
# == Working with Repositories
|
|
178
91
|
#
|
|
179
|
-
#
|
|
180
|
-
#
|
|
181
|
-
# simply provide a convenient scaffold for building a data
|
|
182
|
-
# transformation upon.
|
|
92
|
+
# A dataset can be added to a repository by passing the
|
|
93
|
+
# <tt>:repository</tt> option
|
|
183
94
|
#
|
|
184
|
-
#
|
|
185
|
-
#
|
|
186
|
-
# Right Thing where possible. The combination of tasks with
|
|
187
|
-
# matching directory structure is a suggested but not mandatory
|
|
188
|
-
# framework in which to program.
|
|
95
|
+
# repo = IMW::Repository.new
|
|
96
|
+
# dataset = IMW::Dataset.new :my_dataset, :repository => repo
|
|
189
97
|
class Dataset
|
|
190
98
|
|
|
191
|
-
# The <tt>IMW::Workflow</tt> module contains pre-defined tasks for
|
|
192
|
-
# dataset processing.
|
|
193
99
|
include IMW::Workflow
|
|
194
100
|
|
|
195
|
-
attr_accessor :handle, :options
|
|
101
|
+
attr_accessor :handle, :options
|
|
196
102
|
|
|
197
|
-
def initialize options = {}
|
|
103
|
+
def initialize handle, options = {}
|
|
198
104
|
@options = options
|
|
199
|
-
@handle =
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
105
|
+
@handle = handle
|
|
106
|
+
set_default_paths unless options[:skip_paths]
|
|
107
|
+
set_paths unless options[:skip_paths]
|
|
108
|
+
initialize_workflow unless options[:skip_workflow]
|
|
109
|
+
if options[:repository]
|
|
110
|
+
options[:repository][handle] = self
|
|
111
|
+
end
|
|
204
112
|
end
|
|
205
113
|
|
|
206
114
|
end
|
data/lib/imw/parsers.rb
CHANGED
data/lib/imw/repository.rb
CHANGED
|
@@ -1,35 +1,11 @@
|
|
|
1
|
-
require 'imw/utils'
|
|
2
|
-
|
|
3
1
|
module IMW
|
|
4
2
|
|
|
5
|
-
# A Repository is a collection of datasets.
|
|
3
|
+
# A Repository is a collection of datasets. It is used by the
|
|
4
|
+
# command-line +imw+ tool.
|
|
6
5
|
class Repository < Hash
|
|
7
|
-
|
|
8
|
-
# FIXME This should read some configuration settings somewhere and
|
|
9
|
-
# generate a pool specific to each IMW user.
|
|
10
|
-
def self.default
|
|
11
|
-
new
|
|
12
|
-
end
|
|
13
|
-
|
|
6
|
+
alias_method :datasets, :values
|
|
14
7
|
end
|
|
15
|
-
|
|
16
|
-
# The default repository managed by IMW.
|
|
17
|
-
REPOSITORY = Repository.default
|
|
18
8
|
|
|
19
|
-
# Add a dataset to the IMW::REPOSITORY. If the dataset has a
|
|
20
|
-
# +handle+ then it will be used as the key in this repository;
|
|
21
|
-
# otherwise the dataset's class will be used.
|
|
22
|
-
def self.add dataset
|
|
23
|
-
REPOSITORY[dataset.handle] = dataset
|
|
24
|
-
end
|
|
25
|
-
|
|
26
|
-
# Remove a dataset from the IMW::REPOSITORY. Can pass in either a
|
|
27
|
-
# string handle or an instance of the dataset.
|
|
28
|
-
def self.delete handle
|
|
29
|
-
handle = handle.handle if handle.respond_to?(:handle)
|
|
30
|
-
REPOSITORY.delete(handle)
|
|
31
|
-
end
|
|
32
|
-
|
|
33
9
|
end
|
|
34
10
|
|
|
35
11
|
|
data/lib/imw/resource.rb
ADDED
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
require 'addressable/uri'
|
|
2
|
+
require 'imw/resources'
|
|
3
|
+
|
|
4
|
+
module IMW
|
|
5
|
+
|
|
6
|
+
# A resource can be anything addressable via a URI. Examples
|
|
7
|
+
# include local files, remote files, webpages, &c.
|
|
8
|
+
#
|
|
9
|
+
# The IMW::Resource class takes a URI as input and then dynamically
|
|
10
|
+
# extends itself with appropriate modules from IMW::Resources. As
|
|
11
|
+
# an example, calling
|
|
12
|
+
#
|
|
13
|
+
# my_archive = IMW::Resource.new('/path/to/my/archive.tar.bz2')
|
|
14
|
+
#
|
|
15
|
+
# would return an IMW::Resource extended by
|
|
16
|
+
# IMW::Resources::Archives::Tarbz2 (among other modules) which
|
|
17
|
+
# therefore has methods for extracting, listing, and appending to
|
|
18
|
+
# the archive.
|
|
19
|
+
#
|
|
20
|
+
# Modules are so extended based on handlers defined in the
|
|
21
|
+
# <tt>imw/resources</tt> directory and accessible via
|
|
22
|
+
# IMW::Resources#handlers. You can define your own handlers by
|
|
23
|
+
# defining the constant IMW::Resources::USER_DEFINED_HANDLERS in
|
|
24
|
+
# your configuration file.
|
|
25
|
+
#
|
|
26
|
+
# The modules extending a particular IMW::Resource instance can be
|
|
27
|
+
# listed as follows
|
|
28
|
+
#
|
|
29
|
+
# my_archive.resource_modules #=> [IMW::Resources::LocalObj, IMW::Resources::LocalFile, IMW::Resources::Compressible, IMW::Resources::Archives::Tarbz2]
|
|
30
|
+
#
|
|
31
|
+
# By default, resources are opened for reading. Passing in the
|
|
32
|
+
# appropriate <tt>:mode</tt> option changes this:
|
|
33
|
+
#
|
|
34
|
+
# IMW::Resource.new('/path/to/my_new_file', :mode => 'w')
|
|
35
|
+
#
|
|
36
|
+
# If the <tt>:skip_modules</tt> option is passed in then the
|
|
37
|
+
# resource will not extend itself with any modules and will
|
|
38
|
+
# essentially only retain the bare functionality of a URI. This can
|
|
39
|
+
# be useful when subclassing IMW::Resource or dealing with a very
|
|
40
|
+
# strange kind of resource.
|
|
41
|
+
#
|
|
42
|
+
# Read the documentation for modules in IMW::Resources to learn more
|
|
43
|
+
# about the various behaviors an IMW::Resource can acquire.
|
|
44
|
+
class Resource
|
|
45
|
+
|
|
46
|
+
attr_reader :uri, :mode
|
|
47
|
+
|
|
48
|
+
def initialize uri, options={}
|
|
49
|
+
self.uri = uri
|
|
50
|
+
@mode = options[:mode] || 'r'
|
|
51
|
+
extend_appropriately! unless options[:skip_modules]
|
|
52
|
+
end
|
|
53
|
+
|
|
54
|
+
# Return the modules this resource has been extended by.
|
|
55
|
+
#
|
|
56
|
+
# @return [Array] the modules this resource has been extended by.
|
|
57
|
+
def resource_modules
|
|
58
|
+
@resource_modules ||= []
|
|
59
|
+
end
|
|
60
|
+
|
|
61
|
+
# Works just like Object#extend except it keeps track of the
# modules it has extended, see Resource#resource_modules.
#
# Like Object#extend, any number of modules may be given in a
# single call; each one is recorded in Resource#resource_modules.
#
# @param [Array<Module>] mods the modules to extend this resource with
# @return [IMW::Resource] this resource
def extend *mods
  resource_modules.concat(mods)
  # Bare +super+ forwards every module to Object#extend.
  super
end
|
|
67
|
+
|
|
68
|
+
# Extend this resource with modules by passing it through a
|
|
69
|
+
# collection of handlers defined by IMW::Resources#handlers
|
|
70
|
+
def extend_appropriately!
|
|
71
|
+
IMW::Resources.extend_resource!(self)
|
|
72
|
+
end
|
|
73
|
+
|
|
74
|
+
# Set the URI of this resource by parsing the given +uri+ (if
# necessary).
#
# An Addressable::URI is stored as given. Anything else is
# converted to a string and parsed. If parsing fails with an
# invalid-URI error, the string is percent-encoded and parsed
# again, and <tt>@encoded_uri</tt> is set to +true+.
#
# @param [String, Addressable::URI] uri the uri to parse
def uri= uri
  if uri.is_a?(Addressable::URI)
    @uri = uri
  else
    begin
      @uri = Addressable::URI.parse(uri.to_s)
    rescue Addressable::URI::InvalidURIError, ::URI::InvalidURIError
      # Addressable::URI.parse raises Addressable's own
      # InvalidURIError, which is unrelated to the standard
      # library's URI::InvalidURIError, so both are rescued here.
      #
      # NOTE(review): URI.encode was removed from the standard
      # library in Ruby 3.0 -- this fallback needs a replacement
      # encoder before running on modern Rubies.
      @uri = Addressable::URI.parse(URI.encode(uri.to_s))
      @encoded_uri = true
    end
  end
end
|
|
90
|
+
|
|
91
|
+
# The scheme of this resource. Will be +nil+ for local resources.
|
|
92
|
+
#
|
|
93
|
+
# @return [String]
|
|
94
|
+
def scheme
|
|
95
|
+
@scheme ||= uri.scheme
|
|
96
|
+
end
|
|
97
|
+
|
|
98
|
+
# The directory name of this resource's path.
|
|
99
|
+
#
|
|
100
|
+
# @return [String]
|
|
101
|
+
def dirname
|
|
102
|
+
@dirname ||= File.dirname(path)
|
|
103
|
+
end
|
|
104
|
+
|
|
105
|
+
# The basename of this resource's path.
|
|
106
|
+
#
|
|
107
|
+
# @return [String]
|
|
108
|
+
def basename
|
|
109
|
+
@basename ||= File.basename(path)
|
|
110
|
+
end
|
|
111
|
+
|
|
112
|
+
# Returns the extension (INCLUDING the '.') of this resource's
|
|
113
|
+
# path. Redefine this in an including class for which this is
|
|
114
|
+
# weird ('.tar.gz' I'm talking to you...)
|
|
115
|
+
#
|
|
116
|
+
# @return [String]
|
|
117
|
+
def extname
|
|
118
|
+
@extname ||= File.extname(path)
|
|
119
|
+
end
|
|
120
|
+
|
|
121
|
+
# Returns the extension (WITHOUT the '.') of this resource's path.
|
|
122
|
+
#
|
|
123
|
+
# @return [String]
|
|
124
|
+
def extension
|
|
125
|
+
@extension ||= extname[1..-1] || ''
|
|
126
|
+
end
|
|
127
|
+
|
|
128
|
+
# Returns the basename of the file with its extension removed
|
|
129
|
+
#
|
|
130
|
+
# IMW.open('/path/to/some_file.tar.gz').name # => some_file
|
|
131
|
+
#
|
|
132
|
+
# @return [String]
|
|
133
|
+
def name
|
|
134
|
+
@name ||= extname ? basename[0,basename.length - extname.length] : basename
|
|
135
|
+
end
|
|
136
|
+
|
|
137
|
+
def to_s
|
|
138
|
+
uri.to_s
|
|
139
|
+
end
|
|
140
|
+
|
|
141
|
+
# Raise an error unless this resource exists.
|
|
142
|
+
#
|
|
143
|
+
# @param [String] message an optional message to include
|
|
144
|
+
def should_exist!(message=nil)
|
|
145
|
+
raise IMW::Error.new([message, "No path defined for #{self.inspect} extended by #{resource_modules.join(' ')}"].compact.join(', ')) unless respond_to?(:path)
|
|
146
|
+
raise IMW::Error.new([message, "No exist? method defined for #{self.inspect} extended by #{resource_modules.join(' ')}"].compact.join(', ')) unless respond_to?(:exist?)
|
|
147
|
+
raise IMW::PathError.new([message, "#{path} does not exist"].compact.join(', ')) unless exist?
|
|
148
|
+
end
|
|
149
|
+
|
|
150
|
+
# Open a copy of this resource.
|
|
151
|
+
#
|
|
152
|
+
# This is useful when wanting to reset file handles. Though -- be
|
|
153
|
+
# warned -- it does not close any file handles itself...
|
|
154
|
+
#
|
|
155
|
+
# @return [IMW::Resource] the new (old) resource
|
|
156
|
+
def reopen
|
|
157
|
+
IMW.open(self.uri.to_s)
|
|
158
|
+
end
|
|
159
|
+
|
|
160
|
+
# If +method+ begins with the strings +is+, +on+, or +via+ and
# ends with a question mark then we interpret it as a question
# this resource doesn't know how to answer -- so we have it answer
# +false+.
#
# As an example, consider the following loop:
#
#   IMW.open('/tmp').all_contents.each do |obj|
#     if obj.is_archive?
#       # ... do something
#     end
#   end
#
# When +obj+ is initialized and it _isn't_ an archive, then it
# doesn't know about the <tt>is_archive?</tt> method -- but it
# should therefore answer false anyway.
#
# This lets a basic text file answer questions about whether it's
# an archive (or on S3, or accessed via some user-defined scheme,
# &c.) without needing to know anything about archives (or S3 or
# the user-defined scheme).
#
# @param [Symbol] method the name of the missing method
# @param [Array] args the arguments the missing method was called with
# @return [false] when +method+ is an argument-less <tt>is_</tt>, <tt>on_</tt>, or <tt>via_</tt> query
# @raise [IMW::NoMethodError] for any other missing method
def method_missing method, *args
  # Anchored at both ends so that only names which *begin* with
  # is_/on_/via_ (not ones that merely contain them) answer false.
  return false if args.empty? && method.to_s =~ /\A(is|on|via)_.*\?\z/
  raise IMW::NoMethodError, "undefined method `#{method}' for #{self}, extended by #{resource_modules.join(', ')}"
end
|
|
189
|
+
end
|
|
190
|
+
end
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
module IMW
|
|
2
|
+
module Resources
|
|
3
|
+
|
|
4
|
+
module Archives
|
|
5
|
+
autoload :Rar, 'imw/resources/archives_and_compressed/rar'
|
|
6
|
+
autoload :Tar, 'imw/resources/archives_and_compressed/tar'
|
|
7
|
+
autoload :Tarbz2, 'imw/resources/archives_and_compressed/tarbz2'
|
|
8
|
+
autoload :Targz, 'imw/resources/archives_and_compressed/targz'
|
|
9
|
+
autoload :Zip, 'imw/resources/archives_and_compressed/zip'
|
|
10
|
+
end
|
|
11
|
+
|
|
12
|
+
# Defines methods for creating, appending to, extracting, and
|
|
13
|
+
# listing an archive file. This module isn't used to directly
|
|
14
|
+
# extend an IMW::Resource -- instead, format specific modules
|
|
15
|
+
# (e.g. - IMW::Resources::Archives::Tarbz2) include this module
|
|
16
|
+
# and define the specific settings (command-line flags, &c.)
|
|
17
|
+
# required to make things work.
|
|
18
|
+
module Archive
|
|
19
|
+
|
|
20
|
+
attr_accessor :archive_settings
|
|
21
|
+
|
|
22
|
+
# Is this file an archive?
|
|
23
|
+
#
|
|
24
|
+
# @return [true, false]
|
|
25
|
+
def is_archive?
|
|
26
|
+
true
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
# Create an archive of the given +input_paths+.
|
|
30
|
+
#
|
|
31
|
+
# @param [String, IMW::Resource] input_paths the paths to add to this archive
|
|
32
|
+
def create *input_paths
|
|
33
|
+
should_have_archive_setting!("Cannot create archive #{path}", :program, :create)
|
|
34
|
+
IMW.system archive_settings[:program], archive_settings[:create], path, *input_paths.flatten
|
|
35
|
+
self
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
# Append to this archive the given +input_paths+.
#
# Both the <tt>:program</tt> and <tt>:append</tt> archive settings
# are used to build the command, so both are required.
#
# @param [String, IMW::Resource] input_paths the paths to add to this archive
# @return [IMW::Resources::Archive] this archive
def append *input_paths
  # :program is checked too (as in #create) so a missing program is
  # reported as a settings error rather than passed on as nil.
  should_have_archive_setting!("Cannot append to archive #{path}", :program, :append)
  IMW.system archive_settings[:program], archive_settings[:append], path, *input_paths.flatten
  self
end
|
|
46
|
+
|
|
47
|
+
# Extract the files from this archive to the current directory.
#
# The <tt>:unarchiving_program</tt> archive setting is used when
# defined, otherwise <tt>:program</tt>; one of the two is required,
# as is the <tt>:extract</tt> setting.
#
# @return the result of the IMW.system call
def extract
  should_exist!("Cannot extract archive.")
  should_have_archive_setting!("Cannot extract archive #{path}", :extract, [:unarchiving_program, :program])
  program = archive_settings[:unarchiving_program] || archive_settings[:program]
  IMW.system program, archive_settings[:extract], path
end
|
|
54
|
+
|
|
55
|
+
# Return a list of contents in this archive.
#
# The archive program's "list" command is run through the shell
# and its output is handed to +archive_contents_string_to_array+.
#
# @return [Array] a list of paths in the archive.
def contents
  # Standard library; required here because this file has no
  # top-level require block of its own.
  require 'shellwords'
  should_exist!("Cannot list archive contents.")
  should_have_archive_setting!("Cannot list archive #{path}", :list, [:unarchiving_program, :program])
  program = archive_settings[:unarchiving_program] || archive_settings[:program]
  # FIXME this needs to be more robust (the flags are still
  # interpolated into the command unescaped)
  flags = archive_settings[:list]
  flags = flags.join(' ') if flags.is_a?(Array)
  # Escape the whole path, not just its spaces, so that quotes, `$'
  # and other shell metacharacters cannot break the command.
  command = [program, flags, Shellwords.escape(path)].join(' ')
  archive_contents_string_to_array(`#{command}`)
end
|
|
69
|
+
|
|
70
|
+
protected
|
|
71
|
+
|
|
72
|
+
# Raise an IMW::Error unless every one of +settings+ is defined in
# +archive_settings+.
#
# Each entry of +settings+ is either a single key, which must be
# defined, or an Array of alternative keys, at least one of which
# must be defined.
#
# @param [String, nil] message an optional message to prefix the error with
# @param [Array<Symbol, Array<Symbol>>] settings the required settings
# @raise [IMW::Error] if a required setting is missing
def should_have_archive_setting! message=nil,*settings # :nodoc:
  # archive_settings is a plain accessor and may never have been
  # assigned; treat that as "nothing defined" so the caller gets
  # the intended IMW::Error rather than a NoMethodError on nil.
  defined_settings = archive_settings || {}
  settings.each do |setting|
    if setting.is_a?(Array)
      next if setting.any? { |alternative| defined_settings[alternative] }
      raise IMW::Error.new([message, "Must define one of #{setting.join(', ')} in archive_settings"].compact.join(', '))
    else
      next if defined_settings[setting]
      raise IMW::Error.new([message, "Must define #{setting} in archive_settings"].compact.join(', '))
    end
  end
end
|
|
81
|
+
|
|
82
|
+
# Parse and format the output from the archive program's "list"
|
|
83
|
+
# command into an array of filenames.
|
|
84
|
+
#
|
|
85
|
+
# An including class can override this method to match the
|
|
86
|
+
# output from the archiving program of that class.
|
|
87
|
+
#
|
|
88
|
+
# @param [String] string the raw output from the archive program's "list" command
|
|
89
|
+
# @return [Array] a list of paths in the archive
|
|
90
|
+
def archive_contents_string_to_array string
|
|
91
|
+
string.split("\n")
|
|
92
|
+
end
|
|
93
|
+
end
|
|
94
|
+
end
|
|
95
|
+
end
|
|
96
|
+
|
|
97
|
+
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
module IMW
|
|
2
|
+
module Resources
|
|
3
|
+
module CompressedFiles
|
|
4
|
+
module Bz2
|
|
5
|
+
|
|
6
|
+
include IMW::Resources::CompressedFile
|
|
7
|
+
|
|
8
|
+
def compression_settings
|
|
9
|
+
@compression_settings ||= {
|
|
10
|
+
:decompression_program => :bzip2,
|
|
11
|
+
:decompress => '-fd'
|
|
12
|
+
}
|
|
13
|
+
end
|
|
14
|
+
|
|
15
|
+
end
|
|
16
|
+
end
|
|
17
|
+
end
|
|
18
|
+
end
|