imw 0.1.1 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +4 -1
- data/Rakefile +10 -0
- data/TODO +18 -0
- data/VERSION +1 -1
- data/bin/imw +1 -1
- data/etc/imwrc.rb +0 -50
- data/examples/dataset.rb +12 -0
- data/lib/imw/boot.rb +55 -9
- data/lib/imw/dataset/paths.rb +15 -24
- data/lib/imw/dataset/workflow.rb +131 -72
- data/lib/imw/dataset.rb +94 -186
- data/lib/imw/parsers/html_parser.rb +1 -1
- data/lib/imw/parsers.rb +1 -1
- data/lib/imw/repository.rb +3 -27
- data/lib/imw/resource.rb +190 -0
- data/lib/imw/resources/archive.rb +97 -0
- data/lib/imw/resources/archives_and_compressed/bz2.rb +18 -0
- data/lib/imw/resources/archives_and_compressed/gz.rb +18 -0
- data/lib/imw/resources/archives_and_compressed/rar.rb +23 -0
- data/lib/imw/resources/archives_and_compressed/tar.rb +23 -0
- data/lib/imw/resources/archives_and_compressed/tarbz2.rb +78 -0
- data/lib/imw/resources/archives_and_compressed/targz.rb +78 -0
- data/lib/imw/resources/archives_and_compressed/zip.rb +57 -0
- data/lib/imw/resources/archives_and_compressed.rb +32 -0
- data/lib/imw/resources/compressed_file.rb +89 -0
- data/lib/imw/resources/compressible.rb +77 -0
- data/lib/imw/resources/formats/delimited.rb +92 -0
- data/lib/imw/resources/formats/excel.rb +125 -0
- data/lib/imw/resources/formats/json.rb +53 -0
- data/lib/imw/resources/formats/sgml.rb +72 -0
- data/lib/imw/resources/formats/yaml.rb +53 -0
- data/lib/imw/resources/formats.rb +32 -0
- data/lib/imw/resources/local.rb +198 -0
- data/lib/imw/resources/remote.rb +110 -0
- data/lib/imw/resources/schemes/hdfs.rb +242 -0
- data/lib/imw/resources/schemes/http.rb +161 -0
- data/lib/imw/resources/schemes/s3.rb +137 -0
- data/lib/imw/resources/schemes.rb +19 -0
- data/lib/imw/resources.rb +118 -0
- data/lib/imw/runner.rb +5 -4
- data/lib/imw/transforms/archiver.rb +215 -0
- data/lib/imw/transforms/transferer.rb +103 -0
- data/lib/imw/transforms.rb +8 -0
- data/lib/imw/utils/error.rb +26 -30
- data/lib/imw/utils/extensions/array.rb +5 -15
- data/lib/imw/utils/extensions/hash.rb +6 -16
- data/lib/imw/utils/extensions/hpricot.rb +0 -14
- data/lib/imw/utils/extensions/string.rb +5 -15
- data/lib/imw/utils/extensions/symbol.rb +0 -13
- data/lib/imw/utils/extensions.rb +65 -0
- data/lib/imw/utils/log.rb +14 -13
- data/lib/imw/utils/misc.rb +0 -6
- data/lib/imw/utils/paths.rb +101 -42
- data/lib/imw/utils/version.rb +8 -9
- data/lib/imw/utils.rb +2 -18
- data/lib/imw.rb +92 -17
- data/spec/data/sample.csv +1 -1
- data/spec/data/sample.json +1 -0
- data/spec/data/sample.tsv +1 -1
- data/spec/data/sample.txt +1 -1
- data/spec/data/sample.xml +1 -1
- data/spec/data/sample.yaml +1 -1
- data/spec/imw/dataset/paths_spec.rb +32 -0
- data/spec/imw/dataset/workflow_spec.rb +41 -0
- data/spec/imw/resource_spec.rb +79 -0
- data/spec/imw/resources/archive_spec.rb +69 -0
- data/spec/imw/resources/archives_and_compressed/bz2_spec.rb +15 -0
- data/spec/imw/resources/archives_and_compressed/gz_spec.rb +15 -0
- data/spec/imw/resources/archives_and_compressed/rar_spec.rb +16 -0
- data/spec/imw/resources/archives_and_compressed/tar_spec.rb +16 -0
- data/spec/imw/resources/archives_and_compressed/tarbz2_spec.rb +24 -0
- data/spec/imw/resources/archives_and_compressed/targz_spec.rb +21 -0
- data/spec/imw/resources/archives_and_compressed/zip_spec.rb +16 -0
- data/spec/imw/resources/compressed_file_spec.rb +48 -0
- data/spec/imw/resources/compressible_spec.rb +36 -0
- data/spec/imw/resources/formats/delimited_spec.rb +33 -0
- data/spec/imw/resources/formats/json_spec.rb +32 -0
- data/spec/imw/resources/formats/sgml_spec.rb +24 -0
- data/spec/imw/resources/formats/yaml_spec.rb +41 -0
- data/spec/imw/resources/local_spec.rb +98 -0
- data/spec/imw/resources/remote_spec.rb +35 -0
- data/spec/imw/resources/schemes/hdfs_spec.rb +61 -0
- data/spec/imw/resources/schemes/http_spec.rb +19 -0
- data/spec/imw/resources/schemes/s3_spec.rb +19 -0
- data/spec/imw/transforms/archiver_spec.rb +120 -0
- data/spec/imw/transforms/transferer_spec.rb +113 -0
- data/spec/imw/utils/paths_spec.rb +5 -33
- data/spec/imw/utils/shared_paths_spec.rb +29 -0
- data/spec/spec_helper.rb +5 -5
- data/spec/support/paths_matcher.rb +67 -0
- data/spec/support/random.rb +39 -36
- metadata +88 -75
- data/lib/imw/dataset/task.rb +0 -41
- data/lib/imw/files/archive.rb +0 -113
- data/lib/imw/files/basicfile.rb +0 -122
- data/lib/imw/files/binary.rb +0 -28
- data/lib/imw/files/compressed_file.rb +0 -93
- data/lib/imw/files/compressed_files_and_archives.rb +0 -334
- data/lib/imw/files/compressible.rb +0 -103
- data/lib/imw/files/csv.rb +0 -113
- data/lib/imw/files/directory.rb +0 -62
- data/lib/imw/files/excel.rb +0 -84
- data/lib/imw/files/json.rb +0 -41
- data/lib/imw/files/sgml.rb +0 -46
- data/lib/imw/files/text.rb +0 -68
- data/lib/imw/files/yaml.rb +0 -46
- data/lib/imw/files.rb +0 -125
- data/lib/imw/packagers/archiver.rb +0 -126
- data/lib/imw/packagers/s3_mover.rb +0 -36
- data/lib/imw/packagers.rb +0 -8
- data/lib/imw/utils/components.rb +0 -61
- data/lib/imw/utils/config.rb +0 -46
- data/lib/imw/utils/extensions/class/attribute_accessors.rb +0 -8
- data/lib/imw/utils/extensions/core.rb +0 -27
- data/lib/imw/utils/extensions/dir.rb +0 -24
- data/lib/imw/utils/extensions/file_core.rb +0 -64
- data/lib/imw/utils/extensions/typed_struct.rb +0 -22
- data/lib/imw/utils/extensions/uri.rb +0 -59
- data/lib/imw/utils/view/dump_csv.rb +0 -112
- data/lib/imw/utils/view/dump_csv_older.rb +0 -117
- data/lib/imw/utils/view.rb +0 -113
- data/spec/imw/dataset/datamapper/uri_spec.rb +0 -43
- data/spec/imw/dataset/datamapper_spec_helper.rb +0 -11
- data/spec/imw/files/archive_spec.rb +0 -118
- data/spec/imw/files/basicfile_spec.rb +0 -121
- data/spec/imw/files/bz2_spec.rb +0 -32
- data/spec/imw/files/compressed_file_spec.rb +0 -96
- data/spec/imw/files/compressible_spec.rb +0 -100
- data/spec/imw/files/file_spec.rb +0 -144
- data/spec/imw/files/gz_spec.rb +0 -32
- data/spec/imw/files/rar_spec.rb +0 -33
- data/spec/imw/files/tar_spec.rb +0 -31
- data/spec/imw/files/text_spec.rb +0 -23
- data/spec/imw/files/zip_spec.rb +0 -31
- data/spec/imw/files_spec.rb +0 -38
- data/spec/imw/packagers/archiver_spec.rb +0 -125
- data/spec/imw/packagers/s3_mover_spec.rb +0 -7
- data/spec/imw/utils/extensions/file_core_spec.rb +0 -72
- data/spec/imw/utils/extensions/find_spec.rb +0 -113
- data/spec/imw/workflow/rip/local_spec.rb +0 -89
- data/spec/imw/workflow/rip_spec.rb +0 -27
- data/spec/support/archive_contents_matcher.rb +0 -94
- data/spec/support/directory_contents_matcher.rb +0 -61
data/lib/imw/dataset.rb
CHANGED
@@ -1,206 +1,114 @@
|
|
1
|
-
require 'imw/utils'
|
2
1
|
require 'imw/dataset/workflow'
|
3
2
|
require 'imw/dataset/paths'
|
4
3
|
|
5
4
|
module IMW
|
6
5
|
|
7
|
-
# The IMW::Dataset
|
8
|
-
#
|
9
|
-
#
|
10
|
-
#
|
11
|
-
#
|
12
|
-
#
|
13
|
-
#
|
14
|
-
#
|
15
|
-
#
|
16
|
-
#
|
17
|
-
#
|
18
|
-
#
|
19
|
-
#
|
20
|
-
#
|
21
|
-
#
|
22
|
-
#
|
23
|
-
#
|
24
|
-
#
|
25
|
-
#
|
26
|
-
#
|
27
|
-
#
|
28
|
-
#
|
29
|
-
#
|
30
|
-
#
|
31
|
-
#
|
32
|
-
#
|
33
|
-
#
|
34
|
-
#
|
35
|
-
#
|
36
|
-
#
|
37
|
-
#
|
38
|
-
#
|
39
|
-
#
|
40
|
-
#
|
41
|
-
#
|
42
|
-
#
|
43
|
-
#
|
44
|
-
#
|
45
|
-
#
|
46
|
-
#
|
47
|
-
#
|
48
|
-
#
|
49
|
-
#
|
50
|
-
#
|
51
|
-
#
|
52
|
-
#
|
53
|
-
#
|
54
|
-
#
|
55
|
-
#
|
56
|
-
#
|
57
|
-
#
|
58
|
-
#
|
59
|
-
#
|
60
|
-
#
|
61
|
-
#
|
62
|
-
#
|
63
|
-
#
|
64
|
-
#
|
65
|
-
#
|
66
|
-
#
|
67
|
-
#
|
68
|
-
#
|
69
|
-
#
|
70
|
-
# Where <tt>[ripd]</tt> would be replaced by the IMW
|
71
|
-
# <tt>:ripd</tt> directory. The default <tt>:rip</tt> task is
|
72
|
-
# empty so If there's no need to rip data (perhaps it's already on
|
73
|
-
# disk?) then nothing needs to be done here.
|
74
|
-
#
|
75
|
-
# raw::
|
76
|
-
# Managed by the <tt>:raw</tt> task, data is uncompressed and
|
77
|
-
# extracted (if necessary) and stored in a subdirectory of the
|
78
|
-
# <tt>:data</tt> directory named by the taxon and handle of this
|
79
|
-
# dataset.
|
80
|
-
#
|
81
|
-
# dataset.task :raw do
|
82
|
-
# IMW::Raw.uncompress_and_extract File.join(dataset.path_to(:ripd),'http/_edu/chimpu_econ/datasets'),
|
83
|
-
# Dir[File.join(dataset.path_to(:ripd),'sql/_edu/chimpu_astro_public/**/*.tsv')].first
|
84
|
-
# #=> [data]/economics/alarming_trends/recent_history_of_banana_prices/rawd/001.xml
|
85
|
-
# [data]/economics/alarming_trends/recent_history_of_banana_prices/rawd/002.xml
|
86
|
-
# [data]/economics/alarming_trends/recent_history_of_banana_prices/rawd/003.xml
|
87
|
-
# ...
|
88
|
-
# [data]/economics/alarming_trends/recent_history_of_banana_prices/rawd/select_from_hurricane_frequency-2009-02-16--15:30:26.tsv
|
89
|
-
# end
|
90
|
-
#
|
91
|
-
# Where <tt>[data]</tt> would be replaced by the IMW
|
92
|
-
# <tt>:data</tt> directory.
|
93
|
-
#
|
94
|
-
# If this dataset didn't have a taxon
|
95
|
-
# (economics/alarming_trends) its files would be stored in a
|
96
|
-
# directory +recent_history_of_banana_prices+ just below the
|
97
|
-
# <tt>:data</tt> directory.
|
98
|
-
#
|
99
|
-
# fix::
|
100
|
-
# Managed by the <tt>:fix</tt> task, transformations on the data
|
101
|
-
# are performed. IMW's method is to read data from a source
|
102
|
-
# format (XML, YAML, CSV, &c.) into Ruby objects with hash
|
103
|
-
# semantics. These objects might be based upon structs,
|
104
|
-
# ActiveRecord, DataMapper::Resource, FasterCSV...anything which
|
105
|
-
# can be accessed as <tt>thing.property</tt> (FIXME 'and' or 'or'
|
106
|
-
# ) <tt>thing[:property]</tt>: the Infinite Monkeywrench fits
|
107
|
-
# neatly into your toobox.
|
108
|
-
#
|
109
|
-
#
|
110
|
-
# # Open an output file in XML for writing
|
111
|
-
# output = IMW.open! File.join(dataset.path_to(:fixd), 'date_bananas_hurricanes.csv')
|
112
|
-
# #=> FasterCSV at [fixd]/economics/alarming_trends/recent_history_of_banana_prices/fixd/data_bananas_hurricanes.csv
|
113
|
-
#
|
114
|
-
# # A place to store the combined data
|
115
|
-
# correlations = []
|
116
|
-
#
|
117
|
-
# dataset.task :fix do
|
118
|
-
#
|
119
|
-
# # Return the contents of the weather data which has rows like
|
120
|
-
# #
|
121
|
-
# # 1 2008-09-01 4
|
122
|
-
# # 2 2008-09-08 3
|
123
|
-
# # 3 2008-08-15 3
|
124
|
-
# # ...
|
125
|
-
# #
|
126
|
-
# weather_data = IMW.open(Dir[File.join(dataset.path_to(:rawd), '*.tsv')].first,
|
127
|
-
# :headers => ["ID","DATE","NUM_HURRICANES"]).entries
|
128
|
-
# #=> [#<FasterCSV::Row "ID":nil "DATE":Mon Sep 08 04:15:47 -0600 2008,"NUM_HURRICANES":4>, ... ]
|
129
|
-
#
|
130
|
-
#
|
131
|
-
# # Return the matching data from the produce prices XML file which looks like
|
132
|
-
# #
|
133
|
-
# # <prices>
|
134
|
-
# # <price type="apple">
|
135
|
-
# # <date>2008/09/01</date>
|
136
|
-
# # <amount>0.15</amount>
|
137
|
-
# # </price>
|
138
|
-
# # <price type="banana">
|
139
|
-
# # <date>2008/09/01</date>
|
140
|
-
# # <amount>0.20</amount>
|
141
|
-
# # </price>
|
142
|
-
# # ...
|
143
|
-
# # </prices>
|
144
|
-
# parser = IMW::XMLParser.new :records => [ 'prices/price[@type="banana"]',
|
145
|
-
# { :week => 'date',
|
146
|
-
# :price => 'amount' }]
|
147
|
-
#
|
148
|
-
# # Loop through the XML produce prices, mixing in the hurricane data,
|
149
|
-
# # and outputting new rows.
|
150
|
-
# Dir["#{dataset.path_to :rawd}*.xml"] each do |file|
|
151
|
-
# IMW.open file do |xml| #=> Hpricot::Doc
|
152
|
-
# parser.parse(xml).each do |record|
|
153
|
-
# num_hurricanes = weather_data.(lambda { nil }) {|id,week,num_hurricanes| week == record.week}
|
154
|
-
# output << [week,record[:price],num_hurricanes]
|
155
|
-
# end
|
156
|
-
# end
|
157
|
-
# end
|
158
|
-
# end
|
159
|
-
#
|
160
|
-
# package::
|
161
|
-
# Data is packaged and compressed (if necessary) into a delivery
|
162
|
-
# format and deposited into the <tt>:pkgd</tt> directory.
|
163
|
-
#
|
164
|
-
# dataset.task :pkg do
|
165
|
-
# IMW.open(File.join(dataset.path_to(:fixd), 'date_bananas_hurricanes.csv')).compress!
|
166
|
-
# #=> [data]/economics/alarming_trends/recent_history_of_banana_prices/pkgd/date_bananas_hurricanes.csv.bz2
|
6
|
+
# The IMW::Dataset represents a common object in which paths, data
|
7
|
+
# resources, and various tasks can be intermingled to define a
|
8
|
+
# complex transformation of data.
|
9
|
+
#
|
10
|
+
# == Organizing Paths
|
11
|
+
#
|
12
|
+
# IMW encourages you to work within the following directory
|
13
|
+
# structure for a dataset +my_dataset+:
|
14
|
+
#
|
15
|
+
# my_dataset/
|
16
|
+
# |-- my_dataset.rb
|
17
|
+
# |-- ripd
|
18
|
+
# | `-- ...
|
19
|
+
# |-- rawd
|
20
|
+
# | `-- ...
|
21
|
+
# |-- fixd
|
22
|
+
# | `-- ...
|
23
|
+
# `-- pkgd
|
24
|
+
# `-- ...
|
25
|
+
#
|
26
|
+
# Just like IMW itself, a dataset can manage a collection of paths.
|
27
|
+
# If <tt>my_dataset.rb</tt> defines a dataset:
|
28
|
+
#
|
29
|
+
# # my_dataset/my_dataset.rb
|
30
|
+
# dataset = IMW::Dataset.new(:my_dataset)
|
31
|
+
#
|
32
|
+
# then the following paths will be defined:
|
33
|
+
#
|
34
|
+
# dataset.path_to(:root) #=> my_dataset
|
35
|
+
# dataset.path_to(:script) #=> my_dataset/my_dataset.rb
|
36
|
+
# dataset.path_to(:ripd) #=> my_dataset/ripd
|
37
|
+
# dataset.path_to(:rawd) #=> my_dataset/rawd
|
38
|
+
# dataset.path_to(:fixd) #=> my_dataset/fixd
|
39
|
+
# dataset.path_to(:pkgd) #=> my_dataset/pkgd
|
40
|
+
#
|
41
|
+
# Just like IMW itself, the +dataset+ supports adding path
|
42
|
+
# references
|
43
|
+
#
|
44
|
+
# dataset.add_path(:raw_data, :ripd, 'raw_data.xml')
|
45
|
+
# dataset.path_to(:raw_data) #=> my_dataset/ripd/raw_data.xml
|
46
|
+
#
|
47
|
+
# as well as removed (via <tt>dataset.remove_path</tt>).
|
48
|
+
#
|
49
|
+
# A subclass of IMW::Dataset can customize these paths by overriding
|
50
|
+
# IMW::Dataset#set_default_paths as well as define new ones by
|
51
|
+
# overriding IMW::Dataset#set_paths.
|
52
|
+
#
|
53
|
+
# Setting paths can be skipped altogether by passing the
|
54
|
+
# <tt>:skip_paths</tt> option when instantiating a dataset:
|
55
|
+
#
|
56
|
+
# dataset = IMW::Dataset.new :my_dataset, :skip_paths => true
|
57
|
+
#
|
58
|
+
# == Utilizing Tasks
|
59
|
+
#
|
60
|
+
# An IMW::Dataset utilizes Rake to manage tasks needed to transform
|
61
|
+
# data. See IMW::Workflow for a description of the pre-defined
|
62
|
+
# tasks (+rip+, +parse+, +fix+, +package+).
|
63
|
+
#
|
64
|
+
# New tasks can be defined
|
65
|
+
#
|
66
|
+
# dataset.task :get_authorization do
|
67
|
+
# # ... get an authorization token
|
167
68
|
# end
|
168
69
|
#
|
169
|
-
#
|
170
|
-
#
|
171
|
-
#
|
70
|
+
# and hooked into the default tasks in the usual Rake manner
|
71
|
+
#
|
72
|
+
# dataset.task :rip => [:get_authorization]
|
73
|
+
#
|
74
|
+
# A dataset also has methods for the workflow step tasks to make
|
75
|
+
# this easier
|
76
|
+
#
|
77
|
+
# dataset.rip [:get_authorization]
|
78
|
+
#
|
79
|
+
# Tasks for a dataset can be accessed and invoked as follows
|
80
|
+
#
|
81
|
+
# dataset[:rip].invoke
|
82
|
+
#
|
83
|
+
# as well as by using the command line +imw+ tool.
|
84
|
+
#
|
85
|
+
# Defining tasks can be skipped altogether by passing the
|
86
|
+
# <tt>:skip_workflow</tt> option when instantiating a dataset
|
172
87
|
#
|
173
|
-
# dataset.
|
88
|
+
# dataset = IMW::Dataset.new :my_dataset, :skip_workflow => true
|
174
89
|
#
|
175
|
-
#
|
176
|
-
# <tt>:pkg</tt> tasks depend upon each other, invoking <tt>:pkg</tt>
|
177
|
-
# will first cause <tt>:rip</tt> to run.
|
90
|
+
# == Working with Repositories
|
178
91
|
#
|
179
|
-
#
|
180
|
-
#
|
181
|
-
# simply provide a convenient scaffold for building a data
|
182
|
-
# transformation upon.
|
92
|
+
# A dataset can be added to a repository by passing the
|
93
|
+
# <tt>:repository</tt> option
|
183
94
|
#
|
184
|
-
#
|
185
|
-
#
|
186
|
-
# Right Thing where possible. The combination of tasks with
|
187
|
-
# matching directory structure is a suggested but not mandatory
|
188
|
-
# framework in which to program.
|
95
|
+
# repo = IMW::Repository.new
|
96
|
+
# dataset = IMW::Dataset.new :my_dataset, :repository => repo
|
189
97
|
class Dataset
|
190
98
|
|
191
|
-
# The <tt>IMW::Workflow</tt> module contains pre-defined tasks for
|
192
|
-
# dataset processing.
|
193
99
|
include IMW::Workflow
|
194
100
|
|
195
|
-
attr_accessor :handle, :options
|
101
|
+
attr_accessor :handle, :options
|
196
102
|
|
197
|
-
def initialize options = {}
|
103
|
+
def initialize handle, options = {}
|
198
104
|
@options = options
|
199
|
-
@handle =
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
105
|
+
@handle = handle
|
106
|
+
set_default_paths unless options[:skip_paths]
|
107
|
+
set_paths unless options[:skip_paths]
|
108
|
+
initialize_workflow unless options[:skip_workflow]
|
109
|
+
if options[:repository]
|
110
|
+
options[:repository][handle] = self
|
111
|
+
end
|
204
112
|
end
|
205
113
|
|
206
114
|
end
|
data/lib/imw/parsers.rb
CHANGED
data/lib/imw/repository.rb
CHANGED
@@ -1,35 +1,11 @@
|
|
1
|
-
require 'imw/utils'
|
2
|
-
|
3
1
|
module IMW
|
4
2
|
|
5
|
-
# A Repository is a collection of datasets.
|
3
|
+
# A Repository is a collection of datasets. It is used by the
|
4
|
+
# command-line +imw+ tool.
|
6
5
|
class Repository < Hash
|
7
|
-
|
8
|
-
# FIXME This should read some configuration settings somewhere and
|
9
|
-
# generate a pool specific to each IMW user.
|
10
|
-
def self.default
|
11
|
-
new
|
12
|
-
end
|
13
|
-
|
6
|
+
alias_method :datasets, :values
|
14
7
|
end
|
15
|
-
|
16
|
-
# The default repository managed by IMW.
|
17
|
-
REPOSITORY = Repository.default
|
18
8
|
|
19
|
-
# Add a dataset to the IMW::REPOSITORY. If the dataset has a
|
20
|
-
# +handle+ then it will be used as the key in this repository;
|
21
|
-
# otherwise the dataset's class will be used.
|
22
|
-
def self.add dataset
|
23
|
-
REPOSITORY[dataset.handle] = dataset
|
24
|
-
end
|
25
|
-
|
26
|
-
# Remove a dataset from the IMW::REPOSITORY. Can pass in either a
|
27
|
-
# string handle or an instance of the dataset.
|
28
|
-
def self.delete handle
|
29
|
-
handle = handle.handle if handle.respond_to?(:handle)
|
30
|
-
REPOSITORY.delete(handle)
|
31
|
-
end
|
32
|
-
|
33
9
|
end
|
34
10
|
|
35
11
|
|
data/lib/imw/resource.rb
ADDED
@@ -0,0 +1,190 @@
|
|
1
|
+
require 'addressable/uri'
|
2
|
+
require 'imw/resources'
|
3
|
+
|
4
|
+
module IMW
|
5
|
+
|
6
|
+
# A resource can be anything addressable via a URI. Examples
|
7
|
+
# include local files, remote files, webpages, &c.
|
8
|
+
#
|
9
|
+
# The IMW::Resource class takes a URI as input and then dynamically
|
10
|
+
# extends itself with appropriate modules from IMW::Resources. As
|
11
|
+
# an example, calling
|
12
|
+
#
|
13
|
+
# my_archive = IMW::Resource.new('/path/to/my/archive.tar.bz2')
|
14
|
+
#
|
15
|
+
# would return an IMW::Resource extended by
|
16
|
+
# IMW::Resources::Archives::Tarbz2 (among other modules) which
|
17
|
+
# therefore has methods for extracting, listing, and appending to
|
18
|
+
# the archive.
|
19
|
+
#
|
20
|
+
# Modules are so extended based on handlers defined in the
|
21
|
+
# <tt>imw/resources</tt> directory and accessible via
|
22
|
+
# IMW::Resources#handlers. You can define your own handlers by
|
23
|
+
# defining the constant IMW::Resources::USER_DEFINED_HANDLERS in
|
24
|
+
# your configuration file.
|
25
|
+
#
|
26
|
+
# The modules extending a particular IMW::Resource instance can be
|
27
|
+
# listed as follows
|
28
|
+
#
|
29
|
+
# my_archive.resource_modules #=> [IMW::Resources::LocalObj, IMW::Resources::LocalFile, IMW::Resources::Compressible, IMW::Resources::Archives::Tarbz2]
|
30
|
+
#
|
31
|
+
# By default, resources are opened for reading. Passing in the
|
32
|
+
# appropriate <tt>:mode</tt> option changes this:
|
33
|
+
#
|
34
|
+
# IMW::Resource.new('/path/to/my_new_file', :mode => 'w')
|
35
|
+
#
|
36
|
+
# If the <tt>:skip_modules</tt> option is passed in then the
|
37
|
+
# resource will not extend itself with any modules and will
|
38
|
+
# essentially only retain the bare functionality of a URI. This can
|
39
|
+
# be useful when subclassing IMW::Resource or dealing with a very
|
40
|
+
# strange kind of resource.
|
41
|
+
#
|
42
|
+
# Read the documentation for modules in IMW::Resources to learn more
|
43
|
+
# about the various behaviors an IMW::Resource can acquire.
|
44
|
+
class Resource
|
45
|
+
|
46
|
+
attr_reader :uri, :mode
|
47
|
+
|
48
|
+
def initialize uri, options={}
|
49
|
+
self.uri = uri
|
50
|
+
@mode = options[:mode] || 'r'
|
51
|
+
extend_appropriately! unless options[:skip_modules]
|
52
|
+
end
|
53
|
+
|
54
|
+
# Return the modules this resource has been extended by.
|
55
|
+
#
|
56
|
+
# @return [Array] the modules this resource has been extended by.
|
57
|
+
def resource_modules
|
58
|
+
@resource_modules ||= []
|
59
|
+
end
|
60
|
+
|
61
|
+
# Works just like Object#extend except it keeps track of the
|
62
|
+
# modules it has extended, see Resource#resource_modules.
|
63
|
+
def extend mod
|
64
|
+
resource_modules << mod
|
65
|
+
super mod
|
66
|
+
end
|
67
|
+
|
68
|
+
# Extend this resource with modules by passing it through a
|
69
|
+
# collection of handlers defined by IMW::Resources#handlers
|
70
|
+
def extend_appropriately!
|
71
|
+
IMW::Resources.extend_resource!(self)
|
72
|
+
end
|
73
|
+
|
74
|
+
# Set the URI of this resource by parsing the given +uri+ (if
|
75
|
+
# necessary).
|
76
|
+
#
|
77
|
+
# @param [String, Addressable::URI] uri the uri to parse
|
78
|
+
def uri= uri
|
79
|
+
if uri.is_a?(Addressable::URI)
|
80
|
+
@uri = uri
|
81
|
+
else
|
82
|
+
begin
|
83
|
+
@uri = Addressable::URI.parse(uri.to_s)
|
84
|
+
rescue URI::InvalidURIError
|
85
|
+
@uri = Addressable::URI.parse(URI.encode(uri.to_s))
|
86
|
+
@encoded_uri = true
|
87
|
+
end
|
88
|
+
end
|
89
|
+
end
|
90
|
+
|
91
|
+
# The scheme of this resource. Will be +nil+ for local resources.
|
92
|
+
#
|
93
|
+
# @return [String]
|
94
|
+
def scheme
|
95
|
+
@scheme ||= uri.scheme
|
96
|
+
end
|
97
|
+
|
98
|
+
# The directory name of this resource's path.
|
99
|
+
#
|
100
|
+
# @return [String]
|
101
|
+
def dirname
|
102
|
+
@dirname ||= File.dirname(path)
|
103
|
+
end
|
104
|
+
|
105
|
+
# The basename of this resource's path.
|
106
|
+
#
|
107
|
+
# @return [String]
|
108
|
+
def basename
|
109
|
+
@basename ||= File.basename(path)
|
110
|
+
end
|
111
|
+
|
112
|
+
# Returns the extension (INCLUDING the '.') of this resource's
|
113
|
+
# path. Redefine this in an including class for which this is
|
114
|
+
# weird ('.tar.gz' I'm talking to you...)
|
115
|
+
#
|
116
|
+
# @return [String]
|
117
|
+
def extname
|
118
|
+
@extname ||= File.extname(path)
|
119
|
+
end
|
120
|
+
|
121
|
+
# Returns the extension (WITHOUT the '.') of this resource's path.
|
122
|
+
#
|
123
|
+
# @return [String]
|
124
|
+
def extension
|
125
|
+
@extension ||= extname[1..-1] || ''
|
126
|
+
end
|
127
|
+
|
128
|
+
# Returns the basename of the file with its extension removed
|
129
|
+
#
|
130
|
+
# IMW.open('/path/to/some_file.tar.gz').name # => some_file
|
131
|
+
#
|
132
|
+
# @return [String]
|
133
|
+
def name
|
134
|
+
@name ||= extname ? basename[0,basename.length - extname.length] : basename
|
135
|
+
end
|
136
|
+
|
137
|
+
def to_s
|
138
|
+
uri.to_s
|
139
|
+
end
|
140
|
+
|
141
|
+
# Raise an error unless this resource exists.
|
142
|
+
#
|
143
|
+
# @param [String] message an optional message to include
|
144
|
+
def should_exist!(message=nil)
|
145
|
+
raise IMW::Error.new([message, "No path defined for #{self.inspect} extended by #{resource_modules.join(' ')}"].compact.join(', ')) unless respond_to?(:path)
|
146
|
+
raise IMW::Error.new([message, "No exist? method defined for #{self.inspect} extended by #{resource_modules.join(' ')}"].compact.join(', ')) unless respond_to?(:exist?)
|
147
|
+
raise IMW::PathError.new([message, "#{path} does not exist"].compact.join(', ')) unless exist?
|
148
|
+
end
|
149
|
+
|
150
|
+
# Open a copy of this resource.
|
151
|
+
#
|
152
|
+
# This is useful when wanting to reset file handles. Though -- be
|
153
|
+
# warned -- it does not close any file handles itself...
|
154
|
+
#
|
155
|
+
# @return [IMW::Resource] the new (old) resource
|
156
|
+
def reopen
|
157
|
+
IMW.open(self.uri.to_s)
|
158
|
+
end
|
159
|
+
|
160
|
+
# If +method+ begins with the strings +is+, +on+, or +via+ and
|
161
|
+
# ends with a question mark then we interpret it as a question
|
162
|
+
# this resource doesn't know how to answer -- so we have it answer
|
163
|
+
# +false+.
|
164
|
+
#
|
165
|
+
# As an example, consider the following loop:
|
166
|
+
#
|
167
|
+
# IMW.open('/tmp').all_contents.each do |obj|
|
168
|
+
# if obj.is_archive?
|
169
|
+
# # ... do something
|
170
|
+
# end
|
171
|
+
# end
|
172
|
+
#
|
173
|
+
# When +obj+ is initialized and it _isn't_ an archive, then it
|
174
|
+
# doesn't know about the <tt>is_archive?</tt> method -- but it
|
175
|
+
# should therefore answer false anyway.
|
176
|
+
#
|
177
|
+
# This lets a basic text file answer questions about whether it's
|
178
|
+
# an archive (or on S3, or accessed via some user-defined scheme,
|
179
|
+
# &c.) without needing to know anything about archives (or S3 or
|
180
|
+
# the user-defined scheme).
|
181
|
+
def method_missing method, *args
|
182
|
+
if args.empty? && method.to_s =~ /(is|on|via)_.*\?$/
|
183
|
+
# querying for a boolean response so answer false
|
184
|
+
return false
|
185
|
+
else
|
186
|
+
raise IMW::NoMethodError, "undefined method `#{method}' for #{self}, extended by #{resource_modules.join(', ')}"
|
187
|
+
end
|
188
|
+
end
|
189
|
+
end
|
190
|
+
end
|
@@ -0,0 +1,97 @@
|
|
1
|
+
module IMW
|
2
|
+
module Resources
|
3
|
+
|
4
|
+
module Archives
|
5
|
+
autoload :Rar, 'imw/resources/archives_and_compressed/rar'
|
6
|
+
autoload :Tar, 'imw/resources/archives_and_compressed/tar'
|
7
|
+
autoload :Tarbz2, 'imw/resources/archives_and_compressed/tarbz2'
|
8
|
+
autoload :Targz, 'imw/resources/archives_and_compressed/targz'
|
9
|
+
autoload :Zip, 'imw/resources/archives_and_compressed/zip'
|
10
|
+
end
|
11
|
+
|
12
|
+
# Defines methods for creating, appending to, extracting, and
|
13
|
+
# listing an archive file. This module isn't used to directly
|
14
|
+
# extend an IMW::Resource -- instead, format specific modules
|
15
|
+
# (e.g. - IMW::Resources::Archives::Tarbz2) include this module
|
16
|
+
# and define the specific settings (command-line flags, &c.)
|
17
|
+
# required to make things work.
|
18
|
+
module Archive
|
19
|
+
|
20
|
+
attr_accessor :archive_settings
|
21
|
+
|
22
|
+
# Is this file an archive?
|
23
|
+
#
|
24
|
+
# @return [true, false]
|
25
|
+
def is_archive?
|
26
|
+
true
|
27
|
+
end
|
28
|
+
|
29
|
+
# Create an archive of the given +input_paths+.
|
30
|
+
#
|
31
|
+
# @param [String, IMW::Resource] input_paths the paths to add to this archive
|
32
|
+
def create *input_paths
|
33
|
+
should_have_archive_setting!("Cannot create archive #{path}", :program, :create)
|
34
|
+
IMW.system archive_settings[:program], archive_settings[:create], path, *input_paths.flatten
|
35
|
+
self
|
36
|
+
end
|
37
|
+
|
38
|
+
# Append to this archive the given +input_paths+.
|
39
|
+
#
|
40
|
+
# @param [String, IMW::Resource] input_paths the paths to add to this archive
|
41
|
+
def append *input_paths
|
42
|
+
should_have_archive_setting!("Cannot append to archive #{path}", :append)
|
43
|
+
IMW.system archive_settings[:program], archive_settings[:append], path, *input_paths.flatten
|
44
|
+
self
|
45
|
+
end
|
46
|
+
|
47
|
+
# Extract the files from this archive to the current directory.
|
48
|
+
def extract
|
49
|
+
should_exist!("Cannot extract archive.")
|
50
|
+
should_have_archive_setting!("Cannot extract archive #{path}", :extract, [:unarchving_program, :program])
|
51
|
+
program = archive_settings[:unarchiving_program] || archive_settings[:program]
|
52
|
+
IMW.system program, archive_settings[:extract], path
|
53
|
+
end
|
54
|
+
|
55
|
+
# Return a (sorted) list of contents in this archive.
|
56
|
+
#
|
57
|
+
# @return [Array] a list of paths in the archive.
|
58
|
+
def contents
|
59
|
+
should_exist!("Cannot list archive contents.")
|
60
|
+
should_have_archive_setting!("Cannot list archive #{path}", :list, [:unarchiving_program, :program])
|
61
|
+
program = archive_settings[:unarchiving_program] || archive_settings[:program]
|
62
|
+
# FIXME this needs to be more robust
|
63
|
+
flags = archive_settings[:list]
|
64
|
+
flags = flags.join(' ') if flags.is_a?(Array)
|
65
|
+
command = [program, flags, path.gsub(' ', '\ ')].join(' ')
|
66
|
+
output = `#{command}`
|
67
|
+
archive_contents_string_to_array(output)
|
68
|
+
end
|
69
|
+
|
70
|
+
protected
|
71
|
+
|
72
|
+
def should_have_archive_setting! message=nil,*settings # :nodoc:
|
73
|
+
settings.each do |setting|
|
74
|
+
if setting.is_a?(Array)
|
75
|
+
raise IMW::Error.new([message, "Must define one of #{setting.join(', ')} in archive_settings"].compact.join(', ')) unless setting.any? { |optional_setting| archive_settings[optional_setting] }
|
76
|
+
else
|
77
|
+
raise IMW::Error.new([message, "Must define #{setting} in archive_settings"].compact.join(', ')) unless archive_settings[setting]
|
78
|
+
end
|
79
|
+
end
|
80
|
+
end
|
81
|
+
|
82
|
+
# Parse and format the output from the archive program's "list"
|
83
|
+
# command into an array of filenames.
|
84
|
+
#
|
85
|
+
# An including class can override this method to match the
|
86
|
+
# output from the archiving program of that class.
|
87
|
+
#
|
88
|
+
# @param [String] string the raw output from the archive program's "list" command
|
89
|
+
# @return [Array] a list of paths in the archive
|
90
|
+
def archive_contents_string_to_array string
|
91
|
+
string.split("\n")
|
92
|
+
end
|
93
|
+
end
|
94
|
+
end
|
95
|
+
end
|
96
|
+
|
97
|
+
|
@@ -0,0 +1,18 @@
|
|
1
|
+
module IMW
|
2
|
+
module Resources
|
3
|
+
module CompressedFiles
|
4
|
+
module Bz2
|
5
|
+
|
6
|
+
include IMW::Resources::CompressedFile
|
7
|
+
|
8
|
+
def compression_settings
|
9
|
+
@compression_settings ||= {
|
10
|
+
:decompression_program => :bzip2,
|
11
|
+
:decompress => '-fd'
|
12
|
+
}
|
13
|
+
end
|
14
|
+
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|