imw 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143) hide show
  1. data/.gitignore +4 -1
  2. data/Rakefile +10 -0
  3. data/TODO +18 -0
  4. data/VERSION +1 -1
  5. data/bin/imw +1 -1
  6. data/etc/imwrc.rb +0 -50
  7. data/examples/dataset.rb +12 -0
  8. data/lib/imw/boot.rb +55 -9
  9. data/lib/imw/dataset/paths.rb +15 -24
  10. data/lib/imw/dataset/workflow.rb +131 -72
  11. data/lib/imw/dataset.rb +94 -186
  12. data/lib/imw/parsers/html_parser.rb +1 -1
  13. data/lib/imw/parsers.rb +1 -1
  14. data/lib/imw/repository.rb +3 -27
  15. data/lib/imw/resource.rb +190 -0
  16. data/lib/imw/resources/archive.rb +97 -0
  17. data/lib/imw/resources/archives_and_compressed/bz2.rb +18 -0
  18. data/lib/imw/resources/archives_and_compressed/gz.rb +18 -0
  19. data/lib/imw/resources/archives_and_compressed/rar.rb +23 -0
  20. data/lib/imw/resources/archives_and_compressed/tar.rb +23 -0
  21. data/lib/imw/resources/archives_and_compressed/tarbz2.rb +78 -0
  22. data/lib/imw/resources/archives_and_compressed/targz.rb +78 -0
  23. data/lib/imw/resources/archives_and_compressed/zip.rb +57 -0
  24. data/lib/imw/resources/archives_and_compressed.rb +32 -0
  25. data/lib/imw/resources/compressed_file.rb +89 -0
  26. data/lib/imw/resources/compressible.rb +77 -0
  27. data/lib/imw/resources/formats/delimited.rb +92 -0
  28. data/lib/imw/resources/formats/excel.rb +125 -0
  29. data/lib/imw/resources/formats/json.rb +53 -0
  30. data/lib/imw/resources/formats/sgml.rb +72 -0
  31. data/lib/imw/resources/formats/yaml.rb +53 -0
  32. data/lib/imw/resources/formats.rb +32 -0
  33. data/lib/imw/resources/local.rb +198 -0
  34. data/lib/imw/resources/remote.rb +110 -0
  35. data/lib/imw/resources/schemes/hdfs.rb +242 -0
  36. data/lib/imw/resources/schemes/http.rb +161 -0
  37. data/lib/imw/resources/schemes/s3.rb +137 -0
  38. data/lib/imw/resources/schemes.rb +19 -0
  39. data/lib/imw/resources.rb +118 -0
  40. data/lib/imw/runner.rb +5 -4
  41. data/lib/imw/transforms/archiver.rb +215 -0
  42. data/lib/imw/transforms/transferer.rb +103 -0
  43. data/lib/imw/transforms.rb +8 -0
  44. data/lib/imw/utils/error.rb +26 -30
  45. data/lib/imw/utils/extensions/array.rb +5 -15
  46. data/lib/imw/utils/extensions/hash.rb +6 -16
  47. data/lib/imw/utils/extensions/hpricot.rb +0 -14
  48. data/lib/imw/utils/extensions/string.rb +5 -15
  49. data/lib/imw/utils/extensions/symbol.rb +0 -13
  50. data/lib/imw/utils/extensions.rb +65 -0
  51. data/lib/imw/utils/log.rb +14 -13
  52. data/lib/imw/utils/misc.rb +0 -6
  53. data/lib/imw/utils/paths.rb +101 -42
  54. data/lib/imw/utils/version.rb +8 -9
  55. data/lib/imw/utils.rb +2 -18
  56. data/lib/imw.rb +92 -17
  57. data/spec/data/sample.csv +1 -1
  58. data/spec/data/sample.json +1 -0
  59. data/spec/data/sample.tsv +1 -1
  60. data/spec/data/sample.txt +1 -1
  61. data/spec/data/sample.xml +1 -1
  62. data/spec/data/sample.yaml +1 -1
  63. data/spec/imw/dataset/paths_spec.rb +32 -0
  64. data/spec/imw/dataset/workflow_spec.rb +41 -0
  65. data/spec/imw/resource_spec.rb +79 -0
  66. data/spec/imw/resources/archive_spec.rb +69 -0
  67. data/spec/imw/resources/archives_and_compressed/bz2_spec.rb +15 -0
  68. data/spec/imw/resources/archives_and_compressed/gz_spec.rb +15 -0
  69. data/spec/imw/resources/archives_and_compressed/rar_spec.rb +16 -0
  70. data/spec/imw/resources/archives_and_compressed/tar_spec.rb +16 -0
  71. data/spec/imw/resources/archives_and_compressed/tarbz2_spec.rb +24 -0
  72. data/spec/imw/resources/archives_and_compressed/targz_spec.rb +21 -0
  73. data/spec/imw/resources/archives_and_compressed/zip_spec.rb +16 -0
  74. data/spec/imw/resources/compressed_file_spec.rb +48 -0
  75. data/spec/imw/resources/compressible_spec.rb +36 -0
  76. data/spec/imw/resources/formats/delimited_spec.rb +33 -0
  77. data/spec/imw/resources/formats/json_spec.rb +32 -0
  78. data/spec/imw/resources/formats/sgml_spec.rb +24 -0
  79. data/spec/imw/resources/formats/yaml_spec.rb +41 -0
  80. data/spec/imw/resources/local_spec.rb +98 -0
  81. data/spec/imw/resources/remote_spec.rb +35 -0
  82. data/spec/imw/resources/schemes/hdfs_spec.rb +61 -0
  83. data/spec/imw/resources/schemes/http_spec.rb +19 -0
  84. data/spec/imw/resources/schemes/s3_spec.rb +19 -0
  85. data/spec/imw/transforms/archiver_spec.rb +120 -0
  86. data/spec/imw/transforms/transferer_spec.rb +113 -0
  87. data/spec/imw/utils/paths_spec.rb +5 -33
  88. data/spec/imw/utils/shared_paths_spec.rb +29 -0
  89. data/spec/spec_helper.rb +5 -5
  90. data/spec/support/paths_matcher.rb +67 -0
  91. data/spec/support/random.rb +39 -36
  92. metadata +88 -75
  93. data/lib/imw/dataset/task.rb +0 -41
  94. data/lib/imw/files/archive.rb +0 -113
  95. data/lib/imw/files/basicfile.rb +0 -122
  96. data/lib/imw/files/binary.rb +0 -28
  97. data/lib/imw/files/compressed_file.rb +0 -93
  98. data/lib/imw/files/compressed_files_and_archives.rb +0 -334
  99. data/lib/imw/files/compressible.rb +0 -103
  100. data/lib/imw/files/csv.rb +0 -113
  101. data/lib/imw/files/directory.rb +0 -62
  102. data/lib/imw/files/excel.rb +0 -84
  103. data/lib/imw/files/json.rb +0 -41
  104. data/lib/imw/files/sgml.rb +0 -46
  105. data/lib/imw/files/text.rb +0 -68
  106. data/lib/imw/files/yaml.rb +0 -46
  107. data/lib/imw/files.rb +0 -125
  108. data/lib/imw/packagers/archiver.rb +0 -126
  109. data/lib/imw/packagers/s3_mover.rb +0 -36
  110. data/lib/imw/packagers.rb +0 -8
  111. data/lib/imw/utils/components.rb +0 -61
  112. data/lib/imw/utils/config.rb +0 -46
  113. data/lib/imw/utils/extensions/class/attribute_accessors.rb +0 -8
  114. data/lib/imw/utils/extensions/core.rb +0 -27
  115. data/lib/imw/utils/extensions/dir.rb +0 -24
  116. data/lib/imw/utils/extensions/file_core.rb +0 -64
  117. data/lib/imw/utils/extensions/typed_struct.rb +0 -22
  118. data/lib/imw/utils/extensions/uri.rb +0 -59
  119. data/lib/imw/utils/view/dump_csv.rb +0 -112
  120. data/lib/imw/utils/view/dump_csv_older.rb +0 -117
  121. data/lib/imw/utils/view.rb +0 -113
  122. data/spec/imw/dataset/datamapper/uri_spec.rb +0 -43
  123. data/spec/imw/dataset/datamapper_spec_helper.rb +0 -11
  124. data/spec/imw/files/archive_spec.rb +0 -118
  125. data/spec/imw/files/basicfile_spec.rb +0 -121
  126. data/spec/imw/files/bz2_spec.rb +0 -32
  127. data/spec/imw/files/compressed_file_spec.rb +0 -96
  128. data/spec/imw/files/compressible_spec.rb +0 -100
  129. data/spec/imw/files/file_spec.rb +0 -144
  130. data/spec/imw/files/gz_spec.rb +0 -32
  131. data/spec/imw/files/rar_spec.rb +0 -33
  132. data/spec/imw/files/tar_spec.rb +0 -31
  133. data/spec/imw/files/text_spec.rb +0 -23
  134. data/spec/imw/files/zip_spec.rb +0 -31
  135. data/spec/imw/files_spec.rb +0 -38
  136. data/spec/imw/packagers/archiver_spec.rb +0 -125
  137. data/spec/imw/packagers/s3_mover_spec.rb +0 -7
  138. data/spec/imw/utils/extensions/file_core_spec.rb +0 -72
  139. data/spec/imw/utils/extensions/find_spec.rb +0 -113
  140. data/spec/imw/workflow/rip/local_spec.rb +0 -89
  141. data/spec/imw/workflow/rip_spec.rb +0 -27
  142. data/spec/support/archive_contents_matcher.rb +0 -94
  143. data/spec/support/directory_contents_matcher.rb +0 -61
data/lib/imw/dataset.rb CHANGED
@@ -1,206 +1,114 @@
1
- require 'imw/utils'
2
1
  require 'imw/dataset/workflow'
3
2
  require 'imw/dataset/paths'
4
3
 
5
4
  module IMW
6
5
 
7
- # The IMW::Dataset class is useful organizing a complex data
8
- # transformation because it is capable of managing a collection of
9
- # paths and the interdependencies between subparts of the
10
- # transformation.
11
- #
12
- # == Manipulating Paths
13
- #
14
- # Storing paths makes code shorter and more readable. By default
15
- # (this assumes the executing script is in a file
16
- # /home/imw_user/data/foo.rb):
17
- #
18
- # dataset = IMW::Dataset.new
19
- # dataset.path_to(:self)
20
- # #=> '/home/imw_user/data'
21
- # dataset.path_to(:ripd)
22
- # #=> '/home/imw_user/data/ripd'
23
- # dataset.path_to(:pkgd, 'final.tar.gz')
24
- # #=> '/home/imw_user/data/pkgd/final.tar.gz'
25
- #
26
- # Paths can be added
27
- #
28
- # dataset.add_path(:sorted_output, :mungd, 'sorted-file-3923.txt')
29
- # dataset.path_to(:sorted_output)
30
- # #=> '/home/imw_user/data/mungd/sorted-file-3923.txt'
31
- #
32
- # as well as removed (via +remove_path+).
33
- #
34
- # == Defining Workflows
35
- #
36
- # IMW encourages you to think of transforming data as a network of
37
- # interdependent steps (see IMW::Workflow). Each of IMW's five
38
- # default steps maps to a named directory remembered by each
39
- # dataset.
40
- #
41
- # The following example shows why this is a useful abstraction as
42
- # well as illustrating some of the other functionality in IMW.
43
- #
44
- # == Example Dataset
45
- #
46
- # The first step is to import IMW and create the dataset
47
- #
48
- # require 'rubygems'
49
- # require 'imw'
50
- # dataset = IMW::Dataset.new
51
- #
52
- # You can pass in a handle (the name or "slug" for the dataset) as
53
- # well as some options. Now define the steps you intend to take to
54
- # complete the transformation:
55
- #
56
- # rip::
57
- # Data is collected from a source (+http+, +ftp+, database, &c.)
58
- # and deposited in the <tt>:ripd</tt> directory of this dataset.
59
- #
60
- # dataset.task :rip do
61
- # IMW.open('http://econ.chimpu.edu/datasets/produce_prices.tar.bz2').cp_to_dir(dataset.path_to(:ripd))
62
- # #=> [ripd]/http/econ_chimpu_edu/datasets/produce_prices.tar.bz2
63
- #
64
- # IMW::Rip.from_database :named => "weather_records",
65
- # :at => "public.astro.chimpu.edu",
66
- # :select => "* FROM hurricane_frequency"
67
- # #=> [ripd]/sql/_edu/chimpu_astro_public/weather_records/select_from_hurricane_frequency-2009-02-16--15:30:26.tsv
68
- # end
69
- #
70
- # Where <tt>[ripd]</tt> would be replaced by the IMW
71
- # <tt>:ripd</tt> directory. The default <tt>:rip</tt> task is
72
- # empty so If there's no need to rip data (perhaps it's already on
73
- # disk?) then nothing needs to be done here.
74
- #
75
- # raw::
76
- # Managed by the <tt>:raw</tt> task, data is uncompressed and
77
- # extracted (if necessary) and stored in a subdirectory of the
78
- # <tt>:data</tt> directory named by the taxon and handle of this
79
- # dataset.
80
- #
81
- # dataset.task :raw do
82
- # IMW::Raw.uncompress_and_extract File.join(dataset.path_to(:ripd),'http/_edu/chimpu_econ/datasets'),
83
- # Dir[File.join(dataset.path_to(:ripd),'sql/_edu/chimpu_astro_public/**/*.tsv')].first
84
- # #=> [data]/economics/alarming_trends/recent_history_of_banana_prices/rawd/001.xml
85
- # [data]/economics/alarming_trends/recent_history_of_banana_prices/rawd/002.xml
86
- # [data]/economics/alarming_trends/recent_history_of_banana_prices/rawd/003.xml
87
- # ...
88
- # [data]/economics/alarming_trends/recent_history_of_banana_prices/rawd/select_from_hurricane_frequency-2009-02-16--15:30:26.tsv
89
- # end
90
- #
91
- # Where <tt>[data]</tt> would be replaced by the IMW
92
- # <tt>:data</tt> directory.
93
- #
94
- # If this dataset didn't have a taxon
95
- # (economics/alarming_trends) its files would be stored in a
96
- # directory +recent_history_of_banana_prices+ just below the
97
- # <tt>:data</tt> directory.
98
- #
99
- # fix::
100
- # Managed by the <tt>:fix</tt> task, transformations on the data
101
- # are performed. IMW's method is to read data from a source
102
- # format (XML, YAML, CSV, &c.) into Ruby objects with hash
103
- # semantics. These objects might be based upon structs,
104
- # ActiveRecord, DataMapper::Resource, FasterCSV...anything which
105
- # can be accessed as <tt>thing.property</tt> (FIXME 'and' or 'or'
106
- # ) <tt>thing[:property]</tt>: the Infinite Monkeywrench fits
107
- # neatly into your toobox.
108
- #
109
- #
110
- # # Open an output file in XML for writing
111
- # output = IMW.open! File.join(dataset.path_to(:fixd), 'date_bananas_hurricanes.csv')
112
- # #=> FasterCSV at [fixd]/economics/alarming_trends/recent_history_of_banana_prices/fixd/data_bananas_hurricanes.csv
113
- #
114
- # # A place to store the combined data
115
- # correlations = []
116
- #
117
- # dataset.task :fix do
118
- #
119
- # # Return the contents of the weather data which has rows like
120
- # #
121
- # # 1 2008-09-01 4
122
- # # 2 2008-09-08 3
123
- # # 3 2008-08-15 3
124
- # # ...
125
- # #
126
- # weather_data = IMW.open(Dir[File.join(dataset.path_to(:rawd), '*.tsv')].first,
127
- # :headers => ["ID","DATE","NUM_HURRICANES"]).entries
128
- # #=> [#<FasterCSV::Row "ID":nil "DATE":Mon Sep 08 04:15:47 -0600 2008,"NUM_HURRICANES":4>, ... ]
129
- #
130
- #
131
- # # Return the matching data from the produce prices XML file which looks like
132
- # #
133
- # # <prices>
134
- # # <price type="apple">
135
- # # <date>2008/09/01</date>
136
- # # <amount>0.15</amount>
137
- # # </price>
138
- # # <price type="banana">
139
- # # <date>2008/09/01</date>
140
- # # <amount>0.20</amount>
141
- # # </price>
142
- # # ...
143
- # # </prices>
144
- # parser = IMW::XMLParser.new :records => [ 'prices/price[@type="banana"]',
145
- # { :week => 'date',
146
- # :price => 'amount' }]
147
- #
148
- # # Loop through the XML produce prices, mixing in the hurricane data,
149
- # # and outputting new rows.
150
- # Dir["#{dataset.path_to :rawd}*.xml"] each do |file|
151
- # IMW.open file do |xml| #=> Hpricot::Doc
152
- # parser.parse(xml).each do |record|
153
- # num_hurricanes = weather_data.(lambda { nil }) {|id,week,num_hurricanes| week == record.week}
154
- # output << [week,record[:price],num_hurricanes]
155
- # end
156
- # end
157
- # end
158
- # end
159
- #
160
- # package::
161
- # Data is packaged and compressed (if necessary) into a delivery
162
- # format and deposited into the <tt>:pkgd</tt> directory.
163
- #
164
- # dataset.task :pkg do
165
- # IMW.open(File.join(dataset.path_to(:fixd), 'date_bananas_hurricanes.csv')).compress!
166
- # #=> [data]/economics/alarming_trends/recent_history_of_banana_prices/pkgd/date_bananas_hurricanes.csv.bz2
6
+ # The IMW::Dataset represents a common object in which paths, data
7
+ # resources, and various tasks can be intermingled to define a
8
+ # complex transformation of data.
9
+ #
10
+ # == Organizing Paths
11
+ #
12
+ # IMW encourages you to work within the following directory
13
+ # structure for a dataset +my_dataset+:
14
+ #
15
+ # my_dataset/
16
+ # |-- my_dataset.rb
17
+ # |-- ripd
18
+ # | `-- ...
19
+ # |-- rawd
20
+ # | `-- ...
21
+ # |-- fixd
22
+ # | `-- ...
23
+ # `-- pkgd
24
+ # `-- ...
25
+ #
26
+ # Just like IMW itself, a dataset can manage a collection of paths.
27
+ # If <tt>my_dataset.rb</tt> defines a dataset:
28
+ #
29
+ # # my_dataset/my_dataset.rb
30
+ # dataset = IMW::Dataset.new(:my_dataset)
31
+ #
32
+ # then the following paths will be defined:
33
+ #
34
+ # dataset.path_to(:root) #=> my_dataset
35
+ # dataset.path_to(:script) #=> my_dataset/my_dataset.rb
36
+ # dataset.path_to(:ripd) #=> my_dataset/ripd
37
+ # dataset.path_to(:rawd) #=> my_dataset/rawd
38
+ # dataset.path_to(:fixd) #=> my_dataset/fixd
39
+ # dataset.path_to(:pkgd) #=> my_dataset/pkgd
40
+ #
41
+ # Just like IMW itself, the +dataset+ supports adding path
42
+ # references
43
+ #
44
+ # dataset.add_path(:raw_data, :ripd, 'raw_data.xml')
45
+ # dataset.path_to(:raw_data) #=> my_dataset/ripd/raw_data.xml
46
+ #
47
+ # as well as removing them (via <tt>dataset.remove_path</tt>).
48
+ #
49
+ # A subclass of IMW::Dataset can customize these paths by overriding
50
+ # IMW::Dataset#set_default_paths as well as define new ones by
51
+ # overriding IMW::Dataset#set_paths.
52
+ #
53
+ # Setting paths can be skipped altogether by passing the
54
+ # <tt>:skip_paths</tt> option when instantiating a dataset:
55
+ #
56
+ # dataset = IMW::Dataset.new :my_dataset, :skip_paths => true
57
+ #
58
+ # == Utilizing Tasks
59
+ #
60
+ # An IMW::Dataset utilizes Rake to manage tasks needed to transform
61
+ # data. See IMW::Workflow for a description of the pre-defined
62
+ # tasks (+rip+, +parse+, +fix+, +package+).
63
+ #
64
+ # New tasks can be defined
65
+ #
66
+ # dataset.task :get_authorization do
67
+ # # ... get an authorization token
167
68
  # end
168
69
  #
169
- # In the above, <tt>dataset.task</tt> behaves like
170
- # <tt>Rake.task</tt>, merely defining a task and its dependencies
171
- # without executing it via
70
+ # and hooked into the default tasks in the usual Rake manner
71
+ #
72
+ # dataset.task :rip => [:get_authorization]
73
+ #
74
+ # A dataset also has methods for the workflow step tasks to make
75
+ # this easier
76
+ #
77
+ # dataset.rip [:get_authorization]
78
+ #
79
+ # Tasks for a dataset can be accessed and invoked as follows
80
+ #
81
+ # dataset[:rip].invoke
82
+ #
83
+ # as well as by using the command line +imw+ tool.
84
+ #
85
+ # Defining tasks can be skipped altogether by passing the
86
+ # <tt>:skip_workflow</tt> option when instantiating a dataset
172
87
  #
173
- # dataset.task(:pkg).invoke
88
+ # dataset = IMW::Dataset.new :my_dataset, :skip_workflow => true
174
89
  #
175
- # Since the <tt>:rip</tt>, <tt>:raw</tt>, <tt>:fix</tt>, and
176
- # <tt>:pkg</tt> tasks depend upon each other, invoking <tt>:pkg</tt>
177
- # will first cause <tt>:rip</tt> to run.
90
+ # == Working with Repositories
178
91
  #
179
- # By default, the tasks associated with a dataset are blank. All of
180
- # IMW's functionality is available without defining tasks. Tasks
181
- # simply provide a convenient scaffold for building a data
182
- # transformation upon.
92
+ # A dataset can be added to a repository by passing the
93
+ # <tt>:repository</tt> option
183
94
  #
184
- # Similarly, there is no requirement to use the directory structure
185
- # outlined above. IMW's methods accept plain filenames and do the
186
- # Right Thing where possible. The combination of tasks with
187
- # matching directory structure is a suggested but not mandatory
188
- # framework in which to program.
95
+ # repo = IMW::Repository.new
96
+ # dataset = IMW::Dataset.new :my_dataset, :repository => repo
189
97
  class Dataset
190
98
 
191
- # The <tt>IMW::Workflow</tt> module contains pre-defined tasks for
192
- # dataset processing.
193
99
  include IMW::Workflow
194
100
 
195
- attr_accessor :handle, :options, :data
101
+ attr_accessor :handle, :options
196
102
 
197
- def initialize options = {}
103
+ def initialize handle, options = {}
198
104
  @options = options
199
- @handle = options[:handle]
200
- initialize_workflow
201
- set_root_paths
202
- set_paths
203
- set_tasks
105
+ @handle = handle
106
+ set_default_paths unless options[:skip_paths]
107
+ set_paths unless options[:skip_paths]
108
+ initialize_workflow unless options[:skip_workflow]
109
+ if options[:repository]
110
+ options[:repository][handle] = self
111
+ end
204
112
  end
205
113
 
206
114
  end
@@ -18,7 +18,7 @@
18
18
  # that goes elsewhere.
19
19
  #
20
20
  #
21
- # == Sample HTML (http://twitter.com:
21
+ # == Sample HTML (http://twitter.com):
22
22
  #
23
23
  # <ul class="about vcard entry-author">
24
24
  # <li ><span class="label">Name</span> <span class="fn" >MarsPhoenix </span> </li>
data/lib/imw/parsers.rb CHANGED
@@ -1,7 +1,7 @@
1
1
  module IMW
2
2
  module Parsers
3
- autoload :HtmlParser, 'imw/parsers/html_parser'
4
3
  autoload :LineParser, 'imw/parsers/line_parser'
5
4
  autoload :RegexpParser, 'imw/parsers/regexp_parser'
5
+ autoload :HtmlParser, 'imw/parsers/html_parser'
6
6
  end
7
7
  end
@@ -1,35 +1,11 @@
1
- require 'imw/utils'
2
-
3
1
  module IMW
4
2
 
5
- # A Repository is a collection of datasets.
3
+ # A Repository is a collection of datasets. It is used by the
4
+ # command-line +imw+ tool.
6
5
  class Repository < Hash
7
-
8
- # FIXME This should read some configuration settings somewhere and
9
- # generate a pool specific to each IMW user.
10
- def self.default
11
- new
12
- end
13
-
6
+ alias_method :datasets, :values
14
7
  end
15
-
16
- # The default repository managed by IMW.
17
- REPOSITORY = Repository.default
18
8
 
19
- # Add a dataset to the IMW::REPOSITORY. If the dataset has a
20
- # +handle+ then it will be used as the key in this repository;
21
- # otherwise the dataset's class will be used.
22
- def self.add dataset
23
- REPOSITORY[dataset.handle] = dataset
24
- end
25
-
26
- # Remove a dataset from the IMW::REPOSITORY. Can pass in either a
27
- # string handle or an instance of the dataset.
28
- def self.delete handle
29
- handle = handle.handle if handle.respond_to?(:handle)
30
- REPOSITORY.delete(handle)
31
- end
32
-
33
9
  end
34
10
 
35
11
 
@@ -0,0 +1,190 @@
1
+ require 'addressable/uri'
2
+ require 'imw/resources'
3
+
4
+ module IMW
5
+
6
+ # A resource can be anything addressable via a URI. Examples
7
+ # include local files, remote files, webpages, &c.
8
+ #
9
+ # The IMW::Resource class takes a URI as input and then dynamically
10
+ # extends itself with appropriate modules from IMW::Resources. As
11
+ # an example, calling
12
+ #
13
+ # my_archive = IMW::Resource.new('/path/to/my/archive.tar.bz2')
14
+ #
15
+ # would return an IMW::Resource extended by
16
+ # IMW::Resources::Archives::Tarbz2 (among other modules) which
17
+ # therefore has methods for extracting, listing, and appending to
18
+ # the archive.
19
+ #
20
+ # Modules are so extended based on handlers defined in the
21
+ # <tt>imw/resources</tt> directory and accessible via
22
+ # IMW::Resources#handlers. You can define your own handlers by
23
+ # defining the constant IMW::Resources::USER_DEFINED_HANDLERS in
24
+ # your configuration file.
25
+ #
26
+ # The modules extending a particular IMW::Resource instance can be
27
+ # listed as follows
28
+ #
29
+ # my_archive.resource_modules #=> [IMW::Resources::LocalObj, IMW::Resources::LocalFile, IMW::Resources::Compressible, IMW::Resources::Archives::Tarbz2]
30
+ #
31
+ # By default, resources are opened for reading. Passing in the
32
+ # appropriate <tt>:mode</tt> option changes this:
33
+ #
34
+ # IMW::Resource.new('/path/to/my_new_file', :mode => 'w')
35
+ #
36
+ # If the <tt>:skip_modules</tt> option is passed in then the
37
+ # resource will not extend itself with any modules and will
38
+ # essentially only retain the bare functionality of a URI. This can
39
+ # be useful when subclassing IMW::Resource or dealing with a very
40
+ # strange kind of resource.
41
+ #
42
+ # Read the documentation for modules in IMW::Resources to learn more
43
+ # about the various behaviors an IMW::Resource can acquire.
44
+ class Resource
45
+
46
+ attr_reader :uri, :mode
47
+
48
+ def initialize uri, options={}
49
+ self.uri = uri
50
+ @mode = options[:mode] || 'r'
51
+ extend_appropriately! unless options[:skip_modules]
52
+ end
53
+
54
+ # Return the modules this resource has been extended by.
55
+ #
56
+ # @return [Array] the modules this resource has been extended by.
57
+ def resource_modules
58
+ @resource_modules ||= []
59
+ end
60
+
61
+ # Works just like Object#extend except it keeps track of the
62
+ # modules it has extended, see Resource#resource_modules.
63
+ def extend mod
64
+ resource_modules << mod
65
+ super mod
66
+ end
67
+
68
+ # Extend this resource with modules by passing it through a
69
+ # collection of handlers defined by IMW::Resources#handlers
70
+ def extend_appropriately!
71
+ IMW::Resources.extend_resource!(self)
72
+ end
73
+
74
+ # Set the URI of this resource by parsing the given +uri+ (if
75
+ # necessary).
76
+ #
77
+ # @param [String, Addressable::URI] uri the uri to parse
78
+ def uri= uri
79
+ if uri.is_a?(Addressable::URI)
80
+ @uri = uri
81
+ else
82
+ begin
83
+ @uri = Addressable::URI.parse(uri.to_s)
84
+ rescue URI::InvalidURIError
85
+ @uri = Addressable::URI.parse(URI.encode(uri.to_s))
86
+ @encoded_uri = true
87
+ end
88
+ end
89
+ end
90
+
91
+ # The scheme of this resource. Will be +nil+ for local resources.
92
+ #
93
+ # @return [String]
94
+ def scheme
95
+ @scheme ||= uri.scheme
96
+ end
97
+
98
+ # The directory name of this resource's path.
99
+ #
100
+ # @return [String]
101
+ def dirname
102
+ @dirname ||= File.dirname(path)
103
+ end
104
+
105
+ # The basename of this resource's path.
106
+ #
107
+ # @return [String]
108
+ def basename
109
+ @basename ||= File.basename(path)
110
+ end
111
+
112
+ # Returns the extension (INCLUDING the '.') of this resource's
113
+ # path. Redefine this in an including class for which this is
114
+ # weird ('.tar.gz' I'm talking to you...)
115
+ #
116
+ # @return [String]
117
+ def extname
118
+ @extname ||= File.extname(path)
119
+ end
120
+
121
+ # Returns the extension (WITHOUT the '.') of this resource's path.
122
+ #
123
+ # @return [String]
124
+ def extension
125
+ @extension ||= extname[1..-1] || ''
126
+ end
127
+
128
+ # Returns the basename of the file with its extension removed
129
+ #
130
+ # IMW.open('/path/to/some_file.tar.gz').name # => some_file
131
+ #
132
+ # @return [String]
133
+ def name
134
+ @name ||= extname ? basename[0,basename.length - extname.length] : basename
135
+ end
136
+
137
+ def to_s
138
+ uri.to_s
139
+ end
140
+
141
+ # Raise an error unless this resource exists.
142
+ #
143
+ # @param [String] message an optional message to include
144
+ def should_exist!(message=nil)
145
+ raise IMW::Error.new([message, "No path defined for #{self.inspect} extended by #{resource_modules.join(' ')}"].compact.join(', ')) unless respond_to?(:path)
146
+ raise IMW::Error.new([message, "No exist? method defined for #{self.inspect} extended by #{resource_modules.join(' ')}"].compact.join(', ')) unless respond_to?(:exist?)
147
+ raise IMW::PathError.new([message, "#{path} does not exist"].compact.join(', ')) unless exist?
148
+ end
149
+
150
+ # Open a copy of this resource.
151
+ #
152
+ # This is useful when wanting to reset file handles. Though -- be
153
+ # warned -- it does not close any file handles itself...
154
+ #
155
+ # @return [IMW::Resource] the new (old) resource
156
+ def reopen
157
+ IMW.open(self.uri.to_s)
158
+ end
159
+
160
+ # If +method+ begins with the strings +is+, +on+, or +via+ and
161
+ # ends with a question mark then we interpret it as a question
162
+ # this resource doesn't know how to answer -- so we have it answer
163
+ # +false+.
164
+ #
165
+ # As an example, consider the following loop:
166
+ #
167
+ # IMW.open('/tmp').all_contents.each do |obj|
168
+ # if obj.is_archive?
169
+ # # ... do something
170
+ # end
171
+ # end
172
+ #
173
+ # When +obj+ is initialized and it _isn't_ an archive, then it
174
+ # doesn't know about the <tt>is_archive?</tt> method -- but it
175
+ # should therefore answer false anyway.
176
+ #
177
+ # This lets a basic text file answer questions about whether it's
178
+ # an archive (or on S3, or accessed via some user-defined scheme,
179
+ # &c.) without needing to know anything about archives (or S3 or
180
+ # the user-defined scheme).
181
+ def method_missing method, *args
182
+ if args.empty? && method.to_s =~ /(is|on|via)_.*\?$/
183
+ # querying for a boolean response so answer false
184
+ return false
185
+ else
186
+ raise IMW::NoMethodError, "undefined method `#{method}' for #{self}, extended by #{resource_modules.join(', ')}"
187
+ end
188
+ end
189
+ end
190
+ end
@@ -0,0 +1,97 @@
1
+ module IMW
2
+ module Resources
3
+
4
+ module Archives
5
+ autoload :Rar, 'imw/resources/archives_and_compressed/rar'
6
+ autoload :Tar, 'imw/resources/archives_and_compressed/tar'
7
+ autoload :Tarbz2, 'imw/resources/archives_and_compressed/tarbz2'
8
+ autoload :Targz, 'imw/resources/archives_and_compressed/targz'
9
+ autoload :Zip, 'imw/resources/archives_and_compressed/zip'
10
+ end
11
+
12
+ # Defines methods for creating, appending to, extracting, and
13
+ # listing an archive file. This module isn't used to directly
14
+ # extend an IMW::Resource -- instead, format-specific modules
15
+ # (e.g. - IMW::Resources::Archives::Tarbz2) include this module
16
+ # and define the specific settings (command-line flags, &c.)
17
+ # required to make things work.
18
+ module Archive
19
+
20
+ attr_accessor :archive_settings
21
+
22
+ # Is this file an archive?
23
+ #
24
+ # @return [true, false]
25
+ def is_archive?
26
+ true
27
+ end
28
+
29
+ # Create an archive of the given +input_paths+.
30
+ #
31
+ # @param [String, IMW::Resource] input_paths the paths to add to this archive
32
+ def create *input_paths
33
+ should_have_archive_setting!("Cannot create archive #{path}", :program, :create)
34
+ IMW.system archive_settings[:program], archive_settings[:create], path, *input_paths.flatten
35
+ self
36
+ end
37
+
38
+ # Append to this archive the given +input_paths+.
39
+ #
40
+ # @param [String, IMW::Resource] input_paths the paths to add to this archive
41
+ def append *input_paths
42
+ should_have_archive_setting!("Cannot append to archive #{path}", :append)
43
+ IMW.system archive_settings[:program], archive_settings[:append], path, *input_paths.flatten
44
+ self
45
+ end
46
+
47
+ # Extract the files from this archive to the current directory.
48
+ def extract
49
+ should_exist!("Cannot extract archive.")
50
+ should_have_archive_setting!("Cannot extract archive #{path}", :extract, [:unarchving_program, :program])
51
+ program = archive_settings[:unarchiving_program] || archive_settings[:program]
52
+ IMW.system program, archive_settings[:extract], path
53
+ end
54
+
55
+ # Return a (sorted) list of contents in this archive.
56
+ #
57
+ # @return [Array] a list of paths in the archive.
58
+ def contents
59
+ should_exist!("Cannot list archive contents.")
60
+ should_have_archive_setting!("Cannot list archive #{path}", :list, [:unarchiving_program, :program])
61
+ program = archive_settings[:unarchiving_program] || archive_settings[:program]
62
+ # FIXME this needs to be more robust
63
+ flags = archive_settings[:list]
64
+ flags = flags.join(' ') if flags.is_a?(Array)
65
+ command = [program, flags, path.gsub(' ', '\ ')].join(' ')
66
+ output = `#{command}`
67
+ archive_contents_string_to_array(output)
68
+ end
69
+
70
+ protected
71
+
72
+ def should_have_archive_setting! message=nil,*settings # :nodoc:
73
+ settings.each do |setting|
74
+ if setting.is_a?(Array)
75
+ raise IMW::Error.new([message, "Must define one of #{setting.join(', ')} in archive_settings"].compact.join(', ')) unless setting.any? { |optional_setting| archive_settings[optional_setting] }
76
+ else
77
+ raise IMW::Error.new([message, "Must define #{setting} in archive_setings"].compact.join(', ')) unless archive_settings[setting]
78
+ end
79
+ end
80
+ end
81
+
82
+ # Parse and format the output from the archive program's "list"
83
+ # command into an array of filenames.
84
+ #
85
+ # An including class can override this method to match the
86
+ # output from the archiving program of that class.
87
+ #
88
+ # @param [String] string the raw output from the archive program's "list" command
89
+ # @return [Array] a list of paths in the archive
90
+ def archive_contents_string_to_array string
91
+ string.split("\n")
92
+ end
93
+ end
94
+ end
95
+ end
96
+
97
+
@@ -0,0 +1,18 @@
1
+ module IMW
2
+ module Resources
3
+ module CompressedFiles
4
+ module Bz2
5
+
6
+ include IMW::Resources::CompressedFile
7
+
8
+ def compression_settings
9
+ @compression_settings ||= {
10
+ :decompression_program => :bzip2,
11
+ :decompress => '-fd'
12
+ }
13
+ end
14
+
15
+ end
16
+ end
17
+ end
18
+ end