imw 0.1.1 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (143) hide show
  1. data/.gitignore +4 -1
  2. data/Rakefile +10 -0
  3. data/TODO +18 -0
  4. data/VERSION +1 -1
  5. data/bin/imw +1 -1
  6. data/etc/imwrc.rb +0 -50
  7. data/examples/dataset.rb +12 -0
  8. data/lib/imw/boot.rb +55 -9
  9. data/lib/imw/dataset/paths.rb +15 -24
  10. data/lib/imw/dataset/workflow.rb +131 -72
  11. data/lib/imw/dataset.rb +94 -186
  12. data/lib/imw/parsers/html_parser.rb +1 -1
  13. data/lib/imw/parsers.rb +1 -1
  14. data/lib/imw/repository.rb +3 -27
  15. data/lib/imw/resource.rb +190 -0
  16. data/lib/imw/resources/archive.rb +97 -0
  17. data/lib/imw/resources/archives_and_compressed/bz2.rb +18 -0
  18. data/lib/imw/resources/archives_and_compressed/gz.rb +18 -0
  19. data/lib/imw/resources/archives_and_compressed/rar.rb +23 -0
  20. data/lib/imw/resources/archives_and_compressed/tar.rb +23 -0
  21. data/lib/imw/resources/archives_and_compressed/tarbz2.rb +78 -0
  22. data/lib/imw/resources/archives_and_compressed/targz.rb +78 -0
  23. data/lib/imw/resources/archives_and_compressed/zip.rb +57 -0
  24. data/lib/imw/resources/archives_and_compressed.rb +32 -0
  25. data/lib/imw/resources/compressed_file.rb +89 -0
  26. data/lib/imw/resources/compressible.rb +77 -0
  27. data/lib/imw/resources/formats/delimited.rb +92 -0
  28. data/lib/imw/resources/formats/excel.rb +125 -0
  29. data/lib/imw/resources/formats/json.rb +53 -0
  30. data/lib/imw/resources/formats/sgml.rb +72 -0
  31. data/lib/imw/resources/formats/yaml.rb +53 -0
  32. data/lib/imw/resources/formats.rb +32 -0
  33. data/lib/imw/resources/local.rb +198 -0
  34. data/lib/imw/resources/remote.rb +110 -0
  35. data/lib/imw/resources/schemes/hdfs.rb +242 -0
  36. data/lib/imw/resources/schemes/http.rb +161 -0
  37. data/lib/imw/resources/schemes/s3.rb +137 -0
  38. data/lib/imw/resources/schemes.rb +19 -0
  39. data/lib/imw/resources.rb +118 -0
  40. data/lib/imw/runner.rb +5 -4
  41. data/lib/imw/transforms/archiver.rb +215 -0
  42. data/lib/imw/transforms/transferer.rb +103 -0
  43. data/lib/imw/transforms.rb +8 -0
  44. data/lib/imw/utils/error.rb +26 -30
  45. data/lib/imw/utils/extensions/array.rb +5 -15
  46. data/lib/imw/utils/extensions/hash.rb +6 -16
  47. data/lib/imw/utils/extensions/hpricot.rb +0 -14
  48. data/lib/imw/utils/extensions/string.rb +5 -15
  49. data/lib/imw/utils/extensions/symbol.rb +0 -13
  50. data/lib/imw/utils/extensions.rb +65 -0
  51. data/lib/imw/utils/log.rb +14 -13
  52. data/lib/imw/utils/misc.rb +0 -6
  53. data/lib/imw/utils/paths.rb +101 -42
  54. data/lib/imw/utils/version.rb +8 -9
  55. data/lib/imw/utils.rb +2 -18
  56. data/lib/imw.rb +92 -17
  57. data/spec/data/sample.csv +1 -1
  58. data/spec/data/sample.json +1 -0
  59. data/spec/data/sample.tsv +1 -1
  60. data/spec/data/sample.txt +1 -1
  61. data/spec/data/sample.xml +1 -1
  62. data/spec/data/sample.yaml +1 -1
  63. data/spec/imw/dataset/paths_spec.rb +32 -0
  64. data/spec/imw/dataset/workflow_spec.rb +41 -0
  65. data/spec/imw/resource_spec.rb +79 -0
  66. data/spec/imw/resources/archive_spec.rb +69 -0
  67. data/spec/imw/resources/archives_and_compressed/bz2_spec.rb +15 -0
  68. data/spec/imw/resources/archives_and_compressed/gz_spec.rb +15 -0
  69. data/spec/imw/resources/archives_and_compressed/rar_spec.rb +16 -0
  70. data/spec/imw/resources/archives_and_compressed/tar_spec.rb +16 -0
  71. data/spec/imw/resources/archives_and_compressed/tarbz2_spec.rb +24 -0
  72. data/spec/imw/resources/archives_and_compressed/targz_spec.rb +21 -0
  73. data/spec/imw/resources/archives_and_compressed/zip_spec.rb +16 -0
  74. data/spec/imw/resources/compressed_file_spec.rb +48 -0
  75. data/spec/imw/resources/compressible_spec.rb +36 -0
  76. data/spec/imw/resources/formats/delimited_spec.rb +33 -0
  77. data/spec/imw/resources/formats/json_spec.rb +32 -0
  78. data/spec/imw/resources/formats/sgml_spec.rb +24 -0
  79. data/spec/imw/resources/formats/yaml_spec.rb +41 -0
  80. data/spec/imw/resources/local_spec.rb +98 -0
  81. data/spec/imw/resources/remote_spec.rb +35 -0
  82. data/spec/imw/resources/schemes/hdfs_spec.rb +61 -0
  83. data/spec/imw/resources/schemes/http_spec.rb +19 -0
  84. data/spec/imw/resources/schemes/s3_spec.rb +19 -0
  85. data/spec/imw/transforms/archiver_spec.rb +120 -0
  86. data/spec/imw/transforms/transferer_spec.rb +113 -0
  87. data/spec/imw/utils/paths_spec.rb +5 -33
  88. data/spec/imw/utils/shared_paths_spec.rb +29 -0
  89. data/spec/spec_helper.rb +5 -5
  90. data/spec/support/paths_matcher.rb +67 -0
  91. data/spec/support/random.rb +39 -36
  92. metadata +88 -75
  93. data/lib/imw/dataset/task.rb +0 -41
  94. data/lib/imw/files/archive.rb +0 -113
  95. data/lib/imw/files/basicfile.rb +0 -122
  96. data/lib/imw/files/binary.rb +0 -28
  97. data/lib/imw/files/compressed_file.rb +0 -93
  98. data/lib/imw/files/compressed_files_and_archives.rb +0 -334
  99. data/lib/imw/files/compressible.rb +0 -103
  100. data/lib/imw/files/csv.rb +0 -113
  101. data/lib/imw/files/directory.rb +0 -62
  102. data/lib/imw/files/excel.rb +0 -84
  103. data/lib/imw/files/json.rb +0 -41
  104. data/lib/imw/files/sgml.rb +0 -46
  105. data/lib/imw/files/text.rb +0 -68
  106. data/lib/imw/files/yaml.rb +0 -46
  107. data/lib/imw/files.rb +0 -125
  108. data/lib/imw/packagers/archiver.rb +0 -126
  109. data/lib/imw/packagers/s3_mover.rb +0 -36
  110. data/lib/imw/packagers.rb +0 -8
  111. data/lib/imw/utils/components.rb +0 -61
  112. data/lib/imw/utils/config.rb +0 -46
  113. data/lib/imw/utils/extensions/class/attribute_accessors.rb +0 -8
  114. data/lib/imw/utils/extensions/core.rb +0 -27
  115. data/lib/imw/utils/extensions/dir.rb +0 -24
  116. data/lib/imw/utils/extensions/file_core.rb +0 -64
  117. data/lib/imw/utils/extensions/typed_struct.rb +0 -22
  118. data/lib/imw/utils/extensions/uri.rb +0 -59
  119. data/lib/imw/utils/view/dump_csv.rb +0 -112
  120. data/lib/imw/utils/view/dump_csv_older.rb +0 -117
  121. data/lib/imw/utils/view.rb +0 -113
  122. data/spec/imw/dataset/datamapper/uri_spec.rb +0 -43
  123. data/spec/imw/dataset/datamapper_spec_helper.rb +0 -11
  124. data/spec/imw/files/archive_spec.rb +0 -118
  125. data/spec/imw/files/basicfile_spec.rb +0 -121
  126. data/spec/imw/files/bz2_spec.rb +0 -32
  127. data/spec/imw/files/compressed_file_spec.rb +0 -96
  128. data/spec/imw/files/compressible_spec.rb +0 -100
  129. data/spec/imw/files/file_spec.rb +0 -144
  130. data/spec/imw/files/gz_spec.rb +0 -32
  131. data/spec/imw/files/rar_spec.rb +0 -33
  132. data/spec/imw/files/tar_spec.rb +0 -31
  133. data/spec/imw/files/text_spec.rb +0 -23
  134. data/spec/imw/files/zip_spec.rb +0 -31
  135. data/spec/imw/files_spec.rb +0 -38
  136. data/spec/imw/packagers/archiver_spec.rb +0 -125
  137. data/spec/imw/packagers/s3_mover_spec.rb +0 -7
  138. data/spec/imw/utils/extensions/file_core_spec.rb +0 -72
  139. data/spec/imw/utils/extensions/find_spec.rb +0 -113
  140. data/spec/imw/workflow/rip/local_spec.rb +0 -89
  141. data/spec/imw/workflow/rip_spec.rb +0 -27
  142. data/spec/support/archive_contents_matcher.rb +0 -94
  143. data/spec/support/directory_contents_matcher.rb +0 -61
data/lib/imw/dataset.rb CHANGED
@@ -1,206 +1,114 @@
1
- require 'imw/utils'
2
1
  require 'imw/dataset/workflow'
3
2
  require 'imw/dataset/paths'
4
3
 
5
4
  module IMW
6
5
 
7
- # The IMW::Dataset class is useful organizing a complex data
8
- # transformation because it is capable of managing a collection of
9
- # paths and the interdependencies between subparts of the
10
- # transformation.
11
- #
12
- # == Manipulating Paths
13
- #
14
- # Storing paths makes code shorter and more readable. By default
15
- # (this assumes the executing script is in a file
16
- # /home/imw_user/data/foo.rb):
17
- #
18
- # dataset = IMW::Dataset.new
19
- # dataset.path_to(:self)
20
- # #=> '/home/imw_user/data'
21
- # dataset.path_to(:ripd)
22
- # #=> '/home/imw_user/data/ripd'
23
- # dataset.path_to(:pkgd, 'final.tar.gz')
24
- # #=> '/home/imw_user/data/pkgd/final.tar.gz'
25
- #
26
- # Paths can be added
27
- #
28
- # dataset.add_path(:sorted_output, :mungd, 'sorted-file-3923.txt')
29
- # dataset.path_to(:sorted_output)
30
- # #=> '/home/imw_user/data/mungd/sorted-file-3923.txt'
31
- #
32
- # as well as removed (via +remove_path+).
33
- #
34
- # == Defining Workflows
35
- #
36
- # IMW encourages you to think of transforming data as a network of
37
- # interdependent steps (see IMW::Workflow). Each of IMW's five
38
- # default steps maps to a named directory remembered by each
39
- # dataset.
40
- #
41
- # The following example shows why this is a useful abstraction as
42
- # well as illustrating some of the other functionality in IMW.
43
- #
44
- # == Example Dataset
45
- #
46
- # The first step is to import IMW and create the dataset
47
- #
48
- # require 'rubygems'
49
- # require 'imw'
50
- # dataset = IMW::Dataset.new
51
- #
52
- # You can pass in a handle (the name or "slug" for the dataset) as
53
- # well as some options. Now define the steps you intend to take to
54
- # complete the transformation:
55
- #
56
- # rip::
57
- # Data is collected from a source (+http+, +ftp+, database, &c.)
58
- # and deposited in the <tt>:ripd</tt> directory of this dataset.
59
- #
60
- # dataset.task :rip do
61
- # IMW.open('http://econ.chimpu.edu/datasets/produce_prices.tar.bz2').cp_to_dir(dataset.path_to(:ripd))
62
- # #=> [ripd]/http/econ_chimpu_edu/datasets/produce_prices.tar.bz2
63
- #
64
- # IMW::Rip.from_database :named => "weather_records",
65
- # :at => "public.astro.chimpu.edu",
66
- # :select => "* FROM hurricane_frequency"
67
- # #=> [ripd]/sql/_edu/chimpu_astro_public/weather_records/select_from_hurricane_frequency-2009-02-16--15:30:26.tsv
68
- # end
69
- #
70
- # Where <tt>[ripd]</tt> would be replaced by the IMW
71
- # <tt>:ripd</tt> directory. The default <tt>:rip</tt> task is
72
- # empty so If there's no need to rip data (perhaps it's already on
73
- # disk?) then nothing needs to be done here.
74
- #
75
- # raw::
76
- # Managed by the <tt>:raw</tt> task, data is uncompressed and
77
- # extracted (if necessary) and stored in a subdirectory of the
78
- # <tt>:data</tt> directory named by the taxon and handle of this
79
- # dataset.
80
- #
81
- # dataset.task :raw do
82
- # IMW::Raw.uncompress_and_extract File.join(dataset.path_to(:ripd),'http/_edu/chimpu_econ/datasets'),
83
- # Dir[File.join(dataset.path_to(:ripd),'sql/_edu/chimpu_astro_public/**/*.tsv')].first
84
- # #=> [data]/economics/alarming_trends/recent_history_of_banana_prices/rawd/001.xml
85
- # [data]/economics/alarming_trends/recent_history_of_banana_prices/rawd/002.xml
86
- # [data]/economics/alarming_trends/recent_history_of_banana_prices/rawd/003.xml
87
- # ...
88
- # [data]/economics/alarming_trends/recent_history_of_banana_prices/rawd/select_from_hurricane_frequency-2009-02-16--15:30:26.tsv
89
- # end
90
- #
91
- # Where <tt>[data]</tt> would be replaced by the IMW
92
- # <tt>:data</tt> directory.
93
- #
94
- # If this dataset didn't have a taxon
95
- # (economics/alarming_trends) its files would be stored in a
96
- # directory +recent_history_of_banana_prices+ just below the
97
- # <tt>:data</tt> directory.
98
- #
99
- # fix::
100
- # Managed by the <tt>:fix</tt> task, transformations on the data
101
- # are performed. IMW's method is to read data from a source
102
- # format (XML, YAML, CSV, &c.) into Ruby objects with hash
103
- # semantics. These objects might be based upon structs,
104
- # ActiveRecord, DataMapper::Resource, FasterCSV...anything which
105
- # can be accessed as <tt>thing.property</tt> (FIXME 'and' or 'or'
106
- # ) <tt>thing[:property]</tt>: the Infinite Monkeywrench fits
107
- # neatly into your toobox.
108
- #
109
- #
110
- # # Open an output file in XML for writing
111
- # output = IMW.open! File.join(dataset.path_to(:fixd), 'date_bananas_hurricanes.csv')
112
- # #=> FasterCSV at [fixd]/economics/alarming_trends/recent_history_of_banana_prices/fixd/data_bananas_hurricanes.csv
113
- #
114
- # # A place to store the combined data
115
- # correlations = []
116
- #
117
- # dataset.task :fix do
118
- #
119
- # # Return the contents of the weather data which has rows like
120
- # #
121
- # # 1 2008-09-01 4
122
- # # 2 2008-09-08 3
123
- # # 3 2008-08-15 3
124
- # # ...
125
- # #
126
- # weather_data = IMW.open(Dir[File.join(dataset.path_to(:rawd), '*.tsv')].first,
127
- # :headers => ["ID","DATE","NUM_HURRICANES"]).entries
128
- # #=> [#<FasterCSV::Row "ID":nil "DATE":Mon Sep 08 04:15:47 -0600 2008,"NUM_HURRICANES":4>, ... ]
129
- #
130
- #
131
- # # Return the matching data from the produce prices XML file which looks like
132
- # #
133
- # # <prices>
134
- # # <price type="apple">
135
- # # <date>2008/09/01</date>
136
- # # <amount>0.15</amount>
137
- # # </price>
138
- # # <price type="banana">
139
- # # <date>2008/09/01</date>
140
- # # <amount>0.20</amount>
141
- # # </price>
142
- # # ...
143
- # # </prices>
144
- # parser = IMW::XMLParser.new :records => [ 'prices/price[@type="banana"]',
145
- # { :week => 'date',
146
- # :price => 'amount' }]
147
- #
148
- # # Loop through the XML produce prices, mixing in the hurricane data,
149
- # # and outputting new rows.
150
- # Dir["#{dataset.path_to :rawd}*.xml"] each do |file|
151
- # IMW.open file do |xml| #=> Hpricot::Doc
152
- # parser.parse(xml).each do |record|
153
- # num_hurricanes = weather_data.(lambda { nil }) {|id,week,num_hurricanes| week == record.week}
154
- # output << [week,record[:price],num_hurricanes]
155
- # end
156
- # end
157
- # end
158
- # end
159
- #
160
- # package::
161
- # Data is packaged and compressed (if necessary) into a delivery
162
- # format and deposited into the <tt>:pkgd</tt> directory.
163
- #
164
- # dataset.task :pkg do
165
- # IMW.open(File.join(dataset.path_to(:fixd), 'date_bananas_hurricanes.csv')).compress!
166
- # #=> [data]/economics/alarming_trends/recent_history_of_banana_prices/pkgd/date_bananas_hurricanes.csv.bz2
6
+ # The IMW::Dataset represents a common object in which paths, data
7
+ # resources, and various tasks can be intermingled to define a
8
+ # complex transformation of data.
9
+ #
10
+ # == Organizing Paths
11
+ #
12
+ # IMW encourages you to work within the following directory
13
+ # structure for a dataset +my_dataset+:
14
+ #
15
+ # my_dataset/
16
+ # |-- my_dataset.rb
17
+ # |-- ripd
18
+ # | `-- ...
19
+ # |-- rawd
20
+ # | `-- ...
21
+ # |-- fixd
22
+ # | `-- ...
23
+ # `-- pkgd
24
+ # `-- ...
25
+ #
26
+ # Just like IMW itself, a dataset can manage a collection of paths.
27
+ # If <tt>my_dataset.rb</tt> defines a dataset:
28
+ #
29
+ # # my_dataset/my_dataset.rb
30
+ # dataset = IMW::Dataset.new(:my_dataset)
31
+ #
32
+ # then the following paths will be defined:
33
+ #
34
+ # dataset.path_to(:root) #=> my_dataset
35
+ # dataset.path_to(:script) #=> my_dataset/my_dataset.rb
36
+ # dataset.path_to(:ripd) #=> my_dataset/ripd
37
+ # dataset.path_to(:rawd) #=> my_dataset/rawd
38
+ # dataset.path_to(:fixd) #=> my_dataset/fixd
39
+ # dataset.path_to(:pkgd) #=> my_dataset/pkgd
40
+ #
41
+ # Just like IMW itself, the +dataset+ supports adding path
42
+ # references
43
+ #
44
+ # dataset.add_path(:raw_data, :ripd, 'raw_data.xml')
45
+ # dataset.path_to(:raw_data) #=> my_dataset/ripd/raw_data.xml
46
+ #
47
+ # as well as removed (via <tt>dataset.remove_path</tt>).
48
+ #
49
+ # A subclass of IMW::Dataset can customize these paths by overriding
50
+ # IMW::Dataset#set_default_paths as well as define new ones by
51
+ # overriding IMW::Dataset#set_paths.
52
+ #
53
+ # Setting paths can be skipped altogether by passing the
54
+ # <tt>:skip_paths</tt> option when instantiating a dataset:
55
+ #
56
+ # dataset = IMW::Dataset.new :my_dataset, :skip_paths => true
57
+ #
58
+ # == Utilizing Tasks
59
+ #
60
+ # An IMW::Dataset utilizes Rake to manage tasks needed to transform
61
+ # data. See IMW::Workflow for a description of the pre-defined
62
+ # tasks (+rip+, +parse+, +fix+, +package+).
63
+ #
64
+ # New tasks can be defined
65
+ #
66
+ # dataset.task :get_authorization do
67
+ # # ... get an authorization token
167
68
  # end
168
69
  #
169
- # In the above, <tt>dataset.task</tt> behaves like
170
- # <tt>Rake.task</tt>, merely defining a task and its dependencies
171
- # without executing it via
70
+ # and hooked into the default tasks in the usual Rake manner
71
+ #
72
+ # dataset.task :rip => [:get_authorization]
73
+ #
74
+ # A dataset also has methods for the workflow step tasks to make
75
+ # this easier
76
+ #
77
+ # dataset.rip [:get_authorization]
78
+ #
79
+ # Tasks for a dataset can be accessed and invoked as follows
80
+ #
81
+ # dataset[:rip].invoke
82
+ #
83
+ # as well as by using the command line +imw+ tool.
84
+ #
85
+ # Defining tasks can be skipped altogether by passing the
86
+ # <tt>:skip_workflow</tt> option when instantiating a dataset
172
87
  #
173
- # dataset.task(:pkg).invoke
88
+ # dataset = IMW::Dataset.new :my_dataset, :skip_workflow => true
174
89
  #
175
- # Since the <tt>:rip</tt>, <tt>:raw</tt>, <tt>:fix</tt>, and
176
- # <tt>:pkg</tt> tasks depend upon each other, invoking <tt>:pkg</tt>
177
- # will first cause <tt>:rip</tt> to run.
90
+ # == Working with Repositories
178
91
  #
179
- # By default, the tasks associated with a dataset are blank. All of
180
- # IMW's functionality is available without defining tasks. Tasks
181
- # simply provide a convenient scaffold for building a data
182
- # transformation upon.
92
+ # A dataset can be added to a repository by passing the
93
+ # <tt>:repository</tt> option
183
94
  #
184
- # Similarly, there is no requirement to use the directory structure
185
- # outlined above. IMW's methods accept plain filenames and do the
186
- # Right Thing where possible. The combination of tasks with
187
- # matching directory structure is a suggested but not mandatory
188
- # framework in which to program.
95
+ # repo = IMW::Repository.new
96
+ # dataset = IMW::Dataset.new :my_dataset, :repository => repo
189
97
  class Dataset
190
98
 
191
- # The <tt>IMW::Workflow</tt> module contains pre-defined tasks for
192
- # dataset processing.
193
99
  include IMW::Workflow
194
100
 
195
- attr_accessor :handle, :options, :data
101
+ attr_accessor :handle, :options
196
102
 
197
- def initialize options = {}
103
+ def initialize handle, options = {}
198
104
  @options = options
199
- @handle = options[:handle]
200
- initialize_workflow
201
- set_root_paths
202
- set_paths
203
- set_tasks
105
+ @handle = handle
106
+ set_default_paths unless options[:skip_paths]
107
+ set_paths unless options[:skip_paths]
108
+ initialize_workflow unless options[:skip_workflow]
109
+ if options[:repository]
110
+ options[:repository][handle] = self
111
+ end
204
112
  end
205
113
 
206
114
  end
@@ -18,7 +18,7 @@
18
18
  # that goes elsewhere.
19
19
  #
20
20
  #
21
- # == Sample HTML (http://twitter.com:
21
+ # == Sample HTML (http://twitter.com):
22
22
  #
23
23
  # <ul class="about vcard entry-author">
24
24
  # <li ><span class="label">Name</span> <span class="fn" >MarsPhoenix </span> </li>
data/lib/imw/parsers.rb CHANGED
@@ -1,7 +1,7 @@
1
1
  module IMW
2
2
  module Parsers
3
- autoload :HtmlParser, 'imw/parsers/html_parser'
4
3
  autoload :LineParser, 'imw/parsers/line_parser'
5
4
  autoload :RegexpParser, 'imw/parsers/regexp_parser'
5
+ autoload :HtmlParser, 'imw/parsers/html_parser'
6
6
  end
7
7
  end
@@ -1,35 +1,11 @@
1
- require 'imw/utils'
2
-
3
1
  module IMW
4
2
 
5
- # A Repository is a collection of datasets.
3
+ # A Repository is a collection of datasets. It is used by the
4
+ # command-line +imw+ tool.
6
5
  class Repository < Hash
7
-
8
- # FIXME This should read some configuration settings somewhere and
9
- # generate a pool specific to each IMW user.
10
- def self.default
11
- new
12
- end
13
-
6
+ alias_method :datasets, :values
14
7
  end
15
-
16
- # The default repository managed by IMW.
17
- REPOSITORY = Repository.default
18
8
 
19
- # Add a dataset to the IMW::REPOSITORY. If the dataset has a
20
- # +handle+ then it will be used as the key in this repository;
21
- # otherwise the dataset's class will be used.
22
- def self.add dataset
23
- REPOSITORY[dataset.handle] = dataset
24
- end
25
-
26
- # Remove a dataset from the IMW::REPOSITORY. Can pass in either a
27
- # string handle or an instance of the dataset.
28
- def self.delete handle
29
- handle = handle.handle if handle.respond_to?(:handle)
30
- REPOSITORY.delete(handle)
31
- end
32
-
33
9
  end
34
10
 
35
11
 
@@ -0,0 +1,190 @@
1
+ require 'addressable/uri'
2
+ require 'imw/resources'
3
+
4
+ module IMW
5
+
6
+ # A resource can be anything addressable via a URI. Examples
7
+ # include local files, remote files, webpages, &c.
8
+ #
9
+ # The IMW::Resource class takes a URI as input and then dynamically
10
+ # extends itself with appropriate modules from IMW::Resources. As
11
+ # an example, calling
12
+ #
13
+ # my_archive = IMW::Resource.new('/path/to/my/archive.tar.bz2')
14
+ #
15
+ # would return an IMW::Resource extended by
16
+ # IMW::Resources::Archives::Tarbz2 (among other modules) which
17
+ # therefore has methods for extracting, listing, and appending to
18
+ # the archive.
19
+ #
20
+ # Modules are so extended based on handlers defined in the
21
+ # <tt>imw/resources</tt> directory and accessible via
22
+ # IMW::Resources#handlers. You can define your own handlers by
23
+ # defining the constant IMW::Resources::USER_DEFINED_HANDLERS in
24
+ # your configuration file.
25
+ #
26
+ # The modules extending a particular IMW::Resource instance can be
27
+ # listed as follows
28
+ #
29
+ # my_archive.resource_modules #=> [IMW::Resources::LocalObj, IMW::Resources::LocalFile, IMW::Resources::Compressible, IMW::Resources::Archives::Tarbz2]
30
+ #
31
+ # By default, resources are opened for reading. Passing in the
32
+ # appropriate <tt>:mode</tt> option changes this:
33
+ #
34
+ # IMW::Resource.new('/path/to/my_new_file', :mode => 'w')
35
+ #
36
+ # If the <tt>:skip_modules</tt> option is passed in then the
37
+ # resource will not extend itself with any modules and will
38
+ # essentially only retain the bare functionality of a URI. This can
39
+ # be useful when subclassing IMW::Resource or dealing with a very
40
+ # strange kind of resource.
41
+ #
42
+ # Read the documentation for modules in IMW::Resources to learn more
43
+ # about the various behaviors an IMW::Resource can acquire.
44
+ class Resource
45
+
46
+ attr_reader :uri, :mode
47
+
48
+ def initialize uri, options={}
49
+ self.uri = uri
50
+ @mode = options[:mode] || 'r'
51
+ extend_appropriately! unless options[:skip_modules]
52
+ end
53
+
54
+ # Return the modules this resource has been extended by.
55
+ #
56
+ # @return [Array] the modules this resource has been extended by.
57
+ def resource_modules
58
+ @resource_modules ||= []
59
+ end
60
+
61
+ # Works just like Object#extend except it keeps track of the
62
+ # modules it has extended, see Resource#resource_modules.
63
+ def extend mod
64
+ resource_modules << mod
65
+ super mod
66
+ end
67
+
68
+ # Extend this resource with modules by passing it through a
69
+ # collection of handlers defined by IMW::Resources#handlers
70
+ def extend_appropriately!
71
+ IMW::Resources.extend_resource!(self)
72
+ end
73
+
74
+ # Set the URI of this resource by parsing the given +uri+ (if
75
+ # necessary).
76
+ #
77
+ # @param [String, Addressable::URI] uri the uri to parse
78
+ def uri= uri
79
+ if uri.is_a?(Addressable::URI)
80
+ @uri = uri
81
+ else
82
+ begin
83
+ @uri = Addressable::URI.parse(uri.to_s)
84
+ rescue URI::InvalidURIError
85
+ @uri = Addressable::URI.parse(URI.encode(uri.to_s))
86
+ @encoded_uri = true
87
+ end
88
+ end
89
+ end
90
+
91
+ # The scheme of this resource. Will be +nil+ for local resources.
92
+ #
93
+ # @return [String]
94
+ def scheme
95
+ @scheme ||= uri.scheme
96
+ end
97
+
98
+ # The directory name of this resource's path.
99
+ #
100
+ # @return [String]
101
+ def dirname
102
+ @dirname ||= File.dirname(path)
103
+ end
104
+
105
+ # The basename of this resource's path.
106
+ #
107
+ # @return [String]
108
+ def basename
109
+ @basename ||= File.basename(path)
110
+ end
111
+
112
+ # Returns the extension (INCLUDING the '.') of this resource's
113
+ # path. Redefine this in an including class for which this is
114
+ # weird ('.tar.gz' I'm talking to you...)
115
+ #
116
+ # @return [String]
117
+ def extname
118
+ @extname ||= File.extname(path)
119
+ end
120
+
121
+ # Returns the extension (WITHOUT the '.') of this resource's path.
122
+ #
123
+ # @return [String]
124
+ def extension
125
+ @extension ||= extname[1..-1] || ''
126
+ end
127
+
128
+ # Returns the basename of the file with its extension removed
129
+ #
130
+ # IMW.open('/path/to/some_file.tar.gz').name # => some_file
131
+ #
132
+ # @return [String]
133
+ def name
134
+ @name ||= extname ? basename[0,basename.length - extname.length] : basename
135
+ end
136
+
137
+ def to_s
138
+ uri.to_s
139
+ end
140
+
141
+ # Raise an error unless this resource exists.
142
+ #
143
+ # @param [String] message an optional message to include
144
+ def should_exist!(message=nil)
145
+ raise IMW::Error.new([message, "No path defined for #{self.inspect} extended by #{resource_modules.join(' ')}"].compact.join(', ')) unless respond_to?(:path)
146
+ raise IMW::Error.new([message, "No exist? method defined for #{self.inspect} extended by #{resource_modules.join(' ')}"].compact.join(', ')) unless respond_to?(:exist?)
147
+ raise IMW::PathError.new([message, "#{path} does not exist"].compact.join(', ')) unless exist?
148
+ end
149
+
150
+ # Open a copy of this resource.
151
+ #
152
+ # This is useful when wanting to reset file handles. Though -- be
153
+ # warned -- it does not close any file handles itself...
154
+ #
155
+ # @return [IMW::Resource] the new (old) resource
156
+ def reopen
157
+ IMW.open(self.uri.to_s)
158
+ end
159
+
160
+ # If +method+ begins with the strings +is+, +on+, or +via+ and
161
+ # ends with a question mark then we interpret it as a question
162
+ # this resource doesn't know how to answer -- so we have it answer
163
+ # +false+.
164
+ #
165
+ # As an example, consider the following loop:
166
+ #
167
+ # IMW.open('/tmp').all_contents.each do |obj|
168
+ # if obj.is_archive?
169
+ # # ... do something
170
+ # end
171
+ # end
172
+ #
173
+ # When +obj+ is initialized and it _isn't_ an archive, then it
174
+ # doesn't know about the <tt>is_archive?</tt> method -- but it
175
+ # should therefore answer false anyway.
176
+ #
177
+ # This lets a basic text file answer questions about whether it's
178
+ # an archive (or on S3, or accessed via some user-defined scheme,
179
+ # &c.) without needing to know anything about archives (or S3 or
180
+ # the user-defined scheme).
181
+ def method_missing method, *args
182
+ if args.empty? && method.to_s =~ /(is|on|via)_.*\?$/
183
+ # querying for a boolean response so answer false
184
+ return false
185
+ else
186
+ raise IMW::NoMethodError, "undefined method `#{method}' for #{self}, extended by #{resource_modules.join(', ')}"
187
+ end
188
+ end
189
+ end
190
+ end
@@ -0,0 +1,97 @@
1
+ module IMW
2
+ module Resources
3
+
4
+ module Archives
5
+ autoload :Rar, 'imw/resources/archives_and_compressed/rar'
6
+ autoload :Tar, 'imw/resources/archives_and_compressed/tar'
7
+ autoload :Tarbz2, 'imw/resources/archives_and_compressed/tarbz2'
8
+ autoload :Targz, 'imw/resources/archives_and_compressed/targz'
9
+ autoload :Zip, 'imw/resources/archives_and_compressed/zip'
10
+ end
11
+
12
+ # Defines methods for creating, appending to, extracting, and
13
+ # listing an archive file. This module isn't used to directly
14
+ # extend an IMW::Resource -- instead, format specific modules
15
+ # (e.g. - IMW::Resources::Archives::Tarbz2) include this module
16
+ # and define the specific settings (command-line flags, &c.)
17
+ # required to make things work.
18
+ module Archive
19
+
20
+ attr_accessor :archive_settings
21
+
22
+ # Is this file an archive?
23
+ #
24
+ # @return [true, false]
25
+ def is_archive?
26
+ true
27
+ end
28
+
29
+ # Create an archive of the given +input_paths+.
30
+ #
31
+ # @param [String, IMW::Resource] input_paths the paths to add to this archive
32
+ def create *input_paths
33
+ should_have_archive_setting!("Cannot create archive #{path}", :program, :create)
34
+ IMW.system archive_settings[:program], archive_settings[:create], path, *input_paths.flatten
35
+ self
36
+ end
37
+
38
+ # Append to this archive the given +input_paths+.
39
+ #
40
+ # @param [String, IMW::Resource] input_paths the paths to add to this archive
41
+ def append *input_paths
42
+ should_have_archive_setting!("Cannot append to archive #{path}", :append)
43
+ IMW.system archive_settings[:program], archive_settings[:append], path, *input_paths.flatten
44
+ self
45
+ end
46
+
47
+ # Extract the files from this archive to the current directory.
48
+ def extract
49
+ should_exist!("Cannot extract archive.")
50
+ should_have_archive_setting!("Cannot extract archive #{path}", :extract, [:unarchving_program, :program])
51
+ program = archive_settings[:unarchiving_program] || archive_settings[:program]
52
+ IMW.system program, archive_settings[:extract], path
53
+ end
54
+
55
+ # Return a (sorted) list of contents in this archive.
56
+ #
57
+ # @return [Array] a list of paths in the archive.
58
+ def contents
59
+ should_exist!("Cannot list archive contents.")
60
+ should_have_archive_setting!("Cannot list archive #{path}", :list, [:unarchiving_program, :program])
61
+ program = archive_settings[:unarchiving_program] || archive_settings[:program]
62
+ # FIXME this needs to be more robust
63
+ flags = archive_settings[:list]
64
+ flags = flags.join(' ') if flags.is_a?(Array)
65
+ command = [program, flags, path.gsub(' ', '\ ')].join(' ')
66
+ output = `#{command}`
67
+ archive_contents_string_to_array(output)
68
+ end
69
+
70
+ protected
71
+
72
+ def should_have_archive_setting! message=nil,*settings # :nodoc:
73
+ settings.each do |setting|
74
+ if setting.is_a?(Array)
75
+ raise IMW::Error.new([message, "Must define one of #{setting.join(', ')} in archive_settings"].compact.join(', ')) unless setting.any? { |optional_setting| archive_settings[optional_setting] }
76
+ else
77
+ raise IMW::Error.new([message, "Must define #{setting} in archive_setings"].compact.join(', ')) unless archive_settings[setting]
78
+ end
79
+ end
80
+ end
81
+
82
+ # Parse and format the output from the archive program's "list"
83
+ # command into an array of filenames.
84
+ #
85
+ # An including class can override this method to match the
86
+ # output from the archiving program of that class.
87
+ #
88
+ # @param [String] string the raw output from the archive program's "list" command
89
+ # @return [Array] a list of paths in the archive
90
+ def archive_contents_string_to_array string
91
+ string.split("\n")
92
+ end
93
+ end
94
+ end
95
+ end
96
+
97
+
@@ -0,0 +1,18 @@
1
+ module IMW
2
+ module Resources
3
+ module CompressedFiles
4
+ module Bz2
5
+
6
+ include IMW::Resources::CompressedFile
7
+
8
+ def compression_settings
9
+ @compression_settings ||= {
10
+ :decompression_program => :bzip2,
11
+ :decompress => '-fd'
12
+ }
13
+ end
14
+
15
+ end
16
+ end
17
+ end
18
+ end