imw 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. data/README.rdoc +194 -31
  2. data/VERSION +1 -1
  3. data/bin/imw +5 -0
  4. data/lib/imw/boot.rb +0 -15
  5. data/lib/imw/dataset/paths.rb +38 -0
  6. data/lib/imw/dataset/task.rb +21 -18
  7. data/lib/imw/dataset/workflow.rb +126 -65
  8. data/lib/imw/dataset.rb +56 -82
  9. data/lib/imw/files/basicfile.rb +3 -3
  10. data/lib/imw/files/compressed_files_and_archives.rb +23 -37
  11. data/lib/imw/files/csv.rb +2 -1
  12. data/lib/imw/files/directory.rb +62 -0
  13. data/lib/imw/files/excel.rb +84 -0
  14. data/lib/imw/files/sgml.rb +4 -23
  15. data/lib/imw/files.rb +62 -47
  16. data/lib/imw/packagers/archiver.rb +19 -1
  17. data/lib/imw/packagers/s3_mover.rb +8 -0
  18. data/lib/imw/parsers/html_parser/matchers.rb +251 -268
  19. data/lib/imw/parsers/html_parser.rb +181 -176
  20. data/lib/imw/parsers.rb +1 -1
  21. data/lib/imw/repository.rb +35 -0
  22. data/lib/imw/runner.rb +114 -0
  23. data/lib/imw/utils/extensions/core.rb +0 -16
  24. data/lib/imw/utils/paths.rb +0 -28
  25. data/lib/imw.rb +21 -32
  26. metadata +11 -19
  27. data/lib/imw/dataset/datamapper/time_and_user_stamps.rb +0 -37
  28. data/lib/imw/dataset/datamapper.rb +0 -66
  29. data/lib/imw/dataset/loaddump.rb +0 -50
  30. data/lib/imw/dataset/old/file_collection.rb +0 -88
  31. data/lib/imw/dataset/old/file_collection_utils.rb +0 -71
  32. data/lib/imw/dataset/scaffold.rb +0 -132
  33. data/lib/imw/dataset/scraped_uri.rb +0 -305
  34. data/lib/imw/dataset/scrub/old_working_scrubber.rb +0 -87
  35. data/lib/imw/dataset/scrub/scrub.rb +0 -147
  36. data/lib/imw/dataset/scrub/scrub_simple_url.rb +0 -38
  37. data/lib/imw/dataset/scrub/scrub_test.rb +0 -60
  38. data/lib/imw/dataset/scrub/slug.rb +0 -101
  39. data/lib/imw/dataset/stats/counter.rb +0 -23
  40. data/lib/imw/dataset/stats.rb +0 -73
data/lib/imw/dataset.rb CHANGED
@@ -1,50 +1,64 @@
1
- #
2
- # h2. lib/imw/dataset.rb -- imw dataset
3
- #
4
- # == About
5
- #
6
- # Defines basic properties of the <tt>IMW::Dataset</tt>
7
- #
8
- # Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
9
- # Copyright:: Copyright (c) 2008 infochimps.org
10
- # License:: GPL 3.0
11
- # Website:: http://infinitemonkeywrench.org/
12
- #
13
- # puts "#{File.basename(__FILE__)}: You use your Monkeywrench to rake deep and straight furrows in the earth for your orchard." # at bottom
14
-
15
- require 'rake'
16
- require 'ostruct'
17
-
18
1
  require 'imw/utils'
19
2
  require 'imw/dataset/workflow'
20
- require 'imw/dataset/loaddump'
21
- require 'imw/dataset/stats'
3
+ require 'imw/dataset/paths'
22
4
 
23
5
  module IMW
24
6
 
25
- # The basic unit in IMW is the dataset. Each dataset has a handle
26
- # which is meant to be unique (at least in the context of a
27
- # particular pool of datasets, see <tt>IMW::Pool</tt>). A dataset
28
- # can also have a taxonomic classification or _taxon_
7
+ # The IMW::Dataset class is useful for organizing a complex data
8
+ # transformation because it is capable of managing a collection of
9
+ # paths and the interdependencies between subparts of the
10
+ # transformation.
11
+ #
12
+ # == Manipulating Paths
13
+ #
14
+ # Storing paths makes code shorter and more readable. By default
15
+ # (this assumes the executing script is in a file
16
+ # /home/imw_user/data/foo.rb):
17
+ #
18
+ # dataset = IMW::Dataset.new
19
+ # dataset.path_to(:self)
20
+ # #=> '/home/imw_user/data'
21
+ # dataset.path_to(:ripd)
22
+ # #=> '/home/imw_user/data/ripd'
23
+ # dataset.path_to(:pkgd, 'final.tar.gz')
24
+ # #=> '/home/imw_user/data/pkgd/final.tar.gz'
25
+ #
26
+ # Paths can be added
27
+ #
28
+ # dataset.add_path(:sorted_output, :mungd, 'sorted-file-3923.txt')
29
+ # dataset.path_to(:sorted_output)
30
+ # #=> '/home/imw_user/data/mungd/sorted-file-3923.txt'
31
+ #
32
+ # as well as removed (via +remove_path+).
33
+ #
34
+ # == Defining Workflows
35
+ #
36
+ # IMW encourages you to think of transforming data as a network of
37
+ # interdependent steps (see IMW::Workflow). Each of IMW's five
38
+ # default steps maps to a named directory remembered by each
39
+ # dataset.
29
40
  #
30
- # dataset = IMW::Dataset.new :recent_history_of_banana_prices,
31
- # :taxon => [:economics,:alarming_trends]
41
+ # The following example shows why this is a useful abstraction as
42
+ # well as illustrating some of the other functionality in IMW.
32
43
  #
33
- # but it isn't required like the handle.
44
+ # == Example Dataset
34
45
  #
35
- # Processing a dataset commonly occurs in four course steps. IMW
36
- # defines a task[http://rake.rubyforge.org] for each of these steps
37
- # and keeps files involved in different steps in different
38
- # directories.
46
+ # The first step is to import IMW and create the dataset
47
+ #
48
+ # require 'rubygems'
49
+ # require 'imw'
50
+ # dataset = IMW::Dataset.new
51
+ #
52
+ # You can pass in a handle (the name or "slug" for the dataset) as
53
+ # well as some options. Now define the steps you intend to take to
54
+ # complete the transformation:
39
55
  #
40
56
  # rip::
41
- # Managed by the <tt>:rip</tt> task, data is collected from a
42
- # source (+http+, +ftp+, database, &c.) and deposited in a
43
- # subdirectory of the <tt>:ripd</tt> directory named for the URI
44
- # of the source.
57
+ # Data is collected from a source (+http+, +ftp+, database, &c.)
58
+ # and deposited in the <tt>:ripd</tt> directory of this dataset.
45
59
  #
46
60
  # dataset.task :rip do
47
- # IMW::Rip.from_web 'http://econ.chimpu.edu/datasets/produce_prices.tar.bz2'
61
+ # IMW.open('http://econ.chimpu.edu/datasets/produce_prices.tar.bz2').cp_to_dir(dataset.path_to(:ripd))
48
62
  # #=> [ripd]/http/econ_chimpu_edu/datasets/produce_prices.tar.bz2
49
63
  #
50
64
  # IMW::Rip.from_database :named => "weather_records",
@@ -174,59 +188,19 @@ module IMW
174
188
  # framework in which to program.
175
189
  class Dataset
176
190
 
177
- # The <tt>Rake::TaskManager</tt> module allows the
178
- # <tt>IMW::Dataset</tt> class to leverage the functionality of the
179
- # Rake[http://rake.rubyforge.org/] library to manage tasks
180
- # associated with the processing of this dataset.
181
- include Rake::TaskManager
182
-
183
191
  # The <tt>IMW::Workflow</tt> module contains pre-defined tasks for
184
192
  # dataset processing.
185
193
  include IMW::Workflow
186
194
 
187
- attr_reader :handle, :taxon, :options
188
- attr_accessor :data
189
-
190
- # The default taxon assigned to a dataset.
191
- DEFAULT_TAXON = nil
192
-
193
- # Default options passed to <tt>Rake</tt>. Any class including
194
- # the <tt>Rake::TaskManager</tt> module must define a constant by
195
- # this name.
196
- DEFAULT_OPTIONS = {
197
- :dry_run => false,
198
- :trace => false,
199
- :verbose => false
200
- }
195
+ attr_accessor :handle, :options, :data
201
196
 
202
- # Create a new dataset. Arguments include
203
- #
204
- # <tt>:taxon</tt> (+DEFAULT_TAXON+):: a string or sequence
205
- # giving the taxonomic classification of the dataset. See
206
- # <tt>IMW::Dataset.taxon=</tt> for more details on how this
207
- # argument is interpreted.
208
- def initialize handle, options = {}
209
- options = options.reverse_merge :taxon => DEFAULT_TAXON
210
-
211
- # FIXME is this how the attribute writer functions should be
212
- # called?
213
- @handle = handle
214
- @taxon = options[:taxon]
215
-
216
- # for rake
217
- @tasks = Hash.new
218
- @rules = Array.new
219
- @scope = Array.new
220
- @last_description = nil
221
- @options = OpenStruct.new(DEFAULT_OPTIONS)
222
- create_default_tasks
223
-
224
- # sets an empty @paths hash; see utils/paths.rb
197
+ def initialize options = {}
198
+ @options = options
199
+ @handle = options[:handle]
200
+ initialize_workflow
201
+ set_root_paths
225
202
  set_paths
226
- end
227
-
228
- def handle= thing
229
- @handle = thing.is_a?(String) ? thing.to_handle : thing
203
+ set_tasks
230
204
  end
231
205
 
232
206
  end
@@ -20,7 +20,7 @@ module IMW
20
20
  protected
21
21
 
22
22
  def uri= uri
23
- @uri = URI.parse(uri) if uri.is_a?(String)
23
+ @uri = uri.is_a?(URI::Generic) ? uri : URI.parse(uri)
24
24
  @host = self.uri.host
25
25
  @path = local? ? ::File.expand_path(self.uri.path) : self.uri.path
26
26
  @dirname = ::File.dirname path
@@ -53,7 +53,7 @@ module IMW
53
53
  # path as a first argument.
54
54
  [:executable?, :executable_real?, :file?, :directory?, :ftype, :owned?, :pipe?, :readable?, :readable_real?, :setgid?, :setuid?, :size, :size?, :socket?, :split, :stat, :sticky?, :writable?, :writable_real?, :zero?].each do |class_method|
55
55
  define_method class_method do
56
- File.send(class_method, path) if local?
56
+ File.send(class_method, path)
57
57
  end
58
58
  end
59
59
 
@@ -61,7 +61,7 @@ module IMW
61
61
  # to open files online too to check.
62
62
  def exist?
63
63
  if local?
64
- ::File.exist?(path) ? true : false
64
+ ::File.exist?(path)
65
65
  else
66
66
  begin
67
67
  true if open(uri)
@@ -1,17 +1,3 @@
1
- #
2
- # h2. lib/imw/files/compressed_files_and_archives.rb -- require farm
3
- #
4
- # == About
5
- #
6
- # Just required all the archive and compressed formats (+tar+, +bz2+,
7
- # &c.)
8
- #
9
- # Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
10
- # Copyright:: Copyright (c) 2008 infochimps.org
11
- # License:: GPL 3.0
12
- # Website:: http://infinitemonkeywrench.org/
13
- #
14
- # puts "#{File.basename(__FILE__)}: Something clever" # at bottom
15
1
  module IMW
16
2
  module Files
17
3
 
@@ -29,9 +15,9 @@ module IMW
29
15
  # The default flags used creating, appending to, listing, and
30
16
  # extracting a tar archive.
31
17
  DEFAULT_FLAGS = {
32
- :create => "-cf",
33
- :append => "-rf",
34
- :list => "-tf",
18
+ :create => "-cf",
19
+ :append => "-rf",
20
+ :list => "-tf",
35
21
  :extract => "-xf",
36
22
  :program => :tar
37
23
  }
@@ -39,10 +25,10 @@ module IMW
39
25
  def initialize uri, *args
40
26
  self.uri= uri
41
27
  @archive = {
42
- :program => DEFAULT_FLAGS[:program],
43
- :create_flags => DEFAULT_FLAGS[:create],
44
- :append_flags => DEFAULT_FLAGS[:append],
45
- :list_flags => DEFAULT_FLAGS[:list],
28
+ :program => DEFAULT_FLAGS[:program],
29
+ :create_flags => DEFAULT_FLAGS[:create],
30
+ :append_flags => DEFAULT_FLAGS[:append],
31
+ :list_flags => DEFAULT_FLAGS[:list],
46
32
  :extract_flags => DEFAULT_FLAGS[:extract]
47
33
  }
48
34
  end
@@ -51,9 +37,9 @@ module IMW
51
37
  # A class to wrap a <tt>tar.gz</tt> archive.
52
38
  #
53
39
  # Creation, appending, listing, and extraction flags are stored in
54
- # <tt>IMW::Files::TarGz::DEFAULT_FLAGS</tt> and all are passed to
40
+ # <tt>IMW::Files::Targz::DEFAULT_FLAGS</tt> and all are passed to
55
41
  # the <tt>:tar</tt> program in <tt>IMW::EXTERAL_PROGRAMS</tt>.
56
- class TarGz
42
+ class Targz
57
43
 
58
44
  include IMW::Files::BasicFile
59
45
  include IMW::Files::Archive
@@ -63,21 +49,21 @@ module IMW
63
49
  # extracting a <tt>tar.gz</tt> archive.
64
50
  DEFAULT_FLAGS = {
65
51
  :decompression_program => :gzip,
66
- :decompression_flags => '-fd',
67
- :archive_program => :tar,
68
- :archive_list_flags => "-tf",
52
+ :decompression_flags => '-fd',
53
+ :archive_program => :tar,
54
+ :archive_list_flags => "-tf",
69
55
  :archive_extract_flags => "-xzf"
70
56
  }
71
57
 
72
58
  def initialize uri, *args
73
59
  self.uri= uri
74
60
  @compression = {
75
- :program => DEFAULT_FLAGS[:decompression_program],
61
+ :program => DEFAULT_FLAGS[:decompression_program],
76
62
  :decompression_flags => DEFAULT_FLAGS[:decompression_flags]
77
63
  }
78
64
  @archive = {
79
- :program => DEFAULT_FLAGS[:archive_program],
80
- :list_flags => DEFAULT_FLAGS[:archive_list_flags],
65
+ :program => DEFAULT_FLAGS[:archive_program],
66
+ :list_flags => DEFAULT_FLAGS[:archive_list_flags],
81
67
  :extract_flags => DEFAULT_FLAGS[:archive_extract_flags]
82
68
  }
83
69
  end
@@ -99,14 +85,14 @@ module IMW
99
85
  end
100
86
  end
101
87
 
102
- end # TarGz
88
+ end # Targz
103
89
 
104
90
  # A class to wrap a <tt>tar.bz2</tt> archive.
105
91
  #
106
92
  # Creation, appending, listing, and extraction flags are stored in
107
- # <tt>IMW::Files::TarBz2::DEFAULT_FLAGS</tt> and all are passed to
93
+ # <tt>IMW::Files::Tarbz2::DEFAULT_FLAGS</tt> and all are passed to
108
94
  # the <tt>:tar</tt> program in <tt>IMW::EXTERAL_PROGRAMS</tt>.
109
- class TarBz2
95
+ class Tarbz2
110
96
 
111
97
  include IMW::Files::BasicFile
112
98
  include IMW::Files::Archive
@@ -169,7 +155,7 @@ module IMW
169
155
  File.join(dirname,name + '.tar')
170
156
  end
171
157
 
172
- end # TarBz2
158
+ end # Tarbz2
173
159
 
174
160
  # A class to wrap a +rar+ archive.
175
161
  #
@@ -330,11 +316,11 @@ module IMW
330
316
 
331
317
 
332
318
  # make sure that tar.bz2 precedes bz2 and so on...
333
- FILE_REGEXPS << [/\.tar\.bz2$/, IMW::Files::TarBz2]
334
- FILE_REGEXPS << [/\.tbz2$/, IMW::Files::TarBz2]
319
+ FILE_REGEXPS << [/\.tar\.bz2$/, IMW::Files::Tarbz2]
320
+ FILE_REGEXPS << [/\.tbz2$/, IMW::Files::Tarbz2]
335
321
 
336
- FILE_REGEXPS << [/\.tar\.gz$/, IMW::Files::TarGz]
337
- FILE_REGEXPS << [/\.tgz$/, IMW::Files::TarGz]
322
+ FILE_REGEXPS << [/\.tar\.gz$/, IMW::Files::Targz]
323
+ FILE_REGEXPS << [/\.tgz$/, IMW::Files::Targz]
338
324
 
339
325
  FILE_REGEXPS << [/\.tar$/, IMW::Files::Tar]
340
326
  FILE_REGEXPS << [/\.bz2$/, IMW::Files::Bz2]
data/lib/imw/files/csv.rb CHANGED
@@ -39,7 +39,8 @@ module IMW
39
39
  def initialize uri, mode='r', options = {}
40
40
  options.reverse_merge!(self.class::DEFAULT_OPTIONS)
41
41
  self.uri= uri
42
- super open(uri,mode),options
42
+ options.delete(:write) # FasterCSV complains about unkown options
43
+ super open(uri,mode), options
43
44
  end
44
45
 
45
46
  # Return the contents of this CSV file as an array of arrays.
@@ -0,0 +1,62 @@
1
+ require 'imw/files/basicfile'
2
+ module IMW
3
+ module Files
4
+ class Directory
5
+
6
+ include IMW::Files::BasicFile
7
+
8
+ # FIXME these should be defined by BasicFile and then removed here but I don't see how...
9
+ # [:executable?, :executable_real?, :pipe?, :socket?, :rm, :rm!, :extname, :extname=, :name, :name=].each do |method|
10
+ # instance_eval do
11
+ # remove_method method
12
+ # end
13
+ # end
14
+
15
+ def uri= uri
16
+ @uri = uri.is_a?(URI::Generic) ? uri : URI.parse(uri)
17
+ @host = self.uri.host
18
+ @path = local? ? ::File.expand_path(self.uri.path) : self.uri.path
19
+ @dirname = ::File.dirname path
20
+ @basename = ::File.basename path
21
+ end
22
+
23
+ def initialize uri
24
+ self.uri = uri
25
+ end
26
+
27
+ def [] selector='*'
28
+ Dir[File.join(path, selector)] if local?
29
+ end
30
+
31
+ # Copy the contents of this directory to +new_dir+.
32
+ def cp new_dir
33
+ raise IMW::PathError.new("cannot copy from #{path}, doesn't exist!") unless exist?
34
+ if local?
35
+ FileUtils.cp_r path, new_dir
36
+ else
37
+ raise IMW::PathError.new("cannot recursively copy remote directories (yet!)")
38
+ end
39
+ self.class.new(new_dir)
40
+ end
41
+
42
+ # Move this directory to +new_dir+.
43
+ def mv new_dir
44
+ raise IMW::PathError.new("cannot move from #{path}, doesn't exist!") unless exist?
45
+ if local?
46
+ FileUtils.mv path, new_dir
47
+ else
48
+ raise IMW::PathError.new("cannot move remote directories (yet!)")
49
+ end
50
+ self.class.new(new_dir)
51
+ end
52
+ alias_method :mv!, :mv
53
+
54
+ # Move this directory so it sits beneath +dir+.
55
+ def mv_to_dir dir
56
+ mv File.join(File.expand_path(dir),basename)
57
+ end
58
+ alias_method :mv_to_dir!, :mv_to_dir
59
+
60
+ end
61
+ end
62
+ end
@@ -0,0 +1,84 @@
1
+ require 'spreadsheet'
2
+
3
+ # FIXME Main issue with this:
4
+ # You can make a new excel book and dump data to it no problem.
5
+ # However, something that doesn't seem to work is dumping to a file, opening,
6
+ # and dumping to it again. At the moment this is probably not a big deal.
7
+
8
+
9
+ module IMW
10
+ module Files
11
+ class Excel
12
+ include IMW::Files::BasicFile
13
+ include IMW::Files::Compressible
14
+
15
+ #need to initialize, load, and dump
16
+ attr_accessor :book,:idx, :max_lines, :sht_idx, :sht_row, :book_idx
17
+ def initialize uri, mode, options={}
18
+ self.uri = uri
19
+ @max_lines = options[:max_lines] || 65000
20
+ @idx = 0
21
+ @book_idx = 0
22
+ @sht_idx = 0
23
+ unless self.exist?
24
+ make_new_book
25
+ make_new_sheet
26
+ else
27
+ get_existing_book
28
+ end
29
+ end
30
+
31
+ def load
32
+ @sheet.map{|row| row.to_a}
33
+ end
34
+
35
+ def dump data
36
+ data.each do |line|
37
+ raise "too many lines" if too_many?
38
+ self << line
39
+ end
40
+ save unless no_data?
41
+ end
42
+
43
+ def << line
44
+ @sheet.row(@sht_row).concat( line )
45
+ @sht_row += 1
46
+ @idx += 1
47
+ end
48
+
49
+ def make_new_book
50
+ @book = Spreadsheet::Workbook.new
51
+ @book_idx += 1
52
+ end
53
+
54
+ def make_new_sheet
55
+ @sheet = @book.create_worksheet
56
+ @sht_idx += 1
57
+ @sht_row = 0 #always start at row 0 in a new sheet
58
+ end
59
+
60
+ def get_existing_book
61
+ @book = Spreadsheet.open path
62
+ @sheet = book.worksheet 0
63
+ @sht_row = @sheet.row_count #would like to be able to dump new data, doesn't work
64
+ @sht_idx += 1
65
+ end
66
+
67
+ def incr_sheet
68
+ @sheet = book.worksheet @sht_idx
69
+ end
70
+
71
+ def too_many?
72
+ @sht_row >= @max_lines
73
+ end
74
+
75
+ def no_data?
76
+ @sht_row == 0
77
+ end
78
+
79
+ def save
80
+ @book.write path
81
+ end
82
+ end
83
+ end
84
+ end
@@ -1,17 +1,3 @@
1
- #
2
- # h2. lib/imw/files/sgml.rb -- SGML files
3
- #
4
- # == About
5
- #
6
- # For SGML-derived files, including XML, HTML, &c..
7
- #
8
- # Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
9
- # Copyright:: Copyright (c) 2008 infochimps.org
10
- # License:: GPL 3.0
11
- # Website:: http://infinitemonkeywrench.org/
12
- #
13
- # puts "#{File.basename(__FILE__)}: Something clever" # at bottom
14
-
15
1
  require 'hpricot'
16
2
  require 'imw/files/text'
17
3
  require 'imw/parsers/html_parser'
@@ -23,21 +9,16 @@ module IMW
23
9
 
24
10
  attr_accessor :doc
25
11
 
26
- def initialize uri, mode='r', options={}
27
- super uri, mode, options
28
- @doc = Hpricot(open(uri))
29
- end
30
-
31
12
  # Delegate to Hpricot
32
13
  def method_missing method, *args, &block
33
14
  @doc.send method, *args, &block
34
15
  end
35
16
 
36
- # Parse this file using the IMW HTMLParser. The parser can
37
- # either be passed in directly or constructed from a passed hash
38
- # of matchers.
17
+ # Parse this file using the IMW::Parsers::HtmlParser. The
18
+ # parser can either be passed in directly or constructed from a
19
+ # passed hash of specs and/or matchers.
39
20
  def parse *args
40
- parser = args.first.is_a?(IMW::HTMLParser) ? args.first : IMW::HTMLParser.new(*args)
21
+ parser = args.first.is_a?(IMW::Parsers::HtmlParser) ? args.first : IMW::Parsers::HtmlParser.new(*args)
41
22
  parser.parse(self)
42
23
  end
43
24
 
data/lib/imw/files.rb CHANGED
@@ -1,22 +1,8 @@
1
- #
2
- # h2. lib/imw/files.rb -- uniform interface to various files
3
- #
4
- # == About
5
- #
6
- # Implements <tt>IMW.open</tt> which returns an appropriate +IMW+
7
- # object given a URI.
8
- #
9
- # Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
10
- # Copyright:: Copyright (c) 2008 infochimps.org
11
- # License:: GPL 3.0
12
- # Website:: http://infinitemonkeywrench.org/
13
- #
14
- # puts "#{File.basename(__FILE__)}: Something clever" # at bottom
15
-
16
1
  require 'uri'
17
2
  require 'open-uri'
18
3
  require 'imw/utils'
19
4
  require 'imw/files/basicfile'
5
+ require 'imw/files/directory'
20
6
  require 'imw/files/archive'
21
7
  require 'imw/files/compressible'
22
8
  require 'imw/files/compressed_file'
@@ -28,13 +14,21 @@ module IMW
28
14
  #
29
15
  # IMW.open("/tmp/test.csv") # => IMW::Files::Csv("/tmp/test.csv")
30
16
  #
31
- #
32
- def self.open path, options = {}
33
- mode = options[:write] ? 'w' : 'r'
34
- Files.file_class_for(path, options).new(path, mode, options)
17
+ #
18
+ def self.open path, options = {}, &block
19
+ if File.directory?(File.expand_path(path))
20
+ dir = Files::Directory.new(path)
21
+ yield dir if block_given?
22
+ dir
23
+ else
24
+ mode = options[:write] ? 'w' : 'r'
25
+ file = Files.file_class_for(path, options).new(path, mode, options)
26
+ yield file if block_given?
27
+ file
28
+ end
35
29
  end
36
30
 
37
- def self.open! path, options = {}
31
+ def self.open! path, options = {}, &block
38
32
  self.open path, options.reverse_merge(:write => true)
39
33
  end
40
34
 
@@ -50,13 +44,14 @@ module IMW
50
44
  autoload :Bz2, 'imw/files/compressed_files_and_archives'
51
45
  autoload :Gz, 'imw/files/compressed_files_and_archives'
52
46
  autoload :Tar, 'imw/files/compressed_files_and_archives'
53
- autoload :TarBz2, 'imw/files/compressed_files_and_archives'
54
- autoload :TarGz, 'imw/files/compressed_files_and_archives'
47
+ autoload :Tarbz2, 'imw/files/compressed_files_and_archives'
48
+ autoload :Targz, 'imw/files/compressed_files_and_archives'
55
49
  autoload :Rar, 'imw/files/compressed_files_and_archives'
56
50
  autoload :Zip, 'imw/files/compressed_files_and_archives'
57
51
  autoload :Xml, 'imw/files/sgml'
58
52
  autoload :Html, 'imw/files/sgml'
59
-
53
+ autoload :Excel, 'imw/files/excel'
54
+
60
55
 
61
56
  # An array used to match files to classes to handle them. The
62
57
  # first element of each array is the regexp and the second names
@@ -70,33 +65,39 @@ module IMW
70
65
  # allows, say, <tt>.tar.gz</tt> to be handled differently from
71
66
  # <tt>.gz</tt>.
72
67
  EXTENSION_HANDLERS = [
73
- [/./, :Text], # catchall
74
- [/\.txt$/, :Text],
75
- [/\.txt$/, :Text],
76
- [/\.dat$/, :Text],
77
- [/\.ascii$/, :Text],
78
- [/\.yaml$/, :Yaml],
79
- [/\.yml$/, :Yaml],
80
- [/\.csv$/, :Csv],
81
- [/\.tsv$/, :Tsv],
82
- [/\.json$/, :Json],
83
- [/\.bz2$/, :Bz2],
84
- [/\.gz$/, :Gz],
85
- [/\.tar\.bz2$/, :TarBz2],
86
- [/\.tbz2$/, :TarBz2],
87
- [/\.tar\.gz$/, :TarGz],
88
- [/\.tgz$/, :TarGz],
89
- [/\.tar$/, :Tar],
90
- [/\.rar$/, :Rar],
91
- [/\.zip$/, :Zip],
92
- [/\.xml$/, :Xml],
93
- [/\.html$/, :Html],
94
- [/\.htm$/, :Html]
68
+ [/\.txt$/, :text],
69
+ [/\.txt$/, :text],
70
+ [/\.dat$/, :text],
71
+ [/\.ascii$/, :text],
72
+ [/\.yaml$/, :yaml],
73
+ [/\.yml$/, :yaml],
74
+ [/\.csv$/, :csv],
75
+ [/\.tsv$/, :tsv],
76
+ [/\.json$/, :json],
77
+ [/\.bz2$/, :bz2],
78
+ [/\.gz$/, :gz],
79
+ [/\.tar\.bz2$/, :tarbz2],
80
+ [/\.tbz2$/, :tarbz2],
81
+ [/\.tar\.gz$/, :targz],
82
+ [/\.tgz$/, :targz],
83
+ [/\.tar$/, :tar],
84
+ [/\.rar$/, :rar],
85
+ [/\.zip$/, :zip],
86
+ [/\.xml$/, :xml],
87
+ [/\.html$/, :html],
88
+ [/\.htm$/, :html],
89
+ [/\.xlsx?$/, :excel]
95
90
  ]
96
-
91
+
92
+ SCHEME_HANDLERS = [
93
+ [/http/, :html]
94
+ ]
95
+
97
96
  protected
98
97
  def self.file_class_for path, options = {}
99
98
  klass = options.delete(:as)
99
+
100
+ # try to choose klass from path extension if not already set
100
101
  unless klass
101
102
  EXTENSION_HANDLERS.reverse_each do |regexp, thing| # end has greater precedence
102
103
  next unless regexp =~ path
@@ -104,7 +105,21 @@ module IMW
104
105
  break
105
106
  end
106
107
  end
107
- klass.is_a?(Class) ? klass : class_eval(klass.to_s)
108
+
109
+ # try to choose klass from uri scheme if not already set
110
+ unless klass
111
+ scheme = URI.parse(path).scheme
112
+ SCHEME_HANDLERS.reverse_each do |regexp, thing| # end has greater precedence
113
+ next unless regexp =~ scheme
114
+ klass = thing
115
+ break
116
+ end
117
+ end
118
+
119
+ # just stick with text if still not set
120
+ klass = :text unless klass
121
+
122
+ klass.is_a?(Class) ? klass : class_eval(klass.to_s.downcase.capitalize)
108
123
  end
109
124
  end
110
125
  end