imw 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (40) hide show
  1. data/README.rdoc +194 -31
  2. data/VERSION +1 -1
  3. data/bin/imw +5 -0
  4. data/lib/imw/boot.rb +0 -15
  5. data/lib/imw/dataset/paths.rb +38 -0
  6. data/lib/imw/dataset/task.rb +21 -18
  7. data/lib/imw/dataset/workflow.rb +126 -65
  8. data/lib/imw/dataset.rb +56 -82
  9. data/lib/imw/files/basicfile.rb +3 -3
  10. data/lib/imw/files/compressed_files_and_archives.rb +23 -37
  11. data/lib/imw/files/csv.rb +2 -1
  12. data/lib/imw/files/directory.rb +62 -0
  13. data/lib/imw/files/excel.rb +84 -0
  14. data/lib/imw/files/sgml.rb +4 -23
  15. data/lib/imw/files.rb +62 -47
  16. data/lib/imw/packagers/archiver.rb +19 -1
  17. data/lib/imw/packagers/s3_mover.rb +8 -0
  18. data/lib/imw/parsers/html_parser/matchers.rb +251 -268
  19. data/lib/imw/parsers/html_parser.rb +181 -176
  20. data/lib/imw/parsers.rb +1 -1
  21. data/lib/imw/repository.rb +35 -0
  22. data/lib/imw/runner.rb +114 -0
  23. data/lib/imw/utils/extensions/core.rb +0 -16
  24. data/lib/imw/utils/paths.rb +0 -28
  25. data/lib/imw.rb +21 -32
  26. metadata +11 -19
  27. data/lib/imw/dataset/datamapper/time_and_user_stamps.rb +0 -37
  28. data/lib/imw/dataset/datamapper.rb +0 -66
  29. data/lib/imw/dataset/loaddump.rb +0 -50
  30. data/lib/imw/dataset/old/file_collection.rb +0 -88
  31. data/lib/imw/dataset/old/file_collection_utils.rb +0 -71
  32. data/lib/imw/dataset/scaffold.rb +0 -132
  33. data/lib/imw/dataset/scraped_uri.rb +0 -305
  34. data/lib/imw/dataset/scrub/old_working_scrubber.rb +0 -87
  35. data/lib/imw/dataset/scrub/scrub.rb +0 -147
  36. data/lib/imw/dataset/scrub/scrub_simple_url.rb +0 -38
  37. data/lib/imw/dataset/scrub/scrub_test.rb +0 -60
  38. data/lib/imw/dataset/scrub/slug.rb +0 -101
  39. data/lib/imw/dataset/stats/counter.rb +0 -23
  40. data/lib/imw/dataset/stats.rb +0 -73
data/lib/imw/dataset.rb CHANGED
@@ -1,50 +1,64 @@
1
- #
2
- # h2. lib/imw/dataset.rb -- imw dataset
3
- #
4
- # == About
5
- #
6
- # Defines basic properties of the <tt>IMW::Dataset</tt>
7
- #
8
- # Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
9
- # Copyright:: Copyright (c) 2008 infochimps.org
10
- # License:: GPL 3.0
11
- # Website:: http://infinitemonkeywrench.org/
12
- #
13
- # puts "#{File.basename(__FILE__)}: You use your Monkeywrench to rake deep and straight furrows in the earth for your orchard." # at bottom
14
-
15
- require 'rake'
16
- require 'ostruct'
17
-
18
1
  require 'imw/utils'
19
2
  require 'imw/dataset/workflow'
20
- require 'imw/dataset/loaddump'
21
- require 'imw/dataset/stats'
3
+ require 'imw/dataset/paths'
22
4
 
23
5
  module IMW
24
6
 
25
- # The basic unit in IMW is the dataset. Each dataset has a handle
26
- # which is meant to be unique (at least in the context of a
27
- # particular pool of datasets, see <tt>IMW::Pool</tt>). A dataset
28
- # can also have a taxonomic classification or _taxon_
7
+ # The IMW::Dataset class is useful for organizing a complex data
8
+ # transformation because it is capable of managing a collection of
9
+ # paths and the interdependencies between subparts of the
10
+ # transformation.
11
+ #
12
+ # == Manipulating Paths
13
+ #
14
+ # Storing paths makes code shorter and more readable. By default
15
+ # (this assumes the executing script is in a file
16
+ # /home/imw_user/data/foo.rb):
17
+ #
18
+ # dataset = IMW::Dataset.new
19
+ # dataset.path_to(:self)
20
+ # #=> '/home/imw_user/data'
21
+ # dataset.path_to(:ripd)
22
+ # #=> '/home/imw_user/data/ripd'
23
+ # dataset.path_to(:pkgd, 'final.tar.gz')
24
+ # #=> '/home/imw_user/data/pkgd/final.tar.gz'
25
+ #
26
+ # Paths can be added
27
+ #
28
+ # dataset.add_path(:sorted_output, :mungd, 'sorted-file-3923.txt')
29
+ # dataset.path_to(:sorted_output)
30
+ # #=> '/home/imw_user/data/mungd/sorted-file-3923.txt'
31
+ #
32
+ # as well as removed (via +remove_path+).
33
+ #
34
+ # == Defining Workflows
35
+ #
36
+ # IMW encourages you to think of transforming data as a network of
37
+ # interdependent steps (see IMW::Workflow). Each of IMW's five
38
+ # default steps maps to a named directory remembered by each
39
+ # dataset.
29
40
  #
30
- # dataset = IMW::Dataset.new :recent_history_of_banana_prices,
31
- # :taxon => [:economics,:alarming_trends]
41
+ # The following example shows why this is a useful abstraction as
42
+ # well as illustrating some of the other functionality in IMW.
32
43
  #
33
- # but it isn't required like the handle.
44
+ # == Example Dataset
34
45
  #
35
- # Processing a dataset commonly occurs in four course steps. IMW
36
- # defines a task[http://rake.rubyforge.org] for each of these steps
37
- # and keeps files involved in different steps in different
38
- # directories.
46
+ # The first step is to import IMW and create the dataset
47
+ #
48
+ # require 'rubygems'
49
+ # require 'imw'
50
+ # dataset = IMW::Dataset.new
51
+ #
52
+ # You can pass in a handle (the name or "slug" for the dataset) as
53
+ # well as some options. Now define the steps you intend to take to
54
+ # complete the transformation:
39
55
  #
40
56
  # rip::
41
- # Managed by the <tt>:rip</tt> task, data is collected from a
42
- # source (+http+, +ftp+, database, &c.) and deposited in a
43
- # subdirectory of the <tt>:ripd</tt> directory named for the URI
44
- # of the source.
57
+ # Data is collected from a source (+http+, +ftp+, database, &c.)
58
+ # and deposited in the <tt>:ripd</tt> directory of this dataset.
45
59
  #
46
60
  # dataset.task :rip do
47
- # IMW::Rip.from_web 'http://econ.chimpu.edu/datasets/produce_prices.tar.bz2'
61
+ # IMW.open('http://econ.chimpu.edu/datasets/produce_prices.tar.bz2').cp_to_dir(dataset.path_to(:ripd))
48
62
  # #=> [ripd]/http/econ_chimpu_edu/datasets/produce_prices.tar.bz2
49
63
  #
50
64
  # IMW::Rip.from_database :named => "weather_records",
@@ -174,59 +188,19 @@ module IMW
174
188
  # framework in which to program.
175
189
  class Dataset
176
190
 
177
- # The <tt>Rake::TaskManager</tt> module allows the
178
- # <tt>IMW::Dataset</tt> class to leverage the functionality of the
179
- # Rake[http://rake.rubyforge.org/] library to manage tasks
180
- # associated with the processing of this dataset.
181
- include Rake::TaskManager
182
-
183
191
  # The <tt>IMW::Workflow</tt> module contains pre-defined tasks for
184
192
  # dataset processing.
185
193
  include IMW::Workflow
186
194
 
187
- attr_reader :handle, :taxon, :options
188
- attr_accessor :data
189
-
190
- # The default taxon assigned to a dataset.
191
- DEFAULT_TAXON = nil
192
-
193
- # Default options passed to <tt>Rake</tt>. Any class including
194
- # the <tt>Rake::TaskManager</tt> module must define a constant by
195
- # this name.
196
- DEFAULT_OPTIONS = {
197
- :dry_run => false,
198
- :trace => false,
199
- :verbose => false
200
- }
195
+ attr_accessor :handle, :options, :data
201
196
 
202
- # Create a new dataset. Arguments include
203
- #
204
- # <tt>:taxon</tt> (+DEFAULT_TAXON+):: a string or sequence
205
- # giving the taxonomic classification of the dataset. See
206
- # <tt>IMW::Dataset.taxon=</tt> for more details on how this
207
- # argument is interpreted.
208
- def initialize handle, options = {}
209
- options = options.reverse_merge :taxon => DEFAULT_TAXON
210
-
211
- # FIXME is this how the attribute writer functions should be
212
- # called?
213
- @handle = handle
214
- @taxon = options[:taxon]
215
-
216
- # for rake
217
- @tasks = Hash.new
218
- @rules = Array.new
219
- @scope = Array.new
220
- @last_description = nil
221
- @options = OpenStruct.new(DEFAULT_OPTIONS)
222
- create_default_tasks
223
-
224
- # sets an empty @paths hash; see utils/paths.rb
197
+ def initialize options = {}
198
+ @options = options
199
+ @handle = options[:handle]
200
+ initialize_workflow
201
+ set_root_paths
225
202
  set_paths
226
- end
227
-
228
- def handle= thing
229
- @handle = thing.is_a?(String) ? thing.to_handle : thing
203
+ set_tasks
230
204
  end
231
205
 
232
206
  end
@@ -20,7 +20,7 @@ module IMW
20
20
  protected
21
21
 
22
22
  def uri= uri
23
- @uri = URI.parse(uri) if uri.is_a?(String)
23
+ @uri = uri.is_a?(URI::Generic) ? uri : URI.parse(uri)
24
24
  @host = self.uri.host
25
25
  @path = local? ? ::File.expand_path(self.uri.path) : self.uri.path
26
26
  @dirname = ::File.dirname path
@@ -53,7 +53,7 @@ module IMW
53
53
  # path as a first argument.
54
54
  [:executable?, :executable_real?, :file?, :directory?, :ftype, :owned?, :pipe?, :readable?, :readable_real?, :setgid?, :setuid?, :size, :size?, :socket?, :split, :stat, :sticky?, :writable?, :writable_real?, :zero?].each do |class_method|
55
55
  define_method class_method do
56
- File.send(class_method, path) if local?
56
+ File.send(class_method, path)
57
57
  end
58
58
  end
59
59
 
@@ -61,7 +61,7 @@ module IMW
61
61
  # to open files online too to check.
62
62
  def exist?
63
63
  if local?
64
- ::File.exist?(path) ? true : false
64
+ ::File.exist?(path)
65
65
  else
66
66
  begin
67
67
  true if open(uri)
@@ -1,17 +1,3 @@
1
- #
2
- # h2. lib/imw/files/compressed_files_and_archives.rb -- require farm
3
- #
4
- # == About
5
- #
6
- # Just required all the archive and compressed formats (+tar+, +bz2+,
7
- # &c.)
8
- #
9
- # Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
10
- # Copyright:: Copyright (c) 2008 infochimps.org
11
- # License:: GPL 3.0
12
- # Website:: http://infinitemonkeywrench.org/
13
- #
14
- # puts "#{File.basename(__FILE__)}: Something clever" # at bottom
15
1
  module IMW
16
2
  module Files
17
3
 
@@ -29,9 +15,9 @@ module IMW
29
15
  # The default flags used creating, appending to, listing, and
30
16
  # extracting a tar archive.
31
17
  DEFAULT_FLAGS = {
32
- :create => "-cf",
33
- :append => "-rf",
34
- :list => "-tf",
18
+ :create => "-cf",
19
+ :append => "-rf",
20
+ :list => "-tf",
35
21
  :extract => "-xf",
36
22
  :program => :tar
37
23
  }
@@ -39,10 +25,10 @@ module IMW
39
25
  def initialize uri, *args
40
26
  self.uri= uri
41
27
  @archive = {
42
- :program => DEFAULT_FLAGS[:program],
43
- :create_flags => DEFAULT_FLAGS[:create],
44
- :append_flags => DEFAULT_FLAGS[:append],
45
- :list_flags => DEFAULT_FLAGS[:list],
28
+ :program => DEFAULT_FLAGS[:program],
29
+ :create_flags => DEFAULT_FLAGS[:create],
30
+ :append_flags => DEFAULT_FLAGS[:append],
31
+ :list_flags => DEFAULT_FLAGS[:list],
46
32
  :extract_flags => DEFAULT_FLAGS[:extract]
47
33
  }
48
34
  end
@@ -51,9 +37,9 @@ module IMW
51
37
  # A class to wrap a <tt>tar.gz</tt> archive.
52
38
  #
53
39
  # Creation, appending, listing, and extraction flags are stored in
54
- # <tt>IMW::Files::TarGz::DEFAULT_FLAGS</tt> and all are passed to
40
+ # <tt>IMW::Files::Targz::DEFAULT_FLAGS</tt> and all are passed to
55
41
  # the <tt>:tar</tt> program in <tt>IMW::EXTERAL_PROGRAMS</tt>.
56
- class TarGz
42
+ class Targz
57
43
 
58
44
  include IMW::Files::BasicFile
59
45
  include IMW::Files::Archive
@@ -63,21 +49,21 @@ module IMW
63
49
  # extracting a <tt>tar.gz</tt> archive.
64
50
  DEFAULT_FLAGS = {
65
51
  :decompression_program => :gzip,
66
- :decompression_flags => '-fd',
67
- :archive_program => :tar,
68
- :archive_list_flags => "-tf",
52
+ :decompression_flags => '-fd',
53
+ :archive_program => :tar,
54
+ :archive_list_flags => "-tf",
69
55
  :archive_extract_flags => "-xzf"
70
56
  }
71
57
 
72
58
  def initialize uri, *args
73
59
  self.uri= uri
74
60
  @compression = {
75
- :program => DEFAULT_FLAGS[:decompression_program],
61
+ :program => DEFAULT_FLAGS[:decompression_program],
76
62
  :decompression_flags => DEFAULT_FLAGS[:decompression_flags]
77
63
  }
78
64
  @archive = {
79
- :program => DEFAULT_FLAGS[:archive_program],
80
- :list_flags => DEFAULT_FLAGS[:archive_list_flags],
65
+ :program => DEFAULT_FLAGS[:archive_program],
66
+ :list_flags => DEFAULT_FLAGS[:archive_list_flags],
81
67
  :extract_flags => DEFAULT_FLAGS[:archive_extract_flags]
82
68
  }
83
69
  end
@@ -99,14 +85,14 @@ module IMW
99
85
  end
100
86
  end
101
87
 
102
- end # TarGz
88
+ end # Targz
103
89
 
104
90
  # A class to wrap a <tt>tar.bz2</tt> archive.
105
91
  #
106
92
  # Creation, appending, listing, and extraction flags are stored in
107
- # <tt>IMW::Files::TarBz2::DEFAULT_FLAGS</tt> and all are passed to
93
+ # <tt>IMW::Files::Tarbz2::DEFAULT_FLAGS</tt> and all are passed to
108
94
  # the <tt>:tar</tt> program in <tt>IMW::EXTERAL_PROGRAMS</tt>.
109
- class TarBz2
95
+ class Tarbz2
110
96
 
111
97
  include IMW::Files::BasicFile
112
98
  include IMW::Files::Archive
@@ -169,7 +155,7 @@ module IMW
169
155
  File.join(dirname,name + '.tar')
170
156
  end
171
157
 
172
- end # TarBz2
158
+ end # Tarbz2
173
159
 
174
160
  # A class to wrap a +rar+ archive.
175
161
  #
@@ -330,11 +316,11 @@ module IMW
330
316
 
331
317
 
332
318
  # make sure that tar.bz2 precedes bz2 and so on...
333
- FILE_REGEXPS << [/\.tar\.bz2$/, IMW::Files::TarBz2]
334
- FILE_REGEXPS << [/\.tbz2$/, IMW::Files::TarBz2]
319
+ FILE_REGEXPS << [/\.tar\.bz2$/, IMW::Files::Tarbz2]
320
+ FILE_REGEXPS << [/\.tbz2$/, IMW::Files::Tarbz2]
335
321
 
336
- FILE_REGEXPS << [/\.tar\.gz$/, IMW::Files::TarGz]
337
- FILE_REGEXPS << [/\.tgz$/, IMW::Files::TarGz]
322
+ FILE_REGEXPS << [/\.tar\.gz$/, IMW::Files::Targz]
323
+ FILE_REGEXPS << [/\.tgz$/, IMW::Files::Targz]
338
324
 
339
325
  FILE_REGEXPS << [/\.tar$/, IMW::Files::Tar]
340
326
  FILE_REGEXPS << [/\.bz2$/, IMW::Files::Bz2]
data/lib/imw/files/csv.rb CHANGED
@@ -39,7 +39,8 @@ module IMW
39
39
  def initialize uri, mode='r', options = {}
40
40
  options.reverse_merge!(self.class::DEFAULT_OPTIONS)
41
41
  self.uri= uri
42
- super open(uri,mode),options
42
+ options.delete(:write) # FasterCSV complains about unknown options
43
+ super open(uri,mode), options
43
44
  end
44
45
 
45
46
  # Return the contents of this CSV file as an array of arrays.
@@ -0,0 +1,62 @@
1
+ require 'imw/files/basicfile'
2
+ module IMW
3
+ module Files
4
+ class Directory
5
+
6
+ include IMW::Files::BasicFile
7
+
8
+ # FIXME these should be defined by BasicFile and then removed here but I don't see how...
9
+ # [:executable?, :executable_real?, :pipe?, :socket?, :rm, :rm!, :extname, :extname=, :name, :name=].each do |method|
10
+ # instance_eval do
11
+ # remove_method method
12
+ # end
13
+ # end
14
+
15
+ def uri= uri
16
+ @uri = uri.is_a?(URI::Generic) ? uri : URI.parse(uri)
17
+ @host = self.uri.host
18
+ @path = local? ? ::File.expand_path(self.uri.path) : self.uri.path
19
+ @dirname = ::File.dirname path
20
+ @basename = ::File.basename path
21
+ end
22
+
23
+ def initialize uri
24
+ self.uri = uri
25
+ end
26
+
27
+ def [] selector='*'
28
+ Dir[File.join(path, selector)] if local?
29
+ end
30
+
31
+ # Copy the contents of this directory to +new_dir+.
32
+ def cp new_dir
33
+ raise IMW::PathError.new("cannot copy from #{path}, doesn't exist!") unless exist?
34
+ if local?
35
+ FileUtils.cp_r path, new_dir
36
+ else
37
+ raise IMW::PathError.new("cannot recursively copy remote directories (yet!)")
38
+ end
39
+ self.class.new(new_dir)
40
+ end
41
+
42
+ # Move this directory to +new_dir+.
43
+ def mv new_dir
44
+ raise IMW::PathError.new("cannot move from #{path}, doesn't exist!") unless exist?
45
+ if local?
46
+ FileUtils.mv path, new_dir
47
+ else
48
+ raise IMW::PathError.new("cannot move remote directories (yet!)")
49
+ end
50
+ self.class.new(new_dir)
51
+ end
52
+ alias_method :mv!, :mv
53
+
54
+ # Move this directory so it sits beneath +dir+.
55
+ def mv_to_dir dir
56
+ mv File.join(File.expand_path(dir),basename)
57
+ end
58
+ alias_method :mv_to_dir!, :mv_to_dir
59
+
60
+ end
61
+ end
62
+ end
@@ -0,0 +1,84 @@
1
+ require 'spreadsheet'
2
+
3
+ # FIXME Main issue with this:
4
+ # You can make a new excel book and dump data to it no problem.
5
+ # However, something that doesn't seem to work is dumping to a file, opening,
6
+ # and dumping to it again. At the moment this is probably not a big deal.
7
+
8
+
9
+ module IMW
10
+ module Files
11
+ class Excel
12
+ include IMW::Files::BasicFile
13
+ include IMW::Files::Compressible
14
+
15
+ #need to initialize, load, and dump
16
+ attr_accessor :book,:idx, :max_lines, :sht_idx, :sht_row, :book_idx
17
+ def initialize uri, mode, options={}
18
+ self.uri = uri
19
+ @max_lines = options[:max_lines] || 65000
20
+ @idx = 0
21
+ @book_idx = 0
22
+ @sht_idx = 0
23
+ unless self.exist?
24
+ make_new_book
25
+ make_new_sheet
26
+ else
27
+ get_existing_book
28
+ end
29
+ end
30
+
31
+ def load
32
+ @sheet.map{|row| row.to_a}
33
+ end
34
+
35
+ def dump data
36
+ data.each do |line|
37
+ raise "too many lines" if too_many?
38
+ self << line
39
+ end
40
+ save unless no_data?
41
+ end
42
+
43
+ def << line
44
+ @sheet.row(@sht_row).concat( line )
45
+ @sht_row += 1
46
+ @idx += 1
47
+ end
48
+
49
+ def make_new_book
50
+ @book = Spreadsheet::Workbook.new
51
+ @book_idx += 1
52
+ end
53
+
54
+ def make_new_sheet
55
+ @sheet = @book.create_worksheet
56
+ @sht_idx += 1
57
+ @sht_row = 0 #always start at row 0 in a new sheet
58
+ end
59
+
60
+ def get_existing_book
61
+ @book = Spreadsheet.open path
62
+ @sheet = book.worksheet 0
63
+ @sht_row = @sheet.row_count #would like to be able to dump new data, doesn't work
64
+ @sht_idx += 1
65
+ end
66
+
67
+ def incr_sheet
68
+ @sheet = book.worksheet @sht_idx
69
+ end
70
+
71
+ def too_many?
72
+ @sht_row >= @max_lines
73
+ end
74
+
75
+ def no_data?
76
+ @sht_row == 0
77
+ end
78
+
79
+ def save
80
+ @book.write path
81
+ end
82
+ end
83
+ end
84
+ end
@@ -1,17 +1,3 @@
1
- #
2
- # h2. lib/imw/files/sgml.rb -- SGML files
3
- #
4
- # == About
5
- #
6
- # For SGML-derived files, including XML, HTML, &c..
7
- #
8
- # Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
9
- # Copyright:: Copyright (c) 2008 infochimps.org
10
- # License:: GPL 3.0
11
- # Website:: http://infinitemonkeywrench.org/
12
- #
13
- # puts "#{File.basename(__FILE__)}: Something clever" # at bottom
14
-
15
1
  require 'hpricot'
16
2
  require 'imw/files/text'
17
3
  require 'imw/parsers/html_parser'
@@ -23,21 +9,16 @@ module IMW
23
9
 
24
10
  attr_accessor :doc
25
11
 
26
- def initialize uri, mode='r', options={}
27
- super uri, mode, options
28
- @doc = Hpricot(open(uri))
29
- end
30
-
31
12
  # Delegate to Hpricot
32
13
  def method_missing method, *args, &block
33
14
  @doc.send method, *args, &block
34
15
  end
35
16
 
36
- # Parse this file using the IMW HTMLParser. The parser can
37
- # either be passed in directly or constructed from a passed hash
38
- # of matchers.
17
+ # Parse this file using the IMW::Parsers::HtmlParser. The
18
+ # parser can either be passed in directly or constructed from a
19
+ # passed hash of specs and/or matchers.
39
20
  def parse *args
40
- parser = args.first.is_a?(IMW::HTMLParser) ? args.first : IMW::HTMLParser.new(*args)
21
+ parser = args.first.is_a?(IMW::Parsers::HtmlParser) ? args.first : IMW::Parsers::HtmlParser.new(*args)
41
22
  parser.parse(self)
42
23
  end
43
24
 
data/lib/imw/files.rb CHANGED
@@ -1,22 +1,8 @@
1
- #
2
- # h2. lib/imw/files.rb -- uniform interface to various files
3
- #
4
- # == About
5
- #
6
- # Implements <tt>IMW.open</tt> which returns an appropriate +IMW+
7
- # object given a URI.
8
- #
9
- # Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
10
- # Copyright:: Copyright (c) 2008 infochimps.org
11
- # License:: GPL 3.0
12
- # Website:: http://infinitemonkeywrench.org/
13
- #
14
- # puts "#{File.basename(__FILE__)}: Something clever" # at bottom
15
-
16
1
  require 'uri'
17
2
  require 'open-uri'
18
3
  require 'imw/utils'
19
4
  require 'imw/files/basicfile'
5
+ require 'imw/files/directory'
20
6
  require 'imw/files/archive'
21
7
  require 'imw/files/compressible'
22
8
  require 'imw/files/compressed_file'
@@ -28,13 +14,21 @@ module IMW
28
14
  #
29
15
  # IMW.open("/tmp/test.csv") # => IMW::Files::Csv("/tmp/test.csv")
30
16
  #
31
- #
32
- def self.open path, options = {}
33
- mode = options[:write] ? 'w' : 'r'
34
- Files.file_class_for(path, options).new(path, mode, options)
17
+ #
18
+ def self.open path, options = {}, &block
19
+ if File.directory?(File.expand_path(path))
20
+ dir = Files::Directory.new(path)
21
+ yield dir if block_given?
22
+ dir
23
+ else
24
+ mode = options[:write] ? 'w' : 'r'
25
+ file = Files.file_class_for(path, options).new(path, mode, options)
26
+ yield file if block_given?
27
+ file
28
+ end
35
29
  end
36
30
 
37
- def self.open! path, options = {}
31
+ def self.open! path, options = {}, &block
38
32
  self.open path, options.reverse_merge(:write => true)
39
33
  end
40
34
 
@@ -50,13 +44,14 @@ module IMW
50
44
  autoload :Bz2, 'imw/files/compressed_files_and_archives'
51
45
  autoload :Gz, 'imw/files/compressed_files_and_archives'
52
46
  autoload :Tar, 'imw/files/compressed_files_and_archives'
53
- autoload :TarBz2, 'imw/files/compressed_files_and_archives'
54
- autoload :TarGz, 'imw/files/compressed_files_and_archives'
47
+ autoload :Tarbz2, 'imw/files/compressed_files_and_archives'
48
+ autoload :Targz, 'imw/files/compressed_files_and_archives'
55
49
  autoload :Rar, 'imw/files/compressed_files_and_archives'
56
50
  autoload :Zip, 'imw/files/compressed_files_and_archives'
57
51
  autoload :Xml, 'imw/files/sgml'
58
52
  autoload :Html, 'imw/files/sgml'
59
-
53
+ autoload :Excel, 'imw/files/excel'
54
+
60
55
 
61
56
  # An array used to match files to classes to handle them. The
62
57
  # first element of each array is the regexp and the second names
@@ -70,33 +65,39 @@ module IMW
70
65
  # allows, say, <tt>.tar.gz</tt> to be handled differently from
71
66
  # <tt>.gz</tt>.
72
67
  EXTENSION_HANDLERS = [
73
- [/./, :Text], # catchall
74
- [/\.txt$/, :Text],
75
- [/\.txt$/, :Text],
76
- [/\.dat$/, :Text],
77
- [/\.ascii$/, :Text],
78
- [/\.yaml$/, :Yaml],
79
- [/\.yml$/, :Yaml],
80
- [/\.csv$/, :Csv],
81
- [/\.tsv$/, :Tsv],
82
- [/\.json$/, :Json],
83
- [/\.bz2$/, :Bz2],
84
- [/\.gz$/, :Gz],
85
- [/\.tar\.bz2$/, :TarBz2],
86
- [/\.tbz2$/, :TarBz2],
87
- [/\.tar\.gz$/, :TarGz],
88
- [/\.tgz$/, :TarGz],
89
- [/\.tar$/, :Tar],
90
- [/\.rar$/, :Rar],
91
- [/\.zip$/, :Zip],
92
- [/\.xml$/, :Xml],
93
- [/\.html$/, :Html],
94
- [/\.htm$/, :Html]
68
+ [/\.txt$/, :text],
69
+ [/\.txt$/, :text],
70
+ [/\.dat$/, :text],
71
+ [/\.ascii$/, :text],
72
+ [/\.yaml$/, :yaml],
73
+ [/\.yml$/, :yaml],
74
+ [/\.csv$/, :csv],
75
+ [/\.tsv$/, :tsv],
76
+ [/\.json$/, :json],
77
+ [/\.bz2$/, :bz2],
78
+ [/\.gz$/, :gz],
79
+ [/\.tar\.bz2$/, :tarbz2],
80
+ [/\.tbz2$/, :tarbz2],
81
+ [/\.tar\.gz$/, :targz],
82
+ [/\.tgz$/, :targz],
83
+ [/\.tar$/, :tar],
84
+ [/\.rar$/, :rar],
85
+ [/\.zip$/, :zip],
86
+ [/\.xml$/, :xml],
87
+ [/\.html$/, :html],
88
+ [/\.htm$/, :html],
89
+ [/\.xlsx?$/, :excel]
95
90
  ]
96
-
91
+
92
+ SCHEME_HANDLERS = [
93
+ [/http/, :html]
94
+ ]
95
+
97
96
  protected
98
97
  def self.file_class_for path, options = {}
99
98
  klass = options.delete(:as)
99
+
100
+ # try to choose klass from path extension if not already set
100
101
  unless klass
101
102
  EXTENSION_HANDLERS.reverse_each do |regexp, thing| # end has greater precedence
102
103
  next unless regexp =~ path
@@ -104,7 +105,21 @@ module IMW
104
105
  break
105
106
  end
106
107
  end
107
- klass.is_a?(Class) ? klass : class_eval(klass.to_s)
108
+
109
+ # try to choose klass from uri scheme if not already set
110
+ unless klass
111
+ scheme = URI.parse(path).scheme
112
+ SCHEME_HANDLERS.reverse_each do |regexp, thing| # end has greater precedence
113
+ next unless regexp =~ scheme
114
+ klass = thing
115
+ break
116
+ end
117
+ end
118
+
119
+ # just stick with text if still not set
120
+ klass = :text unless klass
121
+
122
+ klass.is_a?(Class) ? klass : class_eval(klass.to_s.downcase.capitalize)
108
123
  end
109
124
  end
110
125
  end