imw 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +194 -31
- data/VERSION +1 -1
- data/bin/imw +5 -0
- data/lib/imw/boot.rb +0 -15
- data/lib/imw/dataset/paths.rb +38 -0
- data/lib/imw/dataset/task.rb +21 -18
- data/lib/imw/dataset/workflow.rb +126 -65
- data/lib/imw/dataset.rb +56 -82
- data/lib/imw/files/basicfile.rb +3 -3
- data/lib/imw/files/compressed_files_and_archives.rb +23 -37
- data/lib/imw/files/csv.rb +2 -1
- data/lib/imw/files/directory.rb +62 -0
- data/lib/imw/files/excel.rb +84 -0
- data/lib/imw/files/sgml.rb +4 -23
- data/lib/imw/files.rb +62 -47
- data/lib/imw/packagers/archiver.rb +19 -1
- data/lib/imw/packagers/s3_mover.rb +8 -0
- data/lib/imw/parsers/html_parser/matchers.rb +251 -268
- data/lib/imw/parsers/html_parser.rb +181 -176
- data/lib/imw/parsers.rb +1 -1
- data/lib/imw/repository.rb +35 -0
- data/lib/imw/runner.rb +114 -0
- data/lib/imw/utils/extensions/core.rb +0 -16
- data/lib/imw/utils/paths.rb +0 -28
- data/lib/imw.rb +21 -32
- metadata +11 -19
- data/lib/imw/dataset/datamapper/time_and_user_stamps.rb +0 -37
- data/lib/imw/dataset/datamapper.rb +0 -66
- data/lib/imw/dataset/loaddump.rb +0 -50
- data/lib/imw/dataset/old/file_collection.rb +0 -88
- data/lib/imw/dataset/old/file_collection_utils.rb +0 -71
- data/lib/imw/dataset/scaffold.rb +0 -132
- data/lib/imw/dataset/scraped_uri.rb +0 -305
- data/lib/imw/dataset/scrub/old_working_scrubber.rb +0 -87
- data/lib/imw/dataset/scrub/scrub.rb +0 -147
- data/lib/imw/dataset/scrub/scrub_simple_url.rb +0 -38
- data/lib/imw/dataset/scrub/scrub_test.rb +0 -60
- data/lib/imw/dataset/scrub/slug.rb +0 -101
- data/lib/imw/dataset/stats/counter.rb +0 -23
- data/lib/imw/dataset/stats.rb +0 -73
data/lib/imw/dataset.rb
CHANGED
@@ -1,50 +1,64 @@
|
|
1
|
-
#
|
2
|
-
# h2. lib/imw/dataset.rb -- imw dataset
|
3
|
-
#
|
4
|
-
# == About
|
5
|
-
#
|
6
|
-
# Defines basic properties of the <tt>IMW::Dataset</tt>
|
7
|
-
#
|
8
|
-
# Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
|
9
|
-
# Copyright:: Copyright (c) 2008 infochimps.org
|
10
|
-
# License:: GPL 3.0
|
11
|
-
# Website:: http://infinitemonkeywrench.org/
|
12
|
-
#
|
13
|
-
# puts "#{File.basename(__FILE__)}: You use your Monkeywrench to rake deep and straight furrows in the earth for your orchard." # at bottom
|
14
|
-
|
15
|
-
require 'rake'
|
16
|
-
require 'ostruct'
|
17
|
-
|
18
1
|
require 'imw/utils'
|
19
2
|
require 'imw/dataset/workflow'
|
20
|
-
require 'imw/dataset/
|
21
|
-
require 'imw/dataset/stats'
|
3
|
+
require 'imw/dataset/paths'
|
22
4
|
|
23
5
|
module IMW
|
24
6
|
|
25
|
-
# The
|
26
|
-
#
|
27
|
-
#
|
28
|
-
#
|
7
|
+
# The IMW::Dataset class is useful for organizing a complex data
|
8
|
+
# transformation because it is capable of managing a collection of
|
9
|
+
# paths and the interdependencies between subparts of the
|
10
|
+
# transformation.
|
11
|
+
#
|
12
|
+
# == Manipulating Paths
|
13
|
+
#
|
14
|
+
# Storing paths makes code shorter and more readable. By default
|
15
|
+
# (this assumes the executing script is in a file
|
16
|
+
# /home/imw_user/data/foo.rb):
|
17
|
+
#
|
18
|
+
# dataset = IMW::Dataset.new
|
19
|
+
# dataset.path_to(:self)
|
20
|
+
# #=> '/home/imw_user/data'
|
21
|
+
# dataset.path_to(:ripd)
|
22
|
+
# #=> '/home/imw_user/data/ripd'
|
23
|
+
# dataset.path_to(:pkgd, 'final.tar.gz')
|
24
|
+
# #=> '/home/imw_user/data/pkgd/final.tar.gz'
|
25
|
+
#
|
26
|
+
# Paths can be added
|
27
|
+
#
|
28
|
+
# dataset.add_path(:sorted_output, :mungd, 'sorted-file-3923.txt')
|
29
|
+
# dataset.path_to(:sorted_output)
|
30
|
+
# #=> '/home/imw_user/data/mungd/sorted-file-3923.txt'
|
31
|
+
#
|
32
|
+
# as well as removed (via +remove_path+).
|
33
|
+
#
|
34
|
+
# == Defining Workflows
|
35
|
+
#
|
36
|
+
# IMW encourages you to think of transforming data as a network of
|
37
|
+
# interdependent steps (see IMW::Workflow). Each of IMW's five
|
38
|
+
# default steps maps to a named directory remembered by each
|
39
|
+
# dataset.
|
29
40
|
#
|
30
|
-
#
|
31
|
-
#
|
41
|
+
# The following example shows why this is a useful abstraction as
|
42
|
+
# well as illustrating some of the other functionality in IMW.
|
32
43
|
#
|
33
|
-
#
|
44
|
+
# == Example Dataset
|
34
45
|
#
|
35
|
-
#
|
36
|
-
#
|
37
|
-
#
|
38
|
-
#
|
46
|
+
# The first step is to import IMW and create the dataset
|
47
|
+
#
|
48
|
+
# require 'rubygems'
|
49
|
+
# require 'imw'
|
50
|
+
# dataset = IMW::Dataset.new
|
51
|
+
#
|
52
|
+
# You can pass in a handle (the name or "slug" for the dataset) as
|
53
|
+
# well as some options. Now define the steps you intend to take to
|
54
|
+
# complete the transformation:
|
39
55
|
#
|
40
56
|
# rip::
|
41
|
-
#
|
42
|
-
#
|
43
|
-
# subdirectory of the <tt>:ripd</tt> directory named for the URI
|
44
|
-
# of the source.
|
57
|
+
# Data is collected from a source (+http+, +ftp+, database, &c.)
|
58
|
+
# and deposited in the <tt>:ripd</tt> directory of this dataset.
|
45
59
|
#
|
46
60
|
# dataset.task :rip do
|
47
|
-
# IMW
|
61
|
+
# IMW.open('http://econ.chimpu.edu/datasets/produce_prices.tar.bz2').cp_to_dir(dataset.path_to(:ripd))
|
48
62
|
# #=> [ripd]/http/econ_chimpu_edu/datasets/produce_prices.tar.bz2
|
49
63
|
#
|
50
64
|
# IMW::Rip.from_database :named => "weather_records",
|
@@ -174,59 +188,19 @@ module IMW
|
|
174
188
|
# framework in which to program.
|
175
189
|
class Dataset
|
176
190
|
|
177
|
-
# The <tt>Rake::TaskManager</tt> module allows the
|
178
|
-
# <tt>IMW::Dataset</tt> class to leverage the functionality of the
|
179
|
-
# Rake[http://rake.rubyforge.org/] library to manage tasks
|
180
|
-
# associated with the processing of this dataset.
|
181
|
-
include Rake::TaskManager
|
182
|
-
|
183
191
|
# The <tt>IMW::Workflow</tt> module contains pre-defined tasks for
|
184
192
|
# dataset processing.
|
185
193
|
include IMW::Workflow
|
186
194
|
|
187
|
-
|
188
|
-
attr_accessor :data
|
189
|
-
|
190
|
-
# The default taxon assigned to a dataset.
|
191
|
-
DEFAULT_TAXON = nil
|
192
|
-
|
193
|
-
# Default options passed to <tt>Rake</tt>. Any class including
|
194
|
-
# the <tt>Rake::TaskManager</tt> module must define a constant by
|
195
|
-
# this name.
|
196
|
-
DEFAULT_OPTIONS = {
|
197
|
-
:dry_run => false,
|
198
|
-
:trace => false,
|
199
|
-
:verbose => false
|
200
|
-
}
|
195
|
+
attr_accessor :handle, :options, :data
|
201
196
|
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
# argument is interpreted.
|
208
|
-
def initialize handle, options = {}
|
209
|
-
options = options.reverse_merge :taxon => DEFAULT_TAXON
|
210
|
-
|
211
|
-
# FIXME is this how the attribute writer functions should be
|
212
|
-
# called?
|
213
|
-
@handle = handle
|
214
|
-
@taxon = options[:taxon]
|
215
|
-
|
216
|
-
# for rake
|
217
|
-
@tasks = Hash.new
|
218
|
-
@rules = Array.new
|
219
|
-
@scope = Array.new
|
220
|
-
@last_description = nil
|
221
|
-
@options = OpenStruct.new(DEFAULT_OPTIONS)
|
222
|
-
create_default_tasks
|
223
|
-
|
224
|
-
# sets an empty @paths hash; see utils/paths.rb
|
197
|
+
def initialize options = {}
|
198
|
+
@options = options
|
199
|
+
@handle = options[:handle]
|
200
|
+
initialize_workflow
|
201
|
+
set_root_paths
|
225
202
|
set_paths
|
226
|
-
|
227
|
-
|
228
|
-
def handle= thing
|
229
|
-
@handle = thing.is_a?(String) ? thing.to_handle : thing
|
203
|
+
set_tasks
|
230
204
|
end
|
231
205
|
|
232
206
|
end
|
data/lib/imw/files/basicfile.rb
CHANGED
@@ -20,7 +20,7 @@ module IMW
|
|
20
20
|
protected
|
21
21
|
|
22
22
|
def uri= uri
|
23
|
-
@uri =
|
23
|
+
@uri = uri.is_a?(URI::Generic) ? uri : URI.parse(uri)
|
24
24
|
@host = self.uri.host
|
25
25
|
@path = local? ? ::File.expand_path(self.uri.path) : self.uri.path
|
26
26
|
@dirname = ::File.dirname path
|
@@ -53,7 +53,7 @@ module IMW
|
|
53
53
|
# path as a first argument.
|
54
54
|
[:executable?, :executable_real?, :file?, :directory?, :ftype, :owned?, :pipe?, :readable?, :readable_real?, :setgid?, :setuid?, :size, :size?, :socket?, :split, :stat, :sticky?, :writable?, :writable_real?, :zero?].each do |class_method|
|
55
55
|
define_method class_method do
|
56
|
-
File.send(class_method, path)
|
56
|
+
File.send(class_method, path)
|
57
57
|
end
|
58
58
|
end
|
59
59
|
|
@@ -61,7 +61,7 @@ module IMW
|
|
61
61
|
# to open files online too to check.
|
62
62
|
def exist?
|
63
63
|
if local?
|
64
|
-
::File.exist?(path)
|
64
|
+
::File.exist?(path)
|
65
65
|
else
|
66
66
|
begin
|
67
67
|
true if open(uri)
|
@@ -1,17 +1,3 @@
|
|
1
|
-
#
|
2
|
-
# h2. lib/imw/files/compressed_files_and_archives.rb -- require farm
|
3
|
-
#
|
4
|
-
# == About
|
5
|
-
#
|
6
|
-
# Just required all the archive and compressed formats (+tar+, +bz2+,
|
7
|
-
# &c.)
|
8
|
-
#
|
9
|
-
# Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
|
10
|
-
# Copyright:: Copyright (c) 2008 infochimps.org
|
11
|
-
# License:: GPL 3.0
|
12
|
-
# Website:: http://infinitemonkeywrench.org/
|
13
|
-
#
|
14
|
-
# puts "#{File.basename(__FILE__)}: Something clever" # at bottom
|
15
1
|
module IMW
|
16
2
|
module Files
|
17
3
|
|
@@ -29,9 +15,9 @@ module IMW
|
|
29
15
|
# The default flags used when creating, appending to, listing, and
|
30
16
|
# extracting a tar archive.
|
31
17
|
DEFAULT_FLAGS = {
|
32
|
-
:create
|
33
|
-
:append
|
34
|
-
:list
|
18
|
+
:create => "-cf",
|
19
|
+
:append => "-rf",
|
20
|
+
:list => "-tf",
|
35
21
|
:extract => "-xf",
|
36
22
|
:program => :tar
|
37
23
|
}
|
@@ -39,10 +25,10 @@ module IMW
|
|
39
25
|
def initialize uri, *args
|
40
26
|
self.uri= uri
|
41
27
|
@archive = {
|
42
|
-
:program
|
43
|
-
:create_flags
|
44
|
-
:append_flags
|
45
|
-
:list_flags
|
28
|
+
:program => DEFAULT_FLAGS[:program],
|
29
|
+
:create_flags => DEFAULT_FLAGS[:create],
|
30
|
+
:append_flags => DEFAULT_FLAGS[:append],
|
31
|
+
:list_flags => DEFAULT_FLAGS[:list],
|
46
32
|
:extract_flags => DEFAULT_FLAGS[:extract]
|
47
33
|
}
|
48
34
|
end
|
@@ -51,9 +37,9 @@ module IMW
|
|
51
37
|
# A class to wrap a <tt>tar.gz</tt> archive.
|
52
38
|
#
|
53
39
|
# Creation, appending, listing, and extraction flags are stored in
|
54
|
-
# <tt>IMW::Files::
|
40
|
+
# <tt>IMW::Files::Targz::DEFAULT_FLAGS</tt> and all are passed to
|
55
41
|
# the <tt>:tar</tt> program in <tt>IMW::EXTERAL_PROGRAMS</tt>.
|
56
|
-
class
|
42
|
+
class Targz
|
57
43
|
|
58
44
|
include IMW::Files::BasicFile
|
59
45
|
include IMW::Files::Archive
|
@@ -63,21 +49,21 @@ module IMW
|
|
63
49
|
# extracting a <tt>tar.gz</tt> archive.
|
64
50
|
DEFAULT_FLAGS = {
|
65
51
|
:decompression_program => :gzip,
|
66
|
-
:decompression_flags
|
67
|
-
:archive_program
|
68
|
-
:archive_list_flags
|
52
|
+
:decompression_flags => '-fd',
|
53
|
+
:archive_program => :tar,
|
54
|
+
:archive_list_flags => "-tf",
|
69
55
|
:archive_extract_flags => "-xzf"
|
70
56
|
}
|
71
57
|
|
72
58
|
def initialize uri, *args
|
73
59
|
self.uri= uri
|
74
60
|
@compression = {
|
75
|
-
:program
|
61
|
+
:program => DEFAULT_FLAGS[:decompression_program],
|
76
62
|
:decompression_flags => DEFAULT_FLAGS[:decompression_flags]
|
77
63
|
}
|
78
64
|
@archive = {
|
79
|
-
:program
|
80
|
-
:list_flags
|
65
|
+
:program => DEFAULT_FLAGS[:archive_program],
|
66
|
+
:list_flags => DEFAULT_FLAGS[:archive_list_flags],
|
81
67
|
:extract_flags => DEFAULT_FLAGS[:archive_extract_flags]
|
82
68
|
}
|
83
69
|
end
|
@@ -99,14 +85,14 @@ module IMW
|
|
99
85
|
end
|
100
86
|
end
|
101
87
|
|
102
|
-
end #
|
88
|
+
end # Targz
|
103
89
|
|
104
90
|
# A class to wrap a <tt>tar.bz2</tt> archive.
|
105
91
|
#
|
106
92
|
# Creation, appending, listing, and extraction flags are stored in
|
107
|
-
# <tt>IMW::Files::
|
93
|
+
# <tt>IMW::Files::Tarbz2::DEFAULT_FLAGS</tt> and all are passed to
|
108
94
|
# the <tt>:tar</tt> program in <tt>IMW::EXTERAL_PROGRAMS</tt>.
|
109
|
-
class
|
95
|
+
class Tarbz2
|
110
96
|
|
111
97
|
include IMW::Files::BasicFile
|
112
98
|
include IMW::Files::Archive
|
@@ -169,7 +155,7 @@ module IMW
|
|
169
155
|
File.join(dirname,name + '.tar')
|
170
156
|
end
|
171
157
|
|
172
|
-
end #
|
158
|
+
end # Tarbz2
|
173
159
|
|
174
160
|
# A class to wrap a +rar+ archive.
|
175
161
|
#
|
@@ -330,11 +316,11 @@ module IMW
|
|
330
316
|
|
331
317
|
|
332
318
|
# make sure that tar.bz2 precedes bz2 and so on...
|
333
|
-
FILE_REGEXPS << [/\.tar\.bz2$/, IMW::Files::
|
334
|
-
FILE_REGEXPS << [/\.tbz2$/, IMW::Files::
|
319
|
+
FILE_REGEXPS << [/\.tar\.bz2$/, IMW::Files::Tarbz2]
|
320
|
+
FILE_REGEXPS << [/\.tbz2$/, IMW::Files::Tarbz2]
|
335
321
|
|
336
|
-
FILE_REGEXPS << [/\.tar\.gz$/, IMW::Files::
|
337
|
-
FILE_REGEXPS << [/\.tgz$/, IMW::Files::
|
322
|
+
FILE_REGEXPS << [/\.tar\.gz$/, IMW::Files::Targz]
|
323
|
+
FILE_REGEXPS << [/\.tgz$/, IMW::Files::Targz]
|
338
324
|
|
339
325
|
FILE_REGEXPS << [/\.tar$/, IMW::Files::Tar]
|
340
326
|
FILE_REGEXPS << [/\.bz2$/, IMW::Files::Bz2]
|
data/lib/imw/files/csv.rb
CHANGED
@@ -39,7 +39,8 @@ module IMW
|
|
39
39
|
def initialize uri, mode='r', options = {}
|
40
40
|
options.reverse_merge!(self.class::DEFAULT_OPTIONS)
|
41
41
|
self.uri= uri
|
42
|
-
|
42
|
+
options.delete(:write) # FasterCSV complains about unknown options
|
43
|
+
super open(uri,mode), options
|
43
44
|
end
|
44
45
|
|
45
46
|
# Return the contents of this CSV file as an array of arrays.
|
@@ -0,0 +1,62 @@
|
|
1
|
+
require 'imw/files/basicfile'
|
2
|
+
module IMW
|
3
|
+
module Files
|
4
|
+
class Directory
|
5
|
+
|
6
|
+
include IMW::Files::BasicFile
|
7
|
+
|
8
|
+
# FIXME these should be defined by BasicFile and then removed here but I don't see how...
|
9
|
+
# [:executable?, :executable_real?, :pipe?, :socket?, :rm, :rm!, :extname, :extname=, :name, :name=].each do |method|
|
10
|
+
# instance_eval do
|
11
|
+
# remove_method method
|
12
|
+
# end
|
13
|
+
# end
|
14
|
+
|
15
|
+
def uri= uri
|
16
|
+
@uri = uri.is_a?(URI::Generic) ? uri : URI.parse(uri)
|
17
|
+
@host = self.uri.host
|
18
|
+
@path = local? ? ::File.expand_path(self.uri.path) : self.uri.path
|
19
|
+
@dirname = ::File.dirname path
|
20
|
+
@basename = ::File.basename path
|
21
|
+
end
|
22
|
+
|
23
|
+
def initialize uri
|
24
|
+
self.uri = uri
|
25
|
+
end
|
26
|
+
|
27
|
+
def [] selector='*'
|
28
|
+
Dir[File.join(path, selector)] if local?
|
29
|
+
end
|
30
|
+
|
31
|
+
# Copy the contents of this directory to +new_dir+.
|
32
|
+
def cp new_dir
|
33
|
+
raise IMW::PathError.new("cannot copy from #{path}, doesn't exist!") unless exist?
|
34
|
+
if local?
|
35
|
+
FileUtils.cp_r path, new_dir
|
36
|
+
else
|
37
|
+
raise IMW::PathError.new("cannot recursively copy remote directories (yet!)")
|
38
|
+
end
|
39
|
+
self.class.new(new_dir)
|
40
|
+
end
|
41
|
+
|
42
|
+
# Move this directory to +new_dir+.
|
43
|
+
def mv new_dir
|
44
|
+
raise IMW::PathError.new("cannot move from #{path}, doesn't exist!") unless exist?
|
45
|
+
if local?
|
46
|
+
FileUtils.mv path, new_dir
|
47
|
+
else
|
48
|
+
raise IMW::PathError.new("cannot move remote directories (yet!)")
|
49
|
+
end
|
50
|
+
self.class.new(new_dir)
|
51
|
+
end
|
52
|
+
alias_method :mv!, :mv
|
53
|
+
|
54
|
+
# Move this directory so it sits beneath +dir+.
|
55
|
+
def mv_to_dir dir
|
56
|
+
mv File.join(File.expand_path(dir),basename)
|
57
|
+
end
|
58
|
+
alias_method :mv_to_dir!, :mv_to_dir
|
59
|
+
|
60
|
+
end
|
61
|
+
end
|
62
|
+
end
|
@@ -0,0 +1,84 @@
|
|
1
|
+
require 'spreadsheet'
|
2
|
+
|
3
|
+
# FIXME Main issue with this:
|
4
|
+
# You can make a new excel book and dump data to it no problem.
|
5
|
+
# However, something that doesn't seem to work is dumping to a file, opening,
|
6
|
+
# and dumping to it again. At the moment this is probably not a big deal.
|
7
|
+
|
8
|
+
|
9
|
+
module IMW
|
10
|
+
module Files
|
11
|
+
class Excel
|
12
|
+
include IMW::Files::BasicFile
|
13
|
+
include IMW::Files::Compressible
|
14
|
+
|
15
|
+
#need to initialize, load, and dump
|
16
|
+
attr_accessor :book,:idx, :max_lines, :sht_idx, :sht_row, :book_idx
|
17
|
+
def initialize uri, mode, options={}
|
18
|
+
self.uri = uri
|
19
|
+
@max_lines = options[:max_lines] || 65000
|
20
|
+
@idx = 0
|
21
|
+
@book_idx = 0
|
22
|
+
@sht_idx = 0
|
23
|
+
unless self.exist?
|
24
|
+
make_new_book
|
25
|
+
make_new_sheet
|
26
|
+
else
|
27
|
+
get_existing_book
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
def load
|
32
|
+
@sheet.map{|row| row.to_a}
|
33
|
+
end
|
34
|
+
|
35
|
+
def dump data
|
36
|
+
data.each do |line|
|
37
|
+
raise "too many lines" if too_many?
|
38
|
+
self << line
|
39
|
+
end
|
40
|
+
save unless no_data?
|
41
|
+
end
|
42
|
+
|
43
|
+
def << line
|
44
|
+
@sheet.row(@sht_row).concat( line )
|
45
|
+
@sht_row += 1
|
46
|
+
@idx += 1
|
47
|
+
end
|
48
|
+
|
49
|
+
def make_new_book
|
50
|
+
@book = Spreadsheet::Workbook.new
|
51
|
+
@book_idx += 1
|
52
|
+
end
|
53
|
+
|
54
|
+
def make_new_sheet
|
55
|
+
@sheet = @book.create_worksheet
|
56
|
+
@sht_idx += 1
|
57
|
+
@sht_row = 0 #always start at row 0 in a new sheet
|
58
|
+
end
|
59
|
+
|
60
|
+
def get_existing_book
|
61
|
+
@book = Spreadsheet.open path
|
62
|
+
@sheet = book.worksheet 0
|
63
|
+
@sht_row = @sheet.row_count #would like to be able to dump new data, doesn't work
|
64
|
+
@sht_idx += 1
|
65
|
+
end
|
66
|
+
|
67
|
+
def incr_sheet
|
68
|
+
@sheet = book.worksheet @sht_idx
|
69
|
+
end
|
70
|
+
|
71
|
+
def too_many?
|
72
|
+
@sht_row >= @max_lines
|
73
|
+
end
|
74
|
+
|
75
|
+
def no_data?
|
76
|
+
@sht_row == 0
|
77
|
+
end
|
78
|
+
|
79
|
+
def save
|
80
|
+
@book.write path
|
81
|
+
end
|
82
|
+
end
|
83
|
+
end
|
84
|
+
end
|
data/lib/imw/files/sgml.rb
CHANGED
@@ -1,17 +1,3 @@
|
|
1
|
-
#
|
2
|
-
# h2. lib/imw/files/sgml.rb -- SGML files
|
3
|
-
#
|
4
|
-
# == About
|
5
|
-
#
|
6
|
-
# For SGML-derived files, including XML, HTML, &c..
|
7
|
-
#
|
8
|
-
# Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
|
9
|
-
# Copyright:: Copyright (c) 2008 infochimps.org
|
10
|
-
# License:: GPL 3.0
|
11
|
-
# Website:: http://infinitemonkeywrench.org/
|
12
|
-
#
|
13
|
-
# puts "#{File.basename(__FILE__)}: Something clever" # at bottom
|
14
|
-
|
15
1
|
require 'hpricot'
|
16
2
|
require 'imw/files/text'
|
17
3
|
require 'imw/parsers/html_parser'
|
@@ -23,21 +9,16 @@ module IMW
|
|
23
9
|
|
24
10
|
attr_accessor :doc
|
25
11
|
|
26
|
-
def initialize uri, mode='r', options={}
|
27
|
-
super uri, mode, options
|
28
|
-
@doc = Hpricot(open(uri))
|
29
|
-
end
|
30
|
-
|
31
12
|
# Delegate to Hpricot
|
32
13
|
def method_missing method, *args, &block
|
33
14
|
@doc.send method, *args, &block
|
34
15
|
end
|
35
16
|
|
36
|
-
# Parse this file using the IMW
|
37
|
-
# either be passed in directly or constructed from a
|
38
|
-
# of matchers.
|
17
|
+
# Parse this file using the IMW::Parsers::HtmlParser. The
|
18
|
+
# parser can either be passed in directly or constructed from a
|
19
|
+
# passed hash of specs and/or matchers.
|
39
20
|
def parse *args
|
40
|
-
parser = args.first.is_a?(IMW::
|
21
|
+
parser = args.first.is_a?(IMW::Parsers::HtmlParser) ? args.first : IMW::Parsers::HtmlParser.new(*args)
|
41
22
|
parser.parse(self)
|
42
23
|
end
|
43
24
|
|
data/lib/imw/files.rb
CHANGED
@@ -1,22 +1,8 @@
|
|
1
|
-
#
|
2
|
-
# h2. lib/imw/files.rb -- uniform interface to various files
|
3
|
-
#
|
4
|
-
# == About
|
5
|
-
#
|
6
|
-
# Implements <tt>IMW.open</tt> which returns an appropriate +IMW+
|
7
|
-
# object given a URI.
|
8
|
-
#
|
9
|
-
# Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
|
10
|
-
# Copyright:: Copyright (c) 2008 infochimps.org
|
11
|
-
# License:: GPL 3.0
|
12
|
-
# Website:: http://infinitemonkeywrench.org/
|
13
|
-
#
|
14
|
-
# puts "#{File.basename(__FILE__)}: Something clever" # at bottom
|
15
|
-
|
16
1
|
require 'uri'
|
17
2
|
require 'open-uri'
|
18
3
|
require 'imw/utils'
|
19
4
|
require 'imw/files/basicfile'
|
5
|
+
require 'imw/files/directory'
|
20
6
|
require 'imw/files/archive'
|
21
7
|
require 'imw/files/compressible'
|
22
8
|
require 'imw/files/compressed_file'
|
@@ -28,13 +14,21 @@ module IMW
|
|
28
14
|
#
|
29
15
|
# IMW.open("/tmp/test.csv") # => IMW::Files::Csv("/tmp/test.csv")
|
30
16
|
#
|
31
|
-
#
|
32
|
-
def self.open path, options = {}
|
33
|
-
|
34
|
-
|
17
|
+
#
|
18
|
+
def self.open path, options = {}, &block
|
19
|
+
if File.directory?(File.expand_path(path))
|
20
|
+
dir = Files::Directory.new(path)
|
21
|
+
yield dir if block_given?
|
22
|
+
dir
|
23
|
+
else
|
24
|
+
mode = options[:write] ? 'w' : 'r'
|
25
|
+
file = Files.file_class_for(path, options).new(path, mode, options)
|
26
|
+
yield file if block_given?
|
27
|
+
file
|
28
|
+
end
|
35
29
|
end
|
36
30
|
|
37
|
-
def self.open! path, options = {}
|
31
|
+
def self.open! path, options = {}, &block
|
38
32
|
self.open path, options.reverse_merge(:write => true)
|
39
33
|
end
|
40
34
|
|
@@ -50,13 +44,14 @@ module IMW
|
|
50
44
|
autoload :Bz2, 'imw/files/compressed_files_and_archives'
|
51
45
|
autoload :Gz, 'imw/files/compressed_files_and_archives'
|
52
46
|
autoload :Tar, 'imw/files/compressed_files_and_archives'
|
53
|
-
autoload :
|
54
|
-
autoload :
|
47
|
+
autoload :Tarbz2, 'imw/files/compressed_files_and_archives'
|
48
|
+
autoload :Targz, 'imw/files/compressed_files_and_archives'
|
55
49
|
autoload :Rar, 'imw/files/compressed_files_and_archives'
|
56
50
|
autoload :Zip, 'imw/files/compressed_files_and_archives'
|
57
51
|
autoload :Xml, 'imw/files/sgml'
|
58
52
|
autoload :Html, 'imw/files/sgml'
|
59
|
-
|
53
|
+
autoload :Excel, 'imw/files/excel'
|
54
|
+
|
60
55
|
|
61
56
|
# An array used to match files to classes to handle them. The
|
62
57
|
# first element of each array is the regexp and the second names
|
@@ -70,33 +65,39 @@ module IMW
|
|
70
65
|
# allows, say, <tt>.tar.gz</tt> to be handled differently from
|
71
66
|
# <tt>.gz</tt>.
|
72
67
|
EXTENSION_HANDLERS = [
|
73
|
-
[
|
74
|
-
[/\.txt$/, :
|
75
|
-
[/\.
|
76
|
-
[/\.
|
77
|
-
[/\.
|
78
|
-
[/\.
|
79
|
-
[/\.
|
80
|
-
[/\.
|
81
|
-
[/\.
|
82
|
-
[/\.
|
83
|
-
[/\.
|
84
|
-
[/\.
|
85
|
-
[/\.
|
86
|
-
[/\.
|
87
|
-
[/\.
|
88
|
-
[/\.
|
89
|
-
[/\.
|
90
|
-
[/\.
|
91
|
-
[/\.
|
92
|
-
[/\.
|
93
|
-
[/\.
|
94
|
-
[/\.
|
68
|
+
[/\.txt$/, :text],
|
69
|
+
[/\.txt$/, :text],
|
70
|
+
[/\.dat$/, :text],
|
71
|
+
[/\.ascii$/, :text],
|
72
|
+
[/\.yaml$/, :yaml],
|
73
|
+
[/\.yml$/, :yaml],
|
74
|
+
[/\.csv$/, :csv],
|
75
|
+
[/\.tsv$/, :tsv],
|
76
|
+
[/\.json$/, :json],
|
77
|
+
[/\.bz2$/, :bz2],
|
78
|
+
[/\.gz$/, :gz],
|
79
|
+
[/\.tar\.bz2$/, :tarbz2],
|
80
|
+
[/\.tbz2$/, :tarbz2],
|
81
|
+
[/\.tar\.gz$/, :targz],
|
82
|
+
[/\.tgz$/, :targz],
|
83
|
+
[/\.tar$/, :tar],
|
84
|
+
[/\.rar$/, :rar],
|
85
|
+
[/\.zip$/, :zip],
|
86
|
+
[/\.xml$/, :xml],
|
87
|
+
[/\.html$/, :html],
|
88
|
+
[/\.htm$/, :html],
|
89
|
+
[/\.xlsx?$/, :excel]
|
95
90
|
]
|
96
|
-
|
91
|
+
|
92
|
+
SCHEME_HANDLERS = [
|
93
|
+
[/http/, :html]
|
94
|
+
]
|
95
|
+
|
97
96
|
protected
|
98
97
|
def self.file_class_for path, options = {}
|
99
98
|
klass = options.delete(:as)
|
99
|
+
|
100
|
+
# try to choose klass from path extension if not already set
|
100
101
|
unless klass
|
101
102
|
EXTENSION_HANDLERS.reverse_each do |regexp, thing| # end has greater precedence
|
102
103
|
next unless regexp =~ path
|
@@ -104,7 +105,21 @@ module IMW
|
|
104
105
|
break
|
105
106
|
end
|
106
107
|
end
|
107
|
-
|
108
|
+
|
109
|
+
# try to choose klass from uri scheme if not already set
|
110
|
+
unless klass
|
111
|
+
scheme = URI.parse(path).scheme
|
112
|
+
SCHEME_HANDLERS.reverse_each do |regexp, thing| # end has greater precedence
|
113
|
+
next unless regexp =~ scheme
|
114
|
+
klass = thing
|
115
|
+
break
|
116
|
+
end
|
117
|
+
end
|
118
|
+
|
119
|
+
# just stick with text if still not set
|
120
|
+
klass = :text unless klass
|
121
|
+
|
122
|
+
klass.is_a?(Class) ? klass : class_eval(klass.to_s.downcase.capitalize)
|
108
123
|
end
|
109
124
|
end
|
110
125
|
end
|