imw 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +194 -31
- data/VERSION +1 -1
- data/bin/imw +5 -0
- data/lib/imw/boot.rb +0 -15
- data/lib/imw/dataset/paths.rb +38 -0
- data/lib/imw/dataset/task.rb +21 -18
- data/lib/imw/dataset/workflow.rb +126 -65
- data/lib/imw/dataset.rb +56 -82
- data/lib/imw/files/basicfile.rb +3 -3
- data/lib/imw/files/compressed_files_and_archives.rb +23 -37
- data/lib/imw/files/csv.rb +2 -1
- data/lib/imw/files/directory.rb +62 -0
- data/lib/imw/files/excel.rb +84 -0
- data/lib/imw/files/sgml.rb +4 -23
- data/lib/imw/files.rb +62 -47
- data/lib/imw/packagers/archiver.rb +19 -1
- data/lib/imw/packagers/s3_mover.rb +8 -0
- data/lib/imw/parsers/html_parser/matchers.rb +251 -268
- data/lib/imw/parsers/html_parser.rb +181 -176
- data/lib/imw/parsers.rb +1 -1
- data/lib/imw/repository.rb +35 -0
- data/lib/imw/runner.rb +114 -0
- data/lib/imw/utils/extensions/core.rb +0 -16
- data/lib/imw/utils/paths.rb +0 -28
- data/lib/imw.rb +21 -32
- metadata +11 -19
- data/lib/imw/dataset/datamapper/time_and_user_stamps.rb +0 -37
- data/lib/imw/dataset/datamapper.rb +0 -66
- data/lib/imw/dataset/loaddump.rb +0 -50
- data/lib/imw/dataset/old/file_collection.rb +0 -88
- data/lib/imw/dataset/old/file_collection_utils.rb +0 -71
- data/lib/imw/dataset/scaffold.rb +0 -132
- data/lib/imw/dataset/scraped_uri.rb +0 -305
- data/lib/imw/dataset/scrub/old_working_scrubber.rb +0 -87
- data/lib/imw/dataset/scrub/scrub.rb +0 -147
- data/lib/imw/dataset/scrub/scrub_simple_url.rb +0 -38
- data/lib/imw/dataset/scrub/scrub_test.rb +0 -60
- data/lib/imw/dataset/scrub/slug.rb +0 -101
- data/lib/imw/dataset/stats/counter.rb +0 -23
- data/lib/imw/dataset/stats.rb +0 -73
data/lib/imw/dataset.rb
CHANGED
@@ -1,50 +1,64 @@
|
|
1
|
-
#
|
2
|
-
# h2. lib/imw/dataset.rb -- imw dataset
|
3
|
-
#
|
4
|
-
# == About
|
5
|
-
#
|
6
|
-
# Defines basic properties of the <tt>IMW::Dataset</tt>
|
7
|
-
#
|
8
|
-
# Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
|
9
|
-
# Copyright:: Copyright (c) 2008 infochimps.org
|
10
|
-
# License:: GPL 3.0
|
11
|
-
# Website:: http://infinitemonkeywrench.org/
|
12
|
-
#
|
13
|
-
# puts "#{File.basename(__FILE__)}: You use your Monkeywrench to rake deep and straight furrows in the earth for your orchard." # at bottom
|
14
|
-
|
15
|
-
require 'rake'
|
16
|
-
require 'ostruct'
|
17
|
-
|
18
1
|
require 'imw/utils'
|
19
2
|
require 'imw/dataset/workflow'
|
20
|
-
require 'imw/dataset/
|
21
|
-
require 'imw/dataset/stats'
|
3
|
+
require 'imw/dataset/paths'
|
22
4
|
|
23
5
|
module IMW
|
24
6
|
|
25
|
-
# The
|
26
|
-
#
|
27
|
-
#
|
28
|
-
#
|
7
|
+
# The IMW::Dataset class is useful for organizing a complex data
|
8
|
+
# transformation because it is capable of managing a collection of
|
9
|
+
# paths and the interdependencies between subparts of the
|
10
|
+
# transformation.
|
11
|
+
#
|
12
|
+
# == Manipulating Paths
|
13
|
+
#
|
14
|
+
# Storing paths makes code shorter and more readable. By default
|
15
|
+
# (this assumes the executing script is in a file
|
16
|
+
# /home/imw_user/data/foo.rb):
|
17
|
+
#
|
18
|
+
# dataset = IMW::Dataset.new
|
19
|
+
# dataset.path_to(:self)
|
20
|
+
# #=> '/home/imw_user/data'
|
21
|
+
# dataset.path_to(:ripd)
|
22
|
+
# #=> '/home/imw_user/data/ripd'
|
23
|
+
# dataset.path_to(:pkgd, 'final.tar.gz')
|
24
|
+
# #=> '/home/imw_user/data/pkgd/final.tar.gz'
|
25
|
+
#
|
26
|
+
# Paths can be added
|
27
|
+
#
|
28
|
+
# dataset.add_path(:sorted_output, :mungd, 'sorted-file-3923.txt')
|
29
|
+
# dataset.path_to(:sorted_output)
|
30
|
+
# #=> '/home/imw_user/data/mungd/sorted-file-3923.txt'
|
31
|
+
#
|
32
|
+
# as well as removed (via +remove_path+).
|
33
|
+
#
|
34
|
+
# == Defining Workflows
|
35
|
+
#
|
36
|
+
# IMW encourages you to think of transforming data as a network of
|
37
|
+
# interdependent steps (see IMW::Workflow). Each of IMW's five
|
38
|
+
# default steps maps to a named directory remembered by each
|
39
|
+
# dataset.
|
29
40
|
#
|
30
|
-
#
|
31
|
-
#
|
41
|
+
# The following example shows why this is a useful abstraction as
|
42
|
+
# well as illustrating some of the other functionality in IMW.
|
32
43
|
#
|
33
|
-
#
|
44
|
+
# == Example Dataset
|
34
45
|
#
|
35
|
-
#
|
36
|
-
#
|
37
|
-
#
|
38
|
-
#
|
46
|
+
# The first step is to import IMW and create the dataset
|
47
|
+
#
|
48
|
+
# require 'rubygems'
|
49
|
+
# require 'imw'
|
50
|
+
# dataset = IMW::Dataset.new
|
51
|
+
#
|
52
|
+
# You can pass in a handle (the name or "slug" for the dataset) as
|
53
|
+
# well as some options. Now define the steps you intend to take to
|
54
|
+
# complete the transformation:
|
39
55
|
#
|
40
56
|
# rip::
|
41
|
-
#
|
42
|
-
#
|
43
|
-
# subdirectory of the <tt>:ripd</tt> directory named for the URI
|
44
|
-
# of the source.
|
57
|
+
# Data is collected from a source (+http+, +ftp+, database, &c.)
|
58
|
+
# and deposited in the <tt>:ripd</tt> directory of this dataset.
|
45
59
|
#
|
46
60
|
# dataset.task :rip do
|
47
|
-
# IMW
|
61
|
+
# IMW.open('http://econ.chimpu.edu/datasets/produce_prices.tar.bz2').cp_to_dir(dataset.path_to(:ripd))
|
48
62
|
# #=> [ripd]/http/econ_chimpu_edu/datasets/produce_prices.tar.bz2
|
49
63
|
#
|
50
64
|
# IMW::Rip.from_database :named => "weather_records",
|
@@ -174,59 +188,19 @@ module IMW
|
|
174
188
|
# framework in which to program.
|
175
189
|
class Dataset
|
176
190
|
|
177
|
-
# The <tt>Rake::TaskManager</tt> module allows the
|
178
|
-
# <tt>IMW::Dataset</tt> class to leverage the functionality of the
|
179
|
-
# Rake[http://rake.rubyforge.org/] library to manage tasks
|
180
|
-
# associated with the processing of this dataset.
|
181
|
-
include Rake::TaskManager
|
182
|
-
|
183
191
|
# The <tt>IMW::Workflow</tt> module contains pre-defined tasks for
|
184
192
|
# dataset processing.
|
185
193
|
include IMW::Workflow
|
186
194
|
|
187
|
-
|
188
|
-
attr_accessor :data
|
189
|
-
|
190
|
-
# The default taxon assigned to a dataset.
|
191
|
-
DEFAULT_TAXON = nil
|
192
|
-
|
193
|
-
# Default options passed to <tt>Rake</tt>. Any class including
|
194
|
-
# the <tt>Rake::TaskManager</tt> module must define a constant by
|
195
|
-
# this name.
|
196
|
-
DEFAULT_OPTIONS = {
|
197
|
-
:dry_run => false,
|
198
|
-
:trace => false,
|
199
|
-
:verbose => false
|
200
|
-
}
|
195
|
+
attr_accessor :handle, :options, :data
|
201
196
|
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
# argument is interpreted.
|
208
|
-
def initialize handle, options = {}
|
209
|
-
options = options.reverse_merge :taxon => DEFAULT_TAXON
|
210
|
-
|
211
|
-
# FIXME is this how the attribute writer functions should be
|
212
|
-
# called?
|
213
|
-
@handle = handle
|
214
|
-
@taxon = options[:taxon]
|
215
|
-
|
216
|
-
# for rake
|
217
|
-
@tasks = Hash.new
|
218
|
-
@rules = Array.new
|
219
|
-
@scope = Array.new
|
220
|
-
@last_description = nil
|
221
|
-
@options = OpenStruct.new(DEFAULT_OPTIONS)
|
222
|
-
create_default_tasks
|
223
|
-
|
224
|
-
# sets an empty @paths hash; see utils/paths.rb
|
197
|
+
def initialize options = {}
|
198
|
+
@options = options
|
199
|
+
@handle = options[:handle]
|
200
|
+
initialize_workflow
|
201
|
+
set_root_paths
|
225
202
|
set_paths
|
226
|
-
|
227
|
-
|
228
|
-
def handle= thing
|
229
|
-
@handle = thing.is_a?(String) ? thing.to_handle : thing
|
203
|
+
set_tasks
|
230
204
|
end
|
231
205
|
|
232
206
|
end
|
data/lib/imw/files/basicfile.rb
CHANGED
@@ -20,7 +20,7 @@ module IMW
|
|
20
20
|
protected
|
21
21
|
|
22
22
|
def uri= uri
|
23
|
-
@uri =
|
23
|
+
@uri = uri.is_a?(URI::Generic) ? uri : URI.parse(uri)
|
24
24
|
@host = self.uri.host
|
25
25
|
@path = local? ? ::File.expand_path(self.uri.path) : self.uri.path
|
26
26
|
@dirname = ::File.dirname path
|
@@ -53,7 +53,7 @@ module IMW
|
|
53
53
|
# path as a first argument.
|
54
54
|
[:executable?, :executable_real?, :file?, :directory?, :ftype, :owned?, :pipe?, :readable?, :readable_real?, :setgid?, :setuid?, :size, :size?, :socket?, :split, :stat, :sticky?, :writable?, :writable_real?, :zero?].each do |class_method|
|
55
55
|
define_method class_method do
|
56
|
-
File.send(class_method, path)
|
56
|
+
File.send(class_method, path)
|
57
57
|
end
|
58
58
|
end
|
59
59
|
|
@@ -61,7 +61,7 @@ module IMW
|
|
61
61
|
# to open files online too to check.
|
62
62
|
def exist?
|
63
63
|
if local?
|
64
|
-
::File.exist?(path)
|
64
|
+
::File.exist?(path)
|
65
65
|
else
|
66
66
|
begin
|
67
67
|
true if open(uri)
|
@@ -1,17 +1,3 @@
|
|
1
|
-
#
|
2
|
-
# h2. lib/imw/files/compressed_files_and_archives.rb -- require farm
|
3
|
-
#
|
4
|
-
# == About
|
5
|
-
#
|
6
|
-
# Just required all the archive and compressed formats (+tar+, +bz2+,
|
7
|
-
# &c.)
|
8
|
-
#
|
9
|
-
# Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
|
10
|
-
# Copyright:: Copyright (c) 2008 infochimps.org
|
11
|
-
# License:: GPL 3.0
|
12
|
-
# Website:: http://infinitemonkeywrench.org/
|
13
|
-
#
|
14
|
-
# puts "#{File.basename(__FILE__)}: Something clever" # at bottom
|
15
1
|
module IMW
|
16
2
|
module Files
|
17
3
|
|
@@ -29,9 +15,9 @@ module IMW
|
|
29
15
|
# The default flags used creating, appending to, listing, and
|
30
16
|
# extracting a tar archive.
|
31
17
|
DEFAULT_FLAGS = {
|
32
|
-
:create
|
33
|
-
:append
|
34
|
-
:list
|
18
|
+
:create => "-cf",
|
19
|
+
:append => "-rf",
|
20
|
+
:list => "-tf",
|
35
21
|
:extract => "-xf",
|
36
22
|
:program => :tar
|
37
23
|
}
|
@@ -39,10 +25,10 @@ module IMW
|
|
39
25
|
def initialize uri, *args
|
40
26
|
self.uri= uri
|
41
27
|
@archive = {
|
42
|
-
:program
|
43
|
-
:create_flags
|
44
|
-
:append_flags
|
45
|
-
:list_flags
|
28
|
+
:program => DEFAULT_FLAGS[:program],
|
29
|
+
:create_flags => DEFAULT_FLAGS[:create],
|
30
|
+
:append_flags => DEFAULT_FLAGS[:append],
|
31
|
+
:list_flags => DEFAULT_FLAGS[:list],
|
46
32
|
:extract_flags => DEFAULT_FLAGS[:extract]
|
47
33
|
}
|
48
34
|
end
|
@@ -51,9 +37,9 @@ module IMW
|
|
51
37
|
# A class to wrap a <tt>tar.gz</tt> archive.
|
52
38
|
#
|
53
39
|
# Creation, appending, listing, and extraction flags are stored in
|
54
|
-
# <tt>IMW::Files::
|
40
|
+
# <tt>IMW::Files::Targz::DEFAULT_FLAGS</tt> and all are passed to
|
55
41
|
# the <tt>:tar</tt> program in <tt>IMW::EXTERAL_PROGRAMS</tt>.
|
56
|
-
class
|
42
|
+
class Targz
|
57
43
|
|
58
44
|
include IMW::Files::BasicFile
|
59
45
|
include IMW::Files::Archive
|
@@ -63,21 +49,21 @@ module IMW
|
|
63
49
|
# extracting a <tt>tar.gz</tt> archive.
|
64
50
|
DEFAULT_FLAGS = {
|
65
51
|
:decompression_program => :gzip,
|
66
|
-
:decompression_flags
|
67
|
-
:archive_program
|
68
|
-
:archive_list_flags
|
52
|
+
:decompression_flags => '-fd',
|
53
|
+
:archive_program => :tar,
|
54
|
+
:archive_list_flags => "-tf",
|
69
55
|
:archive_extract_flags => "-xzf"
|
70
56
|
}
|
71
57
|
|
72
58
|
def initialize uri, *args
|
73
59
|
self.uri= uri
|
74
60
|
@compression = {
|
75
|
-
:program
|
61
|
+
:program => DEFAULT_FLAGS[:decompression_program],
|
76
62
|
:decompression_flags => DEFAULT_FLAGS[:decompression_flags]
|
77
63
|
}
|
78
64
|
@archive = {
|
79
|
-
:program
|
80
|
-
:list_flags
|
65
|
+
:program => DEFAULT_FLAGS[:archive_program],
|
66
|
+
:list_flags => DEFAULT_FLAGS[:archive_list_flags],
|
81
67
|
:extract_flags => DEFAULT_FLAGS[:archive_extract_flags]
|
82
68
|
}
|
83
69
|
end
|
@@ -99,14 +85,14 @@ module IMW
|
|
99
85
|
end
|
100
86
|
end
|
101
87
|
|
102
|
-
end #
|
88
|
+
end # Targz
|
103
89
|
|
104
90
|
# A class to wrap a <tt>tar.bz2</tt> archive.
|
105
91
|
#
|
106
92
|
# Creation, appending, listing, and extraction flags are stored in
|
107
|
-
# <tt>IMW::Files::
|
93
|
+
# <tt>IMW::Files::Tarbz2::DEFAULT_FLAGS</tt> and all are passed to
|
108
94
|
# the <tt>:tar</tt> program in <tt>IMW::EXTERAL_PROGRAMS</tt>.
|
109
|
-
class
|
95
|
+
class Tarbz2
|
110
96
|
|
111
97
|
include IMW::Files::BasicFile
|
112
98
|
include IMW::Files::Archive
|
@@ -169,7 +155,7 @@ module IMW
|
|
169
155
|
File.join(dirname,name + '.tar')
|
170
156
|
end
|
171
157
|
|
172
|
-
end #
|
158
|
+
end # Tarbz2
|
173
159
|
|
174
160
|
# A class to wrap a +rar+ archive.
|
175
161
|
#
|
@@ -330,11 +316,11 @@ module IMW
|
|
330
316
|
|
331
317
|
|
332
318
|
# make sure that tar.bz2 precedes bz2 and so on...
|
333
|
-
FILE_REGEXPS << [/\.tar\.bz2$/, IMW::Files::
|
334
|
-
FILE_REGEXPS << [/\.tbz2$/, IMW::Files::
|
319
|
+
FILE_REGEXPS << [/\.tar\.bz2$/, IMW::Files::Tarbz2]
|
320
|
+
FILE_REGEXPS << [/\.tbz2$/, IMW::Files::Tarbz2]
|
335
321
|
|
336
|
-
FILE_REGEXPS << [/\.tar\.gz$/, IMW::Files::
|
337
|
-
FILE_REGEXPS << [/\.tgz$/, IMW::Files::
|
322
|
+
FILE_REGEXPS << [/\.tar\.gz$/, IMW::Files::Targz]
|
323
|
+
FILE_REGEXPS << [/\.tgz$/, IMW::Files::Targz]
|
338
324
|
|
339
325
|
FILE_REGEXPS << [/\.tar$/, IMW::Files::Tar]
|
340
326
|
FILE_REGEXPS << [/\.bz2$/, IMW::Files::Bz2]
|
data/lib/imw/files/csv.rb
CHANGED
@@ -39,7 +39,8 @@ module IMW
|
|
39
39
|
def initialize uri, mode='r', options = {}
|
40
40
|
options.reverse_merge!(self.class::DEFAULT_OPTIONS)
|
41
41
|
self.uri= uri
|
42
|
-
|
42
|
+
options.delete(:write) # FasterCSV complains about unknown options
|
43
|
+
super open(uri,mode), options
|
43
44
|
end
|
44
45
|
|
45
46
|
# Return the contents of this CSV file as an array of arrays.
|
@@ -0,0 +1,62 @@
|
|
1
|
+
require 'imw/files/basicfile'
|
2
|
+
module IMW
|
3
|
+
module Files
|
4
|
+
class Directory
|
5
|
+
|
6
|
+
include IMW::Files::BasicFile
|
7
|
+
|
8
|
+
# FIXME these should be defined by BasicFile and then removed here but I don't see how...
|
9
|
+
# [:executable?, :executable_real?, :pipe?, :socket?, :rm, :rm!, :extname, :extname=, :name, :name=].each do |method|
|
10
|
+
# instance_eval do
|
11
|
+
# remove_method method
|
12
|
+
# end
|
13
|
+
# end
|
14
|
+
|
15
|
+
def uri= uri
|
16
|
+
@uri = uri.is_a?(URI::Generic) ? uri : URI.parse(uri)
|
17
|
+
@host = self.uri.host
|
18
|
+
@path = local? ? ::File.expand_path(self.uri.path) : self.uri.path
|
19
|
+
@dirname = ::File.dirname path
|
20
|
+
@basename = ::File.basename path
|
21
|
+
end
|
22
|
+
|
23
|
+
def initialize uri
|
24
|
+
self.uri = uri
|
25
|
+
end
|
26
|
+
|
27
|
+
def [] selector='*'
|
28
|
+
Dir[File.join(path, selector)] if local?
|
29
|
+
end
|
30
|
+
|
31
|
+
# Copy the contents of this directory to +new_dir+.
|
32
|
+
def cp new_dir
|
33
|
+
raise IMW::PathError.new("cannot copy from #{path}, doesn't exist!") unless exist?
|
34
|
+
if local?
|
35
|
+
FileUtils.cp_r path, new_dir
|
36
|
+
else
|
37
|
+
raise IMW::PathError.new("cannot recursively copy remote directories (yet!)")
|
38
|
+
end
|
39
|
+
self.class.new(new_dir)
|
40
|
+
end
|
41
|
+
|
42
|
+
# Move this directory to +new_dir+.
|
43
|
+
def mv new_dir
|
44
|
+
raise IMW::PathError.new("cannot move from #{path}, doesn't exist!") unless exist?
|
45
|
+
if local?
|
46
|
+
FileUtils.mv path, new_dir
|
47
|
+
else
|
48
|
+
raise IMW::PathError.new("cannot move remote directories (yet!)")
|
49
|
+
end
|
50
|
+
self.class.new(new_dir)
|
51
|
+
end
|
52
|
+
alias_method :mv!, :mv
|
53
|
+
|
54
|
+
# Move this directory so it sits beneath +dir+.
|
55
|
+
def mv_to_dir dir
|
56
|
+
mv File.join(File.expand_path(dir),basename)
|
57
|
+
end
|
58
|
+
alias_method :mv_to_dir!, :mv_to_dir
|
59
|
+
|
60
|
+
end
|
61
|
+
end
|
62
|
+
end
|
@@ -0,0 +1,84 @@
|
|
1
|
+
require 'spreadsheet'
|
2
|
+
|
3
|
+
# FIXME Main issue with this:
|
4
|
+
# You can make a new excel book and dump data to it no problem.
|
5
|
+
# However, something that doesn't seem to work is dumping to a file, opening,
|
6
|
+
# and dumping to it again. At the moment this is probably not a big deal.
|
7
|
+
|
8
|
+
|
9
|
+
module IMW
|
10
|
+
module Files
|
11
|
+
class Excel
|
12
|
+
include IMW::Files::BasicFile
|
13
|
+
include IMW::Files::Compressible
|
14
|
+
|
15
|
+
#need to initialize, load, and dump
|
16
|
+
attr_accessor :book,:idx, :max_lines, :sht_idx, :sht_row, :book_idx
|
17
|
+
def initialize uri, mode, options={}
|
18
|
+
self.uri = uri
|
19
|
+
@max_lines = options[:max_lines] || 65000
|
20
|
+
@idx = 0
|
21
|
+
@book_idx = 0
|
22
|
+
@sht_idx = 0
|
23
|
+
unless self.exist?
|
24
|
+
make_new_book
|
25
|
+
make_new_sheet
|
26
|
+
else
|
27
|
+
get_existing_book
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
def load
|
32
|
+
@sheet.map{|row| row.to_a}
|
33
|
+
end
|
34
|
+
|
35
|
+
def dump data
|
36
|
+
data.each do |line|
|
37
|
+
raise "too many lines" if too_many?
|
38
|
+
self << line
|
39
|
+
end
|
40
|
+
save unless no_data?
|
41
|
+
end
|
42
|
+
|
43
|
+
def << line
|
44
|
+
@sheet.row(@sht_row).concat( line )
|
45
|
+
@sht_row += 1
|
46
|
+
@idx += 1
|
47
|
+
end
|
48
|
+
|
49
|
+
def make_new_book
|
50
|
+
@book = Spreadsheet::Workbook.new
|
51
|
+
@book_idx += 1
|
52
|
+
end
|
53
|
+
|
54
|
+
def make_new_sheet
|
55
|
+
@sheet = @book.create_worksheet
|
56
|
+
@sht_idx += 1
|
57
|
+
@sht_row = 0 #always start at row 0 in a new sheet
|
58
|
+
end
|
59
|
+
|
60
|
+
def get_existing_book
|
61
|
+
@book = Spreadsheet.open path
|
62
|
+
@sheet = book.worksheet 0
|
63
|
+
@sht_row = @sheet.row_count #would like to be able to dump new data, doesn't work
|
64
|
+
@sht_idx += 1
|
65
|
+
end
|
66
|
+
|
67
|
+
def incr_sheet
|
68
|
+
@sheet = book.worksheet @sht_idx
|
69
|
+
end
|
70
|
+
|
71
|
+
def too_many?
|
72
|
+
@sht_row >= @max_lines
|
73
|
+
end
|
74
|
+
|
75
|
+
def no_data?
|
76
|
+
@sht_row == 0
|
77
|
+
end
|
78
|
+
|
79
|
+
def save
|
80
|
+
@book.write path
|
81
|
+
end
|
82
|
+
end
|
83
|
+
end
|
84
|
+
end
|
data/lib/imw/files/sgml.rb
CHANGED
@@ -1,17 +1,3 @@
|
|
1
|
-
#
|
2
|
-
# h2. lib/imw/files/sgml.rb -- SGML files
|
3
|
-
#
|
4
|
-
# == About
|
5
|
-
#
|
6
|
-
# For SGML-derived files, including XML, HTML, &c..
|
7
|
-
#
|
8
|
-
# Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
|
9
|
-
# Copyright:: Copyright (c) 2008 infochimps.org
|
10
|
-
# License:: GPL 3.0
|
11
|
-
# Website:: http://infinitemonkeywrench.org/
|
12
|
-
#
|
13
|
-
# puts "#{File.basename(__FILE__)}: Something clever" # at bottom
|
14
|
-
|
15
1
|
require 'hpricot'
|
16
2
|
require 'imw/files/text'
|
17
3
|
require 'imw/parsers/html_parser'
|
@@ -23,21 +9,16 @@ module IMW
|
|
23
9
|
|
24
10
|
attr_accessor :doc
|
25
11
|
|
26
|
-
def initialize uri, mode='r', options={}
|
27
|
-
super uri, mode, options
|
28
|
-
@doc = Hpricot(open(uri))
|
29
|
-
end
|
30
|
-
|
31
12
|
# Delegate to Hpricot
|
32
13
|
def method_missing method, *args, &block
|
33
14
|
@doc.send method, *args, &block
|
34
15
|
end
|
35
16
|
|
36
|
-
# Parse this file using the IMW
|
37
|
-
# either be passed in directly or constructed from a
|
38
|
-
# of matchers.
|
17
|
+
# Parse this file using the IMW::Parsers::HtmlParser. The
|
18
|
+
# parser can either be passed in directly or constructed from a
|
19
|
+
# passed hash of specs and/or matchers.
|
39
20
|
def parse *args
|
40
|
-
parser = args.first.is_a?(IMW::
|
21
|
+
parser = args.first.is_a?(IMW::Parsers::HtmlParser) ? args.first : IMW::Parsers::HtmlParser.new(*args)
|
41
22
|
parser.parse(self)
|
42
23
|
end
|
43
24
|
|
data/lib/imw/files.rb
CHANGED
@@ -1,22 +1,8 @@
|
|
1
|
-
#
|
2
|
-
# h2. lib/imw/files.rb -- uniform interface to various files
|
3
|
-
#
|
4
|
-
# == About
|
5
|
-
#
|
6
|
-
# Implements <tt>IMW.open</tt> which returns an appropriate +IMW+
|
7
|
-
# object given a URI.
|
8
|
-
#
|
9
|
-
# Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
|
10
|
-
# Copyright:: Copyright (c) 2008 infochimps.org
|
11
|
-
# License:: GPL 3.0
|
12
|
-
# Website:: http://infinitemonkeywrench.org/
|
13
|
-
#
|
14
|
-
# puts "#{File.basename(__FILE__)}: Something clever" # at bottom
|
15
|
-
|
16
1
|
require 'uri'
|
17
2
|
require 'open-uri'
|
18
3
|
require 'imw/utils'
|
19
4
|
require 'imw/files/basicfile'
|
5
|
+
require 'imw/files/directory'
|
20
6
|
require 'imw/files/archive'
|
21
7
|
require 'imw/files/compressible'
|
22
8
|
require 'imw/files/compressed_file'
|
@@ -28,13 +14,21 @@ module IMW
|
|
28
14
|
#
|
29
15
|
# IMW.open("/tmp/test.csv") # => IMW::Files::Csv("/tmp/test.csv")
|
30
16
|
#
|
31
|
-
#
|
32
|
-
def self.open path, options = {}
|
33
|
-
|
34
|
-
|
17
|
+
#
|
18
|
+
def self.open path, options = {}, &block
|
19
|
+
if File.directory?(File.expand_path(path))
|
20
|
+
dir = Files::Directory.new(path)
|
21
|
+
yield dir if block_given?
|
22
|
+
dir
|
23
|
+
else
|
24
|
+
mode = options[:write] ? 'w' : 'r'
|
25
|
+
file = Files.file_class_for(path, options).new(path, mode, options)
|
26
|
+
yield file if block_given?
|
27
|
+
file
|
28
|
+
end
|
35
29
|
end
|
36
30
|
|
37
|
-
def self.open! path, options = {}
|
31
|
+
def self.open! path, options = {}, &block
|
38
32
|
self.open path, options.reverse_merge(:write => true)
|
39
33
|
end
|
40
34
|
|
@@ -50,13 +44,14 @@ module IMW
|
|
50
44
|
autoload :Bz2, 'imw/files/compressed_files_and_archives'
|
51
45
|
autoload :Gz, 'imw/files/compressed_files_and_archives'
|
52
46
|
autoload :Tar, 'imw/files/compressed_files_and_archives'
|
53
|
-
autoload :
|
54
|
-
autoload :
|
47
|
+
autoload :Tarbz2, 'imw/files/compressed_files_and_archives'
|
48
|
+
autoload :Targz, 'imw/files/compressed_files_and_archives'
|
55
49
|
autoload :Rar, 'imw/files/compressed_files_and_archives'
|
56
50
|
autoload :Zip, 'imw/files/compressed_files_and_archives'
|
57
51
|
autoload :Xml, 'imw/files/sgml'
|
58
52
|
autoload :Html, 'imw/files/sgml'
|
59
|
-
|
53
|
+
autoload :Excel, 'imw/files/excel'
|
54
|
+
|
60
55
|
|
61
56
|
# An array used to match files to classes to handle them. The
|
62
57
|
# first element of each array is the regexp and the second names
|
@@ -70,33 +65,39 @@ module IMW
|
|
70
65
|
# allows, say, <tt>.tar.gz</tt> to be handled differently from
|
71
66
|
# <tt>.gz</tt>.
|
72
67
|
EXTENSION_HANDLERS = [
|
73
|
-
[
|
74
|
-
[/\.txt$/, :
|
75
|
-
[/\.
|
76
|
-
[/\.
|
77
|
-
[/\.
|
78
|
-
[/\.
|
79
|
-
[/\.
|
80
|
-
[/\.
|
81
|
-
[/\.
|
82
|
-
[/\.
|
83
|
-
[/\.
|
84
|
-
[/\.
|
85
|
-
[/\.
|
86
|
-
[/\.
|
87
|
-
[/\.
|
88
|
-
[/\.
|
89
|
-
[/\.
|
90
|
-
[/\.
|
91
|
-
[/\.
|
92
|
-
[/\.
|
93
|
-
[/\.
|
94
|
-
[/\.
|
68
|
+
[/\.txt$/, :text],
|
69
|
+
[/\.txt$/, :text],
|
70
|
+
[/\.dat$/, :text],
|
71
|
+
[/\.ascii$/, :text],
|
72
|
+
[/\.yaml$/, :yaml],
|
73
|
+
[/\.yml$/, :yaml],
|
74
|
+
[/\.csv$/, :csv],
|
75
|
+
[/\.tsv$/, :tsv],
|
76
|
+
[/\.json$/, :json],
|
77
|
+
[/\.bz2$/, :bz2],
|
78
|
+
[/\.gz$/, :gz],
|
79
|
+
[/\.tar\.bz2$/, :tarbz2],
|
80
|
+
[/\.tbz2$/, :tarbz2],
|
81
|
+
[/\.tar\.gz$/, :targz],
|
82
|
+
[/\.tgz$/, :targz],
|
83
|
+
[/\.tar$/, :tar],
|
84
|
+
[/\.rar$/, :rar],
|
85
|
+
[/\.zip$/, :zip],
|
86
|
+
[/\.xml$/, :xml],
|
87
|
+
[/\.html$/, :html],
|
88
|
+
[/\.htm$/, :html],
|
89
|
+
[/\.xlsx?$/, :excel]
|
95
90
|
]
|
96
|
-
|
91
|
+
|
92
|
+
SCHEME_HANDLERS = [
|
93
|
+
[/http/, :html]
|
94
|
+
]
|
95
|
+
|
97
96
|
protected
|
98
97
|
def self.file_class_for path, options = {}
|
99
98
|
klass = options.delete(:as)
|
99
|
+
|
100
|
+
# try to choose klass from path extension if not already set
|
100
101
|
unless klass
|
101
102
|
EXTENSION_HANDLERS.reverse_each do |regexp, thing| # end has greater precedence
|
102
103
|
next unless regexp =~ path
|
@@ -104,7 +105,21 @@ module IMW
|
|
104
105
|
break
|
105
106
|
end
|
106
107
|
end
|
107
|
-
|
108
|
+
|
109
|
+
# try to choose klass from uri scheme if not already set
|
110
|
+
unless klass
|
111
|
+
scheme = URI.parse(path).scheme
|
112
|
+
SCHEME_HANDLERS.reverse_each do |regexp, thing| # end has greater precedence
|
113
|
+
next unless regexp =~ scheme
|
114
|
+
klass = thing
|
115
|
+
break
|
116
|
+
end
|
117
|
+
end
|
118
|
+
|
119
|
+
# just stick with text if still not set
|
120
|
+
klass = :text unless klass
|
121
|
+
|
122
|
+
klass.is_a?(Class) ? klass : class_eval(klass.to_s.downcase.capitalize)
|
108
123
|
end
|
109
124
|
end
|
110
125
|
end
|