imw 0.2.18 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +7 -26
- data/Gemfile.lock +13 -38
- data/{LICENSE → LICENSE.txt} +1 -1
- data/README.textile +35 -0
- data/Rakefile +45 -22
- data/VERSION +1 -1
- data/examples/foo.rb +19 -0
- data/examples/html_selector.rb +22 -0
- data/examples/nes_game_list.csv +625 -0
- data/examples/nes_gamespot.csv +1371 -0
- data/examples/nes_nintendo.csv +624 -0
- data/examples/nes_unlicensed.csv +89 -0
- data/examples/nes_wikipedia.csv +710 -0
- data/examples/nibbler_test.rb +24 -0
- data/examples/script.rb +19 -0
- data/lib/imw.rb +28 -140
- data/lib/imw/error.rb +9 -0
- data/lib/imw/recordizer.rb +8 -0
- data/lib/imw/recordizer/html_selector_recordizer.rb +86 -0
- data/lib/imw/recordizer/string_slice_recordizer.rb +39 -0
- data/lib/imw/resource.rb +3 -119
- data/lib/imw/serializer.rb +7 -0
- data/lib/imw/serializer/json_serializer.rb +17 -0
- data/lib/imw/uri.rb +41 -0
- data/spec/resource_spec.rb +78 -0
- data/spec/uri_spec.rb +55 -0
- metadata +81 -232
- data/README.rdoc +0 -371
- data/bin/imw +0 -5
- data/bin/tsv_to_json.rb +0 -29
- data/etc/imwrc.rb +0 -26
- data/examples/dataset.rb +0 -12
- data/examples/metadata.yml +0 -10
- data/lib/imw/archives.rb +0 -120
- data/lib/imw/archives/rar.rb +0 -19
- data/lib/imw/archives/tar.rb +0 -19
- data/lib/imw/archives/tarbz2.rb +0 -73
- data/lib/imw/archives/targz.rb +0 -73
- data/lib/imw/archives/zip.rb +0 -51
- data/lib/imw/boot.rb +0 -87
- data/lib/imw/compressed_files.rb +0 -94
- data/lib/imw/compressed_files/bz2.rb +0 -16
- data/lib/imw/compressed_files/compressible.rb +0 -75
- data/lib/imw/compressed_files/gz.rb +0 -16
- data/lib/imw/dataset.rb +0 -125
- data/lib/imw/dataset/paths.rb +0 -29
- data/lib/imw/dataset/workflow.rb +0 -195
- data/lib/imw/formats.rb +0 -33
- data/lib/imw/formats/delimited.rb +0 -170
- data/lib/imw/formats/excel.rb +0 -100
- data/lib/imw/formats/json.rb +0 -41
- data/lib/imw/formats/pdf.rb +0 -71
- data/lib/imw/formats/sgml.rb +0 -69
- data/lib/imw/formats/yaml.rb +0 -41
- data/lib/imw/metadata.rb +0 -83
- data/lib/imw/metadata/contains_metadata.rb +0 -54
- data/lib/imw/metadata/dsl.rb +0 -111
- data/lib/imw/metadata/field.rb +0 -37
- data/lib/imw/metadata/has_metadata.rb +0 -98
- data/lib/imw/metadata/has_summary.rb +0 -57
- data/lib/imw/metadata/schema.rb +0 -17
- data/lib/imw/parsers.rb +0 -8
- data/lib/imw/parsers/flat.rb +0 -44
- data/lib/imw/parsers/html_parser.rb +0 -387
- data/lib/imw/parsers/html_parser/matchers.rb +0 -289
- data/lib/imw/parsers/line_parser.rb +0 -87
- data/lib/imw/parsers/regexp_parser.rb +0 -72
- data/lib/imw/repository.rb +0 -12
- data/lib/imw/runner.rb +0 -118
- data/lib/imw/schemes.rb +0 -23
- data/lib/imw/schemes/ftp.rb +0 -142
- data/lib/imw/schemes/hdfs.rb +0 -251
- data/lib/imw/schemes/http.rb +0 -165
- data/lib/imw/schemes/local.rb +0 -409
- data/lib/imw/schemes/remote.rb +0 -119
- data/lib/imw/schemes/s3.rb +0 -143
- data/lib/imw/schemes/sql.rb +0 -129
- data/lib/imw/tools.rb +0 -12
- data/lib/imw/tools/aggregator.rb +0 -148
- data/lib/imw/tools/archiver.rb +0 -220
- data/lib/imw/tools/downloader.rb +0 -63
- data/lib/imw/tools/extension_analyzer.rb +0 -114
- data/lib/imw/tools/summarizer.rb +0 -83
- data/lib/imw/tools/transferer.rb +0 -167
- data/lib/imw/utils.rb +0 -74
- data/lib/imw/utils/dynamically_extendable.rb +0 -137
- data/lib/imw/utils/error.rb +0 -59
- data/lib/imw/utils/extensions/hpricot.rb +0 -34
- data/lib/imw/utils/has_uri.rb +0 -131
- data/lib/imw/utils/log.rb +0 -92
- data/lib/imw/utils/misc.rb +0 -57
- data/lib/imw/utils/paths.rb +0 -146
- data/lib/imw/utils/uri.rb +0 -59
- data/lib/imw/utils/uuid.rb +0 -33
- data/lib/imw/utils/validate.rb +0 -38
- data/lib/imw/utils/version.rb +0 -11
- data/spec/data/formats/delimited/sample.csv +0 -131
- data/spec/data/formats/delimited/sample.tsv +0 -131
- data/spec/data/formats/delimited/with_schema/ace-hardware-locations.tsv +0 -11
- data/spec/data/formats/delimited/with_schema/all-countries-ip-address-to-geolocation-data.tsv +0 -16
- data/spec/data/formats/delimited/with_schema/complete-list-of-starbucks-locations.tsv +0 -11
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +0 -22
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +0 -22
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-counts.tsv +0 -12
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +0 -13
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +0 -22
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +0 -22
- data/spec/data/formats/delimited/without_schema/ace-hardware-locations.tsv +0 -10
- data/spec/data/formats/delimited/without_schema/all-countries-ip-address-to-geolocation-data.tsv +0 -15
- data/spec/data/formats/delimited/without_schema/complete-list-of-starbucks-locations.tsv +0 -10
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +0 -21
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +0 -21
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-counts.tsv +0 -11
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +0 -12
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +0 -21
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +0 -21
- data/spec/data/formats/excel/sample.xls +0 -0
- data/spec/data/formats/json/sample.json +0 -1
- data/spec/data/formats/none/sample +0 -650
- data/spec/data/formats/sgml/sample.xml +0 -617
- data/spec/data/formats/text/sample.txt +0 -650
- data/spec/data/formats/yaml/sample.yaml +0 -410
- data/spec/data/schema-tabular.yaml +0 -11
- data/spec/imw/archives/rar_spec.rb +0 -16
- data/spec/imw/archives/tar_spec.rb +0 -16
- data/spec/imw/archives/tarbz2_spec.rb +0 -24
- data/spec/imw/archives/targz_spec.rb +0 -21
- data/spec/imw/archives/zip_spec.rb +0 -16
- data/spec/imw/archives_spec.rb +0 -77
- data/spec/imw/compressed_files/bz2_spec.rb +0 -15
- data/spec/imw/compressed_files/compressible_spec.rb +0 -36
- data/spec/imw/compressed_files/gz_spec.rb +0 -15
- data/spec/imw/compressed_files_spec.rb +0 -47
- data/spec/imw/dataset/paths_spec.rb +0 -32
- data/spec/imw/dataset/workflow_spec.rb +0 -41
- data/spec/imw/formats/delimited_spec.rb +0 -44
- data/spec/imw/formats/excel_spec.rb +0 -55
- data/spec/imw/formats/json_spec.rb +0 -18
- data/spec/imw/formats/sgml_spec.rb +0 -24
- data/spec/imw/formats/yaml_spec.rb +0 -19
- data/spec/imw/metadata/contains_metadata_spec.rb +0 -56
- data/spec/imw/metadata/field_spec.rb +0 -25
- data/spec/imw/metadata/has_metadata_spec.rb +0 -58
- data/spec/imw/metadata/has_summary_spec.rb +0 -32
- data/spec/imw/metadata/schema_spec.rb +0 -24
- data/spec/imw/metadata_spec.rb +0 -86
- data/spec/imw/parsers/line_parser_spec.rb +0 -96
- data/spec/imw/parsers/regexp_parser_spec.rb +0 -42
- data/spec/imw/resource_spec.rb +0 -32
- data/spec/imw/schemes/hdfs_spec.rb +0 -67
- data/spec/imw/schemes/http_spec.rb +0 -19
- data/spec/imw/schemes/local_spec.rb +0 -165
- data/spec/imw/schemes/remote_spec.rb +0 -38
- data/spec/imw/schemes/s3_spec.rb +0 -31
- data/spec/imw/schemes/sql_spec.rb +0 -3
- data/spec/imw/tools/aggregator_spec.rb +0 -71
- data/spec/imw/tools/archiver_spec.rb +0 -120
- data/spec/imw/tools/extension_analyzer_spec.rb +0 -153
- data/spec/imw/tools/summarizer_spec.rb +0 -8
- data/spec/imw/tools/transferer_spec.rb +0 -195
- data/spec/imw/utils/dynamically_extendable_spec.rb +0 -69
- data/spec/imw/utils/has_uri_spec.rb +0 -61
- data/spec/imw/utils/paths_spec.rb +0 -10
- data/spec/imw/utils/shared_paths_spec.rb +0 -29
- data/spec/imw_spec.rb +0 -14
- data/spec/rcov.opts +0 -1
- data/spec/spec_helper.rb +0 -31
- data/spec/support/custom_matchers.rb +0 -28
- data/spec/support/file_contents_matcher.rb +0 -30
- data/spec/support/paths_matcher.rb +0 -66
- data/spec/support/random.rb +0 -213
- data/spec/support/without_regard_to_order_matcher.rb +0 -41
@@ -0,0 +1,24 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'nibbler'
|
3
|
+
require 'open-uri'
|
4
|
+
|
5
|
+
class Game < Nibbler
|
6
|
+
element 'td/i/a' => :name
|
7
|
+
element 'td/a' => :publisher
|
8
|
+
element 'td[2]' => :year
|
9
|
+
end
|
10
|
+
|
11
|
+
class Table < Nibbler
|
12
|
+
|
13
|
+
elements "//h2/span[@id='Licensed_games']/following::table[1]/tr" => :licensed_games, :with => Game
|
14
|
+
elements "//h2/span[@id='Unlicensed_games']/following::table[1]/tr" => :unlicensed_games, :with => Game
|
15
|
+
|
16
|
+
end
|
17
|
+
|
18
|
+
foo = Table.parse open('http://en.wikipedia.org/wiki/List_of_Nintendo_Entertainment_System_games')
|
19
|
+
|
20
|
+
puts foo.unlicensed_games[1].inspect
|
21
|
+
|
22
|
+
|
23
|
+
|
24
|
+
|
data/examples/script.rb
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
require 'open-uri'
|
5
|
+
require 'nokogiri'
|
6
|
+
require 'fastercsv'
|
7
|
+
|
8
|
+
url = "http://www.gamespot.com/games.html?platform=19&category=&type=games&mode=all&sort=title&sortdir=desc&page="
|
9
|
+
pages = (0..27)
|
10
|
+
|
11
|
+
FasterCSV.open("nes_gamespot.csv","w", :write_headers => true, :headers => %w[ title category release ]) do |csv|
|
12
|
+
pages.each do |page|
|
13
|
+
doc = Nokogiri::HTML(open(url + page.to_s))
|
14
|
+
doc.xpath("//tr").each do |node|
|
15
|
+
game = node.content.split("\n").map {|i| i.strip }.reject { |i| i.length == 0 }
|
16
|
+
csv << game unless game.include?("Release Date")
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
data/lib/imw.rb
CHANGED
@@ -1,60 +1,16 @@
|
|
1
1
|
require 'rubygems'
|
2
|
-
require 'bundler/setup'
|
3
|
-
require 'imw/boot'
|
4
2
|
require 'imw/utils'
|
3
|
+
require 'imw/error'
|
4
|
+
require 'imw/uri'
|
5
5
|
|
6
|
-
# The Infinite Monkeywrench (IMW) is a Ruby library for ripping,
|
7
|
-
# extracting, parsing, munging, and packaging datasets. It allows you
|
8
|
-
# to handle different data formats transparently as well as organize
|
9
|
-
# transformations of data as a network of dependencies (a la Make or
|
10
|
-
# Rake).
|
11
|
-
#
|
12
|
-
# IMW has a few central concepts: resources, metadata, datasets,
|
13
|
-
# workflows, and repositories.
|
14
|
-
#
|
15
|
-
# Resources represent individual data resources like local files,
|
16
|
-
# websites, databases, &c. An IMW::Resource is typically instantiated
|
17
|
-
# via IMW.open, with IMW doing the work of figuring out what to return
|
18
|
-
# based on the URI passed in.
|
19
|
-
#
|
20
|
-
# A Resource can have a schema which describes the fields in its data.
|
21
|
-
# IMW::Metadata consists of classes which describe fields.
|
22
|
-
#
|
23
|
-
# Datasets represent collections of related data resources. An
|
24
|
-
# IMW::Dataset comes with a pre-defined (but customizable) workflow
|
25
|
-
# that takes data resources through several steps: rip, parse, munge,
|
26
|
-
# and package. The workflow leverages Rake and so the various tasks
|
27
|
-
# that are necessary to process the data till it is nice and pretty
|
28
|
-
# can all be linked with dependencies.
|
29
|
-
#
|
30
|
-
# Repositories are collections of datasets and it is on these
|
31
|
-
# collections that the +imw+ command line tool operates.
|
32
6
|
module IMW
|
7
|
+
|
8
|
+
autoload :Recordizer, 'imw/recordizer'
|
33
9
|
autoload :Resource, 'imw/resource'
|
34
10
|
autoload :Schemes, 'imw/schemes'
|
35
|
-
autoload :Archives, 'imw/archives'
|
36
|
-
autoload :CompressedFiles, 'imw/compressed_files'
|
37
|
-
autoload :Formats, 'imw/formats'
|
38
|
-
autoload :Tools, 'imw/tools'
|
11
|
+
autoload :Formats, 'imw/formats'
|
39
12
|
autoload :Parsers, 'imw/parsers'
|
40
|
-
autoload :Dataset, 'imw/dataset'
|
41
|
-
autoload :Repository, 'imw/repository'
|
42
|
-
autoload :Metadata, 'imw/metadata'
|
43
13
|
|
44
|
-
# Open a resource at the given +uri+. The resource will
|
45
|
-
# automatically be extended by modules which make sense given the
|
46
|
-
# +uri+.
|
47
|
-
#
|
48
|
-
# See the documentation for IMW::Resource and the various modules
|
49
|
-
# within IMW::Resources for more information and options.
|
50
|
-
#
|
51
|
-
# Passing in an IMW::Resource will simply return it.
|
52
|
-
#
|
53
|
-
# @param [String, Addressable::URI, IMW::Resource] obj the URI to open
|
54
|
-
# @param [Hash] options
|
55
|
-
# @option options [Array<String,Module>] as same as <tt>:use_modules</tt> in IMW::Resource.extend_instance!
|
56
|
-
# @option options [Array<String,Module>] without same as <tt>:skip_modules</tt> in IMW::Resource.extend_instance!
|
57
|
-
# @return [IMW::Resource] the resulting resource, properly extended for the given URI
|
58
14
|
def self.open obj, options={}, &block
|
59
15
|
if obj.is_a?(IMW::Resource)
|
60
16
|
resource = obj
|
@@ -63,103 +19,35 @@ module IMW
|
|
63
19
|
options[:skip_modules] ||= (options[:without] || [])
|
64
20
|
resource = IMW::Resource.new(obj, options)
|
65
21
|
end
|
66
|
-
if block_given?
|
67
|
-
yield resource
|
68
|
-
resource.close
|
69
|
-
else
|
70
|
-
resource
|
71
|
-
end
|
72
22
|
end
|
73
23
|
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
24
|
+
class Resource
|
25
|
+
|
26
|
+
attr_reader :uri
|
27
|
+
|
28
|
+
def initialize(uri, mode='r')
|
29
|
+
raise FileModeError.new("'#{mode}' is not a valid access mode") unless valid_modes.include? mode
|
30
|
+
@uri = Uri.new(uri)
|
31
|
+
end
|
32
|
+
|
33
|
+
def self.open(uri, mode='r', &blk)
|
34
|
+
resource = Resource.new(uri, mode)
|
35
|
+
if block_given?
|
36
|
+
yield resource
|
37
|
+
else
|
38
|
+
return resource
|
89
39
|
end
|
90
|
-
else
|
91
|
-
new_dir = open(uri, options.merge(:as => (options[:as] || []) + [Schemes::Local::LocalDirectory]))
|
92
|
-
new_dir.create
|
93
40
|
end
|
94
|
-
new_dir
|
95
|
-
end
|
96
|
-
|
97
|
-
# Works the same way as IMW.open except opens the resource for
|
98
|
-
# writing.
|
99
|
-
#
|
100
|
-
# @param [String, Addressable::URI] uri the URI to open
|
101
|
-
# @return [IMW::Resource] the resulting resource, properly extended for the given URI and opened for writing.
|
102
|
-
def self.open! uri, options={}, &block
|
103
|
-
open(uri, options.merge(:mode => 'w'), &block)
|
104
|
-
end
|
105
41
|
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
#
|
110
|
-
# @return [IMW::Repository] the default IMW repository
|
111
|
-
def self.repository
|
112
|
-
@@repository ||= IMW::Repository.new
|
113
|
-
end
|
42
|
+
def self.exists? resource
|
43
|
+
true
|
44
|
+
end
|
114
45
|
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
# an elegant way.
|
120
|
-
#
|
121
|
-
# IMW.dataset :my_dataset do
|
122
|
-
#
|
123
|
-
# # Define some paths we're going to use
|
124
|
-
# add_path :original, :rawd, 'original.csv'
|
125
|
-
# add_path :filtered, :fixd, 'filtered.csv'
|
126
|
-
# add_path :package, :pkgd, 'filtered.tar.bz2'
|
127
|
-
#
|
128
|
-
# # Copy a CSV file from a website to this machine.
|
129
|
-
# rip do
|
130
|
-
# open('http://mysite.com/data_archives/2010/03/03.csv').cp(path_to(:original))
|
131
|
-
# end
|
132
|
-
#
|
133
|
-
# # Filter the original CSV data by the
|
134
|
-
# # <tt>meets_some_condition?</tt> method we define elsewhere...
|
135
|
-
# munge do
|
136
|
-
# open!(path_to(:filtered)) do |filtered|
|
137
|
-
# open(path_to(:original)).each do |row|
|
138
|
-
# filtered << row if meets_some_condition?(row)
|
139
|
-
# end
|
140
|
-
# end
|
141
|
-
#
|
142
|
-
# # Compress the filtered data to an archive.
|
143
|
-
# package do
|
144
|
-
# open(path_to(:filtered)).compress.mv(path_to(:package))
|
145
|
-
# end
|
146
|
-
# end
|
147
|
-
#
|
148
|
-
# See the <tt>/examples</tt> directory of the IMW distribution for
|
149
|
-
# more examples.
|
150
|
-
#
|
151
|
-
# @param [Symbol, String] handle the handle to identify this dataset with
|
152
|
-
# @param [Hash] options a hash of options (see IMW::Dataset)
|
153
|
-
# @return [IMW::Dataset] the new dataset
|
154
|
-
def self.dataset handle, options={}, &block
|
155
|
-
d = IMW::Dataset.new(handle, options.merge(:repository => IMW.repository))
|
156
|
-
d.instance_eval(&block) if block_given?
|
157
|
-
d
|
158
|
-
end
|
46
|
+
private
|
47
|
+
def valid_modes
|
48
|
+
%w[ r w a ]
|
49
|
+
end
|
159
50
|
|
160
|
-
end
|
51
|
+
end
|
161
52
|
|
162
|
-
# Works just like IMW.dataset but defined at a top-level scope.
|
163
|
-
def dataset handle, options={}, &block
|
164
|
-
IMW.dataset(handle, options, &block)
|
165
53
|
end
|
data/lib/imw/recordizer/html_selector_recordizer.rb
ADDED
@@ -0,0 +1,86 @@
|
|
1
|
+
module IMW
|
2
|
+
module Recordizer
|
3
|
+
class HTMLSelectorRecordizer
|
4
|
+
|
5
|
+
def self.element(*args, &block)
|
6
|
+
selector, name, delegate = parse_rule_declaration(*args, &block)
|
7
|
+
rules[name] = [selector, delegate]
|
8
|
+
attr_accessor name
|
9
|
+
name
|
10
|
+
end
|
11
|
+
|
12
|
+
def self.elements(*args, &block)
|
13
|
+
name = element(*args, &block)
|
14
|
+
rules[name] << true
|
15
|
+
end
|
16
|
+
|
17
|
+
def initialize
|
18
|
+
self.class.rules.each { |name, (s, k, plural)| send("#{name}=", []) if plural }
|
19
|
+
end
|
20
|
+
|
21
|
+
def self.recordize(doc)
|
22
|
+
self.new.recordize(doc)
|
23
|
+
end
|
24
|
+
|
25
|
+
def recordize(doc)
|
26
|
+
self.class.rules.each do |target, (selector, delegate, plural)|
|
27
|
+
if plural
|
28
|
+
send(target).concat doc.search(selector).map { |i| parse_result(i, delegate) }
|
29
|
+
else
|
30
|
+
send("#{target}=", parse_result(doc.at(selector), delegate))
|
31
|
+
end
|
32
|
+
end
|
33
|
+
self.to_hash
|
34
|
+
end
|
35
|
+
|
36
|
+
def to_hash
|
37
|
+
converter = lambda { |obj| obj.respond_to?(:to_hash) ? obj.to_hash : obj }
|
38
|
+
self.class.rules.keys.inject({}) do |hash, name|
|
39
|
+
value = send(name)
|
40
|
+
hash[name.to_sym] = Array === value ? value.map(&converter) : converter[value]
|
41
|
+
hash
|
42
|
+
end
|
43
|
+
end
|
44
|
+
|
45
|
+
protected
|
46
|
+
|
47
|
+
def parse_result(node, delegate)
|
48
|
+
if delegate
|
49
|
+
delegate.respond_to?(:call) ? delegate.call(node) : delegate.recordize(node)
|
50
|
+
elsif node.respond_to? :inner_text
|
51
|
+
node.inner_text
|
52
|
+
else
|
53
|
+
node
|
54
|
+
end unless node.nil?
|
55
|
+
end
|
56
|
+
|
57
|
+
private
|
58
|
+
|
59
|
+
def self.rules
|
60
|
+
@rules ||= {}
|
61
|
+
end
|
62
|
+
|
63
|
+
def self.inherited(subclass)
|
64
|
+
subclass.rules.update self.rules
|
65
|
+
end
|
66
|
+
|
67
|
+
# Rule declaration forms:
|
68
|
+
#
|
69
|
+
# { 'selector' => :property, :with => delegate }
|
70
|
+
# #=> ['selector', :property, delegate]
|
71
|
+
#
|
72
|
+
# :title
|
73
|
+
# #=> ['title', :title, nil]
|
74
|
+
def self.parse_rule_declaration(*args, &block)
|
75
|
+
options, name = Hash === args.last ? args.pop : {}, args.first
|
76
|
+
delegate = options.delete(:with)
|
77
|
+
selector, property = name ? [name.to_s, name.to_sym] : options.to_a.flatten
|
78
|
+
raise ArgumentError, "invalid rule declaration: #{args.inspect}" unless property
|
79
|
+
# eval block in context of a new scraper subclass
|
80
|
+
delegate = Class.new(delegate || Nibbler, &block) if block_given?
|
81
|
+
return selector, property, delegate
|
82
|
+
end
|
83
|
+
|
84
|
+
end
|
85
|
+
end
|
86
|
+
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
module IMW
|
2
|
+
module Recordizer
|
3
|
+
class StringSliceRecordizer
|
4
|
+
|
5
|
+
attr_reader :schema
|
6
|
+
|
7
|
+
def initialize ranges
|
8
|
+
@schema = ranges
|
9
|
+
end
|
10
|
+
|
11
|
+
def recordize line
|
12
|
+
format = schema
|
13
|
+
case format
|
14
|
+
when Array then slice_by_array(line, format)
|
15
|
+
when Hash then slice_by_hash(line, format)
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
def slice_range string, range
|
20
|
+
string.slice(range).strip
|
21
|
+
end
|
22
|
+
|
23
|
+
def slice_by_array string, format
|
24
|
+
format.map { |range| slice_range(string, range) }
|
25
|
+
end
|
26
|
+
|
27
|
+
def slice_by_hash string, format
|
28
|
+
format.inject({}) do |hsh, (key, val)|
|
29
|
+
case val
|
30
|
+
when Range then hsh[key] = slice_range(string, val)
|
31
|
+
when Hash then hsh[key] = slice_by_hash(string, val)
|
32
|
+
end
|
33
|
+
hsh
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
data/lib/imw/resource.rb
CHANGED
@@ -1,80 +1,10 @@
|
|
1
1
|
require 'imw/utils/has_uri'
|
2
2
|
|
3
3
|
module IMW
|
4
|
-
|
5
|
-
# A resource can be anything addressable via a URI. Examples
|
6
|
-
# include local files, remote files, webpages, &c.
|
7
|
-
#
|
8
|
-
# The IMW::Resource class takes a URI as input and then dynamically
|
9
|
-
# extends itself with appropriate modules from IMW. As an example,
|
10
|
-
# calling
|
11
|
-
#
|
12
|
-
# my_archive = IMW::Resource.new('/path/to/my/archive.tar.bz2')
|
13
|
-
#
|
14
|
-
# would return an IMW::Resource extended by
|
15
|
-
# IMW::Archives::Tarbz2 (among other modules) which
|
16
|
-
# therefore has methods for extracting, listing, and appending to
|
17
|
-
# the archive.
|
18
|
-
#
|
19
|
-
# Modules are so extended based on handlers defined in the
|
20
|
-
# <tt>imw/resources</tt> directory and accessible via
|
21
|
-
# IMW::Resource.handlers. You can define your own handlers by
|
22
|
-
# defining the constant IMW::Resource::USER_DEFINED_HANDLERS in your
|
23
|
-
# configuration file.
|
24
|
-
#
|
25
|
-
# The modules extending a particular IMW::Resource instance can be
|
26
|
-
# listed as follows
|
27
|
-
#
|
28
|
-
# my_archive.modules #=> [IMW::Local::Base, IMW::Local::File, IMW::Local::Compressible, IMW::Archives::Tarbz2]
|
29
|
-
#
|
30
|
-
# By default, resources are opened for reading. Passing in the
|
31
|
-
# appropriate <tt>:mode</tt> option changes this:
|
32
|
-
#
|
33
|
-
# IMW::Resource.new('/path/to/my_new_file', :mode => 'w')
|
34
|
-
#
|
35
|
-
# If the <tt>:skip_modules</tt> option is passed in then the
|
36
|
-
# resource will not extend itself with any modules and will
|
37
|
-
# essentially only retain the bare functionality of a URI. This can
|
38
|
-
# be useful when subclassing IMW::Resource or dealing with a very
|
39
|
-
# strange kind of resource.
|
40
|
-
#
|
41
|
-
# Read the documentation for modules in IMW::Resources to learn more
|
42
|
-
# about the various behaviors an IMW::Resource can acquire.
|
43
|
-
#
|
44
|
-
# You can also instantiate an IMW::Resource using IMW.open, which
|
45
|
-
# accepts all the same arguments as IMW::Resource.new.
|
46
4
|
class Resource
|
47
5
|
|
48
|
-
|
49
|
-
attr_accessor :mode
|
50
|
-
|
51
|
-
# A copy of the options passed to this resource on initialization.
|
52
|
-
attr_accessor :resource_options
|
6
|
+
attr_accessor :mode, :resource_options
|
53
7
|
|
54
|
-
# Create a new resource representing +uri+.
|
55
|
-
#
|
56
|
-
# IMW will automatically extend the resulting IMW::Resource
|
57
|
-
# instance with modules appropriate for the given URI:
|
58
|
-
#
|
59
|
-
# r = IMW::Resource.new("http://www.infochimps.com")
|
60
|
-
# r.modules
|
61
|
-
# => [IMW::Schemes::Remote::Base, IMW::Schemes::Remote::RemoteFile, IMW::Schemes::HTTP, IMW::Formats::Html]
|
62
|
-
#
|
63
|
-
# You can prevent this altogether by passing in
|
64
|
-
# <tt>:no_modules</tt>:
|
65
|
-
#
|
66
|
-
# r = IMW::Resource.new("http://www.infochimps.com", :no_modules => true)
|
67
|
-
# r.modules
|
68
|
-
# => []
|
69
|
-
#
|
70
|
-
# And you can exert more fine-grained control with the
|
71
|
-
# <tt>:use_modules</tt> and <tt>:skip_modules</tt> options, see
|
72
|
-
# IMW::Resource.extend_instance! for details.
|
73
|
-
#
|
74
|
-
# @param [String, Addressable::URI] uri
|
75
|
-
# @param [Hash] options
|
76
|
-
# @option options [String] mode the mode to open the resource in (will be ignored when inapplicable)
|
77
|
-
# @return [IMW::Resource]
|
78
8
|
def initialize uri, options={}
|
79
9
|
self.uri = uri
|
80
10
|
self.resource_options = options
|
@@ -85,20 +15,14 @@ module IMW
|
|
85
15
|
# Provides resources with a wrapped Addressable::URI object.
|
86
16
|
include IMW::Utils::HasURI
|
87
17
|
|
88
|
-
# Provides resources with a summary, metadata, & schema.
|
89
|
-
include IMW::Metadata::HasSummary
|
90
|
-
|
91
18
|
# Gives IMW::Resource instances the ability to dynamically
|
92
19
|
# extend themselves with modules chosen from a set of handlers
|
93
20
|
# stored by the IMW::Resource class.
|
94
21
|
include IMW::Utils::DynamicallyExtendable
|
95
|
-
[IMW::Schemes::HANDLERS, IMW::
|
22
|
+
[IMW::Schemes::HANDLERS, IMW::Formats::HANDLERS].each do |handlers|
|
96
23
|
register_handlers *handlers
|
97
24
|
end
|
98
|
-
|
99
|
-
# Raise an error unless this resource exists.
|
100
|
-
#
|
101
|
-
# @param [String] message an optional message to include
|
25
|
+
|
102
26
|
def should_exist!(message=nil)
|
103
27
|
raise IMW::Error.new([message, "No path defined for #{self.inspect} extended by #{modules.join(' ')}"].compact.join(', ')) unless respond_to?(:path)
|
104
28
|
raise IMW::Error.new([message, "No exist? method defined for #{self.inspect} extended by #{modules.join(' ')}"].compact.join(', ')) unless respond_to?(:exist?)
|
@@ -106,52 +30,12 @@ module IMW
|
|
106
30
|
self
|
107
31
|
end
|
108
32
|
|
109
|
-
# Close this resource.
|
110
|
-
#
|
111
|
-
# Modules should hook into super() as they need to redefine this
|
112
|
-
# method.
|
113
33
|
def close
|
114
34
|
end
|
115
35
|
|
116
|
-
# Open a copy of this resource.
|
117
|
-
#
|
118
|
-
# This is useful when wanting to reset file handles. Though -- be
|
119
|
-
# warned -- it does not close any file handles itself...
|
120
|
-
#
|
121
|
-
# @return [IMW::Resource] the new (old) resource
|
122
36
|
def reopen
|
123
37
|
IMW.open(uri.to_s)
|
124
38
|
end
|
125
39
|
|
126
|
-
# If +method+ begins with the strings +is+, +on+, or +via+ and
|
127
|
-
# ends with a question mark then we interpret it as a question
|
128
|
-
# this resource doesn't know how to answer -- so we have it answer
|
129
|
-
# +false+.
|
130
|
-
#
|
131
|
-
# As an example, consider the following loop:
|
132
|
-
#
|
133
|
-
# IMW.open('/tmp').all_contents.each do |obj|
|
134
|
-
# if obj.is_archive?
|
135
|
-
# # ... do something
|
136
|
-
# end
|
137
|
-
# end
|
138
|
-
#
|
139
|
-
# When +obj+ is initialized and it _isn't_ an archive, then it
|
140
|
-
# doesn't know about the <tt>is_archive?</tt> method -- but it
|
141
|
-
# should therefore answer false anyway.
|
142
|
-
#
|
143
|
-
# This lets a basic text file answer questions about whether it's
|
144
|
-
# an archive (or on S3, or accessed via some user-defined scheme,
|
145
|
-
# &c.) without needing to know anything about archives (or S3 or
|
146
|
-
# the user-defined scheme).
|
147
|
-
def method_missing method, *args
|
148
|
-
if args.empty? && method.to_s =~ /(is|on|via)_.*\?$/
|
149
|
-
# querying for a boolean response so answer false
|
150
|
-
return false
|
151
|
-
else
|
152
|
-
raise IMW::NoMethodError, "undefined method `#{method}' for #{self}, extended by #{modules.join(', ')}"
|
153
|
-
end
|
154
|
-
end
|
155
|
-
|
156
40
|
end
|
157
41
|
end
|