imw 0.2.18 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +7 -26
- data/Gemfile.lock +13 -38
- data/{LICENSE → LICENSE.txt} +1 -1
- data/README.textile +35 -0
- data/Rakefile +45 -22
- data/VERSION +1 -1
- data/examples/foo.rb +19 -0
- data/examples/html_selector.rb +22 -0
- data/examples/nes_game_list.csv +625 -0
- data/examples/nes_gamespot.csv +1371 -0
- data/examples/nes_nintendo.csv +624 -0
- data/examples/nes_unlicensed.csv +89 -0
- data/examples/nes_wikipedia.csv +710 -0
- data/examples/nibbler_test.rb +24 -0
- data/examples/script.rb +19 -0
- data/lib/imw.rb +28 -140
- data/lib/imw/error.rb +9 -0
- data/lib/imw/recordizer.rb +8 -0
- data/lib/imw/recordizer/html_selector_recordizer.rb +86 -0
- data/lib/imw/recordizer/string_slice_recordizer.rb +39 -0
- data/lib/imw/resource.rb +3 -119
- data/lib/imw/serializer.rb +7 -0
- data/lib/imw/serializer/json_serializer.rb +17 -0
- data/lib/imw/uri.rb +41 -0
- data/spec/resource_spec.rb +78 -0
- data/spec/uri_spec.rb +55 -0
- metadata +81 -232
- data/README.rdoc +0 -371
- data/bin/imw +0 -5
- data/bin/tsv_to_json.rb +0 -29
- data/etc/imwrc.rb +0 -26
- data/examples/dataset.rb +0 -12
- data/examples/metadata.yml +0 -10
- data/lib/imw/archives.rb +0 -120
- data/lib/imw/archives/rar.rb +0 -19
- data/lib/imw/archives/tar.rb +0 -19
- data/lib/imw/archives/tarbz2.rb +0 -73
- data/lib/imw/archives/targz.rb +0 -73
- data/lib/imw/archives/zip.rb +0 -51
- data/lib/imw/boot.rb +0 -87
- data/lib/imw/compressed_files.rb +0 -94
- data/lib/imw/compressed_files/bz2.rb +0 -16
- data/lib/imw/compressed_files/compressible.rb +0 -75
- data/lib/imw/compressed_files/gz.rb +0 -16
- data/lib/imw/dataset.rb +0 -125
- data/lib/imw/dataset/paths.rb +0 -29
- data/lib/imw/dataset/workflow.rb +0 -195
- data/lib/imw/formats.rb +0 -33
- data/lib/imw/formats/delimited.rb +0 -170
- data/lib/imw/formats/excel.rb +0 -100
- data/lib/imw/formats/json.rb +0 -41
- data/lib/imw/formats/pdf.rb +0 -71
- data/lib/imw/formats/sgml.rb +0 -69
- data/lib/imw/formats/yaml.rb +0 -41
- data/lib/imw/metadata.rb +0 -83
- data/lib/imw/metadata/contains_metadata.rb +0 -54
- data/lib/imw/metadata/dsl.rb +0 -111
- data/lib/imw/metadata/field.rb +0 -37
- data/lib/imw/metadata/has_metadata.rb +0 -98
- data/lib/imw/metadata/has_summary.rb +0 -57
- data/lib/imw/metadata/schema.rb +0 -17
- data/lib/imw/parsers.rb +0 -8
- data/lib/imw/parsers/flat.rb +0 -44
- data/lib/imw/parsers/html_parser.rb +0 -387
- data/lib/imw/parsers/html_parser/matchers.rb +0 -289
- data/lib/imw/parsers/line_parser.rb +0 -87
- data/lib/imw/parsers/regexp_parser.rb +0 -72
- data/lib/imw/repository.rb +0 -12
- data/lib/imw/runner.rb +0 -118
- data/lib/imw/schemes.rb +0 -23
- data/lib/imw/schemes/ftp.rb +0 -142
- data/lib/imw/schemes/hdfs.rb +0 -251
- data/lib/imw/schemes/http.rb +0 -165
- data/lib/imw/schemes/local.rb +0 -409
- data/lib/imw/schemes/remote.rb +0 -119
- data/lib/imw/schemes/s3.rb +0 -143
- data/lib/imw/schemes/sql.rb +0 -129
- data/lib/imw/tools.rb +0 -12
- data/lib/imw/tools/aggregator.rb +0 -148
- data/lib/imw/tools/archiver.rb +0 -220
- data/lib/imw/tools/downloader.rb +0 -63
- data/lib/imw/tools/extension_analyzer.rb +0 -114
- data/lib/imw/tools/summarizer.rb +0 -83
- data/lib/imw/tools/transferer.rb +0 -167
- data/lib/imw/utils.rb +0 -74
- data/lib/imw/utils/dynamically_extendable.rb +0 -137
- data/lib/imw/utils/error.rb +0 -59
- data/lib/imw/utils/extensions/hpricot.rb +0 -34
- data/lib/imw/utils/has_uri.rb +0 -131
- data/lib/imw/utils/log.rb +0 -92
- data/lib/imw/utils/misc.rb +0 -57
- data/lib/imw/utils/paths.rb +0 -146
- data/lib/imw/utils/uri.rb +0 -59
- data/lib/imw/utils/uuid.rb +0 -33
- data/lib/imw/utils/validate.rb +0 -38
- data/lib/imw/utils/version.rb +0 -11
- data/spec/data/formats/delimited/sample.csv +0 -131
- data/spec/data/formats/delimited/sample.tsv +0 -131
- data/spec/data/formats/delimited/with_schema/ace-hardware-locations.tsv +0 -11
- data/spec/data/formats/delimited/with_schema/all-countries-ip-address-to-geolocation-data.tsv +0 -16
- data/spec/data/formats/delimited/with_schema/complete-list-of-starbucks-locations.tsv +0 -11
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +0 -22
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +0 -22
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-counts.tsv +0 -12
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +0 -13
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +0 -22
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +0 -22
- data/spec/data/formats/delimited/without_schema/ace-hardware-locations.tsv +0 -10
- data/spec/data/formats/delimited/without_schema/all-countries-ip-address-to-geolocation-data.tsv +0 -15
- data/spec/data/formats/delimited/without_schema/complete-list-of-starbucks-locations.tsv +0 -10
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +0 -21
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +0 -21
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-counts.tsv +0 -11
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +0 -12
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +0 -21
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +0 -21
- data/spec/data/formats/excel/sample.xls +0 -0
- data/spec/data/formats/json/sample.json +0 -1
- data/spec/data/formats/none/sample +0 -650
- data/spec/data/formats/sgml/sample.xml +0 -617
- data/spec/data/formats/text/sample.txt +0 -650
- data/spec/data/formats/yaml/sample.yaml +0 -410
- data/spec/data/schema-tabular.yaml +0 -11
- data/spec/imw/archives/rar_spec.rb +0 -16
- data/spec/imw/archives/tar_spec.rb +0 -16
- data/spec/imw/archives/tarbz2_spec.rb +0 -24
- data/spec/imw/archives/targz_spec.rb +0 -21
- data/spec/imw/archives/zip_spec.rb +0 -16
- data/spec/imw/archives_spec.rb +0 -77
- data/spec/imw/compressed_files/bz2_spec.rb +0 -15
- data/spec/imw/compressed_files/compressible_spec.rb +0 -36
- data/spec/imw/compressed_files/gz_spec.rb +0 -15
- data/spec/imw/compressed_files_spec.rb +0 -47
- data/spec/imw/dataset/paths_spec.rb +0 -32
- data/spec/imw/dataset/workflow_spec.rb +0 -41
- data/spec/imw/formats/delimited_spec.rb +0 -44
- data/spec/imw/formats/excel_spec.rb +0 -55
- data/spec/imw/formats/json_spec.rb +0 -18
- data/spec/imw/formats/sgml_spec.rb +0 -24
- data/spec/imw/formats/yaml_spec.rb +0 -19
- data/spec/imw/metadata/contains_metadata_spec.rb +0 -56
- data/spec/imw/metadata/field_spec.rb +0 -25
- data/spec/imw/metadata/has_metadata_spec.rb +0 -58
- data/spec/imw/metadata/has_summary_spec.rb +0 -32
- data/spec/imw/metadata/schema_spec.rb +0 -24
- data/spec/imw/metadata_spec.rb +0 -86
- data/spec/imw/parsers/line_parser_spec.rb +0 -96
- data/spec/imw/parsers/regexp_parser_spec.rb +0 -42
- data/spec/imw/resource_spec.rb +0 -32
- data/spec/imw/schemes/hdfs_spec.rb +0 -67
- data/spec/imw/schemes/http_spec.rb +0 -19
- data/spec/imw/schemes/local_spec.rb +0 -165
- data/spec/imw/schemes/remote_spec.rb +0 -38
- data/spec/imw/schemes/s3_spec.rb +0 -31
- data/spec/imw/schemes/sql_spec.rb +0 -3
- data/spec/imw/tools/aggregator_spec.rb +0 -71
- data/spec/imw/tools/archiver_spec.rb +0 -120
- data/spec/imw/tools/extension_analyzer_spec.rb +0 -153
- data/spec/imw/tools/summarizer_spec.rb +0 -8
- data/spec/imw/tools/transferer_spec.rb +0 -195
- data/spec/imw/utils/dynamically_extendable_spec.rb +0 -69
- data/spec/imw/utils/has_uri_spec.rb +0 -61
- data/spec/imw/utils/paths_spec.rb +0 -10
- data/spec/imw/utils/shared_paths_spec.rb +0 -29
- data/spec/imw_spec.rb +0 -14
- data/spec/rcov.opts +0 -1
- data/spec/spec_helper.rb +0 -31
- data/spec/support/custom_matchers.rb +0 -28
- data/spec/support/file_contents_matcher.rb +0 -30
- data/spec/support/paths_matcher.rb +0 -66
- data/spec/support/random.rb +0 -213
- data/spec/support/without_regard_to_order_matcher.rb +0 -41
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
require 'rubygems'
|
|
2
|
+
require 'nibbler'
|
|
3
|
+
require 'open-uri'
|
|
4
|
+
|
|
5
|
+
class Game < Nibbler
|
|
6
|
+
element 'td/i/a' => :name
|
|
7
|
+
element 'td/a' => :publisher
|
|
8
|
+
element 'td[2]' => :year
|
|
9
|
+
end
|
|
10
|
+
|
|
11
|
+
class Table < Nibbler
|
|
12
|
+
|
|
13
|
+
elements "//h2/span[@id='Licensed_games']/following::table[1]/tr" => :licensed_games, :with => Game
|
|
14
|
+
elements "//h2/span[@id='Unlicensed_games']/following::table[1]/tr" => :unlicensed_games, :with => Game
|
|
15
|
+
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
foo = Table.parse open('http://en.wikipedia.org/wiki/List_of_Nintendo_Entertainment_System_games')
|
|
19
|
+
|
|
20
|
+
puts foo.unlicensed_games[1].inspect
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
|
data/examples/script.rb
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
|
|
3
|
+
require 'rubygems'
|
|
4
|
+
require 'open-uri'
|
|
5
|
+
require 'nokogiri'
|
|
6
|
+
require 'fastercsv'
|
|
7
|
+
|
|
8
|
+
url = "http://www.gamespot.com/games.html?platform=19&category=&type=games&mode=all&sort=title&sortdir=desc&page="
|
|
9
|
+
pages = (0..27)
|
|
10
|
+
|
|
11
|
+
FasterCSV.open("nes_gamespot.csv","w", :write_headers => true, :headers => %w[ title category release ]) do |csv|
|
|
12
|
+
pages.each do |page|
|
|
13
|
+
doc = Nokogiri::HTML(open(url + page.to_s))
|
|
14
|
+
doc.xpath("//tr").each do |node|
|
|
15
|
+
game = node.content.split("\n").map {|i| i.strip }.reject { |i| i.length == 0 }
|
|
16
|
+
csv << game unless game.include?("Release Date")
|
|
17
|
+
end
|
|
18
|
+
end
|
|
19
|
+
end
|
data/lib/imw.rb
CHANGED
|
@@ -1,60 +1,16 @@
|
|
|
1
1
|
require 'rubygems'
|
|
2
|
-
require 'bundler/setup'
|
|
3
|
-
require 'imw/boot'
|
|
4
2
|
require 'imw/utils'
|
|
3
|
+
require 'imw/error'
|
|
4
|
+
require 'imw/uri'
|
|
5
5
|
|
|
6
|
-
# The Infinite Monkeywrench (IMW) is a Ruby library for ripping,
|
|
7
|
-
# extracting, parsing, munging, and packaging datasets. It allows you
|
|
8
|
-
# to handle different data formats transparently as well as organize
|
|
9
|
-
# transformations of data as a network of dependencies (a la Make or
|
|
10
|
-
# Rake).
|
|
11
|
-
#
|
|
12
|
-
# IMW has a few central concepts: resources, metadata, datasets,
|
|
13
|
-
# workflows, and repositories.
|
|
14
|
-
#
|
|
15
|
-
# Resources represent individual data resources like local files,
|
|
16
|
-
# websites, databases, &c. An IMW::Resource is typically instantiated
|
|
17
|
-
# via IMW.open, with IMW doing the work of figuring out what to return
|
|
18
|
-
# based on the URI passed in.
|
|
19
|
-
#
|
|
20
|
-
# A Resource can have a schema which describes the fields in its data.
|
|
21
|
-
# IMW::Metadata consists of classes which describe fields.
|
|
22
|
-
#
|
|
23
|
-
# Datasets represent collections of related data resources .. An
|
|
24
|
-
# IMW::Dataset comes with a pre-defined (but customizable) workflow
|
|
25
|
-
# that takes data resources through several steps: rip, parse, munge,
|
|
26
|
-
# and package. The workflow leverages Rake and so the various tasks
|
|
27
|
-
# that are necessary to process the data till it is nice and pretty
|
|
28
|
-
# can all be linked with dependencies.
|
|
29
|
-
#
|
|
30
|
-
# Repositories are collections of datasets and it is on these
|
|
31
|
-
# collections that the +imw+ command line tool operates.
|
|
32
6
|
module IMW
|
|
7
|
+
|
|
8
|
+
autoload :Recordizer, 'imw/recordizer'
|
|
33
9
|
autoload :Resource, 'imw/resource'
|
|
34
10
|
autoload :Schemes, 'imw/schemes'
|
|
35
|
-
autoload :
|
|
36
|
-
autoload :CompressedFiles, 'imw/compressed_files'
|
|
37
|
-
autoload :Formats, 'imw/formats'
|
|
38
|
-
autoload :Tools, 'imw/tools'
|
|
11
|
+
autoload :Formats, 'imw/formats'
|
|
39
12
|
autoload :Parsers, 'imw/parsers'
|
|
40
|
-
autoload :Dataset, 'imw/dataset'
|
|
41
|
-
autoload :Repository, 'imw/repository'
|
|
42
|
-
autoload :Metadata, 'imw/metadata'
|
|
43
13
|
|
|
44
|
-
# Open a resource at the given +uri+. The resource will
|
|
45
|
-
# automatically be extended by modules which make sense given the
|
|
46
|
-
# +uri+.
|
|
47
|
-
#
|
|
48
|
-
# See the documentation for IMW::Resource and the various modules
|
|
49
|
-
# within IMW::Resources for more information and options.
|
|
50
|
-
#
|
|
51
|
-
# Passing in an IMW::Resource will simply return it.
|
|
52
|
-
#
|
|
53
|
-
# @param [String, Addressable::URI, IMW::Resource] obj the URI to open
|
|
54
|
-
# @param [Hash] options
|
|
55
|
-
# @option options [Array<String,Module>] as same as <tt>:use_modules</tt> in IMW::Resource.extend_instance!
|
|
56
|
-
# @option options [Array<String,Module>] without same as <tt>:skip_modules</tt> in IMW::Resource.extend_instance!
|
|
57
|
-
# @return [IMW::Resource] the resulting resource, property extended for the given URI
|
|
58
14
|
def self.open obj, options={}, &block
|
|
59
15
|
if obj.is_a?(IMW::Resource)
|
|
60
16
|
resource = obj
|
|
@@ -63,103 +19,35 @@ module IMW
|
|
|
63
19
|
options[:skip_modules] ||= (options[:without] || [])
|
|
64
20
|
resource = IMW::Resource.new(obj, options)
|
|
65
21
|
end
|
|
66
|
-
if block_given?
|
|
67
|
-
yield resource
|
|
68
|
-
resource.close
|
|
69
|
-
else
|
|
70
|
-
resource
|
|
71
|
-
end
|
|
72
22
|
end
|
|
73
23
|
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
24
|
+
class Resource
|
|
25
|
+
|
|
26
|
+
attr_reader :uri
|
|
27
|
+
|
|
28
|
+
def initialize(uri, mode='r')
|
|
29
|
+
raise FileModeError.new("'#{mode}' is not a valid access mode") unless valid_modes.include? mode
|
|
30
|
+
@uri = Uri.new(uri)
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
def self.open(uri, mode='r', &blk)
|
|
34
|
+
resource = Resource.new(uri, mode)
|
|
35
|
+
if block_given?
|
|
36
|
+
yield resource
|
|
37
|
+
else
|
|
38
|
+
return resource
|
|
89
39
|
end
|
|
90
|
-
else
|
|
91
|
-
new_dir = open(uri, options.merge(:as => (options[:as] || []) + [Schemes::Local::LocalDirectory]))
|
|
92
|
-
new_dir.create
|
|
93
40
|
end
|
|
94
|
-
new_dir
|
|
95
|
-
end
|
|
96
|
-
|
|
97
|
-
# Works the same way as IMW.open except opens the resource for
|
|
98
|
-
# writing.
|
|
99
|
-
#
|
|
100
|
-
# @param [String, Addressable::URI] uri the URI to open
|
|
101
|
-
# @return [IMW::Resource] the resultng resource, properly extended for the given URI and opened for writing.
|
|
102
|
-
def self.open! uri, options={}, &block
|
|
103
|
-
open(uri, options.merge(:mode => 'w'), &block)
|
|
104
|
-
end
|
|
105
41
|
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
#
|
|
110
|
-
# @return [IMW::Repository] the default IMW repository
|
|
111
|
-
def self.repository
|
|
112
|
-
@@repository ||= IMW::Repository.new
|
|
113
|
-
end
|
|
42
|
+
def self.exists? resource
|
|
43
|
+
true
|
|
44
|
+
end
|
|
114
45
|
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
# an elegant way.
|
|
120
|
-
#
|
|
121
|
-
# IMW.dataset :my_dataset do
|
|
122
|
-
#
|
|
123
|
-
# # Define some paths we're going to use
|
|
124
|
-
# add_path :original, :rawd, 'original.csv'
|
|
125
|
-
# add_path :filtered, :fixd, 'filtered.csv'
|
|
126
|
-
# add_path :package, :pkgd, 'filtered.tar.bz2'
|
|
127
|
-
#
|
|
128
|
-
# # Copy a CSV filefrom a website to this machine.
|
|
129
|
-
# rip do
|
|
130
|
-
# open('http://mysite.com/data_archives/2010/03/03.csv').cp(path_to(:original))
|
|
131
|
-
# end
|
|
132
|
-
#
|
|
133
|
-
# # Filter the original CSV data by the
|
|
134
|
-
# # <tt>meets_some_condition?</tt> method we define elsewhere...
|
|
135
|
-
# munge do
|
|
136
|
-
# open!(path_to(:filtered)) do |filtered|
|
|
137
|
-
# open(path_to(:original)).each do |row|
|
|
138
|
-
# filtered << row if meets_some_condition?(row)
|
|
139
|
-
# end
|
|
140
|
-
# end
|
|
141
|
-
#
|
|
142
|
-
# # Compress the filtered data to an archive.
|
|
143
|
-
# package do
|
|
144
|
-
# open(path_to(:filtered)).compress.mv(path_to(:package))
|
|
145
|
-
# end
|
|
146
|
-
# end
|
|
147
|
-
#
|
|
148
|
-
# See the <tt>/examples</tt> directory of the IMW distribution for
|
|
149
|
-
# more examples.
|
|
150
|
-
#
|
|
151
|
-
# @param [Symbol, String] handle the handle to identify this dataset with
|
|
152
|
-
# @param [Hash] options a hash of options (see IMW::Dataset)
|
|
153
|
-
# @return [IMW::Dataset] the new dataset
|
|
154
|
-
def self.dataset handle, options={}, &block
|
|
155
|
-
d = IMW::Dataset.new(handle, options.merge(:repository => IMW.repository))
|
|
156
|
-
d.instance_eval(&block) if block_given?
|
|
157
|
-
d
|
|
158
|
-
end
|
|
46
|
+
private
|
|
47
|
+
def valid_modes
|
|
48
|
+
%w[ r w a ]
|
|
49
|
+
end
|
|
159
50
|
|
|
160
|
-
end
|
|
51
|
+
end
|
|
161
52
|
|
|
162
|
-
# Works just like IMW.dataset but defined at a top-level scope.
|
|
163
|
-
def dataset handle, options={}, &block
|
|
164
|
-
IMW.dataset(handle, options, &block)
|
|
165
53
|
end
|
data/lib/imw/error.rb
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
module IMW
|
|
2
|
+
module Recordizer
|
|
3
|
+
class HTMLSelectorRecordizer
|
|
4
|
+
|
|
5
|
+
def self.element(*args, &block)
|
|
6
|
+
selector, name, delegate = parse_rule_declaration(*args, &block)
|
|
7
|
+
rules[name] = [selector, delegate]
|
|
8
|
+
attr_accessor name
|
|
9
|
+
name
|
|
10
|
+
end
|
|
11
|
+
|
|
12
|
+
def self.elements(*args, &block)
|
|
13
|
+
name = element(*args, &block)
|
|
14
|
+
rules[name] << true
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
def initialize
|
|
18
|
+
self.class.rules.each { |name, (s, k, plural)| send("#{name}=", []) if plural }
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
def self.recordize(doc)
|
|
22
|
+
self.new.recordize(doc)
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
def recordize(doc)
|
|
26
|
+
self.class.rules.each do |target, (selector, delegate, plural)|
|
|
27
|
+
if plural
|
|
28
|
+
send(target).concat doc.search(selector).map { |i| parse_result(i, delegate) }
|
|
29
|
+
else
|
|
30
|
+
send("#{target}=", parse_result(doc.at(selector), delegate))
|
|
31
|
+
end
|
|
32
|
+
end
|
|
33
|
+
self.to_hash
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
def to_hash
|
|
37
|
+
converter = lambda { |obj| obj.respond_to?(:to_hash) ? obj.to_hash : obj }
|
|
38
|
+
self.class.rules.keys.inject({}) do |hash, name|
|
|
39
|
+
value = send(name)
|
|
40
|
+
hash[name.to_sym] = Array === value ? value.map(&converter) : converter[value]
|
|
41
|
+
hash
|
|
42
|
+
end
|
|
43
|
+
end
|
|
44
|
+
|
|
45
|
+
protected
|
|
46
|
+
|
|
47
|
+
def parse_result(node, delegate)
|
|
48
|
+
if delegate
|
|
49
|
+
delegate.respond_to?(:call) ? delegate.call(node) : delegate.recordize(node)
|
|
50
|
+
elsif node.respond_to? :inner_text
|
|
51
|
+
node.inner_text
|
|
52
|
+
else
|
|
53
|
+
node
|
|
54
|
+
end unless node.nil?
|
|
55
|
+
end
|
|
56
|
+
|
|
57
|
+
private
|
|
58
|
+
|
|
59
|
+
def self.rules
|
|
60
|
+
@rules ||= {}
|
|
61
|
+
end
|
|
62
|
+
|
|
63
|
+
def self.inherited(subclass)
|
|
64
|
+
subclass.rules.update self.rules
|
|
65
|
+
end
|
|
66
|
+
|
|
67
|
+
# Rule declaration forms:
|
|
68
|
+
#
|
|
69
|
+
# { 'selector' => :property, :with => delegate }
|
|
70
|
+
# #=> ['selector', :property, delegate]
|
|
71
|
+
#
|
|
72
|
+
# :title
|
|
73
|
+
# #=> ['title', :title, nil]
|
|
74
|
+
def self.parse_rule_declaration(*args, &block)
|
|
75
|
+
options, name = Hash === args.last ? args.pop : {}, args.first
|
|
76
|
+
delegate = options.delete(:with)
|
|
77
|
+
selector, property = name ? [name.to_s, name.to_sym] : options.to_a.flatten
|
|
78
|
+
raise ArgumentError, "invalid rule declaration: #{args.inspect}" unless property
|
|
79
|
+
# eval block in context of a new scraper subclass
|
|
80
|
+
delegate = Class.new(delegate || Nibbler, &block) if block_given?
|
|
81
|
+
return selector, property, delegate
|
|
82
|
+
end
|
|
83
|
+
|
|
84
|
+
end
|
|
85
|
+
end
|
|
86
|
+
end
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
module IMW
|
|
2
|
+
module Recordizer
|
|
3
|
+
class StringSliceRecordizer
|
|
4
|
+
|
|
5
|
+
attr_reader :schema
|
|
6
|
+
|
|
7
|
+
def initialize ranges
|
|
8
|
+
@schema = ranges
|
|
9
|
+
end
|
|
10
|
+
|
|
11
|
+
def recordize line
|
|
12
|
+
format = schema
|
|
13
|
+
case format
|
|
14
|
+
when Array then slice_by_array(line, format)
|
|
15
|
+
when Hash then slice_by_hash(line, format)
|
|
16
|
+
end
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
def slice_range string, range
|
|
20
|
+
string.slice(range).strip
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
def slice_by_array string, format
|
|
24
|
+
format.map { |range| slice_range(string, range) }
|
|
25
|
+
end
|
|
26
|
+
|
|
27
|
+
def slice_by_hash string, format
|
|
28
|
+
format.inject({}) do |hsh, (key, val)|
|
|
29
|
+
case val
|
|
30
|
+
when Range then hsh[key] = slice_range(string, val)
|
|
31
|
+
when Hash then hsh[key] = slice_by_hash(string, val)
|
|
32
|
+
end
|
|
33
|
+
hsh
|
|
34
|
+
end
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
end
|
|
38
|
+
end
|
|
39
|
+
end
|
data/lib/imw/resource.rb
CHANGED
|
@@ -1,80 +1,10 @@
|
|
|
1
1
|
require 'imw/utils/has_uri'
|
|
2
2
|
|
|
3
3
|
module IMW
|
|
4
|
-
|
|
5
|
-
# A resource can be anything addressable via a URI. Examples
|
|
6
|
-
# include local files, remote files, webpages, &c.
|
|
7
|
-
#
|
|
8
|
-
# The IMW::Resource class takes a URI as input and then dynamically
|
|
9
|
-
# extends itself with appropriate modules from IMW. As an example,
|
|
10
|
-
# calling
|
|
11
|
-
#
|
|
12
|
-
# my_archive = IMW::Resource.new('/path/to/my/archive.tar.bz2')
|
|
13
|
-
#
|
|
14
|
-
# would return an IMW::Resource extended by
|
|
15
|
-
# IMW::Archives::Tarbz2 (among other modules) which
|
|
16
|
-
# therefore has methods for extracting, listing, and appending to
|
|
17
|
-
# the archive.
|
|
18
|
-
#
|
|
19
|
-
# Modules are so extended based on handlers defined in the
|
|
20
|
-
# <tt>imw/resources</tt> directory and accessible via
|
|
21
|
-
# IMW::Resource.handlers. You can define your own handlers by
|
|
22
|
-
# defining the constant IMW::Resource::USER_DEFINED_HANDLERS in your
|
|
23
|
-
# configuration file.
|
|
24
|
-
#
|
|
25
|
-
# The modules extending a particular IMW::Resource instance can be
|
|
26
|
-
# listed as follows
|
|
27
|
-
#
|
|
28
|
-
# my_archive.modules #=> [IMW::Local::Base, IMW::Local::File, IMW::Local::Compressible, IMW::Archives::Tarbz2]
|
|
29
|
-
#
|
|
30
|
-
# By default, resources are opened for reading. Passing in the
|
|
31
|
-
# appropriate <tt>:mode</tt> option changes this:
|
|
32
|
-
#
|
|
33
|
-
# IMW::Resource.new('/path/to/my_new_file', :mode => 'w')
|
|
34
|
-
#
|
|
35
|
-
# If the <tt>:skip_modules</tt> option is passed in then the
|
|
36
|
-
# resource will not extend itself with any modules and will
|
|
37
|
-
# essentially only retain the bare functionality of a URI. This can
|
|
38
|
-
# be useful when subclassing IMW::Resource or dealing with a very
|
|
39
|
-
# strange kind of resource.
|
|
40
|
-
#
|
|
41
|
-
# Read the documentation for modules in IMW::Resources to learn more
|
|
42
|
-
# about the various behaviors an IMW::Resource can acquire.
|
|
43
|
-
#
|
|
44
|
-
# You can also instantiate an IMW::Resource using IMW.open, which
|
|
45
|
-
# accepts all the same arguments as IMW::Resource.new.
|
|
46
4
|
class Resource
|
|
47
5
|
|
|
48
|
-
|
|
49
|
-
attr_accessor :mode
|
|
50
|
-
|
|
51
|
-
# A copy of the options passed to this resource on initialization.
|
|
52
|
-
attr_accessor :resource_options
|
|
6
|
+
attr_accessor :mode, :resource_options
|
|
53
7
|
|
|
54
|
-
# Create a new resource representing +uri+.
|
|
55
|
-
#
|
|
56
|
-
# IMW will automatically extend the resulting IMW::Resource
|
|
57
|
-
# instance with modules appropriate for the given URI:
|
|
58
|
-
#
|
|
59
|
-
# r = IMW::Resource.new("http://www.infochimps.com")
|
|
60
|
-
# r.modules
|
|
61
|
-
# => [IMW::Schemes::Remote::Base, IMW::Schemes::Remote::RemoteFile, IMW::Schemes::HTTP, IMW::Formats::Html]
|
|
62
|
-
#
|
|
63
|
-
# You can prevent this altogether by passing in
|
|
64
|
-
# <tt>:no_modules</tt>:
|
|
65
|
-
#
|
|
66
|
-
# r = IMW::Resource.new("http://www.infochimps.com", :no_modules => true)
|
|
67
|
-
# r.modules
|
|
68
|
-
# => []
|
|
69
|
-
#
|
|
70
|
-
# And you can exert more fine-grained control with the
|
|
71
|
-
# <tt>:use_modules</tt> and <tt>:skip_modules</tt> options, see
|
|
72
|
-
# IMW::Resource.extend_instance! for details.
|
|
73
|
-
#
|
|
74
|
-
# @param [String, Addressable::URI] uri
|
|
75
|
-
# @param [Hash] options
|
|
76
|
-
# @option options [String] mode the mode to open the resource in (will be ignored when inapplicable)
|
|
77
|
-
# @return [IMW::Resource]
|
|
78
8
|
def initialize uri, options={}
|
|
79
9
|
self.uri = uri
|
|
80
10
|
self.resource_options = options
|
|
@@ -85,20 +15,14 @@ module IMW
|
|
|
85
15
|
# Provides resources with a wrapped Addressable::URI object.
|
|
86
16
|
include IMW::Utils::HasURI
|
|
87
17
|
|
|
88
|
-
# Provides resources with a summary, metadata, & schema.
|
|
89
|
-
include IMW::Metadata::HasSummary
|
|
90
|
-
|
|
91
18
|
# Gives IMW::Resource instances with the ability to dynamically
|
|
92
19
|
# extend themselves with modules chosen from a set of handlers
|
|
93
20
|
# stored by the IMW::Resource class.
|
|
94
21
|
include IMW::Utils::DynamicallyExtendable
|
|
95
|
-
[IMW::Schemes::HANDLERS, IMW::
|
|
22
|
+
[IMW::Schemes::HANDLERS, IMW::Formats::HANDLERS].each do |handlers|
|
|
96
23
|
register_handlers *handlers
|
|
97
24
|
end
|
|
98
|
-
|
|
99
|
-
# Raise an error unless this resource exists.
|
|
100
|
-
#
|
|
101
|
-
# @param [String] message an optional message to include
|
|
25
|
+
|
|
102
26
|
def should_exist!(message=nil)
|
|
103
27
|
raise IMW::Error.new([message, "No path defined for #{self.inspect} extended by #{modules.join(' ')}"].compact.join(', ')) unless respond_to?(:path)
|
|
104
28
|
raise IMW::Error.new([message, "No exist? method defined for #{self.inspect} extended by #{modules.join(' ')}"].compact.join(', ')) unless respond_to?(:exist?)
|
|
@@ -106,52 +30,12 @@ module IMW
|
|
|
106
30
|
self
|
|
107
31
|
end
|
|
108
32
|
|
|
109
|
-
# Close this resource.
|
|
110
|
-
#
|
|
111
|
-
# Modules should hook into super() as they need to redefine this
|
|
112
|
-
# method.
|
|
113
33
|
def close
|
|
114
34
|
end
|
|
115
35
|
|
|
116
|
-
# Open a copy of this resource.
|
|
117
|
-
#
|
|
118
|
-
# This is useful when wanting to reset file handles. Though -- be
|
|
119
|
-
# warned -- it does not close any file handles itself...
|
|
120
|
-
#
|
|
121
|
-
# @return [IMW::Resource] the new (old) resource
|
|
122
36
|
def reopen
|
|
123
37
|
IMW.open(uri.to_s)
|
|
124
38
|
end
|
|
125
39
|
|
|
126
|
-
# If +method+ begins with the strings +is+, +on+, or +via+ and
|
|
127
|
-
# ends with a question mark then we interpret it as a question
|
|
128
|
-
# this resource doesn't know how to answer -- so we have it answer
|
|
129
|
-
# +false+.
|
|
130
|
-
#
|
|
131
|
-
# As an example, consider the following loop:
|
|
132
|
-
#
|
|
133
|
-
# IMW.open('/tmp').all_contents.each do |obj|
|
|
134
|
-
# if obj.is_archive?
|
|
135
|
-
# # ... do something
|
|
136
|
-
# end
|
|
137
|
-
# end
|
|
138
|
-
#
|
|
139
|
-
# When +obj+ is initialized and it _isn't_ an archive, then it
|
|
140
|
-
# doesn't know about the <tt>is_archive?</tt> method -- but it
|
|
141
|
-
# should therefore answer false anyway.
|
|
142
|
-
#
|
|
143
|
-
# This lets a basic text file answer questions about whether it's
|
|
144
|
-
# an archive (or on S3, or accessed via some user-defined scheme,
|
|
145
|
-
# &c.) without needing to know anything about archives (or S3 or
|
|
146
|
-
# the user-defined scheme).
|
|
147
|
-
def method_missing method, *args
|
|
148
|
-
if args.empty? && method.to_s =~ /(is|on|via)_.*\?$/
|
|
149
|
-
# querying for a boolean response so answer false
|
|
150
|
-
return false
|
|
151
|
-
else
|
|
152
|
-
raise IMW::NoMethodError, "undefined method `#{method}' for #{self}, extended by #{modules.join(', ')}"
|
|
153
|
-
end
|
|
154
|
-
end
|
|
155
|
-
|
|
156
40
|
end
|
|
157
41
|
end
|