nibbler 1.0 → 1.1.0
This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- data/Rakefile +3 -3
- data/lib/nibbler.rb +43 -31
- metadata +4 -3
data/Rakefile
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
task :default => :spec
|
1
|
+
task :default => [:loc, :spec]
|
2
2
|
|
3
3
|
desc %(Run specs)
|
4
4
|
task :spec do
|
@@ -13,12 +13,12 @@ task :loc do
|
|
13
13
|
file.each_line do |line|
|
14
14
|
case line
|
15
15
|
when /^class\b/ then counting = true
|
16
|
-
when /^\s*(
|
16
|
+
when /^\s*(#|$)/ then next
|
17
17
|
when /^end\b/ then break
|
18
18
|
end
|
19
19
|
loc += 1 if counting
|
20
20
|
end
|
21
21
|
|
22
|
-
puts loc
|
22
|
+
puts "#{loc} lines of code"
|
23
23
|
end
|
24
24
|
end
|
data/lib/nibbler.rb
CHANGED
@@ -1,20 +1,8 @@
|
|
1
|
-
|
2
|
-
|
1
|
+
# A minimalistic, declarative HTML scraper
|
3
2
|
class Nibbler
|
4
3
|
attr_reader :doc
|
5
4
|
|
6
|
-
#
|
7
|
-
def initialize(doc)
|
8
|
-
@doc = self.class.convert_document(doc)
|
9
|
-
initialize_plural_accessors
|
10
|
-
end
|
11
|
-
|
12
|
-
# Initialize a new scraper and process data
|
13
|
-
def self.parse(html)
|
14
|
-
new(html).parse
|
15
|
-
end
|
16
|
-
|
17
|
-
# Specify a new singular scraping rule
|
5
|
+
# Declare a singular scraping rule
|
18
6
|
def self.element(*args, &block)
|
19
7
|
selector, name, delegate = parse_rule_declaration(*args, &block)
|
20
8
|
rules[name] = [selector, delegate]
|
@@ -22,19 +10,27 @@ class Nibbler
|
|
22
10
|
name
|
23
11
|
end
|
24
12
|
|
25
|
-
#
|
13
|
+
# Declare a plural scraping rule
|
26
14
|
def self.elements(*args, &block)
|
27
15
|
name = element(*args, &block)
|
28
16
|
rules[name] << true
|
29
17
|
end
|
30
18
|
|
31
|
-
#
|
19
|
+
# Process data by creating a new scraper
|
20
|
+
def self.parse(data) new(data).parse end
|
21
|
+
|
22
|
+
# Initialize the parser with raw data or a document
|
23
|
+
def initialize(data)
|
24
|
+
@doc = self.class.convert_document(data)
|
25
|
+
# initialize plural properties
|
26
|
+
self.class.rules.each { |name, (s, k, plural)| send("#{name}=", []) if plural }
|
27
|
+
end
|
28
|
+
|
29
|
+
# Parse the document and save values returned by selectors
|
32
30
|
def parse
|
33
31
|
self.class.rules.each do |target, (selector, delegate, plural)|
|
34
32
|
if plural
|
35
|
-
@doc.search(selector).
|
36
|
-
send(target) << parse_result(node, delegate)
|
37
|
-
end
|
33
|
+
send(target).concat @doc.search(selector).map { |i| parse_result(i, delegate) }
|
38
34
|
else
|
39
35
|
send("#{target}=", parse_result(@doc.at(selector), delegate))
|
40
36
|
end
|
@@ -42,6 +38,16 @@ class Nibbler
|
|
42
38
|
self
|
43
39
|
end
|
44
40
|
|
41
|
+
# Dump the extracted data into a hash with symbolized keys
|
42
|
+
def to_hash
|
43
|
+
converter = lambda { |obj| obj.respond_to?(:to_hash) ? obj.to_hash : obj }
|
44
|
+
self.class.rules.keys.inject({}) do |hash, name|
|
45
|
+
value = send(name)
|
46
|
+
hash[name.to_sym] = Array === value ? value.map(&converter) : converter[value]
|
47
|
+
hash
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
45
51
|
protected
|
46
52
|
|
47
53
|
# `delegate` is optional, but should respond to `call` or `parse`
|
@@ -57,18 +63,20 @@ class Nibbler
|
|
57
63
|
|
58
64
|
private
|
59
65
|
|
66
|
+
# Parsing rules declared with `element` or `elements`
|
60
67
|
def self.rules
|
61
68
|
@rules ||= {}
|
62
69
|
end
|
63
70
|
|
71
|
+
# Make subclasses inherit the parsing rules
|
64
72
|
def self.inherited(subclass)
|
65
73
|
subclass.rules.update self.rules
|
66
74
|
end
|
67
75
|
|
68
|
-
# Rule declaration
|
76
|
+
# Rule declaration forms:
|
69
77
|
#
|
70
|
-
# { '
|
71
|
-
# #=> ['
|
78
|
+
# { 'selector' => :property, :with => delegate }
|
79
|
+
# #=> ['selector', :property, delegate]
|
72
80
|
#
|
73
81
|
# :title
|
74
82
|
# #=> ['title', :title, nil]
|
@@ -82,18 +90,12 @@ class Nibbler
|
|
82
90
|
return selector, property, delegate
|
83
91
|
end
|
84
92
|
|
85
|
-
|
86
|
-
self.class.rules.each do |name, (s, k, plural)|
|
87
|
-
send("#{name}=", []) if plural
|
88
|
-
end
|
89
|
-
end
|
90
|
-
|
93
|
+
# Parse data with Nokogiri unless it's already an acceptable document
|
91
94
|
def self.convert_document(doc)
|
92
|
-
|
95
|
+
if doc.respond_to?(:at) and doc.respond_to?(:search) then doc
|
96
|
+
else
|
93
97
|
require 'nokogiri' unless defined? ::Nokogiri
|
94
98
|
Nokogiri doc
|
95
|
-
else
|
96
|
-
doc
|
97
99
|
end
|
98
100
|
end
|
99
101
|
end
|
@@ -180,6 +182,16 @@ if __FILE__ == $0
|
|
180
182
|
@blog.navigation_items.should == %w[Home About Help]
|
181
183
|
end
|
182
184
|
|
185
|
+
it "should convert to hash" do
|
186
|
+
hash = @blog.to_hash
|
187
|
+
hash[:navigation_items].should == %w[Home About Help]
|
188
|
+
hash[:title].should == "Maximum awesome"
|
189
|
+
article = hash[:articles].first
|
190
|
+
article[:title] == "First article"
|
191
|
+
article.key?(:link).should be_true
|
192
|
+
article[:link].should be_nil
|
193
|
+
end
|
194
|
+
|
183
195
|
it "should have title, pubdate for first article" do
|
184
196
|
article = @blog.articles[0]
|
185
197
|
article.title.should == 'First article'
|
metadata
CHANGED
@@ -1,12 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: nibbler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 19
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 1
|
8
|
+
- 1
|
8
9
|
- 0
|
9
|
-
version:
|
10
|
+
version: 1.1.0
|
10
11
|
platform: ruby
|
11
12
|
authors:
|
12
13
|
- "Mislav Marohni\xC4\x87"
|
@@ -14,7 +15,7 @@ autorequire:
|
|
14
15
|
bindir: bin
|
15
16
|
cert_chain: []
|
16
17
|
|
17
|
-
date: 2010-08-
|
18
|
+
date: 2010-08-30 00:00:00 +02:00
|
18
19
|
default_executable:
|
19
20
|
dependencies:
|
20
21
|
- !ruby/object:Gem::Dependency
|