nibbler 1.0 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/Rakefile +3 -3
- data/lib/nibbler.rb +43 -31
- metadata +4 -3
data/Rakefile
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
task :default => :spec
|
1
|
+
task :default => [:loc, :spec]
|
2
2
|
|
3
3
|
desc %(Run specs)
|
4
4
|
task :spec do
|
@@ -13,12 +13,12 @@ task :loc do
|
|
13
13
|
file.each_line do |line|
|
14
14
|
case line
|
15
15
|
when /^class\b/ then counting = true
|
16
|
-
when /^\s*(
|
16
|
+
when /^\s*(#|$)/ then next
|
17
17
|
when /^end\b/ then break
|
18
18
|
end
|
19
19
|
loc += 1 if counting
|
20
20
|
end
|
21
21
|
|
22
|
-
puts loc
|
22
|
+
puts "#{loc} lines of code"
|
23
23
|
end
|
24
24
|
end
|
data/lib/nibbler.rb
CHANGED
@@ -1,20 +1,8 @@
|
|
1
|
-
|
2
|
-
|
1
|
+
# A minimalistic, declarative HTML scraper
|
3
2
|
class Nibbler
|
4
3
|
attr_reader :doc
|
5
4
|
|
6
|
-
#
|
7
|
-
def initialize(doc)
|
8
|
-
@doc = self.class.convert_document(doc)
|
9
|
-
initialize_plural_accessors
|
10
|
-
end
|
11
|
-
|
12
|
-
# Initialize a new scraper and process data
|
13
|
-
def self.parse(html)
|
14
|
-
new(html).parse
|
15
|
-
end
|
16
|
-
|
17
|
-
# Specify a new singular scraping rule
|
5
|
+
# Declare a singular scraping rule
|
18
6
|
def self.element(*args, &block)
|
19
7
|
selector, name, delegate = parse_rule_declaration(*args, &block)
|
20
8
|
rules[name] = [selector, delegate]
|
@@ -22,19 +10,27 @@ class Nibbler
|
|
22
10
|
name
|
23
11
|
end
|
24
12
|
|
25
|
-
#
|
13
|
+
# Declare a plural scraping rule
|
26
14
|
def self.elements(*args, &block)
|
27
15
|
name = element(*args, &block)
|
28
16
|
rules[name] << true
|
29
17
|
end
|
30
18
|
|
31
|
-
#
|
19
|
+
# Process data by creating a new scraper
|
20
|
+
def self.parse(data) new(data).parse end
|
21
|
+
|
22
|
+
# Initialize the parser with raw data or a document
|
23
|
+
def initialize(data)
|
24
|
+
@doc = self.class.convert_document(data)
|
25
|
+
# initialize plural properties
|
26
|
+
self.class.rules.each { |name, (s, k, plural)| send("#{name}=", []) if plural }
|
27
|
+
end
|
28
|
+
|
29
|
+
# Parse the document and save values returned by selectors
|
32
30
|
def parse
|
33
31
|
self.class.rules.each do |target, (selector, delegate, plural)|
|
34
32
|
if plural
|
35
|
-
@doc.search(selector).
|
36
|
-
send(target) << parse_result(node, delegate)
|
37
|
-
end
|
33
|
+
send(target).concat @doc.search(selector).map { |i| parse_result(i, delegate) }
|
38
34
|
else
|
39
35
|
send("#{target}=", parse_result(@doc.at(selector), delegate))
|
40
36
|
end
|
@@ -42,6 +38,16 @@ class Nibbler
|
|
42
38
|
self
|
43
39
|
end
|
44
40
|
|
41
|
+
# Dump the extracted data into a hash with symbolized keys
|
42
|
+
def to_hash
|
43
|
+
converter = lambda { |obj| obj.respond_to?(:to_hash) ? obj.to_hash : obj }
|
44
|
+
self.class.rules.keys.inject({}) do |hash, name|
|
45
|
+
value = send(name)
|
46
|
+
hash[name.to_sym] = Array === value ? value.map(&converter) : converter[value]
|
47
|
+
hash
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
45
51
|
protected
|
46
52
|
|
47
53
|
# `delegate` is optional, but should respond to `call` or `parse`
|
@@ -57,18 +63,20 @@ class Nibbler
|
|
57
63
|
|
58
64
|
private
|
59
65
|
|
66
|
+
# Parsing rules declared with `element` or `elements`
|
60
67
|
def self.rules
|
61
68
|
@rules ||= {}
|
62
69
|
end
|
63
70
|
|
71
|
+
# Make subclasses inherit the parsing rules
|
64
72
|
def self.inherited(subclass)
|
65
73
|
subclass.rules.update self.rules
|
66
74
|
end
|
67
75
|
|
68
|
-
# Rule declaration
|
76
|
+
# Rule declaration forms:
|
69
77
|
#
|
70
|
-
# { '
|
71
|
-
# #=> ['
|
78
|
+
# { 'selector' => :property, :with => delegate }
|
79
|
+
# #=> ['selector', :property, delegate]
|
72
80
|
#
|
73
81
|
# :title
|
74
82
|
# #=> ['title', :title, nil]
|
@@ -82,18 +90,12 @@ class Nibbler
|
|
82
90
|
return selector, property, delegate
|
83
91
|
end
|
84
92
|
|
85
|
-
|
86
|
-
self.class.rules.each do |name, (s, k, plural)|
|
87
|
-
send("#{name}=", []) if plural
|
88
|
-
end
|
89
|
-
end
|
90
|
-
|
93
|
+
# Parse data with Nokogiri unless it's already an acceptable document
|
91
94
|
def self.convert_document(doc)
|
92
|
-
|
95
|
+
if doc.respond_to?(:at) and doc.respond_to?(:search) then doc
|
96
|
+
else
|
93
97
|
require 'nokogiri' unless defined? ::Nokogiri
|
94
98
|
Nokogiri doc
|
95
|
-
else
|
96
|
-
doc
|
97
99
|
end
|
98
100
|
end
|
99
101
|
end
|
@@ -180,6 +182,16 @@ if __FILE__ == $0
|
|
180
182
|
@blog.navigation_items.should == %w[Home About Help]
|
181
183
|
end
|
182
184
|
|
185
|
+
it "should convert to hash" do
|
186
|
+
hash = @blog.to_hash
|
187
|
+
hash[:navigation_items].should == %w[Home About Help]
|
188
|
+
hash[:title].should == "Maximum awesome"
|
189
|
+
article = hash[:articles].first
|
190
|
+
article[:title] == "First article"
|
191
|
+
article.key?(:link).should be_true
|
192
|
+
article[:link].should be_nil
|
193
|
+
end
|
194
|
+
|
183
195
|
it "should have title, pubdate for first article" do
|
184
196
|
article = @blog.articles[0]
|
185
197
|
article.title.should == 'First article'
|
metadata
CHANGED
@@ -1,12 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: nibbler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 19
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 1
|
8
|
+
- 1
|
8
9
|
- 0
|
9
|
-
version: "1.0"
|
10
|
+
version: 1.1.0
|
10
11
|
platform: ruby
|
11
12
|
authors:
|
12
13
|
- "Mislav Marohni\xC4\x87"
|
@@ -14,7 +15,7 @@ autorequire:
|
|
14
15
|
bindir: bin
|
15
16
|
cert_chain: []
|
16
17
|
|
17
|
-
date: 2010-08-
|
18
|
+
date: 2010-08-30 00:00:00 +02:00
|
18
19
|
default_executable:
|
19
20
|
dependencies:
|
20
21
|
- !ruby/object:Gem::Dependency
|