nibbler 1.2.1 → 1.3.0
- data/README.md +69 -27
- data/Rakefile +8 -4
- data/examples/delicious.rb +16 -18
- data/examples/tweetburner.rb +18 -9
- data/examples/twitter.rb +2 -2
- data/lib/nibbler.rb +97 -70
- data/lib/nibbler/json.rb +174 -13
- metadata +23 -43
data/README.md
CHANGED
@@ -1,32 +1,71 @@
 Nibbler
 =======
 
-*Nibbler* is a
-[old README lines 5-29 are not preserved in this diff view]
+*Nibbler* is a small little tool (~100 LOC) that helps you map data structures to objects that you define.
+
+It can be used for HTML screen scraping:
+
+~~~ ruby
+require 'nibbler'
+require 'open-uri'
+
+class BlogScraper < Nibbler
+  element :title
+
+  elements 'div.hentry' => :articles do
+    element 'h2' => :title
+    element 'a/@href' => :url
+  end
+end
+
+blog = BlogScraper.parse open('http://example.com')
+
+blog.title
+#=> "My blog title"
+
+blog.articles.first.title
+#=> "First article title"
+
+blog.articles.first.url
+#=> "http://example.com/article"
+~~~
+
+For mapping XML API payloads:
+
+~~~ ruby
+class Movie < Nibbler
+  element './title/@regular' => :name
+  element './box_art/@large' => :poster_large
+  element 'release_year' => :year, :with => lambda { |node| node.text.to_i }
+  element './/link[@title="web page"]/@href' => :url
+end
+
+response = Net::HTTP.get_response URI('http://example.com/movie.xml')
+movie = Movie.parse response.body
+
+movie.name  #=> "Toy Story 3"
+movie.year  #=> 2010
+~~~
+
+Or even for JSON:
+
+~~~ ruby
+require 'json'
+require 'nibbler/json'
+
+class Movie < NibblerJSON
+  element :title
+  element :year
+  elements :genres
+  # JSONPath selectors:
+  element '.links.alternate' => :url
+  element '.ratings.critics_score' => :critics_score
+end
+
+movie = Movie.parse json_string
+~~~
+
+There are sample scripts in the "examples/" directory:
 
     ruby -Ilib -rubygems examples/delicious.rb
    ruby -Ilib -rubygems examples/tweetburner.rb > output.csv
@@ -36,7 +75,10 @@ There are sample scripts in the "examples/" directory; run them with:
 Requirements
 ------------
 
-*None*. Well, [Nokogiri][] is a requirement if you pass in HTML
+*None*. Well, [Nokogiri][] is a requirement if you pass in an HTML string for parsing, like in the example above. Otherwise you can initialize the scraper with an
+Hpricot document or anything else that implements `at(selector)` and `search(selector)` methods.
+
+NibblerJSON needs a JSON parser if string content is passed, so "json" library should be installed on Ruby 1.8.
 
 
 [wiki]: http://wiki.github.com/mislav/nibbler
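The new Requirements paragraph makes a concrete promise: Nokogiri is only pulled in when you hand `Nibbler` a raw string, because anything that responds to `at(selector)` and `search(selector)` is accepted as a document. A minimal sketch of that contract; the `FakeDoc` class and its canned data are illustrative, not part of the gem:

~~~ ruby
require 'nibbler'

# Hypothetical stand-in document: responds to at/search like Nokogiri would.
class FakeDoc
  NODES = { 'h1' => 'Hello' }

  def at(selector) NODES[selector] end                         # first match, or nil
  def search(selector) NODES.values_at(selector).compact end   # all matches, as an array
end

class Page < Nibbler
  element 'h1' => :heading
end

Page.parse(FakeDoc.new).heading  #=> "Hello"
~~~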
data/Rakefile
CHANGED
@@ -2,7 +2,11 @@ task :default => [:loc, :spec]
 
 desc %(Run specs)
 task :spec do
-
+  tests = []
+  tests << %(ruby -Ilib -rubygems lib/nibbler.rb --color)
+  tests << %(ruby -Ilib -rubygems lib/nibbler/json.rb)
+
+  exit(1) if tests.any? {|cmd| !sh(cmd) }
 end
 
 desc %(Count lines of code in implementation)
@@ -12,13 +16,13 @@ task :loc do
 
     file.each_line do |line|
       case line
-      when /^class\b/
+      when /^(class|module)\b/ then counting = true
       when /^\s*(#|$)/ then next
-      when /^end\b/
+      when /^end\b/ then break
       end
       loc += 1 if counting
     end
 
     puts "#{loc} lines of code"
   end
-end
+end
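The reworked `:spec` task works because both library files are self-testing: each ends in an `if __FILE__ == $0` guard that runs its specs only when the file is executed directly, as seen at the bottom of the lib/nibbler.rb and lib/nibbler/json.rb diffs below. A minimal sketch of the pattern; the `Widget` file is hypothetical:

~~~ ruby
# lib/widget.rb -- hypothetical self-testing library file
class Widget
  def frob() 'frobbed' end
end

if __FILE__ == $0        # true only when run as `ruby lib/widget.rb`
  require 'minitest/spec'
  require 'minitest/autorun'

  describe Widget do
    it 'frobs' do
      Widget.new.frob.must_equal 'frobbed'
    end
  end
end
~~~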
data/examples/delicious.rb
CHANGED
@@ -5,33 +5,31 @@
 
 require 'nibbler'
 require 'open-uri'
-require 'date'
 
 # extracts data from a single bookmark
 class Bookmark < Nibbler
-  element '
-  element '.
-
-
-
-
+  element '.body .title' => :title
+  element '.note' => :description
+
+  element '.sub span' => :url
+
   # tags are plural
-  elements '
-
-  #
-  element '
-    Date.strptime(span.inner_text.strip, '%d %b %y')
-  }
+  elements '.tag .name' => :tags
+
+  # extract timestamp from HTML attribute
+  element './@date' => :date, :with => lambda { |timestamp| Time.at timestamp.text.to_i }
 end
 
 # finds all bookmarks on the page
 class Delicious < Nibbler
-  elements '
+  elements '.content .linkList .link' => :bookmarks, :with => Bookmark
 end
 
 mislav = Delicious.parse open('http://delicious.com/mislav/ruby')
-bookmark = mislav.bookmarks.first
 
-
-
-
+mislav.bookmarks[0,3].each do |bookmark|
+  puts bookmark.title   #=> "Some title"
+  p bookmark.tags       #=> ['foo', 'bar', ...]
+  puts bookmark.date    #=> <Date>
+  puts
+end
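The rewritten example shows both delegate forms side by side: a lambda for one-off value conversion (`Time.at` on the `date` attribute) and a whole scraper class (`Bookmark`) for nested structures. Because every scraper also gains `to_hash` (see the lib/nibbler.rb diff below), the parsed page can be dumped wholesale; a rough sketch, with placeholder values:

~~~ ruby
mislav = Delicious.parse open('http://delicious.com/mislav/ruby')

# nested Bookmark scrapers serialize recursively via to_hash
mislav.to_hash
#=> { :bookmarks => [ { :title => "...", :description => "...", :url => "...",
#      :tags => ["...", "..."], :date => <Time> }, ... ] }
~~~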
data/examples/tweetburner.rb
CHANGED
@@ -1,3 +1,5 @@
+# encoding: utf-8
+#
 ## Tweetburner.com archive dump
 #
 # I needed to dump my Tweetburner archive to CSV
@@ -14,8 +16,15 @@ module Tweetburner
   SITE = URI('http://tweetburner.com')
 
   class Scraper < ::Nibbler
-
-
+    def initialize url
+      doc = get_document url
+      super doc
+    end
+
+    private
+
+    # open web pages with UTF-8 encoding
+    def get_document(url)
       URI === url ? Nokogiri::HTML::Document.parse(open(url), url.to_s, 'UTF-8') : url
     rescue OpenURI::HTTPError
       $stderr.puts "ERROR opening #{url}"
@@ -31,7 +40,7 @@ module Tweetburner
     element '.col-tweet-text' => :text, :with => lambda { |node|
       node.text.sub(/\s+– .+?$/, '')
     }
-    element '.col-clicks' => :clicks
+    element '.col-clicks' => :clicks, :with => lambda { |node| node.inner_text.to_i }
     element '.col-created-at' => :created_at, :with => lambda { |node| DateTime.parse node.text }
 
     def stats
@@ -58,18 +67,18 @@ module Tweetburner
     def parse
       super
       if next_page_url
-        @doc =
+        @doc = get_document(URI(next_page_url))
         self.parse
-      else
-        self
       end
+      self
     end
 
     def to_csv(io = STDOUT)
       io.sync = true if io == STDOUT
-
-
-
+      CSV(io) do |csv|
+        links.each do |link|
+          csv << [link.text, link.clicks, link.created_at, link.stats.destination]
+        end
       end
     end
   end
data/examples/twitter.rb
CHANGED
@@ -11,11 +11,11 @@ require 'time'
 
 # now here's the real deal
 class Twitter < NibblerJSON
-  elements :tweets
+  elements :tweets do
     element :created_at, :with => lambda { |time| Time.parse(time) }
     element :text
     element :id
-    element 'user' => :author
+    element 'user' => :author do
       element 'name' => :full_name
       element 'screen_name' => :username
     end
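Both edits switch flat rules to the block form: a block passed to `element`/`elements` is evaluated in the context of an anonymous scraper subclass (see `parse_rule_declaration` in the lib/nibbler.rb diff below), so nested JSON maps onto nested objects. A rough sketch of the result, assuming a trimmed Twitter-style payload; the sample data is made up:

~~~ ruby
# hypothetical payload containing only the declared fields
json = '{"tweets": [{"created_at": "2012-01-17 12:00:00 UTC", "text": "hi",
         "id": 1, "user": {"name": "Mislav", "screen_name": "mislav"}}]}'

tweet = Twitter.parse(json).tweets.first
tweet.text              #=> "hi"
tweet.created_at.class  #=> Time
tweet.author.username   #=> "mislav"
~~~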
data/lib/nibbler.rb
CHANGED
@@ -1,110 +1,137 @@
-#
-
-
-
+# DSL for defining data extraction rules from an abstract document object
+module NibblerMethods
+  def self.extended(base)
+    base.send(:include, InstanceMethods) if base.is_a? Class
+  end
+
   # Declare a singular scraping rule
-  def
+  def element(*args, &block)
     selector, name, delegate = parse_rule_declaration(*args, &block)
     rules[name] = [selector, delegate]
     attr_accessor name
     name
   end
-
+
   # Declare a plural scraping rule
-  def
+  def elements(*args, &block)
     name = element(*args, &block)
     rules[name] << true
   end
-
-  # Process data by creating a new scraper
-  def self.parse(data) new(data).parse end
-
-  # Initialize the parser with raw data or a document
-  def initialize(data)
-    @doc = self.class.convert_document(data)
-    # initialize plural properties
-    self.class.rules.each { |name, (s, k, plural)| send("#{name}=", []) if plural }
-  end
-
-  # Parse the document and save values returned by selectors
-  def parse
-    self.class.rules.each do |target, (selector, delegate, plural)|
-      if plural
-        send(target).concat @doc.search(selector).map { |i| parse_result(i, delegate) }
-      else
-        send("#{target}=", parse_result(@doc.at(selector), delegate))
-      end
-    end
-    self
-  end
-
-  # Dump the extracted data into a hash with symbolized keys
-  def to_hash
-    converter = lambda { |obj| obj.respond_to?(:to_hash) ? obj.to_hash : obj }
-    self.class.rules.keys.inject({}) do |hash, name|
-      value = send(name)
-      hash[name.to_sym] = Array === value ? value.map(&converter) : converter[value]
-      hash
-    end
-  end
-
-  protected
-
-  # `delegate` is optional, but should respond to `call` or `parse`
-  def parse_result(node, delegate)
-    if delegate
-      delegate.respond_to?(:call) ? delegate.call(node) : delegate.parse(node)
-    elsif node.respond_to? :inner_text
-      node.inner_text
-    else
-      node
-    end unless node.nil?
-  end
-
-  private
-
+
   # Parsing rules declared with `element` or `elements`
-  def
+  def rules
     @rules ||= {}
   end
-
+
+  # Process data by creating a new instance
+  def parse(doc) new(doc).parse end
+
+  private
+
   # Make subclasses inherit the parsing rules
-  def
+  def inherited(subclass)
+    super
     subclass.rules.update self.rules
   end
-
+
   # Rule declaration forms:
-  #
+  #
   #   { 'selector' => :property, :with => delegate }
   #   #=> ['selector', :property, delegate]
-  #
+  #
   #   :title
   #   #=> ['title', :title, nil]
-  def
+  def parse_rule_declaration(*args, &block)
     options, name = Hash === args.last ? args.pop : {}, args.first
     delegate = options.delete(:with)
     selector, property = name ? [name.to_s, name.to_sym] : options.to_a.flatten
     raise ArgumentError, "invalid rule declaration: #{args.inspect}" unless property
     # eval block in context of a new scraper subclass
-    delegate = Class.new(delegate ||
+    delegate = Class.new(delegate || base_parser_class, &block) if block_given?
     return selector, property, delegate
   end
-
+
+  def base_parser_class
+    klass = self
+    klass = klass.superclass until klass.superclass == Object
+    klass
+  end
+
+  module InstanceMethods
+    attr_reader :doc
+
+    # Initialize the parser with a document
+    def initialize(doc)
+      @doc = doc
+      # initialize plural properties
+      self.class.rules.each { |name, (s, k, plural)| send("#{name}=", []) if plural }
+    end
+
+    # Parse the document and save values returned by selectors
+    def parse
+      self.class.rules.each do |target, (selector, delegate, plural)|
+        if plural
+          send(target).concat @doc.search(selector).map { |i| parse_result(i, delegate) }
+        else
+          send("#{target}=", parse_result(@doc.at(selector), delegate))
+        end
+      end
+      self
+    end
+
+    # Dump the extracted data into a hash with symbolized keys
+    def to_hash
+      converter = lambda { |obj| obj.respond_to?(:to_hash) ? obj.to_hash : obj }
+      self.class.rules.keys.inject({}) do |hash, name|
+        value = send(name)
+        hash[name.to_sym] = Array === value ? value.map(&converter) : converter[value]
+        hash
+      end
+    end
+
+    protected
+
+    # `delegate` is optional, but should respond to `call` or `parse`
+    def parse_result(node, delegate)
+      if delegate
+        method = delegate.is_a?(Proc) ? delegate : delegate.method(delegate.respond_to?(:call) ? :call : :parse)
+        method.arity == 1 ? method[node] : method[node, self]
+      else
+        node
+      end unless node.nil?
+    end
+  end
+end
+
+# An HTML/XML scraper
+class Nibbler
+  extend NibblerMethods
+
   # Parse data with Nokogiri unless it's already an acceptable document
-  def
-
-  else
+  def initialize(doc)
+    unless doc.respond_to?(:at) and doc.respond_to?(:search)
       require 'nokogiri' unless defined? ::Nokogiri
-      Nokogiri doc
+      doc = Nokogiri doc
     end
+    super(doc)
   end
-end
 
+  protected
+
+  def parse_result(node, delegate)
+    if !delegate and node.respond_to? :inner_text
+      node.inner_text
+    else
+      super
+    end
+  end
+end
 
 ## specs
 
 if __FILE__ == $0
-  require '
+  require 'date'
+  require 'rspec/autorun'
   HTML = DATA.read
 
   class Article < Nibbler
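The heart of this refactoring is that the DSL moved out of `class Nibbler` into the `NibblerMethods` module, so any class can `extend` it and bring its own document semantics, which is exactly how `NibblerJSON` is rebuilt in the next file. Two smaller additions: delegate lambdas with arity 2 now receive the parser instance alongside the node, and `base_parser_class` makes rule blocks inherit from the nearest top-level scraper. A sketch of extending the module directly; `HashDoc` and `HashScraper` are hypothetical:

~~~ ruby
require 'nibbler'

# A scraper over plain Ruby hashes: the "document" only needs at/search.
class HashDoc
  def initialize(hash) @hash = hash end
  def at(key) @hash[key] end
  def search(key) Array(@hash[key]) end
end

class HashScraper
  extend NibblerMethods  # pulls in InstanceMethods via self.extended

  element 'name' => :name
  # arity-2 delegate: receives the node and the parser instance
  element 'age' => :age, :with => lambda { |node, parser| Integer(node) }
end

HashScraper.parse(HashDoc.new('name' => 'Ana', 'age' => '7')).age  #=> 7
~~~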
data/lib/nibbler/json.rb
CHANGED
@@ -1,29 +1,190 @@
 require 'nibbler'
+require 'strscan'
 
 # a wrapper for JSON data that provides `at` and `search`
 class Nibbler::JsonDocument
   attr_reader :data
-
-  def initialize(obj)
-    @data =
+
+  def initialize(obj, root = nil)
+    @data = obj.respond_to?(:to_str) ? JSON.parse(obj) : obj
+    @root = root
   end
-
-  def
-
+
+  def root
+    @root || data
   end
-
+
   def search(selector)
-
+    if selector !~ /[^\w-]/
+      found = Array === data ? data : data[selector]
+      found = [] if found.nil?
+      found = [found] unless Array === found
+    else
+      found = scan_selector selector
+    end
+    found
   end
-
+
   def at(selector)
-
+    search(selector).first
+  end
+
+  private
+
+  # stupid implementation of http://goessner.net/articles/JsonPath/
+  def scan_selector(selector)
+    s = StringScanner.new selector
+    found = s.scan(/\$/) ? root : data
+    found = [found] unless Array === found
+
+    while prop = s.scan(/\.\.?[\w-]+/)
+      prop.sub!(/\.\.?/, '')
+      found = if $&.size == 2
+        search_recursive(prop, found).compact
+      else
+        found.flatten.map {|i| i[prop] if Hash === i and i.key? prop }.compact
+      end
+
+      if s.scan(/\[/)
+        if range = s.scan(/[\d:]+/)
+          start, till, = range.split(':', 2)
+          start = start.to_i
+          idx = !till ? start : till.empty?? start..-1 : start...(till.to_i)
+          found.map! {|i| i[idx] if Array === i }
+          found.compact!
+        elsif s.scan(/\?/)
+          expr = s.scan_until(/\)/) or raise
+          expr.gsub!('@', 'self')
+          found.flatten!
+          found.reject! {|i| !(i.instance_eval expr rescue nil) }
+          found.compact!
+        end
+        s.scan(/\]/) or raise
+      end
+      break if found.empty?
+    end
+
+    found.flatten!
+    found
+  end
+
+  def search_recursive(prop, items, found = [])
+    items.map { |item|
+      case item
+      when Hash
+        found << item[prop] if item.key? prop
+        search_recursive(prop, item.values, found)
+      when Array
+        search_recursive(prop, item, found)
+      end
+    }
+    found
   end
 end
 
 # a scraper that works with JsonDocument
-class NibblerJSON
-
-
+class NibblerJSON
+  extend NibblerMethods
+
+  def self.parse(data, parent = nil)
+    new(data, parent).parse
+  end
+
+  def initialize(doc, parent = nil)
+    doc = Nibbler::JsonDocument.new(doc, parent && parent.doc.root) unless doc.respond_to? :search
+    super(doc)
   end
 end
+
+if __FILE__ == $0
+  require 'json'
+  require 'forwardable'
+  require 'minitest/spec'
+  require 'minitest/autorun'
+
+  describe Nibbler::JsonDocument do
+    DOC = Nibbler::JsonDocument.new DATA.read
+
+    extend Forwardable
+    def_delegators :DOC, :at, :search
+
+    it "fetches unknown key" do
+      at('doesnotexist').must_be_nil
+    end
+
+    it "fetches existing key" do
+      at('title').must_equal "Toy Story 3"
+    end
+
+    it "fetches selector" do
+      at('.year').must_equal 2010
+    end
+
+    it "fetches deep selector" do
+      at('.release_dates.dvd').must_equal "2010-11-02"
+    end
+
+    it "fetches first item of array" do
+      at('.genres').must_equal "Animation"
+    end
+
+    it "fetches array" do
+      search('.genres').must_equal [ "Animation", "Kids & Family", "Comedy" ]
+    end
+
+    it "extracts subset of array" do
+      search('.genres[:2]').must_equal [ "Animation", "Kids & Family" ]
+      search('.genres[1:3]').must_equal [ "Kids & Family", "Comedy" ]
+      search('.genres[2:]').must_equal [ "Comedy" ]
+    end
+
+    it "searches recursively" do
+      search('..characters').must_equal ["Woody", "Moody", "Buzz Lightyear"]
+    end
+
+    it "respects array index" do
+      search('..characters[0]').must_equal ["Woody", "Buzz Lightyear"]
+    end
+
+    it "respects conditions" do
+      search('.abridged_cast[?(@["name"] =~ /tom/i)].characters').must_equal ["Woody", "Moody"]
+    end
+  end
+end
+
+__END__
+{
+  "title": "Toy Story 3",
+  "year": 2010,
+  "genres": [ "Animation", "Kids & Family", "Comedy" ],
+  "runtime": 103,
+  "release_dates": {
+    "theater": "2010-06-18",
+    "dvd": "2010-11-02"
+  },
+  "ratings": {
+    "critics_rating": "Certified Fresh",
+    "critics_score": 99,
+    "audience_rating": "Upright",
+    "audience_score": 91
+  },
+  "posters": {
+    "thumbnail": "http://content6.flixster.com/movie/11/13/43/11134356_mob.jpg",
+    "profile": "http://content6.flixster.com/movie/11/13/43/11134356_pro.jpg",
+    "detailed": "http://content6.flixster.com/movie/11/13/43/11134356_det.jpg",
+    "original": "http://content6.flixster.com/movie/11/13/43/11134356_ori.jpg"
+  },
+  "abridged_cast": [
+    { "name": "Tom Hanks",
+      "characters": [ "Woody", "Moody" ] },
+    { "name": "Tim Allen",
+      "characters": [ "Buzz Lightyear" ] }
+  ],
+  "abridged_directors": [ {"name": "Lee Unkrich"} ],
+  "studio": "Walt Disney Pictures",
+  "alternate_ids": { "imdb": "0435761" },
+  "links": {
+    "self": "http://api.rottentomatoes.com/api/public/v1.0/movies/770672122.json",
+    "alternate": "http://www.rottentomatoes.com/m/toy_story_3/"
+  }
+}
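The bulk of the new code is `scan_selector`, a deliberately small JSONPath subset: `.key` walks down one level, `..key` searches recursively, `[n]`/`[a:b]` index or slice arrays, and `[?(expr)]` filters by evaluating the expression against each item. The specs above pin the behavior; as a quick reference, against the bundled Toy Story 3 document (`json_string` standing in for the payload after `__END__`):

~~~ ruby
doc = Nibbler::JsonDocument.new(json_string)

doc.at('.ratings.critics_score')  #=> 99
doc.search('.genres[1:3]')        #=> ["Kids & Family", "Comedy"]
doc.search('..characters[0]')     #=> ["Woody", "Buzz Lightyear"]
doc.search('.abridged_cast[?(@["name"] =~ /tom/i)].characters')
                                  #=> ["Woody", "Moody"]
~~~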
metadata
CHANGED
@@ -1,33 +1,23 @@
---- !ruby/object:Gem::Specification
+--- !ruby/object:Gem::Specification
 name: nibbler
-version: !ruby/object:Gem::Version
-
+version: !ruby/object:Gem::Version
+  version: 1.3.0
   prerelease:
-  segments:
-  - 1
-  - 2
-  - 1
-  version: 1.2.1
 platform: ruby
-authors:
--
+authors:
+- Mislav Marohnić
 autorequire:
 bindir: bin
 cert_chain: []
-
-date: 2011-01-15 00:00:00 +01:00
-default_executable:
+date: 2012-01-17 00:00:00.000000000 Z
 dependencies: []
-
-
+description: Nibbler is a super simple and powerful declarative generic scraper written
+  in under 70 lines of code.
 email: mislav.marohnic@gmail.com
 executables: []
-
 extensions: []
-
 extra_rdoc_files: []
-
-files:
+files:
 - Rakefile
 - lib/nibbler/json.rb
 - lib/nibbler.rb
@@ -36,39 +26,29 @@ files:
 - examples/twitter.rb
 - README.md
 - LICENSE
-
-homepage: http://github.com/mislav/nibbler
+homepage: https://github.com/mislav/nibbler
 licenses: []
-
 post_install_message:
 rdoc_options: []
-
-require_paths:
+require_paths:
 - lib
-required_ruby_version: !ruby/object:Gem::Requirement
+required_ruby_version: !ruby/object:Gem::Requirement
   none: false
-  requirements:
-  - -
-    - !ruby/object:Gem::Version
-
-
-      - 0
-      version: "0"
-required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
   none: false
-  requirements:
-  - -
-    - !ruby/object:Gem::Version
-
-    segments:
-    - 0
-    version: "0"
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
 requirements: []
-
 rubyforge_project:
-rubygems_version: 1.
+rubygems_version: 1.8.12
 signing_key:
 specification_version: 3
 summary: A cute HTML scraper / data extraction tool
 test_files: []
-
+has_rdoc: