nibbler 1.2.1 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +69 -27
- data/Rakefile +8 -4
- data/examples/delicious.rb +16 -18
- data/examples/tweetburner.rb +18 -9
- data/examples/twitter.rb +2 -2
- data/lib/nibbler.rb +97 -70
- data/lib/nibbler/json.rb +174 -13
- metadata +23 -43
data/README.md
CHANGED
@@ -1,32 +1,71 @@
 Nibbler
 =======
 
-*Nibbler* is a …
-… (25 lines)
+*Nibbler* is a small little tool (~100 LOC) that helps you map data structures to objects that you define.
+
+It can be used for HTML screen scraping:
+
+~~~ ruby
+require 'nibbler'
+require 'open-uri'
+
+class BlogScraper < Nibbler
+  element :title
+
+  elements 'div.hentry' => :articles do
+    element 'h2' => :title
+    element 'a/@href' => :url
+  end
+end
+
+blog = BlogScraper.parse open('http://example.com')
+
+blog.title
+#=> "My blog title"
+
+blog.articles.first.title
+#=> "First article title"
+
+blog.articles.first.url
+#=> "http://example.com/article"
+~~~
+
+For mapping XML API payloads:
+
+~~~ ruby
+class Movie < Nibbler
+  element './title/@regular' => :name
+  element './box_art/@large' => :poster_large
+  element 'release_year' => :year, :with => lambda { |node| node.text.to_i }
+  element './/link[@title="web page"]/@href' => :url
+end
+
+response = Net::HTTP.get_response URI('http://example.com/movie.xml')
+movie = Movie.parse response.body
+
+movie.name #=> "Toy Story 3"
+movie.year #=> 2010
+~~~
+
+Or even for JSON:
+
+~~~ ruby
+require 'json'
+require 'nibbler/json'
+
+class Movie < NibblerJSON
+  element :title
+  element :year
+  elements :genres
+  # JSONPath selectors:
+  element '.links.alternate' => :url
+  element '.ratings.critics_score' => :critics_score
+end
+
+movie = Movie.parse json_string
+~~~
+
+There are sample scripts in the "examples/" directory:
 
     ruby -Ilib -rubygems examples/delicious.rb
     ruby -Ilib -rubygems examples/tweetburner.rb > output.csv
@@ -36,7 +75,10 @@ There are sample scripts in the "examples/" directory; run them with:
 Requirements
 ------------
 
-*None*. Well, [Nokogiri][] is a requirement if you pass in HTML …
+*None*. Well, [Nokogiri][] is a requirement if you pass in an HTML string for parsing, like in the example above. Otherwise you can initialize the scraper with an
+Hpricot document or anything else that implements `at(selector)` and `search(selector)` methods.
+
+NibblerJSON needs a JSON parser if string content is passed, so "json" library should be installed on Ruby 1.8.
 
 
 [wiki]: http://wiki.github.com/mislav/nibbler

data/Rakefile
CHANGED
@@ -2,7 +2,11 @@ task :default => [:loc, :spec]
 
 desc %(Run specs)
 task :spec do
-  …
+  tests = []
+  tests << %(ruby -Ilib -rubygems lib/nibbler.rb --color)
+  tests << %(ruby -Ilib -rubygems lib/nibbler/json.rb)
+
+  exit(1) if tests.any? {|cmd| !sh(cmd) }
 end
 
 desc %(Count lines of code in implementation)
@@ -12,13 +16,13 @@ task :loc do
 
   file.each_line do |line|
     case line
-    when /^class\b/ …
+    when /^(class|module)\b/ then counting = true
    when /^\s*(#|$)/ then next
-    when /^end\b/ …
+    when /^end\b/ then break
    end
    loc += 1 if counting
  end

  puts "#{loc} lines of code"
 end
-end
+end

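
The reworked `:loc` task starts counting at the first top-level `class` or `module` line, skips comments and blank lines, and stops at the first top-level `end`. A standalone sketch of that counting loop, fed a made-up source string instead of a file (the string is hypothetical, not part of the gem):

~~~ ruby
# hypothetical source standing in for lib/nibbler.rb
src = "# comment\nclass Foo\n  def bar\n  end\nend\nputs 'ignored'\n"

loc = 0
counting = false
src.each_line do |line|
  case line
  when /^(class|module)\b/ then counting = true
  when /^\s*(#|$)/ then next
  when /^end\b/ then break
  end
  loc += 1 if counting
end
puts loc  #=> 3 (the class line, the def, and its end)
~~~
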
data/examples/delicious.rb
CHANGED
@@ -5,33 +5,31 @@
 
 require 'nibbler'
 require 'open-uri'
-require 'date'
 
 # extracts data from a single bookmark
 class Bookmark < Nibbler
-  element '…
-  element '.…
-  … (4 lines)
+  element '.body .title' => :title
+  element '.note' => :description
+
+  element '.sub span' => :url
+
   # tags are plural
-  elements '…
-  …
-  # …
-  element '…
-    Date.strptime(span.inner_text.strip, '%d %b %y')
-  }
+  elements '.tag .name' => :tags
+
+  # extract timestamp from HTML attribute
+  element './@date' => :date, :with => lambda { |timestamp| Time.at timestamp.text.to_i }
 end
 
 # finds all bookmarks on the page
 class Delicious < Nibbler
-  elements '…
+  elements '.content .linkList .link' => :bookmarks, :with => Bookmark
 end
 
 mislav = Delicious.parse open('http://delicious.com/mislav/ruby')
-bookmark = mislav.bookmarks.first
 
-… (3 lines)
+mislav.bookmarks[0,3].each do |bookmark|
+  puts bookmark.title #=> "Some title"
+  p bookmark.tags #=> ['foo', 'bar', ...]
+  puts bookmark.date #=> <Date>
+  puts
+end

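
In the updated Bookmark, the date no longer comes from a span parsed with `Date.strptime`; the `./@date` XPath selector yields an attribute node, and the `:with` lambda converts its text to a `Time`. A sketch against a made-up HTML fragment (the markup and values are hypothetical; assumes the nibbler and nokogiri gems are installed):

~~~ ruby
require 'nibbler'
require 'nokogiri'

class Bookmark < Nibbler  # trimmed to the two rules of interest
  element '.body .title' => :title
  element './@date' => :date, :with => lambda { |timestamp| Time.at timestamp.text.to_i }
end

html = Nokogiri::HTML(<<-HTML)
  <div class="link" date="1295046000">
    <div class="body"><span class="title">Some title</span></div>
  </div>
HTML

bookmark = Bookmark.parse html.at('.link')
bookmark.title #=> "Some title"
bookmark.date  #=> 2011-01-14 23:00:00 UTC (a Time; zone-dependent)
~~~
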
data/examples/tweetburner.rb
CHANGED
@@ -1,3 +1,5 @@
+# encoding: utf-8
+#
 ## Tweetburner.com archive dump
 #
 # I needed to dump my Tweetburner archive to CSV
@@ -14,8 +16,15 @@ module Tweetburner
   SITE = URI('http://tweetburner.com')
 
   class Scraper < ::Nibbler
-    …
-    …
+    def initialize url
+      doc = get_document url
+      super doc
+    end
+
+    private
+
+    # open web pages with UTF-8 encoding
+    def get_document(url)
       URI === url ? Nokogiri::HTML::Document.parse(open(url), url.to_s, 'UTF-8') : url
     rescue OpenURI::HTTPError
       $stderr.puts "ERROR opening #{url}"
@@ -31,7 +40,7 @@ module Tweetburner
     element '.col-tweet-text' => :text, :with => lambda { |node|
       node.text.sub(/\s+– .+?$/, '')
     }
-    element '.col-clicks' => :clicks
+    element '.col-clicks' => :clicks, :with => lambda { |node| node.inner_text.to_i }
     element '.col-created-at' => :created_at, :with => lambda { |node| DateTime.parse node.text }
 
     def stats
@@ -58,18 +67,18 @@ module Tweetburner
     def parse
       super
       if next_page_url
-        @doc = …
+        @doc = get_document(URI(next_page_url))
         self.parse
-      else
-        self
       end
+      self
     end
 
     def to_csv(io = STDOUT)
       io.sync = true if io == STDOUT
-      …
-      …
-      …
+      CSV(io) do |csv|
+        links.each do |link|
+          csv << [link.text, link.clicks, link.created_at, link.stats.destination]
+        end
       end
     end
   end

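
The rewritten `to_csv` streams rows through `CSV()`, the standard library helper that wraps an IO and yields a writer, while the `parse` override keeps re-fetching `next_page_url` until the archive runs out and always returns `self`. A minimal sketch of just the CSV pattern (header and row values are made up):

~~~ ruby
require 'csv'

# CSV() wraps any IO; each << writes one properly quoted row
CSV($stdout) do |csv|
  csv << ['text', 'clicks', 'created_at', 'destination']
  csv << ['Check this out', 42, '2010-06-18', 'http://example.com/']
end
~~~
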
data/examples/twitter.rb
CHANGED
@@ -11,11 +11,11 @@ require 'time'
 
 # now here's the real deal
 class Twitter < NibblerJSON
-  elements :tweets
+  elements :tweets do
     element :created_at, :with => lambda { |time| Time.parse(time) }
     element :text
     element :id
-    element 'user' => :author
+    element 'user' => :author do
       element 'name' => :full_name
       element 'screen_name' => :username
     end

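
Switching `elements :tweets` and `element 'user' => :author` to the block form makes nibbler build an anonymous NibblerJSON subclass for each nested rule set, so tweets and their authors parse into nested objects. A usage sketch with the class above (the JSON payload is invented):

~~~ ruby
require 'json'
require 'time'
require 'nibbler/json'

class Twitter < NibblerJSON
  elements :tweets do
    element :created_at, :with => lambda { |time| Time.parse(time) }
    element :text
    element :id
    element 'user' => :author do
      element 'name' => :full_name
      element 'screen_name' => :username
    end
  end
end

json = '{"tweets": [{"created_at": "Tue Jan 17 12:00:00 +0000 2012",
                     "text": "Hello world", "id": 1,
                     "user": {"name": "Mislav", "screen_name": "mislav"}}]}'

tweet = Twitter.parse(json).tweets.first
tweet.text            #=> "Hello world"
tweet.author.username #=> "mislav"
~~~
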
data/lib/nibbler.rb
CHANGED
@@ -1,110 +1,137 @@
-# …
-… (3 lines)
+# DSL for defining data extraction rules from an abstract document object
+module NibblerMethods
+  def self.extended(base)
+    base.send(:include, InstanceMethods) if base.is_a? Class
+  end
+
   # Declare a singular scraping rule
-  def …
+  def element(*args, &block)
     selector, name, delegate = parse_rule_declaration(*args, &block)
     rules[name] = [selector, delegate]
     attr_accessor name
     name
   end
-
+
   # Declare a plural scraping rule
-  def …
+  def elements(*args, &block)
     name = element(*args, &block)
     rules[name] << true
   end
-
-  # Process data by creating a new scraper
-  def self.parse(data) new(data).parse end
-
-  # Initialize the parser with raw data or a document
-  def initialize(data)
-    @doc = self.class.convert_document(data)
-    # initialize plural properties
-    self.class.rules.each { |name, (s, k, plural)| send("#{name}=", []) if plural }
-  end
-
-  # Parse the document and save values returned by selectors
-  def parse
-    self.class.rules.each do |target, (selector, delegate, plural)|
-      if plural
-        send(target).concat @doc.search(selector).map { |i| parse_result(i, delegate) }
-      else
-        send("#{target}=", parse_result(@doc.at(selector), delegate))
-      end
-    end
-    self
-  end
-
-  # Dump the extracted data into a hash with symbolized keys
-  def to_hash
-    converter = lambda { |obj| obj.respond_to?(:to_hash) ? obj.to_hash : obj }
-    self.class.rules.keys.inject({}) do |hash, name|
-      value = send(name)
-      hash[name.to_sym] = Array === value ? value.map(&converter) : converter[value]
-      hash
-    end
-  end
-
-  protected
-
-  # `delegate` is optional, but should respond to `call` or `parse`
-  def parse_result(node, delegate)
-    if delegate
-      delegate.respond_to?(:call) ? delegate.call(node) : delegate.parse(node)
-    elsif node.respond_to? :inner_text
-      node.inner_text
-    else
-      node
-    end unless node.nil?
-  end
-
-  private
-
+
   # Parsing rules declared with `element` or `elements`
-  def …
+  def rules
     @rules ||= {}
   end
-
+
+  # Process data by creating a new instance
+  def parse(doc) new(doc).parse end
+
+  private
+
   # Make subclasses inherit the parsing rules
-  def …
+  def inherited(subclass)
+    super
     subclass.rules.update self.rules
   end
-
+
   # Rule declaration forms:
-  #
+  #
   #   { 'selector' => :property, :with => delegate }
   #   #=> ['selector', :property, delegate]
-  #
+  #
   #   :title
   #   #=> ['title', :title, nil]
-  def …
+  def parse_rule_declaration(*args, &block)
     options, name = Hash === args.last ? args.pop : {}, args.first
     delegate = options.delete(:with)
     selector, property = name ? [name.to_s, name.to_sym] : options.to_a.flatten
     raise ArgumentError, "invalid rule declaration: #{args.inspect}" unless property
     # eval block in context of a new scraper subclass
-    delegate = Class.new(delegate || …
+    delegate = Class.new(delegate || base_parser_class, &block) if block_given?
     return selector, property, delegate
   end
-
+
+  def base_parser_class
+    klass = self
+    klass = klass.superclass until klass.superclass == Object
+    klass
+  end
+
+  module InstanceMethods
+    attr_reader :doc
+
+    # Initialize the parser with a document
+    def initialize(doc)
+      @doc = doc
+      # initialize plural properties
+      self.class.rules.each { |name, (s, k, plural)| send("#{name}=", []) if plural }
+    end
+
+    # Parse the document and save values returned by selectors
+    def parse
+      self.class.rules.each do |target, (selector, delegate, plural)|
+        if plural
+          send(target).concat @doc.search(selector).map { |i| parse_result(i, delegate) }
+        else
+          send("#{target}=", parse_result(@doc.at(selector), delegate))
+        end
+      end
+      self
+    end
+
+    # Dump the extracted data into a hash with symbolized keys
+    def to_hash
+      converter = lambda { |obj| obj.respond_to?(:to_hash) ? obj.to_hash : obj }
+      self.class.rules.keys.inject({}) do |hash, name|
+        value = send(name)
+        hash[name.to_sym] = Array === value ? value.map(&converter) : converter[value]
+        hash
+      end
+    end
+
+    protected
+
+    # `delegate` is optional, but should respond to `call` or `parse`
+    def parse_result(node, delegate)
+      if delegate
+        method = delegate.is_a?(Proc) ? delegate : delegate.method(delegate.respond_to?(:call) ? :call : :parse)
+        method.arity == 1 ? method[node] : method[node, self]
+      else
+        node
+      end unless node.nil?
+    end
+  end
+end
+
+# An HTML/XML scraper
+class Nibbler
+  extend NibblerMethods
+
   # Parse data with Nokogiri unless it's already an acceptable document
-  def …
-  …
-  else
+  def initialize(doc)
+    unless doc.respond_to?(:at) and doc.respond_to?(:search)
       require 'nokogiri' unless defined? ::Nokogiri
-      Nokogiri doc
+      doc = Nokogiri doc
     end
+    super(doc)
   end
-end
 
+  protected
+
+  def parse_result(node, delegate)
+    if !delegate and node.respond_to? :inner_text
+      node.inner_text
+    else
+      super
+    end
+  end
+end
 
 ## specs
 
 if __FILE__ == $0
-  require '…
+  require 'date'
+  require 'rspec/autorun'
   HTML = DATA.read
 
   class Article < Nibbler

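
This is the core of the release: the DSL moves out of the Nibbler class into the NibblerMethods module, so any class gains the scraping behavior by extending it, as long as the document it wraps responds to `at` and `search`. A sketch with a hypothetical hash-backed document type (HashDoc and Person are invented for illustration, not part of the gem):

~~~ ruby
require 'nibbler'

# hypothetical document: `at` fetches one value, `search` a list
class HashDoc
  def initialize(data) @data = data end
  def at(key) @data[key] end
  def search(key) Array(@data[key]) end
end

class Person
  extend NibblerMethods  # brings in element/elements plus the parse machinery
  element 'name' => :name
  elements 'emails' => :emails
end

person = Person.parse HashDoc.new('name' => 'Ana', 'emails' => ['a@example.org'])
person.name    #=> "Ana"
person.emails  #=> ["a@example.org"]
person.to_hash #=> {:name=>"Ana", :emails=>["a@example.org"]}
~~~

Note also the reworked `parse_result`: a two-argument delegate now receives the scraper itself alongside the node, which is what lets NibblerJSON pass parent context to nested scrapers.
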
data/lib/nibbler/json.rb
CHANGED
@@ -1,29 +1,190 @@
 require 'nibbler'
+require 'strscan'
 
 # a wrapper for JSON data that provides `at` and `search`
 class Nibbler::JsonDocument
   attr_reader :data
-
-  def initialize(obj)
-    @data = …
+
+  def initialize(obj, root = nil)
+    @data = obj.respond_to?(:to_str) ? JSON.parse(obj) : obj
+    @root = root
   end
-
-  def …
-    …
+
+  def root
+    @root || data
   end
-
+
   def search(selector)
-    …
+    if selector !~ /[^\w-]/
+      found = Array === data ? data : data[selector]
+      found = [] if found.nil?
+      found = [found] unless Array === found
+    else
+      found = scan_selector selector
+    end
+    found
   end
-
   def at(selector)
-    …
+    search(selector).first
+  end
+
+  private
+
+  # stupid implementation of http://goessner.net/articles/JsonPath/
+  def scan_selector(selector)
+    s = StringScanner.new selector
+    found = s.scan(/\$/) ? root : data
+    found = [found] unless Array === found
+
+    while prop = s.scan(/\.\.?[\w-]+/)
+      prop.sub!(/\.\.?/, '')
+      found = if $&.size == 2
+        search_recursive(prop, found).compact
+      else
+        found.flatten.map {|i| i[prop] if Hash === i and i.key? prop }.compact
+      end
+
+      if s.scan(/\[/)
+        if range = s.scan(/[\d:]+/)
+          start, till, = range.split(':', 2)
+          start = start.to_i
+          idx = !till ? start : till.empty?? start..-1 : start...(till.to_i)
+          found.map! {|i| i[idx] if Array === i }
+          found.compact!
+        elsif s.scan(/\?/)
+          expr = s.scan_until(/\)/) or raise
+          expr.gsub!('@', 'self')
+          found.flatten!
+          found.reject! {|i| !(i.instance_eval expr rescue nil) }
+          found.compact!
+        end
+        s.scan(/\]/) or raise
+      end
+      break if found.empty?
+    end
+
+    found.flatten!
+    found
+  end
+
+  def search_recursive(prop, items, found = [])
+    items.map { |item|
+      case item
+      when Hash
+        found << item[prop] if item.key? prop
+        search_recursive(prop, item.values, found)
+      when Array
+        search_recursive(prop, item, found)
+      end
+    }
+    found
   end
 end
 
 # a scraper that works with JsonDocument
-class NibblerJSON
-  …
-  …
+class NibblerJSON
+  extend NibblerMethods
+
+  def self.parse(data, parent = nil)
+    new(data, parent).parse
+  end
+
+  def initialize(doc, parent = nil)
+    doc = Nibbler::JsonDocument.new(doc, parent && parent.doc.root) unless doc.respond_to? :search
+    super(doc)
   end
 end
+
+if __FILE__ == $0
+  require 'json'
+  require 'forwardable'
+  require 'minitest/spec'
+  require 'minitest/autorun'
+
+  describe Nibbler::JsonDocument do
+    DOC = Nibbler::JsonDocument.new DATA.read
+
+    extend Forwardable
+    def_delegators :DOC, :at, :search
+
+    it "fetches unknown key" do
+      at('doesnotexist').must_be_nil
+    end
+
+    it "fetches existing key" do
+      at('title').must_equal "Toy Story 3"
+    end
+
+    it "fetches selector" do
+      at('.year').must_equal 2010
+    end
+
+    it "fetches deep selector" do
+      at('.release_dates.dvd').must_equal "2010-11-02"
+    end
+
+    it "fetches first item of array" do
+      at('.genres').must_equal "Animation"
+    end
+
+    it "fetches array" do
+      search('.genres').must_equal [ "Animation", "Kids & Family", "Comedy" ]
+    end
+
+    it "extracts subset of array" do
+      search('.genres[:2]').must_equal [ "Animation", "Kids & Family" ]
+      search('.genres[1:3]').must_equal [ "Kids & Family", "Comedy" ]
+      search('.genres[2:]').must_equal [ "Comedy" ]
+    end
+
+    it "searches recursively" do
+      search('..characters').must_equal ["Woody", "Moody", "Buzz Lightyear"]
+    end
+
+    it "respects array index" do
+      search('..characters[0]').must_equal ["Woody", "Buzz Lightyear"]
+    end
+
+    it "respects conditions" do
+      search('.abridged_cast[?(@["name"] =~ /tom/i)].characters').must_equal ["Woody", "Moody"]
+    end
+  end
+end
+
+__END__
+{
+  "title": "Toy Story 3",
+  "year": 2010,
+  "genres": [ "Animation", "Kids & Family", "Comedy" ],
+  "runtime": 103,
+  "release_dates": {
+    "theater": "2010-06-18",
+    "dvd": "2010-11-02"
+  },
+  "ratings": {
+    "critics_rating": "Certified Fresh",
+    "critics_score": 99,
+    "audience_rating": "Upright",
+    "audience_score": 91
+  },
+  "posters": {
+    "thumbnail": "http://content6.flixster.com/movie/11/13/43/11134356_mob.jpg",
+    "profile": "http://content6.flixster.com/movie/11/13/43/11134356_pro.jpg",
+    "detailed": "http://content6.flixster.com/movie/11/13/43/11134356_det.jpg",
+    "original": "http://content6.flixster.com/movie/11/13/43/11134356_ori.jpg"
+  },
+  "abridged_cast": [
+    { "name": "Tom Hanks",
+      "characters": [ "Woody", "Moody" ] },
+    { "name": "Tim Allen",
+      "characters": [ "Buzz Lightyear" ] }
+  ],
+  "abridged_directors": [ {"name": "Lee Unkrich"} ],
+  "studio": "Walt Disney Pictures",
+  "alternate_ids": { "imdb": "0435761" },
+  "links": {
+    "self": "http://api.rottentomatoes.com/api/public/v1.0/movies/770672122.json",
+    "alternate": "http://www.rottentomatoes.com/m/toy_story_3/"
+  }
+}

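
JsonDocument now implements a small subset of JSONPath — dotted paths, `..` recursive descent, array slices, and `?()` filters — exactly as exercised by the specs above. A quick usage sketch (payload abbreviated from the spec fixture):

~~~ ruby
require 'json'
require 'nibbler/json'

doc = Nibbler::JsonDocument.new <<-JSON
  { "title": "Toy Story 3",
    "ratings": { "critics_score": 99 },
    "genres": ["Animation", "Kids & Family", "Comedy"] }
JSON

doc.at('title')                  #=> "Toy Story 3"
doc.at('.ratings.critics_score') #=> 99
doc.search('.genres[1:3]')       #=> ["Kids & Family", "Comedy"]
~~~
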
metadata
CHANGED
@@ -1,33 +1,23 @@
---- !ruby/object:Gem::Specification 
+--- !ruby/object:Gem::Specification
 name: nibbler
-version: !ruby/object:Gem::Version 
-  …
+version: !ruby/object:Gem::Version
+  version: 1.3.0
 prerelease: 
-  segments: 
-  - 1
-  - 2
-  - 1
-  version: 1.2.1
 platform: ruby
-authors: 
-- …
+authors:
+- Mislav Marohnić
 autorequire: 
 bindir: bin
 cert_chain: []
-
-date: 2011-01-15 00:00:00 +01:00
-default_executable: 
+date: 2012-01-17 00:00:00.000000000 Z
 dependencies: []
-
-…
+description: Nibbler is a super simple and powerful declarative generic scraper written
+  in under 70 lines of code.
 email: mislav.marohnic@gmail.com
 executables: []
-
 extensions: []
-
 extra_rdoc_files: []
-
-files: 
+files:
 - Rakefile
 - lib/nibbler/json.rb
 - lib/nibbler.rb
@@ -36,39 +26,29 @@ files:
 - examples/twitter.rb
 - README.md
 - LICENSE
-
-homepage: http://github.com/mislav/nibbler
+homepage: https://github.com/mislav/nibbler
 licenses: []
-
 post_install_message: 
 rdoc_options: []
-
-require_paths: 
+require_paths:
 - lib
-required_ruby_version: !ruby/object:Gem::Requirement 
+required_ruby_version: !ruby/object:Gem::Requirement
   none: false
-  requirements: 
-  - - …
-    - !ruby/object:Gem::Version 
-      …
-      …
-      - 0
-      version: "0"
-required_rubygems_version: !ruby/object:Gem::Requirement 
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
   none: false
-  requirements: 
-  - - …
-    - !ruby/object:Gem::Version 
-      …
-      segments: 
-      - 0
-      version: "0"
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
 requirements: []
-
 rubyforge_project: 
-rubygems_version: 1.…
+rubygems_version: 1.8.12
 signing_key: 
 specification_version: 3
 summary: A cute HTML scraper / data extraction tool
 test_files: []
-
+has_rdoc: 