nibbler 1.0

data/LICENSE ADDED
@@ -0,0 +1,18 @@
+ Copyright (c) 2009 Mislav Marohnić
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy of
+ this software and associated documentation files (the "Software"), to deal in
+ the Software without restriction, including without limitation the rights to
+ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+ the Software, and to permit persons to whom the Software is furnished to do so,
+ subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+ FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+ COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+ IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,43 @@
+ Nibbler
+ =======
+
+ *Nibbler* is a cute HTML screen-scraping tool.
+
+     require 'nibbler'
+     require 'open-uri'
+
+     class BlogScraper < Nibbler
+       element :title
+
+       elements 'div.hentry' => :articles do
+         element 'h2' => :title
+         element 'a/@href' => :url
+       end
+     end
+
+     blog = BlogScraper.parse open('http://example.com')
+
+     blog.title
+     #=> "My blog title"
+
+     blog.articles.first.title
+     #=> "First article title"
+
+     blog.articles.first.url
+     #=> "http://example.com/article"
+
+ There are sample scripts in the "examples/" directory; run them with:
+
+     ruby -Ilib -rubygems examples/delicious.rb
+     ruby -Ilib -rubygems examples/tweetburner.rb > output.csv
+
+ [See the wiki][wiki] for more on how to use *Nibbler*.
+
+ Requirements
+ ------------
+
+ *None*. Well, [Nokogiri][] is a requirement if you pass in HTML content that needs to be parsed, like in the example above. Otherwise you can initialize the scraper with an Hpricot document or anything else that implements `at(selector)` and `search(selector)` methods.
+
+
+ [wiki]: http://wiki.github.com/mislav/nibbler
+ [nokogiri]: http://nokogiri.rubyforge.org/nokogiri/
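
The "Requirements" note in the README relies on duck typing: any object that responds to `at(selector)` and `search(selector)` can stand in for a parsed Nokogiri document (the specs in lib/nibbler.rb below exercise exactly this with a `FakeHtmlParser`). A minimal sketch of that contract; the `DummyDocument` and `PageScraper` classes here are made up for illustration and are not part of the gem:

    require 'nibbler'

    # Stand-in document: because it responds to `at` and `search`,
    # Nibbler uses it as-is instead of handing it to Nokogiri.
    class DummyDocument
      def at(selector)
        'dummy title'
      end

      def search(selector)
        []
      end
    end

    class PageScraper < Nibbler
      element 'h1' => :title
      elements 'div.hentry' => :articles
    end

    page = PageScraper.parse(DummyDocument.new)
    page.title     #=> "dummy title"
    page.articles  #=> []
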
data/Rakefile ADDED
@@ -0,0 +1,24 @@
+ task :default => :spec
+
+ desc %(Run specs)
+ task :spec do
+   exec %(ruby -Ilib -rubygems lib/nibbler.rb --color)
+ end
+
+ desc %(Count lines of code in implementation)
+ task :loc do
+   File.open('lib/nibbler.rb') do |file|
+     loc, counting = 1, false
+
+     file.each_line do |line|
+       case line
+       when /^class\b/ then counting = true
+       when /^\s*(#|\Z)/ then next
+       when /^end\b/ then break
+       end
+       loc += 1 if counting
+     end
+
+     puts loc
+   end
+ end
data/examples/delicious.rb ADDED
@@ -0,0 +1,37 @@
+ ## Delicious bookmarks fetching
+ #
+ # Let's pretend that delicious.com doesn't have an API.
+ # This is a demonstration of the most common use-case.
+
+ require 'nibbler'
+ require 'open-uri'
+ require 'date'
+
+ # extracts data from a single bookmark
+ class Bookmark < Nibbler
+   element 'h4 a' => :title
+   element '.description' => :description
+
+   # extract attribute with xpath
+   element './/h4/a/@href' => :url
+
+   # tags are plural
+   elements 'ul.tag-chain .tagItem' => :tags
+
+   # dates are in form "22 OCT 09"
+   element '.dateGroup span' => :date, :with => lambda { |span|
+     Date.strptime(span.inner_text.strip, '%d %b %y')
+   }
+ end
+
+ # finds all bookmarks on the page
+ class Delicious < Nibbler
+   elements '#bookmarklist div.bookmark' => :bookmarks, :with => Bookmark
+ end
+
+ mislav = Delicious.parse open('http://delicious.com/mislav/ruby')
+ bookmark = mislav.bookmarks.first
+
+ puts bookmark.title  #=> "Some title"
+ p bookmark.tags      #=> ['foo', 'bar', ...]
+ puts bookmark.date   #=> <Date>
data/examples/tweetburner.rb ADDED
@@ -0,0 +1,78 @@
+ ## Tweetburner.com archive dump
+ #
+ # I needed to dump my Tweetburner archive to CSV
+ # http://tweetburner.com/users/mislav/archive
+
+ require 'nibbler'
+ require 'uri'
+ require 'open-uri'
+ require 'date'
+ require 'nokogiri'
+ require 'csv'
+
+ module Tweetburner
+   SITE = URI('http://tweetburner.com')
+
+   class Scraper < ::Nibbler
+     # add our behavior to convert_document; open web pages with UTF-8 encoding
+     def self.convert_document(url)
+       URI === url ? Nokogiri::HTML::Document.parse(open(url), url.to_s, 'UTF-8') : url
+     rescue OpenURI::HTTPError
+       $stderr.puts "ERROR opening #{url}"
+       Nokogiri('')
+     end
+   end
+
+   # a single link (table row on the archive page)
+   class Link < ::Nibbler
+     element './/a[starts-with(@href, "/links/")]/@href' => :stats_url, :with => lambda { |href|
+       SITE + href.text
+     }
+     element '.col-tweet-text' => :text, :with => lambda { |node|
+       node.text.sub(/\s+– .+?$/, '')
+     }
+     element '.col-clicks' => :clicks
+     element '.col-created-at' => :created_at, :with => lambda { |node| DateTime.parse node.text }
+
+     def stats
+       @stats ||= Stats.parse(stats_url)
+     end
+   end
+
+   # single link stats page parser
+   class Stats < Scraper
+     element '//*[@id="main-content"]/p/a/@href' => :destination
+   end
+
+   # parser for the paginated archive
+   class Archive < Scraper
+     def self.parse(username)
+       path = '/users/%s/archive' % username
+       super SITE + path
+     end
+
+     elements '//table//tr[position() > 1]' => :links, :with => Link
+     element '//*[@class="page-navigation"]//a[starts-with(text(), "Older")]/@href' => :next_page_url
+
+     # augment to recursively parse other pages
+     def parse
+       super
+       if next_page_url
+         @doc = self.class.convert_document(URI(next_page_url))
+         self.parse
+       else
+         self
+       end
+     end
+
+     def to_csv(io = STDOUT)
+       io.sync = true if io == STDOUT
+       csv = CSV::Writer.create io
+       links.each do |link|
+         csv << [link.text, link.clicks, link.created_at, link.stats.destination]
+       end
+     end
+   end
+ end
+
+ Tweetburner::Archive.parse('mislav').to_csv
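
Note how `Tweetburner::Scraper` overrides the `convert_document` class method, the hook that `Nibbler#initialize` runs on every input; that is what lets it accept plain `URI` objects. The same hook can adapt other input types. A hedged sketch, not part of the gem: the `FileScraper` class below is hypothetical and simply teaches a scraper to accept a path to a local HTML file.

    require 'nibbler'
    require 'nokogiri'

    # Hypothetical scraper that also accepts a local file path
    class FileScraper < Nibbler
      def self.convert_document(doc)
        if String === doc && File.exist?(doc)
          Nokogiri::HTML(File.read(doc))
        else
          super  # fall back to Nibbler's default handling
        end
      end
    end

    # FileScraper.parse('page.html') would then parse the file's markup.
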
data/examples/twitter.rb ADDED
@@ -0,0 +1,71 @@
+ ## JSON data extraction example
+ #
+ # This is an example of how we're not limited to Nokogiri and HTML screen-scraping.
+ # Here we use Nibbler to extract tweets from a Twitter API JSON response.
+ #
+ # Requirements: a JSON library (tested with "json" gem)
+
+ require 'nibbler/json'
+ require 'json'
+ require 'time'
+
+ # now here's the real deal
+ class Twitter < NibblerJSON
+   elements :tweets, :with => NibblerJSON do
+     element :created_at, :with => lambda { |time| Time.parse(time) }
+     element :text
+     element :id
+     element 'user' => :author, :with => NibblerJSON do
+       element 'name' => :full_name
+       element 'screen_name' => :username
+     end
+   end
+ end
+
+ twitter = Twitter.parse(DATA.read)
+
+ twitter.tweets.each do |tweet|
+   puts "@%s: %s [%s]" % [tweet.author.username, tweet.text, tweet.created_at]
+   puts
+ end
+
+
+ __END__
+ [{"created_at": "Thu Oct 22 23:50:02 +0000 2009",
+   "text":
+     "\"It is OK being wrong.\" \"I don't have any experience in that field.\"",
+   "id": 5083117521,
+   "user":
+     {"name": "Ryan Bigg",
+      "created_at": "Thu Apr 24 03:23:53 +0000 2008",
+      "location": "iPhone: -27.471957,152.999225",
+      "profile_image_url":
+        "http://a1.twimg.com/profile_images/287965508/Photo_47_normal.jpg",
+      "url": "http://www.frozenplague.net",
+      "id": 14506011,
+      "followers_count": 432,
+      "description": "I work at Mocra and code Ruby on Rails",
+      "statuses_count": 7659,
+      "friends_count": 211,
+      "screen_name": "ryanbigg"},
+   "source": "<a href=\"http://www.atebits.com/\" rel=\"nofollow\">Tweetie</a>"},
+  {"created_at": "Mon Oct 19 23:43:50 +0000 2009",
+   "text":
+     "Programming is the art of forcing the exceptions of the real world into the absolutes of a computer.",
+   "id": 5004137490,
+   "user":
+     {"name": "Ryan Bates",
+      "created_at": "Fri Mar 28 19:10:25 +0000 2008",
+      "location": "Southern Oregon",
+      "profile_image_url":
+        "http://a1.twimg.com/profile_images/52189024/ryan_bates_cropped_normal.jpg",
+      "url": "http://railscasts.com",
+      "id": 14246143,
+      "followers_count": 3225,
+      "description": "Producer of Railscasts - Free Ruby on Rails Screencasts",
+      "profile_background_image_url":
+        "http://s.twimg.com/a/1255724203/images/themes/theme2/bg.gif",
+      "statuses_count": 2066,
+      "friends_count": 225,
+      "screen_name": "rbates"}
+   }]
data/lib/nibbler.rb ADDED
@@ -0,0 +1,291 @@
+ ## A minimalistic, declarative HTML scraper
+
+ class Nibbler
+   attr_reader :doc
+
+   # Accepts string, open file, or Nokogiri-like document
+   def initialize(doc)
+     @doc = self.class.convert_document(doc)
+     initialize_plural_accessors
+   end
+
+   # Initialize a new scraper and process data
+   def self.parse(html)
+     new(html).parse
+   end
+
+   # Specify a new singular scraping rule
+   def self.element(*args, &block)
+     selector, name, delegate = parse_rule_declaration(*args, &block)
+     rules[name] = [selector, delegate]
+     attr_accessor name
+     name
+   end
+
+   # Specify a new plural scraping rule
+   def self.elements(*args, &block)
+     name = element(*args, &block)
+     rules[name] << true
+   end
+
+   # Let it do its thing!
+   def parse
+     self.class.rules.each do |target, (selector, delegate, plural)|
+       if plural
+         @doc.search(selector).each do |node|
+           send(target) << parse_result(node, delegate)
+         end
+       else
+         send("#{target}=", parse_result(@doc.at(selector), delegate))
+       end
+     end
+     self
+   end
+
+   protected
+
+   # `delegate` is optional, but should respond to `call` or `parse`
+   def parse_result(node, delegate)
+     if delegate
+       delegate.respond_to?(:call) ? delegate.call(node) : delegate.parse(node)
+     elsif node.respond_to? :inner_text
+       node.inner_text
+     else
+       node.to_s
+     end unless node.nil?
+   end
+
+   private
+
+   def self.rules
+     @rules ||= {}
+   end
+
+   def self.inherited(subclass)
+     subclass.rules.update self.rules
+   end
+
+   # Rule declaration is in Hash or single argument form:
+   #
+   #   { '//some/selector' => :name, :with => delegate }
+   #   #=> ['//some/selector', :name, delegate]
+   #
+   #   :title
+   #   #=> ['title', :title, nil]
+   def self.parse_rule_declaration(*args, &block)
+     options, name = Hash === args.last ? args.pop : {}, args.first
+     delegate = options.delete(:with)
+     selector, property = name ? [name.to_s, name.to_sym] : options.to_a.flatten
+     raise ArgumentError, "invalid rule declaration: #{args.inspect}" unless property
+     # eval block in context of a new scraper subclass
+     delegate = Class.new(delegate || Nibbler, &block) if block_given?
+     return selector, property, delegate
+   end
+
+   def initialize_plural_accessors
+     self.class.rules.each do |name, (s, k, plural)|
+       send("#{name}=", []) if plural
+     end
+   end
+
+   def self.convert_document(doc)
+     unless doc.respond_to?(:at) && doc.respond_to?(:search)
+       require 'nokogiri' unless defined? ::Nokogiri
+       Nokogiri doc
+     else
+       doc
+     end
+   end
+ end
+
+
+ ## specs
+
+ if __FILE__ == $0
+   require 'spec/autorun'
+   HTML = DATA.read
+
+   class Article < Nibbler
+     element 'h1' => :title
+     element 'a/@href' => :link
+   end
+
+   class TimestampedArticle < Article
+     element 'p.pubdate' => :published, :with => lambda { |node|
+       node.inner_text.sub('Published on ', '')
+     }
+
+     def published_date
+       @date ||= Date.parse published
+     end
+   end
+
+   class SpecialArticle < Article
+     element 'span'
+   end
+
+   class BlogScraper < Nibbler
+     element :title
+     elements '#nav li' => :navigation_items
+   end
+
+   class OverrideBlogScraper < BlogScraper
+     elements :title
+     element '#nav li' => :navigation_items
+   end
+
+   class BlogWithArticles < BlogScraper
+     elements 'div.hentry' => :articles, :with => Article
+   end
+
+   class BlogWithTimestampedArticles < BlogScraper
+     elements 'div.hentry' => :articles, :with => TimestampedArticle
+   end
+
+   class BlogWithArticlesBlock < BlogScraper
+     elements 'div.hentry' => :articles do
+       element 'h1' => :title
+     end
+   end
+
+   class FakeHtmlParser
+     def initialize(name)
+       @name = name
+     end
+
+     def at(selector)
+       "fake #{@name}"
+     end
+
+     def search(selector)
+       (1..3).map { |n| self.class.new(@name + n.to_s) }
+     end
+   end
+
+   describe BlogWithTimestampedArticles do
+     before(:all) do
+       @blog = described_class.parse(HTML)
+     end
+
+     it "should have title" do
+       @blog.title.should == 'Maximum awesome'
+     end
+
+     it "should have articles" do
+       @blog.should have(2).articles
+     end
+
+     it "should have navigation items" do
+       @blog.should have(3).navigation_items
+       @blog.navigation_items.should == %w[Home About Help]
+     end
+
+     it "should have title, pubdate for first article" do
+       article = @blog.articles[0]
+       article.title.should == 'First article'
+       article.published.should == 'Oct 1'
+       article.published_date.month.should == 10
+       article.published_date.day.should == 1
+       article.link.should be_nil
+     end
+
+     it "should have title, link for second article" do
+       article = @blog.articles[1]
+       article.title.should == 'Second article'
+       article.published.should == 'Sep 5'
+       article.link.should == 'http://mislav.uniqpath.com'
+     end
+
+     it "should override singular properties when re-parsing" do
+       blog = @blog.dup
+       blog.instance_variable_set('@doc', Nokogiri::HTML(''))
+       blog.parse
+       blog.title.should be_nil
+       blog.should have(2).articles
+     end
+   end
+
+   describe SpecialArticle do
+     before(:all) do
+       doc = Nokogiri::HTML(HTML).at('//div[2]')
+       @article = described_class.parse(doc)
+       @parent_article = described_class.superclass.parse(doc)
+     end
+
+     it "should inherit title parsing from parent" do
+       @article.title.should == 'Second article'
+     end
+
+     it "should have additional 'span' rule" do
+       @article.span.should == 'My blog'
+     end
+
+     it "should not let superclass inherit rules" do
+       @parent_article.should_not respond_to(:span)
+     end
+   end
+
+   describe BlogWithArticles, 'with fake HTML parser' do
+     before(:all) do
+       doc = FakeHtmlParser.new('test')
+       @blog = described_class.parse(doc)
+     end
+
+     it "should have fake title" do
+       @blog.title.should == 'fake test'
+     end
+
+     it "should have fake articles" do
+       titles = @blog.articles.map { |a| a.title }
+       titles.should == ['fake test1', 'fake test2', 'fake test3']
+     end
+   end
+
+   describe OverrideBlogScraper do
+     before(:all) do
+       @blog = described_class.parse(HTML)
+     end
+
+     it "should have plural titles" do
+       @blog.title.should == ['Maximum awesome']
+     end
+
+     it "should have singular navigation item" do
+       @blog.navigation_items.should == 'Home'
+     end
+   end
+
+   describe BlogWithArticlesBlock do
+     before(:all) do
+       @blog = described_class.parse(HTML)
+     end
+
+     it "should have article objects" do
+       titles = @blog.articles.map { |article| article.title }
+       titles.should == ['First article', 'Second article']
+     end
+   end
+ end
+
+ __END__
+ <!doctype html>
+ <title>Maximum awesome</title>
+
+ <body>
+   <ol id="nav">
+     <li>Home</li>
+     <li>About</li>
+     <li>Help</li>
+   </ol>
+
+   <div class="hentry">
+     <h1>First article</h1>
+     <p class="pubdate">Published on Oct 1</p>
+   </div>
+
+   <div class="hentry">
+     <h1>Second article</h1>
+     <p class="pubdate">Published on Sep 5</p>
+     <span><a href="http://mislav.uniqpath.com">My blog</a></span>
+   </div>
+ </body>
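
As the comment on `parse_result` notes, a `:with` delegate only has to respond to `call` or to `parse`, so a lambda and a nested scraper class are interchangeable ways of post-processing a matched node (examples/delicious.rb above uses both). A short sketch restating that contract; the `Product` and `Price` classes and their selectors are invented for illustration:

    require 'nibbler'

    # delegate that responds to `parse`: any Nibbler subclass
    class Price < Nibbler
      element '.amount' => :amount, :with => lambda { |node| node.inner_text.to_f }
      element '.currency' => :currency
    end

    class Product < Nibbler
      # delegate that responds to `call`
      element 'h1' => :name, :with => lambda { |node| node.inner_text.strip }
      # delegate that responds to `parse`
      element 'div.price' => :price, :with => Price
    end

    html = '<h1> Widget </h1>' \
           '<div class="price"><span class="amount">9.99</span>' \
           '<span class="currency">USD</span></div>'

    product = Product.parse(html)
    product.name            #=> "Widget"
    product.price.amount    #=> 9.99
    product.price.currency  #=> "USD"
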
data/lib/nibbler/json.rb ADDED
@@ -0,0 +1,27 @@
+ require 'nibbler'
+
+ # a wrapper for JSON data that provides `at` and `search`
+ class Nibbler::JsonDocument
+   def initialize(obj)
+     @data = String === obj ? JSON.parse(obj) : obj
+   end
+
+   def self.[](obj)
+     self === obj ? obj : new(obj)
+   end
+
+   def search(selector)
+     @data.to_a
+   end
+
+   def at(selector)
+     @data[selector]
+   end
+ end
+
+ # a scraper that works with JsonDocument
+ class NibblerJSON < Nibbler
+   def self.convert_document(doc)
+     Nibbler::JsonDocument[doc]
+   end
+ end
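
Since `JsonDocument#at` is a plain hash lookup and `#search` returns the top-level collection, `NibblerJSON` accepts either a JSON string or an already-parsed Ruby hash or array, as examples/twitter.rb shows. A minimal sketch modeled on that example; the `Commit` scraper and its sample data are invented for illustration:

    require 'nibbler/json'
    require 'json'

    class Commit < NibblerJSON
      element 'sha' => :sha
      element 'message' => :message
      element 'author' => :author, :with => NibblerJSON do
        element 'name' => :name
      end
    end

    json = '{"sha": "deadbeef", "message": "first commit", "author": {"name": "Mislav"}}'
    commit = Commit.parse(json)
    commit.sha          #=> "deadbeef"
    commit.message      #=> "first commit"
    commit.author.name  #=> "Mislav"
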
metadata ADDED
@@ -0,0 +1,121 @@
+ --- !ruby/object:Gem::Specification
+ name: nibbler
+ version: !ruby/object:Gem::Version
+   hash: 15
+   prerelease: false
+   segments:
+   - 1
+   - 0
+   version: "1.0"
+ platform: ruby
+ authors:
+ - "Mislav Marohni\xC4\x87"
+ autorequire:
+ bindir: bin
+ cert_chain: []
+
+ date: 2010-08-16 00:00:00 +02:00
+ default_executable:
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: yajl-ruby
+   prerelease: false
+   requirement: &id001 !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         hash: 9
+         segments:
+         - 0
+         - 7
+         - 5
+         version: 0.7.5
+   type: :development
+   version_requirements: *id001
+ - !ruby/object:Gem::Dependency
+   name: nokogiri
+   prerelease: false
+   requirement: &id002 !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         hash: 113
+         segments:
+         - 1
+         - 4
+         - 3
+         - 1
+         version: 1.4.3.1
+   type: :development
+   version_requirements: *id002
+ - !ruby/object:Gem::Dependency
+   name: rspec
+   prerelease: false
+   requirement: &id003 !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         hash: 27
+         segments:
+         - 1
+         - 3
+         - 0
+         version: 1.3.0
+   type: :development
+   version_requirements: *id003
+ description: Nibbler is a super simple and powerful declarative generic scraper written in under 70 lines of code.
+ email: mislav.marohnic@gmail.com
+ executables: []
+
+ extensions: []
+
+ extra_rdoc_files: []
+
+ files:
+ - Rakefile
+ - lib/nibbler/json.rb
+ - lib/nibbler.rb
+ - examples/delicious.rb
+ - examples/tweetburner.rb
+ - examples/twitter.rb
+ - README.md
+ - LICENSE
+ has_rdoc: false
+ homepage: http://github.com/mislav/nibbler
+ licenses: []
+
+ post_install_message:
+ rdoc_options: []
+
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       hash: 3
+       segments:
+       - 0
+       version: "0"
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       hash: 3
+       segments:
+       - 0
+       version: "0"
+ requirements: []
+
+ rubyforge_project:
+ rubygems_version: 1.3.7
+ signing_key:
+ specification_version: 3
+ summary: A cute HTML scraper / data extraction tool
+ test_files: []
+