graboid 0.3.4 → 0.3.5
Sign up to get free protection for your applications and to get access to all the features.
- data/README.mdown +8 -4
- data/VERSION +1 -1
- data/examples/active_rain_post.rb +2 -2
- data/examples/ning_post.rb +2 -2
- data/examples/reddit_post.rb +35 -0
- data/examples/tumblr_post.rb +33 -0
- data/graboid.gemspec +5 -3
- data/lib/graboid.rb +1 -1
- data/lib/graboid/scraper.rb +16 -8
- data/spec/graboid/scraper_spec.rb +9 -1
- data/spec/spec.opts +1 -0
- data/spec/spec_helper.rb +1 -1
- metadata +8 -4
data/README.mdown
CHANGED
@@ -4,6 +4,10 @@
|
|
4
4
|
|
5
5
|
Simply awesome web scraping. Better docs later. See specs.
|
6
6
|
|
7
|
+
### 0.3.4 Update ###
|
8
|
+
|
9
|
+
[http://twoism.posterous.com/new-graboid-dsl](http://twoism.posterous.com/new-graboid-dsl "New DSL")
|
10
|
+
|
7
11
|
### Installation ###
|
8
12
|
|
9
13
|
|
@@ -15,7 +19,7 @@
|
|
15
19
|
%w{rubygems graboid}.each { |f| require f }
|
16
20
|
|
17
21
|
class RedditEntry
|
18
|
-
include Graboid::
|
22
|
+
include Graboid::Scraper
|
19
23
|
|
20
24
|
selector '.entry'
|
21
25
|
|
@@ -26,7 +30,7 @@
|
|
26
30
|
entry.css('a').first['href']
|
27
31
|
end
|
28
32
|
|
29
|
-
|
33
|
+
page_with do |doc|
|
30
34
|
doc.css('p.nextprev a').select{|a| a.text =~ /next/i }.first['href']
|
31
35
|
end
|
32
36
|
|
@@ -38,9 +42,9 @@
|
|
38
42
|
|
39
43
|
end
|
40
44
|
|
41
|
-
RedditEntry.source
|
45
|
+
@posts = RedditEntry.new( :source => 'http://reddit.com' ).all( :max_pages => 2 )
|
42
46
|
|
43
|
-
|
47
|
+
@posts.each do |p|
|
44
48
|
puts "title: #{p.title}"
|
45
49
|
puts "domain: #{p.domain}"
|
46
50
|
puts "link: #{p.link}"
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.3.
|
1
|
+
0.3.5
|
@@ -35,8 +35,8 @@ class ActiveRainPost
|
|
35
35
|
|
36
36
|
end
|
37
37
|
|
38
|
-
ActiveRainPost.source = 'http://activerain.com/blogs/
|
39
|
-
@posts = ActiveRainPost.all(:max_pages =>
|
38
|
+
ActiveRainPost.source = 'http://activerain.com/blogs/danawilkinson'
|
39
|
+
@posts = ActiveRainPost.all(:max_pages => 100)
|
40
40
|
|
41
41
|
@posts.each do |post|
|
42
42
|
puts "#{post.pub_date}"
|
data/examples/ning_post.rb
CHANGED
@@ -46,8 +46,8 @@ class NingPost
|
|
46
46
|
|
47
47
|
end
|
48
48
|
|
49
|
-
NING_URL = 'http://
|
50
|
-
@posts = NingPost.new( :source => NING_URL ).all(:max_pages =>
|
49
|
+
NING_URL = 'http://vstar650.ning.com/profiles/blog/list'
|
50
|
+
@posts = NingPost.new( :source => NING_URL ).all(:max_pages => 10)
|
51
51
|
|
52
52
|
@posts.each do |post|
|
53
53
|
puts "#{post.pub_date} -- #{post.title}"
|
@@ -0,0 +1,35 @@
|
|
1
|
+
dir = File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib'))
|
2
|
+
require File.join(dir, 'graboid')
|
3
|
+
|
4
|
+
class RedditEntry
|
5
|
+
include Graboid::Scraper
|
6
|
+
|
7
|
+
selector '.entry'
|
8
|
+
|
9
|
+
set :title
|
10
|
+
set :domain, :selector => '.domain a'
|
11
|
+
|
12
|
+
set :link, :selector => '.title' do |entry|
|
13
|
+
entry.css('a').first['href']
|
14
|
+
end
|
15
|
+
|
16
|
+
page_with do |doc|
|
17
|
+
self.doc.css('p.nextprev a').select{|a| a.text =~ /next/i }.first['href']
|
18
|
+
end
|
19
|
+
|
20
|
+
before_paginate do
|
21
|
+
puts "opening page: #{self.source}"
|
22
|
+
puts "collection size: #{self.collection.length}"
|
23
|
+
puts "#{"*"*100}"
|
24
|
+
end
|
25
|
+
|
26
|
+
end
|
27
|
+
|
28
|
+
@posts = RedditEntry.new( :source => 'http://reddit.com' ).all( :max_pages => 3 )
|
29
|
+
|
30
|
+
@posts.each do |p|
|
31
|
+
puts "title: #{p.title}"
|
32
|
+
puts "domain: #{p.domain}"
|
33
|
+
puts "link: #{p.link}"
|
34
|
+
puts "#{"*"*100}"
|
35
|
+
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
dir = File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib'))
|
2
|
+
require File.join(dir, 'graboid')
|
3
|
+
|
4
|
+
class TumblrEntry
|
5
|
+
include Graboid::Scraper
|
6
|
+
TUMBLR_CHUNK_SIZE = 20
|
7
|
+
|
8
|
+
selector 'post'
|
9
|
+
|
10
|
+
set :title, :selector => 'regular-title'
|
11
|
+
|
12
|
+
page_with do |doc|
|
13
|
+
next_tumblr_page
|
14
|
+
end
|
15
|
+
|
16
|
+
def next_tumblr_page
|
17
|
+
return nil if self.doc.css('post').empty?
|
18
|
+
"#{self.original_source}?start=#{self.current_page*TUMBLR_CHUNK_SIZE}"
|
19
|
+
end
|
20
|
+
|
21
|
+
before_paginate do
|
22
|
+
puts "opening page: #{self.source}"
|
23
|
+
puts "collection size: #{self.collection.length}"
|
24
|
+
puts "#{"*"*100}"
|
25
|
+
end
|
26
|
+
|
27
|
+
end
|
28
|
+
|
29
|
+
@posts = TumblrEntry.new( :source => 'http://chrisburnett.tumblr.com/api/read' ).all
|
30
|
+
|
31
|
+
@posts.each do |p|
|
32
|
+
puts "title: #{p.title}"
|
33
|
+
end
|
data/graboid.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{graboid}
|
8
|
-
s.version = "0.3.
|
8
|
+
s.version = "0.3.5"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Christopher Burnett"]
|
12
|
-
s.date = %q{2010-
|
12
|
+
s.date = %q{2010-07-09}
|
13
13
|
s.description = %q{web scraping made easier}
|
14
14
|
s.email = %q{signalstatic@gmail.com}
|
15
15
|
s.extra_rdoc_files = [
|
@@ -53,7 +53,9 @@ Gem::Specification.new do |s|
|
|
53
53
|
"spec/spec_helper.rb",
|
54
54
|
"examples/active_rain_post.rb",
|
55
55
|
"examples/live_journal_post.rb",
|
56
|
-
"examples/ning_post.rb"
|
56
|
+
"examples/ning_post.rb",
|
57
|
+
"examples/reddit_post.rb",
|
58
|
+
"examples/tumblr_post.rb"
|
57
59
|
]
|
58
60
|
|
59
61
|
if s.respond_to? :specification_version then
|
data/lib/graboid.rb
CHANGED
data/lib/graboid/scraper.rb
CHANGED
@@ -25,11 +25,9 @@ module Graboid
|
|
25
25
|
end
|
26
26
|
|
27
27
|
def page_with &block
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
def pager
|
32
|
-
@pager
|
28
|
+
define_method :pager do
|
29
|
+
instance_eval &block
|
30
|
+
end
|
33
31
|
end
|
34
32
|
|
35
33
|
def root_selector
|
@@ -76,7 +74,8 @@ module Graboid
|
|
76
74
|
alias_method :scrape, :all
|
77
75
|
|
78
76
|
def all_fragments
|
79
|
-
return page_fragments
|
77
|
+
return page_fragments unless self.respond_to?(:pager)
|
78
|
+
return page_fragments if self.pager(self.doc).nil?
|
80
79
|
old_source = self.source
|
81
80
|
|
82
81
|
while next_page?
|
@@ -151,18 +150,22 @@ module Graboid
|
|
151
150
|
|
152
151
|
def next_page?
|
153
152
|
if max_pages.zero?
|
154
|
-
return true unless self.
|
153
|
+
return true unless self.pager(doc).nil?
|
155
154
|
else
|
156
155
|
current_page <= max_pages-1
|
157
156
|
end
|
158
157
|
end
|
159
158
|
|
159
|
+
def original_source
|
160
|
+
@original_source
|
161
|
+
end
|
162
|
+
|
160
163
|
def page_fragments
|
161
164
|
doc.css(self.class.root_selector)
|
162
165
|
end
|
163
166
|
|
164
167
|
def paginate
|
165
|
-
next_page_url = self.
|
168
|
+
next_page_url = self.pager(doc)
|
166
169
|
self.source = next_page_url
|
167
170
|
self.current_page += 1
|
168
171
|
end
|
@@ -182,11 +185,16 @@ module Graboid
|
|
182
185
|
self.max_pages = 0
|
183
186
|
end
|
184
187
|
|
188
|
+
def host
|
189
|
+
self.source.scan(/http[s]?:\/\/.*\//).first
|
190
|
+
end
|
191
|
+
|
185
192
|
def source
|
186
193
|
@source
|
187
194
|
end
|
188
195
|
|
189
196
|
def source=(src)
|
197
|
+
@original_source = src if @original_source.nil?
|
190
198
|
@source = src
|
191
199
|
end
|
192
200
|
|
@@ -37,7 +37,7 @@ class ScraperWithPager
|
|
37
37
|
end
|
38
38
|
|
39
39
|
page_with do |doc|
|
40
|
-
'http://localhost:9393'+doc.css('a.next').first['href'] rescue nil
|
40
|
+
'http://localhost:9393'+self.doc.css('a.next').first['href'] rescue nil
|
41
41
|
end
|
42
42
|
|
43
43
|
before_paginate do
|
@@ -172,16 +172,24 @@ describe Graboid::Scraper do
|
|
172
172
|
describe "with a limit" do
|
173
173
|
before(:each) do
|
174
174
|
@scraper = ScraperWithPager.new( :source => 'http://localhost:9393/posts' )
|
175
|
+
#@scraper.expects(:run_before_paginate_callbacks).times(3)
|
175
176
|
@posts = @scraper.all(:max_pages => 3)
|
176
177
|
end
|
178
|
+
it "should set the callback" do
|
179
|
+
@scraper.callbacks[:before_paginate].should be_a Proc
|
180
|
+
end
|
177
181
|
it "should get 6 posts" do
|
178
182
|
@posts.length.should == 6
|
179
183
|
end
|
184
|
+
|
185
|
+
|
186
|
+
|
180
187
|
end
|
181
188
|
|
182
189
|
describe "without a limit" do
|
183
190
|
before(:each) do
|
184
191
|
@scraper = ScraperWithPager.new( :source => 'http://localhost:9393/posts' )
|
192
|
+
@scraper.expects(:run_before_paginate_callbacks).times(8)
|
185
193
|
@posts = @scraper.all
|
186
194
|
end
|
187
195
|
it "should get 16 posts" do
|
data/spec/spec.opts
CHANGED
data/spec/spec_helper.rb
CHANGED
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: graboid
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 25
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 3
|
9
|
-
-
|
10
|
-
version: 0.3.
|
9
|
+
- 5
|
10
|
+
version: 0.3.5
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Christopher Burnett
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2010-
|
18
|
+
date: 2010-07-09 00:00:00 -07:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -94,6 +94,8 @@ files:
|
|
94
94
|
- spec/graboid_spec.rb
|
95
95
|
- spec/spec.opts
|
96
96
|
- spec/spec_helper.rb
|
97
|
+
- examples/reddit_post.rb
|
98
|
+
- examples/tumblr_post.rb
|
97
99
|
has_rdoc: true
|
98
100
|
homepage: http://github.com/twoism/graboid
|
99
101
|
licenses: []
|
@@ -137,3 +139,5 @@ test_files:
|
|
137
139
|
- examples/active_rain_post.rb
|
138
140
|
- examples/live_journal_post.rb
|
139
141
|
- examples/ning_post.rb
|
142
|
+
- examples/reddit_post.rb
|
143
|
+
- examples/tumblr_post.rb
|