graboid 0.3.4 → 0.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.mdown +8 -4
- data/VERSION +1 -1
- data/examples/active_rain_post.rb +2 -2
- data/examples/ning_post.rb +2 -2
- data/examples/reddit_post.rb +35 -0
- data/examples/tumblr_post.rb +33 -0
- data/graboid.gemspec +5 -3
- data/lib/graboid.rb +1 -1
- data/lib/graboid/scraper.rb +16 -8
- data/spec/graboid/scraper_spec.rb +9 -1
- data/spec/spec.opts +1 -0
- data/spec/spec_helper.rb +1 -1
- metadata +8 -4
data/README.mdown
CHANGED
@@ -4,6 +4,10 @@
|
|
4
4
|
|
5
5
|
Simply awesome web scraping. Better docs later. See specs.
|
6
6
|
|
7
|
+
### 0.3.4 Update ###
|
8
|
+
|
9
|
+
[http://twoism.posterous.com/new-graboid-dsl](http://twoism.posterous.com/new-graboid-dsl "New DSL")
|
10
|
+
|
7
11
|
### Installation ###
|
8
12
|
|
9
13
|
|
@@ -15,7 +19,7 @@
|
|
15
19
|
%w{rubygems graboid}.each { |f| require f }
|
16
20
|
|
17
21
|
class RedditEntry
|
18
|
-
include Graboid::
|
22
|
+
include Graboid::Scraper
|
19
23
|
|
20
24
|
selector '.entry'
|
21
25
|
|
@@ -26,7 +30,7 @@
|
|
26
30
|
entry.css('a').first['href']
|
27
31
|
end
|
28
32
|
|
29
|
-
|
33
|
+
page_with do |doc|
|
30
34
|
doc.css('p.nextprev a').select{|a| a.text =~ /next/i }.first['href']
|
31
35
|
end
|
32
36
|
|
@@ -38,9 +42,9 @@
|
|
38
42
|
|
39
43
|
end
|
40
44
|
|
41
|
-
RedditEntry.source
|
45
|
+
@posts = RedditEntry.new( :source => 'http://reddit.com' ).all( :max_pages => 2 )
|
42
46
|
|
43
|
-
|
47
|
+
@posts.each do |p|
|
44
48
|
puts "title: #{p.title}"
|
45
49
|
puts "domain: #{p.domain}"
|
46
50
|
puts "link: #{p.link}"
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.3.
|
1
|
+
0.3.5
|
@@ -35,8 +35,8 @@ class ActiveRainPost
|
|
35
35
|
|
36
36
|
end
|
37
37
|
|
38
|
-
ActiveRainPost.source = 'http://activerain.com/blogs/
|
39
|
-
@posts = ActiveRainPost.all(:max_pages =>
|
38
|
+
ActiveRainPost.source = 'http://activerain.com/blogs/danawilkinson'
|
39
|
+
@posts = ActiveRainPost.all(:max_pages => 100)
|
40
40
|
|
41
41
|
@posts.each do |post|
|
42
42
|
puts "#{post.pub_date}"
|
data/examples/ning_post.rb
CHANGED
@@ -46,8 +46,8 @@ class NingPost
|
|
46
46
|
|
47
47
|
end
|
48
48
|
|
49
|
-
NING_URL = 'http://
|
50
|
-
@posts = NingPost.new( :source => NING_URL ).all(:max_pages =>
|
49
|
+
NING_URL = 'http://vstar650.ning.com/profiles/blog/list'
|
50
|
+
@posts = NingPost.new( :source => NING_URL ).all(:max_pages => 10)
|
51
51
|
|
52
52
|
@posts.each do |post|
|
53
53
|
puts "#{post.pub_date} -- #{post.title}"
|
@@ -0,0 +1,35 @@
|
|
1
|
+
dir = File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib'))
|
2
|
+
require File.join(dir, 'graboid')
|
3
|
+
|
4
|
+
class RedditEntry
|
5
|
+
include Graboid::Scraper
|
6
|
+
|
7
|
+
selector '.entry'
|
8
|
+
|
9
|
+
set :title
|
10
|
+
set :domain, :selector => '.domain a'
|
11
|
+
|
12
|
+
set :link, :selector => '.title' do |entry|
|
13
|
+
entry.css('a').first['href']
|
14
|
+
end
|
15
|
+
|
16
|
+
page_with do |doc|
|
17
|
+
self.doc.css('p.nextprev a').select{|a| a.text =~ /next/i }.first['href']
|
18
|
+
end
|
19
|
+
|
20
|
+
before_paginate do
|
21
|
+
puts "opening page: #{self.source}"
|
22
|
+
puts "collection size: #{self.collection.length}"
|
23
|
+
puts "#{"*"*100}"
|
24
|
+
end
|
25
|
+
|
26
|
+
end
|
27
|
+
|
28
|
+
@posts = RedditEntry.new( :source => 'http://reddit.com' ).all( :max_pages => 3 )
|
29
|
+
|
30
|
+
@posts.each do |p|
|
31
|
+
puts "title: #{p.title}"
|
32
|
+
puts "domain: #{p.domain}"
|
33
|
+
puts "link: #{p.link}"
|
34
|
+
puts "#{"*"*100}"
|
35
|
+
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
dir = File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib'))
|
2
|
+
require File.join(dir, 'graboid')
|
3
|
+
|
4
|
+
class TumblrEntry
|
5
|
+
include Graboid::Scraper
|
6
|
+
TUMBLR_CHUNK_SIZE = 20
|
7
|
+
|
8
|
+
selector 'post'
|
9
|
+
|
10
|
+
set :title, :selector => 'regular-title'
|
11
|
+
|
12
|
+
page_with do |doc|
|
13
|
+
next_tumblr_page
|
14
|
+
end
|
15
|
+
|
16
|
+
def next_tumblr_page
|
17
|
+
return nil if self.doc.css('post').empty?
|
18
|
+
"#{self.original_source}?start=#{self.current_page*TUMBLR_CHUNK_SIZE}"
|
19
|
+
end
|
20
|
+
|
21
|
+
before_paginate do
|
22
|
+
puts "opening page: #{self.source}"
|
23
|
+
puts "collection size: #{self.collection.length}"
|
24
|
+
puts "#{"*"*100}"
|
25
|
+
end
|
26
|
+
|
27
|
+
end
|
28
|
+
|
29
|
+
@posts = TumblrEntry.new( :source => 'http://chrisburnett.tumblr.com/api/read' ).all
|
30
|
+
|
31
|
+
@posts.each do |p|
|
32
|
+
puts "title: #{p.title}"
|
33
|
+
end
|
data/graboid.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{graboid}
|
8
|
-
s.version = "0.3.
|
8
|
+
s.version = "0.3.5"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Christopher Burnett"]
|
12
|
-
s.date = %q{2010-
|
12
|
+
s.date = %q{2010-07-09}
|
13
13
|
s.description = %q{web scraping made easier}
|
14
14
|
s.email = %q{signalstatic@gmail.com}
|
15
15
|
s.extra_rdoc_files = [
|
@@ -53,7 +53,9 @@ Gem::Specification.new do |s|
|
|
53
53
|
"spec/spec_helper.rb",
|
54
54
|
"examples/active_rain_post.rb",
|
55
55
|
"examples/live_journal_post.rb",
|
56
|
-
"examples/ning_post.rb"
|
56
|
+
"examples/ning_post.rb",
|
57
|
+
"examples/reddit_post.rb",
|
58
|
+
"examples/tumblr_post.rb"
|
57
59
|
]
|
58
60
|
|
59
61
|
if s.respond_to? :specification_version then
|
data/lib/graboid.rb
CHANGED
data/lib/graboid/scraper.rb
CHANGED
@@ -25,11 +25,9 @@ module Graboid
|
|
25
25
|
end
|
26
26
|
|
27
27
|
def page_with &block
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
def pager
|
32
|
-
@pager
|
28
|
+
define_method :pager do
|
29
|
+
instance_eval &block
|
30
|
+
end
|
33
31
|
end
|
34
32
|
|
35
33
|
def root_selector
|
@@ -76,7 +74,8 @@ module Graboid
|
|
76
74
|
alias_method :scrape, :all
|
77
75
|
|
78
76
|
def all_fragments
|
79
|
-
return page_fragments
|
77
|
+
return page_fragments unless self.respond_to?(:pager)
|
78
|
+
return page_fragments if self.pager(self.doc).nil?
|
80
79
|
old_source = self.source
|
81
80
|
|
82
81
|
while next_page?
|
@@ -151,18 +150,22 @@ module Graboid
|
|
151
150
|
|
152
151
|
def next_page?
|
153
152
|
if max_pages.zero?
|
154
|
-
return true unless self.
|
153
|
+
return true unless self.pager(doc).nil?
|
155
154
|
else
|
156
155
|
current_page <= max_pages-1
|
157
156
|
end
|
158
157
|
end
|
159
158
|
|
159
|
+
def original_source
|
160
|
+
@original_source
|
161
|
+
end
|
162
|
+
|
160
163
|
def page_fragments
|
161
164
|
doc.css(self.class.root_selector)
|
162
165
|
end
|
163
166
|
|
164
167
|
def paginate
|
165
|
-
next_page_url = self.
|
168
|
+
next_page_url = self.pager(doc)
|
166
169
|
self.source = next_page_url
|
167
170
|
self.current_page += 1
|
168
171
|
end
|
@@ -182,11 +185,16 @@ module Graboid
|
|
182
185
|
self.max_pages = 0
|
183
186
|
end
|
184
187
|
|
188
|
+
def host
|
189
|
+
self.source.scan(/http[s]?:\/\/.*\//).first
|
190
|
+
end
|
191
|
+
|
185
192
|
def source
|
186
193
|
@source
|
187
194
|
end
|
188
195
|
|
189
196
|
def source=(src)
|
197
|
+
@original_source = src if @original_source.nil?
|
190
198
|
@source = src
|
191
199
|
end
|
192
200
|
|
@@ -37,7 +37,7 @@ class ScraperWithPager
|
|
37
37
|
end
|
38
38
|
|
39
39
|
page_with do |doc|
|
40
|
-
'http://localhost:9393'+doc.css('a.next').first['href'] rescue nil
|
40
|
+
'http://localhost:9393'+self.doc.css('a.next').first['href'] rescue nil
|
41
41
|
end
|
42
42
|
|
43
43
|
before_paginate do
|
@@ -172,16 +172,24 @@ describe Graboid::Scraper do
|
|
172
172
|
describe "with a limit" do
|
173
173
|
before(:each) do
|
174
174
|
@scraper = ScraperWithPager.new( :source => 'http://localhost:9393/posts' )
|
175
|
+
#@scraper.expects(:run_before_paginate_callbacks).times(3)
|
175
176
|
@posts = @scraper.all(:max_pages => 3)
|
176
177
|
end
|
178
|
+
it "should set the callback" do
|
179
|
+
@scraper.callbacks[:before_paginate].should be_a Proc
|
180
|
+
end
|
177
181
|
it "should get 6 posts" do
|
178
182
|
@posts.length.should == 6
|
179
183
|
end
|
184
|
+
|
185
|
+
|
186
|
+
|
180
187
|
end
|
181
188
|
|
182
189
|
describe "without a limit" do
|
183
190
|
before(:each) do
|
184
191
|
@scraper = ScraperWithPager.new( :source => 'http://localhost:9393/posts' )
|
192
|
+
@scraper.expects(:run_before_paginate_callbacks).times(8)
|
185
193
|
@posts = @scraper.all
|
186
194
|
end
|
187
195
|
it "should get 16 posts" do
|
data/spec/spec.opts
CHANGED
data/spec/spec_helper.rb
CHANGED
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: graboid
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 25
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 3
|
9
|
-
-
|
10
|
-
version: 0.3.
|
9
|
+
- 5
|
10
|
+
version: 0.3.5
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Christopher Burnett
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2010-
|
18
|
+
date: 2010-07-09 00:00:00 -07:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -94,6 +94,8 @@ files:
|
|
94
94
|
- spec/graboid_spec.rb
|
95
95
|
- spec/spec.opts
|
96
96
|
- spec/spec_helper.rb
|
97
|
+
- examples/reddit_post.rb
|
98
|
+
- examples/tumblr_post.rb
|
97
99
|
has_rdoc: true
|
98
100
|
homepage: http://github.com/twoism/graboid
|
99
101
|
licenses: []
|
@@ -137,3 +139,5 @@ test_files:
|
|
137
139
|
- examples/active_rain_post.rb
|
138
140
|
- examples/live_journal_post.rb
|
139
141
|
- examples/ning_post.rb
|
142
|
+
- examples/reddit_post.rb
|
143
|
+
- examples/tumblr_post.rb
|