graboid 0.3.4 → 0.3.5

Sign up to get free protection for your applications and to get access to all the features.
data/README.mdown CHANGED
@@ -4,6 +4,10 @@
4
4
 
5
5
  Simply awesome web scraping. Better docs later. See specs.
6
6
 
7
+ ### 0.3.4 Update ###
8
+
9
+ [http://twoism.posterous.com/new-graboid-dsl](http://twoism.posterous.com/new-graboid-dsl, "New DSL")
10
+
7
11
  ### Installation ###
8
12
 
9
13
 
@@ -15,7 +19,7 @@
15
19
  %w{rubygems graboid}.each { |f| require f }
16
20
 
17
21
  class RedditEntry
18
- include Graboid::Entity
22
+ include Graboid::Scraper
19
23
 
20
24
  selector '.entry'
21
25
 
@@ -26,7 +30,7 @@
26
30
  entry.css('a').first['href']
27
31
  end
28
32
 
29
- pager do |doc|
33
+ page_with do |doc|
30
34
  doc.css('p.nextprev a').select{|a| a.text =~ /next/i }.first['href']
31
35
  end
32
36
 
@@ -38,9 +42,9 @@
38
42
 
39
43
  end
40
44
 
41
- RedditEntry.source = 'http://reddit.com'
45
+ @posts = RedditEntry.new( :source => 'http://reddit.com' ).all( :max_pages => 2 )
42
46
 
43
- RedditEntry.all(:max_pages => 5).each do |p|
47
+ @posts.each do |p|
44
48
  puts "title: #{p.title}"
45
49
  puts "domain: #{p.domain}"
46
50
  puts "link: #{p.link}"
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.3.4
1
+ 0.3.5
@@ -35,8 +35,8 @@ class ActiveRainPost
35
35
 
36
36
  end
37
37
 
38
- ActiveRainPost.source = 'http://activerain.com/blogs/elizabethweintraub'
39
- @posts = ActiveRainPost.all(:max_pages => 1)
38
+ ActiveRainPost.source = 'http://activerain.com/blogs/danawilkinson'
39
+ @posts = ActiveRainPost.all(:max_pages => 100)
40
40
 
41
41
  @posts.each do |post|
42
42
  puts "#{post.pub_date}"
@@ -46,8 +46,8 @@ class NingPost
46
46
 
47
47
  end
48
48
 
49
- NING_URL = 'http://www.friendsorenemies.com/profiles/blog/list?user=3vx1daeuxrt14'
50
- @posts = NingPost.new( :source => NING_URL ).all(:max_pages => 2)
49
+ NING_URL = 'http://vstar650.ning.com/profiles/blog/list'
50
+ @posts = NingPost.new( :source => NING_URL ).all(:max_pages => 10)
51
51
 
52
52
  @posts.each do |post|
53
53
  puts "#{post.pub_date} -- #{post.title}"
@@ -0,0 +1,35 @@
1
+ dir = File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib'))
2
+ require File.join(dir, 'graboid')
3
+
4
+ class RedditEntry
5
+ include Graboid::Scraper
6
+
7
+ selector '.entry'
8
+
9
+ set :title
10
+ set :domain, :selector => '.domain a'
11
+
12
+ set :link, :selector => '.title' do |entry|
13
+ entry.css('a').first['href']
14
+ end
15
+
16
+ page_with do |doc|
17
+ self.doc.css('p.nextprev a').select{|a| a.text =~ /next/i }.first['href']
18
+ end
19
+
20
+ before_paginate do
21
+ puts "opening page: #{self.source}"
22
+ puts "collection size: #{self.collection.length}"
23
+ puts "#{"*"*100}"
24
+ end
25
+
26
+ end
27
+
28
+ @posts = RedditEntry.new( :source => 'http://reddit.com' ).all( :max_pages => 3 )
29
+
30
+ @posts.each do |p|
31
+ puts "title: #{p.title}"
32
+ puts "domain: #{p.domain}"
33
+ puts "link: #{p.link}"
34
+ puts "#{"*"*100}"
35
+ end
@@ -0,0 +1,33 @@
1
+ dir = File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib'))
2
+ require File.join(dir, 'graboid')
3
+
4
+ class TumblrEntry
5
+ include Graboid::Scraper
6
+ TUMBLR_CHUNK_SIZE = 20
7
+
8
+ selector 'post'
9
+
10
+ set :title, :selector => 'regular-title'
11
+
12
+ page_with do |doc|
13
+ next_tumblr_page
14
+ end
15
+
16
+ def next_tumblr_page
17
+ return nil if self.doc.css('post').empty?
18
+ "#{self.original_source}?start=#{self.current_page*TUMBLR_CHUNK_SIZE}"
19
+ end
20
+
21
+ before_paginate do
22
+ puts "opening page: #{self.source}"
23
+ puts "collection size: #{self.collection.length}"
24
+ puts "#{"*"*100}"
25
+ end
26
+
27
+ end
28
+
29
+ @posts = TumblrEntry.new( :source => 'http://chrisburnett.tumblr.com/api/read' ).all
30
+
31
+ @posts.each do |p|
32
+ puts "title: #{p.title}"
33
+ end
data/graboid.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{graboid}
8
- s.version = "0.3.4"
8
+ s.version = "0.3.5"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Christopher Burnett"]
12
- s.date = %q{2010-06-16}
12
+ s.date = %q{2010-07-09}
13
13
  s.description = %q{web scraping made easier}
14
14
  s.email = %q{signalstatic@gmail.com}
15
15
  s.extra_rdoc_files = [
@@ -53,7 +53,9 @@ Gem::Specification.new do |s|
53
53
  "spec/spec_helper.rb",
54
54
  "examples/active_rain_post.rb",
55
55
  "examples/live_journal_post.rb",
56
- "examples/ning_post.rb"
56
+ "examples/ning_post.rb",
57
+ "examples/reddit_post.rb",
58
+ "examples/tumblr_post.rb"
57
59
  ]
58
60
 
59
61
  if s.respond_to? :specification_version then
data/lib/graboid.rb CHANGED
@@ -9,7 +9,7 @@ module Graboid
9
9
  extend self
10
10
 
11
11
  def user_agent
12
- @user_agent ||= 'Graboid'
12
+ @user_agent ||= 'Foo'
13
13
  end
14
14
 
15
15
  def user_agent=(agent)
@@ -25,11 +25,9 @@ module Graboid
25
25
  end
26
26
 
27
27
  def page_with &block
28
- @pager = block
29
- end
30
-
31
- def pager
32
- @pager
28
+ define_method :pager do
29
+ instance_eval &block
30
+ end
33
31
  end
34
32
 
35
33
  def root_selector
@@ -76,7 +74,8 @@ module Graboid
76
74
  alias_method :scrape, :all
77
75
 
78
76
  def all_fragments
79
- return page_fragments if self.class.pager.nil?
77
+ return page_fragments unless self.respond_to?(:pager)
78
+ return page_fragments if self.pager(self.doc).nil?
80
79
  old_source = self.source
81
80
 
82
81
  while next_page?
@@ -151,18 +150,22 @@ module Graboid
151
150
 
152
151
  def next_page?
153
152
  if max_pages.zero?
154
- return true unless self.class.pager.call(doc).nil?
153
+ return true unless self.pager(doc).nil?
155
154
  else
156
155
  current_page <= max_pages-1
157
156
  end
158
157
  end
159
158
 
159
+ def original_source
160
+ @original_source
161
+ end
162
+
160
163
  def page_fragments
161
164
  doc.css(self.class.root_selector)
162
165
  end
163
166
 
164
167
  def paginate
165
- next_page_url = self.class.pager.call(doc) rescue nil
168
+ next_page_url = self.pager(doc)
166
169
  self.source = next_page_url
167
170
  self.current_page += 1
168
171
  end
@@ -182,11 +185,16 @@ module Graboid
182
185
  self.max_pages = 0
183
186
  end
184
187
 
188
+ def host
189
+ self.source.scan(/http[s]?:\/\/.*\//).first
190
+ end
191
+
185
192
  def source
186
193
  @source
187
194
  end
188
195
 
189
196
  def source=(src)
197
+ @original_source = src if @original_source.nil?
190
198
  @source = src
191
199
  end
192
200
 
@@ -37,7 +37,7 @@ class ScraperWithPager
37
37
  end
38
38
 
39
39
  page_with do |doc|
40
- 'http://localhost:9393'+doc.css('a.next').first['href'] rescue nil
40
+ 'http://localhost:9393'+self.doc.css('a.next').first['href'] rescue nil
41
41
  end
42
42
 
43
43
  before_paginate do
@@ -172,16 +172,24 @@ describe Graboid::Scraper do
172
172
  describe "with a limit" do
173
173
  before(:each) do
174
174
  @scraper = ScraperWithPager.new( :source => 'http://localhost:9393/posts' )
175
+ #@scraper.expects(:run_before_paginate_callbacks).times(3)
175
176
  @posts = @scraper.all(:max_pages => 3)
176
177
  end
178
+ it "should set the callback" do
179
+ @scraper.callbacks[:before_paginate].should be_a Proc
180
+ end
177
181
  it "should get 6 posts" do
178
182
  @posts.length.should == 6
179
183
  end
184
+
185
+
186
+
180
187
  end
181
188
 
182
189
  describe "without a limit" do
183
190
  before(:each) do
184
191
  @scraper = ScraperWithPager.new( :source => 'http://localhost:9393/posts' )
192
+ @scraper.expects(:run_before_paginate_callbacks).times(8)
185
193
  @posts = @scraper.all
186
194
  end
187
195
  it "should get 16 posts" do
data/spec/spec.opts CHANGED
@@ -1,3 +1,4 @@
1
1
  --colour
2
2
  --format nested
3
3
  --loadby mtime
4
+ --backtrace
data/spec/spec_helper.rb CHANGED
@@ -5,7 +5,7 @@ require 'spec'
5
5
  require 'spec/autorun'
6
6
 
7
7
  Spec::Runner.configure do |config|
8
-
8
+ config.mock_with :mocha
9
9
  end
10
10
 
11
11
  FIXTURE_PATH = File.expand_path(File.dirname(__FILE__)+'/fixtures/posts.html')
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: graboid
3
3
  version: !ruby/object:Gem::Version
4
- hash: 27
4
+ hash: 25
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
8
  - 3
9
- - 4
10
- version: 0.3.4
9
+ - 5
10
+ version: 0.3.5
11
11
  platform: ruby
12
12
  authors:
13
13
  - Christopher Burnett
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-06-16 00:00:00 -07:00
18
+ date: 2010-07-09 00:00:00 -07:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
@@ -94,6 +94,8 @@ files:
94
94
  - spec/graboid_spec.rb
95
95
  - spec/spec.opts
96
96
  - spec/spec_helper.rb
97
+ - examples/reddit_post.rb
98
+ - examples/tumblr_post.rb
97
99
  has_rdoc: true
98
100
  homepage: http://github.com/twoism/graboid
99
101
  licenses: []
@@ -137,3 +139,5 @@ test_files:
137
139
  - examples/active_rain_post.rb
138
140
  - examples/live_journal_post.rb
139
141
  - examples/ning_post.rb
142
+ - examples/reddit_post.rb
143
+ - examples/tumblr_post.rb