graboid 0.3.4 → 0.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.mdown CHANGED
@@ -4,6 +4,10 @@
 
  Simply awesome web scraping. Better docs later. See specs.
 
+ ### 0.3.4 Update ###
+
+ [http://twoism.posterous.com/new-graboid-dsl](http://twoism.posterous.com/new-graboid-dsl, "New DSL")
+
  ### Installation ###
 
 
@@ -15,7 +19,7 @@
  %w{rubygems graboid}.each { |f| require f }
 
  class RedditEntry
- include Graboid::Entity
+ include Graboid::Scraper
 
  selector '.entry'
 
@@ -26,7 +30,7 @@
  entry.css('a').first['href']
  end
 
- pager do |doc|
+ page_with do |doc|
  doc.css('p.nextprev a').select{|a| a.text =~ /next/i }.first['href']
  end
 
@@ -38,9 +42,9 @@
 
  end
 
- RedditEntry.source = 'http://reddit.com'
+ @posts = RedditEntry.new( :source => 'http://reddit.com' ).all( :max_pages => 2 )
 
- RedditEntry.all(:max_pages => 5).each do |p|
+ @posts.each do |p|
  puts "title: #{p.title}"
  puts "domain: #{p.domain}"
  puts "link: #{p.link}"
data/VERSION CHANGED
@@ -1 +1 @@
- 0.3.4
+ 0.3.5
data/examples/active_rain_post.rb CHANGED
@@ -35,8 +35,8 @@ class ActiveRainPost
 
  end
 
- ActiveRainPost.source = 'http://activerain.com/blogs/elizabethweintraub'
- @posts = ActiveRainPost.all(:max_pages => 1)
+ ActiveRainPost.source = 'http://activerain.com/blogs/danawilkinson'
+ @posts = ActiveRainPost.all(:max_pages => 100)
 
  @posts.each do |post|
  puts "#{post.pub_date}"
data/examples/ning_post.rb CHANGED
@@ -46,8 +46,8 @@ class NingPost
 
  end
 
- NING_URL = 'http://www.friendsorenemies.com/profiles/blog/list?user=3vx1daeuxrt14'
- @posts = NingPost.new( :source => NING_URL ).all(:max_pages => 2)
+ NING_URL = 'http://vstar650.ning.com/profiles/blog/list'
+ @posts = NingPost.new( :source => NING_URL ).all(:max_pages => 10)
 
  @posts.each do |post|
  puts "#{post.pub_date} -- #{post.title}"
data/examples/reddit_post.rb ADDED
@@ -0,0 +1,35 @@
+ dir = File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib'))
+ require File.join(dir, 'graboid')
+
+ class RedditEntry
+ include Graboid::Scraper
+
+ selector '.entry'
+
+ set :title
+ set :domain, :selector => '.domain a'
+
+ set :link, :selector => '.title' do |entry|
+ entry.css('a').first['href']
+ end
+
+ page_with do |doc|
+ self.doc.css('p.nextprev a').select{|a| a.text =~ /next/i }.first['href']
+ end
+
+ before_paginate do
+ puts "opening page: #{self.source}"
+ puts "collection size: #{self.collection.length}"
+ puts "#{"*"*100}"
+ end
+
+ end
+
+ @posts = RedditEntry.new( :source => 'http://reddit.com' ).all( :max_pages => 3 )
+
+ @posts.each do |p|
+ puts "title: #{p.title}"
+ puts "domain: #{p.domain}"
+ puts "link: #{p.link}"
+ puts "#{"*"*100}"
+ end
data/examples/tumblr_post.rb ADDED
@@ -0,0 +1,33 @@
+ dir = File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib'))
+ require File.join(dir, 'graboid')
+
+ class TumblrEntry
+ include Graboid::Scraper
+ TUMBLR_CHUNK_SIZE = 20
+
+ selector 'post'
+
+ set :title, :selector => 'regular-title'
+
+ page_with do |doc|
+ next_tumblr_page
+ end
+
+ def next_tumblr_page
+ return nil if self.doc.css('post').empty?
+ "#{self.original_source}?start=#{self.current_page*TUMBLR_CHUNK_SIZE}"
+ end
+
+ before_paginate do
+ puts "opening page: #{self.source}"
+ puts "collection size: #{self.collection.length}"
+ puts "#{"*"*100}"
+ end
+
+ end
+
+ @posts = TumblrEntry.new( :source => 'http://chrisburnett.tumblr.com/api/read' ).all
+
+ @posts.each do |p|
+ puts "title: #{p.title}"
+ end
data/graboid.gemspec CHANGED
@@ -5,11 +5,11 @@
 
  Gem::Specification.new do |s|
  s.name = %q{graboid}
- s.version = "0.3.4"
+ s.version = "0.3.5"
 
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["Christopher Burnett"]
- s.date = %q{2010-06-16}
+ s.date = %q{2010-07-09}
  s.description = %q{web scraping made easier}
  s.email = %q{signalstatic@gmail.com}
  s.extra_rdoc_files = [
@@ -53,7 +53,9 @@ Gem::Specification.new do |s|
  "spec/spec_helper.rb",
  "examples/active_rain_post.rb",
  "examples/live_journal_post.rb",
- "examples/ning_post.rb"
+ "examples/ning_post.rb",
+ "examples/reddit_post.rb",
+ "examples/tumblr_post.rb"
  ]
 
  if s.respond_to? :specification_version then
data/lib/graboid.rb CHANGED
@@ -9,7 +9,7 @@ module Graboid
  extend self
 
  def user_agent
- @user_agent ||= 'Graboid'
+ @user_agent ||= 'Foo'
  end
 
  def user_agent=(agent)
@@ -25,11 +25,9 @@
  end
 
  def page_with &block
- @pager = block
- end
-
- def pager
- @pager
+ define_method :pager do
+ instance_eval &block
+ end
  end
 
  def root_selector
@@ -76,7 +74,8 @@
  alias_method :scrape, :all
 
  def all_fragments
- return page_fragments if self.class.pager.nil?
+ return page_fragments unless self.respond_to?(:pager)
+ return page_fragments if self.pager(self.doc).nil?
  old_source = self.source
 
  while next_page?
@@ -151,18 +150,22 @@
 
  def next_page?
  if max_pages.zero?
- return true unless self.class.pager.call(doc).nil?
+ return true unless self.pager(doc).nil?
  else
  current_page <= max_pages-1
  end
  end
 
+ def original_source
+ @original_source
+ end
+
  def page_fragments
  doc.css(self.class.root_selector)
  end
 
  def paginate
- next_page_url = self.class.pager.call(doc) rescue nil
+ next_page_url = self.pager(doc)
  self.source = next_page_url
  self.current_page += 1
  end
@@ -182,11 +185,16 @@
  self.max_pages = 0
  end
 
+ def host
+ self.source.scan(/http[s]?:\/\/.*\//).first
+ end
+
  def source
  @source
  end
 
  def source=(src)
+ @original_source = src if @original_source.nil?
  @source = src
  end
 
data/spec/graboid_spec.rb CHANGED
@@ -37,7 +37,7 @@ class ScraperWithPager
  end
 
  page_with do |doc|
- 'http://localhost:9393'+doc.css('a.next').first['href'] rescue nil
+ 'http://localhost:9393'+self.doc.css('a.next').first['href'] rescue nil
  end
 
  before_paginate do
@@ -172,16 +172,24 @@ describe Graboid::Scraper do
  describe "with a limit" do
  before(:each) do
  @scraper = ScraperWithPager.new( :source => 'http://localhost:9393/posts' )
+ #@scraper.expects(:run_before_paginate_callbacks).times(3)
  @posts = @scraper.all(:max_pages => 3)
  end
+ it "should set the callback" do
+ @scraper.callbacks[:before_paginate].should be_a Proc
+ end
  it "should get 6 posts" do
  @posts.length.should == 6
  end
+
+
+
  end
 
  describe "without a limit" do
  before(:each) do
  @scraper = ScraperWithPager.new( :source => 'http://localhost:9393/posts' )
+ @scraper.expects(:run_before_paginate_callbacks).times(8)
  @posts = @scraper.all
  end
  it "should get 16 posts" do
data/spec/spec.opts CHANGED
@@ -1,3 +1,4 @@
  --colour
  --format nested
  --loadby mtime
+ --backtrace
data/spec/spec_helper.rb CHANGED
@@ -5,7 +5,7 @@ require 'spec'
  require 'spec/autorun'
 
  Spec::Runner.configure do |config|
-
+ config.mock_with :mocha
  end
 
  FIXTURE_PATH = File.expand_path(File.dirname(__FILE__)+'/fixtures/posts.html')
metadata CHANGED
@@ -1,13 +1,13 @@
  --- !ruby/object:Gem::Specification
  name: graboid
  version: !ruby/object:Gem::Version
- hash: 27
+ hash: 25
  prerelease: false
  segments:
  - 0
  - 3
- - 4
- version: 0.3.4
+ - 5
+ version: 0.3.5
  platform: ruby
  authors:
  - Christopher Burnett
@@ -15,7 +15,7 @@ autorequire:
  bindir: bin
  cert_chain: []
 
- date: 2010-06-16 00:00:00 -07:00
+ date: 2010-07-09 00:00:00 -07:00
  default_executable:
  dependencies:
  - !ruby/object:Gem::Dependency
@@ -94,6 +94,8 @@ files:
  - spec/graboid_spec.rb
  - spec/spec.opts
  - spec/spec_helper.rb
+ - examples/reddit_post.rb
+ - examples/tumblr_post.rb
  has_rdoc: true
  homepage: http://github.com/twoism/graboid
  licenses: []
@@ -137,3 +139,5 @@ test_files:
  - examples/active_rain_post.rb
  - examples/live_journal_post.rb
  - examples/ning_post.rb
+ - examples/reddit_post.rb
+ - examples/tumblr_post.rb