graboid 0.3.3 → 0.3.4
Sign up to get free protection for your applications and to get access to all the features.
- data/VERSION +1 -1
- data/examples/active_rain_post.rb +17 -7
- data/examples/live_journal_post.rb +11 -6
- data/examples/ning_post.rb +7 -6
- data/graboid.gemspec +5 -2
- data/lib/graboid.rb +11 -2
- data/lib/graboid/entity.rb +3 -3
- data/lib/graboid/scraper.rb +204 -0
- data/spec/graboid/scraper_spec.rb +195 -0
- data/spec/spec_helper.rb +2 -0
- metadata +7 -4
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.3.3
|
1
|
+
0.3.4
|
@@ -4,11 +4,21 @@ require File.join(dir, 'graboid')
|
|
4
4
|
class ActiveRainPost
|
5
5
|
include Graboid::Entity
|
6
6
|
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
7
|
+
selector '.blog_entry_wrapper'
|
8
|
+
|
9
|
+
set :title, :selector => 'h2 a'
|
10
|
+
set :pub_date, :selector => '.blog_entry' do |elm|
|
11
|
+
# awesome, the pub date is not contained within
|
12
|
+
# the .blog_entry_wrapper fragment.
|
13
|
+
begin
|
14
|
+
entry_id = elm['id'].gsub('blog_entry_','')
|
15
|
+
date_text = self.doc.css("#divbei#{entry_id} td").select{|td| td.text =~ /posted by/i }.first.text
|
16
|
+
date_text.match(/(\d{2}\/\d{2}\/\d{4})/).captures.first
|
17
|
+
rescue
|
18
|
+
""
|
19
|
+
end
|
20
|
+
end
|
21
|
+
set :body, :selector => 'div' do |elm|
|
12
22
|
elm.css('p').collect(&:to_html)
|
13
23
|
end
|
14
24
|
|
@@ -26,10 +36,10 @@ class ActiveRainPost
|
|
26
36
|
end
|
27
37
|
|
28
38
|
ActiveRainPost.source = 'http://activerain.com/blogs/elizabethweintraub'
|
29
|
-
@posts = ActiveRainPost.all
|
39
|
+
@posts = ActiveRainPost.all(:max_pages => 1)
|
30
40
|
|
31
41
|
@posts.each do |post|
|
32
|
-
puts "#{post.
|
42
|
+
puts "#{post.pub_date}"
|
33
43
|
puts "*"*100
|
34
44
|
end
|
35
45
|
|
@@ -6,9 +6,14 @@ class LiveJournalPost
|
|
6
6
|
|
7
7
|
root '.entrybox'
|
8
8
|
|
9
|
-
field :title,
|
10
|
-
field :body,
|
11
|
-
|
9
|
+
field :title, :selector => '.caption a'
|
10
|
+
field :body, :selector => 'td[@colspan="2"]'
|
11
|
+
|
12
|
+
field :pub_date, :selector => 'td.index' do |elm|
|
13
|
+
elm.text.match(/\[(.*)\|/)[1]
|
14
|
+
end
|
15
|
+
|
16
|
+
field :comment_link, :selector => '.caption a' do |elm|
|
12
17
|
elm['href']
|
13
18
|
end
|
14
19
|
|
@@ -25,11 +30,11 @@ class LiveJournalPost
|
|
25
30
|
|
26
31
|
end
|
27
32
|
|
28
|
-
LiveJournalPost.source
|
29
|
-
@posts
|
33
|
+
LiveJournalPost.source = 'http://zeroplate.livejournal.com/'
|
34
|
+
@posts = LiveJournalPost.all(:max_pages => 3)
|
30
35
|
|
31
36
|
@posts.each do |post|
|
32
|
-
puts "#{post.title}"
|
37
|
+
puts "#{post.pub_date} - #{post.title}"
|
33
38
|
puts "#{post.comment_link}"
|
34
39
|
puts "#{post.body}"
|
35
40
|
puts "*"*100
|
data/examples/ning_post.rb
CHANGED
@@ -1,7 +1,8 @@
|
|
1
|
-
|
1
|
+
dir = File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib'))
|
2
|
+
require File.join(dir, 'graboid')
|
2
3
|
|
3
4
|
class NingPost
|
4
|
-
include Graboid::
|
5
|
+
include Graboid::Scraper
|
5
6
|
|
6
7
|
selector 'div.xg_blog .xg_module_body'
|
7
8
|
|
@@ -25,10 +26,10 @@ class NingPost
|
|
25
26
|
# ning's list page only has an excerpt of the body. No biggie,
|
26
27
|
# we'll just go grab it.
|
27
28
|
show_url = elm.css('a').last["href"]
|
28
|
-
Nokogiri::HTML(open(show_url)).css('.postbody').to_html
|
29
|
+
Nokogiri::HTML(open(show_url,"User-Agent" => Graboid.user_agent)).css('.postbody').to_html
|
29
30
|
end
|
30
31
|
|
31
|
-
|
32
|
+
page_with do |doc|
|
32
33
|
doc.css('.pagination a').select{|a| a.text =~ /previous/i }.first['href'] rescue nil
|
33
34
|
end
|
34
35
|
|
@@ -45,8 +46,8 @@ class NingPost
|
|
45
46
|
|
46
47
|
end
|
47
48
|
|
48
|
-
|
49
|
-
@posts
|
49
|
+
NING_URL = 'http://www.friendsorenemies.com/profiles/blog/list?user=3vx1daeuxrt14'
|
50
|
+
@posts = NingPost.new( :source => NING_URL ).all(:max_pages => 2)
|
50
51
|
|
51
52
|
@posts.each do |post|
|
52
53
|
puts "#{post.pub_date} -- #{post.title}"
|
data/graboid.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{graboid}
|
8
|
-
s.version = "0.3.3"
|
8
|
+
s.version = "0.3.4"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Christopher Burnett"]
|
12
|
-
s.date = %q{2010-06-
|
12
|
+
s.date = %q{2010-06-16}
|
13
13
|
s.description = %q{web scraping made easier}
|
14
14
|
s.email = %q{signalstatic@gmail.com}
|
15
15
|
s.extra_rdoc_files = [
|
@@ -29,11 +29,13 @@ Gem::Specification.new do |s|
|
|
29
29
|
"graboid.gemspec",
|
30
30
|
"lib/graboid.rb",
|
31
31
|
"lib/graboid/entity.rb",
|
32
|
+
"lib/graboid/scraper.rb",
|
32
33
|
"spec/fixtures/graboid.jpg",
|
33
34
|
"spec/fixtures/posts.html",
|
34
35
|
"spec/fixtures/server.rb",
|
35
36
|
"spec/fixtures/views/posts.erb",
|
36
37
|
"spec/graboid/entity_spec.rb",
|
38
|
+
"spec/graboid/scraper_spec.rb",
|
37
39
|
"spec/graboid_spec.rb",
|
38
40
|
"spec/spec.opts",
|
39
41
|
"spec/spec_helper.rb"
|
@@ -46,6 +48,7 @@ Gem::Specification.new do |s|
|
|
46
48
|
s.test_files = [
|
47
49
|
"spec/fixtures/server.rb",
|
48
50
|
"spec/graboid/entity_spec.rb",
|
51
|
+
"spec/graboid/scraper_spec.rb",
|
49
52
|
"spec/graboid_spec.rb",
|
50
53
|
"spec/spec_helper.rb",
|
51
54
|
"examples/active_rain_post.rb",
|
data/lib/graboid.rb
CHANGED
@@ -1,9 +1,18 @@
|
|
1
|
-
%w{rubygems nokogiri open-uri active_support}.each { |f| require f }
|
1
|
+
%w{rubygems nokogiri open-uri active_support ostruct}.each { |f| require f }
|
2
2
|
|
3
3
|
dir = Pathname(__FILE__).dirname.expand_path
|
4
4
|
|
5
5
|
require dir + 'graboid/entity'
|
6
|
-
|
6
|
+
require dir + 'graboid/scraper'
|
7
7
|
|
8
8
|
# Top-level namespace for the gem. It extends itself, so the methods below
# are callable directly on the module (e.g. Graboid.user_agent).
module Graboid
  extend self

  # Assigns the User-Agent string handed to open-uri when fetching pages.
  attr_writer :user_agent

  # Returns the configured User-Agent string, falling back to 'Graboid'
  # when none has been assigned yet.
  def user_agent
    @user_agent ||= 'Graboid'
  end
end
|
data/lib/graboid/entity.rb
CHANGED
@@ -5,7 +5,7 @@ module Graboid
|
|
5
5
|
klass.class_eval do
|
6
6
|
extend ClassMethods
|
7
7
|
include InstanceMethods
|
8
|
-
|
8
|
+
warn "Deprecation Warning! Graboid::Entity - This module has been deprecated. See Graboid::Scraper."
|
9
9
|
write_inheritable_attribute(:attribute_map, {}) if attribute_map.nil?
|
10
10
|
end
|
11
11
|
end
|
@@ -67,7 +67,7 @@ module Graboid
|
|
67
67
|
attribute_map.inject({}) do |extracted_hash, at|
|
68
68
|
selector, processor = at.last[:selector], at.last[:processor]
|
69
69
|
node_collection = self.mode == :html ? fragment.css(selector) : fragment.xpath(selector)
|
70
|
-
extracted_hash[at.first] = processor.nil? ? node_collection.first.inner_html : processor.call(node_collection.first) rescue ""
|
70
|
+
extracted_hash[at.first] = processor.nil? ? node_collection.first.inner_html : processor.call(node_collection.first) #rescue ""
|
71
71
|
|
72
72
|
extracted_hash
|
73
73
|
end
|
@@ -119,7 +119,7 @@ module Graboid
|
|
119
119
|
def read_source
|
120
120
|
case self.source
|
121
121
|
when /^http[s]?:\/\//
|
122
|
-
open
|
122
|
+
open(self.source, "User-Agent" => Graboid.user_agent)
|
123
123
|
when String
|
124
124
|
self.source
|
125
125
|
end
|
@@ -0,0 +1,204 @@
|
|
1
|
+
module Graboid
  # Mixin that gives a class a small declarative scraping DSL.
  #
  # Including it extends the class with ClassMethods (selector/set/page_with
  # and the before_/after_ callback registrars) and mixes in InstanceMethods
  # (source handling, pagination and extraction). Relies on ActiveSupport's
  # inheritable attributes, `present?` and `underscore`, and on Nokogiri.
  module Scraper
    # Hook run on `include`: wires the DSL into +klass+ and makes sure the
    # inheritable :attribute_map and :callbacks hashes exist.
    def self.included(klass)
      klass.class_eval do
        extend ClassMethods
        include InstanceMethods

        write_inheritable_attribute(:attribute_map, {}) if attribute_map.nil?
        write_inheritable_attribute(:callbacks, {}) if callbacks.nil?
      end
    end

    # Class-level DSL.
    module ClassMethods
      # Hash of attribute name => options (:selector and optional :processor).
      def attribute_map
        read_inheritable_attribute :attribute_map
      end

      # Hash of callback name (e.g. :before_paginate) => block.
      def callbacks
        read_inheritable_attribute :callbacks
      end

      # CSS class selector derived from the class name,
      # e.g. MockScraper => ".mock_scraper".
      def inferred_selector
        @inferred_selector ||= ".#{self.to_s.underscore}"
      end

      # Registers the block used to find the next page. It is called with the
      # parsed document and should return the next URL, or nil when done.
      def page_with(&block)
        @pager = block
      end

      # The block registered with page_with, or nil.
      def pager
        @pager
      end

      # Selector for the repeating fragment; falls back to inferred_selector.
      def root_selector
        @root_selector || inferred_selector
      end

      # Sets the selector for the repeating fragment. Also available as `root`.
      def selector(selector)
        @root_selector = selector
      end

      alias_method :root, :selector

      # Declares an attribute to extract from each fragment.
      #
      # name  - attribute name; also the default selector (".#{name}").
      # opts  - optional Hash; :selector overrides the default selector.
      # block - optional processor, called with the first matching node.
      #
      # Returns the options Hash stored in attribute_map. The Hash passed in
      # by the caller is copied, never modified.
      def set(name, opts = {}, &block)
        settings = opts.dup
        settings[:selector] = ".#{name}" unless settings[:selector].present?
        settings[:processor] = block if block_given?

        attribute_map[name] = settings
      end

      # Defines before_paginate, after_paginate, before_extract and
      # after_extract; each stores its block in callbacks under its own name.
      [:before, :after].each do |prefix|
        [:paginate, :extract].each do |suffix|
          hook = :"#{prefix}_#{suffix}"
          define_method hook do |&block|
            self.callbacks[hook] = block
          end
        end
      end
    end

    # Per-scraper state and the extraction pipeline.
    module InstanceMethods
      # Where to scrape from: an http(s) URL or a String of markup.
      attr_accessor :source
      attr_writer :collection, :current_page, :max_pages

      # opts - Hash with a required :source (URL or markup String).
      #
      # Raises ArgumentError when :source is missing or blank.
      def initialize(opts = {}, &block)
        raise ArgumentError, ":source is required" unless opts[:source].present?
        self.source = opts[:source]
      end

      # Scrapes the source and returns one OpenStruct per fragment.
      # Also available as `scrape`.
      #
      # opts   - Hash; :max_pages caps pagination (0 / absent means no cap).
      # reload - see the review note below.
      #
      # NOTE(review): when +reload+ is true and a collection already exists,
      # the stored collection (raw fragments, not OpenStructs) is returned
      # as-is. That looks inverted for a flag named "reload"; behaviour is
      # kept unchanged here pending confirmation of the intent.
      def all(opts = {}, reload = false)
        return collection if reload && !collection.empty?

        reset_context
        self.max_pages = opts[:max_pages] if opts[:max_pages].present?
        all_fragments.collect { |frag| extract_instance(frag) }
      end

      alias_method :scrape, :all

      # Returns the fragments of the current page when no pager is
      # registered; otherwise walks the pages, accumulating fragments in
      # collection and running the paginate callbacks around each step.
      #
      # NOTE(review): in unlimited mode the loop ends as soon as the pager
      # returns nil for the current page, which appears to happen before that
      # page's fragments are collected - confirm whether the final page is
      # meant to be included.
      def all_fragments
        return page_fragments if self.class.pager.nil?

        old_source = source
        begin
          while next_page?
            self.collection += page_fragments
            run_before_paginate_callbacks
            paginate
            run_after_paginate_callbacks
          end
        ensure
          # paginate rewrites source; always hand the original back, even
          # when a page fails part-way through.
          self.source = old_source
        end

        collection
      end

      # Class-level attribute map, exposed on the instance.
      def attribute_map
        self.class.attribute_map
      end

      # Class-level callbacks, exposed on the instance.
      def callbacks
        self.class.callbacks
      end

      # Fragments accumulated so far (defaults to an empty Array).
      def collection
        @collection ||= []
      end

      # Number of pages already paginated past (defaults to 0).
      def current_page
        @current_page ||= 0
      end

      # Parses the current source with Nokogiri according to mode.
      # Every call reads and parses the source again; nothing is cached.
      def doc
        mode == :html ? Nokogiri::HTML(read_source) : Nokogiri::XML(read_source)
      end

      # Wraps the values extracted from +fragment+ in an OpenStruct.
      def extract_instance(fragment)
        OpenStruct.new(hash_map(fragment))
      end

      # Builds a Hash of attribute name => extracted value for +fragment+.
      # Without a processor the value is the first matching node's
      # inner_html; with one it is whatever the processor returns. Errors are
      # not rescued here, so a selector that matches nothing will presumably
      # raise on the default path (first node is nil).
      def hash_map(fragment)
        attribute_map.inject({}) do |extracted_hash, (name, settings)|
          selector, processor = settings[:selector], settings[:processor]
          nodes = mode == :html ? fragment.css(selector) : fragment.xpath(selector)
          first_node = nodes.first

          extracted_hash[name] = processor.nil? ? first_node.inner_html : processor.call(first_node)
          extracted_hash
        end
      end

      # Page cap for pagination; 0 means "no cap" (defaults to 0).
      def max_pages
        @max_pages ||= 0
      end

      # Parsing mode, :html (default) or :xml.
      def mode
        @mode ||= :html
      end

      # Raises ArgumentError unless +m+ is :html or :xml.
      def mode=(m)
        raise ArgumentError, "mode must be :html or :xml" unless [:html, :xml].include?(m)
        @mode = m
      end

      # Whether the pagination loop should process another page.
      def next_page?
        # paginate stores nil once the pager runs out of links; there is
        # nothing left to fetch, so stop instead of parsing a nil source.
        return false if source.nil?

        if max_pages.zero?
          !self.class.pager.call(doc).nil?
        else
          current_page <= max_pages - 1
        end
      end

      # Fragments on the current page that match the class's root selector.
      def page_fragments
        doc.css(self.class.root_selector)
      end

      # Points source at the URL returned by the pager (nil when the pager
      # returns nil or raises) and bumps current_page.
      def paginate
        next_page_url = begin
          self.class.pager.call(doc)
        rescue
          nil
        end

        self.source = next_page_url
        self.current_page += 1
      end

      # Returns something Nokogiri can parse: an open-uri handle for http(s)
      # URLs, or the source itself when it is any other String. Returns nil
      # for anything else.
      def read_source
        case source
        # \A rather than ^: the source may be multi-line markup, and ^ would
        # match an "http://" at the start of any line inside it.
        when /\Ahttps?:\/\//
          open(source, "User-Agent" => Graboid.user_agent)
        when String
          source
        end
      end

      # Clears the collection and resets the page counters to 0.
      def reset_context
        self.collection = []
        self.current_page = 0
        self.max_pages = 0
      end

      # Defines run_before_paginate_callbacks, run_after_paginate_callbacks,
      # run_before_extract_callbacks and run_after_extract_callbacks. Each
      # instance_evals the matching registered block, if there is one.
      [:before, :after].each do |prefix|
        [:paginate, :extract].each do |suffix|
          hook = :"#{prefix}_#{suffix}"
          define_method "run_#{hook}_callbacks" do
            handler = callbacks[hook]
            instance_eval(&handler) if handler
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,195 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
|
2
|
+
|
3
|
+
# Spec fixture: a scraper with no explicit root selector, so the root is
# inferred from the class name. :date reuses the '.author' node and pulls
# out the text between parentheses.
class MockScraper
  include Graboid::Scraper

  [:title, :body, :author].each { |field| set field }

  set(:date, :selector => '.author') do |node|
    node.text.match(/\((.*)\)/)[1]
  end
end
|
13
|
+
|
14
|
+
# Spec fixture: a single-page scraper rooted at '.post'. :date reuses the
# '.author' node and pulls out the text between parentheses.
class WorkingScraper
  include Graboid::Scraper

  selector '.post'

  [:title, :body, :author].each { |field| set field }

  set(:date, :selector => '.author') do |node|
    node.text.match(/\((.*)\)/)[1]
  end
end
|
26
|
+
|
27
|
+
# Spec fixture: a paginating scraper rooted at '.post'. The pager follows the
# first 'a.next' link on the local test server and yields nil when that
# lookup fails; each pagination step prints the page being left.
class ScraperWithPager
  include Graboid::Scraper

  selector '.post'

  [:title, :body, :author].each { |field| set field }

  set(:date, :selector => '.author') do |node|
    node.text.match(/\((.*)\)/)[1]
  end

  page_with do |doc|
    begin
      'http://localhost:9393' + doc.css('a.next').first['href']
    rescue
      nil
    end
  end

  before_paginate do
    puts "page: #{source}"
  end
end
|
48
|
+
|
49
|
+
|
50
|
+
|
51
|
+
# Specs for the Graboid::Scraper mixin: selector inference, the `set` DSL,
# construction, fragment extraction and pagination. The pagination examples
# talk to the fixture server on localhost:9393.
describe Graboid::Scraper do
  describe "#root_selector" do
    it "should be set" do
      MockScraper.root_selector.should == '.mock_scraper'
    end

    describe "when inferred from class" do
      before do
        class Phony
          include Graboid::Scraper
        end
      end

      it "should infer .phony" do
        Phony.root_selector.should == '.phony'
      end
    end
  end

  describe "#set" do
    describe "simple syntax" do
      before { MockScraper.set(:body) }

      it "should be set in the attr map" do
        MockScraper.attribute_map[:body].should be_a(Hash)
      end

      it "should set the selector" do
        MockScraper.attribute_map[:body][:selector].should == '.body'
      end
    end

    describe "custom selector syntax" do
      before { MockScraper.set(:body, :selector => '.custom') }

      it "should set the selector" do
        MockScraper.attribute_map[:body][:selector].should == '.custom'
      end
    end

    describe "custom selector syntax with a lambda" do
      before do
        MockScraper.set(:body, :selector => '.custom') { |item| "from lambda" }
      end

      it "should set the selector" do
        MockScraper.attribute_map[:body][:selector].should == '.custom'
      end

      it "should set the processor" do
        MockScraper.attribute_map[:body][:processor].should be_a(Proc)
      end
    end
  end

  describe "#new" do
    describe "when supplied a source" do
      before { @scraper = WorkingScraper.new(:source => TEST_SERVER_URL) }

      it "should have the correct attribute_map" do
        @scraper.attribute_map[:body][:selector].should == '.body'
      end

      it "should set the instance source" do
        @scraper.source.should == TEST_SERVER_URL
      end

      it "should set the doc source" do
        @scraper.doc.should be_a(Nokogiri::HTML::Document)
      end
    end

    describe "#all_fragments" do
      before do
        @scraper = WorkingScraper.new(:source => POSTS_HTML_STR)
        @fragments = @scraper.all_fragments
      end

      it "should return the NodeSet" do
        @fragments.should be_a(Nokogiri::XML::NodeSet)
      end

      it "should have 2 results" do
        @fragments.count.should == 2
      end
    end

    describe "#all" do
      before { @scraper = WorkingScraper.new(:source => POSTS_HTML_STR) }

      it "should return 2 WorkingPosts" do
        @scraper.all(:max_pages => 3).length.should == 2
      end

      # Both counters share the same default and the same plain setter.
      [:current_page, :max_pages].each do |counter|
        describe "##{counter}" do
          it "should be 0 by default" do
            @scraper.send(counter).should == 0
          end

          it "should be 3" do
            @scraper.send("#{counter}=", 3)
            @scraper.send(counter).should == 3
          end
        end
      end
    end

    describe "#page_with" do
      describe "with a limit" do
        before do
          @scraper = ScraperWithPager.new(:source => 'http://localhost:9393/posts')
          @posts = @scraper.all(:max_pages => 3)
        end

        it "should get 6 posts" do
          @posts.length.should == 6
        end
      end

      describe "without a limit" do
        before do
          @scraper = ScraperWithPager.new(:source => 'http://localhost:9393/posts')
          @posts = @scraper.all
        end

        it "should get 16 posts" do
          @posts.length.should == 16
        end
      end
    end
  end
end
|
data/spec/spec_helper.rb
CHANGED
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: graboid
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 27
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 3
|
9
|
-
- 3
|
10
|
-
version: 0.3.3
|
9
|
+
- 4
|
10
|
+
version: 0.3.4
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Christopher Burnett
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2010-06-
|
18
|
+
date: 2010-06-16 00:00:00 -07:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -84,11 +84,13 @@ files:
|
|
84
84
|
- graboid.gemspec
|
85
85
|
- lib/graboid.rb
|
86
86
|
- lib/graboid/entity.rb
|
87
|
+
- lib/graboid/scraper.rb
|
87
88
|
- spec/fixtures/graboid.jpg
|
88
89
|
- spec/fixtures/posts.html
|
89
90
|
- spec/fixtures/server.rb
|
90
91
|
- spec/fixtures/views/posts.erb
|
91
92
|
- spec/graboid/entity_spec.rb
|
93
|
+
- spec/graboid/scraper_spec.rb
|
92
94
|
- spec/graboid_spec.rb
|
93
95
|
- spec/spec.opts
|
94
96
|
- spec/spec_helper.rb
|
@@ -129,6 +131,7 @@ summary: web scraping made easy
|
|
129
131
|
test_files:
|
130
132
|
- spec/fixtures/server.rb
|
131
133
|
- spec/graboid/entity_spec.rb
|
134
|
+
- spec/graboid/scraper_spec.rb
|
132
135
|
- spec/graboid_spec.rb
|
133
136
|
- spec/spec_helper.rb
|
134
137
|
- examples/active_rain_post.rb
|