graboid 0.3.3 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/VERSION +1 -1
- data/examples/active_rain_post.rb +17 -7
- data/examples/live_journal_post.rb +11 -6
- data/examples/ning_post.rb +7 -6
- data/graboid.gemspec +5 -2
- data/lib/graboid.rb +11 -2
- data/lib/graboid/entity.rb +3 -3
- data/lib/graboid/scraper.rb +204 -0
- data/spec/graboid/scraper_spec.rb +195 -0
- data/spec/spec_helper.rb +2 -0
- metadata +7 -4
data/VERSION
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
0.3.3
|
|
1
|
+
0.3.4
|
|
@@ -4,11 +4,21 @@ require File.join(dir, 'graboid')
|
|
|
4
4
|
class ActiveRainPost
|
|
5
5
|
include Graboid::Entity
|
|
6
6
|
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
7
|
+
selector '.blog_entry_wrapper'
|
|
8
|
+
|
|
9
|
+
set :title, :selector => 'h2 a'
|
|
10
|
+
set :pub_date, :selector => '.blog_entry' do |elm|
|
|
11
|
+
# awesome, the pub date is not contained within
|
|
12
|
+
# the .blog_entry_wrapper fragment.
|
|
13
|
+
begin
|
|
14
|
+
entry_id = elm['id'].gsub('blog_entry_','')
|
|
15
|
+
date_text = self.doc.css("#divbei#{entry_id} td").select{|td| td.text =~ /posted by/i }.first.text
|
|
16
|
+
date_text.match(/(\d{2}\/\d{2}\/\d{4})/).captures.first
|
|
17
|
+
rescue
|
|
18
|
+
""
|
|
19
|
+
end
|
|
20
|
+
end
|
|
21
|
+
set :body, :selector => 'div' do |elm|
|
|
12
22
|
elm.css('p').collect(&:to_html)
|
|
13
23
|
end
|
|
14
24
|
|
|
@@ -26,10 +36,10 @@ class ActiveRainPost
|
|
|
26
36
|
end
|
|
27
37
|
|
|
28
38
|
ActiveRainPost.source = 'http://activerain.com/blogs/elizabethweintraub'
|
|
29
|
-
@posts = ActiveRainPost.all
|
|
39
|
+
@posts = ActiveRainPost.all(:max_pages => 1)
|
|
30
40
|
|
|
31
41
|
@posts.each do |post|
|
|
32
|
-
puts "#{post.
|
|
42
|
+
puts "#{post.pub_date}"
|
|
33
43
|
puts "*"*100
|
|
34
44
|
end
|
|
35
45
|
|
|
@@ -6,9 +6,14 @@ class LiveJournalPost
|
|
|
6
6
|
|
|
7
7
|
root '.entrybox'
|
|
8
8
|
|
|
9
|
-
field :title,
|
|
10
|
-
field :body,
|
|
11
|
-
|
|
9
|
+
field :title, :selector => '.caption a'
|
|
10
|
+
field :body, :selector => 'td[@colspan="2"]'
|
|
11
|
+
|
|
12
|
+
field :pub_date, :selector => 'td.index' do |elm|
|
|
13
|
+
elm.text.match(/\[(.*)\|/)[1]
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
field :comment_link, :selector => '.caption a' do |elm|
|
|
12
17
|
elm['href']
|
|
13
18
|
end
|
|
14
19
|
|
|
@@ -25,11 +30,11 @@ class LiveJournalPost
|
|
|
25
30
|
|
|
26
31
|
end
|
|
27
32
|
|
|
28
|
-
LiveJournalPost.source
|
|
29
|
-
@posts
|
|
33
|
+
LiveJournalPost.source = 'http://zeroplate.livejournal.com/'
|
|
34
|
+
@posts = LiveJournalPost.all(:max_pages => 3)
|
|
30
35
|
|
|
31
36
|
@posts.each do |post|
|
|
32
|
-
puts "#{post.title}"
|
|
37
|
+
puts "#{post.pub_date} - #{post.title}"
|
|
33
38
|
puts "#{post.comment_link}"
|
|
34
39
|
puts "#{post.body}"
|
|
35
40
|
puts "*"*100
|
data/examples/ning_post.rb
CHANGED
|
@@ -1,7 +1,8 @@
|
|
|
1
|
-
|
|
1
|
+
dir = File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib'))
|
|
2
|
+
require File.join(dir, 'graboid')
|
|
2
3
|
|
|
3
4
|
class NingPost
|
|
4
|
-
include Graboid::
|
|
5
|
+
include Graboid::Scraper
|
|
5
6
|
|
|
6
7
|
selector 'div.xg_blog .xg_module_body'
|
|
7
8
|
|
|
@@ -25,10 +26,10 @@ class NingPost
|
|
|
25
26
|
# ning's list page only has an excerpt of the body. No biggie,
|
|
26
27
|
# we'll just go grab it.
|
|
27
28
|
show_url = elm.css('a').last["href"]
|
|
28
|
-
Nokogiri::HTML(open(show_url)).css('.postbody').to_html
|
|
29
|
+
Nokogiri::HTML(open(show_url,"User-Agent" => Graboid.user_agent)).css('.postbody').to_html
|
|
29
30
|
end
|
|
30
31
|
|
|
31
|
-
|
|
32
|
+
page_with do |doc|
|
|
32
33
|
doc.css('.pagination a').select{|a| a.text =~ /previous/i }.first['href'] rescue nil
|
|
33
34
|
end
|
|
34
35
|
|
|
@@ -45,8 +46,8 @@ class NingPost
|
|
|
45
46
|
|
|
46
47
|
end
|
|
47
48
|
|
|
48
|
-
|
|
49
|
-
@posts
|
|
49
|
+
NING_URL = 'http://www.friendsorenemies.com/profiles/blog/list?user=3vx1daeuxrt14'
|
|
50
|
+
@posts = NingPost.new( :source => NING_URL ).all(:max_pages => 2)
|
|
50
51
|
|
|
51
52
|
@posts.each do |post|
|
|
52
53
|
puts "#{post.pub_date} -- #{post.title}"
|
data/graboid.gemspec
CHANGED
|
@@ -5,11 +5,11 @@
|
|
|
5
5
|
|
|
6
6
|
Gem::Specification.new do |s|
|
|
7
7
|
s.name = %q{graboid}
|
|
8
|
-
s.version = "0.3.3"
|
|
8
|
+
s.version = "0.3.4"
|
|
9
9
|
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
|
11
11
|
s.authors = ["Christopher Burnett"]
|
|
12
|
-
s.date = %q{2010-06-
|
|
12
|
+
s.date = %q{2010-06-16}
|
|
13
13
|
s.description = %q{web scraping made easier}
|
|
14
14
|
s.email = %q{signalstatic@gmail.com}
|
|
15
15
|
s.extra_rdoc_files = [
|
|
@@ -29,11 +29,13 @@ Gem::Specification.new do |s|
|
|
|
29
29
|
"graboid.gemspec",
|
|
30
30
|
"lib/graboid.rb",
|
|
31
31
|
"lib/graboid/entity.rb",
|
|
32
|
+
"lib/graboid/scraper.rb",
|
|
32
33
|
"spec/fixtures/graboid.jpg",
|
|
33
34
|
"spec/fixtures/posts.html",
|
|
34
35
|
"spec/fixtures/server.rb",
|
|
35
36
|
"spec/fixtures/views/posts.erb",
|
|
36
37
|
"spec/graboid/entity_spec.rb",
|
|
38
|
+
"spec/graboid/scraper_spec.rb",
|
|
37
39
|
"spec/graboid_spec.rb",
|
|
38
40
|
"spec/spec.opts",
|
|
39
41
|
"spec/spec_helper.rb"
|
|
@@ -46,6 +48,7 @@ Gem::Specification.new do |s|
|
|
|
46
48
|
s.test_files = [
|
|
47
49
|
"spec/fixtures/server.rb",
|
|
48
50
|
"spec/graboid/entity_spec.rb",
|
|
51
|
+
"spec/graboid/scraper_spec.rb",
|
|
49
52
|
"spec/graboid_spec.rb",
|
|
50
53
|
"spec/spec_helper.rb",
|
|
51
54
|
"examples/active_rain_post.rb",
|
data/lib/graboid.rb
CHANGED
|
@@ -1,9 +1,18 @@
|
|
|
1
|
-
%w{rubygems nokogiri open-uri active_support}.each { |f| require f }
|
|
1
|
+
%w{rubygems nokogiri open-uri active_support ostruct}.each { |f| require f }
|
|
2
2
|
|
|
3
3
|
dir = Pathname(__FILE__).dirname.expand_path
|
|
4
4
|
|
|
5
5
|
require dir + 'graboid/entity'
|
|
6
|
-
|
|
6
|
+
require dir + 'graboid/scraper'
|
|
7
7
|
|
|
8
8
|
module Graboid
|
|
9
|
+
extend self
|
|
10
|
+
|
|
11
|
+
def user_agent
|
|
12
|
+
@user_agent ||= 'Graboid'
|
|
13
|
+
end
|
|
14
|
+
|
|
15
|
+
def user_agent=(agent)
|
|
16
|
+
@user_agent = agent
|
|
17
|
+
end
|
|
9
18
|
end
|
data/lib/graboid/entity.rb
CHANGED
|
@@ -5,7 +5,7 @@ module Graboid
|
|
|
5
5
|
klass.class_eval do
|
|
6
6
|
extend ClassMethods
|
|
7
7
|
include InstanceMethods
|
|
8
|
-
|
|
8
|
+
warn "Deprecation Warning! Graboid::Entity - This module has been deprecated. See Graboid::Scraper."
|
|
9
9
|
write_inheritable_attribute(:attribute_map, {}) if attribute_map.nil?
|
|
10
10
|
end
|
|
11
11
|
end
|
|
@@ -67,7 +67,7 @@ module Graboid
|
|
|
67
67
|
attribute_map.inject({}) do |extracted_hash, at|
|
|
68
68
|
selector, processor = at.last[:selector], at.last[:processor]
|
|
69
69
|
node_collection = self.mode == :html ? fragment.css(selector) : fragment.xpath(selector)
|
|
70
|
-
extracted_hash[at.first] = processor.nil? ? node_collection.first.inner_html : processor.call(node_collection.first) rescue ""
|
|
70
|
+
extracted_hash[at.first] = processor.nil? ? node_collection.first.inner_html : processor.call(node_collection.first) #rescue ""
|
|
71
71
|
|
|
72
72
|
extracted_hash
|
|
73
73
|
end
|
|
@@ -119,7 +119,7 @@ module Graboid
|
|
|
119
119
|
def read_source
|
|
120
120
|
case self.source
|
|
121
121
|
when /^http[s]?:\/\//
|
|
122
|
-
open
|
|
122
|
+
open(self.source, "User-Agent" => Graboid.user_agent)
|
|
123
123
|
when String
|
|
124
124
|
self.source
|
|
125
125
|
end
|
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
module Graboid
|
|
2
|
+
module Scraper
|
|
3
|
+
def self.included klass
|
|
4
|
+
klass.class_eval do
|
|
5
|
+
extend ClassMethods
|
|
6
|
+
include InstanceMethods
|
|
7
|
+
|
|
8
|
+
write_inheritable_attribute(:attribute_map, {}) if attribute_map.nil?
|
|
9
|
+
write_inheritable_attribute(:callbacks, {}) if callbacks.nil?
|
|
10
|
+
end
|
|
11
|
+
end
|
|
12
|
+
|
|
13
|
+
module ClassMethods
|
|
14
|
+
|
|
15
|
+
def attribute_map
|
|
16
|
+
read_inheritable_attribute :attribute_map
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
def callbacks
|
|
20
|
+
read_inheritable_attribute :callbacks
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
def inferred_selector
|
|
24
|
+
@inferred_selector ||= ".#{self.to_s.underscore}"
|
|
25
|
+
end
|
|
26
|
+
|
|
27
|
+
def page_with &block
|
|
28
|
+
@pager = block
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
def pager
|
|
32
|
+
@pager
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
def root_selector
|
|
36
|
+
@root_selector || inferred_selector
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
def selector selector
|
|
40
|
+
@root_selector = selector
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
alias_method :root, :selector
|
|
44
|
+
|
|
45
|
+
def set name, opts={}, &block
|
|
46
|
+
opts.merge!(:selector => ".#{name}") unless opts[:selector].present?
|
|
47
|
+
opts.merge!(:processor => block) if block_given?
|
|
48
|
+
|
|
49
|
+
attribute_map[name] = opts
|
|
50
|
+
end
|
|
51
|
+
|
|
52
|
+
[:before, :after].each do |prefix|
|
|
53
|
+
[:paginate, :extract].each do |suffix|
|
|
54
|
+
method_name = "#{prefix}_#{suffix}"
|
|
55
|
+
define_method method_name.to_sym do |&block|
|
|
56
|
+
self.callbacks["#{method_name}".to_sym] = block
|
|
57
|
+
end
|
|
58
|
+
end
|
|
59
|
+
end
|
|
60
|
+
|
|
61
|
+
end
|
|
62
|
+
|
|
63
|
+
module InstanceMethods
|
|
64
|
+
def initialize opts={}, &block
|
|
65
|
+
raise ArgumentError unless opts[:source].present?
|
|
66
|
+
self.source = opts[:source]
|
|
67
|
+
end
|
|
68
|
+
|
|
69
|
+
def all opts={}, reload=false
|
|
70
|
+
return self.collection if reload and !self.collection.empty?
|
|
71
|
+
reset_context
|
|
72
|
+
self.max_pages = opts[:max_pages] if opts[:max_pages].present?
|
|
73
|
+
all_fragments.collect{ |frag| extract_instance(frag) }
|
|
74
|
+
end
|
|
75
|
+
|
|
76
|
+
alias_method :scrape, :all
|
|
77
|
+
|
|
78
|
+
def all_fragments
|
|
79
|
+
return page_fragments if self.class.pager.nil?
|
|
80
|
+
old_source = self.source
|
|
81
|
+
|
|
82
|
+
while next_page?
|
|
83
|
+
self.collection += page_fragments
|
|
84
|
+
run_before_paginate_callbacks
|
|
85
|
+
paginate
|
|
86
|
+
run_after_paginate_callbacks
|
|
87
|
+
end
|
|
88
|
+
|
|
89
|
+
self.source = old_source
|
|
90
|
+
self.collection
|
|
91
|
+
end
|
|
92
|
+
|
|
93
|
+
def attribute_map
|
|
94
|
+
self.class.attribute_map
|
|
95
|
+
end
|
|
96
|
+
|
|
97
|
+
def callbacks
|
|
98
|
+
self.class.callbacks
|
|
99
|
+
end
|
|
100
|
+
|
|
101
|
+
def collection
|
|
102
|
+
@collection ||= []
|
|
103
|
+
end
|
|
104
|
+
|
|
105
|
+
def collection=(col)
|
|
106
|
+
@collection = col
|
|
107
|
+
end
|
|
108
|
+
|
|
109
|
+
def current_page
|
|
110
|
+
@current_page ||= 0
|
|
111
|
+
end
|
|
112
|
+
|
|
113
|
+
def current_page=num
|
|
114
|
+
@current_page = num
|
|
115
|
+
end
|
|
116
|
+
|
|
117
|
+
def doc
|
|
118
|
+
eval "Nokogiri::#{self.mode.to_s.upcase}(read_source)"
|
|
119
|
+
end
|
|
120
|
+
|
|
121
|
+
def extract_instance fragment
|
|
122
|
+
OpenStruct.new(hash_map fragment)
|
|
123
|
+
end
|
|
124
|
+
|
|
125
|
+
def hash_map fragment
|
|
126
|
+
attribute_map.inject({}) do |extracted_hash, at|
|
|
127
|
+
selector, processor = at.last[:selector], at.last[:processor]
|
|
128
|
+
node_collection = self.mode == :html ? fragment.css(selector) : fragment.xpath(selector)
|
|
129
|
+
extracted_hash[at.first] = processor.nil? ? node_collection.first.inner_html : processor.call(node_collection.first) #rescue ""
|
|
130
|
+
|
|
131
|
+
extracted_hash
|
|
132
|
+
end
|
|
133
|
+
end
|
|
134
|
+
|
|
135
|
+
def max_pages
|
|
136
|
+
@max_pages ||= 0
|
|
137
|
+
end
|
|
138
|
+
|
|
139
|
+
def max_pages=num
|
|
140
|
+
@max_pages = num
|
|
141
|
+
end
|
|
142
|
+
|
|
143
|
+
def mode
|
|
144
|
+
@mode ||= :html
|
|
145
|
+
end
|
|
146
|
+
|
|
147
|
+
def mode=(m)
|
|
148
|
+
raise ArgumentError unless [:html, :xml].include?(m)
|
|
149
|
+
@mode = m
|
|
150
|
+
end
|
|
151
|
+
|
|
152
|
+
def next_page?
|
|
153
|
+
if max_pages.zero?
|
|
154
|
+
return true unless self.class.pager.call(doc).nil?
|
|
155
|
+
else
|
|
156
|
+
current_page <= max_pages-1
|
|
157
|
+
end
|
|
158
|
+
end
|
|
159
|
+
|
|
160
|
+
def page_fragments
|
|
161
|
+
doc.css(self.class.root_selector)
|
|
162
|
+
end
|
|
163
|
+
|
|
164
|
+
def paginate
|
|
165
|
+
next_page_url = self.class.pager.call(doc) rescue nil
|
|
166
|
+
self.source = next_page_url
|
|
167
|
+
self.current_page += 1
|
|
168
|
+
end
|
|
169
|
+
|
|
170
|
+
def read_source
|
|
171
|
+
case self.source
|
|
172
|
+
when /^http[s]?:\/\//
|
|
173
|
+
open(self.source ,"User-Agent" => Graboid.user_agent)
|
|
174
|
+
when String
|
|
175
|
+
self.source
|
|
176
|
+
end
|
|
177
|
+
end
|
|
178
|
+
|
|
179
|
+
def reset_context
|
|
180
|
+
self.collection = []
|
|
181
|
+
self.current_page = 0
|
|
182
|
+
self.max_pages = 0
|
|
183
|
+
end
|
|
184
|
+
|
|
185
|
+
def source
|
|
186
|
+
@source
|
|
187
|
+
end
|
|
188
|
+
|
|
189
|
+
def source=(src)
|
|
190
|
+
@source = src
|
|
191
|
+
end
|
|
192
|
+
|
|
193
|
+
[:before, :after].each do |prefix|
|
|
194
|
+
[:paginate, :extract].each do |suffix|
|
|
195
|
+
method_name = "#{prefix}_#{suffix}"
|
|
196
|
+
define_method "run_#{method_name}_callbacks" do
|
|
197
|
+
self.instance_eval &callbacks[method_name.to_sym] if callbacks[method_name.to_sym].present?
|
|
198
|
+
end
|
|
199
|
+
end
|
|
200
|
+
end
|
|
201
|
+
|
|
202
|
+
end
|
|
203
|
+
end
|
|
204
|
+
end
|
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
|
|
2
|
+
|
|
3
|
+
class MockScraper
|
|
4
|
+
include Graboid::Scraper
|
|
5
|
+
|
|
6
|
+
set :title
|
|
7
|
+
set :body
|
|
8
|
+
set :author
|
|
9
|
+
set :date, :selector => '.author' do |elm|
|
|
10
|
+
elm.text.match(/\((.*)\)/)[1]
|
|
11
|
+
end
|
|
12
|
+
end
|
|
13
|
+
|
|
14
|
+
class WorkingScraper
|
|
15
|
+
include Graboid::Scraper
|
|
16
|
+
|
|
17
|
+
selector '.post'
|
|
18
|
+
|
|
19
|
+
set :title
|
|
20
|
+
set :body
|
|
21
|
+
set :author
|
|
22
|
+
set :date, :selector => '.author' do |elm|
|
|
23
|
+
elm.text.match(/\((.*)\)/)[1]
|
|
24
|
+
end
|
|
25
|
+
end
|
|
26
|
+
|
|
27
|
+
class ScraperWithPager
|
|
28
|
+
include Graboid::Scraper
|
|
29
|
+
|
|
30
|
+
selector '.post'
|
|
31
|
+
|
|
32
|
+
set :title
|
|
33
|
+
set :body
|
|
34
|
+
set :author
|
|
35
|
+
set :date, :selector => '.author' do |elm|
|
|
36
|
+
elm.text.match(/\((.*)\)/)[1]
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
page_with do |doc|
|
|
40
|
+
'http://localhost:9393'+doc.css('a.next').first['href'] rescue nil
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
before_paginate do
|
|
44
|
+
puts "page: #{source}"
|
|
45
|
+
end
|
|
46
|
+
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
describe Graboid::Scraper do
|
|
52
|
+
describe "#root_selector" do
|
|
53
|
+
it "should be set" do
|
|
54
|
+
MockScraper.root_selector.should == '.mock_scraper'
|
|
55
|
+
end
|
|
56
|
+
|
|
57
|
+
describe "when inferred from class" do
|
|
58
|
+
|
|
59
|
+
before(:each) do
|
|
60
|
+
class Phony; include Graboid::Scraper; end
|
|
61
|
+
end
|
|
62
|
+
|
|
63
|
+
it "should infer .phony" do
|
|
64
|
+
Phony.root_selector.should == '.phony'
|
|
65
|
+
end
|
|
66
|
+
end
|
|
67
|
+
end
|
|
68
|
+
|
|
69
|
+
describe "#set" do
|
|
70
|
+
describe "simple syntax" do
|
|
71
|
+
|
|
72
|
+
before(:each) do
|
|
73
|
+
MockScraper.set :body
|
|
74
|
+
end
|
|
75
|
+
|
|
76
|
+
it "should be set in the attr map" do
|
|
77
|
+
MockScraper.attribute_map[:body].should be_a Hash
|
|
78
|
+
end
|
|
79
|
+
|
|
80
|
+
it "should set the selector" do
|
|
81
|
+
MockScraper.attribute_map[:body][:selector].should == '.body'
|
|
82
|
+
end
|
|
83
|
+
end
|
|
84
|
+
|
|
85
|
+
describe "custom selector syntax" do
|
|
86
|
+
before(:each) do
|
|
87
|
+
MockScraper.set :body, :selector => '.custom'
|
|
88
|
+
end
|
|
89
|
+
|
|
90
|
+
it "should set the selector" do
|
|
91
|
+
MockScraper.attribute_map[:body][:selector].should == '.custom'
|
|
92
|
+
end
|
|
93
|
+
end
|
|
94
|
+
|
|
95
|
+
describe "custom selector syntax with a lambda" do
|
|
96
|
+
|
|
97
|
+
before(:each) do
|
|
98
|
+
MockScraper.set :body, :selector => '.custom' do |item|
|
|
99
|
+
"from lambda"
|
|
100
|
+
end
|
|
101
|
+
end
|
|
102
|
+
|
|
103
|
+
it "should set the selector" do
|
|
104
|
+
MockScraper.attribute_map[:body][:selector].should == '.custom'
|
|
105
|
+
end
|
|
106
|
+
|
|
107
|
+
it "should set the processor" do
|
|
108
|
+
MockScraper.attribute_map[:body][:processor].should be_a Proc
|
|
109
|
+
end
|
|
110
|
+
|
|
111
|
+
end
|
|
112
|
+
end
|
|
113
|
+
|
|
114
|
+
describe "#new" do
|
|
115
|
+
describe "when supplied a source" do
|
|
116
|
+
before(:each) do
|
|
117
|
+
@scraper = WorkingScraper.new( :source => TEST_SERVER_URL )
|
|
118
|
+
end
|
|
119
|
+
|
|
120
|
+
it "should have the correct attribute_map" do
|
|
121
|
+
@scraper.attribute_map[:body][:selector].should == '.body'
|
|
122
|
+
end
|
|
123
|
+
|
|
124
|
+
it "should set the instance source" do
|
|
125
|
+
@scraper.source.should == TEST_SERVER_URL
|
|
126
|
+
end
|
|
127
|
+
|
|
128
|
+
it "should set the doc source" do
|
|
129
|
+
@scraper.doc.should be_a Nokogiri::HTML::Document
|
|
130
|
+
end
|
|
131
|
+
end
|
|
132
|
+
|
|
133
|
+
describe "#all_fragments" do
|
|
134
|
+
before(:each) do
|
|
135
|
+
@scraper = WorkingScraper.new( :source => POSTS_HTML_STR )
|
|
136
|
+
@fragments = @scraper.all_fragments
|
|
137
|
+
end
|
|
138
|
+
|
|
139
|
+
it "should return the NodeSet" do
|
|
140
|
+
@fragments.should be_a Nokogiri::XML::NodeSet
|
|
141
|
+
end
|
|
142
|
+
|
|
143
|
+
it "should have 2 results" do
|
|
144
|
+
@fragments.count.should == 2
|
|
145
|
+
end
|
|
146
|
+
end
|
|
147
|
+
|
|
148
|
+
describe "#all" do
|
|
149
|
+
before(:each) do
|
|
150
|
+
@scraper = WorkingScraper.new( :source => POSTS_HTML_STR )
|
|
151
|
+
end
|
|
152
|
+
|
|
153
|
+
it "should return 2 WorkingPosts" do
|
|
154
|
+
@scraper.all(:max_pages => 3).length.should == 2
|
|
155
|
+
end
|
|
156
|
+
|
|
157
|
+
[:current_page, :max_pages].each do |m|
|
|
158
|
+
describe "##{m}" do
|
|
159
|
+
it "should be 0 by default" do
|
|
160
|
+
@scraper.send(m).should == 0
|
|
161
|
+
end
|
|
162
|
+
it "should be 3" do
|
|
163
|
+
@scraper.send("#{m}=",3)
|
|
164
|
+
@scraper.send(m).should == 3
|
|
165
|
+
end
|
|
166
|
+
end
|
|
167
|
+
end
|
|
168
|
+
|
|
169
|
+
end
|
|
170
|
+
|
|
171
|
+
describe "#page_with" do
|
|
172
|
+
describe "with a limit" do
|
|
173
|
+
before(:each) do
|
|
174
|
+
@scraper = ScraperWithPager.new( :source => 'http://localhost:9393/posts' )
|
|
175
|
+
@posts = @scraper.all(:max_pages => 3)
|
|
176
|
+
end
|
|
177
|
+
it "should get 6 posts" do
|
|
178
|
+
@posts.length.should == 6
|
|
179
|
+
end
|
|
180
|
+
end
|
|
181
|
+
|
|
182
|
+
describe "without a limit" do
|
|
183
|
+
before(:each) do
|
|
184
|
+
@scraper = ScraperWithPager.new( :source => 'http://localhost:9393/posts' )
|
|
185
|
+
@posts = @scraper.all
|
|
186
|
+
end
|
|
187
|
+
it "should get 16 posts" do
|
|
188
|
+
@posts.length.should == 16
|
|
189
|
+
end
|
|
190
|
+
end
|
|
191
|
+
|
|
192
|
+
end
|
|
193
|
+
|
|
194
|
+
end
|
|
195
|
+
end
|
data/spec/spec_helper.rb
CHANGED
metadata
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: graboid
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
hash:
|
|
4
|
+
hash: 27
|
|
5
5
|
prerelease: false
|
|
6
6
|
segments:
|
|
7
7
|
- 0
|
|
8
8
|
- 3
|
|
9
|
-
- 3
|
|
10
|
-
version: 0.3.3
|
|
9
|
+
- 4
|
|
10
|
+
version: 0.3.4
|
|
11
11
|
platform: ruby
|
|
12
12
|
authors:
|
|
13
13
|
- Christopher Burnett
|
|
@@ -15,7 +15,7 @@ autorequire:
|
|
|
15
15
|
bindir: bin
|
|
16
16
|
cert_chain: []
|
|
17
17
|
|
|
18
|
-
date: 2010-06-
|
|
18
|
+
date: 2010-06-16 00:00:00 -07:00
|
|
19
19
|
default_executable:
|
|
20
20
|
dependencies:
|
|
21
21
|
- !ruby/object:Gem::Dependency
|
|
@@ -84,11 +84,13 @@ files:
|
|
|
84
84
|
- graboid.gemspec
|
|
85
85
|
- lib/graboid.rb
|
|
86
86
|
- lib/graboid/entity.rb
|
|
87
|
+
- lib/graboid/scraper.rb
|
|
87
88
|
- spec/fixtures/graboid.jpg
|
|
88
89
|
- spec/fixtures/posts.html
|
|
89
90
|
- spec/fixtures/server.rb
|
|
90
91
|
- spec/fixtures/views/posts.erb
|
|
91
92
|
- spec/graboid/entity_spec.rb
|
|
93
|
+
- spec/graboid/scraper_spec.rb
|
|
92
94
|
- spec/graboid_spec.rb
|
|
93
95
|
- spec/spec.opts
|
|
94
96
|
- spec/spec_helper.rb
|
|
@@ -129,6 +131,7 @@ summary: web scraping made easy
|
|
|
129
131
|
test_files:
|
|
130
132
|
- spec/fixtures/server.rb
|
|
131
133
|
- spec/graboid/entity_spec.rb
|
|
134
|
+
- spec/graboid/scraper_spec.rb
|
|
132
135
|
- spec/graboid_spec.rb
|
|
133
136
|
- spec/spec_helper.rb
|
|
134
137
|
- examples/active_rain_post.rb
|