graboid 0.3.3 → 0.3.4

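Summary: this release introduces Graboid::Scraper and deprecates Graboid::Entity (including Entity now prints a deprecation warning). The new DSL uses selector in place of root (root remains an alias), set in place of field, and page_with in place of pager; scraped records come back as OpenStructs, and every remote fetch now sends a configurable User-Agent header. A minimal usage sketch based on the diff below (the Post class, the .post/.entry/.meta selectors, and the example.com URL are illustrative, not taken from the gem):

  require 'rubygems'
  require 'graboid'

  class Post
    include Graboid::Scraper          # Graboid::Entity still works, but warns

    selector '.post'                  # root selector for each record fragment
    set :title                        # selector defaults to '.title'
    set :body, :selector => '.entry'
    set :date, :selector => '.meta' do |elm|
      elm.text.strip                  # optional block post-processes the matched node
    end

    page_with do |doc|                # return the next page's URL, or nil to stop
      doc.css('a.next').first['href'] rescue nil
    end
  end

  Graboid.user_agent = 'MyBot/1.0'    # optional; defaults to 'Graboid'
  posts = Post.new(:source => 'http://example.com/posts').all(:max_pages => 2)
  posts.each { |post| puts post.title }   # each result is an OpenStruct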
data/VERSION CHANGED
@@ -1 +1 @@
-0.3.3
+0.3.4
examples/active_rain_post.rb CHANGED
@@ -4,11 +4,21 @@ require File.join(dir, 'graboid')
 class ActiveRainPost
   include Graboid::Entity
 
-  root '.blog_entry'
-
-  field :title, :selector => 'h2'
-
-  field :body, :selector => 'div' do |elm|
+  selector '.blog_entry_wrapper'
+
+  set :title, :selector => 'h2 a'
+  set :pub_date, :selector => '.blog_entry' do |elm|
+    # awesome, the pub date is not contained within
+    # the .blog_entry_wrapper fragment.
+    begin
+      entry_id = elm['id'].gsub('blog_entry_','')
+      date_text = self.doc.css("#divbei#{entry_id} td").select{|td| td.text =~ /posted by/i }.first.text
+      date_text.match(/(\d{2}\/\d{2}\/\d{4})/).captures.first
+    rescue
+      ""
+    end
+  end
+  set :body, :selector => 'div' do |elm|
     elm.css('p').collect(&:to_html)
   end
 
@@ -26,10 +36,10 @@ class ActiveRainPost
 end
 
 ActiveRainPost.source = 'http://activerain.com/blogs/elizabethweintraub'
-@posts = ActiveRainPost.all
+@posts = ActiveRainPost.all(:max_pages => 1)
 
 @posts.each do |post|
-  puts "#{post.title}"
+  puts "#{post.pub_date}"
   puts "*"*100
 end
 
examples/live_journal_post.rb CHANGED
@@ -6,9 +6,14 @@ class LiveJournalPost
 
   root '.entrybox'
 
-  field :title, :selector => '.caption a'
-  field :body, :selector => 'td[@colspan="2"]'
-  field :comment_link, :selector => '.caption a' do |elm|
+  field :title, :selector => '.caption a'
+  field :body, :selector => 'td[@colspan="2"]'
+
+  field :pub_date, :selector => 'td.index' do |elm|
+    elm.text.match(/\[(.*)\|/)[1]
+  end
+
+  field :comment_link, :selector => '.caption a' do |elm|
     elm['href']
   end
 
@@ -25,11 +30,11 @@ class LiveJournalPost
 
 end
 
-LiveJournalPost.source = 'http://zeroplate.livejournal.com/'
-@posts = LiveJournalPost.all(:max_pages => 3)
+LiveJournalPost.source = 'http://zeroplate.livejournal.com/'
+@posts = LiveJournalPost.all(:max_pages => 3)
 
 @posts.each do |post|
-  puts "#{post.title}"
+  puts "#{post.pub_date} - #{post.title}"
   puts "#{post.comment_link}"
   puts "#{post.body}"
   puts "*"*100
examples/ning_post.rb CHANGED
@@ -1,7 +1,8 @@
-%w{rubygems graboid}.each {|f| require f }
+dir = File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib'))
+require File.join(dir, 'graboid')
 
 class NingPost
-  include Graboid::Entity
+  include Graboid::Scraper
 
   selector 'div.xg_blog .xg_module_body'
 
@@ -25,10 +26,10 @@ class NingPost
     # ning's list page only has an excerpt of the body. No biggie,
     # we'll just go grab it.
     show_url = elm.css('a').last["href"]
-    Nokogiri::HTML(open(show_url)).css('.postbody').to_html
+    Nokogiri::HTML(open(show_url,"User-Agent" => Graboid.user_agent)).css('.postbody').to_html
   end
 
-  pager do |doc|
+  page_with do |doc|
    doc.css('.pagination a').select{|a| a.text =~ /previous/i }.first['href'] rescue nil
   end
 
@@ -45,8 +46,8 @@ class NingPost
 
 end
 
-NingPost.source = 'http://cuwebd.ning.com/profiles/blog/list'
-@posts = NingPost.all(:max_pages => 1)
+NING_URL = 'http://www.friendsorenemies.com/profiles/blog/list?user=3vx1daeuxrt14'
+@posts = NingPost.new( :source => NING_URL ).all(:max_pages => 2)
 
 @posts.each do |post|
   puts "#{post.pub_date} -- #{post.title}"
graboid.gemspec CHANGED
@@ -5,11 +5,11 @@
 
 Gem::Specification.new do |s|
   s.name = %q{graboid}
-  s.version = "0.3.3"
+  s.version = "0.3.4"
 
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Christopher Burnett"]
-  s.date = %q{2010-06-15}
+  s.date = %q{2010-06-16}
   s.description = %q{web scraping made easier}
   s.email = %q{signalstatic@gmail.com}
   s.extra_rdoc_files = [
@@ -29,11 +29,13 @@ Gem::Specification.new do |s|
     "graboid.gemspec",
     "lib/graboid.rb",
     "lib/graboid/entity.rb",
+    "lib/graboid/scraper.rb",
     "spec/fixtures/graboid.jpg",
     "spec/fixtures/posts.html",
     "spec/fixtures/server.rb",
     "spec/fixtures/views/posts.erb",
     "spec/graboid/entity_spec.rb",
+    "spec/graboid/scraper_spec.rb",
     "spec/graboid_spec.rb",
     "spec/spec.opts",
     "spec/spec_helper.rb"
@@ -46,6 +48,7 @@ Gem::Specification.new do |s|
   s.test_files = [
     "spec/fixtures/server.rb",
     "spec/graboid/entity_spec.rb",
+    "spec/graboid/scraper_spec.rb",
     "spec/graboid_spec.rb",
     "spec/spec_helper.rb",
     "examples/active_rain_post.rb",
lib/graboid.rb CHANGED
@@ -1,9 +1,18 @@
-%w{rubygems nokogiri open-uri active_support}.each { |f| require f }
+%w{rubygems nokogiri open-uri active_support ostruct}.each { |f| require f }
 
 dir = Pathname(__FILE__).dirname.expand_path
 
 require dir + 'graboid/entity'
-
+require dir + 'graboid/scraper'
 
 module Graboid
+  extend self
+
+  def user_agent
+    @user_agent ||= 'Graboid'
+  end
+
+  def user_agent=(agent)
+    @user_agent = agent
+  end
 end
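Both Entity and Scraper now pass Graboid.user_agent to open-uri on every remote fetch. A two-line sketch of overriding the default (the UA string is illustrative):

  Graboid.user_agent                # => "Graboid" (the default)
  Graboid.user_agent = 'Mozilla/5.0 (compatible; MyBot/0.1)'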
lib/graboid/entity.rb CHANGED
@@ -5,7 +5,7 @@ module Graboid
     klass.class_eval do
       extend ClassMethods
       include InstanceMethods
-
+      warn "Deprecation Warning! Graboid::Entity - This module has been deprecated. See Graboid::Scraper."
       write_inheritable_attribute(:attribute_map, {}) if attribute_map.nil?
     end
   end
@@ -67,7 +67,7 @@ module Graboid
       attribute_map.inject({}) do |extracted_hash, at|
         selector, processor = at.last[:selector], at.last[:processor]
         node_collection = self.mode == :html ? fragment.css(selector) : fragment.xpath(selector)
-        extracted_hash[at.first] = processor.nil? ? node_collection.first.inner_html : processor.call(node_collection.first) rescue ""
+        extracted_hash[at.first] = processor.nil? ? node_collection.first.inner_html : processor.call(node_collection.first) #rescue ""
 
         extracted_hash
       end
@@ -119,7 +119,7 @@ module Graboid
     def read_source
       case self.source
       when /^http[s]?:\/\//
-        open self.source
+        open(self.source, "User-Agent" => Graboid.user_agent)
       when String
         self.source
       end
lib/graboid/scraper.rb ADDED
@@ -0,0 +1,204 @@
+module Graboid
+  module Scraper
+    def self.included klass
+      klass.class_eval do
+        extend ClassMethods
+        include InstanceMethods
+
+        write_inheritable_attribute(:attribute_map, {}) if attribute_map.nil?
+        write_inheritable_attribute(:callbacks, {}) if callbacks.nil?
+      end
+    end
+
+    module ClassMethods
+
+      def attribute_map
+        read_inheritable_attribute :attribute_map
+      end
+
+      def callbacks
+        read_inheritable_attribute :callbacks
+      end
+
+      def inferred_selector
+        @inferred_selector ||= ".#{self.to_s.underscore}"
+      end
+
+      def page_with &block
+        @pager = block
+      end
+
+      def pager
+        @pager
+      end
+
+      def root_selector
+        @root_selector || inferred_selector
+      end
+
+      def selector selector
+        @root_selector = selector
+      end
+
+      alias_method :root, :selector
+
+      def set name, opts={}, &block
+        opts.merge!(:selector => ".#{name}") unless opts[:selector].present?
+        opts.merge!(:processor => block) if block_given?
+
+        attribute_map[name] = opts
+      end
+
+      [:before, :after].each do |prefix|
+        [:paginate, :extract].each do |suffix|
+          method_name = "#{prefix}_#{suffix}"
+          define_method method_name.to_sym do |&block|
+            self.callbacks["#{method_name}".to_sym] = block
+          end
+        end
+      end
+
+    end
+
+    module InstanceMethods
+      def initialize opts={}, &block
+        raise ArgumentError unless opts[:source].present?
+        self.source = opts[:source]
+      end
+
+      def all opts={}, reload=false
+        return self.collection if reload and !self.collection.empty?
+        reset_context
+        self.max_pages = opts[:max_pages] if opts[:max_pages].present?
+        all_fragments.collect{ |frag| extract_instance(frag) }
+      end
+
+      alias_method :scrape, :all
+
+      def all_fragments
+        return page_fragments if self.class.pager.nil?
+        old_source = self.source
+
+        while next_page?
+          self.collection += page_fragments
+          run_before_paginate_callbacks
+          paginate
+          run_after_paginate_callbacks
+        end
+
+        self.source = old_source
+        self.collection
+      end
+
+      def attribute_map
+        self.class.attribute_map
+      end
+
+      def callbacks
+        self.class.callbacks
+      end
+
+      def collection
+        @collection ||= []
+      end
+
+      def collection=(col)
+        @collection = col
+      end
+
+      def current_page
+        @current_page ||= 0
+      end
+
+      def current_page=num
+        @current_page = num
+      end
+
+      def doc
+        eval "Nokogiri::#{self.mode.to_s.upcase}(read_source)"
+      end
+
+      def extract_instance fragment
+        OpenStruct.new(hash_map fragment)
+      end
+
+      def hash_map fragment
+        attribute_map.inject({}) do |extracted_hash, at|
+          selector, processor = at.last[:selector], at.last[:processor]
+          node_collection = self.mode == :html ? fragment.css(selector) : fragment.xpath(selector)
+          extracted_hash[at.first] = processor.nil? ? node_collection.first.inner_html : processor.call(node_collection.first) #rescue ""
+
+          extracted_hash
+        end
+      end
+
+      def max_pages
+        @max_pages ||= 0
+      end
+
+      def max_pages=num
+        @max_pages = num
+      end
+
+      def mode
+        @mode ||= :html
+      end
+
+      def mode=(m)
+        raise ArgumentError unless [:html, :xml].include?(m)
+        @mode = m
+      end
+
+      def next_page?
+        if max_pages.zero?
+          return true unless self.class.pager.call(doc).nil?
+        else
+          current_page <= max_pages-1
+        end
+      end
+
+      def page_fragments
+        doc.css(self.class.root_selector)
+      end
+
+      def paginate
+        next_page_url = self.class.pager.call(doc) rescue nil
+        self.source = next_page_url
+        self.current_page += 1
+      end
+
+      def read_source
+        case self.source
+        when /^http[s]?:\/\//
+          open(self.source ,"User-Agent" => Graboid.user_agent)
+        when String
+          self.source
+        end
+      end
+
+      def reset_context
+        self.collection = []
+        self.current_page = 0
+        self.max_pages = 0
+      end
+
+      def source
+        @source
+      end
+
+      def source=(src)
+        @source = src
+      end
+
+      [:before, :after].each do |prefix|
+        [:paginate, :extract].each do |suffix|
+          method_name = "#{prefix}_#{suffix}"
+          define_method "run_#{method_name}_callbacks" do
+            self.instance_eval &callbacks[method_name.to_sym] if callbacks[method_name.to_sym].present?
+          end
+        end
+      end
+
+    end
+  end
+end
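In addition to page_with, the Scraper DSL above generates before_paginate, after_paginate, before_extract and after_extract class methods for registering hooks; note that only the paginate hooks are invoked by all_fragments in this version. The blocks are instance_eval'd against the scraper instance, so methods like source and current_page are in scope. A sketch under those assumptions (ItemScraper and the '.item' selector are illustrative):

  class ItemScraper
    include Graboid::Scraper

    selector '.item'
    set :name

    page_with do |doc|
      doc.css('a.next').first['href'] rescue nil   # nil ends pagination
    end

    before_paginate do
      # runs before each page flip, in instance scope
      puts "finished page #{current_page}: #{source}"
    end
  end

When no :max_pages is given, max_pages stays 0 and next_page? keeps returning true until the pager block yields nil, which is what the "without a limit" spec below exercises.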
spec/graboid/scraper_spec.rb ADDED
@@ -0,0 +1,195 @@
+require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
+
+class MockScraper
+  include Graboid::Scraper
+
+  set :title
+  set :body
+  set :author
+  set :date, :selector => '.author' do |elm|
+    elm.text.match(/\((.*)\)/)[1]
+  end
+end
+
+class WorkingScraper
+  include Graboid::Scraper
+
+  selector '.post'
+
+  set :title
+  set :body
+  set :author
+  set :date, :selector => '.author' do |elm|
+    elm.text.match(/\((.*)\)/)[1]
+  end
+end
+
+class ScraperWithPager
+  include Graboid::Scraper
+
+  selector '.post'
+
+  set :title
+  set :body
+  set :author
+  set :date, :selector => '.author' do |elm|
+    elm.text.match(/\((.*)\)/)[1]
+  end
+
+  page_with do |doc|
+    'http://localhost:9393'+doc.css('a.next').first['href'] rescue nil
+  end
+
+  before_paginate do
+    puts "page: #{source}"
+  end
+
+end
+
+
+
+describe Graboid::Scraper do
+  describe "#root_selector" do
+    it "should be set" do
+      MockScraper.root_selector.should == '.mock_scraper'
+    end
+
+    describe "when inferred from class" do
+
+      before(:each) do
+        class Phony; include Graboid::Scraper; end
+      end
+
+      it "should infer .phony" do
+        Phony.root_selector.should == '.phony'
+      end
+    end
+  end
+
+  describe "#set" do
+    describe "simple syntax" do
+
+      before(:each) do
+        MockScraper.set :body
+      end
+
+      it "should be set in the attr map" do
+        MockScraper.attribute_map[:body].should be_a Hash
+      end
+
+      it "should set the selector" do
+        MockScraper.attribute_map[:body][:selector].should == '.body'
+      end
+    end
+
+    describe "custom selector syntax" do
+      before(:each) do
+        MockScraper.set :body, :selector => '.custom'
+      end
+
+      it "should set the selector" do
+        MockScraper.attribute_map[:body][:selector].should == '.custom'
+      end
+    end
+
+    describe "custom selector syntax with a lambda" do
+
+      before(:each) do
+        MockScraper.set :body, :selector => '.custom' do |item|
+          "from lambda"
+        end
+      end
+
+      it "should set the selector" do
+        MockScraper.attribute_map[:body][:selector].should == '.custom'
+      end
+
+      it "should set the processor" do
+        MockScraper.attribute_map[:body][:processor].should be_a Proc
+      end
+
+    end
+  end
+
+  describe "#new" do
+    describe "when supplied a source" do
+      before(:each) do
+        @scraper = WorkingScraper.new( :source => TEST_SERVER_URL )
+      end
+
+      it "should have the correct attribute_map" do
+        @scraper.attribute_map[:body][:selector].should == '.body'
+      end
+
+      it "should set the instance source" do
+        @scraper.source.should == TEST_SERVER_URL
+      end
+
+      it "should set the doc source" do
+        @scraper.doc.should be_a Nokogiri::HTML::Document
+      end
+    end
+
+    describe "#all_fragments" do
+      before(:each) do
+        @scraper = WorkingScraper.new( :source => POSTS_HTML_STR )
+        @fragments = @scraper.all_fragments
+      end
+
+      it "should return the NodeSet" do
+        @fragments.should be_a Nokogiri::XML::NodeSet
+      end
+
+      it "should have 2 results" do
+        @fragments.count.should == 2
+      end
+    end
+
+    describe "#all" do
+      before(:each) do
+        @scraper = WorkingScraper.new( :source => POSTS_HTML_STR )
+      end
+
+      it "should return 2 WorkingPosts" do
+        @scraper.all(:max_pages => 3).length.should == 2
+      end
+
+      [:current_page, :max_pages].each do |m|
+        describe "##{m}" do
+          it "should be 0 by default" do
+            @scraper.send(m).should == 0
+          end
+          it "should be 3" do
+            @scraper.send("#{m}=",3)
+            @scraper.send(m).should == 3
+          end
+        end
+      end
+
+    end
+
+    describe "#page_with" do
+      describe "with a limit" do
+        before(:each) do
+          @scraper = ScraperWithPager.new( :source => 'http://localhost:9393/posts' )
+          @posts = @scraper.all(:max_pages => 3)
+        end
+        it "should get 6 posts" do
+          @posts.length.should == 6
+        end
+      end
+
+      describe "without a limit" do
+        before(:each) do
+          @scraper = ScraperWithPager.new( :source => 'http://localhost:9393/posts' )
+          @posts = @scraper.all
+        end
+        it "should get 16 posts" do
+          @posts.length.should == 16
+        end
+      end
+
+    end
+
+  end
+end
spec/spec_helper.rb CHANGED
@@ -53,3 +53,5 @@ class PostWithPager
   end
 
 end
+
+TEST_SERVER_URL = 'http://localhost:9393/posts'
metadata CHANGED
@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: graboid
 version: !ruby/object:Gem::Version
-  hash: 21
+  hash: 27
   prerelease: false
   segments:
   - 0
   - 3
-  - 3
-  version: 0.3.3
+  - 4
+  version: 0.3.4
 platform: ruby
 authors:
 - Christopher Burnett
@@ -15,7 +15,7 @@ autorequire:
 bindir: bin
 cert_chain: []
 
-date: 2010-06-15 00:00:00 -07:00
+date: 2010-06-16 00:00:00 -07:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -84,11 +84,13 @@ files:
 - graboid.gemspec
 - lib/graboid.rb
 - lib/graboid/entity.rb
+- lib/graboid/scraper.rb
 - spec/fixtures/graboid.jpg
 - spec/fixtures/posts.html
 - spec/fixtures/server.rb
 - spec/fixtures/views/posts.erb
 - spec/graboid/entity_spec.rb
+- spec/graboid/scraper_spec.rb
 - spec/graboid_spec.rb
 - spec/spec.opts
 - spec/spec_helper.rb
@@ -129,6 +131,7 @@ summary: web scraping made easy
 test_files:
 - spec/fixtures/server.rb
 - spec/graboid/entity_spec.rb
+- spec/graboid/scraper_spec.rb
 - spec/graboid_spec.rb
 - spec/spec_helper.rb
 - examples/active_rain_post.rb