graboid 0.3.3 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.3.3
1
+ 0.3.4
@@ -4,11 +4,21 @@ require File.join(dir, 'graboid')
4
4
  class ActiveRainPost
5
5
  include Graboid::Entity
6
6
 
7
- root '.blog_entry'
8
-
9
- field :title, :selector => 'h2'
10
-
11
- field :body, :selector => 'div' do |elm|
7
+ selector '.blog_entry_wrapper'
8
+
9
+ set :title, :selector => 'h2 a'
10
+ set :pub_date, :selector => '.blog_entry' do |elm|
11
+ # awesome, the pub date is not contained within
12
+ # the .blog_entry_wrapper fragment.
13
+ begin
14
+ entry_id = elm['id'].gsub('blog_entry_','')
15
+ date_text = self.doc.css("#divbei#{entry_id} td").select{|td| td.text =~ /posted by/i }.first.text
16
+ date_text.match(/(\d{2}\/\d{2}\/\d{4})/).captures.first
17
+ rescue
18
+ ""
19
+ end
20
+ end
21
+ set :body, :selector => 'div' do |elm|
12
22
  elm.css('p').collect(&:to_html)
13
23
  end
14
24
 
@@ -26,10 +36,10 @@ class ActiveRainPost
26
36
  end
27
37
 
28
38
  ActiveRainPost.source = 'http://activerain.com/blogs/elizabethweintraub'
29
- @posts = ActiveRainPost.all
39
+ @posts = ActiveRainPost.all(:max_pages => 1)
30
40
 
31
41
  @posts.each do |post|
32
- puts "#{post.title}"
42
+ puts "#{post.pub_date}"
33
43
  puts "*"*100
34
44
  end
35
45
 
@@ -6,9 +6,14 @@ class LiveJournalPost
6
6
 
7
7
  root '.entrybox'
8
8
 
9
- field :title, :selector => '.caption a'
10
- field :body, :selector => 'td[@colspan="2"]'
11
- field :comment_link, :selector => '.caption a' do |elm|
9
+ field :title, :selector => '.caption a'
10
+ field :body, :selector => 'td[@colspan="2"]'
11
+
12
+ field :pub_date, :selector => 'td.index' do |elm|
13
+ elm.text.match(/\[(.*)\|/)[1]
14
+ end
15
+
16
+ field :comment_link, :selector => '.caption a' do |elm|
12
17
  elm['href']
13
18
  end
14
19
 
@@ -25,11 +30,11 @@ class LiveJournalPost
25
30
 
26
31
  end
27
32
 
28
- LiveJournalPost.source = 'http://zeroplate.livejournal.com/'
29
- @posts = LiveJournalPost.all(:max_pages => 3)
33
+ LiveJournalPost.source = 'http://zeroplate.livejournal.com/'
34
+ @posts = LiveJournalPost.all(:max_pages => 3)
30
35
 
31
36
  @posts.each do |post|
32
- puts "#{post.title}"
37
+ puts "#{post.pub_date} - #{post.title}"
33
38
  puts "#{post.comment_link}"
34
39
  puts "#{post.body}"
35
40
  puts "*"*100
@@ -1,7 +1,8 @@
1
- %w{rubygems graboid}.each {|f| require f }
1
+ dir = File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib'))
2
+ require File.join(dir, 'graboid')
2
3
 
3
4
  class NingPost
4
- include Graboid::Entity
5
+ include Graboid::Scraper
5
6
 
6
7
  selector 'div.xg_blog .xg_module_body'
7
8
 
@@ -25,10 +26,10 @@ class NingPost
25
26
  # ning's list page only has an excerpt of the body. No biggie,
26
27
  # we'll just go grab it.
27
28
  show_url = elm.css('a').last["href"]
28
- Nokogiri::HTML(open(show_url)).css('.postbody').to_html
29
+ Nokogiri::HTML(open(show_url,"User-Agent" => Graboid.user_agent)).css('.postbody').to_html
29
30
  end
30
31
 
31
- pager do |doc|
32
+ page_with do |doc|
32
33
  doc.css('.pagination a').select{|a| a.text =~ /previous/i }.first['href'] rescue nil
33
34
  end
34
35
 
@@ -45,8 +46,8 @@ class NingPost
45
46
 
46
47
  end
47
48
 
48
- NingPost.source = 'http://cuwebd.ning.com/profiles/blog/list'
49
- @posts = NingPost.all(:max_pages => 1)
49
+ NING_URL = 'http://www.friendsorenemies.com/profiles/blog/list?user=3vx1daeuxrt14'
50
+ @posts = NingPost.new( :source => NING_URL ).all(:max_pages => 2)
50
51
 
51
52
  @posts.each do |post|
52
53
  puts "#{post.pub_date} -- #{post.title}"
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{graboid}
8
- s.version = "0.3.3"
8
+ s.version = "0.3.4"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Christopher Burnett"]
12
- s.date = %q{2010-06-15}
12
+ s.date = %q{2010-06-16}
13
13
  s.description = %q{web scraping made easier}
14
14
  s.email = %q{signalstatic@gmail.com}
15
15
  s.extra_rdoc_files = [
@@ -29,11 +29,13 @@ Gem::Specification.new do |s|
29
29
  "graboid.gemspec",
30
30
  "lib/graboid.rb",
31
31
  "lib/graboid/entity.rb",
32
+ "lib/graboid/scraper.rb",
32
33
  "spec/fixtures/graboid.jpg",
33
34
  "spec/fixtures/posts.html",
34
35
  "spec/fixtures/server.rb",
35
36
  "spec/fixtures/views/posts.erb",
36
37
  "spec/graboid/entity_spec.rb",
38
+ "spec/graboid/scraper_spec.rb",
37
39
  "spec/graboid_spec.rb",
38
40
  "spec/spec.opts",
39
41
  "spec/spec_helper.rb"
@@ -46,6 +48,7 @@ Gem::Specification.new do |s|
46
48
  s.test_files = [
47
49
  "spec/fixtures/server.rb",
48
50
  "spec/graboid/entity_spec.rb",
51
+ "spec/graboid/scraper_spec.rb",
49
52
  "spec/graboid_spec.rb",
50
53
  "spec/spec_helper.rb",
51
54
  "examples/active_rain_post.rb",
@@ -1,9 +1,18 @@
1
- %w{rubygems nokogiri open-uri active_support}.each { |f| require f }
1
+ %w{rubygems nokogiri open-uri active_support ostruct}.each { |f| require f }
2
2
 
3
3
  dir = Pathname(__FILE__).dirname.expand_path
4
4
 
5
5
  require dir + 'graboid/entity'
6
-
6
+ require dir + 'graboid/scraper'
7
7
 
8
8
  module Graboid
9
+ extend self
10
+
11
+ def user_agent
12
+ @user_agent ||= 'Graboid'
13
+ end
14
+
15
+ def user_agent=(agent)
16
+ @user_agent = agent
17
+ end
9
18
  end
@@ -5,7 +5,7 @@ module Graboid
5
5
  klass.class_eval do
6
6
  extend ClassMethods
7
7
  include InstanceMethods
8
-
8
+ warn "Deprecation Warning! Graboid::Entity - This module has been deprecated. See Graboid::Scraper."
9
9
  write_inheritable_attribute(:attribute_map, {}) if attribute_map.nil?
10
10
  end
11
11
  end
@@ -67,7 +67,7 @@ module Graboid
67
67
  attribute_map.inject({}) do |extracted_hash, at|
68
68
  selector, processor = at.last[:selector], at.last[:processor]
69
69
  node_collection = self.mode == :html ? fragment.css(selector) : fragment.xpath(selector)
70
- extracted_hash[at.first] = processor.nil? ? node_collection.first.inner_html : processor.call(node_collection.first) rescue ""
70
+ extracted_hash[at.first] = processor.nil? ? node_collection.first.inner_html : processor.call(node_collection.first) #rescue ""
71
71
 
72
72
  extracted_hash
73
73
  end
@@ -119,7 +119,7 @@ module Graboid
119
119
  def read_source
120
120
  case self.source
121
121
  when /^http[s]?:\/\//
122
- open self.source
122
+ open(self.source, "User-Agent" => Graboid.user_agent)
123
123
  when String
124
124
  self.source
125
125
  end
@@ -0,0 +1,204 @@
1
+ module Graboid
2
+ module Scraper
3
+ def self.included klass
4
+ klass.class_eval do
5
+ extend ClassMethods
6
+ include InstanceMethods
7
+
8
+ write_inheritable_attribute(:attribute_map, {}) if attribute_map.nil?
9
+ write_inheritable_attribute(:callbacks, {}) if callbacks.nil?
10
+ end
11
+ end
12
+
13
+ module ClassMethods
14
+
15
+ def attribute_map
16
+ read_inheritable_attribute :attribute_map
17
+ end
18
+
19
+ def callbacks
20
+ read_inheritable_attribute :callbacks
21
+ end
22
+
23
+ def inferred_selector
24
+ @inferred_selector ||= ".#{self.to_s.underscore}"
25
+ end
26
+
27
+ def page_with &block
28
+ @pager = block
29
+ end
30
+
31
+ def pager
32
+ @pager
33
+ end
34
+
35
+ def root_selector
36
+ @root_selector || inferred_selector
37
+ end
38
+
39
+ def selector selector
40
+ @root_selector = selector
41
+ end
42
+
43
+ alias_method :root, :selector
44
+
45
+ def set name, opts={}, &block
46
+ opts.merge!(:selector => ".#{name}") unless opts[:selector].present?
47
+ opts.merge!(:processor => block) if block_given?
48
+
49
+ attribute_map[name] = opts
50
+ end
51
+
52
+ [:before, :after].each do |prefix|
53
+ [:paginate, :extract].each do |suffix|
54
+ method_name = "#{prefix}_#{suffix}"
55
+ define_method method_name.to_sym do |&block|
56
+ self.callbacks["#{method_name}".to_sym] = block
57
+ end
58
+ end
59
+ end
60
+
61
+ end
62
+
63
+ module InstanceMethods
64
+ def initialize opts={}, &block
65
+ raise ArgumentError unless opts[:source].present?
66
+ self.source = opts[:source]
67
+ end
68
+
69
+ def all opts={}, reload=false
70
+ return self.collection if reload and !self.collection.empty?
71
+ reset_context
72
+ self.max_pages = opts[:max_pages] if opts[:max_pages].present?
73
+ all_fragments.collect{ |frag| extract_instance(frag) }
74
+ end
75
+
76
+ alias_method :scrape, :all
77
+
78
+ def all_fragments
79
+ return page_fragments if self.class.pager.nil?
80
+ old_source = self.source
81
+
82
+ while next_page?
83
+ self.collection += page_fragments
84
+ run_before_paginate_callbacks
85
+ paginate
86
+ run_after_paginate_callbacks
87
+ end
88
+
89
+ self.source = old_source
90
+ self.collection
91
+ end
92
+
93
+ def attribute_map
94
+ self.class.attribute_map
95
+ end
96
+
97
+ def callbacks
98
+ self.class.callbacks
99
+ end
100
+
101
+ def collection
102
+ @collection ||= []
103
+ end
104
+
105
+ def collection=(col)
106
+ @collection = col
107
+ end
108
+
109
+ def current_page
110
+ @current_page ||= 0
111
+ end
112
+
113
+ def current_page=num
114
+ @current_page = num
115
+ end
116
+
117
+ def doc
118
+ eval "Nokogiri::#{self.mode.to_s.upcase}(read_source)"
119
+ end
120
+
121
+ def extract_instance fragment
122
+ OpenStruct.new(hash_map fragment)
123
+ end
124
+
125
+ def hash_map fragment
126
+ attribute_map.inject({}) do |extracted_hash, at|
127
+ selector, processor = at.last[:selector], at.last[:processor]
128
+ node_collection = self.mode == :html ? fragment.css(selector) : fragment.xpath(selector)
129
+ extracted_hash[at.first] = processor.nil? ? node_collection.first.inner_html : processor.call(node_collection.first) #rescue ""
130
+
131
+ extracted_hash
132
+ end
133
+ end
134
+
135
+ def max_pages
136
+ @max_pages ||= 0
137
+ end
138
+
139
+ def max_pages=num
140
+ @max_pages = num
141
+ end
142
+
143
+ def mode
144
+ @mode ||= :html
145
+ end
146
+
147
+ def mode=(m)
148
+ raise ArgumentError unless [:html, :xml].include?(m)
149
+ @mode = m
150
+ end
151
+
152
+ def next_page?
153
+ if max_pages.zero?
154
+ return true unless self.class.pager.call(doc).nil?
155
+ else
156
+ current_page <= max_pages-1
157
+ end
158
+ end
159
+
160
+ def page_fragments
161
+ doc.css(self.class.root_selector)
162
+ end
163
+
164
+ def paginate
165
+ next_page_url = self.class.pager.call(doc) rescue nil
166
+ self.source = next_page_url
167
+ self.current_page += 1
168
+ end
169
+
170
+ def read_source
171
+ case self.source
172
+ when /^http[s]?:\/\//
173
+ open(self.source ,"User-Agent" => Graboid.user_agent)
174
+ when String
175
+ self.source
176
+ end
177
+ end
178
+
179
+ def reset_context
180
+ self.collection = []
181
+ self.current_page = 0
182
+ self.max_pages = 0
183
+ end
184
+
185
+ def source
186
+ @source
187
+ end
188
+
189
+ def source=(src)
190
+ @source = src
191
+ end
192
+
193
+ [:before, :after].each do |prefix|
194
+ [:paginate, :extract].each do |suffix|
195
+ method_name = "#{prefix}_#{suffix}"
196
+ define_method "run_#{method_name}_callbacks" do
197
+ self.instance_eval &callbacks[method_name.to_sym] if callbacks[method_name.to_sym].present?
198
+ end
199
+ end
200
+ end
201
+
202
+ end
203
+ end
204
+ end
@@ -0,0 +1,195 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
2
+
3
+ class MockScraper
4
+ include Graboid::Scraper
5
+
6
+ set :title
7
+ set :body
8
+ set :author
9
+ set :date, :selector => '.author' do |elm|
10
+ elm.text.match(/\((.*)\)/)[1]
11
+ end
12
+ end
13
+
14
+ class WorkingScraper
15
+ include Graboid::Scraper
16
+
17
+ selector '.post'
18
+
19
+ set :title
20
+ set :body
21
+ set :author
22
+ set :date, :selector => '.author' do |elm|
23
+ elm.text.match(/\((.*)\)/)[1]
24
+ end
25
+ end
26
+
27
+ class ScraperWithPager
28
+ include Graboid::Scraper
29
+
30
+ selector '.post'
31
+
32
+ set :title
33
+ set :body
34
+ set :author
35
+ set :date, :selector => '.author' do |elm|
36
+ elm.text.match(/\((.*)\)/)[1]
37
+ end
38
+
39
+ page_with do |doc|
40
+ 'http://localhost:9393'+doc.css('a.next').first['href'] rescue nil
41
+ end
42
+
43
+ before_paginate do
44
+ puts "page: #{source}"
45
+ end
46
+
47
+ end
48
+
49
+
50
+
51
+ describe Graboid::Scraper do
52
+ describe "#root_selector" do
53
+ it "should be set" do
54
+ MockScraper.root_selector.should == '.mock_scraper'
55
+ end
56
+
57
+ describe "when inferred from class" do
58
+
59
+ before(:each) do
60
+ class Phony; include Graboid::Scraper; end
61
+ end
62
+
63
+ it "should infer .phony" do
64
+ Phony.root_selector.should == '.phony'
65
+ end
66
+ end
67
+ end
68
+
69
+ describe "#set" do
70
+ describe "simple syntax" do
71
+
72
+ before(:each) do
73
+ MockScraper.set :body
74
+ end
75
+
76
+ it "should be set in the attr map" do
77
+ MockScraper.attribute_map[:body].should be_a Hash
78
+ end
79
+
80
+ it "should set the selector" do
81
+ MockScraper.attribute_map[:body][:selector].should == '.body'
82
+ end
83
+ end
84
+
85
+ describe "custom selector syntax" do
86
+ before(:each) do
87
+ MockScraper.set :body, :selector => '.custom'
88
+ end
89
+
90
+ it "should set the selector" do
91
+ MockScraper.attribute_map[:body][:selector].should == '.custom'
92
+ end
93
+ end
94
+
95
+ describe "custom selector syntax with a lambda" do
96
+
97
+ before(:each) do
98
+ MockScraper.set :body, :selector => '.custom' do |item|
99
+ "from lambda"
100
+ end
101
+ end
102
+
103
+ it "should set the selector" do
104
+ MockScraper.attribute_map[:body][:selector].should == '.custom'
105
+ end
106
+
107
+ it "should set the processor" do
108
+ MockScraper.attribute_map[:body][:processor].should be_a Proc
109
+ end
110
+
111
+ end
112
+ end
113
+
114
+ describe "#new" do
115
+ describe "when supplied a source" do
116
+ before(:each) do
117
+ @scraper = WorkingScraper.new( :source => TEST_SERVER_URL )
118
+ end
119
+
120
+ it "should have the correct attribute_map" do
121
+ @scraper.attribute_map[:body][:selector].should == '.body'
122
+ end
123
+
124
+ it "should set the instance source" do
125
+ @scraper.source.should == TEST_SERVER_URL
126
+ end
127
+
128
+ it "should set the doc source" do
129
+ @scraper.doc.should be_a Nokogiri::HTML::Document
130
+ end
131
+ end
132
+
133
+ describe "#all_fragments" do
134
+ before(:each) do
135
+ @scraper = WorkingScraper.new( :source => POSTS_HTML_STR )
136
+ @fragments = @scraper.all_fragments
137
+ end
138
+
139
+ it "should return the NodeSet" do
140
+ @fragments.should be_a Nokogiri::XML::NodeSet
141
+ end
142
+
143
+ it "should have 2 results" do
144
+ @fragments.count.should == 2
145
+ end
146
+ end
147
+
148
+ describe "#all" do
149
+ before(:each) do
150
+ @scraper = WorkingScraper.new( :source => POSTS_HTML_STR )
151
+ end
152
+
153
+ it "should return 2 WorkingPosts" do
154
+ @scraper.all(:max_pages => 3).length.should == 2
155
+ end
156
+
157
+ [:current_page, :max_pages].each do |m|
158
+ describe "##{m}" do
159
+ it "should be 0 by default" do
160
+ @scraper.send(m).should == 0
161
+ end
162
+ it "should be 3" do
163
+ @scraper.send("#{m}=",3)
164
+ @scraper.send(m).should == 3
165
+ end
166
+ end
167
+ end
168
+
169
+ end
170
+
171
+ describe "#page_with" do
172
+ describe "with a limit" do
173
+ before(:each) do
174
+ @scraper = ScraperWithPager.new( :source => 'http://localhost:9393/posts' )
175
+ @posts = @scraper.all(:max_pages => 3)
176
+ end
177
+ it "should get 6 posts" do
178
+ @posts.length.should == 6
179
+ end
180
+ end
181
+
182
+ describe "without a limit" do
183
+ before(:each) do
184
+ @scraper = ScraperWithPager.new( :source => 'http://localhost:9393/posts' )
185
+ @posts = @scraper.all
186
+ end
187
+ it "should get 16 posts" do
188
+ @posts.length.should == 16
189
+ end
190
+ end
191
+
192
+ end
193
+
194
+ end
195
+ end
@@ -53,3 +53,5 @@ class PostWithPager
53
53
  end
54
54
 
55
55
  end
56
+
57
+ TEST_SERVER_URL = 'http://localhost:9393/posts'
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: graboid
3
3
  version: !ruby/object:Gem::Version
4
- hash: 21
4
+ hash: 27
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
8
  - 3
9
- - 3
10
- version: 0.3.3
9
+ - 4
10
+ version: 0.3.4
11
11
  platform: ruby
12
12
  authors:
13
13
  - Christopher Burnett
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-06-15 00:00:00 -07:00
18
+ date: 2010-06-16 00:00:00 -07:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
@@ -84,11 +84,13 @@ files:
84
84
  - graboid.gemspec
85
85
  - lib/graboid.rb
86
86
  - lib/graboid/entity.rb
87
+ - lib/graboid/scraper.rb
87
88
  - spec/fixtures/graboid.jpg
88
89
  - spec/fixtures/posts.html
89
90
  - spec/fixtures/server.rb
90
91
  - spec/fixtures/views/posts.erb
91
92
  - spec/graboid/entity_spec.rb
93
+ - spec/graboid/scraper_spec.rb
92
94
  - spec/graboid_spec.rb
93
95
  - spec/spec.opts
94
96
  - spec/spec_helper.rb
@@ -129,6 +131,7 @@ summary: web scraping made easy
129
131
  test_files:
130
132
  - spec/fixtures/server.rb
131
133
  - spec/graboid/entity_spec.rb
134
+ - spec/graboid/scraper_spec.rb
132
135
  - spec/graboid_spec.rb
133
136
  - spec/spec_helper.rb
134
137
  - examples/active_rain_post.rb