graboid 0.3.0 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.3.0
1
+ 0.3.2
@@ -5,7 +5,7 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{graboid}
8
- s.version = "0.3.0"
8
+ s.version = "0.3.2"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Christopher Burnett"]
@@ -14,8 +14,7 @@ Gem::Specification.new do |s|
14
14
  s.email = %q{signalstatic@gmail.com}
15
15
  s.extra_rdoc_files = [
16
16
  "LICENSE",
17
- "README.mdown",
18
- "README.mdown.orig"
17
+ "README.mdown"
19
18
  ]
20
19
  s.files = [
21
20
  ".document",
@@ -29,6 +28,8 @@ Gem::Specification.new do |s|
29
28
  "lib/graboid/entity.rb",
30
29
  "spec/fixtures/graboid.jpg",
31
30
  "spec/fixtures/posts.html",
31
+ "spec/fixtures/server.rb",
32
+ "spec/fixtures/views/posts.erb",
32
33
  "spec/graboid/entity_spec.rb",
33
34
  "spec/graboid_spec.rb",
34
35
  "spec/spec.opts",
@@ -40,7 +41,8 @@ Gem::Specification.new do |s|
40
41
  s.rubygems_version = %q{1.3.7}
41
42
  s.summary = %q{web scraping made easy}
42
43
  s.test_files = [
43
- "spec/graboid/entity_spec.rb",
44
+ "spec/fixtures/server.rb",
45
+ "spec/graboid/entity_spec.rb",
44
46
  "spec/graboid_spec.rb",
45
47
  "spec/spec_helper.rb"
46
48
  ]
@@ -3,8 +3,9 @@ module Graboid
3
3
 
4
4
  def self.included klass
5
5
  klass.class_eval do
6
- extend ClassMethods
6
+ extend ClassMethods
7
7
  include InstanceMethods
8
+
8
9
  write_inheritable_attribute(:attribute_map, {}) if attribute_map.nil?
9
10
  end
10
11
  end
@@ -43,7 +44,15 @@ module Graboid
43
44
  end
44
45
 
45
46
  def doc
46
- Nokogiri::HTML read_source
47
+ eval "Nokogiri::#{self.mode.to_s.upcase}(read_source)"
48
+ end
49
+
50
+ def collection
51
+ @collection ||= []
52
+ end
53
+
54
+ def collection=(col)
55
+ @collection = col
47
56
  end
48
57
 
49
58
  def attribute_map
@@ -57,7 +66,8 @@ module Graboid
57
66
  def hash_map fragment
58
67
  attribute_map.inject({}) do |extracted_hash, at|
59
68
  selector, processor = at.last[:selector], at.last[:processor]
60
- extracted_hash[at.first] = processor.nil? ? fragment.css(selector).first.text : processor.call(fragment.css(selector).first) rescue ""
69
+ node_collection = self.mode == :html ? fragment.css(selector) : fragment.xpath(selector)
70
+ extracted_hash[at.first] = processor.nil? ? node_collection.first.text : processor.call(node_collection.first) rescue ""
61
71
 
62
72
  extracted_hash
63
73
  end
@@ -65,15 +75,15 @@ module Graboid
65
75
 
66
76
  def all_fragments
67
77
  return page_fragments if @pager.nil?
68
- old_source = self.source
69
- @collection = []
78
+ old_source = self.source
70
79
  while next_page?
71
- @frags = page_fragments
72
- @collection += @frags
80
+ self.collection += page_fragments
81
+ run_before_paginate_callbacks
73
82
  paginate
83
+ run_after_paginate_callbacks
74
84
  end
75
85
  self.source = old_source
76
- @collection
86
+ self.collection
77
87
  end
78
88
 
79
89
  def paginate
@@ -83,7 +93,11 @@ module Graboid
83
93
  end
84
94
 
85
95
  def next_page?
86
- (current_page <= max_pages-1)
96
+ if max_pages.zero?
97
+ return true unless @pager.call(doc).nil?
98
+ else
99
+ current_page <= max_pages-1
100
+ end
87
101
  end
88
102
 
89
103
  def page_fragments
@@ -91,13 +105,20 @@ module Graboid
91
105
  end
92
106
 
93
107
  def all opts={}
108
+ reset_context
94
109
  self.max_pages = opts[:max_pages] if opts[:max_pages].present?
95
110
  all_fragments.collect{ |frag| extract_instance(frag) }
96
111
  end
97
112
 
113
+ def reset_context
114
+ self.collection = []
115
+ self.current_page = 0
116
+ self.max_pages = 0
117
+ end
118
+
98
119
  def read_source
99
120
  case self.source
100
- when /^http:\/\//
121
+ when /^http[s]?:\/\//
101
122
  open self.source
102
123
  when String
103
124
  self.source
@@ -108,6 +129,15 @@ module Graboid
108
129
  @pager = block
109
130
  end
110
131
 
132
+ def mode
133
+ @mode ||= :html
134
+ end
135
+
136
+ def mode=(m)
137
+ raise ArgumentError unless [:html, :xml].include?(m)
138
+ @mode = m
139
+ end
140
+
111
141
  def max_pages
112
142
  @max_pages ||= 0
113
143
  end
@@ -124,6 +154,21 @@ module Graboid
124
154
  @current_page = num
125
155
  end
126
156
 
157
+ instance_eval do
158
+ [:before, :after].each do |prefix|
159
+ [:paginate, :extract].each do |suffix|
160
+ method_name = "#{prefix}_#{suffix}"
161
+ define_method method_name.to_sym do |&block|
162
+ instance_variable_set "@#{method_name}", block
163
+ end
164
+ define_method "run_#{method_name}_callbacks" do
165
+ ivar = instance_variable_get("@#{method_name}")
166
+ self.class_eval { ivar.call } unless ivar.nil?
167
+ end
168
+ end
169
+ end
170
+ end
171
+
127
172
  end # ClassMethods
128
173
 
129
174
  module InstanceMethods
@@ -0,0 +1,8 @@
1
+ %w{rubygems sinatra}.each {|f| require f }
2
+
3
+ get "/posts" do
4
+ @total_pages = 8
5
+ @page = params[:page].to_i || 1
6
+ @limit = 2
7
+ erb :posts
8
+ end
@@ -0,0 +1,37 @@
1
+ <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
2
+ "http://www.w3.org/TR/html4/strict.dtd">
3
+
4
+ <html lang="en">
5
+ <head>
6
+ <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
7
+ <title>posts</title>
8
+ <meta name="generator" content="TextMate http://macromates.com/">
9
+ <meta name="author" content="Posterous">
10
+ <!-- Date: 2010-06-10 -->
11
+ </head>
12
+ <body>
13
+
14
+ <% @limit.times do |num| %>
15
+
16
+ <div class="post" id="<%= num + (@page*@limit)-1 %>">
17
+
18
+ <p class="title">Post <%= num + (@page*@limit)-1 %></p>
19
+
20
+ <p class="body">Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor
21
+ incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation
22
+ ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit
23
+ in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat
24
+ non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
25
+ </p>
26
+ <span class="author">Someone Awesome (06/11/2010)</span>
27
+
28
+ </div>
29
+
30
+ <% end %>
31
+
32
+ <% if @page < @total_pages %>
33
+ <a class="next" href="/posts?page=<%= @page.next %>">next</a>
34
+ <% end %>
35
+
36
+ </body>
37
+ </html>
@@ -1,11 +1,5 @@
1
1
  require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
2
2
 
3
- class Post
4
- include Graboid::Entity
5
-
6
- selector '.post'
7
- end
8
-
9
3
  describe Graboid::Entity do
10
4
  describe "#source" do
11
5
  describe "when url" do
@@ -112,12 +106,6 @@ describe Graboid::Entity do
112
106
 
113
107
  describe "#all_fragments" do
114
108
  before(:each) do
115
-
116
- class WorkingPost
117
- include Graboid::Entity
118
- selector '.post'
119
- set :body
120
- end
121
109
 
122
110
  WorkingPost.source = POSTS_HTML_STR
123
111
  @fragments = WorkingPost.all_fragments
@@ -136,17 +124,7 @@ describe Graboid::Entity do
136
124
  describe "#extract_instance" do
137
125
 
138
126
  before(:each) do
139
- class WorkingPost
140
- include Graboid::Entity
141
- selector '.post'
142
- set :title
143
- set :body
144
- set :author
145
- set :date, :selector => '.author', :processor => lambda {|frag| frag.text.match(/\((.*)\)/)[1] }
146
- end
147
-
148
127
  @instance = WorkingPost.extract_instance(POST_FRAGMENT)
149
-
150
128
  end
151
129
 
152
130
  it "should return a WorkingPost instance" do
@@ -165,17 +143,7 @@ describe Graboid::Entity do
165
143
 
166
144
  describe "#all" do
167
145
  before(:each) do
168
- class WorkingPost
169
- include Graboid::Entity
170
- selector '.post'
171
- set :title
172
- set :body
173
- set :author
174
- set :date, :selector => '.author', :processor => lambda {|frag| frag.text.match(/\((.*)\)/)[1] }
175
- end
176
-
177
146
  WorkingPost.source = POSTS_HTML_STR
178
-
179
147
  end
180
148
 
181
149
  it "should return 2 WorkingPosts" do
@@ -196,32 +164,42 @@ describe Graboid::Entity do
196
164
  end
197
165
  end
198
166
 
167
+ describe "#mode" do
168
+ it "should be html by default" do
169
+ WorkingPost.mode.should == :html
170
+ end
171
+ it "should throw an error for invalid values" do
172
+ lambda {
173
+ WorkingPost.mode = :derp
174
+ }.should raise_error ArgumentError
175
+ end
176
+ it "should change to :xml" do
177
+ WorkingPost.mode = :xml
178
+ WorkingPost.mode.should == :xml
179
+ end
180
+ end
181
+
199
182
  describe "#pager" do
200
- before(:each) do
201
-
202
- class RedditEntry
203
- include Graboid::Entity
204
-
205
- selector '.entry'
206
-
207
- set :title
208
- set :domain, :selector => '.domain a'
209
- set :link, :selector => '.title' do |entry|
210
- entry.css('a').first['href']
211
- end
212
-
213
- pager do |doc|
214
- doc.css('p.nextprev a').select{|a| a.text =~ /next/i }.first['href']
215
- end
216
-
183
+ describe "with a limit" do
184
+ before(:each) do
185
+ PostWithPager.source = 'http://localhost:9393/posts'
186
+ @posts = PostWithPager.all(:max_pages => 3)
187
+ end
188
+ it "should get 2 posts" do
189
+ @posts.length.should == 6
217
190
  end
218
- RedditEntry.source = 'http://reddit.com'
219
- @posts = RedditEntry.all(:max_pages => 2)
220
191
  end
221
- it "should get 70 posts" do
222
- @posts.length.should == 70
192
+
193
+ describe "without a limit" do
194
+ before(:each) do
195
+ PostWithPager.source = 'http://localhost:9393/posts'
196
+ @posts = PostWithPager.all
197
+ end
198
+ it "should get 2 posts" do
199
+ @posts.length.should == 16
200
+ end
223
201
  end
202
+
224
203
  end
225
-
226
-
204
+
227
205
  end
@@ -8,7 +8,50 @@ Spec::Runner.configure do |config|
8
8
 
9
9
  end
10
10
 
11
- file_path = File.expand_path(File.dirname(__FILE__)+'/fixtures/posts.html')
12
- POSTS_HTML_STR = File.read(file_path){|f| f.read }
13
- d = Nokogiri::HTML(POSTS_HTML_STR)
14
- POST_FRAGMENT = Nokogiri::HTML::fragment(d.css('.post').first.to_html)
11
+ FIXTURE_PATH = File.expand_path(File.dirname(__FILE__)+'/fixtures/posts.html')
12
+ POSTS_HTML_STR = File.read(FIXTURE_PATH){|f| f.read }
13
+ POST_DOC = Nokogiri::HTML(POSTS_HTML_STR)
14
+ POST_FRAGMENT = Nokogiri::HTML::fragment(POST_DOC.css('.post').first.to_html)
15
+
16
+ class Post
17
+ include Graboid::Entity
18
+
19
+ selector '.post'
20
+ end
21
+
22
+ class WorkingPost
23
+ include Graboid::Entity
24
+
25
+ selector '.post'
26
+
27
+ set :title
28
+ set :body
29
+ set :author
30
+ set :date, :selector => '.author' do |elm|
31
+ elm.text.match(/\((.*)\)/)[1]
32
+ end
33
+ end
34
+
35
+ class PostWithPager
36
+ include Graboid::Entity
37
+
38
+ selector '.post'
39
+
40
+ set :title
41
+ set :body
42
+ set :author
43
+ set :date, :selector => '.author' do |elm|
44
+ elm.text.match(/\((.*)\)/)[1]
45
+ end
46
+
47
+ pager do |doc|
48
+ link = 'http://localhost:9393'+doc.css('a.next').first['href'] rescue nil
49
+ #puts link.inspect
50
+ link
51
+ end
52
+
53
+ before_paginate do
54
+ puts "page: #{self.source}"
55
+ end
56
+
57
+ end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: graboid
3
3
  version: !ruby/object:Gem::Version
4
- hash: 19
4
+ hash: 23
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
8
  - 3
9
- - 0
10
- version: 0.3.0
9
+ - 2
10
+ version: 0.3.2
11
11
  platform: ruby
12
12
  authors:
13
13
  - Christopher Burnett
@@ -71,7 +71,6 @@ extensions: []
71
71
  extra_rdoc_files:
72
72
  - LICENSE
73
73
  - README.mdown
74
- - README.mdown.orig
75
74
  files:
76
75
  - .document
77
76
  - .gitignore
@@ -84,11 +83,12 @@ files:
84
83
  - lib/graboid/entity.rb
85
84
  - spec/fixtures/graboid.jpg
86
85
  - spec/fixtures/posts.html
86
+ - spec/fixtures/server.rb
87
+ - spec/fixtures/views/posts.erb
87
88
  - spec/graboid/entity_spec.rb
88
89
  - spec/graboid_spec.rb
89
90
  - spec/spec.opts
90
91
  - spec/spec_helper.rb
91
- - README.mdown.orig
92
92
  has_rdoc: true
93
93
  homepage: http://github.com/twoism/graboid
94
94
  licenses: []
@@ -124,6 +124,7 @@ signing_key:
124
124
  specification_version: 3
125
125
  summary: web scraping made easy
126
126
  test_files:
127
+ - spec/fixtures/server.rb
127
128
  - spec/graboid/entity_spec.rb
128
129
  - spec/graboid_spec.rb
129
130
  - spec/spec_helper.rb
@@ -1,61 +0,0 @@
1
- ### Graboid ###
2
-
3
- ![Graboid](http://github.com/twoism/graboid/raw/master/spec/fixtures/graboid.jpg "Graboid")
4
-
5
- Simply awesome web scraping. Better docs later. See specs.
6
-
7
- ### Installation ###
8
-
9
-
10
- gem install nokogiri graboid
11
-
12
-
13
- ### Usage ###
14
-
15
- %w{rubygems graboid}.each { |f| require f }
16
-
17
- class RedditEntry
18
- include Graboid::Entity
19
-
20
- selector '.entry'
21
- <<<<<<< HEAD
22
-
23
- set :title
24
- set :domain, :selector => '.domain a'
25
- set :link, :selector => '.title' do |entry|
26
- =======
27
-
28
- field :title
29
- field :domain, :selector => '.domain a'
30
- field :link, :selector => '.title' do |entry|
31
- >>>>>>> ea3c69202c1af78378fd4fb4b2d9ccd2098bc9d8
32
- entry.css('a').first['href']
33
- end
34
-
35
- pager do |doc|
36
- doc.css('p.nextprev a').select{|a| a.text =~ /next/i }.first['href']
37
- end
38
-
39
- end
40
-
41
- RedditEntry.source = 'http://reddit.com'
42
-
43
- RedditEntry.all(:max_pages => 2).each do |p|
44
- puts "title: #{p.title}"
45
- puts "domain: #{p.domain}"
46
- puts "link: #{p.link}"
47
- end
48
-
49
- ##Note on Patches/Pull Requests
50
-
51
- * Fork the project.
52
- * Make your feature addition or bug fix.
53
- * Add tests for it. This is important so I don't break it in a
54
- future version unintentionally.
55
- * Commit, do not mess with rakefile, version, or history.
56
- (if you want to have your own version, that is fine but bump version in a commit by itself I can ignore when I pull)
57
- * Send me a pull request. Bonus points for topic branches.
58
-
59
- ## Copyright
60
-
61
- Copyright (c) 2010 Christopher Burnett. See LICENSE for details.