graboid 0.3.0 → 0.3.2

Sign up to get free protection for your applications and to get access to all the features.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.3.0
1
+ 0.3.2
@@ -5,7 +5,7 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{graboid}
8
- s.version = "0.3.0"
8
+ s.version = "0.3.2"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Christopher Burnett"]
@@ -14,8 +14,7 @@ Gem::Specification.new do |s|
14
14
  s.email = %q{signalstatic@gmail.com}
15
15
  s.extra_rdoc_files = [
16
16
  "LICENSE",
17
- "README.mdown",
18
- "README.mdown.orig"
17
+ "README.mdown"
19
18
  ]
20
19
  s.files = [
21
20
  ".document",
@@ -29,6 +28,8 @@ Gem::Specification.new do |s|
29
28
  "lib/graboid/entity.rb",
30
29
  "spec/fixtures/graboid.jpg",
31
30
  "spec/fixtures/posts.html",
31
+ "spec/fixtures/server.rb",
32
+ "spec/fixtures/views/posts.erb",
32
33
  "spec/graboid/entity_spec.rb",
33
34
  "spec/graboid_spec.rb",
34
35
  "spec/spec.opts",
@@ -40,7 +41,8 @@ Gem::Specification.new do |s|
40
41
  s.rubygems_version = %q{1.3.7}
41
42
  s.summary = %q{web scraping made easy}
42
43
  s.test_files = [
43
- "spec/graboid/entity_spec.rb",
44
+ "spec/fixtures/server.rb",
45
+ "spec/graboid/entity_spec.rb",
44
46
  "spec/graboid_spec.rb",
45
47
  "spec/spec_helper.rb"
46
48
  ]
@@ -3,8 +3,9 @@ module Graboid
3
3
 
4
4
  def self.included klass
5
5
  klass.class_eval do
6
- extend ClassMethods
6
+ extend ClassMethods
7
7
  include InstanceMethods
8
+
8
9
  write_inheritable_attribute(:attribute_map, {}) if attribute_map.nil?
9
10
  end
10
11
  end
@@ -43,7 +44,15 @@ module Graboid
43
44
  end
44
45
 
45
46
  def doc
46
- Nokogiri::HTML read_source
47
+ eval "Nokogiri::#{self.mode.to_s.upcase}(read_source)"
48
+ end
49
+
50
+ def collection
51
+ @collection ||= []
52
+ end
53
+
54
+ def collection=(col)
55
+ @collection = col
47
56
  end
48
57
 
49
58
  def attribute_map
@@ -57,7 +66,8 @@ module Graboid
57
66
  def hash_map fragment
58
67
  attribute_map.inject({}) do |extracted_hash, at|
59
68
  selector, processor = at.last[:selector], at.last[:processor]
60
- extracted_hash[at.first] = processor.nil? ? fragment.css(selector).first.text : processor.call(fragment.css(selector).first) rescue ""
69
+ node_collection = self.mode == :html ? fragment.css(selector) : fragment.xpath(selector)
70
+ extracted_hash[at.first] = processor.nil? ? node_collection.first.text : processor.call(node_collection.first) rescue ""
61
71
 
62
72
  extracted_hash
63
73
  end
@@ -65,15 +75,15 @@ module Graboid
65
75
 
66
76
  def all_fragments
67
77
  return page_fragments if @pager.nil?
68
- old_source = self.source
69
- @collection = []
78
+ old_source = self.source
70
79
  while next_page?
71
- @frags = page_fragments
72
- @collection += @frags
80
+ self.collection += page_fragments
81
+ run_before_paginate_callbacks
73
82
  paginate
83
+ run_after_paginate_callbacks
74
84
  end
75
85
  self.source = old_source
76
- @collection
86
+ self.collection
77
87
  end
78
88
 
79
89
  def paginate
@@ -83,7 +93,11 @@ module Graboid
83
93
  end
84
94
 
85
95
  def next_page?
86
- (current_page <= max_pages-1)
96
+ if max_pages.zero?
97
+ return true unless @pager.call(doc).nil?
98
+ else
99
+ current_page <= max_pages-1
100
+ end
87
101
  end
88
102
 
89
103
  def page_fragments
@@ -91,13 +105,20 @@ module Graboid
91
105
  end
92
106
 
93
107
  def all opts={}
108
+ reset_context
94
109
  self.max_pages = opts[:max_pages] if opts[:max_pages].present?
95
110
  all_fragments.collect{ |frag| extract_instance(frag) }
96
111
  end
97
112
 
113
+ def reset_context
114
+ self.collection = []
115
+ self.current_page = 0
116
+ self.max_pages = 0
117
+ end
118
+
98
119
  def read_source
99
120
  case self.source
100
- when /^http:\/\//
121
+ when /^http[s]?:\/\//
101
122
  open self.source
102
123
  when String
103
124
  self.source
@@ -108,6 +129,15 @@ module Graboid
108
129
  @pager = block
109
130
  end
110
131
 
132
+ def mode
133
+ @mode ||= :html
134
+ end
135
+
136
+ def mode=(m)
137
+ raise ArgumentError unless [:html, :xml].include?(m)
138
+ @mode = m
139
+ end
140
+
111
141
  def max_pages
112
142
  @max_pages ||= 0
113
143
  end
@@ -124,6 +154,21 @@ module Graboid
124
154
  @current_page = num
125
155
  end
126
156
 
157
+ instance_eval do
158
+ [:before, :after].each do |prefix|
159
+ [:paginate, :extract].each do |suffix|
160
+ method_name = "#{prefix}_#{suffix}"
161
+ define_method method_name.to_sym do |&block|
162
+ instance_variable_set "@#{method_name}", block
163
+ end
164
+ define_method "run_#{method_name}_callbacks" do
165
+ ivar = instance_variable_get("@#{method_name}")
166
+ self.class_eval { ivar.call } unless ivar.nil?
167
+ end
168
+ end
169
+ end
170
+ end
171
+
127
172
  end # ClassMethods
128
173
 
129
174
  module InstanceMethods
@@ -0,0 +1,8 @@
1
+ %w{rubygems sinatra}.each {|f| require f }
2
+
3
+ get "/posts" do
4
+ @total_pages = 8
5
+ @page = params[:page].to_i || 1
6
+ @limit = 2
7
+ erb :posts
8
+ end
@@ -0,0 +1,37 @@
1
+ <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
2
+ "http://www.w3.org/TR/html4/strict.dtd">
3
+
4
+ <html lang="en">
5
+ <head>
6
+ <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
7
+ <title>posts</title>
8
+ <meta name="generator" content="TextMate http://macromates.com/">
9
+ <meta name="author" content="Posterous">
10
+ <!-- Date: 2010-06-10 -->
11
+ </head>
12
+ <body>
13
+
14
+ <% @limit.times do |num| %>
15
+
16
+ <div class="post" id="<%= num + (@page*@limit)-1 %>">
17
+
18
+ <p class="title">Post <%= num + (@page*@limit)-1 %></p>
19
+
20
+ <p class="body">Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor
21
+ incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation
22
+ ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit
23
+ in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat
24
+ non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
25
+ </p>
26
+ <span class="author">Someone Awesome (06/11/2010)</span>
27
+
28
+ </div>
29
+
30
+ <% end %>
31
+
32
+ <% if @page < @total_pages %>
33
+ <a class="next" href="/posts?page=<%= @page.next %>">next</a>
34
+ <% end %>
35
+
36
+ </body>
37
+ </html>
@@ -1,11 +1,5 @@
1
1
  require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
2
2
 
3
- class Post
4
- include Graboid::Entity
5
-
6
- selector '.post'
7
- end
8
-
9
3
  describe Graboid::Entity do
10
4
  describe "#source" do
11
5
  describe "when url" do
@@ -112,12 +106,6 @@ describe Graboid::Entity do
112
106
 
113
107
  describe "#all_fragments" do
114
108
  before(:each) do
115
-
116
- class WorkingPost
117
- include Graboid::Entity
118
- selector '.post'
119
- set :body
120
- end
121
109
 
122
110
  WorkingPost.source = POSTS_HTML_STR
123
111
  @fragments = WorkingPost.all_fragments
@@ -136,17 +124,7 @@ describe Graboid::Entity do
136
124
  describe "#extract_instance" do
137
125
 
138
126
  before(:each) do
139
- class WorkingPost
140
- include Graboid::Entity
141
- selector '.post'
142
- set :title
143
- set :body
144
- set :author
145
- set :date, :selector => '.author', :processor => lambda {|frag| frag.text.match(/\((.*)\)/)[1] }
146
- end
147
-
148
127
  @instance = WorkingPost.extract_instance(POST_FRAGMENT)
149
-
150
128
  end
151
129
 
152
130
  it "should return a WorkingPost instance" do
@@ -165,17 +143,7 @@ describe Graboid::Entity do
165
143
 
166
144
  describe "#all" do
167
145
  before(:each) do
168
- class WorkingPost
169
- include Graboid::Entity
170
- selector '.post'
171
- set :title
172
- set :body
173
- set :author
174
- set :date, :selector => '.author', :processor => lambda {|frag| frag.text.match(/\((.*)\)/)[1] }
175
- end
176
-
177
146
  WorkingPost.source = POSTS_HTML_STR
178
-
179
147
  end
180
148
 
181
149
  it "should return 2 WorkingPosts" do
@@ -196,32 +164,42 @@ describe Graboid::Entity do
196
164
  end
197
165
  end
198
166
 
167
+ describe "#mode" do
168
+ it "should be html by default" do
169
+ WorkingPost.mode.should == :html
170
+ end
171
+ it "should throw an error for invalid values" do
172
+ lambda {
173
+ WorkingPost.mode = :derp
174
+ }.should raise_error ArgumentError
175
+ end
176
+ it "should change to :xml" do
177
+ WorkingPost.mode = :xml
178
+ WorkingPost.mode.should == :xml
179
+ end
180
+ end
181
+
199
182
  describe "#pager" do
200
- before(:each) do
201
-
202
- class RedditEntry
203
- include Graboid::Entity
204
-
205
- selector '.entry'
206
-
207
- set :title
208
- set :domain, :selector => '.domain a'
209
- set :link, :selector => '.title' do |entry|
210
- entry.css('a').first['href']
211
- end
212
-
213
- pager do |doc|
214
- doc.css('p.nextprev a').select{|a| a.text =~ /next/i }.first['href']
215
- end
216
-
183
+ describe "with a limit" do
184
+ before(:each) do
185
+ PostWithPager.source = 'http://localhost:9393/posts'
186
+ @posts = PostWithPager.all(:max_pages => 3)
187
+ end
188
+ it "should get 2 posts" do
189
+ @posts.length.should == 6
217
190
  end
218
- RedditEntry.source = 'http://reddit.com'
219
- @posts = RedditEntry.all(:max_pages => 2)
220
191
  end
221
- it "should get 70 posts" do
222
- @posts.length.should == 70
192
+
193
+ describe "without a limit" do
194
+ before(:each) do
195
+ PostWithPager.source = 'http://localhost:9393/posts'
196
+ @posts = PostWithPager.all
197
+ end
198
+ it "should get 2 posts" do
199
+ @posts.length.should == 16
200
+ end
223
201
  end
202
+
224
203
  end
225
-
226
-
204
+
227
205
  end
@@ -8,7 +8,50 @@ Spec::Runner.configure do |config|
8
8
 
9
9
  end
10
10
 
11
- file_path = File.expand_path(File.dirname(__FILE__)+'/fixtures/posts.html')
12
- POSTS_HTML_STR = File.read(file_path){|f| f.read }
13
- d = Nokogiri::HTML(POSTS_HTML_STR)
14
- POST_FRAGMENT = Nokogiri::HTML::fragment(d.css('.post').first.to_html)
11
+ FIXTURE_PATH = File.expand_path(File.dirname(__FILE__)+'/fixtures/posts.html')
12
+ POSTS_HTML_STR = File.read(FIXTURE_PATH){|f| f.read }
13
+ POST_DOC = Nokogiri::HTML(POSTS_HTML_STR)
14
+ POST_FRAGMENT = Nokogiri::HTML::fragment(POST_DOC.css('.post').first.to_html)
15
+
16
+ class Post
17
+ include Graboid::Entity
18
+
19
+ selector '.post'
20
+ end
21
+
22
+ class WorkingPost
23
+ include Graboid::Entity
24
+
25
+ selector '.post'
26
+
27
+ set :title
28
+ set :body
29
+ set :author
30
+ set :date, :selector => '.author' do |elm|
31
+ elm.text.match(/\((.*)\)/)[1]
32
+ end
33
+ end
34
+
35
+ class PostWithPager
36
+ include Graboid::Entity
37
+
38
+ selector '.post'
39
+
40
+ set :title
41
+ set :body
42
+ set :author
43
+ set :date, :selector => '.author' do |elm|
44
+ elm.text.match(/\((.*)\)/)[1]
45
+ end
46
+
47
+ pager do |doc|
48
+ link = 'http://localhost:9393'+doc.css('a.next').first['href'] rescue nil
49
+ #puts link.inspect
50
+ link
51
+ end
52
+
53
+ before_paginate do
54
+ puts "page: #{self.source}"
55
+ end
56
+
57
+ end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: graboid
3
3
  version: !ruby/object:Gem::Version
4
- hash: 19
4
+ hash: 23
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
8
  - 3
9
- - 0
10
- version: 0.3.0
9
+ - 2
10
+ version: 0.3.2
11
11
  platform: ruby
12
12
  authors:
13
13
  - Christopher Burnett
@@ -71,7 +71,6 @@ extensions: []
71
71
  extra_rdoc_files:
72
72
  - LICENSE
73
73
  - README.mdown
74
- - README.mdown.orig
75
74
  files:
76
75
  - .document
77
76
  - .gitignore
@@ -84,11 +83,12 @@ files:
84
83
  - lib/graboid/entity.rb
85
84
  - spec/fixtures/graboid.jpg
86
85
  - spec/fixtures/posts.html
86
+ - spec/fixtures/server.rb
87
+ - spec/fixtures/views/posts.erb
87
88
  - spec/graboid/entity_spec.rb
88
89
  - spec/graboid_spec.rb
89
90
  - spec/spec.opts
90
91
  - spec/spec_helper.rb
91
- - README.mdown.orig
92
92
  has_rdoc: true
93
93
  homepage: http://github.com/twoism/graboid
94
94
  licenses: []
@@ -124,6 +124,7 @@ signing_key:
124
124
  specification_version: 3
125
125
  summary: web scraping made easy
126
126
  test_files:
127
+ - spec/fixtures/server.rb
127
128
  - spec/graboid/entity_spec.rb
128
129
  - spec/graboid_spec.rb
129
130
  - spec/spec_helper.rb
@@ -1,61 +0,0 @@
1
- ### Graboid ###
2
-
3
- ![Graboid](http://github.com/twoism/graboid/raw/master/spec/fixtures/graboid.jpg "Graboid")
4
-
5
- Simply awesome web scraping. Better docs later. See specs.
6
-
7
- ### Installation ###
8
-
9
-
10
- gem install nokogiri graboid
11
-
12
-
13
- ### Usage ###
14
-
15
- %w{rubygems graboid}.each { |f| require f }
16
-
17
- class RedditEntry
18
- include Graboid::Entity
19
-
20
- selector '.entry'
21
- <<<<<<< HEAD
22
-
23
- set :title
24
- set :domain, :selector => '.domain a'
25
- set :link, :selector => '.title' do |entry|
26
- =======
27
-
28
- field :title
29
- field :domain, :selector => '.domain a'
30
- field :link, :selector => '.title' do |entry|
31
- >>>>>>> ea3c69202c1af78378fd4fb4b2d9ccd2098bc9d8
32
- entry.css('a').first['href']
33
- end
34
-
35
- pager do |doc|
36
- doc.css('p.nextprev a').select{|a| a.text =~ /next/i }.first['href']
37
- end
38
-
39
- end
40
-
41
- RedditEntry.source = 'http://reddit.com'
42
-
43
- RedditEntry.all(:max_pages => 2).each do |p|
44
- puts "title: #{p.title}"
45
- puts "domain: #{p.domain}"
46
- puts "link: #{p.link}"
47
- end
48
-
49
- ##Note on Patches/Pull Requests
50
-
51
- * Fork the project.
52
- * Make your feature addition or bug fix.
53
- * Add tests for it. This is important so I don't break it in a
54
- future version unintentionally.
55
- * Commit, do not mess with rakefile, version, or history.
56
- (if you want to have your own version, that is fine but bump version in a commit by itself I can ignore when I pull)
57
- * Send me a pull request. Bonus points for topic branches.
58
-
59
- ## Copyright
60
-
61
- Copyright (c) 2010 Christopher Burnett. See LICENSE for details.