graboid 0.3.0 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/VERSION +1 -1
- data/graboid.gemspec +6 -4
- data/lib/graboid/entity.rb +55 -10
- data/spec/fixtures/server.rb +8 -0
- data/spec/fixtures/views/posts.erb +37 -0
- data/spec/graboid/entity_spec.rb +33 -55
- data/spec/spec_helper.rb +47 -4
- metadata +6 -5
- data/README.mdown.orig +0 -61
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.3.
|
1
|
+
0.3.2
|
data/graboid.gemspec
CHANGED
@@ -5,7 +5,7 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{graboid}
|
8
|
-
s.version = "0.3.
|
8
|
+
s.version = "0.3.2"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Christopher Burnett"]
|
@@ -14,8 +14,7 @@ Gem::Specification.new do |s|
|
|
14
14
|
s.email = %q{signalstatic@gmail.com}
|
15
15
|
s.extra_rdoc_files = [
|
16
16
|
"LICENSE",
|
17
|
-
"README.mdown"
|
18
|
-
"README.mdown.orig"
|
17
|
+
"README.mdown"
|
19
18
|
]
|
20
19
|
s.files = [
|
21
20
|
".document",
|
@@ -29,6 +28,8 @@ Gem::Specification.new do |s|
|
|
29
28
|
"lib/graboid/entity.rb",
|
30
29
|
"spec/fixtures/graboid.jpg",
|
31
30
|
"spec/fixtures/posts.html",
|
31
|
+
"spec/fixtures/server.rb",
|
32
|
+
"spec/fixtures/views/posts.erb",
|
32
33
|
"spec/graboid/entity_spec.rb",
|
33
34
|
"spec/graboid_spec.rb",
|
34
35
|
"spec/spec.opts",
|
@@ -40,7 +41,8 @@ Gem::Specification.new do |s|
|
|
40
41
|
s.rubygems_version = %q{1.3.7}
|
41
42
|
s.summary = %q{web scraping made easy}
|
42
43
|
s.test_files = [
|
43
|
-
"spec/
|
44
|
+
"spec/fixtures/server.rb",
|
45
|
+
"spec/graboid/entity_spec.rb",
|
44
46
|
"spec/graboid_spec.rb",
|
45
47
|
"spec/spec_helper.rb"
|
46
48
|
]
|
data/lib/graboid/entity.rb
CHANGED
@@ -3,8 +3,9 @@ module Graboid
|
|
3
3
|
|
4
4
|
def self.included klass
|
5
5
|
klass.class_eval do
|
6
|
-
extend
|
6
|
+
extend ClassMethods
|
7
7
|
include InstanceMethods
|
8
|
+
|
8
9
|
write_inheritable_attribute(:attribute_map, {}) if attribute_map.nil?
|
9
10
|
end
|
10
11
|
end
|
@@ -43,7 +44,15 @@ module Graboid
|
|
43
44
|
end
|
44
45
|
|
45
46
|
def doc
|
46
|
-
Nokogiri
|
47
|
+
eval "Nokogiri::#{self.mode.to_s.upcase}(read_source)"
|
48
|
+
end
|
49
|
+
|
50
|
+
def collection
|
51
|
+
@collection ||= []
|
52
|
+
end
|
53
|
+
|
54
|
+
def collection=(col)
|
55
|
+
@collection = col
|
47
56
|
end
|
48
57
|
|
49
58
|
def attribute_map
|
@@ -57,7 +66,8 @@ module Graboid
|
|
57
66
|
def hash_map fragment
|
58
67
|
attribute_map.inject({}) do |extracted_hash, at|
|
59
68
|
selector, processor = at.last[:selector], at.last[:processor]
|
60
|
-
|
69
|
+
node_collection = self.mode == :html ? fragment.css(selector) : fragment.xpath(selector)
|
70
|
+
extracted_hash[at.first] = processor.nil? ? node_collection.first.text : processor.call(node_collection.first) rescue ""
|
61
71
|
|
62
72
|
extracted_hash
|
63
73
|
end
|
@@ -65,15 +75,15 @@ module Graboid
|
|
65
75
|
|
66
76
|
def all_fragments
|
67
77
|
return page_fragments if @pager.nil?
|
68
|
-
old_source
|
69
|
-
@collection = []
|
78
|
+
old_source = self.source
|
70
79
|
while next_page?
|
71
|
-
|
72
|
-
|
80
|
+
self.collection += page_fragments
|
81
|
+
run_before_paginate_callbacks
|
73
82
|
paginate
|
83
|
+
run_after_paginate_callbacks
|
74
84
|
end
|
75
85
|
self.source = old_source
|
76
|
-
|
86
|
+
self.collection
|
77
87
|
end
|
78
88
|
|
79
89
|
def paginate
|
@@ -83,7 +93,11 @@ module Graboid
|
|
83
93
|
end
|
84
94
|
|
85
95
|
def next_page?
|
86
|
-
|
96
|
+
if max_pages.zero?
|
97
|
+
return true unless @pager.call(doc).nil?
|
98
|
+
else
|
99
|
+
current_page <= max_pages-1
|
100
|
+
end
|
87
101
|
end
|
88
102
|
|
89
103
|
def page_fragments
|
@@ -91,13 +105,20 @@ module Graboid
|
|
91
105
|
end
|
92
106
|
|
93
107
|
def all opts={}
|
108
|
+
reset_context
|
94
109
|
self.max_pages = opts[:max_pages] if opts[:max_pages].present?
|
95
110
|
all_fragments.collect{ |frag| extract_instance(frag) }
|
96
111
|
end
|
97
112
|
|
113
|
+
def reset_context
|
114
|
+
self.collection = []
|
115
|
+
self.current_page = 0
|
116
|
+
self.max_pages = 0
|
117
|
+
end
|
118
|
+
|
98
119
|
def read_source
|
99
120
|
case self.source
|
100
|
-
when /^http
|
121
|
+
when /^http[s]?:\/\//
|
101
122
|
open self.source
|
102
123
|
when String
|
103
124
|
self.source
|
@@ -108,6 +129,15 @@ module Graboid
|
|
108
129
|
@pager = block
|
109
130
|
end
|
110
131
|
|
132
|
+
def mode
|
133
|
+
@mode ||= :html
|
134
|
+
end
|
135
|
+
|
136
|
+
def mode=(m)
|
137
|
+
raise ArgumentError unless [:html, :xml].include?(m)
|
138
|
+
@mode = m
|
139
|
+
end
|
140
|
+
|
111
141
|
def max_pages
|
112
142
|
@max_pages ||= 0
|
113
143
|
end
|
@@ -124,6 +154,21 @@ module Graboid
|
|
124
154
|
@current_page = num
|
125
155
|
end
|
126
156
|
|
157
|
+
instance_eval do
|
158
|
+
[:before, :after].each do |prefix|
|
159
|
+
[:paginate, :extract].each do |suffix|
|
160
|
+
method_name = "#{prefix}_#{suffix}"
|
161
|
+
define_method method_name.to_sym do |&block|
|
162
|
+
instance_variable_set "@#{method_name}", block
|
163
|
+
end
|
164
|
+
define_method "run_#{method_name}_callbacks" do
|
165
|
+
ivar = instance_variable_get("@#{method_name}")
|
166
|
+
self.class_eval { ivar.call } unless ivar.nil?
|
167
|
+
end
|
168
|
+
end
|
169
|
+
end
|
170
|
+
end
|
171
|
+
|
127
172
|
end # ClassMethods
|
128
173
|
|
129
174
|
module InstanceMethods
|
@@ -0,0 +1,37 @@
|
|
1
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
|
2
|
+
"http://www.w3.org/TR/html4/strict.dtd">
|
3
|
+
|
4
|
+
<html lang="en">
|
5
|
+
<head>
|
6
|
+
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
|
7
|
+
<title>posts</title>
|
8
|
+
<meta name="generator" content="TextMate http://macromates.com/">
|
9
|
+
<meta name="author" content="Posterous">
|
10
|
+
<!-- Date: 2010-06-10 -->
|
11
|
+
</head>
|
12
|
+
<body>
|
13
|
+
|
14
|
+
<% @limit.times do |num| %>
|
15
|
+
|
16
|
+
<div class="post" id="<%= num + (@page*@limit)-1 %>">
|
17
|
+
|
18
|
+
<p class="title">Post <%= num + (@page*@limit)-1 %></p>
|
19
|
+
|
20
|
+
<p class="body">Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor
|
21
|
+
incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation
|
22
|
+
ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit
|
23
|
+
in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat
|
24
|
+
non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
|
25
|
+
</p>
|
26
|
+
<span class="author">Someone Awesome (06/11/2010)</span>
|
27
|
+
|
28
|
+
</div>
|
29
|
+
|
30
|
+
<% end %>
|
31
|
+
|
32
|
+
<% if @page < @total_pages %>
|
33
|
+
<a class="next" href="/posts?page=<%= @page.next %>">next</a>
|
34
|
+
<% end %>
|
35
|
+
|
36
|
+
</body>
|
37
|
+
</html>
|
data/spec/graboid/entity_spec.rb
CHANGED
@@ -1,11 +1,5 @@
|
|
1
1
|
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
|
2
2
|
|
3
|
-
class Post
|
4
|
-
include Graboid::Entity
|
5
|
-
|
6
|
-
selector '.post'
|
7
|
-
end
|
8
|
-
|
9
3
|
describe Graboid::Entity do
|
10
4
|
describe "#source" do
|
11
5
|
describe "when url" do
|
@@ -112,12 +106,6 @@ describe Graboid::Entity do
|
|
112
106
|
|
113
107
|
describe "#all_fragments" do
|
114
108
|
before(:each) do
|
115
|
-
|
116
|
-
class WorkingPost
|
117
|
-
include Graboid::Entity
|
118
|
-
selector '.post'
|
119
|
-
set :body
|
120
|
-
end
|
121
109
|
|
122
110
|
WorkingPost.source = POSTS_HTML_STR
|
123
111
|
@fragments = WorkingPost.all_fragments
|
@@ -136,17 +124,7 @@ describe Graboid::Entity do
|
|
136
124
|
describe "#extract_instance" do
|
137
125
|
|
138
126
|
before(:each) do
|
139
|
-
class WorkingPost
|
140
|
-
include Graboid::Entity
|
141
|
-
selector '.post'
|
142
|
-
set :title
|
143
|
-
set :body
|
144
|
-
set :author
|
145
|
-
set :date, :selector => '.author', :processor => lambda {|frag| frag.text.match(/\((.*)\)/)[1] }
|
146
|
-
end
|
147
|
-
|
148
127
|
@instance = WorkingPost.extract_instance(POST_FRAGMENT)
|
149
|
-
|
150
128
|
end
|
151
129
|
|
152
130
|
it "should return a WorkingPost instance" do
|
@@ -165,17 +143,7 @@ describe Graboid::Entity do
|
|
165
143
|
|
166
144
|
describe "#all" do
|
167
145
|
before(:each) do
|
168
|
-
class WorkingPost
|
169
|
-
include Graboid::Entity
|
170
|
-
selector '.post'
|
171
|
-
set :title
|
172
|
-
set :body
|
173
|
-
set :author
|
174
|
-
set :date, :selector => '.author', :processor => lambda {|frag| frag.text.match(/\((.*)\)/)[1] }
|
175
|
-
end
|
176
|
-
|
177
146
|
WorkingPost.source = POSTS_HTML_STR
|
178
|
-
|
179
147
|
end
|
180
148
|
|
181
149
|
it "should return 2 WorkingPosts" do
|
@@ -196,32 +164,42 @@ describe Graboid::Entity do
|
|
196
164
|
end
|
197
165
|
end
|
198
166
|
|
167
|
+
describe "#mode" do
|
168
|
+
it "should be html by default" do
|
169
|
+
WorkingPost.mode.should == :html
|
170
|
+
end
|
171
|
+
it "should throw an error for invalid values" do
|
172
|
+
lambda {
|
173
|
+
WorkingPost.mode = :derp
|
174
|
+
}.should raise_error ArgumentError
|
175
|
+
end
|
176
|
+
it "should change to :xml" do
|
177
|
+
WorkingPost.mode = :xml
|
178
|
+
WorkingPost.mode.should == :xml
|
179
|
+
end
|
180
|
+
end
|
181
|
+
|
199
182
|
describe "#pager" do
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
set :title
|
208
|
-
set :domain, :selector => '.domain a'
|
209
|
-
set :link, :selector => '.title' do |entry|
|
210
|
-
entry.css('a').first['href']
|
211
|
-
end
|
212
|
-
|
213
|
-
pager do |doc|
|
214
|
-
doc.css('p.nextprev a').select{|a| a.text =~ /next/i }.first['href']
|
215
|
-
end
|
216
|
-
|
183
|
+
describe "with a limit" do
|
184
|
+
before(:each) do
|
185
|
+
PostWithPager.source = 'http://localhost:9393/posts'
|
186
|
+
@posts = PostWithPager.all(:max_pages => 3)
|
187
|
+
end
|
188
|
+
it "should get 2 posts" do
|
189
|
+
@posts.length.should == 6
|
217
190
|
end
|
218
|
-
RedditEntry.source = 'http://reddit.com'
|
219
|
-
@posts = RedditEntry.all(:max_pages => 2)
|
220
191
|
end
|
221
|
-
|
222
|
-
|
192
|
+
|
193
|
+
describe "without a limit" do
|
194
|
+
before(:each) do
|
195
|
+
PostWithPager.source = 'http://localhost:9393/posts'
|
196
|
+
@posts = PostWithPager.all
|
197
|
+
end
|
198
|
+
it "should get 2 posts" do
|
199
|
+
@posts.length.should == 16
|
200
|
+
end
|
223
201
|
end
|
202
|
+
|
224
203
|
end
|
225
|
-
|
226
|
-
|
204
|
+
|
227
205
|
end
|
data/spec/spec_helper.rb
CHANGED
@@ -8,7 +8,50 @@ Spec::Runner.configure do |config|
|
|
8
8
|
|
9
9
|
end
|
10
10
|
|
11
|
-
|
12
|
-
POSTS_HTML_STR = File.read(
|
13
|
-
|
14
|
-
POST_FRAGMENT = Nokogiri::HTML::fragment(
|
11
|
+
FIXTURE_PATH = File.expand_path(File.dirname(__FILE__)+'/fixtures/posts.html')
|
12
|
+
POSTS_HTML_STR = File.read(FIXTURE_PATH){|f| f.read }
|
13
|
+
POST_DOC = Nokogiri::HTML(POSTS_HTML_STR)
|
14
|
+
POST_FRAGMENT = Nokogiri::HTML::fragment(POST_DOC.css('.post').first.to_html)
|
15
|
+
|
16
|
+
class Post
|
17
|
+
include Graboid::Entity
|
18
|
+
|
19
|
+
selector '.post'
|
20
|
+
end
|
21
|
+
|
22
|
+
class WorkingPost
|
23
|
+
include Graboid::Entity
|
24
|
+
|
25
|
+
selector '.post'
|
26
|
+
|
27
|
+
set :title
|
28
|
+
set :body
|
29
|
+
set :author
|
30
|
+
set :date, :selector => '.author' do |elm|
|
31
|
+
elm.text.match(/\((.*)\)/)[1]
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
class PostWithPager
|
36
|
+
include Graboid::Entity
|
37
|
+
|
38
|
+
selector '.post'
|
39
|
+
|
40
|
+
set :title
|
41
|
+
set :body
|
42
|
+
set :author
|
43
|
+
set :date, :selector => '.author' do |elm|
|
44
|
+
elm.text.match(/\((.*)\)/)[1]
|
45
|
+
end
|
46
|
+
|
47
|
+
pager do |doc|
|
48
|
+
link = 'http://localhost:9393'+doc.css('a.next').first['href'] rescue nil
|
49
|
+
#puts link.inspect
|
50
|
+
link
|
51
|
+
end
|
52
|
+
|
53
|
+
before_paginate do
|
54
|
+
puts "page: #{self.source}"
|
55
|
+
end
|
56
|
+
|
57
|
+
end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: graboid
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 23
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 3
|
9
|
-
-
|
10
|
-
version: 0.3.
|
9
|
+
- 2
|
10
|
+
version: 0.3.2
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Christopher Burnett
|
@@ -71,7 +71,6 @@ extensions: []
|
|
71
71
|
extra_rdoc_files:
|
72
72
|
- LICENSE
|
73
73
|
- README.mdown
|
74
|
-
- README.mdown.orig
|
75
74
|
files:
|
76
75
|
- .document
|
77
76
|
- .gitignore
|
@@ -84,11 +83,12 @@ files:
|
|
84
83
|
- lib/graboid/entity.rb
|
85
84
|
- spec/fixtures/graboid.jpg
|
86
85
|
- spec/fixtures/posts.html
|
86
|
+
- spec/fixtures/server.rb
|
87
|
+
- spec/fixtures/views/posts.erb
|
87
88
|
- spec/graboid/entity_spec.rb
|
88
89
|
- spec/graboid_spec.rb
|
89
90
|
- spec/spec.opts
|
90
91
|
- spec/spec_helper.rb
|
91
|
-
- README.mdown.orig
|
92
92
|
has_rdoc: true
|
93
93
|
homepage: http://github.com/twoism/graboid
|
94
94
|
licenses: []
|
@@ -124,6 +124,7 @@ signing_key:
|
|
124
124
|
specification_version: 3
|
125
125
|
summary: web scraping made easy
|
126
126
|
test_files:
|
127
|
+
- spec/fixtures/server.rb
|
127
128
|
- spec/graboid/entity_spec.rb
|
128
129
|
- spec/graboid_spec.rb
|
129
130
|
- spec/spec_helper.rb
|
data/README.mdown.orig
DELETED
@@ -1,61 +0,0 @@
|
|
1
|
-
### Graboid ###
|
2
|
-
|
3
|
-

|
4
|
-
|
5
|
-
Simply awesome web scraping. Better docs later. See specs.
|
6
|
-
|
7
|
-
### Installation ###
|
8
|
-
|
9
|
-
|
10
|
-
gem install nokogiri graboid
|
11
|
-
|
12
|
-
|
13
|
-
### Usage ###
|
14
|
-
|
15
|
-
%w{rubygems graboid}.each { |f| require f }
|
16
|
-
|
17
|
-
class RedditEntry
|
18
|
-
include Graboid::Entity
|
19
|
-
|
20
|
-
selector '.entry'
|
21
|
-
<<<<<<< HEAD
|
22
|
-
|
23
|
-
set :title
|
24
|
-
set :domain, :selector => '.domain a'
|
25
|
-
set :link, :selector => '.title' do |entry|
|
26
|
-
=======
|
27
|
-
|
28
|
-
field :title
|
29
|
-
field :domain, :selector => '.domain a'
|
30
|
-
field :link, :selector => '.title' do |entry|
|
31
|
-
>>>>>>> ea3c69202c1af78378fd4fb4b2d9ccd2098bc9d8
|
32
|
-
entry.css('a').first['href']
|
33
|
-
end
|
34
|
-
|
35
|
-
pager do |doc|
|
36
|
-
doc.css('p.nextprev a').select{|a| a.text =~ /next/i }.first['href']
|
37
|
-
end
|
38
|
-
|
39
|
-
end
|
40
|
-
|
41
|
-
RedditEntry.source = 'http://reddit.com'
|
42
|
-
|
43
|
-
RedditEntry.all(:max_pages => 2).each do |p|
|
44
|
-
puts "title: #{p.title}"
|
45
|
-
puts "domain: #{p.domain}"
|
46
|
-
puts "link: #{p.link}"
|
47
|
-
end
|
48
|
-
|
49
|
-
##Note on Patches/Pull Requests
|
50
|
-
|
51
|
-
* Fork the project.
|
52
|
-
* Make your feature addition or bug fix.
|
53
|
-
* Add tests for it. This is important so I don't break it in a
|
54
|
-
future version unintentionally.
|
55
|
-
* Commit, do not mess with rakefile, version, or history.
|
56
|
-
(if you want to have your own version, that is fine but bump version in a commit by itself I can ignore when I pull)
|
57
|
-
* Send me a pull request. Bonus points for topic branches.
|
58
|
-
|
59
|
-
## Copyright
|
60
|
-
|
61
|
-
Copyright (c) 2010 Christopher Burnett. See LICENSE for details.
|