graboid 0.3.0 → 0.3.2
Sign up to get free protection for your applications and to get access to all the features.
- data/VERSION +1 -1
- data/graboid.gemspec +6 -4
- data/lib/graboid/entity.rb +55 -10
- data/spec/fixtures/server.rb +8 -0
- data/spec/fixtures/views/posts.erb +37 -0
- data/spec/graboid/entity_spec.rb +33 -55
- data/spec/spec_helper.rb +47 -4
- metadata +6 -5
- data/README.mdown.orig +0 -61
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.3.0
|
1
|
+
0.3.2
|
data/graboid.gemspec
CHANGED
@@ -5,7 +5,7 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{graboid}
|
8
|
-
s.version = "0.3.0"
|
8
|
+
s.version = "0.3.2"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Christopher Burnett"]
|
@@ -14,8 +14,7 @@ Gem::Specification.new do |s|
|
|
14
14
|
s.email = %q{signalstatic@gmail.com}
|
15
15
|
s.extra_rdoc_files = [
|
16
16
|
"LICENSE",
|
17
|
-
"README.mdown"
|
18
|
-
"README.mdown.orig"
|
17
|
+
"README.mdown"
|
19
18
|
]
|
20
19
|
s.files = [
|
21
20
|
".document",
|
@@ -29,6 +28,8 @@ Gem::Specification.new do |s|
|
|
29
28
|
"lib/graboid/entity.rb",
|
30
29
|
"spec/fixtures/graboid.jpg",
|
31
30
|
"spec/fixtures/posts.html",
|
31
|
+
"spec/fixtures/server.rb",
|
32
|
+
"spec/fixtures/views/posts.erb",
|
32
33
|
"spec/graboid/entity_spec.rb",
|
33
34
|
"spec/graboid_spec.rb",
|
34
35
|
"spec/spec.opts",
|
@@ -40,7 +41,8 @@ Gem::Specification.new do |s|
|
|
40
41
|
s.rubygems_version = %q{1.3.7}
|
41
42
|
s.summary = %q{web scraping made easy}
|
42
43
|
s.test_files = [
|
43
|
-
"spec/
|
44
|
+
"spec/fixtures/server.rb",
|
45
|
+
"spec/graboid/entity_spec.rb",
|
44
46
|
"spec/graboid_spec.rb",
|
45
47
|
"spec/spec_helper.rb"
|
46
48
|
]
|
data/lib/graboid/entity.rb
CHANGED
@@ -3,8 +3,9 @@ module Graboid
|
|
3
3
|
|
4
4
|
def self.included klass
|
5
5
|
klass.class_eval do
|
6
|
-
extend
|
6
|
+
extend ClassMethods
|
7
7
|
include InstanceMethods
|
8
|
+
|
8
9
|
write_inheritable_attribute(:attribute_map, {}) if attribute_map.nil?
|
9
10
|
end
|
10
11
|
end
|
@@ -43,7 +44,15 @@ module Graboid
|
|
43
44
|
end
|
44
45
|
|
45
46
|
def doc
|
46
|
-
Nokogiri
|
47
|
+
eval "Nokogiri::#{self.mode.to_s.upcase}(read_source)"
|
48
|
+
end
|
49
|
+
|
50
|
+
def collection
|
51
|
+
@collection ||= []
|
52
|
+
end
|
53
|
+
|
54
|
+
def collection=(col)
|
55
|
+
@collection = col
|
47
56
|
end
|
48
57
|
|
49
58
|
def attribute_map
|
@@ -57,7 +66,8 @@ module Graboid
|
|
57
66
|
def hash_map fragment
|
58
67
|
attribute_map.inject({}) do |extracted_hash, at|
|
59
68
|
selector, processor = at.last[:selector], at.last[:processor]
|
60
|
-
|
69
|
+
node_collection = self.mode == :html ? fragment.css(selector) : fragment.xpath(selector)
|
70
|
+
extracted_hash[at.first] = processor.nil? ? node_collection.first.text : processor.call(node_collection.first) rescue ""
|
61
71
|
|
62
72
|
extracted_hash
|
63
73
|
end
|
@@ -65,15 +75,15 @@ module Graboid
|
|
65
75
|
|
66
76
|
def all_fragments
|
67
77
|
return page_fragments if @pager.nil?
|
68
|
-
old_source
|
69
|
-
@collection = []
|
78
|
+
old_source = self.source
|
70
79
|
while next_page?
|
71
|
-
|
72
|
-
|
80
|
+
self.collection += page_fragments
|
81
|
+
run_before_paginate_callbacks
|
73
82
|
paginate
|
83
|
+
run_after_paginate_callbacks
|
74
84
|
end
|
75
85
|
self.source = old_source
|
76
|
-
|
86
|
+
self.collection
|
77
87
|
end
|
78
88
|
|
79
89
|
def paginate
|
@@ -83,7 +93,11 @@ module Graboid
|
|
83
93
|
end
|
84
94
|
|
85
95
|
def next_page?
|
86
|
-
|
96
|
+
if max_pages.zero?
|
97
|
+
return true unless @pager.call(doc).nil?
|
98
|
+
else
|
99
|
+
current_page <= max_pages-1
|
100
|
+
end
|
87
101
|
end
|
88
102
|
|
89
103
|
def page_fragments
|
@@ -91,13 +105,20 @@ module Graboid
|
|
91
105
|
end
|
92
106
|
|
93
107
|
def all opts={}
|
108
|
+
reset_context
|
94
109
|
self.max_pages = opts[:max_pages] if opts[:max_pages].present?
|
95
110
|
all_fragments.collect{ |frag| extract_instance(frag) }
|
96
111
|
end
|
97
112
|
|
113
|
+
def reset_context
|
114
|
+
self.collection = []
|
115
|
+
self.current_page = 0
|
116
|
+
self.max_pages = 0
|
117
|
+
end
|
118
|
+
|
98
119
|
def read_source
|
99
120
|
case self.source
|
100
|
-
when /^http
|
121
|
+
when /^http[s]?:\/\//
|
101
122
|
open self.source
|
102
123
|
when String
|
103
124
|
self.source
|
@@ -108,6 +129,15 @@ module Graboid
|
|
108
129
|
@pager = block
|
109
130
|
end
|
110
131
|
|
132
|
+
def mode
|
133
|
+
@mode ||= :html
|
134
|
+
end
|
135
|
+
|
136
|
+
def mode=(m)
|
137
|
+
raise ArgumentError unless [:html, :xml].include?(m)
|
138
|
+
@mode = m
|
139
|
+
end
|
140
|
+
|
111
141
|
def max_pages
|
112
142
|
@max_pages ||= 0
|
113
143
|
end
|
@@ -124,6 +154,21 @@ module Graboid
|
|
124
154
|
@current_page = num
|
125
155
|
end
|
126
156
|
|
157
|
+
instance_eval do
|
158
|
+
[:before, :after].each do |prefix|
|
159
|
+
[:paginate, :extract].each do |suffix|
|
160
|
+
method_name = "#{prefix}_#{suffix}"
|
161
|
+
define_method method_name.to_sym do |&block|
|
162
|
+
instance_variable_set "@#{method_name}", block
|
163
|
+
end
|
164
|
+
define_method "run_#{method_name}_callbacks" do
|
165
|
+
ivar = instance_variable_get("@#{method_name}")
|
166
|
+
self.class_eval { ivar.call } unless ivar.nil?
|
167
|
+
end
|
168
|
+
end
|
169
|
+
end
|
170
|
+
end
|
171
|
+
|
127
172
|
end # ClassMethods
|
128
173
|
|
129
174
|
module InstanceMethods
|
@@ -0,0 +1,37 @@
|
|
1
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
|
2
|
+
"http://www.w3.org/TR/html4/strict.dtd">
|
3
|
+
|
4
|
+
<html lang="en">
|
5
|
+
<head>
|
6
|
+
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
|
7
|
+
<title>posts</title>
|
8
|
+
<meta name="generator" content="TextMate http://macromates.com/">
|
9
|
+
<meta name="author" content="Posterous">
|
10
|
+
<!-- Date: 2010-06-10 -->
|
11
|
+
</head>
|
12
|
+
<body>
|
13
|
+
|
14
|
+
<% @limit.times do |num| %>
|
15
|
+
|
16
|
+
<div class="post" id="<%= num + (@page*@limit)-1 %>">
|
17
|
+
|
18
|
+
<p class="title">Post <%= num + (@page*@limit)-1 %></p>
|
19
|
+
|
20
|
+
<p class="body">Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor
|
21
|
+
incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation
|
22
|
+
ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit
|
23
|
+
in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat
|
24
|
+
non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
|
25
|
+
</p>
|
26
|
+
<span class="author">Someone Awesome (06/11/2010)</span>
|
27
|
+
|
28
|
+
</div>
|
29
|
+
|
30
|
+
<% end %>
|
31
|
+
|
32
|
+
<% if @page < @total_pages %>
|
33
|
+
<a class="next" href="/posts?page=<%= @page.next %>">next</a>
|
34
|
+
<% end %>
|
35
|
+
|
36
|
+
</body>
|
37
|
+
</html>
|
data/spec/graboid/entity_spec.rb
CHANGED
@@ -1,11 +1,5 @@
|
|
1
1
|
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
|
2
2
|
|
3
|
-
class Post
|
4
|
-
include Graboid::Entity
|
5
|
-
|
6
|
-
selector '.post'
|
7
|
-
end
|
8
|
-
|
9
3
|
describe Graboid::Entity do
|
10
4
|
describe "#source" do
|
11
5
|
describe "when url" do
|
@@ -112,12 +106,6 @@ describe Graboid::Entity do
|
|
112
106
|
|
113
107
|
describe "#all_fragments" do
|
114
108
|
before(:each) do
|
115
|
-
|
116
|
-
class WorkingPost
|
117
|
-
include Graboid::Entity
|
118
|
-
selector '.post'
|
119
|
-
set :body
|
120
|
-
end
|
121
109
|
|
122
110
|
WorkingPost.source = POSTS_HTML_STR
|
123
111
|
@fragments = WorkingPost.all_fragments
|
@@ -136,17 +124,7 @@ describe Graboid::Entity do
|
|
136
124
|
describe "#extract_instance" do
|
137
125
|
|
138
126
|
before(:each) do
|
139
|
-
class WorkingPost
|
140
|
-
include Graboid::Entity
|
141
|
-
selector '.post'
|
142
|
-
set :title
|
143
|
-
set :body
|
144
|
-
set :author
|
145
|
-
set :date, :selector => '.author', :processor => lambda {|frag| frag.text.match(/\((.*)\)/)[1] }
|
146
|
-
end
|
147
|
-
|
148
127
|
@instance = WorkingPost.extract_instance(POST_FRAGMENT)
|
149
|
-
|
150
128
|
end
|
151
129
|
|
152
130
|
it "should return a WorkingPost instance" do
|
@@ -165,17 +143,7 @@ describe Graboid::Entity do
|
|
165
143
|
|
166
144
|
describe "#all" do
|
167
145
|
before(:each) do
|
168
|
-
class WorkingPost
|
169
|
-
include Graboid::Entity
|
170
|
-
selector '.post'
|
171
|
-
set :title
|
172
|
-
set :body
|
173
|
-
set :author
|
174
|
-
set :date, :selector => '.author', :processor => lambda {|frag| frag.text.match(/\((.*)\)/)[1] }
|
175
|
-
end
|
176
|
-
|
177
146
|
WorkingPost.source = POSTS_HTML_STR
|
178
|
-
|
179
147
|
end
|
180
148
|
|
181
149
|
it "should return 2 WorkingPosts" do
|
@@ -196,32 +164,42 @@ describe Graboid::Entity do
|
|
196
164
|
end
|
197
165
|
end
|
198
166
|
|
167
|
+
describe "#mode" do
|
168
|
+
it "should be html by default" do
|
169
|
+
WorkingPost.mode.should == :html
|
170
|
+
end
|
171
|
+
it "should throw an error for invalid values" do
|
172
|
+
lambda {
|
173
|
+
WorkingPost.mode = :derp
|
174
|
+
}.should raise_error ArgumentError
|
175
|
+
end
|
176
|
+
it "should change to :xml" do
|
177
|
+
WorkingPost.mode = :xml
|
178
|
+
WorkingPost.mode.should == :xml
|
179
|
+
end
|
180
|
+
end
|
181
|
+
|
199
182
|
describe "#pager" do
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
set :title
|
208
|
-
set :domain, :selector => '.domain a'
|
209
|
-
set :link, :selector => '.title' do |entry|
|
210
|
-
entry.css('a').first['href']
|
211
|
-
end
|
212
|
-
|
213
|
-
pager do |doc|
|
214
|
-
doc.css('p.nextprev a').select{|a| a.text =~ /next/i }.first['href']
|
215
|
-
end
|
216
|
-
|
183
|
+
describe "with a limit" do
|
184
|
+
before(:each) do
|
185
|
+
PostWithPager.source = 'http://localhost:9393/posts'
|
186
|
+
@posts = PostWithPager.all(:max_pages => 3)
|
187
|
+
end
|
188
|
+
it "should get 2 posts" do
|
189
|
+
@posts.length.should == 6
|
217
190
|
end
|
218
|
-
RedditEntry.source = 'http://reddit.com'
|
219
|
-
@posts = RedditEntry.all(:max_pages => 2)
|
220
191
|
end
|
221
|
-
|
222
|
-
|
192
|
+
|
193
|
+
describe "without a limit" do
|
194
|
+
before(:each) do
|
195
|
+
PostWithPager.source = 'http://localhost:9393/posts'
|
196
|
+
@posts = PostWithPager.all
|
197
|
+
end
|
198
|
+
it "should get 2 posts" do
|
199
|
+
@posts.length.should == 16
|
200
|
+
end
|
223
201
|
end
|
202
|
+
|
224
203
|
end
|
225
|
-
|
226
|
-
|
204
|
+
|
227
205
|
end
|
data/spec/spec_helper.rb
CHANGED
@@ -8,7 +8,50 @@ Spec::Runner.configure do |config|
|
|
8
8
|
|
9
9
|
end
|
10
10
|
|
11
|
-
|
12
|
-
POSTS_HTML_STR = File.read(
|
13
|
-
|
14
|
-
POST_FRAGMENT = Nokogiri::HTML::fragment(
|
11
|
+
FIXTURE_PATH = File.expand_path(File.dirname(__FILE__)+'/fixtures/posts.html')
|
12
|
+
POSTS_HTML_STR = File.read(FIXTURE_PATH){|f| f.read }
|
13
|
+
POST_DOC = Nokogiri::HTML(POSTS_HTML_STR)
|
14
|
+
POST_FRAGMENT = Nokogiri::HTML::fragment(POST_DOC.css('.post').first.to_html)
|
15
|
+
|
16
|
+
class Post
|
17
|
+
include Graboid::Entity
|
18
|
+
|
19
|
+
selector '.post'
|
20
|
+
end
|
21
|
+
|
22
|
+
class WorkingPost
|
23
|
+
include Graboid::Entity
|
24
|
+
|
25
|
+
selector '.post'
|
26
|
+
|
27
|
+
set :title
|
28
|
+
set :body
|
29
|
+
set :author
|
30
|
+
set :date, :selector => '.author' do |elm|
|
31
|
+
elm.text.match(/\((.*)\)/)[1]
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
class PostWithPager
|
36
|
+
include Graboid::Entity
|
37
|
+
|
38
|
+
selector '.post'
|
39
|
+
|
40
|
+
set :title
|
41
|
+
set :body
|
42
|
+
set :author
|
43
|
+
set :date, :selector => '.author' do |elm|
|
44
|
+
elm.text.match(/\((.*)\)/)[1]
|
45
|
+
end
|
46
|
+
|
47
|
+
pager do |doc|
|
48
|
+
link = 'http://localhost:9393'+doc.css('a.next').first['href'] rescue nil
|
49
|
+
#puts link.inspect
|
50
|
+
link
|
51
|
+
end
|
52
|
+
|
53
|
+
before_paginate do
|
54
|
+
puts "page: #{self.source}"
|
55
|
+
end
|
56
|
+
|
57
|
+
end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: graboid
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 23
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 3
|
9
|
-
- 0
|
10
|
-
version: 0.3.0
|
9
|
+
- 2
|
10
|
+
version: 0.3.2
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Christopher Burnett
|
@@ -71,7 +71,6 @@ extensions: []
|
|
71
71
|
extra_rdoc_files:
|
72
72
|
- LICENSE
|
73
73
|
- README.mdown
|
74
|
-
- README.mdown.orig
|
75
74
|
files:
|
76
75
|
- .document
|
77
76
|
- .gitignore
|
@@ -84,11 +83,12 @@ files:
|
|
84
83
|
- lib/graboid/entity.rb
|
85
84
|
- spec/fixtures/graboid.jpg
|
86
85
|
- spec/fixtures/posts.html
|
86
|
+
- spec/fixtures/server.rb
|
87
|
+
- spec/fixtures/views/posts.erb
|
87
88
|
- spec/graboid/entity_spec.rb
|
88
89
|
- spec/graboid_spec.rb
|
89
90
|
- spec/spec.opts
|
90
91
|
- spec/spec_helper.rb
|
91
|
-
- README.mdown.orig
|
92
92
|
has_rdoc: true
|
93
93
|
homepage: http://github.com/twoism/graboid
|
94
94
|
licenses: []
|
@@ -124,6 +124,7 @@ signing_key:
|
|
124
124
|
specification_version: 3
|
125
125
|
summary: web scraping made easy
|
126
126
|
test_files:
|
127
|
+
- spec/fixtures/server.rb
|
127
128
|
- spec/graboid/entity_spec.rb
|
128
129
|
- spec/graboid_spec.rb
|
129
130
|
- spec/spec_helper.rb
|
data/README.mdown.orig
DELETED
@@ -1,61 +0,0 @@
|
|
1
|
-
### Graboid ###
|
2
|
-
|
3
|
-
![Graboid](http://github.com/twoism/graboid/raw/master/spec/fixtures/graboid.jpg "Graboid")
|
4
|
-
|
5
|
-
Simply awesome web scraping. Better docs later. See specs.
|
6
|
-
|
7
|
-
### Installation ###
|
8
|
-
|
9
|
-
|
10
|
-
gem install nokogiri graboid
|
11
|
-
|
12
|
-
|
13
|
-
### Usage ###
|
14
|
-
|
15
|
-
%w{rubygems graboid}.each { |f| require f }
|
16
|
-
|
17
|
-
class RedditEntry
|
18
|
-
include Graboid::Entity
|
19
|
-
|
20
|
-
selector '.entry'
|
21
|
-
<<<<<<< HEAD
|
22
|
-
|
23
|
-
set :title
|
24
|
-
set :domain, :selector => '.domain a'
|
25
|
-
set :link, :selector => '.title' do |entry|
|
26
|
-
=======
|
27
|
-
|
28
|
-
field :title
|
29
|
-
field :domain, :selector => '.domain a'
|
30
|
-
field :link, :selector => '.title' do |entry|
|
31
|
-
>>>>>>> ea3c69202c1af78378fd4fb4b2d9ccd2098bc9d8
|
32
|
-
entry.css('a').first['href']
|
33
|
-
end
|
34
|
-
|
35
|
-
pager do |doc|
|
36
|
-
doc.css('p.nextprev a').select{|a| a.text =~ /next/i }.first['href']
|
37
|
-
end
|
38
|
-
|
39
|
-
end
|
40
|
-
|
41
|
-
RedditEntry.source = 'http://reddit.com'
|
42
|
-
|
43
|
-
RedditEntry.all(:max_pages => 2).each do |p|
|
44
|
-
puts "title: #{p.title}"
|
45
|
-
puts "domain: #{p.domain}"
|
46
|
-
puts "link: #{p.link}"
|
47
|
-
end
|
48
|
-
|
49
|
-
##Note on Patches/Pull Requests
|
50
|
-
|
51
|
-
* Fork the project.
|
52
|
-
* Make your feature addition or bug fix.
|
53
|
-
* Add tests for it. This is important so I don't break it in a
|
54
|
-
future version unintentionally.
|
55
|
-
* Commit, do not mess with rakefile, version, or history.
|
56
|
-
(if you want to have your own version, that is fine but bump version in a commit by itself I can ignore when I pull)
|
57
|
-
* Send me a pull request. Bonus points for topic branches.
|
58
|
-
|
59
|
-
## Copyright
|
60
|
-
|
61
|
-
Copyright (c) 2010 Christopher Burnett. See LICENSE for details.
|