graboid 0.2.1 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.mdown +20 -61
- data/README.mdown.orig +61 -0
- data/VERSION +1 -1
- data/graboid.gemspec +4 -3
- data/lib/graboid/entity.rb +57 -11
- data/spec/graboid/entity_spec.rb +59 -19
- metadata +7 -5
data/README.mdown
CHANGED
@@ -7,78 +7,37 @@
|
|
7
7
|
### Installation ###
|
8
8
|
|
9
9
|
|
10
|
-
gem install graboid
|
10
|
+
gem install nokogiri graboid
|
11
11
|
|
12
12
|
|
13
13
|
### Usage ###
|
14
14
|
|
15
|
+
%w{rubygems graboid}.each { |f| require f }
|
15
16
|
|
16
|
-
|
17
|
-
|
18
|
-
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
|
19
|
-
"http://www.w3.org/TR/html4/strict.dtd">
|
20
|
-
|
21
|
-
<html lang="en">
|
22
|
-
<head>
|
23
|
-
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
|
24
|
-
<title>posts</title>
|
25
|
-
<meta name="generator" content="TextMate http://macromates.com/">
|
26
|
-
<meta name="author" content="Posterous">
|
27
|
-
<!-- Date: 2010-06-10 -->
|
28
|
-
</head>
|
29
|
-
<body>
|
30
|
-
|
31
|
-
<div class="post" id="1">
|
32
|
-
|
33
|
-
<p class="title">Post 1</p>
|
34
|
-
|
35
|
-
<p class="body">Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor
|
36
|
-
incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation
|
37
|
-
ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit
|
38
|
-
in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat
|
39
|
-
non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
|
40
|
-
</p>
|
41
|
-
<span class="author">Someone Awesome (06/11/2010)</span>
|
42
|
-
|
43
|
-
</div>
|
17
|
+
class RedditEntry
|
18
|
+
include Graboid::Entity
|
44
19
|
|
45
|
-
|
20
|
+
selector '.entry'
|
46
21
|
|
47
|
-
|
22
|
+
set :title
|
23
|
+
set :domain, :selector => '.domain a'
|
24
|
+
set :link, :selector => '.title' do |entry|
|
25
|
+
entry.css('a').first['href']
|
26
|
+
end
|
27
|
+
|
28
|
+
pager do |doc|
|
29
|
+
doc.css('p.nextprev a').select{|a| a.text =~ /next/i }.first['href']
|
30
|
+
end
|
48
31
|
|
49
|
-
|
50
|
-
incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation
|
51
|
-
ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit
|
52
|
-
in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat
|
53
|
-
non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
|
54
|
-
</p>
|
55
|
-
<span class="author">Someone Awesome (06/11/2010)</span>
|
32
|
+
end
|
56
33
|
|
57
|
-
|
34
|
+
RedditEntry.source = 'http://reddit.com'
|
58
35
|
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
class Post
|
65
|
-
include Graboid::Entity
|
66
|
-
|
67
|
-
field :title
|
68
|
-
field :body
|
69
|
-
field :author
|
70
|
-
field :date, :selector => '.author', :processor => lambda {|frag| frag.text.match(/\((.*)\)/)[1] }
|
36
|
+
RedditEntry.all(:max_pages => 2).each do |p|
|
37
|
+
puts "title: #{p.title}"
|
38
|
+
puts "domain: #{p.domain}"
|
39
|
+
puts "link: #{p.link}"
|
71
40
|
end
|
72
|
-
|
73
|
-
Post.source = 'The HTML string or URL to the document'
|
74
|
-
|
75
|
-
@post = Post.all.first
|
76
|
-
|
77
|
-
puts @post.date
|
78
|
-
=> 06/11/2010
|
79
|
-
|
80
|
-
puts @post.title
|
81
|
-
=> Post 1
|
82
41
|
|
83
42
|
##Note on Patches/Pull Requests
|
84
43
|
|
data/README.mdown.orig
ADDED
@@ -0,0 +1,61 @@
|
|
1
|
+
### Graboid ###
|
2
|
+
|
3
|
+
![Graboid](http://github.com/twoism/graboid/raw/master/spec/fixtures/graboid.jpg "Graboid")
|
4
|
+
|
5
|
+
Simply awesome web scraping. Better docs later. See specs.
|
6
|
+
|
7
|
+
### Installation ###
|
8
|
+
|
9
|
+
|
10
|
+
gem install nokogiri graboid
|
11
|
+
|
12
|
+
|
13
|
+
### Usage ###
|
14
|
+
|
15
|
+
%w{rubygems graboid}.each { |f| require f }
|
16
|
+
|
17
|
+
class RedditEntry
|
18
|
+
include Graboid::Entity
|
19
|
+
|
20
|
+
selector '.entry'
|
21
|
+
<<<<<<< HEAD
|
22
|
+
|
23
|
+
set :title
|
24
|
+
set :domain, :selector => '.domain a'
|
25
|
+
set :link, :selector => '.title' do |entry|
|
26
|
+
=======
|
27
|
+
|
28
|
+
field :title
|
29
|
+
field :domain, :selector => '.domain a'
|
30
|
+
field :link, :selector => '.title' do |entry|
|
31
|
+
>>>>>>> ea3c69202c1af78378fd4fb4b2d9ccd2098bc9d8
|
32
|
+
entry.css('a').first['href']
|
33
|
+
end
|
34
|
+
|
35
|
+
pager do |doc|
|
36
|
+
doc.css('p.nextprev a').select{|a| a.text =~ /next/i }.first['href']
|
37
|
+
end
|
38
|
+
|
39
|
+
end
|
40
|
+
|
41
|
+
RedditEntry.source = 'http://reddit.com'
|
42
|
+
|
43
|
+
RedditEntry.all(:max_pages => 2).each do |p|
|
44
|
+
puts "title: #{p.title}"
|
45
|
+
puts "domain: #{p.domain}"
|
46
|
+
puts "link: #{p.link}"
|
47
|
+
end
|
48
|
+
|
49
|
+
##Note on Patches/Pull Requests
|
50
|
+
|
51
|
+
* Fork the project.
|
52
|
+
* Make your feature addition or bug fix.
|
53
|
+
* Add tests for it. This is important so I don't break it in a
|
54
|
+
future version unintentionally.
|
55
|
+
* Commit, do not mess with rakefile, version, or history.
|
56
|
+
(if you want to have your own version, that is fine but bump version in a commit by itself I can ignore when I pull)
|
57
|
+
* Send me a pull request. Bonus points for topic branches.
|
58
|
+
|
59
|
+
## Copyright
|
60
|
+
|
61
|
+
Copyright (c) 2010 Christopher Burnett. See LICENSE for details.
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.2.1
|
1
|
+
0.3.0
|
data/graboid.gemspec
CHANGED
@@ -5,16 +5,17 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{graboid}
|
8
|
-
s.version = "0.2.1"
|
8
|
+
s.version = "0.3.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Christopher Burnett"]
|
12
|
-
s.date = %q{2010-06-
|
12
|
+
s.date = %q{2010-06-14}
|
13
13
|
s.description = %q{web scraping made easier}
|
14
14
|
s.email = %q{signalstatic@gmail.com}
|
15
15
|
s.extra_rdoc_files = [
|
16
16
|
"LICENSE",
|
17
|
-
"README.mdown"
|
17
|
+
"README.mdown",
|
18
|
+
"README.mdown.orig"
|
18
19
|
]
|
19
20
|
s.files = [
|
20
21
|
".document",
|
data/lib/graboid/entity.rb
CHANGED
@@ -14,22 +14,26 @@ module Graboid
|
|
14
14
|
def source
|
15
15
|
@source
|
16
16
|
end
|
17
|
-
|
17
|
+
|
18
18
|
def source=(src)
|
19
19
|
@source = src
|
20
20
|
end
|
21
21
|
|
22
|
-
def field name, opts={}, &block
|
22
|
+
def set name, opts={}, &block
|
23
23
|
opts.merge!(:selector => ".#{name}") unless opts[:selector].present?
|
24
24
|
opts.merge!(:processor => block) if block_given?
|
25
25
|
|
26
26
|
attribute_map[name] = opts
|
27
27
|
end
|
28
28
|
|
29
|
-
|
29
|
+
alias_method :field, :set
|
30
|
+
|
31
|
+
def selector selector
|
30
32
|
@root_selector = selector
|
31
33
|
end
|
32
|
-
|
34
|
+
|
35
|
+
alias_method :root, :selector
|
36
|
+
|
33
37
|
def root_selector
|
34
38
|
@root_selector || inferred_selector
|
35
39
|
end
|
@@ -53,8 +57,6 @@ module Graboid
|
|
53
57
|
def hash_map fragment
|
54
58
|
attribute_map.inject({}) do |extracted_hash, at|
|
55
59
|
selector, processor = at.last[:selector], at.last[:processor]
|
56
|
-
|
57
|
-
|
58
60
|
extracted_hash[at.first] = processor.nil? ? fragment.css(selector).first.text : processor.call(fragment.css(selector).first) rescue ""
|
59
61
|
|
60
62
|
extracted_hash
|
@@ -62,22 +64,66 @@ module Graboid
|
|
62
64
|
end
|
63
65
|
|
64
66
|
def all_fragments
|
65
|
-
|
67
|
+
return page_fragments if @pager.nil?
|
68
|
+
old_source = self.source
|
69
|
+
@collection = []
|
70
|
+
while next_page?
|
71
|
+
@frags = page_fragments
|
72
|
+
@collection += @frags
|
73
|
+
paginate
|
74
|
+
end
|
75
|
+
self.source = old_source
|
76
|
+
@collection
|
77
|
+
end
|
78
|
+
|
79
|
+
def paginate
|
80
|
+
next_page_url = @pager.call(doc) rescue nil
|
81
|
+
self.source = next_page_url
|
82
|
+
self.current_page += 1
|
66
83
|
end
|
67
84
|
|
68
|
-
def all
|
85
|
+
def next_page?
|
86
|
+
(current_page <= max_pages-1)
|
87
|
+
end
|
88
|
+
|
89
|
+
def page_fragments
|
90
|
+
doc.css(root_selector)
|
91
|
+
end
|
92
|
+
|
93
|
+
def all opts={}
|
94
|
+
self.max_pages = opts[:max_pages] if opts[:max_pages].present?
|
69
95
|
all_fragments.collect{ |frag| extract_instance(frag) }
|
70
96
|
end
|
71
97
|
|
72
98
|
def read_source
|
73
|
-
case
|
99
|
+
case self.source
|
74
100
|
when /^http:\/\//
|
75
|
-
open
|
101
|
+
open self.source
|
76
102
|
when String
|
77
|
-
|
103
|
+
self.source
|
78
104
|
end
|
79
105
|
end
|
80
106
|
|
107
|
+
def pager &block
|
108
|
+
@pager = block
|
109
|
+
end
|
110
|
+
|
111
|
+
def max_pages
|
112
|
+
@max_pages ||= 0
|
113
|
+
end
|
114
|
+
|
115
|
+
def max_pages=num
|
116
|
+
@max_pages = num
|
117
|
+
end
|
118
|
+
|
119
|
+
def current_page
|
120
|
+
@current_page ||= 0
|
121
|
+
end
|
122
|
+
|
123
|
+
def current_page=num
|
124
|
+
@current_page = num
|
125
|
+
end
|
126
|
+
|
81
127
|
end # ClassMethods
|
82
128
|
|
83
129
|
module InstanceMethods
|
data/spec/graboid/entity_spec.rb
CHANGED
@@ -3,7 +3,7 @@ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
|
|
3
3
|
class Post
|
4
4
|
include Graboid::Entity
|
5
5
|
|
6
|
-
|
6
|
+
selector '.post'
|
7
7
|
end
|
8
8
|
|
9
9
|
describe Graboid::Entity do
|
@@ -12,7 +12,7 @@ describe Graboid::Entity do
|
|
12
12
|
before(:each) do
|
13
13
|
Post.source = 'http://foo.com/'
|
14
14
|
end
|
15
|
-
|
15
|
+
|
16
16
|
it "should set the source" do
|
17
17
|
Post.source.should == 'http://foo.com/'
|
18
18
|
end
|
@@ -65,11 +65,11 @@ describe Graboid::Entity do
|
|
65
65
|
|
66
66
|
end
|
67
67
|
|
68
|
-
describe "#field" do
|
68
|
+
describe "#set" do
|
69
69
|
describe "simple syntax" do
|
70
70
|
|
71
71
|
before(:each) do
|
72
|
-
Post.field :body
|
72
|
+
Post.set :body
|
73
73
|
end
|
74
74
|
|
75
75
|
it "should be set in the attr map" do
|
@@ -83,7 +83,7 @@ describe Graboid::Entity do
|
|
83
83
|
|
84
84
|
describe "custom selector syntax" do
|
85
85
|
before(:each) do
|
86
|
-
Post.field :body, :selector => '.custom'
|
86
|
+
Post.set :body, :selector => '.custom'
|
87
87
|
end
|
88
88
|
|
89
89
|
it "should set the selector" do
|
@@ -94,7 +94,7 @@ describe Graboid::Entity do
|
|
94
94
|
describe "custom selector syntax with a lambda" do
|
95
95
|
|
96
96
|
before(:each) do
|
97
|
-
Post.field :body, :selector => '.custom' do |item|
|
97
|
+
Post.set :body, :selector => '.custom' do |item|
|
98
98
|
"from lambda"
|
99
99
|
end
|
100
100
|
end
|
@@ -115,10 +115,10 @@ describe Graboid::Entity do
|
|
115
115
|
|
116
116
|
class WorkingPost
|
117
117
|
include Graboid::Entity
|
118
|
-
|
119
|
-
|
118
|
+
selector '.post'
|
119
|
+
set :body
|
120
120
|
end
|
121
|
-
|
121
|
+
|
122
122
|
WorkingPost.source = POSTS_HTML_STR
|
123
123
|
@fragments = WorkingPost.all_fragments
|
124
124
|
end
|
@@ -138,11 +138,11 @@ describe Graboid::Entity do
|
|
138
138
|
before(:each) do
|
139
139
|
class WorkingPost
|
140
140
|
include Graboid::Entity
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
141
|
+
selector '.post'
|
142
|
+
set :title
|
143
|
+
set :body
|
144
|
+
set :author
|
145
|
+
set :date, :selector => '.author', :processor => lambda {|frag| frag.text.match(/\((.*)\)/)[1] }
|
146
146
|
end
|
147
147
|
|
148
148
|
@instance = WorkingPost.extract_instance(POST_FRAGMENT)
|
@@ -167,11 +167,11 @@ describe Graboid::Entity do
|
|
167
167
|
before(:each) do
|
168
168
|
class WorkingPost
|
169
169
|
include Graboid::Entity
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
170
|
+
selector '.post'
|
171
|
+
set :title
|
172
|
+
set :body
|
173
|
+
set :author
|
174
|
+
set :date, :selector => '.author', :processor => lambda {|frag| frag.text.match(/\((.*)\)/)[1] }
|
175
175
|
end
|
176
176
|
|
177
177
|
WorkingPost.source = POSTS_HTML_STR
|
@@ -184,4 +184,44 @@ describe Graboid::Entity do
|
|
184
184
|
|
185
185
|
end
|
186
186
|
|
187
|
+
[:current_page, :max_pages].each do |m|
|
188
|
+
describe "##{m}" do
|
189
|
+
it "should be 0 by default" do
|
190
|
+
Post.send(m).should == 0
|
191
|
+
end
|
192
|
+
it "should be 3" do
|
193
|
+
Post.send("#{m}=",3)
|
194
|
+
Post.send(m).should == 3
|
195
|
+
end
|
196
|
+
end
|
197
|
+
end
|
198
|
+
|
199
|
+
describe "#pager" do
|
200
|
+
before(:each) do
|
201
|
+
|
202
|
+
class RedditEntry
|
203
|
+
include Graboid::Entity
|
204
|
+
|
205
|
+
selector '.entry'
|
206
|
+
|
207
|
+
set :title
|
208
|
+
set :domain, :selector => '.domain a'
|
209
|
+
set :link, :selector => '.title' do |entry|
|
210
|
+
entry.css('a').first['href']
|
211
|
+
end
|
212
|
+
|
213
|
+
pager do |doc|
|
214
|
+
doc.css('p.nextprev a').select{|a| a.text =~ /next/i }.first['href']
|
215
|
+
end
|
216
|
+
|
217
|
+
end
|
218
|
+
RedditEntry.source = 'http://reddit.com'
|
219
|
+
@posts = RedditEntry.all(:max_pages => 2)
|
220
|
+
end
|
221
|
+
it "should get 70 posts" do
|
222
|
+
@posts.length.should == 70
|
223
|
+
end
|
224
|
+
end
|
225
|
+
|
226
|
+
|
187
227
|
end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: graboid
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 19
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
|
-
- 2
|
9
|
-
- 1
|
10
|
-
version: 0.2.1
|
8
|
+
- 3
|
9
|
+
- 0
|
10
|
+
version: 0.3.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Christopher Burnett
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2010-06-
|
18
|
+
date: 2010-06-14 00:00:00 -07:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -71,6 +71,7 @@ extensions: []
|
|
71
71
|
extra_rdoc_files:
|
72
72
|
- LICENSE
|
73
73
|
- README.mdown
|
74
|
+
- README.mdown.orig
|
74
75
|
files:
|
75
76
|
- .document
|
76
77
|
- .gitignore
|
@@ -87,6 +88,7 @@ files:
|
|
87
88
|
- spec/graboid_spec.rb
|
88
89
|
- spec/spec.opts
|
89
90
|
- spec/spec_helper.rb
|
91
|
+
- README.mdown.orig
|
90
92
|
has_rdoc: true
|
91
93
|
homepage: http://github.com/twoism/graboid
|
92
94
|
licenses: []
|