graboid 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.mdown +20 -61
- data/README.mdown.orig +61 -0
- data/VERSION +1 -1
- data/graboid.gemspec +4 -3
- data/lib/graboid/entity.rb +57 -11
- data/spec/graboid/entity_spec.rb +59 -19
- metadata +7 -5
data/README.mdown
CHANGED
@@ -7,78 +7,37 @@
|
|
7
7
|
### Installation ###
|
8
8
|
|
9
9
|
|
10
|
-
gem install graboid
|
10
|
+
gem install nokogiri graboid
|
11
11
|
|
12
12
|
|
13
13
|
### Usage ###
|
14
14
|
|
15
|
+
%w{rubygems graboid}.each { |f| require f }
|
15
16
|
|
16
|
-
|
17
|
-
|
18
|
-
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
|
19
|
-
"http://www.w3.org/TR/html4/strict.dtd">
|
20
|
-
|
21
|
-
<html lang="en">
|
22
|
-
<head>
|
23
|
-
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
|
24
|
-
<title>posts</title>
|
25
|
-
<meta name="generator" content="TextMate http://macromates.com/">
|
26
|
-
<meta name="author" content="Posterous">
|
27
|
-
<!-- Date: 2010-06-10 -->
|
28
|
-
</head>
|
29
|
-
<body>
|
30
|
-
|
31
|
-
<div class="post" id="1">
|
32
|
-
|
33
|
-
<p class="title">Post 1</p>
|
34
|
-
|
35
|
-
<p class="body">Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor
|
36
|
-
incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation
|
37
|
-
ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit
|
38
|
-
in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat
|
39
|
-
non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
|
40
|
-
</p>
|
41
|
-
<span class="author">Someone Awesome (06/11/2010)</span>
|
42
|
-
|
43
|
-
</div>
|
17
|
+
class RedditEntry
|
18
|
+
include Graboid::Entity
|
44
19
|
|
45
|
-
|
20
|
+
selector '.entry'
|
46
21
|
|
47
|
-
|
22
|
+
set :title
|
23
|
+
set :domain, :selector => '.domain a'
|
24
|
+
set :link, :selector => '.title' do |entry|
|
25
|
+
entry.css('a').first['href']
|
26
|
+
end
|
27
|
+
|
28
|
+
pager do |doc|
|
29
|
+
doc.css('p.nextprev a').select{|a| a.text =~ /next/i }.first['href']
|
30
|
+
end
|
48
31
|
|
49
|
-
|
50
|
-
incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation
|
51
|
-
ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit
|
52
|
-
in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat
|
53
|
-
non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
|
54
|
-
</p>
|
55
|
-
<span class="author">Someone Awesome (06/11/2010)</span>
|
32
|
+
end
|
56
33
|
|
57
|
-
|
34
|
+
RedditEntry.source = 'http://reddit.com'
|
58
35
|
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
class Post
|
65
|
-
include Graboid::Entity
|
66
|
-
|
67
|
-
field :title
|
68
|
-
field :body
|
69
|
-
field :author
|
70
|
-
field :date, :selector => '.author', :processor => lambda {|frag| frag.text.match(/\((.*)\)/)[1] }
|
36
|
+
RedditEntry.all(:max_pages => 2).each do |p|
|
37
|
+
puts "title: #{p.title}"
|
38
|
+
puts "domain: #{p.domain}"
|
39
|
+
puts "link: #{p.link}"
|
71
40
|
end
|
72
|
-
|
73
|
-
Post.source = 'The HTML string or URL to the document'
|
74
|
-
|
75
|
-
@post = Post.all.first
|
76
|
-
|
77
|
-
puts @post.date
|
78
|
-
=> 06/11/2010
|
79
|
-
|
80
|
-
puts @post.title
|
81
|
-
=> Post 1
|
82
41
|
|
83
42
|
##Note on Patches/Pull Requests
|
84
43
|
|
data/README.mdown.orig
ADDED
@@ -0,0 +1,61 @@
|
|
1
|
+
### Graboid ###
|
2
|
+
|
3
|
+

|
4
|
+
|
5
|
+
Simply awesome web scraping. Better docs later. See specs.
|
6
|
+
|
7
|
+
### Installation ###
|
8
|
+
|
9
|
+
|
10
|
+
gem install nokogiri graboid
|
11
|
+
|
12
|
+
|
13
|
+
### Usage ###
|
14
|
+
|
15
|
+
%w{rubygems graboid}.each { |f| require f }
|
16
|
+
|
17
|
+
class RedditEntry
|
18
|
+
include Graboid::Entity
|
19
|
+
|
20
|
+
selector '.entry'
|
21
|
+
<<<<<<< HEAD
|
22
|
+
|
23
|
+
set :title
|
24
|
+
set :domain, :selector => '.domain a'
|
25
|
+
set :link, :selector => '.title' do |entry|
|
26
|
+
=======
|
27
|
+
|
28
|
+
field :title
|
29
|
+
field :domain, :selector => '.domain a'
|
30
|
+
field :link, :selector => '.title' do |entry|
|
31
|
+
>>>>>>> ea3c69202c1af78378fd4fb4b2d9ccd2098bc9d8
|
32
|
+
entry.css('a').first['href']
|
33
|
+
end
|
34
|
+
|
35
|
+
pager do |doc|
|
36
|
+
doc.css('p.nextprev a').select{|a| a.text =~ /next/i }.first['href']
|
37
|
+
end
|
38
|
+
|
39
|
+
end
|
40
|
+
|
41
|
+
RedditEntry.source = 'http://reddit.com'
|
42
|
+
|
43
|
+
RedditEntry.all(:max_pages => 2).each do |p|
|
44
|
+
puts "title: #{p.title}"
|
45
|
+
puts "domain: #{p.domain}"
|
46
|
+
puts "link: #{p.link}"
|
47
|
+
end
|
48
|
+
|
49
|
+
##Note on Patches/Pull Requests
|
50
|
+
|
51
|
+
* Fork the project.
|
52
|
+
* Make your feature addition or bug fix.
|
53
|
+
* Add tests for it. This is important so I don't break it in a
|
54
|
+
future version unintentionally.
|
55
|
+
* Commit, do not mess with rakefile, version, or history.
|
56
|
+
(if you want to have your own version, that is fine but bump version in a commit by itself I can ignore when I pull)
|
57
|
+
* Send me a pull request. Bonus points for topic branches.
|
58
|
+
|
59
|
+
## Copyright
|
60
|
+
|
61
|
+
Copyright (c) 2010 Christopher Burnett. See LICENSE for details.
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.2.1
|
1
|
+
0.3.0
|
data/graboid.gemspec
CHANGED
@@ -5,16 +5,17 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{graboid}
|
8
|
-
s.version = "0.2.1"
|
8
|
+
s.version = "0.3.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Christopher Burnett"]
|
12
|
-
s.date = %q{2010-06-
|
12
|
+
s.date = %q{2010-06-14}
|
13
13
|
s.description = %q{web scraping made easier}
|
14
14
|
s.email = %q{signalstatic@gmail.com}
|
15
15
|
s.extra_rdoc_files = [
|
16
16
|
"LICENSE",
|
17
|
-
"README.mdown"
|
17
|
+
"README.mdown",
|
18
|
+
"README.mdown.orig"
|
18
19
|
]
|
19
20
|
s.files = [
|
20
21
|
".document",
|
data/lib/graboid/entity.rb
CHANGED
@@ -14,22 +14,26 @@ module Graboid
|
|
14
14
|
def source
|
15
15
|
@source
|
16
16
|
end
|
17
|
-
|
17
|
+
|
18
18
|
def source=(src)
|
19
19
|
@source = src
|
20
20
|
end
|
21
21
|
|
22
|
-
def
|
22
|
+
def set name, opts={}, &block
|
23
23
|
opts.merge!(:selector => ".#{name}") unless opts[:selector].present?
|
24
24
|
opts.merge!(:processor => block) if block_given?
|
25
25
|
|
26
26
|
attribute_map[name] = opts
|
27
27
|
end
|
28
28
|
|
29
|
-
|
29
|
+
alias_method :field, :set
|
30
|
+
|
31
|
+
def selector selector
|
30
32
|
@root_selector = selector
|
31
33
|
end
|
32
|
-
|
34
|
+
|
35
|
+
alias_method :root, :selector
|
36
|
+
|
33
37
|
def root_selector
|
34
38
|
@root_selector || inferred_selector
|
35
39
|
end
|
@@ -53,8 +57,6 @@ module Graboid
|
|
53
57
|
def hash_map fragment
|
54
58
|
attribute_map.inject({}) do |extracted_hash, at|
|
55
59
|
selector, processor = at.last[:selector], at.last[:processor]
|
56
|
-
|
57
|
-
|
58
60
|
extracted_hash[at.first] = processor.nil? ? fragment.css(selector).first.text : processor.call(fragment.css(selector).first) rescue ""
|
59
61
|
|
60
62
|
extracted_hash
|
@@ -62,22 +64,66 @@ module Graboid
|
|
62
64
|
end
|
63
65
|
|
64
66
|
def all_fragments
|
65
|
-
|
67
|
+
return page_fragments if @pager.nil?
|
68
|
+
old_source = self.source
|
69
|
+
@collection = []
|
70
|
+
while next_page?
|
71
|
+
@frags = page_fragments
|
72
|
+
@collection += @frags
|
73
|
+
paginate
|
74
|
+
end
|
75
|
+
self.source = old_source
|
76
|
+
@collection
|
77
|
+
end
|
78
|
+
|
79
|
+
def paginate
|
80
|
+
next_page_url = @pager.call(doc) rescue nil
|
81
|
+
self.source = next_page_url
|
82
|
+
self.current_page += 1
|
66
83
|
end
|
67
84
|
|
68
|
-
def
|
85
|
+
def next_page?
|
86
|
+
(current_page <= max_pages-1)
|
87
|
+
end
|
88
|
+
|
89
|
+
def page_fragments
|
90
|
+
doc.css(root_selector)
|
91
|
+
end
|
92
|
+
|
93
|
+
def all opts={}
|
94
|
+
self.max_pages = opts[:max_pages] if opts[:max_pages].present?
|
69
95
|
all_fragments.collect{ |frag| extract_instance(frag) }
|
70
96
|
end
|
71
97
|
|
72
98
|
def read_source
|
73
|
-
case
|
99
|
+
case self.source
|
74
100
|
when /^http:\/\//
|
75
|
-
open
|
101
|
+
open self.source
|
76
102
|
when String
|
77
|
-
|
103
|
+
self.source
|
78
104
|
end
|
79
105
|
end
|
80
106
|
|
107
|
+
def pager &block
|
108
|
+
@pager = block
|
109
|
+
end
|
110
|
+
|
111
|
+
def max_pages
|
112
|
+
@max_pages ||= 0
|
113
|
+
end
|
114
|
+
|
115
|
+
def max_pages=num
|
116
|
+
@max_pages = num
|
117
|
+
end
|
118
|
+
|
119
|
+
def current_page
|
120
|
+
@current_page ||= 0
|
121
|
+
end
|
122
|
+
|
123
|
+
def current_page=num
|
124
|
+
@current_page = num
|
125
|
+
end
|
126
|
+
|
81
127
|
end # ClassMethods
|
82
128
|
|
83
129
|
module InstanceMethods
|
data/spec/graboid/entity_spec.rb
CHANGED
@@ -3,7 +3,7 @@ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
|
|
3
3
|
class Post
|
4
4
|
include Graboid::Entity
|
5
5
|
|
6
|
-
|
6
|
+
selector '.post'
|
7
7
|
end
|
8
8
|
|
9
9
|
describe Graboid::Entity do
|
@@ -12,7 +12,7 @@ describe Graboid::Entity do
|
|
12
12
|
before(:each) do
|
13
13
|
Post.source = 'http://foo.com/'
|
14
14
|
end
|
15
|
-
|
15
|
+
|
16
16
|
it "should set the source" do
|
17
17
|
Post.source.should == 'http://foo.com/'
|
18
18
|
end
|
@@ -65,11 +65,11 @@ describe Graboid::Entity do
|
|
65
65
|
|
66
66
|
end
|
67
67
|
|
68
|
-
describe "#
|
68
|
+
describe "#set" do
|
69
69
|
describe "simple syntax" do
|
70
70
|
|
71
71
|
before(:each) do
|
72
|
-
Post.
|
72
|
+
Post.set :body
|
73
73
|
end
|
74
74
|
|
75
75
|
it "should be set in the attr map" do
|
@@ -83,7 +83,7 @@ describe Graboid::Entity do
|
|
83
83
|
|
84
84
|
describe "custom selector syntax" do
|
85
85
|
before(:each) do
|
86
|
-
Post.
|
86
|
+
Post.set :body, :selector => '.custom'
|
87
87
|
end
|
88
88
|
|
89
89
|
it "should set the selector" do
|
@@ -94,7 +94,7 @@ describe Graboid::Entity do
|
|
94
94
|
describe "custom selector syntax with a lambda" do
|
95
95
|
|
96
96
|
before(:each) do
|
97
|
-
Post.
|
97
|
+
Post.set :body, :selector => '.custom' do |item|
|
98
98
|
"from lambda"
|
99
99
|
end
|
100
100
|
end
|
@@ -115,10 +115,10 @@ describe Graboid::Entity do
|
|
115
115
|
|
116
116
|
class WorkingPost
|
117
117
|
include Graboid::Entity
|
118
|
-
|
119
|
-
|
118
|
+
selector '.post'
|
119
|
+
set :body
|
120
120
|
end
|
121
|
-
|
121
|
+
|
122
122
|
WorkingPost.source = POSTS_HTML_STR
|
123
123
|
@fragments = WorkingPost.all_fragments
|
124
124
|
end
|
@@ -138,11 +138,11 @@ describe Graboid::Entity do
|
|
138
138
|
before(:each) do
|
139
139
|
class WorkingPost
|
140
140
|
include Graboid::Entity
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
141
|
+
selector '.post'
|
142
|
+
set :title
|
143
|
+
set :body
|
144
|
+
set :author
|
145
|
+
set :date, :selector => '.author', :processor => lambda {|frag| frag.text.match(/\((.*)\)/)[1] }
|
146
146
|
end
|
147
147
|
|
148
148
|
@instance = WorkingPost.extract_instance(POST_FRAGMENT)
|
@@ -167,11 +167,11 @@ describe Graboid::Entity do
|
|
167
167
|
before(:each) do
|
168
168
|
class WorkingPost
|
169
169
|
include Graboid::Entity
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
170
|
+
selector '.post'
|
171
|
+
set :title
|
172
|
+
set :body
|
173
|
+
set :author
|
174
|
+
set :date, :selector => '.author', :processor => lambda {|frag| frag.text.match(/\((.*)\)/)[1] }
|
175
175
|
end
|
176
176
|
|
177
177
|
WorkingPost.source = POSTS_HTML_STR
|
@@ -184,4 +184,44 @@ describe Graboid::Entity do
|
|
184
184
|
|
185
185
|
end
|
186
186
|
|
187
|
+
[:current_page, :max_pages].each do |m|
|
188
|
+
describe "##{m}" do
|
189
|
+
it "should be 0 by default" do
|
190
|
+
Post.send(m).should == 0
|
191
|
+
end
|
192
|
+
it "should be 3" do
|
193
|
+
Post.send("#{m}=",3)
|
194
|
+
Post.send(m).should == 3
|
195
|
+
end
|
196
|
+
end
|
197
|
+
end
|
198
|
+
|
199
|
+
describe "#pager" do
|
200
|
+
before(:each) do
|
201
|
+
|
202
|
+
class RedditEntry
|
203
|
+
include Graboid::Entity
|
204
|
+
|
205
|
+
selector '.entry'
|
206
|
+
|
207
|
+
set :title
|
208
|
+
set :domain, :selector => '.domain a'
|
209
|
+
set :link, :selector => '.title' do |entry|
|
210
|
+
entry.css('a').first['href']
|
211
|
+
end
|
212
|
+
|
213
|
+
pager do |doc|
|
214
|
+
doc.css('p.nextprev a').select{|a| a.text =~ /next/i }.first['href']
|
215
|
+
end
|
216
|
+
|
217
|
+
end
|
218
|
+
RedditEntry.source = 'http://reddit.com'
|
219
|
+
@posts = RedditEntry.all(:max_pages => 2)
|
220
|
+
end
|
221
|
+
it "should get 70 posts" do
|
222
|
+
@posts.length.should == 70
|
223
|
+
end
|
224
|
+
end
|
225
|
+
|
226
|
+
|
187
227
|
end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: graboid
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 19
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
|
-
- 2
|
9
|
-
- 1
|
10
|
-
version: 0.2.1
|
8
|
+
- 3
|
9
|
+
- 0
|
10
|
+
version: 0.3.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Christopher Burnett
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2010-06-
|
18
|
+
date: 2010-06-14 00:00:00 -07:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -71,6 +71,7 @@ extensions: []
|
|
71
71
|
extra_rdoc_files:
|
72
72
|
- LICENSE
|
73
73
|
- README.mdown
|
74
|
+
- README.mdown.orig
|
74
75
|
files:
|
75
76
|
- .document
|
76
77
|
- .gitignore
|
@@ -87,6 +88,7 @@ files:
|
|
87
88
|
- spec/graboid_spec.rb
|
88
89
|
- spec/spec.opts
|
89
90
|
- spec/spec_helper.rb
|
91
|
+
- README.mdown.orig
|
90
92
|
has_rdoc: true
|
91
93
|
homepage: http://github.com/twoism/graboid
|
92
94
|
licenses: []
|