graboid 0.2.1 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -7,78 +7,37 @@
7
7
  ### Installation ###
8
8
 
9
9
 
10
- gem install graboid
10
+ gem install nokogiri graboid
11
11
 
12
12
 
13
13
  ### Usage ###
14
14
 
15
+ %w{rubygems graboid}.each { |f| require f }
15
16
 
16
- ##### Simple Extraction with clean markup #####
17
-
18
- <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
19
- "http://www.w3.org/TR/html4/strict.dtd">
20
-
21
- <html lang="en">
22
- <head>
23
- <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
24
- <title>posts</title>
25
- <meta name="generator" content="TextMate http://macromates.com/">
26
- <meta name="author" content="Posterous">
27
- <!-- Date: 2010-06-10 -->
28
- </head>
29
- <body>
30
-
31
- <div class="post" id="1">
32
-
33
- <p class="title">Post 1</p>
34
-
35
- <p class="body">Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor
36
- incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation
37
- ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit
38
- in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat
39
- non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
40
- </p>
41
- <span class="author">Someone Awesome (06/11/2010)</span>
42
-
43
- </div>
17
+ class RedditEntry
18
+ include Graboid::Entity
44
19
 
45
- <div class="post" id="2">
20
+ selector '.entry'
46
21
 
47
- <p class="title">Post 2</p>
22
+ set :title
23
+ set :domain, :selector => '.domain a'
24
+ set :link, :selector => '.title' do |entry|
25
+ entry.css('a').first['href']
26
+ end
27
+
28
+ pager do |doc|
29
+ doc.css('p.nextprev a').select{|a| a.text =~ /next/i }.first['href']
30
+ end
48
31
 
49
- <p class="body">Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor
50
- incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation
51
- ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit
52
- in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat
53
- non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
54
- </p>
55
- <span class="author">Someone Awesome (06/11/2010)</span>
32
+ end
56
33
 
57
- </div>
34
+ RedditEntry.source = 'http://reddit.com'
58
35
 
59
- </body>
60
- </html>
61
-
62
- To extract the Posts use:
63
-
64
- class Post
65
- include Graboid::Entity
66
-
67
- field :title
68
- field :body
69
- field :author
70
- field :date, :selector => '.author', :processor => lambda {|frag| frag.text.match(/\((.*)\)/)[1] }
36
+ RedditEntry.all(:max_pages => 2).each do |p|
37
+ puts "title: #{p.title}"
38
+ puts "domain: #{p.domain}"
39
+ puts "link: #{p.link}"
71
40
  end
72
-
73
- Post.source = 'The HTML string or URL to the document'
74
-
75
- @post = Post.all.first
76
-
77
- puts @post.date
78
- => 06/11/2010
79
-
80
- puts @post.title
81
- => Post 1
82
41
 
83
42
  ## Note on Patches/Pull Requests
84
43
 
@@ -0,0 +1,61 @@
1
+ ### Graboid ###
2
+
3
+ ![Graboid](http://github.com/twoism/graboid/raw/master/spec/fixtures/graboid.jpg "Graboid")
4
+
5
+ Simply awesome web scraping. Better docs later. See specs.
6
+
7
+ ### Installation ###
8
+
9
+
10
+ gem install nokogiri graboid
11
+
12
+
13
+ ### Usage ###
14
+
15
+ %w{rubygems graboid}.each { |f| require f }
16
+
17
+ class RedditEntry
18
+ include Graboid::Entity
19
+
20
+ selector '.entry'
21
+ <<<<<<< HEAD
22
+
23
+ set :title
24
+ set :domain, :selector => '.domain a'
25
+ set :link, :selector => '.title' do |entry|
26
+ =======
27
+
28
+ field :title
29
+ field :domain, :selector => '.domain a'
30
+ field :link, :selector => '.title' do |entry|
31
+ >>>>>>> ea3c69202c1af78378fd4fb4b2d9ccd2098bc9d8
32
+ entry.css('a').first['href']
33
+ end
34
+
35
+ pager do |doc|
36
+ doc.css('p.nextprev a').select{|a| a.text =~ /next/i }.first['href']
37
+ end
38
+
39
+ end
40
+
41
+ RedditEntry.source = 'http://reddit.com'
42
+
43
+ RedditEntry.all(:max_pages => 2).each do |p|
44
+ puts "title: #{p.title}"
45
+ puts "domain: #{p.domain}"
46
+ puts "link: #{p.link}"
47
+ end
48
+
49
+ ## Note on Patches/Pull Requests
50
+
51
+ * Fork the project.
52
+ * Make your feature addition or bug fix.
53
+ * Add tests for it. This is important so I don't break it in a
54
+ future version unintentionally.
55
+ * Commit, do not mess with rakefile, version, or history.
56
+ (if you want to have your own version, that is fine, but bump the version in a separate commit that I can ignore when I pull)
57
+ * Send me a pull request. Bonus points for topic branches.
58
+
59
+ ## Copyright
60
+
61
+ Copyright (c) 2010 Christopher Burnett. See LICENSE for details.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.2.1
1
+ 0.3.0
@@ -5,16 +5,17 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{graboid}
8
- s.version = "0.2.1"
8
+ s.version = "0.3.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Christopher Burnett"]
12
- s.date = %q{2010-06-11}
12
+ s.date = %q{2010-06-14}
13
13
  s.description = %q{web scraping made easier}
14
14
  s.email = %q{signalstatic@gmail.com}
15
15
  s.extra_rdoc_files = [
16
16
  "LICENSE",
17
- "README.mdown"
17
+ "README.mdown",
18
+ "README.mdown.orig"
18
19
  ]
19
20
  s.files = [
20
21
  ".document",
@@ -14,22 +14,26 @@ module Graboid
14
14
  def source
15
15
  @source
16
16
  end
17
-
17
+
18
18
  def source=(src)
19
19
  @source = src
20
20
  end
21
21
 
22
- def field name, opts={}, &block
22
+ def set name, opts={}, &block
23
23
  opts.merge!(:selector => ".#{name}") unless opts[:selector].present?
24
24
  opts.merge!(:processor => block) if block_given?
25
25
 
26
26
  attribute_map[name] = opts
27
27
  end
28
28
 
29
- def root selector
29
+ alias_method :field, :set
30
+
31
+ def selector selector
30
32
  @root_selector = selector
31
33
  end
32
-
34
+
35
+ alias_method :root, :selector
36
+
33
37
  def root_selector
34
38
  @root_selector || inferred_selector
35
39
  end
@@ -53,8 +57,6 @@ module Graboid
53
57
  def hash_map fragment
54
58
  attribute_map.inject({}) do |extracted_hash, at|
55
59
  selector, processor = at.last[:selector], at.last[:processor]
56
-
57
-
58
60
  extracted_hash[at.first] = processor.nil? ? fragment.css(selector).first.text : processor.call(fragment.css(selector).first) rescue ""
59
61
 
60
62
  extracted_hash
@@ -62,22 +64,66 @@ module Graboid
62
64
  end
63
65
 
64
66
  def all_fragments
65
- doc.css root_selector
67
+ return page_fragments if @pager.nil?
68
+ old_source = self.source
69
+ @collection = []
70
+ while next_page?
71
+ @frags = page_fragments
72
+ @collection += @frags
73
+ paginate
74
+ end
75
+ self.source = old_source
76
+ @collection
77
+ end
78
+
79
+ def paginate
80
+ next_page_url = @pager.call(doc) rescue nil
81
+ self.source = next_page_url
82
+ self.current_page += 1
66
83
  end
67
84
 
68
- def all
85
+ def next_page?
86
+ (current_page <= max_pages-1)
87
+ end
88
+
89
+ def page_fragments
90
+ doc.css(root_selector)
91
+ end
92
+
93
+ def all opts={}
94
+ self.max_pages = opts[:max_pages] if opts[:max_pages].present?
69
95
  all_fragments.collect{ |frag| extract_instance(frag) }
70
96
  end
71
97
 
72
98
  def read_source
73
- case @source
99
+ case self.source
74
100
  when /^http:\/\//
75
- open @source
101
+ open self.source
76
102
  when String
77
- @source
103
+ self.source
78
104
  end
79
105
  end
80
106
 
107
+ def pager &block
108
+ @pager = block
109
+ end
110
+
111
+ def max_pages
112
+ @max_pages ||= 0
113
+ end
114
+
115
+ def max_pages=num
116
+ @max_pages = num
117
+ end
118
+
119
+ def current_page
120
+ @current_page ||= 0
121
+ end
122
+
123
+ def current_page=num
124
+ @current_page = num
125
+ end
126
+
81
127
  end # ClassMethods
82
128
 
83
129
  module InstanceMethods
@@ -3,7 +3,7 @@ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
3
3
  class Post
4
4
  include Graboid::Entity
5
5
 
6
- root '.post'
6
+ selector '.post'
7
7
  end
8
8
 
9
9
  describe Graboid::Entity do
@@ -12,7 +12,7 @@ describe Graboid::Entity do
12
12
  before(:each) do
13
13
  Post.source = 'http://foo.com/'
14
14
  end
15
-
15
+
16
16
  it "should set the source" do
17
17
  Post.source.should == 'http://foo.com/'
18
18
  end
@@ -65,11 +65,11 @@ describe Graboid::Entity do
65
65
 
66
66
  end
67
67
 
68
- describe "#field" do
68
+ describe "#set" do
69
69
  describe "simple syntax" do
70
70
 
71
71
  before(:each) do
72
- Post.field :body
72
+ Post.set :body
73
73
  end
74
74
 
75
75
  it "should be set in the attr map" do
@@ -83,7 +83,7 @@ describe Graboid::Entity do
83
83
 
84
84
  describe "custom selector syntax" do
85
85
  before(:each) do
86
- Post.field :body, :selector => '.custom'
86
+ Post.set :body, :selector => '.custom'
87
87
  end
88
88
 
89
89
  it "should set the selector" do
@@ -94,7 +94,7 @@ describe Graboid::Entity do
94
94
  describe "custom selector syntax with a lambda" do
95
95
 
96
96
  before(:each) do
97
- Post.field :body, :selector => '.custom' do |item|
97
+ Post.set :body, :selector => '.custom' do |item|
98
98
  "from lambda"
99
99
  end
100
100
  end
@@ -115,10 +115,10 @@ describe Graboid::Entity do
115
115
 
116
116
  class WorkingPost
117
117
  include Graboid::Entity
118
- root '.post'
119
- field :body
118
+ selector '.post'
119
+ set :body
120
120
  end
121
-
121
+
122
122
  WorkingPost.source = POSTS_HTML_STR
123
123
  @fragments = WorkingPost.all_fragments
124
124
  end
@@ -138,11 +138,11 @@ describe Graboid::Entity do
138
138
  before(:each) do
139
139
  class WorkingPost
140
140
  include Graboid::Entity
141
- root '.post'
142
- field :title
143
- field :body
144
- field :author
145
- field :date, :selector => '.author', :processor => lambda {|frag| frag.text.match(/\((.*)\)/)[1] }
141
+ selector '.post'
142
+ set :title
143
+ set :body
144
+ set :author
145
+ set :date, :selector => '.author', :processor => lambda {|frag| frag.text.match(/\((.*)\)/)[1] }
146
146
  end
147
147
 
148
148
  @instance = WorkingPost.extract_instance(POST_FRAGMENT)
@@ -167,11 +167,11 @@ describe Graboid::Entity do
167
167
  before(:each) do
168
168
  class WorkingPost
169
169
  include Graboid::Entity
170
- root '.post'
171
- field :title
172
- field :body
173
- field :author
174
- field :date, :selector => '.author', :processor => lambda {|frag| frag.text.match(/\((.*)\)/)[1] }
170
+ selector '.post'
171
+ set :title
172
+ set :body
173
+ set :author
174
+ set :date, :selector => '.author', :processor => lambda {|frag| frag.text.match(/\((.*)\)/)[1] }
175
175
  end
176
176
 
177
177
  WorkingPost.source = POSTS_HTML_STR
@@ -184,4 +184,44 @@ describe Graboid::Entity do
184
184
 
185
185
  end
186
186
 
187
+ [:current_page, :max_pages].each do |m|
188
+ describe "##{m}" do
189
+ it "should be 0 by default" do
190
+ Post.send(m).should == 0
191
+ end
192
+ it "should be 3" do
193
+ Post.send("#{m}=",3)
194
+ Post.send(m).should == 3
195
+ end
196
+ end
197
+ end
198
+
199
+ describe "#pager" do
200
+ before(:each) do
201
+
202
+ class RedditEntry
203
+ include Graboid::Entity
204
+
205
+ selector '.entry'
206
+
207
+ set :title
208
+ set :domain, :selector => '.domain a'
209
+ set :link, :selector => '.title' do |entry|
210
+ entry.css('a').first['href']
211
+ end
212
+
213
+ pager do |doc|
214
+ doc.css('p.nextprev a').select{|a| a.text =~ /next/i }.first['href']
215
+ end
216
+
217
+ end
218
+ RedditEntry.source = 'http://reddit.com'
219
+ @posts = RedditEntry.all(:max_pages => 2)
220
+ end
221
+ it "should get 70 posts" do
222
+ @posts.length.should == 70
223
+ end
224
+ end
225
+
226
+
187
227
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: graboid
3
3
  version: !ruby/object:Gem::Version
4
- hash: 21
4
+ hash: 19
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
- - 2
9
- - 1
10
- version: 0.2.1
8
+ - 3
9
+ - 0
10
+ version: 0.3.0
11
11
  platform: ruby
12
12
  authors:
13
13
  - Christopher Burnett
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-06-11 00:00:00 -07:00
18
+ date: 2010-06-14 00:00:00 -07:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
@@ -71,6 +71,7 @@ extensions: []
71
71
  extra_rdoc_files:
72
72
  - LICENSE
73
73
  - README.mdown
74
+ - README.mdown.orig
74
75
  files:
75
76
  - .document
76
77
  - .gitignore
@@ -87,6 +88,7 @@ files:
87
88
  - spec/graboid_spec.rb
88
89
  - spec/spec.opts
89
90
  - spec/spec_helper.rb
91
+ - README.mdown.orig
90
92
  has_rdoc: true
91
93
  homepage: http://github.com/twoism/graboid
92
94
  licenses: []