graboid 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,78 +7,37 @@
7
7
  ### Installation ###
8
8
 
9
9
 
10
- gem install graboid
10
+ gem install nokogiri graboid
11
11
 
12
12
 
13
13
  ### Usage ###
14
14
 
15
+ %w{rubygems graboid}.each { |f| require f }
15
16
 
16
- ##### Simple Extraction with clean markup #####
17
-
18
- <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
19
- "http://www.w3.org/TR/html4/strict.dtd">
20
-
21
- <html lang="en">
22
- <head>
23
- <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
24
- <title>posts</title>
25
- <meta name="generator" content="TextMate http://macromates.com/">
26
- <meta name="author" content="Posterous">
27
- <!-- Date: 2010-06-10 -->
28
- </head>
29
- <body>
30
-
31
- <div class="post" id="1">
32
-
33
- <p class="title">Post 1</p>
34
-
35
- <p class="body">Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor
36
- incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation
37
- ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit
38
- in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat
39
- non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
40
- </p>
41
- <span class="author">Someone Awesome (06/11/2010)</span>
42
-
43
- </div>
17
+ class RedditEntry
18
+ include Graboid::Entity
44
19
 
45
- <div class="post" id="2">
20
+ selector '.entry'
46
21
 
47
- <p class="title">Post 2</p>
22
+ set :title
23
+ set :domain, :selector => '.domain a'
24
+ set :link, :selector => '.title' do |entry|
25
+ entry.css('a').first['href']
26
+ end
27
+
28
+ pager do |doc|
29
+ doc.css('p.nextprev a').select{|a| a.text =~ /next/i }.first['href']
30
+ end
48
31
 
49
- <p class="body">Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor
50
- incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation
51
- ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit
52
- in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat
53
- non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
54
- </p>
55
- <span class="author">Someone Awesome (06/11/2010)</span>
32
+ end
56
33
 
57
- </div>
34
+ RedditEntry.source = 'http://reddit.com'
58
35
 
59
- </body>
60
- </html>
61
-
62
- To extract the Posts use:
63
-
64
- class Post
65
- include Graboid::Entity
66
-
67
- field :title
68
- field :body
69
- field :author
70
- field :date, :selector => '.author', :processor => lambda {|frag| frag.text.match(/\((.*)\)/)[1] }
36
+ RedditEntry.all(:max_pages => 2).each do |p|
37
+ puts "title: #{p.title}"
38
+ puts "domain: #{p.domain}"
39
+ puts "link: #{p.link}"
71
40
  end
72
-
73
- Post.source = 'The HTML string or URL to the document'
74
-
75
- @post = Post.all.first
76
-
77
- puts @post.date
78
- => 06/11/2010
79
-
80
- puts @post.title
81
- => Post 1
82
41
 
83
42
  ##Note on Patches/Pull Requests
84
43
 
@@ -0,0 +1,61 @@
1
+ ### Graboid ###
2
+
3
+ ![Graboid](http://github.com/twoism/graboid/raw/master/spec/fixtures/graboid.jpg "Graboid")
4
+
5
+ Simply awesome web scraping. Better docs later. See specs.
6
+
7
+ ### Installation ###
8
+
9
+
10
+ gem install nokogiri graboid
11
+
12
+
13
+ ### Usage ###
14
+
15
+ %w{rubygems graboid}.each { |f| require f }
16
+
17
+ class RedditEntry
18
+ include Graboid::Entity
19
+
20
+ selector '.entry'
21
+ <<<<<<< HEAD
22
+
23
+ set :title
24
+ set :domain, :selector => '.domain a'
25
+ set :link, :selector => '.title' do |entry|
26
+ =======
27
+
28
+ field :title
29
+ field :domain, :selector => '.domain a'
30
+ field :link, :selector => '.title' do |entry|
31
+ >>>>>>> ea3c69202c1af78378fd4fb4b2d9ccd2098bc9d8
32
+ entry.css('a').first['href']
33
+ end
34
+
35
+ pager do |doc|
36
+ doc.css('p.nextprev a').select{|a| a.text =~ /next/i }.first['href']
37
+ end
38
+
39
+ end
40
+
41
+ RedditEntry.source = 'http://reddit.com'
42
+
43
+ RedditEntry.all(:max_pages => 2).each do |p|
44
+ puts "title: #{p.title}"
45
+ puts "domain: #{p.domain}"
46
+ puts "link: #{p.link}"
47
+ end
48
+
49
+ ##Note on Patches/Pull Requests
50
+
51
+ * Fork the project.
52
+ * Make your feature addition or bug fix.
53
+ * Add tests for it. This is important so I don't break it in a
54
+ future version unintentionally.
55
+ * Commit, do not mess with rakefile, version, or history.
56
+ (if you want to have your own version, that is fine but bump version in a commit by itself I can ignore when I pull)
57
+ * Send me a pull request. Bonus points for topic branches.
58
+
59
+ ## Copyright
60
+
61
+ Copyright (c) 2010 Christopher Burnett. See LICENSE for details.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.2.1
1
+ 0.3.0
@@ -5,16 +5,17 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{graboid}
8
- s.version = "0.2.1"
8
+ s.version = "0.3.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Christopher Burnett"]
12
- s.date = %q{2010-06-11}
12
+ s.date = %q{2010-06-14}
13
13
  s.description = %q{web scraping made easier}
14
14
  s.email = %q{signalstatic@gmail.com}
15
15
  s.extra_rdoc_files = [
16
16
  "LICENSE",
17
- "README.mdown"
17
+ "README.mdown",
18
+ "README.mdown.orig"
18
19
  ]
19
20
  s.files = [
20
21
  ".document",
@@ -14,22 +14,26 @@ module Graboid
14
14
  def source
15
15
  @source
16
16
  end
17
-
17
+
18
18
  def source=(src)
19
19
  @source = src
20
20
  end
21
21
 
22
- def field name, opts={}, &block
22
+ def set name, opts={}, &block
23
23
  opts.merge!(:selector => ".#{name}") unless opts[:selector].present?
24
24
  opts.merge!(:processor => block) if block_given?
25
25
 
26
26
  attribute_map[name] = opts
27
27
  end
28
28
 
29
- def root selector
29
+ alias_method :field, :set
30
+
31
+ def selector selector
30
32
  @root_selector = selector
31
33
  end
32
-
34
+
35
+ alias_method :root, :selector
36
+
33
37
  def root_selector
34
38
  @root_selector || inferred_selector
35
39
  end
@@ -53,8 +57,6 @@ module Graboid
53
57
  def hash_map fragment
54
58
  attribute_map.inject({}) do |extracted_hash, at|
55
59
  selector, processor = at.last[:selector], at.last[:processor]
56
-
57
-
58
60
  extracted_hash[at.first] = processor.nil? ? fragment.css(selector).first.text : processor.call(fragment.css(selector).first) rescue ""
59
61
 
60
62
  extracted_hash
@@ -62,22 +64,66 @@ module Graboid
62
64
  end
63
65
 
64
66
  def all_fragments
65
- doc.css root_selector
67
+ return page_fragments if @pager.nil?
68
+ old_source = self.source
69
+ @collection = []
70
+ while next_page?
71
+ @frags = page_fragments
72
+ @collection += @frags
73
+ paginate
74
+ end
75
+ self.source = old_source
76
+ @collection
77
+ end
78
+
79
+ def paginate
80
+ next_page_url = @pager.call(doc) rescue nil
81
+ self.source = next_page_url
82
+ self.current_page += 1
66
83
  end
67
84
 
68
- def all
85
+ def next_page?
86
+ (current_page <= max_pages-1)
87
+ end
88
+
89
+ def page_fragments
90
+ doc.css(root_selector)
91
+ end
92
+
93
+ def all opts={}
94
+ self.max_pages = opts[:max_pages] if opts[:max_pages].present?
69
95
  all_fragments.collect{ |frag| extract_instance(frag) }
70
96
  end
71
97
 
72
98
  def read_source
73
- case @source
99
+ case self.source
74
100
  when /^http:\/\//
75
- open @source
101
+ open self.source
76
102
  when String
77
- @source
103
+ self.source
78
104
  end
79
105
  end
80
106
 
107
+ def pager &block
108
+ @pager = block
109
+ end
110
+
111
+ def max_pages
112
+ @max_pages ||= 0
113
+ end
114
+
115
+ def max_pages=num
116
+ @max_pages = num
117
+ end
118
+
119
+ def current_page
120
+ @current_page ||= 0
121
+ end
122
+
123
+ def current_page=num
124
+ @current_page = num
125
+ end
126
+
81
127
  end # ClassMethods
82
128
 
83
129
  module InstanceMethods
@@ -3,7 +3,7 @@ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
3
3
  class Post
4
4
  include Graboid::Entity
5
5
 
6
- root '.post'
6
+ selector '.post'
7
7
  end
8
8
 
9
9
  describe Graboid::Entity do
@@ -12,7 +12,7 @@ describe Graboid::Entity do
12
12
  before(:each) do
13
13
  Post.source = 'http://foo.com/'
14
14
  end
15
-
15
+
16
16
  it "should set the source" do
17
17
  Post.source.should == 'http://foo.com/'
18
18
  end
@@ -65,11 +65,11 @@ describe Graboid::Entity do
65
65
 
66
66
  end
67
67
 
68
- describe "#field" do
68
+ describe "#set" do
69
69
  describe "simple syntax" do
70
70
 
71
71
  before(:each) do
72
- Post.field :body
72
+ Post.set :body
73
73
  end
74
74
 
75
75
  it "should be set in the attr map" do
@@ -83,7 +83,7 @@ describe Graboid::Entity do
83
83
 
84
84
  describe "custom selector syntax" do
85
85
  before(:each) do
86
- Post.field :body, :selector => '.custom'
86
+ Post.set :body, :selector => '.custom'
87
87
  end
88
88
 
89
89
  it "should set the selector" do
@@ -94,7 +94,7 @@ describe Graboid::Entity do
94
94
  describe "custom selector syntax with a lambda" do
95
95
 
96
96
  before(:each) do
97
- Post.field :body, :selector => '.custom' do |item|
97
+ Post.set :body, :selector => '.custom' do |item|
98
98
  "from lambda"
99
99
  end
100
100
  end
@@ -115,10 +115,10 @@ describe Graboid::Entity do
115
115
 
116
116
  class WorkingPost
117
117
  include Graboid::Entity
118
- root '.post'
119
- field :body
118
+ selector '.post'
119
+ set :body
120
120
  end
121
-
121
+
122
122
  WorkingPost.source = POSTS_HTML_STR
123
123
  @fragments = WorkingPost.all_fragments
124
124
  end
@@ -138,11 +138,11 @@ describe Graboid::Entity do
138
138
  before(:each) do
139
139
  class WorkingPost
140
140
  include Graboid::Entity
141
- root '.post'
142
- field :title
143
- field :body
144
- field :author
145
- field :date, :selector => '.author', :processor => lambda {|frag| frag.text.match(/\((.*)\)/)[1] }
141
+ selector '.post'
142
+ set :title
143
+ set :body
144
+ set :author
145
+ set :date, :selector => '.author', :processor => lambda {|frag| frag.text.match(/\((.*)\)/)[1] }
146
146
  end
147
147
 
148
148
  @instance = WorkingPost.extract_instance(POST_FRAGMENT)
@@ -167,11 +167,11 @@ describe Graboid::Entity do
167
167
  before(:each) do
168
168
  class WorkingPost
169
169
  include Graboid::Entity
170
- root '.post'
171
- field :title
172
- field :body
173
- field :author
174
- field :date, :selector => '.author', :processor => lambda {|frag| frag.text.match(/\((.*)\)/)[1] }
170
+ selector '.post'
171
+ set :title
172
+ set :body
173
+ set :author
174
+ set :date, :selector => '.author', :processor => lambda {|frag| frag.text.match(/\((.*)\)/)[1] }
175
175
  end
176
176
 
177
177
  WorkingPost.source = POSTS_HTML_STR
@@ -184,4 +184,44 @@ describe Graboid::Entity do
184
184
 
185
185
  end
186
186
 
187
+ [:current_page, :max_pages].each do |m|
188
+ describe "##{m}" do
189
+ it "should be 0 by default" do
190
+ Post.send(m).should == 0
191
+ end
192
+ it "should be 3" do
193
+ Post.send("#{m}=",3)
194
+ Post.send(m).should == 3
195
+ end
196
+ end
197
+ end
198
+
199
+ describe "#pager" do
200
+ before(:each) do
201
+
202
+ class RedditEntry
203
+ include Graboid::Entity
204
+
205
+ selector '.entry'
206
+
207
+ set :title
208
+ set :domain, :selector => '.domain a'
209
+ set :link, :selector => '.title' do |entry|
210
+ entry.css('a').first['href']
211
+ end
212
+
213
+ pager do |doc|
214
+ doc.css('p.nextprev a').select{|a| a.text =~ /next/i }.first['href']
215
+ end
216
+
217
+ end
218
+ RedditEntry.source = 'http://reddit.com'
219
+ @posts = RedditEntry.all(:max_pages => 2)
220
+ end
221
+ it "should get 70 posts" do
222
+ @posts.length.should == 70
223
+ end
224
+ end
225
+
226
+
187
227
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: graboid
3
3
  version: !ruby/object:Gem::Version
4
- hash: 21
4
+ hash: 19
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
- - 2
9
- - 1
10
- version: 0.2.1
8
+ - 3
9
+ - 0
10
+ version: 0.3.0
11
11
  platform: ruby
12
12
  authors:
13
13
  - Christopher Burnett
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-06-11 00:00:00 -07:00
18
+ date: 2010-06-14 00:00:00 -07:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
@@ -71,6 +71,7 @@ extensions: []
71
71
  extra_rdoc_files:
72
72
  - LICENSE
73
73
  - README.mdown
74
+ - README.mdown.orig
74
75
  files:
75
76
  - .document
76
77
  - .gitignore
@@ -87,6 +88,7 @@ files:
87
88
  - spec/graboid_spec.rb
88
89
  - spec/spec.opts
89
90
  - spec/spec_helper.rb
91
+ - README.mdown.orig
90
92
  has_rdoc: true
91
93
  homepage: http://github.com/twoism/graboid
92
94
  licenses: []