graboid 0.3.5 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- data/Rakefile +0 -1
- data/VERSION +1 -1
- data/graboid.gemspec +31 -38
- data/lib/graboid.rb +12 -2
- data/lib/graboid/entity.rb +63 -44
- data/lib/graboid/scraper.rb +28 -12
- metadata +10 -29
- data/.gitignore +0 -21
- data/examples/reddit_post.rb +0 -35
- data/examples/tumblr_post.rb +0 -33
data/Rakefile
CHANGED
@@ -12,7 +12,6 @@ begin
|
|
12
12
|
gem.authors = ["Christopher Burnett"]
|
13
13
|
gem.add_development_dependency "rspec", ">= 1.2.9"
|
14
14
|
gem.add_dependency "nokogiri"
|
15
|
-
gem.add_dependency "activesupport"
|
16
15
|
# gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
|
17
16
|
end
|
18
17
|
Jeweler::GemcutterTasks.new
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.3.5
|
1
|
+
0.4.0
|
data/graboid.gemspec
CHANGED
@@ -1,61 +1,57 @@
|
|
1
1
|
# Generated by jeweler
|
2
2
|
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
-
# Instead, edit Jeweler::Tasks in Rakefile, and run
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
4
|
# -*- encoding: utf-8 -*-
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{graboid}
|
8
|
-
s.version = "0.3.5"
|
8
|
+
s.version = "0.4.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Christopher Burnett"]
|
12
|
-
s.date = %q{
|
12
|
+
s.date = %q{2011-02-22}
|
13
13
|
s.description = %q{web scraping made easier}
|
14
14
|
s.email = %q{signalstatic@gmail.com}
|
15
15
|
s.extra_rdoc_files = [
|
16
16
|
"LICENSE",
|
17
|
-
|
17
|
+
"README.mdown"
|
18
18
|
]
|
19
19
|
s.files = [
|
20
20
|
".document",
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
"spec/spec_helper.rb"
|
21
|
+
"LICENSE",
|
22
|
+
"README.mdown",
|
23
|
+
"Rakefile",
|
24
|
+
"VERSION",
|
25
|
+
"examples/active_rain_post.rb",
|
26
|
+
"examples/live_journal_post.rb",
|
27
|
+
"examples/ning_post.rb",
|
28
|
+
"graboid.gemspec",
|
29
|
+
"lib/graboid.rb",
|
30
|
+
"lib/graboid/entity.rb",
|
31
|
+
"lib/graboid/scraper.rb",
|
32
|
+
"spec/fixtures/graboid.jpg",
|
33
|
+
"spec/fixtures/posts.html",
|
34
|
+
"spec/fixtures/server.rb",
|
35
|
+
"spec/fixtures/views/posts.erb",
|
36
|
+
"spec/graboid/entity_spec.rb",
|
37
|
+
"spec/graboid/scraper_spec.rb",
|
38
|
+
"spec/graboid_spec.rb",
|
39
|
+
"spec/spec.opts",
|
40
|
+
"spec/spec_helper.rb"
|
42
41
|
]
|
43
42
|
s.homepage = %q{http://github.com/twoism/graboid}
|
44
|
-
s.rdoc_options = ["--charset=UTF-8"]
|
45
43
|
s.require_paths = ["lib"]
|
46
44
|
s.rubygems_version = %q{1.3.7}
|
47
45
|
s.summary = %q{web scraping made easy}
|
48
46
|
s.test_files = [
|
47
|
+
"examples/active_rain_post.rb",
|
48
|
+
"examples/live_journal_post.rb",
|
49
|
+
"examples/ning_post.rb",
|
49
50
|
"spec/fixtures/server.rb",
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
"examples/active_rain_post.rb",
|
55
|
-
"examples/live_journal_post.rb",
|
56
|
-
"examples/ning_post.rb",
|
57
|
-
"examples/reddit_post.rb",
|
58
|
-
"examples/tumblr_post.rb"
|
51
|
+
"spec/graboid/entity_spec.rb",
|
52
|
+
"spec/graboid/scraper_spec.rb",
|
53
|
+
"spec/graboid_spec.rb",
|
54
|
+
"spec/spec_helper.rb"
|
59
55
|
]
|
60
56
|
|
61
57
|
if s.respond_to? :specification_version then
|
@@ -65,16 +61,13 @@ Gem::Specification.new do |s|
|
|
65
61
|
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
66
62
|
s.add_development_dependency(%q<rspec>, [">= 1.2.9"])
|
67
63
|
s.add_runtime_dependency(%q<nokogiri>, [">= 0"])
|
68
|
-
s.add_runtime_dependency(%q<activesupport>, [">= 0"])
|
69
64
|
else
|
70
65
|
s.add_dependency(%q<rspec>, [">= 1.2.9"])
|
71
66
|
s.add_dependency(%q<nokogiri>, [">= 0"])
|
72
|
-
s.add_dependency(%q<activesupport>, [">= 0"])
|
73
67
|
end
|
74
68
|
else
|
75
69
|
s.add_dependency(%q<rspec>, [">= 1.2.9"])
|
76
70
|
s.add_dependency(%q<nokogiri>, [">= 0"])
|
77
|
-
s.add_dependency(%q<activesupport>, [">= 0"])
|
78
71
|
end
|
79
72
|
end
|
80
73
|
|
data/lib/graboid.rb
CHANGED
@@ -1,10 +1,20 @@
|
|
1
|
-
%w{rubygems nokogiri open-uri
|
1
|
+
%w{rubygems nokogiri open-uri ostruct}.each { |f| require f }
|
2
2
|
|
3
3
|
dir = Pathname(__FILE__).dirname.expand_path
|
4
4
|
|
5
5
|
require dir + 'graboid/entity'
|
6
6
|
require dir + 'graboid/scraper'
|
7
7
|
|
8
|
+
class String
|
9
|
+
def underscore
|
10
|
+
self.to_s.gsub(/::/, '/').
|
11
|
+
gsub(/([A-Z]+)([A-Z][a-z])/,'\1_\2').
|
12
|
+
gsub(/([a-z\d])([A-Z])/,'\1_\2').
|
13
|
+
tr("-", "_").
|
14
|
+
downcase
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
8
18
|
module Graboid
|
9
19
|
extend self
|
10
20
|
|
@@ -15,4 +25,4 @@ module Graboid
|
|
15
25
|
def user_agent=(agent)
|
16
26
|
@user_agent = agent
|
17
27
|
end
|
18
|
-
end
|
28
|
+
end
|
data/lib/graboid/entity.rb
CHANGED
@@ -1,68 +1,87 @@
|
|
1
1
|
module Graboid
|
2
2
|
module Entity
|
3
|
-
|
3
|
+
|
4
4
|
def self.included klass
|
5
5
|
klass.class_eval do
|
6
6
|
extend ClassMethods
|
7
7
|
include InstanceMethods
|
8
|
-
|
9
|
-
|
8
|
+
|
9
|
+
inherited_attributes :attribute_map
|
10
|
+
@attribute_map = {}
|
10
11
|
end
|
11
12
|
end
|
12
|
-
|
13
|
+
|
13
14
|
module ClassMethods
|
14
|
-
|
15
|
+
|
16
|
+
def inherited_attributes(*args)
|
17
|
+
@inherited_attributes ||= [:inherited_attributes]
|
18
|
+
@inherited_attributes += args
|
19
|
+
args.each do |arg|
|
20
|
+
class_eval %(
|
21
|
+
class << self; attr_accessor :#{arg} end
|
22
|
+
)
|
23
|
+
end
|
24
|
+
@inherited_attributes
|
25
|
+
end
|
26
|
+
|
27
|
+
def inherited(subclass)
|
28
|
+
@inherited_attributes.each do |inheritable_attribute|
|
29
|
+
instance_var = "@#{inheritable_attribute}"
|
30
|
+
subclass.instance_variable_set(instance_var, instance_variable_get(instance_var))
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
15
34
|
def source
|
16
35
|
@source
|
17
36
|
end
|
18
|
-
|
37
|
+
|
19
38
|
def source=(src)
|
20
39
|
@source = src
|
21
40
|
end
|
22
|
-
|
41
|
+
|
23
42
|
def set name, opts={}, &block
|
24
|
-
opts.merge!(:selector => ".#{name}")
|
43
|
+
opts.merge!(:selector => ".#{name}") if opts[:selector].nil?
|
25
44
|
opts.merge!(:processor => block) if block_given?
|
26
|
-
|
45
|
+
|
27
46
|
attribute_map[name] = opts
|
28
47
|
end
|
29
|
-
|
48
|
+
|
30
49
|
alias_method :field, :set
|
31
|
-
|
50
|
+
|
32
51
|
def selector selector
|
33
52
|
@root_selector = selector
|
34
53
|
end
|
35
|
-
|
54
|
+
|
36
55
|
alias_method :root, :selector
|
37
56
|
|
38
57
|
def root_selector
|
39
58
|
@root_selector || inferred_selector
|
40
59
|
end
|
41
|
-
|
60
|
+
|
42
61
|
def inferred_selector
|
43
62
|
@inferred_selector ||= ".#{self.to_s.underscore}"
|
44
63
|
end
|
45
|
-
|
64
|
+
|
46
65
|
def doc
|
47
66
|
eval "Nokogiri::#{self.mode.to_s.upcase}(read_source)"
|
48
67
|
end
|
49
|
-
|
68
|
+
|
50
69
|
def collection
|
51
70
|
@collection ||= []
|
52
71
|
end
|
53
|
-
|
72
|
+
|
54
73
|
def collection=(col)
|
55
74
|
@collection = col
|
56
75
|
end
|
57
|
-
|
76
|
+
|
58
77
|
def attribute_map
|
59
78
|
read_inheritable_attribute :attribute_map
|
60
79
|
end
|
61
|
-
|
80
|
+
|
62
81
|
def extract_instance fragment
|
63
82
|
new(hash_map(fragment))
|
64
83
|
end
|
65
|
-
|
84
|
+
|
66
85
|
def hash_map fragment
|
67
86
|
attribute_map.inject({}) do |extracted_hash, at|
|
68
87
|
selector, processor = at.last[:selector], at.last[:processor]
|
@@ -72,7 +91,7 @@ module Graboid
|
|
72
91
|
extracted_hash
|
73
92
|
end
|
74
93
|
end
|
75
|
-
|
94
|
+
|
76
95
|
def all_fragments
|
77
96
|
return page_fragments if @pager.nil?
|
78
97
|
old_source = self.source
|
@@ -85,13 +104,13 @@ module Graboid
|
|
85
104
|
self.source = old_source
|
86
105
|
self.collection
|
87
106
|
end
|
88
|
-
|
107
|
+
|
89
108
|
def paginate
|
90
109
|
next_page_url = @pager.call(doc) rescue nil
|
91
110
|
self.source = next_page_url
|
92
111
|
self.current_page += 1
|
93
112
|
end
|
94
|
-
|
113
|
+
|
95
114
|
def next_page?
|
96
115
|
if max_pages.zero?
|
97
116
|
return true unless @pager.call(doc).nil?
|
@@ -99,23 +118,23 @@ module Graboid
|
|
99
118
|
current_page <= max_pages-1
|
100
119
|
end
|
101
120
|
end
|
102
|
-
|
121
|
+
|
103
122
|
def page_fragments
|
104
123
|
doc.css(root_selector)
|
105
124
|
end
|
106
|
-
|
125
|
+
|
107
126
|
def all opts={}
|
108
127
|
reset_context
|
109
|
-
self.max_pages = opts[:max_pages]
|
128
|
+
self.max_pages = opts[:max_pages] unless opts[:max_pages].nil?
|
110
129
|
all_fragments.collect{ |frag| extract_instance(frag) }
|
111
130
|
end
|
112
|
-
|
131
|
+
|
113
132
|
def reset_context
|
114
133
|
self.collection = []
|
115
134
|
self.current_page = 0
|
116
135
|
self.max_pages = 0
|
117
136
|
end
|
118
|
-
|
137
|
+
|
119
138
|
def read_source
|
120
139
|
case self.source
|
121
140
|
when /^http[s]?:\/\//
|
@@ -124,36 +143,36 @@ module Graboid
|
|
124
143
|
self.source
|
125
144
|
end
|
126
145
|
end
|
127
|
-
|
146
|
+
|
128
147
|
def pager &block
|
129
148
|
@pager = block
|
130
149
|
end
|
131
|
-
|
150
|
+
|
132
151
|
def mode
|
133
152
|
@mode ||= :html
|
134
153
|
end
|
135
|
-
|
154
|
+
|
136
155
|
def mode=(m)
|
137
156
|
raise ArgumentError unless [:html, :xml].include?(m)
|
138
157
|
@mode = m
|
139
158
|
end
|
140
|
-
|
159
|
+
|
141
160
|
def max_pages
|
142
161
|
@max_pages ||= 0
|
143
162
|
end
|
144
|
-
|
163
|
+
|
145
164
|
def max_pages=num
|
146
165
|
@max_pages = num
|
147
166
|
end
|
148
|
-
|
167
|
+
|
149
168
|
def current_page
|
150
169
|
@current_page ||= 0
|
151
170
|
end
|
152
|
-
|
171
|
+
|
153
172
|
def current_page=num
|
154
173
|
@current_page = num
|
155
174
|
end
|
156
|
-
|
175
|
+
|
157
176
|
instance_eval do
|
158
177
|
[:before, :after].each do |prefix|
|
159
178
|
[:paginate, :extract].each do |suffix|
|
@@ -163,29 +182,29 @@ module Graboid
|
|
163
182
|
end
|
164
183
|
define_method "run_#{method_name}_callbacks" do
|
165
184
|
ivar = instance_variable_get("@#{method_name}")
|
166
|
-
|
185
|
+
class_eval { ivar.call } unless ivar.nil?
|
167
186
|
end
|
168
187
|
end
|
169
|
-
end
|
170
|
-
end
|
171
|
-
|
188
|
+
end
|
189
|
+
end
|
190
|
+
|
172
191
|
end # ClassMethods
|
173
|
-
|
192
|
+
|
174
193
|
module InstanceMethods
|
175
|
-
|
194
|
+
|
176
195
|
def initialize opts={}
|
177
196
|
opts.each do |k,v|
|
178
|
-
self.class_eval do
|
197
|
+
self.class.class_eval do
|
179
198
|
define_method k do
|
180
199
|
v
|
181
200
|
end
|
182
201
|
end
|
183
202
|
end
|
184
203
|
end
|
185
|
-
|
204
|
+
|
186
205
|
def attribute_map
|
187
206
|
self.class.attribute_map
|
188
207
|
end
|
189
208
|
end # InstanceMethods
|
190
209
|
end
|
191
|
-
end
|
210
|
+
end
|
data/lib/graboid/scraper.rb
CHANGED
@@ -1,19 +1,35 @@
|
|
1
1
|
module Graboid
|
2
|
+
|
2
3
|
module Scraper
|
3
4
|
def self.included klass
|
4
5
|
klass.class_eval do
|
5
6
|
extend ClassMethods
|
6
7
|
include InstanceMethods
|
7
|
-
|
8
|
-
|
9
|
-
|
8
|
+
|
9
|
+
inherited_attributes :attribute_map, :callbacks
|
10
|
+
@attribute_map = {}
|
11
|
+
@callbacks = {}
|
10
12
|
end
|
11
13
|
end
|
12
|
-
|
14
|
+
|
13
15
|
module ClassMethods
|
14
|
-
|
15
|
-
def
|
16
|
-
|
16
|
+
|
17
|
+
def inherited_attributes(*args)
|
18
|
+
@inherited_attributes ||= [:inherited_attributes]
|
19
|
+
@inherited_attributes += args
|
20
|
+
args.each do |arg|
|
21
|
+
class_eval %(
|
22
|
+
class << self; attr_accessor :#{arg} end
|
23
|
+
)
|
24
|
+
end
|
25
|
+
@inherited_attributes
|
26
|
+
end
|
27
|
+
|
28
|
+
def inherited(subclass)
|
29
|
+
@inherited_attributes.each do |inheritable_attribute|
|
30
|
+
instance_var = "@#{inheritable_attribute}"
|
31
|
+
subclass.instance_variable_set(instance_var, instance_variable_get(instance_var))
|
32
|
+
end
|
17
33
|
end
|
18
34
|
|
19
35
|
def callbacks
|
@@ -41,7 +57,7 @@ module Graboid
|
|
41
57
|
alias_method :root, :selector
|
42
58
|
|
43
59
|
def set name, opts={}, &block
|
44
|
-
opts.merge!(:selector => ".#{name}")
|
60
|
+
opts.merge!(:selector => ".#{name}") if opts[:selector].nil?
|
45
61
|
opts.merge!(:processor => block) if block_given?
|
46
62
|
|
47
63
|
attribute_map[name] = opts
|
@@ -60,14 +76,14 @@ module Graboid
|
|
60
76
|
|
61
77
|
module InstanceMethods
|
62
78
|
def initialize opts={}, &block
|
63
|
-
raise ArgumentError
|
79
|
+
raise ArgumentError if opts[:source].nil?
|
64
80
|
self.source = opts[:source]
|
65
81
|
end
|
66
82
|
|
67
83
|
def all opts={}, reload=false
|
68
84
|
return self.collection if reload and !self.collection.empty?
|
69
85
|
reset_context
|
70
|
-
self.max_pages = opts[:max_pages]
|
86
|
+
self.max_pages = opts[:max_pages] unless opts[:max_pages].nil?
|
71
87
|
all_fragments.collect{ |frag| extract_instance(frag) }
|
72
88
|
end
|
73
89
|
|
@@ -202,11 +218,11 @@ module Graboid
|
|
202
218
|
[:paginate, :extract].each do |suffix|
|
203
219
|
method_name = "#{prefix}_#{suffix}"
|
204
220
|
define_method "run_#{method_name}_callbacks" do
|
205
|
-
self.instance_eval &callbacks[method_name.to_sym]
|
221
|
+
self.instance_eval &callbacks[method_name.to_sym] unless callbacks[method_name.to_sym].nil?
|
206
222
|
end
|
207
223
|
end
|
208
224
|
end
|
209
225
|
|
210
226
|
end
|
211
227
|
end
|
212
|
-
end
|
228
|
+
end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: graboid
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 15
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
|
-
-
|
9
|
-
-
|
10
|
-
version: 0.3.5
|
8
|
+
- 4
|
9
|
+
- 0
|
10
|
+
version: 0.4.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Christopher Burnett
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date:
|
18
|
+
date: 2011-02-22 00:00:00 -08:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -48,20 +48,6 @@ dependencies:
|
|
48
48
|
version: "0"
|
49
49
|
type: :runtime
|
50
50
|
version_requirements: *id002
|
51
|
-
- !ruby/object:Gem::Dependency
|
52
|
-
name: activesupport
|
53
|
-
prerelease: false
|
54
|
-
requirement: &id003 !ruby/object:Gem::Requirement
|
55
|
-
none: false
|
56
|
-
requirements:
|
57
|
-
- - ">="
|
58
|
-
- !ruby/object:Gem::Version
|
59
|
-
hash: 3
|
60
|
-
segments:
|
61
|
-
- 0
|
62
|
-
version: "0"
|
63
|
-
type: :runtime
|
64
|
-
version_requirements: *id003
|
65
51
|
description: web scraping made easier
|
66
52
|
email: signalstatic@gmail.com
|
67
53
|
executables: []
|
@@ -73,7 +59,6 @@ extra_rdoc_files:
|
|
73
59
|
- README.mdown
|
74
60
|
files:
|
75
61
|
- .document
|
76
|
-
- .gitignore
|
77
62
|
- LICENSE
|
78
63
|
- README.mdown
|
79
64
|
- Rakefile
|
@@ -94,15 +79,13 @@ files:
|
|
94
79
|
- spec/graboid_spec.rb
|
95
80
|
- spec/spec.opts
|
96
81
|
- spec/spec_helper.rb
|
97
|
-
- examples/reddit_post.rb
|
98
|
-
- examples/tumblr_post.rb
|
99
82
|
has_rdoc: true
|
100
83
|
homepage: http://github.com/twoism/graboid
|
101
84
|
licenses: []
|
102
85
|
|
103
86
|
post_install_message:
|
104
|
-
rdoc_options:
|
105
|
-
|
87
|
+
rdoc_options: []
|
88
|
+
|
106
89
|
require_paths:
|
107
90
|
- lib
|
108
91
|
required_ruby_version: !ruby/object:Gem::Requirement
|
@@ -131,13 +114,11 @@ signing_key:
|
|
131
114
|
specification_version: 3
|
132
115
|
summary: web scraping made easy
|
133
116
|
test_files:
|
117
|
+
- examples/active_rain_post.rb
|
118
|
+
- examples/live_journal_post.rb
|
119
|
+
- examples/ning_post.rb
|
134
120
|
- spec/fixtures/server.rb
|
135
121
|
- spec/graboid/entity_spec.rb
|
136
122
|
- spec/graboid/scraper_spec.rb
|
137
123
|
- spec/graboid_spec.rb
|
138
124
|
- spec/spec_helper.rb
|
139
|
-
- examples/active_rain_post.rb
|
140
|
-
- examples/live_journal_post.rb
|
141
|
-
- examples/ning_post.rb
|
142
|
-
- examples/reddit_post.rb
|
143
|
-
- examples/tumblr_post.rb
|
data/.gitignore
DELETED
data/examples/reddit_post.rb
DELETED
@@ -1,35 +0,0 @@
|
|
1
|
-
dir = File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib'))
|
2
|
-
require File.join(dir, 'graboid')
|
3
|
-
|
4
|
-
class RedditEntry
|
5
|
-
include Graboid::Scraper
|
6
|
-
|
7
|
-
selector '.entry'
|
8
|
-
|
9
|
-
set :title
|
10
|
-
set :domain, :selector => '.domain a'
|
11
|
-
|
12
|
-
set :link, :selector => '.title' do |entry|
|
13
|
-
entry.css('a').first['href']
|
14
|
-
end
|
15
|
-
|
16
|
-
page_with do |doc|
|
17
|
-
self.doc.css('p.nextprev a').select{|a| a.text =~ /next/i }.first['href']
|
18
|
-
end
|
19
|
-
|
20
|
-
before_paginate do
|
21
|
-
puts "opening page: #{self.source}"
|
22
|
-
puts "collection size: #{self.collection.length}"
|
23
|
-
puts "#{"*"*100}"
|
24
|
-
end
|
25
|
-
|
26
|
-
end
|
27
|
-
|
28
|
-
@posts = RedditEntry.new( :source => 'http://reddit.com' ).all( :max_pages => 3 )
|
29
|
-
|
30
|
-
@posts.each do |p|
|
31
|
-
puts "title: #{p.title}"
|
32
|
-
puts "domain: #{p.domain}"
|
33
|
-
puts "link: #{p.link}"
|
34
|
-
puts "#{"*"*100}"
|
35
|
-
end
|
data/examples/tumblr_post.rb
DELETED
@@ -1,33 +0,0 @@
|
|
1
|
-
dir = File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib'))
|
2
|
-
require File.join(dir, 'graboid')
|
3
|
-
|
4
|
-
class TumblrEntry
|
5
|
-
include Graboid::Scraper
|
6
|
-
TUMBLR_CHUNK_SIZE = 20
|
7
|
-
|
8
|
-
selector 'post'
|
9
|
-
|
10
|
-
set :title, :selector => 'regular-title'
|
11
|
-
|
12
|
-
page_with do |doc|
|
13
|
-
next_tumblr_page
|
14
|
-
end
|
15
|
-
|
16
|
-
def next_tumblr_page
|
17
|
-
return nil if self.doc.css('post').empty?
|
18
|
-
"#{self.original_source}?start=#{self.current_page*TUMBLR_CHUNK_SIZE}"
|
19
|
-
end
|
20
|
-
|
21
|
-
before_paginate do
|
22
|
-
puts "opening page: #{self.source}"
|
23
|
-
puts "collection size: #{self.collection.length}"
|
24
|
-
puts "#{"*"*100}"
|
25
|
-
end
|
26
|
-
|
27
|
-
end
|
28
|
-
|
29
|
-
@posts = TumblrEntry.new( :source => 'http://chrisburnett.tumblr.com/api/read' ).all
|
30
|
-
|
31
|
-
@posts.each do |p|
|
32
|
-
puts "title: #{p.title}"
|
33
|
-
end
|