graboid 0.3.2 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.mdown +10 -2
- data/VERSION +1 -1
- data/examples/active_rain_post.rb +36 -0
- data/examples/live_journal_post.rb +38 -0
- data/examples/ning_post.rb +56 -0
- data/graboid.gemspec +9 -3
- data/lib/graboid.rb +5 -1
- data/lib/graboid/entity.rb +1 -1
- data/spec/spec_helper.rb +1 -3
- metadata +10 -4
data/README.mdown
CHANGED
@@ -21,22 +21,30 @@
|
|
21
21
|
|
22
22
|
set :title
|
23
23
|
set :domain, :selector => '.domain a'
|
24
|
+
|
24
25
|
set :link, :selector => '.title' do |entry|
|
25
26
|
entry.css('a').first['href']
|
26
27
|
end
|
27
|
-
|
28
|
+
|
28
29
|
pager do |doc|
|
29
30
|
doc.css('p.nextprev a').select{|a| a.text =~ /next/i }.first['href']
|
30
31
|
end
|
31
32
|
|
33
|
+
before_paginate do
|
34
|
+
puts "opening page: #{self.source}"
|
35
|
+
puts "collection size: #{self.collection.length}"
|
36
|
+
puts "#{"*"*100}"
|
37
|
+
end
|
38
|
+
|
32
39
|
end
|
33
40
|
|
34
41
|
RedditEntry.source = 'http://reddit.com'
|
35
42
|
|
36
|
-
RedditEntry.all(:max_pages =>
|
43
|
+
RedditEntry.all(:max_pages => 5).each do |p|
|
37
44
|
puts "title: #{p.title}"
|
38
45
|
puts "domain: #{p.domain}"
|
39
46
|
puts "link: #{p.link}"
|
47
|
+
puts "#{"*"*100}"
|
40
48
|
end
|
41
49
|
|
42
50
|
##Note on Patches/Pull Requests
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.3.2
|
1
|
+
0.3.3
|
data/examples/active_rain_post.rb
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
dir = File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib'))
|
2
|
+
require File.join(dir, 'graboid')
|
3
|
+
|
4
|
+
class ActiveRainPost
|
5
|
+
include Graboid::Entity
|
6
|
+
|
7
|
+
root '.blog_entry'
|
8
|
+
|
9
|
+
field :title, :selector => 'h2'
|
10
|
+
|
11
|
+
field :body, :selector => 'div' do |elm|
|
12
|
+
elm.css('p').collect(&:to_html)
|
13
|
+
end
|
14
|
+
|
15
|
+
pager do |doc|
|
16
|
+
"http://activerain.com" + doc.css('.pagination a').select{|a| a.text =~ /Next/i }.first['href'] rescue nil
|
17
|
+
end
|
18
|
+
|
19
|
+
before_paginate do
|
20
|
+
# logging for fun
|
21
|
+
puts "opening page: #{self.source}"
|
22
|
+
puts "collection size: #{self.collection.length}"
|
23
|
+
puts "*"*100
|
24
|
+
end
|
25
|
+
|
26
|
+
end
|
27
|
+
|
28
|
+
ActiveRainPost.source = 'http://activerain.com/blogs/elizabethweintraub'
|
29
|
+
@posts = ActiveRainPost.all
|
30
|
+
|
31
|
+
@posts.each do |post|
|
32
|
+
puts "#{post.title}"
|
33
|
+
puts "*"*100
|
34
|
+
end
|
35
|
+
|
36
|
+
puts "total: #{@posts.length}"
|
data/examples/live_journal_post.rb
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
dir = File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib'))
|
2
|
+
require File.join(dir, 'graboid')
|
3
|
+
|
4
|
+
class LiveJournalPost
|
5
|
+
include Graboid::Entity
|
6
|
+
|
7
|
+
root '.entrybox'
|
8
|
+
|
9
|
+
field :title, :selector => '.caption a'
|
10
|
+
field :body, :selector => 'td[@colspan="2"]'
|
11
|
+
field :comment_link, :selector => '.caption a' do |elm|
|
12
|
+
elm['href']
|
13
|
+
end
|
14
|
+
|
15
|
+
pager do |doc|
|
16
|
+
doc.css('a').select{|a| a.text =~ /earlier/i }.first['href'] rescue nil
|
17
|
+
end
|
18
|
+
|
19
|
+
before_paginate do
|
20
|
+
# logging for fun
|
21
|
+
puts "opening page: #{self.source}"
|
22
|
+
puts "collection size: #{self.collection.length}"
|
23
|
+
puts "*"*100
|
24
|
+
end
|
25
|
+
|
26
|
+
end
|
27
|
+
|
28
|
+
LiveJournalPost.source = 'http://zeroplate.livejournal.com/'
|
29
|
+
@posts = LiveJournalPost.all(:max_pages => 3)
|
30
|
+
|
31
|
+
@posts.each do |post|
|
32
|
+
puts "#{post.title}"
|
33
|
+
puts "#{post.comment_link}"
|
34
|
+
puts "#{post.body}"
|
35
|
+
puts "*"*100
|
36
|
+
end
|
37
|
+
|
38
|
+
puts "total: #{@posts.length}"
|
data/examples/ning_post.rb
ADDED
@@ -0,0 +1,56 @@
|
|
1
|
+
%w{rubygems graboid}.each {|f| require f }
|
2
|
+
|
3
|
+
class NingPost
|
4
|
+
include Graboid::Entity
|
5
|
+
|
6
|
+
selector 'div.xg_blog .xg_module_body'
|
7
|
+
|
8
|
+
set :title do |elm|
|
9
|
+
elm.text.match(/^\s*(.*)$\s*/).captures.first
|
10
|
+
end
|
11
|
+
|
12
|
+
set :pub_date, :selector => 'p[class=small]' do |elm|
|
13
|
+
elm.text.match(/on (.* \d+, \d{4})/)[1]
|
14
|
+
end
|
15
|
+
|
16
|
+
set :comment_link, :selector => 'p[class=small]' do |elm|
|
17
|
+
elm.css('a').select {|n| n['href'] =~ /comments/ }.first['href'] rescue nil
|
18
|
+
end
|
19
|
+
|
20
|
+
set :link, :selector => '.title' do |elm|
|
21
|
+
elm.css('a').last["href"]
|
22
|
+
end
|
23
|
+
|
24
|
+
set :body, :selector => '.title' do |elm|
|
25
|
+
# ning's list page only has an excerpt of the body. No biggie,
|
26
|
+
# we'll just go grab it.
|
27
|
+
show_url = elm.css('a').last["href"]
|
28
|
+
Nokogiri::HTML(open(show_url)).css('.postbody').to_html
|
29
|
+
end
|
30
|
+
|
31
|
+
pager do |doc|
|
32
|
+
doc.css('.pagination a').select{|a| a.text =~ /previous/i }.first['href'] rescue nil
|
33
|
+
end
|
34
|
+
|
35
|
+
before_paginate do
|
36
|
+
# clearing empty rows. ning has shit markup
|
37
|
+
# and very few relevant class names.
|
38
|
+
self.collection.delete_if {|post| post.css('h3').length == 0 }
|
39
|
+
|
40
|
+
# logging for fun
|
41
|
+
puts "opening page: #{self.source}"
|
42
|
+
puts "collection size: #{self.collection.length}"
|
43
|
+
puts "*"*100
|
44
|
+
end
|
45
|
+
|
46
|
+
end
|
47
|
+
|
48
|
+
NingPost.source = 'http://cuwebd.ning.com/profiles/blog/list'
|
49
|
+
@posts = NingPost.all(:max_pages => 1)
|
50
|
+
|
51
|
+
@posts.each do |post|
|
52
|
+
puts "#{post.pub_date} -- #{post.title}"
|
53
|
+
puts "*"*100
|
54
|
+
end
|
55
|
+
|
56
|
+
puts "total: #{@posts.length}"
|
data/graboid.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{graboid}
|
8
|
-
s.version = "0.3.2"
|
8
|
+
s.version = "0.3.3"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Christopher Burnett"]
|
12
|
-
s.date = %q{2010-06-
|
12
|
+
s.date = %q{2010-06-15}
|
13
13
|
s.description = %q{web scraping made easier}
|
14
14
|
s.email = %q{signalstatic@gmail.com}
|
15
15
|
s.extra_rdoc_files = [
|
@@ -23,6 +23,9 @@ Gem::Specification.new do |s|
|
|
23
23
|
"README.mdown",
|
24
24
|
"Rakefile",
|
25
25
|
"VERSION",
|
26
|
+
"examples/active_rain_post.rb",
|
27
|
+
"examples/live_journal_post.rb",
|
28
|
+
"examples/ning_post.rb",
|
26
29
|
"graboid.gemspec",
|
27
30
|
"lib/graboid.rb",
|
28
31
|
"lib/graboid/entity.rb",
|
@@ -44,7 +47,10 @@ Gem::Specification.new do |s|
|
|
44
47
|
"spec/fixtures/server.rb",
|
45
48
|
"spec/graboid/entity_spec.rb",
|
46
49
|
"spec/graboid_spec.rb",
|
47
|
-
"spec/spec_helper.rb"
|
50
|
+
"spec/spec_helper.rb",
|
51
|
+
"examples/active_rain_post.rb",
|
52
|
+
"examples/live_journal_post.rb",
|
53
|
+
"examples/ning_post.rb"
|
48
54
|
]
|
49
55
|
|
50
56
|
if s.respond_to? :specification_version then
|
data/lib/graboid.rb
CHANGED
data/lib/graboid/entity.rb
CHANGED
@@ -67,7 +67,7 @@ module Graboid
|
|
67
67
|
attribute_map.inject({}) do |extracted_hash, at|
|
68
68
|
selector, processor = at.last[:selector], at.last[:processor]
|
69
69
|
node_collection = self.mode == :html ? fragment.css(selector) : fragment.xpath(selector)
|
70
|
-
extracted_hash[at.first] = processor.nil? ? node_collection.first.
|
70
|
+
extracted_hash[at.first] = processor.nil? ? node_collection.first.inner_html : processor.call(node_collection.first) rescue ""
|
71
71
|
|
72
72
|
extracted_hash
|
73
73
|
end
|
data/spec/spec_helper.rb
CHANGED
@@ -45,9 +45,7 @@ class PostWithPager
|
|
45
45
|
end
|
46
46
|
|
47
47
|
pager do |doc|
|
48
|
-
|
49
|
-
#puts link.inspect
|
50
|
-
link
|
48
|
+
'http://localhost:9393'+doc.css('a.next').first['href'] rescue nil
|
51
49
|
end
|
52
50
|
|
53
51
|
before_paginate do
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: graboid
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 21
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 3
|
9
|
-
- 2
|
10
|
-
version: 0.3.2
|
9
|
+
- 3
|
10
|
+
version: 0.3.3
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Christopher Burnett
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2010-06-
|
18
|
+
date: 2010-06-15 00:00:00 -07:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -78,6 +78,9 @@ files:
|
|
78
78
|
- README.mdown
|
79
79
|
- Rakefile
|
80
80
|
- VERSION
|
81
|
+
- examples/active_rain_post.rb
|
82
|
+
- examples/live_journal_post.rb
|
83
|
+
- examples/ning_post.rb
|
81
84
|
- graboid.gemspec
|
82
85
|
- lib/graboid.rb
|
83
86
|
- lib/graboid/entity.rb
|
@@ -128,3 +131,6 @@ test_files:
|
|
128
131
|
- spec/graboid/entity_spec.rb
|
129
132
|
- spec/graboid_spec.rb
|
130
133
|
- spec/spec_helper.rb
|
134
|
+
- examples/active_rain_post.rb
|
135
|
+
- examples/live_journal_post.rb
|
136
|
+
- examples/ning_post.rb
|