graboid 0.3.2 → 0.3.3
Sign up to get free protection for your applications and to get access to all the features.
- data/README.mdown +10 -2
- data/VERSION +1 -1
- data/examples/active_rain_post.rb +36 -0
- data/examples/live_journal_post.rb +38 -0
- data/examples/ning_post.rb +56 -0
- data/graboid.gemspec +9 -3
- data/lib/graboid.rb +5 -1
- data/lib/graboid/entity.rb +1 -1
- data/spec/spec_helper.rb +1 -3
- metadata +10 -4
data/README.mdown
CHANGED
@@ -21,22 +21,30 @@
|
|
21
21
|
|
22
22
|
set :title
|
23
23
|
set :domain, :selector => '.domain a'
|
24
|
+
|
24
25
|
set :link, :selector => '.title' do |entry|
|
25
26
|
entry.css('a').first['href']
|
26
27
|
end
|
27
|
-
|
28
|
+
|
28
29
|
pager do |doc|
|
29
30
|
doc.css('p.nextprev a').select{|a| a.text =~ /next/i }.first['href']
|
30
31
|
end
|
31
32
|
|
33
|
+
before_paginate do
|
34
|
+
puts "opening page: #{self.source}"
|
35
|
+
puts "collection size: #{self.collection.length}"
|
36
|
+
puts "#{"*"*100}"
|
37
|
+
end
|
38
|
+
|
32
39
|
end
|
33
40
|
|
34
41
|
RedditEntry.source = 'http://reddit.com'
|
35
42
|
|
36
|
-
RedditEntry.all(:max_pages =>
|
43
|
+
RedditEntry.all(:max_pages => 5).each do |p|
|
37
44
|
puts "title: #{p.title}"
|
38
45
|
puts "domain: #{p.domain}"
|
39
46
|
puts "link: #{p.link}"
|
47
|
+
puts "#{"*"*100}"
|
40
48
|
end
|
41
49
|
|
42
50
|
##Note on Patches/Pull Requests
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.3.2
|
1
|
+
0.3.3
|
@@ -0,0 +1,36 @@
|
|
1
|
+
dir = File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib'))
|
2
|
+
require File.join(dir, 'graboid')
|
3
|
+
|
4
|
+
class ActiveRainPost
|
5
|
+
include Graboid::Entity
|
6
|
+
|
7
|
+
root '.blog_entry'
|
8
|
+
|
9
|
+
field :title, :selector => 'h2'
|
10
|
+
|
11
|
+
field :body, :selector => 'div' do |elm|
|
12
|
+
elm.css('p').collect(&:to_html)
|
13
|
+
end
|
14
|
+
|
15
|
+
pager do |doc|
|
16
|
+
"http://activerain.com" + doc.css('.pagination a').select{|a| a.text =~ /Next/i }.first['href'] rescue nil
|
17
|
+
end
|
18
|
+
|
19
|
+
before_paginate do
|
20
|
+
# logging for fun
|
21
|
+
puts "opening page: #{self.source}"
|
22
|
+
puts "collection size: #{self.collection.length}"
|
23
|
+
puts "*"*100
|
24
|
+
end
|
25
|
+
|
26
|
+
end
|
27
|
+
|
28
|
+
ActiveRainPost.source = 'http://activerain.com/blogs/elizabethweintraub'
|
29
|
+
@posts = ActiveRainPost.all
|
30
|
+
|
31
|
+
@posts.each do |post|
|
32
|
+
puts "#{post.title}"
|
33
|
+
puts "*"*100
|
34
|
+
end
|
35
|
+
|
36
|
+
puts "total: #{@posts.length}"
|
@@ -0,0 +1,38 @@
|
|
1
|
+
dir = File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib'))
|
2
|
+
require File.join(dir, 'graboid')
|
3
|
+
|
4
|
+
class LiveJournalPost
|
5
|
+
include Graboid::Entity
|
6
|
+
|
7
|
+
root '.entrybox'
|
8
|
+
|
9
|
+
field :title, :selector => '.caption a'
|
10
|
+
field :body, :selector => 'td[@colspan="2"]'
|
11
|
+
field :comment_link, :selector => '.caption a' do |elm|
|
12
|
+
elm['href']
|
13
|
+
end
|
14
|
+
|
15
|
+
pager do |doc|
|
16
|
+
doc.css('a').select{|a| a.text =~ /earlier/i }.first['href'] rescue nil
|
17
|
+
end
|
18
|
+
|
19
|
+
before_paginate do
|
20
|
+
# logging for fun
|
21
|
+
puts "opening page: #{self.source}"
|
22
|
+
puts "collection size: #{self.collection.length}"
|
23
|
+
puts "*"*100
|
24
|
+
end
|
25
|
+
|
26
|
+
end
|
27
|
+
|
28
|
+
LiveJournalPost.source = 'http://zeroplate.livejournal.com/'
|
29
|
+
@posts = LiveJournalPost.all(:max_pages => 3)
|
30
|
+
|
31
|
+
@posts.each do |post|
|
32
|
+
puts "#{post.title}"
|
33
|
+
puts "#{post.comment_link}"
|
34
|
+
puts "#{post.body}"
|
35
|
+
puts "*"*100
|
36
|
+
end
|
37
|
+
|
38
|
+
puts "total: #{@posts.length}"
|
@@ -0,0 +1,56 @@
|
|
1
|
+
%w{rubygems graboid}.each {|f| require f }
|
2
|
+
|
3
|
+
class NingPost
|
4
|
+
include Graboid::Entity
|
5
|
+
|
6
|
+
selector 'div.xg_blog .xg_module_body'
|
7
|
+
|
8
|
+
set :title do |elm|
|
9
|
+
elm.text.match(/^\s*(.*)$\s*/).captures.first
|
10
|
+
end
|
11
|
+
|
12
|
+
set :pub_date, :selector => 'p[class=small]' do |elm|
|
13
|
+
elm.text.match(/on (.* \d+, \d{4})/)[1]
|
14
|
+
end
|
15
|
+
|
16
|
+
set :comment_link, :selector => 'p[class=small]' do |elm|
|
17
|
+
elm.css('a').select {|n| n['href'] =~ /comments/ }.first['href'] rescue nil
|
18
|
+
end
|
19
|
+
|
20
|
+
set :link, :selector => '.title' do |elm|
|
21
|
+
elm.css('a').last["href"]
|
22
|
+
end
|
23
|
+
|
24
|
+
set :body, :selector => '.title' do |elm|
|
25
|
+
# ning's list page only has an excerpt of the body. No biggie,
|
26
|
+
# we'll just go grab it.
|
27
|
+
show_url = elm.css('a').last["href"]
|
28
|
+
Nokogiri::HTML(open(show_url)).css('.postbody').to_html
|
29
|
+
end
|
30
|
+
|
31
|
+
pager do |doc|
|
32
|
+
doc.css('.pagination a').select{|a| a.text =~ /previous/i }.first['href'] rescue nil
|
33
|
+
end
|
34
|
+
|
35
|
+
before_paginate do
|
36
|
+
# clearing empty rows. ning has shit markup
|
37
|
+
# and very few relevant class names.
|
38
|
+
self.collection.delete_if {|post| post.css('h3').length == 0 }
|
39
|
+
|
40
|
+
# logging for fun
|
41
|
+
puts "opening page: #{self.source}"
|
42
|
+
puts "collection size: #{self.collection.length}"
|
43
|
+
puts "*"*100
|
44
|
+
end
|
45
|
+
|
46
|
+
end
|
47
|
+
|
48
|
+
NingPost.source = 'http://cuwebd.ning.com/profiles/blog/list'
|
49
|
+
@posts = NingPost.all(:max_pages => 1)
|
50
|
+
|
51
|
+
@posts.each do |post|
|
52
|
+
puts "#{post.pub_date} -- #{post.title}"
|
53
|
+
puts "*"*100
|
54
|
+
end
|
55
|
+
|
56
|
+
puts "total: #{@posts.length}"
|
data/graboid.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{graboid}
|
8
|
-
s.version = "0.3.2"
|
8
|
+
s.version = "0.3.3"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Christopher Burnett"]
|
12
|
-
s.date = %q{2010-06-
|
12
|
+
s.date = %q{2010-06-15}
|
13
13
|
s.description = %q{web scraping made easier}
|
14
14
|
s.email = %q{signalstatic@gmail.com}
|
15
15
|
s.extra_rdoc_files = [
|
@@ -23,6 +23,9 @@ Gem::Specification.new do |s|
|
|
23
23
|
"README.mdown",
|
24
24
|
"Rakefile",
|
25
25
|
"VERSION",
|
26
|
+
"examples/active_rain_post.rb",
|
27
|
+
"examples/live_journal_post.rb",
|
28
|
+
"examples/ning_post.rb",
|
26
29
|
"graboid.gemspec",
|
27
30
|
"lib/graboid.rb",
|
28
31
|
"lib/graboid/entity.rb",
|
@@ -44,7 +47,10 @@ Gem::Specification.new do |s|
|
|
44
47
|
"spec/fixtures/server.rb",
|
45
48
|
"spec/graboid/entity_spec.rb",
|
46
49
|
"spec/graboid_spec.rb",
|
47
|
-
"spec/spec_helper.rb"
|
50
|
+
"spec/spec_helper.rb",
|
51
|
+
"examples/active_rain_post.rb",
|
52
|
+
"examples/live_journal_post.rb",
|
53
|
+
"examples/ning_post.rb"
|
48
54
|
]
|
49
55
|
|
50
56
|
if s.respond_to? :specification_version then
|
data/lib/graboid.rb
CHANGED
data/lib/graboid/entity.rb
CHANGED
@@ -67,7 +67,7 @@ module Graboid
|
|
67
67
|
attribute_map.inject({}) do |extracted_hash, at|
|
68
68
|
selector, processor = at.last[:selector], at.last[:processor]
|
69
69
|
node_collection = self.mode == :html ? fragment.css(selector) : fragment.xpath(selector)
|
70
|
-
extracted_hash[at.first] = processor.nil? ? node_collection.first.
|
70
|
+
extracted_hash[at.first] = processor.nil? ? node_collection.first.inner_html : processor.call(node_collection.first) rescue ""
|
71
71
|
|
72
72
|
extracted_hash
|
73
73
|
end
|
data/spec/spec_helper.rb
CHANGED
@@ -45,9 +45,7 @@ class PostWithPager
|
|
45
45
|
end
|
46
46
|
|
47
47
|
pager do |doc|
|
48
|
-
|
49
|
-
#puts link.inspect
|
50
|
-
link
|
48
|
+
'http://localhost:9393'+doc.css('a.next').first['href'] rescue nil
|
51
49
|
end
|
52
50
|
|
53
51
|
before_paginate do
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: graboid
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 21
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 3
|
9
|
-
- 2
|
10
|
-
version: 0.3.2
|
9
|
+
- 3
|
10
|
+
version: 0.3.3
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Christopher Burnett
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2010-06-
|
18
|
+
date: 2010-06-15 00:00:00 -07:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -78,6 +78,9 @@ files:
|
|
78
78
|
- README.mdown
|
79
79
|
- Rakefile
|
80
80
|
- VERSION
|
81
|
+
- examples/active_rain_post.rb
|
82
|
+
- examples/live_journal_post.rb
|
83
|
+
- examples/ning_post.rb
|
81
84
|
- graboid.gemspec
|
82
85
|
- lib/graboid.rb
|
83
86
|
- lib/graboid/entity.rb
|
@@ -128,3 +131,6 @@ test_files:
|
|
128
131
|
- spec/graboid/entity_spec.rb
|
129
132
|
- spec/graboid_spec.rb
|
130
133
|
- spec/spec_helper.rb
|
134
|
+
- examples/active_rain_post.rb
|
135
|
+
- examples/live_journal_post.rb
|
136
|
+
- examples/ning_post.rb
|