graboid 0.3.2 → 0.3.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -21,22 +21,30 @@
21
21
 
22
22
  set :title
23
23
  set :domain, :selector => '.domain a'
24
+
24
25
  set :link, :selector => '.title' do |entry|
25
26
  entry.css('a').first['href']
26
27
  end
27
-
28
+
28
29
  pager do |doc|
29
30
  doc.css('p.nextprev a').select{|a| a.text =~ /next/i }.first['href']
30
31
  end
31
32
 
33
+ before_paginate do
34
+ puts "opening page: #{self.source}"
35
+ puts "collection size: #{self.collection.length}"
36
+ puts "#{"*"*100}"
37
+ end
38
+
32
39
  end
33
40
 
34
41
  RedditEntry.source = 'http://reddit.com'
35
42
 
36
- RedditEntry.all(:max_pages => 2).each do |p|
43
+ RedditEntry.all(:max_pages => 5).each do |p|
37
44
  puts "title: #{p.title}"
38
45
  puts "domain: #{p.domain}"
39
46
  puts "link: #{p.link}"
47
+ puts "#{"*"*100}"
40
48
  end
41
49
 
42
50
  ##Note on Patches/Pull Requests
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.3.2
1
+ 0.3.3
@@ -0,0 +1,36 @@
1
+ dir = File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib'))
2
+ require File.join(dir, 'graboid')
3
+
4
+ class ActiveRainPost
5
+ include Graboid::Entity
6
+
7
+ root '.blog_entry'
8
+
9
+ field :title, :selector => 'h2'
10
+
11
+ field :body, :selector => 'div' do |elm|
12
+ elm.css('p').collect(&:to_html)
13
+ end
14
+
15
+ pager do |doc|
16
+ "http://activerain.com" + doc.css('.pagination a').select{|a| a.text =~ /Next/i }.first['href'] rescue nil
17
+ end
18
+
19
+ before_paginate do
20
+ # logging for fun
21
+ puts "opening page: #{self.source}"
22
+ puts "collection size: #{self.collection.length}"
23
+ puts "*"*100
24
+ end
25
+
26
+ end
27
+
28
+ ActiveRainPost.source = 'http://activerain.com/blogs/elizabethweintraub'
29
+ @posts = ActiveRainPost.all
30
+
31
+ @posts.each do |post|
32
+ puts "#{post.title}"
33
+ puts "*"*100
34
+ end
35
+
36
+ puts "total: #{@posts.length}"
@@ -0,0 +1,38 @@
1
+ dir = File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib'))
2
+ require File.join(dir, 'graboid')
3
+
4
+ class LiveJournalPost
5
+ include Graboid::Entity
6
+
7
+ root '.entrybox'
8
+
9
+ field :title, :selector => '.caption a'
10
+ field :body, :selector => 'td[@colspan="2"]'
11
+ field :comment_link, :selector => '.caption a' do |elm|
12
+ elm['href']
13
+ end
14
+
15
+ pager do |doc|
16
+ doc.css('a').select{|a| a.text =~ /earlier/i }.first['href'] rescue nil
17
+ end
18
+
19
+ before_paginate do
20
+ # logging for fun
21
+ puts "opening page: #{self.source}"
22
+ puts "collection size: #{self.collection.length}"
23
+ puts "*"*100
24
+ end
25
+
26
+ end
27
+
28
+ LiveJournalPost.source = 'http://zeroplate.livejournal.com/'
29
+ @posts = LiveJournalPost.all(:max_pages => 3)
30
+
31
+ @posts.each do |post|
32
+ puts "#{post.title}"
33
+ puts "#{post.comment_link}"
34
+ puts "#{post.body}"
35
+ puts "*"*100
36
+ end
37
+
38
+ puts "total: #{@posts.length}"
@@ -0,0 +1,56 @@
1
+ %w{rubygems graboid}.each {|f| require f }
2
+
3
+ class NingPost
4
+ include Graboid::Entity
5
+
6
+ selector 'div.xg_blog .xg_module_body'
7
+
8
+ set :title do |elm|
9
+ elm.text.match(/^\s*(.*)$\s*/).captures.first
10
+ end
11
+
12
+ set :pub_date, :selector => 'p[class=small]' do |elm|
13
+ elm.text.match(/on (.* \d+, \d{4})/)[1]
14
+ end
15
+
16
+ set :comment_link, :selector => 'p[class=small]' do |elm|
17
+ elm.css('a').select {|n| n['href'] =~ /comments/ }.first['href'] rescue nil
18
+ end
19
+
20
+ set :link, :selector => '.title' do |elm|
21
+ elm.css('a').last["href"]
22
+ end
23
+
24
+ set :body, :selector => '.title' do |elm|
25
+ # ning's list page only has an excerpt of the body. No biggie,
26
+ # we'll just go grab it.
27
+ show_url = elm.css('a').last["href"]
28
+ Nokogiri::HTML(open(show_url)).css('.postbody').to_html
29
+ end
30
+
31
+ pager do |doc|
32
+ doc.css('.pagination a').select{|a| a.text =~ /previous/i }.first['href'] rescue nil
33
+ end
34
+
35
+ before_paginate do
36
+ # clearing empty rows. ning has shit markup
37
+ # and very few relevant class names.
38
+ self.collection.delete_if {|post| post.css('h3').length == 0 }
39
+
40
+ # logging for fun
41
+ puts "opening page: #{self.source}"
42
+ puts "collection size: #{self.collection.length}"
43
+ puts "*"*100
44
+ end
45
+
46
+ end
47
+
48
+ NingPost.source = 'http://cuwebd.ning.com/profiles/blog/list'
49
+ @posts = NingPost.all(:max_pages => 1)
50
+
51
+ @posts.each do |post|
52
+ puts "#{post.pub_date} -- #{post.title}"
53
+ puts "*"*100
54
+ end
55
+
56
+ puts "total: #{@posts.length}"
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{graboid}
8
- s.version = "0.3.2"
8
+ s.version = "0.3.3"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Christopher Burnett"]
12
- s.date = %q{2010-06-14}
12
+ s.date = %q{2010-06-15}
13
13
  s.description = %q{web scraping made easier}
14
14
  s.email = %q{signalstatic@gmail.com}
15
15
  s.extra_rdoc_files = [
@@ -23,6 +23,9 @@ Gem::Specification.new do |s|
23
23
  "README.mdown",
24
24
  "Rakefile",
25
25
  "VERSION",
26
+ "examples/active_rain_post.rb",
27
+ "examples/live_journal_post.rb",
28
+ "examples/ning_post.rb",
26
29
  "graboid.gemspec",
27
30
  "lib/graboid.rb",
28
31
  "lib/graboid/entity.rb",
@@ -44,7 +47,10 @@ Gem::Specification.new do |s|
44
47
  "spec/fixtures/server.rb",
45
48
  "spec/graboid/entity_spec.rb",
46
49
  "spec/graboid_spec.rb",
47
- "spec/spec_helper.rb"
50
+ "spec/spec_helper.rb",
51
+ "examples/active_rain_post.rb",
52
+ "examples/live_journal_post.rb",
53
+ "examples/ning_post.rb"
48
54
  ]
49
55
 
50
56
  if s.respond_to? :specification_version then
@@ -1,5 +1,9 @@
1
1
  %w{rubygems nokogiri open-uri active_support}.each { |f| require f }
2
2
 
3
+ dir = Pathname(__FILE__).dirname.expand_path
4
+
5
+ require dir + 'graboid/entity'
6
+
7
+
3
8
  module Graboid
4
- require 'graboid/entity'
5
9
  end
@@ -67,7 +67,7 @@ module Graboid
67
67
  attribute_map.inject({}) do |extracted_hash, at|
68
68
  selector, processor = at.last[:selector], at.last[:processor]
69
69
  node_collection = self.mode == :html ? fragment.css(selector) : fragment.xpath(selector)
70
- extracted_hash[at.first] = processor.nil? ? node_collection.first.text : processor.call(node_collection.first) rescue ""
70
+ extracted_hash[at.first] = processor.nil? ? node_collection.first.inner_html : processor.call(node_collection.first) rescue ""
71
71
 
72
72
  extracted_hash
73
73
  end
@@ -45,9 +45,7 @@ class PostWithPager
45
45
  end
46
46
 
47
47
  pager do |doc|
48
- link = 'http://localhost:9393'+doc.css('a.next').first['href'] rescue nil
49
- #puts link.inspect
50
- link
48
+ 'http://localhost:9393'+doc.css('a.next').first['href'] rescue nil
51
49
  end
52
50
 
53
51
  before_paginate do
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: graboid
3
3
  version: !ruby/object:Gem::Version
4
- hash: 23
4
+ hash: 21
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
8
  - 3
9
- - 2
10
- version: 0.3.2
9
+ - 3
10
+ version: 0.3.3
11
11
  platform: ruby
12
12
  authors:
13
13
  - Christopher Burnett
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-06-14 00:00:00 -07:00
18
+ date: 2010-06-15 00:00:00 -07:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
@@ -78,6 +78,9 @@ files:
78
78
  - README.mdown
79
79
  - Rakefile
80
80
  - VERSION
81
+ - examples/active_rain_post.rb
82
+ - examples/live_journal_post.rb
83
+ - examples/ning_post.rb
81
84
  - graboid.gemspec
82
85
  - lib/graboid.rb
83
86
  - lib/graboid/entity.rb
@@ -128,3 +131,6 @@ test_files:
128
131
  - spec/graboid/entity_spec.rb
129
132
  - spec/graboid_spec.rb
130
133
  - spec/spec_helper.rb
134
+ - examples/active_rain_post.rb
135
+ - examples/live_journal_post.rb
136
+ - examples/ning_post.rb