graboid 0.3.2 → 0.3.3

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
@@ -21,22 +21,30 @@
21
21
 
22
22
  set :title
23
23
  set :domain, :selector => '.domain a'
24
+
24
25
  set :link, :selector => '.title' do |entry|
25
26
  entry.css('a').first['href']
26
27
  end
27
-
28
+
28
29
  pager do |doc|
29
30
  doc.css('p.nextprev a').select{|a| a.text =~ /next/i }.first['href']
30
31
  end
31
32
 
33
+ before_paginate do
34
+ puts "opening page: #{self.source}"
35
+ puts "collection size: #{self.collection.length}"
36
+ puts "#{"*"*100}"
37
+ end
38
+
32
39
  end
33
40
 
34
41
  RedditEntry.source = 'http://reddit.com'
35
42
 
36
- RedditEntry.all(:max_pages => 2).each do |p|
43
+ RedditEntry.all(:max_pages => 5).each do |p|
37
44
  puts "title: #{p.title}"
38
45
  puts "domain: #{p.domain}"
39
46
  puts "link: #{p.link}"
47
+ puts "#{"*"*100}"
40
48
  end
41
49
 
42
50
  ##Note on Patches/Pull Requests
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.3.2
1
+ 0.3.3
@@ -0,0 +1,36 @@
1
+ dir = File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib'))
2
+ require File.join(dir, 'graboid')
3
+
4
+ class ActiveRainPost
5
+ include Graboid::Entity
6
+
7
+ root '.blog_entry'
8
+
9
+ field :title, :selector => 'h2'
10
+
11
+ field :body, :selector => 'div' do |elm|
12
+ elm.css('p').collect(&:to_html)
13
+ end
14
+
15
+ pager do |doc|
16
+ "http://activerain.com" + doc.css('.pagination a').select{|a| a.text =~ /Next/i }.first['href'] rescue nil
17
+ end
18
+
19
+ before_paginate do
20
+ # logging for fun
21
+ puts "opening page: #{self.source}"
22
+ puts "collection size: #{self.collection.length}"
23
+ puts "*"*100
24
+ end
25
+
26
+ end
27
+
28
+ ActiveRainPost.source = 'http://activerain.com/blogs/elizabethweintraub'
29
+ @posts = ActiveRainPost.all
30
+
31
+ @posts.each do |post|
32
+ puts "#{post.title}"
33
+ puts "*"*100
34
+ end
35
+
36
+ puts "total: #{@posts.length}"
@@ -0,0 +1,38 @@
1
+ dir = File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib'))
2
+ require File.join(dir, 'graboid')
3
+
4
+ class LiveJournalPost
5
+ include Graboid::Entity
6
+
7
+ root '.entrybox'
8
+
9
+ field :title, :selector => '.caption a'
10
+ field :body, :selector => 'td[@colspan="2"]'
11
+ field :comment_link, :selector => '.caption a' do |elm|
12
+ elm['href']
13
+ end
14
+
15
+ pager do |doc|
16
+ doc.css('a').select{|a| a.text =~ /earlier/i }.first['href'] rescue nil
17
+ end
18
+
19
+ before_paginate do
20
+ # logging for fun
21
+ puts "opening page: #{self.source}"
22
+ puts "collection size: #{self.collection.length}"
23
+ puts "*"*100
24
+ end
25
+
26
+ end
27
+
28
+ LiveJournalPost.source = 'http://zeroplate.livejournal.com/'
29
+ @posts = LiveJournalPost.all(:max_pages => 3)
30
+
31
+ @posts.each do |post|
32
+ puts "#{post.title}"
33
+ puts "#{post.comment_link}"
34
+ puts "#{post.body}"
35
+ puts "*"*100
36
+ end
37
+
38
+ puts "total: #{@posts.length}"
@@ -0,0 +1,56 @@
1
+ %w{rubygems graboid}.each {|f| require f }
2
+
3
+ class NingPost
4
+ include Graboid::Entity
5
+
6
+ selector 'div.xg_blog .xg_module_body'
7
+
8
+ set :title do |elm|
9
+ elm.text.match(/^\s*(.*)$\s*/).captures.first
10
+ end
11
+
12
+ set :pub_date, :selector => 'p[class=small]' do |elm|
13
+ elm.text.match(/on (.* \d+, \d{4})/)[1]
14
+ end
15
+
16
+ set :comment_link, :selector => 'p[class=small]' do |elm|
17
+ elm.css('a').select {|n| n['href'] =~ /comments/ }.first['href'] rescue nil
18
+ end
19
+
20
+ set :link, :selector => '.title' do |elm|
21
+ elm.css('a').last["href"]
22
+ end
23
+
24
+ set :body, :selector => '.title' do |elm|
25
+ # ning's list page only has an excerpt of the body. No biggie,
26
+ # we'll just go grab it.
27
+ show_url = elm.css('a').last["href"]
28
+ Nokogiri::HTML(open(show_url)).css('.postbody').to_html
29
+ end
30
+
31
+ pager do |doc|
32
+ doc.css('.pagination a').select{|a| a.text =~ /previous/i }.first['href'] rescue nil
33
+ end
34
+
35
+ before_paginate do
36
+ # clearing empty rows. ning has shit markup
37
+ # and very few relevant class names.
38
+ self.collection.delete_if {|post| post.css('h3').length == 0 }
39
+
40
+ # logging for fun
41
+ puts "opening page: #{self.source}"
42
+ puts "collection size: #{self.collection.length}"
43
+ puts "*"*100
44
+ end
45
+
46
+ end
47
+
48
+ NingPost.source = 'http://cuwebd.ning.com/profiles/blog/list'
49
+ @posts = NingPost.all(:max_pages => 1)
50
+
51
+ @posts.each do |post|
52
+ puts "#{post.pub_date} -- #{post.title}"
53
+ puts "*"*100
54
+ end
55
+
56
+ puts "total: #{@posts.length}"
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{graboid}
8
- s.version = "0.3.2"
8
+ s.version = "0.3.3"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Christopher Burnett"]
12
- s.date = %q{2010-06-14}
12
+ s.date = %q{2010-06-15}
13
13
  s.description = %q{web scraping made easier}
14
14
  s.email = %q{signalstatic@gmail.com}
15
15
  s.extra_rdoc_files = [
@@ -23,6 +23,9 @@ Gem::Specification.new do |s|
23
23
  "README.mdown",
24
24
  "Rakefile",
25
25
  "VERSION",
26
+ "examples/active_rain_post.rb",
27
+ "examples/live_journal_post.rb",
28
+ "examples/ning_post.rb",
26
29
  "graboid.gemspec",
27
30
  "lib/graboid.rb",
28
31
  "lib/graboid/entity.rb",
@@ -44,7 +47,10 @@ Gem::Specification.new do |s|
44
47
  "spec/fixtures/server.rb",
45
48
  "spec/graboid/entity_spec.rb",
46
49
  "spec/graboid_spec.rb",
47
- "spec/spec_helper.rb"
50
+ "spec/spec_helper.rb",
51
+ "examples/active_rain_post.rb",
52
+ "examples/live_journal_post.rb",
53
+ "examples/ning_post.rb"
48
54
  ]
49
55
 
50
56
  if s.respond_to? :specification_version then
@@ -1,5 +1,9 @@
1
1
  %w{rubygems nokogiri open-uri active_support}.each { |f| require f }
2
2
 
3
+ dir = Pathname(__FILE__).dirname.expand_path
4
+
5
+ require dir + 'graboid/entity'
6
+
7
+
3
8
  module Graboid
4
- require 'graboid/entity'
5
9
  end
@@ -67,7 +67,7 @@ module Graboid
67
67
  attribute_map.inject({}) do |extracted_hash, at|
68
68
  selector, processor = at.last[:selector], at.last[:processor]
69
69
  node_collection = self.mode == :html ? fragment.css(selector) : fragment.xpath(selector)
70
- extracted_hash[at.first] = processor.nil? ? node_collection.first.text : processor.call(node_collection.first) rescue ""
70
+ extracted_hash[at.first] = processor.nil? ? node_collection.first.inner_html : processor.call(node_collection.first) rescue ""
71
71
 
72
72
  extracted_hash
73
73
  end
@@ -45,9 +45,7 @@ class PostWithPager
45
45
  end
46
46
 
47
47
  pager do |doc|
48
- link = 'http://localhost:9393'+doc.css('a.next').first['href'] rescue nil
49
- #puts link.inspect
50
- link
48
+ 'http://localhost:9393'+doc.css('a.next').first['href'] rescue nil
51
49
  end
52
50
 
53
51
  before_paginate do
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: graboid
3
3
  version: !ruby/object:Gem::Version
4
- hash: 23
4
+ hash: 21
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
8
  - 3
9
- - 2
10
- version: 0.3.2
9
+ - 3
10
+ version: 0.3.3
11
11
  platform: ruby
12
12
  authors:
13
13
  - Christopher Burnett
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-06-14 00:00:00 -07:00
18
+ date: 2010-06-15 00:00:00 -07:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
@@ -78,6 +78,9 @@ files:
78
78
  - README.mdown
79
79
  - Rakefile
80
80
  - VERSION
81
+ - examples/active_rain_post.rb
82
+ - examples/live_journal_post.rb
83
+ - examples/ning_post.rb
81
84
  - graboid.gemspec
82
85
  - lib/graboid.rb
83
86
  - lib/graboid/entity.rb
@@ -128,3 +131,6 @@ test_files:
128
131
  - spec/graboid/entity_spec.rb
129
132
  - spec/graboid_spec.rb
130
133
  - spec/spec_helper.rb
134
+ - examples/active_rain_post.rb
135
+ - examples/live_journal_post.rb
136
+ - examples/ning_post.rb