guitsaru-scraper 0.1.2 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.2
1
+ 0.2.0
data/lib/scraper.rb CHANGED
@@ -13,17 +13,31 @@ class Scraper
13
13
  # options[:div] - The container div with the links
14
14
  # options[:domain] - The domain to collect links from; all other domains are ignored
15
15
  # options[:ignore] - An Array of regexes. Any links matching one will be ignored.
16
+ # options[:recursive] - A boolean. If false, only get the top-level links. Default is true.
17
+ # options[:self] - A boolean. Whether to include the main page in results. Default is true.
16
18
  def initialize(url, options = {})
17
19
  @url = url
18
20
  @options = options
21
+ unless @options.has_key?(:recursive)
22
+ @options.merge!(:recursive => true)
23
+ end
24
+
25
+ unless @options.has_key?(:self)
26
+ @options.merge!(:self => true)
27
+ end
19
28
  end
20
29
 
21
30
  def scrape(options = {})
31
+ options.merge!(@options)
22
32
  links = [Link.new(self.url)]
33
+
23
34
  until (not_visited = links.uniq.select { |link| !link.visited }).empty?
24
- not_visited.each { |link| links += link.scrape!(options.merge(@options)) }
35
+ not_visited.each { |link| links += link.scrape!(options) }
36
+ break unless options[:recursive]
25
37
  end
26
38
 
39
+ links.delete(Link.new(self.url)) unless options[:self]
40
+
27
41
  return links.uniq
28
42
  end
29
43
  end
data/scraper.gemspec CHANGED
@@ -2,11 +2,11 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = %q{scraper}
5
- s.version = "0.1.2"
5
+ s.version = "0.2.0"
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = ["Matt Pruitt"]
9
- s.date = %q{2009-06-18}
9
+ s.date = %q{2009-07-23}
10
10
  s.email = %q{guitsaru@gmail.com}
11
11
  s.extra_rdoc_files = [
12
12
  "LICENSE",
data/test/test_helper.rb CHANGED
@@ -8,10 +8,10 @@ $LOAD_PATH.unshift(File.dirname(__FILE__))
8
8
  require 'scraper'
9
9
 
10
10
  class Test::Unit::TestCase
11
- FakeWeb.register_uri(:get, "http://example.com/main.html", :file => File.join(File.dirname(__FILE__), 'fake_pages/main.html'))
12
- FakeWeb.register_uri(:get, "http://example.com/first_page.html", :file => File.join(File.dirname(__FILE__), 'fake_pages/first_page.html'))
13
- FakeWeb.register_uri(:get, "http://example.com/first_child_page.html", :file => File.join(File.dirname(__FILE__), 'fake_pages/first_child_page.html'))
14
- FakeWeb.register_uri(:get, "http://example.com/not_added.html", :file => File.join(File.dirname(__FILE__), 'fake_pages/not_added.html'))
15
- FakeWeb.register_uri(:get, "http://google.com", :file => File.join(File.dirname(__FILE__), 'fake_pages/google.html'))
11
+ FakeWeb.register_uri(:get, "http://example.com/main.html", :body => File.join(File.dirname(__FILE__), 'fake_pages/main.html'))
12
+ FakeWeb.register_uri(:get, "http://example.com/first_page.html", :body => File.join(File.dirname(__FILE__), 'fake_pages/first_page.html'))
13
+ FakeWeb.register_uri(:get, "http://example.com/first_child_page.html", :body => File.join(File.dirname(__FILE__), 'fake_pages/first_child_page.html'))
14
+ FakeWeb.register_uri(:get, "http://example.com/not_added.html", :body => File.join(File.dirname(__FILE__), 'fake_pages/not_added.html'))
15
+ FakeWeb.register_uri(:get, "http://google.com", :body => File.join(File.dirname(__FILE__), 'fake_pages/google.html'))
16
16
  end
17
17
 
data/test/test_scraper.rb CHANGED
@@ -55,4 +55,32 @@ class TestScraper < Test::Unit::TestCase
55
55
  assert(!@results.include?(Link.new('http://google.com')))
56
56
  end
57
57
  end
58
+
59
+ context "Non-recursive scraping" do
60
+ setup do
61
+ @scraper = Scraper.new('http://example.com/main.html', :recursive => false)
62
+ @results = @scraper.scrape(:div => '#content')
63
+ end
64
+
65
+ should "include top level links" do
66
+ assert(@results.include?(Link.new('http://example.com/first_page.html')))
67
+ end
68
+
69
+ should "not include recursive links" do
70
+ assert(!@results.include?(Link.new('http://example.com/first_child_page.html')))
71
+ assert(!@results.include?(Link.new('http://example.com/first_child_page.html#content')))
72
+ assert(!@results.include?(Link.new('http://example.com/first_child_page.html#content2')))
73
+ end
74
+ end
75
+
76
+ context "Scraping without self" do
77
+ setup do
78
+ @scraper = Scraper.new('http://example.com/main.html', :self => false)
79
+ @results = @scraper.scrape(:div => '#content')
80
+ end
81
+
82
+ should "not include self" do
83
+ assert(!@results.include?(Link.new('http://example.com/main.html')))
84
+ end
85
+ end
58
86
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: guitsaru-scraper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Matt Pruitt
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-06-18 00:00:00 -07:00
12
+ date: 2009-07-23 00:00:00 -07:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency