guitsaru-scraper 0.1.2 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.2
1
+ 0.2.0
data/lib/scraper.rb CHANGED
@@ -13,17 +13,31 @@ class Scraper
13
13
  # options[:div] - The container div with the links
14
14
  # options[:domain] - The domain to collect links from, all other domains are ignored
15
15
  # options[:ignore] - An Array of regexes. Any links matching one will be ignored.
16
+ # options[:recursive] - A boolean. If false, only get the top level links. Default is true.
17
+ # options[:self] - A boolean. Whether to include the main page in results. Default is true.
16
18
  def initialize(url, options = {})
17
19
  @url = url
18
20
  @options = options
21
+ unless @options.has_key?(:recursive)
22
+ @options.merge!(:recursive => true)
23
+ end
24
+
25
+ unless @options.has_key?(:self)
26
+ @options.merge!(:self => true)
27
+ end
19
28
  end
20
29
 
21
30
  def scrape(options = {})
31
+ options.merge!(@options)
22
32
  links = [Link.new(self.url)]
33
+
23
34
  until (not_visited = links.uniq.select { |link| !link.visited }).empty?
24
- not_visited.each { |link| links += link.scrape!(options.merge(@options)) }
35
+ not_visited.each { |link| links += link.scrape!(options) }
36
+ break unless options[:recursive]
25
37
  end
26
38
 
39
+ links.delete(Link.new(self.url)) unless options[:self]
40
+
27
41
  return links.uniq
28
42
  end
29
43
  end
data/scraper.gemspec CHANGED
@@ -2,11 +2,11 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = %q{scraper}
5
- s.version = "0.1.2"
5
+ s.version = "0.2.0"
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = ["Matt Pruitt"]
9
- s.date = %q{2009-06-18}
9
+ s.date = %q{2009-07-23}
10
10
  s.email = %q{guitsaru@gmail.com}
11
11
  s.extra_rdoc_files = [
12
12
  "LICENSE",
data/test/test_helper.rb CHANGED
@@ -8,10 +8,10 @@ $LOAD_PATH.unshift(File.dirname(__FILE__))
8
8
  require 'scraper'
9
9
 
10
10
  class Test::Unit::TestCase
11
- FakeWeb.register_uri(:get, "http://example.com/main.html", :file => File.join(File.dirname(__FILE__), 'fake_pages/main.html'))
12
- FakeWeb.register_uri(:get, "http://example.com/first_page.html", :file => File.join(File.dirname(__FILE__), 'fake_pages/first_page.html'))
13
- FakeWeb.register_uri(:get, "http://example.com/first_child_page.html", :file => File.join(File.dirname(__FILE__), 'fake_pages/first_child_page.html'))
14
- FakeWeb.register_uri(:get, "http://example.com/not_added.html", :file => File.join(File.dirname(__FILE__), 'fake_pages/not_added.html'))
15
- FakeWeb.register_uri(:get, "http://google.com", :file => File.join(File.dirname(__FILE__), 'fake_pages/google.html'))
11
+ FakeWeb.register_uri(:get, "http://example.com/main.html", :body => File.join(File.dirname(__FILE__), 'fake_pages/main.html'))
12
+ FakeWeb.register_uri(:get, "http://example.com/first_page.html", :body => File.join(File.dirname(__FILE__), 'fake_pages/first_page.html'))
13
+ FakeWeb.register_uri(:get, "http://example.com/first_child_page.html", :body => File.join(File.dirname(__FILE__), 'fake_pages/first_child_page.html'))
14
+ FakeWeb.register_uri(:get, "http://example.com/not_added.html", :body => File.join(File.dirname(__FILE__), 'fake_pages/not_added.html'))
15
+ FakeWeb.register_uri(:get, "http://google.com", :body => File.join(File.dirname(__FILE__), 'fake_pages/google.html'))
16
16
  end
17
17
 
data/test/test_scraper.rb CHANGED
@@ -55,4 +55,32 @@ class TestScraper < Test::Unit::TestCase
55
55
  assert(!@results.include?(Link.new('http://google.com')))
56
56
  end
57
57
  end
58
+
59
+ context "Non-recursive scraping" do
60
+ setup do
61
+ @scraper = Scraper.new('http://example.com/main.html', :recursive => false)
62
+ @results = @scraper.scrape(:div => '#content')
63
+ end
64
+
65
+ should "include top level links" do
66
+ assert(@results.include?(Link.new('http://example.com/first_page.html')))
67
+ end
68
+
69
+ should "not include recursive links" do
70
+ assert(!@results.include?(Link.new('http://example.com/first_child_page.html')))
71
+ assert(!@results.include?(Link.new('http://example.com/first_child_page.html#content')))
72
+ assert(!@results.include?(Link.new('http://example.com/first_child_page.html#content2')))
73
+ end
74
+ end
75
+
76
+ context "Scraping without self" do
77
+ setup do
78
+ @scraper = Scraper.new('http://example.com/main.html', :self => false)
79
+ @results = @scraper.scrape(:div => '#content')
80
+ end
81
+
82
+ should "not include self" do
83
+ assert(!@results.include?(Link.new('http://example.com/main.html')))
84
+ end
85
+ end
58
86
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: guitsaru-scraper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Matt Pruitt
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-06-18 00:00:00 -07:00
12
+ date: 2009-07-23 00:00:00 -07:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency