guitsaru-scraper 0.1.2 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/VERSION +1 -1
- data/lib/scraper.rb +15 -1
- data/scraper.gemspec +2 -2
- data/test/test_helper.rb +5 -5
- data/test/test_scraper.rb +28 -0
- metadata +2 -2
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.2
|
1
|
+
0.2.0
|
data/lib/scraper.rb
CHANGED
@@ -13,17 +13,31 @@ class Scraper
|
|
13
13
|
# options[:div] - The container div with the links
|
14
14
|
# options[:domain] - The domain to collect links from, all other domains are ignored
|
15
15
|
# options[:ignore] - An Array of regexes. Any links matching one will be ignored.
|
16
|
+
# options[:recursive] - A boolean. If false, only get the top level links. Default is true.
|
17
|
+
# options[:self] - A boolean. Whether to include the main page in results. Default is true.
|
16
18
|
def initialize(url, options = {})
|
17
19
|
@url = url
|
18
20
|
@options = options
|
21
|
+
unless @options.has_key?(:recursive)
|
22
|
+
@options.merge!(:recursive => true)
|
23
|
+
end
|
24
|
+
|
25
|
+
unless @options.has_key?(:self)
|
26
|
+
@options.merge!(:self => true)
|
27
|
+
end
|
19
28
|
end
|
20
29
|
|
21
30
|
def scrape(options = {})
|
31
|
+
options.merge!(@options)
|
22
32
|
links = [Link.new(self.url)]
|
33
|
+
|
23
34
|
until (not_visited = links.uniq.select { |link| !link.visited }).empty?
|
24
|
-
not_visited.each { |link| links += link.scrape!(options) }
|
35
|
+
not_visited.each { |link| links += link.scrape!(options) }
|
36
|
+
break unless options[:recursive]
|
25
37
|
end
|
26
38
|
|
39
|
+
links.delete(Link.new(self.url)) unless options[:self]
|
40
|
+
|
27
41
|
return links.uniq
|
28
42
|
end
|
29
43
|
end
|
data/scraper.gemspec
CHANGED
@@ -2,11 +2,11 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{scraper}
|
5
|
-
s.version = "0.1.2"
|
5
|
+
s.version = "0.2.0"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Matt Pruitt"]
|
9
|
-
s.date = %q{2009-
|
9
|
+
s.date = %q{2009-07-23}
|
10
10
|
s.email = %q{guitsaru@gmail.com}
|
11
11
|
s.extra_rdoc_files = [
|
12
12
|
"LICENSE",
|
data/test/test_helper.rb
CHANGED
@@ -8,10 +8,10 @@ $LOAD_PATH.unshift(File.dirname(__FILE__))
|
|
8
8
|
require 'scraper'
|
9
9
|
|
10
10
|
class Test::Unit::TestCase
|
11
|
-
FakeWeb.register_uri(:get, "http://example.com/main.html", :
|
12
|
-
FakeWeb.register_uri(:get, "http://example.com/first_page.html", :
|
13
|
-
FakeWeb.register_uri(:get, "http://example.com/first_child_page.html", :
|
14
|
-
FakeWeb.register_uri(:get, "http://example.com/not_added.html", :
|
15
|
-
FakeWeb.register_uri(:get, "http://google.com", :
|
11
|
+
FakeWeb.register_uri(:get, "http://example.com/main.html", :body => File.join(File.dirname(__FILE__), 'fake_pages/main.html'))
|
12
|
+
FakeWeb.register_uri(:get, "http://example.com/first_page.html", :body => File.join(File.dirname(__FILE__), 'fake_pages/first_page.html'))
|
13
|
+
FakeWeb.register_uri(:get, "http://example.com/first_child_page.html", :body => File.join(File.dirname(__FILE__), 'fake_pages/first_child_page.html'))
|
14
|
+
FakeWeb.register_uri(:get, "http://example.com/not_added.html", :body => File.join(File.dirname(__FILE__), 'fake_pages/not_added.html'))
|
15
|
+
FakeWeb.register_uri(:get, "http://google.com", :body => File.join(File.dirname(__FILE__), 'fake_pages/google.html'))
|
16
16
|
end
|
17
17
|
|
data/test/test_scraper.rb
CHANGED
@@ -55,4 +55,32 @@ class TestScraper < Test::Unit::TestCase
|
|
55
55
|
assert(!@results.include?(Link.new('http://google.com')))
|
56
56
|
end
|
57
57
|
end
|
58
|
+
|
59
|
+
context "Non-recursive scraping" do
|
60
|
+
setup do
|
61
|
+
@scraper = Scraper.new('http://example.com/main.html', :recursive => false)
|
62
|
+
@results = @scraper.scrape(:div => '#content')
|
63
|
+
end
|
64
|
+
|
65
|
+
should "include top level links" do
|
66
|
+
assert(@results.include?(Link.new('http://example.com/first_page.html')))
|
67
|
+
end
|
68
|
+
|
69
|
+
should "not include recursive links" do
|
70
|
+
assert(!@results.include?(Link.new('http://example.com/first_child_page.html')))
|
71
|
+
assert(!@results.include?(Link.new('http://example.com/first_child_page.html#content')))
|
72
|
+
assert(!@results.include?(Link.new('http://example.com/first_child_page.html#content2')))
|
73
|
+
end
|
74
|
+
end
|
75
|
+
|
76
|
+
context "Scraping without self" do
|
77
|
+
setup do
|
78
|
+
@scraper = Scraper.new('http://example.com/main.html', :self => false)
|
79
|
+
@results = @scraper.scrape(:div => '#content')
|
80
|
+
end
|
81
|
+
|
82
|
+
should "not include self" do
|
83
|
+
assert(!@results.include?(Link.new('http://example.com/main.html')))
|
84
|
+
end
|
85
|
+
end
|
58
86
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: guitsaru-scraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.2
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Matt Pruitt
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-
|
12
|
+
date: 2009-07-23 00:00:00 -07:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|