guitsaru-scraper 0.1.2 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/VERSION +1 -1
- data/lib/scraper.rb +15 -1
- data/scraper.gemspec +2 -2
- data/test/test_helper.rb +5 -5
- data/test/test_scraper.rb +28 -0
- metadata +2 -2
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.2
|
1
|
+
0.2.0
|
data/lib/scraper.rb
CHANGED
@@ -13,17 +13,31 @@ class Scraper
|
|
13
13
|
# options[:div] - The container div with the links
|
14
14
|
# options[:domain] - The domain to collect links from, all other domains are ignored
|
15
15
|
# options[:ignore] - An Array of regexes. Any links matching one will be ignored.
|
16
|
+
# options[:recursive] - A boolean. If false, only get the top level links. Default is true.
|
17
|
+
# options[:self] - A boolean. Whether to include the main page in results. Default is true.
|
16
18
|
def initialize(url, options = {})
|
17
19
|
@url = url
|
18
20
|
@options = options
|
21
|
+
unless @options.has_key?(:recursive)
|
22
|
+
@options.merge!(:recursive => true)
|
23
|
+
end
|
24
|
+
|
25
|
+
unless @options.has_key?(:self)
|
26
|
+
@options.merge!(:self => true)
|
27
|
+
end
|
19
28
|
end
|
20
29
|
|
21
30
|
def scrape(options = {})
|
31
|
+
options.merge!(@options)
|
22
32
|
links = [Link.new(self.url)]
|
33
|
+
|
23
34
|
until (not_visited = links.uniq.select { |link| !link.visited }).empty?
|
24
|
-
not_visited.each { |link| links += link.scrape!(options
|
35
|
+
not_visited.each { |link| links += link.scrape!(options) }
|
36
|
+
break unless options[:recursive]
|
25
37
|
end
|
26
38
|
|
39
|
+
links.delete(Link.new(self.url)) unless options[:self]
|
40
|
+
|
27
41
|
return links.uniq
|
28
42
|
end
|
29
43
|
end
|
data/scraper.gemspec
CHANGED
@@ -2,11 +2,11 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{scraper}
|
5
|
-
s.version = "0.1.2"
|
5
|
+
s.version = "0.2.0"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Matt Pruitt"]
|
9
|
-
s.date = %q{2009-
|
9
|
+
s.date = %q{2009-07-23}
|
10
10
|
s.email = %q{guitsaru@gmail.com}
|
11
11
|
s.extra_rdoc_files = [
|
12
12
|
"LICENSE",
|
data/test/test_helper.rb
CHANGED
@@ -8,10 +8,10 @@ $LOAD_PATH.unshift(File.dirname(__FILE__))
|
|
8
8
|
require 'scraper'
|
9
9
|
|
10
10
|
class Test::Unit::TestCase
|
11
|
-
FakeWeb.register_uri(:get, "http://example.com/main.html", :
|
12
|
-
FakeWeb.register_uri(:get, "http://example.com/first_page.html", :
|
13
|
-
FakeWeb.register_uri(:get, "http://example.com/first_child_page.html", :
|
14
|
-
FakeWeb.register_uri(:get, "http://example.com/not_added.html", :
|
15
|
-
FakeWeb.register_uri(:get, "http://google.com", :
|
11
|
+
FakeWeb.register_uri(:get, "http://example.com/main.html", :body => File.join(File.dirname(__FILE__), 'fake_pages/main.html'))
|
12
|
+
FakeWeb.register_uri(:get, "http://example.com/first_page.html", :body => File.join(File.dirname(__FILE__), 'fake_pages/first_page.html'))
|
13
|
+
FakeWeb.register_uri(:get, "http://example.com/first_child_page.html", :body => File.join(File.dirname(__FILE__), 'fake_pages/first_child_page.html'))
|
14
|
+
FakeWeb.register_uri(:get, "http://example.com/not_added.html", :body => File.join(File.dirname(__FILE__), 'fake_pages/not_added.html'))
|
15
|
+
FakeWeb.register_uri(:get, "http://google.com", :body => File.join(File.dirname(__FILE__), 'fake_pages/google.html'))
|
16
16
|
end
|
17
17
|
|
data/test/test_scraper.rb
CHANGED
@@ -55,4 +55,32 @@ class TestScraper < Test::Unit::TestCase
|
|
55
55
|
assert(!@results.include?(Link.new('http://google.com')))
|
56
56
|
end
|
57
57
|
end
|
58
|
+
|
59
|
+
context "Non-recursive scraping" do
|
60
|
+
setup do
|
61
|
+
@scraper = Scraper.new('http://example.com/main.html', :recursive => false)
|
62
|
+
@results = @scraper.scrape(:div => '#content')
|
63
|
+
end
|
64
|
+
|
65
|
+
should "include top level links" do
|
66
|
+
assert(@results.include?(Link.new('http://example.com/first_page.html')))
|
67
|
+
end
|
68
|
+
|
69
|
+
should "not include recursive links" do
|
70
|
+
assert(!@results.include?(Link.new('http://example.com/first_child_page.html')))
|
71
|
+
assert(!@results.include?(Link.new('http://example.com/first_child_page.html#content')))
|
72
|
+
assert(!@results.include?(Link.new('http://example.com/first_child_page.html#content2')))
|
73
|
+
end
|
74
|
+
end
|
75
|
+
|
76
|
+
context "Scraping without self" do
|
77
|
+
setup do
|
78
|
+
@scraper = Scraper.new('http://example.com/main.html', :self => false)
|
79
|
+
@results = @scraper.scrape(:div => '#content')
|
80
|
+
end
|
81
|
+
|
82
|
+
should "not include self" do
|
83
|
+
assert(!@results.include?(Link.new('http://example.com/main.html')))
|
84
|
+
end
|
85
|
+
end
|
58
86
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: guitsaru-scraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.2
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Matt Pruitt
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-
|
12
|
+
date: 2009-07-23 00:00:00 -07:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|