guitsaru-scraper 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/VERSION +1 -1
- data/lib/scraper/link.rb +35 -7
- data/lib/scraper.rb +11 -5
- data/scraper.gemspec +3 -2
- data/test/fake_pages/first_child_page.html +1 -0
- data/test/fake_pages/first_page.html +1 -1
- data/test/fake_pages/google.html +19 -0
- data/test/fake_pages/main.html +2 -0
- data/test/test_helper.rb +1 -0
- data/test/test_link.rb +22 -1
- data/test/test_scraper.rb +26 -1
- metadata +3 -2
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.
|
1
|
+
0.1.2
|
data/lib/scraper/link.rb
CHANGED
@@ -7,10 +7,10 @@ module Scrape
|
|
7
7
|
@visited = false
|
8
8
|
end
|
9
9
|
|
10
|
-
def scrape!(
|
10
|
+
def scrape!(options = {})
|
11
11
|
return [] if @visited
|
12
12
|
@visited = true
|
13
|
-
return get_links(
|
13
|
+
return get_links(options)
|
14
14
|
end
|
15
15
|
|
16
16
|
def ==(other)
|
@@ -27,24 +27,52 @@ module Scrape
|
|
27
27
|
end
|
28
28
|
|
29
29
|
private
|
30
|
-
def get_links(
|
30
|
+
def get_links(options = {})
|
31
|
+
div = nil
|
32
|
+
ignore = []
|
33
|
+
|
34
|
+
if options[:div]
|
35
|
+
div = options[:div]
|
36
|
+
end
|
37
|
+
|
38
|
+
if options[:ignore]
|
39
|
+
ignore = options[:ignore]
|
40
|
+
end
|
41
|
+
|
31
42
|
links = []
|
32
43
|
|
33
|
-
doc = Hpricot(Net::HTTP.get(URI.parse(url)))
|
44
|
+
doc = Hpricot(Net::HTTP.get(URI.parse(@url)))
|
34
45
|
doc.search("#{div} a").each do |link|
|
35
46
|
url = link['href']
|
36
47
|
if url =~ /^\/(.*)/
|
37
48
|
components = URI::split(@url)
|
38
49
|
url = "#{components[0] || 'http'}://#{components[2]}#{url}"
|
39
|
-
elsif url =~ /^
|
40
|
-
|
50
|
+
elsif url =~ /^https?:\/\//i
|
51
|
+
url = url
|
52
|
+
elsif url =~ /file:\/\//
|
53
|
+
next
|
41
54
|
elsif url =~ /^#/
|
42
55
|
url = @url.gsub(/#.*/, '').gsub(/\/$/, '') + url
|
43
56
|
else
|
44
57
|
url = (File.dirname(@url) + '/' + (url || ''))
|
45
58
|
end
|
46
59
|
|
47
|
-
|
60
|
+
# Don't add this link if it matches a pattern in ignore
|
61
|
+
skip = false
|
62
|
+
ignore.each { |pattern| skip = true if url =~ pattern }
|
63
|
+
skip = true if options[:domain] && !url.include?(options[:domain])
|
64
|
+
|
65
|
+
if !skip
|
66
|
+
new_link = Link.new(url, link.inner_html.strip)
|
67
|
+
|
68
|
+
# Don't visit anchors, visit the main page instead.
|
69
|
+
if url =~ /(https?:\/\/.*)#(.*$)/i
|
70
|
+
links << Link.new($1, $2)
|
71
|
+
new_link.visited = true
|
72
|
+
end
|
73
|
+
|
74
|
+
links << new_link
|
75
|
+
end
|
48
76
|
end
|
49
77
|
|
50
78
|
return links.uniq
|
data/lib/scraper.rb
CHANGED
@@ -8,14 +8,20 @@ class Scraper
|
|
8
8
|
|
9
9
|
attr_accessor :url
|
10
10
|
|
11
|
-
|
12
|
-
|
11
|
+
# Scrapes a web page, collecting all links on the page and scraping each new link.
|
12
|
+
# Possible options
|
13
|
+
# options[:div] - The container div with the links
|
14
|
+
# options[:domain] - The domain to collect links from, all other domains are ignored
|
15
|
+
# options[:ignore] - An Array of regexes. Any links matching one will be ignored.
|
16
|
+
def initialize(url, options = {})
|
17
|
+
@url = url
|
18
|
+
@options = options
|
13
19
|
end
|
14
20
|
|
15
|
-
def scrape(
|
21
|
+
def scrape(options = {})
|
16
22
|
links = [Link.new(self.url)]
|
17
|
-
until (not_visited = links.uniq.select { |link| !link.visited}).empty?
|
18
|
-
not_visited.each { |link| links += link.scrape!(
|
23
|
+
until (not_visited = links.uniq.select { |link| !link.visited }).empty?
|
24
|
+
not_visited.each { |link| links += link.scrape!(options.merge(@options)) }
|
19
25
|
end
|
20
26
|
|
21
27
|
return links.uniq
|
data/scraper.gemspec
CHANGED
@@ -2,11 +2,11 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{scraper}
|
5
|
-
s.version = "0.1.
|
5
|
+
s.version = "0.1.2"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Matt Pruitt"]
|
9
|
-
s.date = %q{2009-06-
|
9
|
+
s.date = %q{2009-06-18}
|
10
10
|
s.email = %q{guitsaru@gmail.com}
|
11
11
|
s.extra_rdoc_files = [
|
12
12
|
"LICENSE",
|
@@ -24,6 +24,7 @@ Gem::Specification.new do |s|
|
|
24
24
|
"scraper.gemspec",
|
25
25
|
"test/fake_pages/first_child_page.html",
|
26
26
|
"test/fake_pages/first_page.html",
|
27
|
+
"test/fake_pages/google.html",
|
27
28
|
"test/fake_pages/main.html",
|
28
29
|
"test/fake_pages/not_added.html",
|
29
30
|
"test/test_helper.rb",
|
@@ -14,7 +14,7 @@
|
|
14
14
|
<a href="not_added.html">Not Added</a>
|
15
15
|
</div>
|
16
16
|
<div id="content">
|
17
|
-
<a href="http://example.com/first_child_page.html">First Child Page</a>
|
17
|
+
<a href="http://example.com/first_child_page.html#content2">First Child Page</a>
|
18
18
|
</div>
|
19
19
|
</body>
|
20
20
|
</html>
|
@@ -0,0 +1,19 @@
|
|
1
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
|
2
|
+
"http://www.w3.org/TR/html4/strict.dtd">
|
3
|
+
|
4
|
+
<html lang="en">
|
5
|
+
<head>
|
6
|
+
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
|
7
|
+
<title>untitled</title>
|
8
|
+
<meta name="generator" content="TextMate http://macromates.com/">
|
9
|
+
<meta name="author" content="Matt Pruitt">
|
10
|
+
<!-- Date: 2009-06-17 -->
|
11
|
+
</head>
|
12
|
+
<body>
|
13
|
+
<div id="header">
|
14
|
+
<a href="not_added.html">Not Added</a>
|
15
|
+
</div>
|
16
|
+
<div id="content">
|
17
|
+
</div>
|
18
|
+
</body>
|
19
|
+
</html>
|
data/test/fake_pages/main.html
CHANGED
data/test/test_helper.rb
CHANGED
@@ -12,5 +12,6 @@ class Test::Unit::TestCase
|
|
12
12
|
FakeWeb.register_uri(:get, "http://example.com/first_page.html", :file => File.join(File.dirname(__FILE__), 'fake_pages/first_page.html'))
|
13
13
|
FakeWeb.register_uri(:get, "http://example.com/first_child_page.html", :file => File.join(File.dirname(__FILE__), 'fake_pages/first_child_page.html'))
|
14
14
|
FakeWeb.register_uri(:get, "http://example.com/not_added.html", :file => File.join(File.dirname(__FILE__), 'fake_pages/not_added.html'))
|
15
|
+
FakeWeb.register_uri(:get, "http://google.com", :file => File.join(File.dirname(__FILE__), 'fake_pages/google.html'))
|
15
16
|
end
|
16
17
|
|
data/test/test_link.rb
CHANGED
@@ -37,19 +37,40 @@ class TestLink < Test::Unit::TestCase
|
|
37
37
|
assert(@results.is_a?(Array))
|
38
38
|
assert(@results.include?(Link.new('http://example.com/first_page.html')))
|
39
39
|
assert(@results.include?(Link.new('http://example.com/not_added.html')))
|
40
|
+
assert(@results.include?(Link.new('http://example.com/main.html?action=edit')))
|
41
|
+
assert(!@results.include?(Link.new('http://example.com/first_child_page.html/file://fileserver/file.pdf')))
|
40
42
|
end
|
41
43
|
end
|
42
44
|
|
43
45
|
context "scraping inside a div" do
|
44
46
|
setup do
|
45
47
|
@link = Link.new('http://example.com/main.html')
|
46
|
-
@results = @link.scrape!('#content')
|
48
|
+
@results = @link.scrape!(:div => '#content')
|
47
49
|
end
|
48
50
|
|
49
51
|
should "return an array of links on the page" do
|
50
52
|
assert_not_nil(@results)
|
51
53
|
assert(@results.is_a?(Array))
|
52
54
|
assert(@results.include?(Link.new('http://example.com/first_page.html')))
|
55
|
+
assert(@results.include?(Link.new('http://example.com/main.html?action=edit')))
|
56
|
+
end
|
57
|
+
|
58
|
+
should "not return links not in the div" do
|
59
|
+
assert(!@results.include?(Link.new('http://example.com/not_added.html')), "Includes a link outside of the correct div.")
|
60
|
+
end
|
61
|
+
end
|
62
|
+
|
63
|
+
context "scraping with ignore options" do
|
64
|
+
setup do
|
65
|
+
@link = Link.new('http://example.com/main.html')
|
66
|
+
@results = @link.scrape!(:div => '#content', :ignore => [/\?/])
|
67
|
+
end
|
68
|
+
|
69
|
+
should "return an array of links on the page" do
|
70
|
+
assert_not_nil(@results)
|
71
|
+
assert(@results.is_a?(Array))
|
72
|
+
assert(@results.include?(Link.new('http://example.com/first_page.html')))
|
73
|
+
assert(!@results.include?(Link.new('http://example.com/main.html?action=edit')))
|
53
74
|
end
|
54
75
|
|
55
76
|
should "not return links not in the div" do
|
data/test/test_scraper.rb
CHANGED
@@ -16,18 +16,43 @@ class TestScraper < Test::Unit::TestCase
|
|
16
16
|
context "scraping" do
|
17
17
|
setup do
|
18
18
|
@scraper = Scraper.new('http://example.com/main.html')
|
19
|
-
@results = @scraper.scrape('#content')
|
19
|
+
@results = @scraper.scrape(:div => '#content')
|
20
20
|
end
|
21
21
|
|
22
22
|
should "Include a list of links on the pages." do
|
23
23
|
assert(@results.include?(Link.new('http://example.com/first_page.html')))
|
24
24
|
assert(@results.include?(Link.new('http://example.com/first_child_page.html')))
|
25
25
|
assert(@results.include?(Link.new('http://example.com/first_child_page.html#content')))
|
26
|
+
assert(@results.include?(Link.new('http://example.com/first_child_page.html#content2')))
|
26
27
|
assert(@results.include?(Link.new('http://example.com/main.html')))
|
28
|
+
assert(@results.include?(Link.new('http://google.com')))
|
27
29
|
end
|
28
30
|
|
29
31
|
should "Not include any links outside of the content div" do
|
30
32
|
assert(!@results.include?(Link.new('http://example.com/not_added.html')))
|
31
33
|
end
|
32
34
|
end
|
35
|
+
|
36
|
+
context "scraping within domain" do
|
37
|
+
setup do
|
38
|
+
@scraper = Scraper.new('http://example.com/main.html', :domain => 'example.com')
|
39
|
+
@results = @scraper.scrape(:div => '#content')
|
40
|
+
end
|
41
|
+
|
42
|
+
should "Include a list of links on the pages." do
|
43
|
+
assert(@results.include?(Link.new('http://example.com/first_page.html')))
|
44
|
+
assert(@results.include?(Link.new('http://example.com/first_child_page.html')))
|
45
|
+
assert(@results.include?(Link.new('http://example.com/first_child_page.html#content')))
|
46
|
+
assert(@results.include?(Link.new('http://example.com/first_child_page.html#content2')))
|
47
|
+
assert(@results.include?(Link.new('http://example.com/main.html')))
|
48
|
+
end
|
49
|
+
|
50
|
+
should "Not include any links outside of the content div" do
|
51
|
+
assert(!@results.include?(Link.new('http://example.com/not_added.html')))
|
52
|
+
end
|
53
|
+
|
54
|
+
should "Not include any links outside of the domain" do
|
55
|
+
assert(!@results.include?(Link.new('http://google.com')))
|
56
|
+
end
|
57
|
+
end
|
33
58
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: guitsaru-scraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Matt Pruitt
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-06-
|
12
|
+
date: 2009-06-18 00:00:00 -07:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -43,6 +43,7 @@ files:
|
|
43
43
|
- scraper.gemspec
|
44
44
|
- test/fake_pages/first_child_page.html
|
45
45
|
- test/fake_pages/first_page.html
|
46
|
+
- test/fake_pages/google.html
|
46
47
|
- test/fake_pages/main.html
|
47
48
|
- test/fake_pages/not_added.html
|
48
49
|
- test/test_helper.rb
|