guitsaru-scraper 0.1.1 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- data/VERSION +1 -1
- data/lib/scraper/link.rb +35 -7
- data/lib/scraper.rb +11 -5
- data/scraper.gemspec +3 -2
- data/test/fake_pages/first_child_page.html +1 -0
- data/test/fake_pages/first_page.html +1 -1
- data/test/fake_pages/google.html +19 -0
- data/test/fake_pages/main.html +2 -0
- data/test/test_helper.rb +1 -0
- data/test/test_link.rb +22 -1
- data/test/test_scraper.rb +26 -1
- metadata +3 -2
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.1
|
1
|
+
0.1.2
|
data/lib/scraper/link.rb
CHANGED
@@ -7,10 +7,10 @@ module Scrape
|
|
7
7
|
@visited = false
|
8
8
|
end
|
9
9
|
|
10
|
-
def scrape!(
|
10
|
+
def scrape!(options = {})
|
11
11
|
return [] if @visited
|
12
12
|
@visited = true
|
13
|
-
return get_links(
|
13
|
+
return get_links(options)
|
14
14
|
end
|
15
15
|
|
16
16
|
def ==(other)
|
@@ -27,24 +27,52 @@ module Scrape
|
|
27
27
|
end
|
28
28
|
|
29
29
|
private
|
30
|
-
def get_links(
|
30
|
+
def get_links(options = {})
|
31
|
+
div = nil
|
32
|
+
ignore = []
|
33
|
+
|
34
|
+
if options[:div]
|
35
|
+
div = options[:div]
|
36
|
+
end
|
37
|
+
|
38
|
+
if options[:ignore]
|
39
|
+
ignore = options[:ignore]
|
40
|
+
end
|
41
|
+
|
31
42
|
links = []
|
32
43
|
|
33
|
-
doc = Hpricot(Net::HTTP.get(URI.parse(url)))
|
44
|
+
doc = Hpricot(Net::HTTP.get(URI.parse(@url)))
|
34
45
|
doc.search("#{div} a").each do |link|
|
35
46
|
url = link['href']
|
36
47
|
if url =~ /^\/(.*)/
|
37
48
|
components = URI::split(@url)
|
38
49
|
url = "#{components[0] || 'http'}://#{components[2]}#{url}"
|
39
|
-
elsif url =~ /^
|
40
|
-
|
50
|
+
elsif url =~ /^https?:\/\//i
|
51
|
+
url = url
|
52
|
+
elsif url =~ /file:\/\//
|
53
|
+
next
|
41
54
|
elsif url =~ /^#/
|
42
55
|
url = @url.gsub(/#.*/, '').gsub(/\/$/, '') + url
|
43
56
|
else
|
44
57
|
url = (File.dirname(@url) + '/' + (url || ''))
|
45
58
|
end
|
46
59
|
|
47
|
-
|
60
|
+
# Don't add this link if it matches a pattern in ignore
|
61
|
+
skip = false
|
62
|
+
ignore.each { |pattern| skip = true if url =~ pattern }
|
63
|
+
skip = true if options[:domain] && !url.include?(options[:domain])
|
64
|
+
|
65
|
+
if !skip
|
66
|
+
new_link = Link.new(url, link.inner_html.strip)
|
67
|
+
|
68
|
+
# Don't visit anchors, visit the main page instead.
|
69
|
+
if url =~ /(https?:\/\/.*)#(.*$)/i
|
70
|
+
links << Link.new($1, $2)
|
71
|
+
new_link.visited = true
|
72
|
+
end
|
73
|
+
|
74
|
+
links << new_link
|
75
|
+
end
|
48
76
|
end
|
49
77
|
|
50
78
|
return links.uniq
|
data/lib/scraper.rb
CHANGED
@@ -8,14 +8,20 @@ class Scraper
|
|
8
8
|
|
9
9
|
attr_accessor :url
|
10
10
|
|
11
|
-
|
12
|
-
|
11
|
+
# Scrapes a web page, collecting all links on the page and scraping each new link.
|
12
|
+
# Possible options
|
13
|
+
# options[:div] - The container div with the links
|
14
|
+
# options[:domain] - The domain to collect links from, all other domains are ignored
|
15
|
+
# options[:ignore] - An Array of regexes. Any links matching one will be ignored.
|
16
|
+
def initialize(url, options = {})
|
17
|
+
@url = url
|
18
|
+
@options = options
|
13
19
|
end
|
14
20
|
|
15
|
-
def scrape(
|
21
|
+
def scrape(options = {})
|
16
22
|
links = [Link.new(self.url)]
|
17
|
-
until (not_visited = links.uniq.select { |link| !link.visited}).empty?
|
18
|
-
not_visited.each { |link| links += link.scrape!(
|
23
|
+
until (not_visited = links.uniq.select { |link| !link.visited }).empty?
|
24
|
+
not_visited.each { |link| links += link.scrape!(options.merge(@options)) }
|
19
25
|
end
|
20
26
|
|
21
27
|
return links.uniq
|
data/scraper.gemspec
CHANGED
@@ -2,11 +2,11 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{scraper}
|
5
|
-
s.version = "0.1.1"
|
5
|
+
s.version = "0.1.2"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Matt Pruitt"]
|
9
|
-
s.date = %q{2009-06-
|
9
|
+
s.date = %q{2009-06-18}
|
10
10
|
s.email = %q{guitsaru@gmail.com}
|
11
11
|
s.extra_rdoc_files = [
|
12
12
|
"LICENSE",
|
@@ -24,6 +24,7 @@ Gem::Specification.new do |s|
|
|
24
24
|
"scraper.gemspec",
|
25
25
|
"test/fake_pages/first_child_page.html",
|
26
26
|
"test/fake_pages/first_page.html",
|
27
|
+
"test/fake_pages/google.html",
|
27
28
|
"test/fake_pages/main.html",
|
28
29
|
"test/fake_pages/not_added.html",
|
29
30
|
"test/test_helper.rb",
|
@@ -14,7 +14,7 @@
|
|
14
14
|
<a href="not_added.html">Not Added</a>
|
15
15
|
</div>
|
16
16
|
<div id="content">
|
17
|
-
<a href="http://example.com/first_child_page.html">First Child Page</a>
|
17
|
+
<a href="http://example.com/first_child_page.html#content2">First Child Page</a>
|
18
18
|
</div>
|
19
19
|
</body>
|
20
20
|
</html>
|
@@ -0,0 +1,19 @@
|
|
1
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
|
2
|
+
"http://www.w3.org/TR/html4/strict.dtd">
|
3
|
+
|
4
|
+
<html lang="en">
|
5
|
+
<head>
|
6
|
+
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
|
7
|
+
<title>untitled</title>
|
8
|
+
<meta name="generator" content="TextMate http://macromates.com/">
|
9
|
+
<meta name="author" content="Matt Pruitt">
|
10
|
+
<!-- Date: 2009-06-17 -->
|
11
|
+
</head>
|
12
|
+
<body>
|
13
|
+
<div id="header">
|
14
|
+
<a href="not_added.html">Not Added</a>
|
15
|
+
</div>
|
16
|
+
<div id="content">
|
17
|
+
</div>
|
18
|
+
</body>
|
19
|
+
</html>
|
data/test/fake_pages/main.html
CHANGED
data/test/test_helper.rb
CHANGED
@@ -12,5 +12,6 @@ class Test::Unit::TestCase
|
|
12
12
|
FakeWeb.register_uri(:get, "http://example.com/first_page.html", :file => File.join(File.dirname(__FILE__), 'fake_pages/first_page.html'))
|
13
13
|
FakeWeb.register_uri(:get, "http://example.com/first_child_page.html", :file => File.join(File.dirname(__FILE__), 'fake_pages/first_child_page.html'))
|
14
14
|
FakeWeb.register_uri(:get, "http://example.com/not_added.html", :file => File.join(File.dirname(__FILE__), 'fake_pages/not_added.html'))
|
15
|
+
FakeWeb.register_uri(:get, "http://google.com", :file => File.join(File.dirname(__FILE__), 'fake_pages/google.html'))
|
15
16
|
end
|
16
17
|
|
data/test/test_link.rb
CHANGED
@@ -37,19 +37,40 @@ class TestLink < Test::Unit::TestCase
|
|
37
37
|
assert(@results.is_a?(Array))
|
38
38
|
assert(@results.include?(Link.new('http://example.com/first_page.html')))
|
39
39
|
assert(@results.include?(Link.new('http://example.com/not_added.html')))
|
40
|
+
assert(@results.include?(Link.new('http://example.com/main.html?action=edit')))
|
41
|
+
assert(!@results.include?(Link.new('http://example.com/first_child_page.html/file://fileserver/file.pdf')))
|
40
42
|
end
|
41
43
|
end
|
42
44
|
|
43
45
|
context "scraping inside a div" do
|
44
46
|
setup do
|
45
47
|
@link = Link.new('http://example.com/main.html')
|
46
|
-
@results = @link.scrape!('#content')
|
48
|
+
@results = @link.scrape!(:div => '#content')
|
47
49
|
end
|
48
50
|
|
49
51
|
should "return an array of links on the page" do
|
50
52
|
assert_not_nil(@results)
|
51
53
|
assert(@results.is_a?(Array))
|
52
54
|
assert(@results.include?(Link.new('http://example.com/first_page.html')))
|
55
|
+
assert(@results.include?(Link.new('http://example.com/main.html?action=edit')))
|
56
|
+
end
|
57
|
+
|
58
|
+
should "not return links not in the div" do
|
59
|
+
assert(!@results.include?(Link.new('http://example.com/not_added.html')), "Includes a link outside of the correct div.")
|
60
|
+
end
|
61
|
+
end
|
62
|
+
|
63
|
+
context "scraping with ignore options" do
|
64
|
+
setup do
|
65
|
+
@link = Link.new('http://example.com/main.html')
|
66
|
+
@results = @link.scrape!(:div => '#content', :ignore => [/\?/])
|
67
|
+
end
|
68
|
+
|
69
|
+
should "return an array of links on the page" do
|
70
|
+
assert_not_nil(@results)
|
71
|
+
assert(@results.is_a?(Array))
|
72
|
+
assert(@results.include?(Link.new('http://example.com/first_page.html')))
|
73
|
+
assert(!@results.include?(Link.new('http://example.com/main.html?action=edit')))
|
53
74
|
end
|
54
75
|
|
55
76
|
should "not return links not in the div" do
|
data/test/test_scraper.rb
CHANGED
@@ -16,18 +16,43 @@ class TestScraper < Test::Unit::TestCase
|
|
16
16
|
context "scraping" do
|
17
17
|
setup do
|
18
18
|
@scraper = Scraper.new('http://example.com/main.html')
|
19
|
-
@results = @scraper.scrape('#content')
|
19
|
+
@results = @scraper.scrape(:div => '#content')
|
20
20
|
end
|
21
21
|
|
22
22
|
should "Include a list of links on the pages." do
|
23
23
|
assert(@results.include?(Link.new('http://example.com/first_page.html')))
|
24
24
|
assert(@results.include?(Link.new('http://example.com/first_child_page.html')))
|
25
25
|
assert(@results.include?(Link.new('http://example.com/first_child_page.html#content')))
|
26
|
+
assert(@results.include?(Link.new('http://example.com/first_child_page.html#content2')))
|
26
27
|
assert(@results.include?(Link.new('http://example.com/main.html')))
|
28
|
+
assert(@results.include?(Link.new('http://google.com')))
|
27
29
|
end
|
28
30
|
|
29
31
|
should "Not include any links outside of the content div" do
|
30
32
|
assert(!@results.include?(Link.new('http://example.com/not_added.html')))
|
31
33
|
end
|
32
34
|
end
|
35
|
+
|
36
|
+
context "scraping within domain" do
|
37
|
+
setup do
|
38
|
+
@scraper = Scraper.new('http://example.com/main.html', :domain => 'example.com')
|
39
|
+
@results = @scraper.scrape(:div => '#content')
|
40
|
+
end
|
41
|
+
|
42
|
+
should "Include a list of links on the pages." do
|
43
|
+
assert(@results.include?(Link.new('http://example.com/first_page.html')))
|
44
|
+
assert(@results.include?(Link.new('http://example.com/first_child_page.html')))
|
45
|
+
assert(@results.include?(Link.new('http://example.com/first_child_page.html#content')))
|
46
|
+
assert(@results.include?(Link.new('http://example.com/first_child_page.html#content2')))
|
47
|
+
assert(@results.include?(Link.new('http://example.com/main.html')))
|
48
|
+
end
|
49
|
+
|
50
|
+
should "Not include any links outside of the content div" do
|
51
|
+
assert(!@results.include?(Link.new('http://example.com/not_added.html')))
|
52
|
+
end
|
53
|
+
|
54
|
+
should "Not include any links outside of the domain" do
|
55
|
+
assert(!@results.include?(Link.new('http://google.com')))
|
56
|
+
end
|
57
|
+
end
|
33
58
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: guitsaru-scraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Matt Pruitt
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-06-
|
12
|
+
date: 2009-06-18 00:00:00 -07:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -43,6 +43,7 @@ files:
|
|
43
43
|
- scraper.gemspec
|
44
44
|
- test/fake_pages/first_child_page.html
|
45
45
|
- test/fake_pages/first_page.html
|
46
|
+
- test/fake_pages/google.html
|
46
47
|
- test/fake_pages/main.html
|
47
48
|
- test/fake_pages/not_added.html
|
48
49
|
- test/test_helper.rb
|