guitsaru-scraper 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.gitignore +5 -0
- data/LICENSE +20 -0
- data/README.rdoc +7 -0
- data/Rakefile +59 -0
- data/VERSION +1 -0
- data/lib/scraper/link.rb +43 -0
- data/lib/scraper.rb +23 -0
- data/test/fake_pages/first_child_page.html +15 -0
- data/test/fake_pages/first_page.html +20 -0
- data/test/fake_pages/main.html +20 -0
- data/test/fake_pages/not_added.html +20 -0
- data/test/test_helper.rb +16 -0
- data/test/test_link.rb +63 -0
- data/test/test_scraper.rb +32 -0
- metadata +79 -0
data/.document
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
Copyright (c) 2009 Matt Pruitt

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.rdoc
ADDED
data/Rakefile
ADDED
@@ -0,0 +1,59 @@
|
|
1
|
+
require 'rubygems'
require 'rake'

# Gem packaging via Jeweler. Wrapped in begin/rescue so the Rakefile still
# loads (with a hint) on machines where jeweler isn't installed.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "scraper"
    gem.summary = %Q{Collects all links on a webpage recursively.}
    gem.email = "guitsaru@gmail.com"
    gem.homepage = "http://github.com/guitsaru/scraper"
    gem.authors = ["Matt Pruitt"]
    gem.rubyforge_project = "scraper"
    gem.add_dependency('hpricot', '>= 0.6.161')
  end

  Jeweler::RubyforgeTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
end

# `rake test`: run every test/**/test_*.rb with lib/ and test/ on the load path.
require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

# `rake rcov`: coverage via RCov when available; otherwise a stub task that
# aborts with install instructions.
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    test.pattern = 'test/**/test_*.rb'
    test.verbose = true
    test.rcov_opts += ['--exclude gems']
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end


task :default => :test

# `rake rdoc`: generate API docs; the title's version string is read from
# VERSION.yml when that file exists, otherwise left blank.
require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  if File.exist?('VERSION.yml')
    config = YAML.load(File.read('VERSION.yml'))
    version = "#{config[:major]}.#{config[:minor]}.#{config[:patch]}"
  else
    version = ""
  end

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "scraper #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.1.0
|
data/lib/scraper/link.rb
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
module Scrape
  # A single hyperlink discovered while scraping: its absolute URL, its
  # anchor text, and whether it has been fetched yet.
  class Link
    attr_accessor :url, :visited, :title

    # url   - String URL this link points at.
    # title - anchor text (inner HTML); defaults to ''.
    # New links start out unvisited.
    def initialize(url, title = '')
      @url = url
      @title = title
      @visited = false
    end

    # Fetches the page at @url and returns the Links found on it, marking
    # this link as visited. Returns [] without fetching when already visited
    # (this is what lets Scraper#scrape terminate on cyclic link graphs).
    # div - optional CSS selector restricting which anchors are collected.
    def scrape!(div = nil)
      return [] if @visited
      @visited = true
      get_links(div)
    end

    # Links are compared by URL only; title and visited state are ignored.
    def ==(other)
      return false unless other.is_a? Link
      @url == other.url
    end

    private

    # Downloads @url, extracts every anchor (scoped to `div` when given),
    # resolves each href to an absolute URL, and returns the unique Links.
    def get_links(div)
      links = []

      doc = Hpricot(Net::HTTP.get(URI.parse(url)))
      doc.search("#{div} a").each do |link|
        href = link['href']
        if href =~ /^\/(.*)/
          # Root-relative href: prepend scheme and host taken from @url.
          # Bug fix: the original interpolated the literal text "/url"
          # instead of the href itself, so every root-relative link
          # resolved to "http://host/url".
          components = URI.split(@url)
          href = "#{components[0] || 'http'}://#{components[2]}#{href}"
        elsif href =~ %r{^https?://}i
          # Already absolute — keep as-is. (Generalized from http-only to
          # also accept https; http links behave exactly as before.)
        else
          # Document-relative href (or missing): resolve against the
          # directory portion of the current page's URL.
          href = File.dirname(@url) + '/' + (href || '')
        end

        links << Link.new(href, link.inner_html)
      end

      links.uniq
    end
  end
end
|
data/lib/scraper.rb
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'hpricot'
require 'open-uri'

require File.join(File.dirname(__FILE__), '..', 'lib/scraper/link')

# Entry point for the gem: crawls outward from a starting URL, collecting
# every Scrape::Link reachable from it.
class Scraper
  include Scrape

  attr_accessor :url

  # url - the page the crawl starts from.
  def initialize(url)
    self.url = url
  end

  # Repeatedly scrapes every not-yet-visited link until none remain, then
  # returns the unique Links discovered (the start URL included).
  # div - optional CSS selector restricting which anchors are followed.
  def scrape(div = nil)
    links = [Link.new(url)]

    loop do
      pending = links.uniq.reject(&:visited)
      break if pending.empty?
      pending.each { |candidate| links.concat(candidate.scrape!(div)) }
    end

    links.uniq
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
|
2
|
+
"http://www.w3.org/TR/html4/strict.dtd">
|
3
|
+
|
4
|
+
<html lang="en">
|
5
|
+
<head>
|
6
|
+
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
|
7
|
+
<title>untitled</title>
|
8
|
+
<meta name="generator" content="TextMate http://macromates.com/">
|
9
|
+
<meta name="author" content="Matt Pruitt">
|
10
|
+
<!-- Date: 2009-06-17 -->
|
11
|
+
</head>
|
12
|
+
<body>
|
13
|
+
<div id="content"><a href="/main.html">Main</a></div>
|
14
|
+
</body>
|
15
|
+
</html>
|
@@ -0,0 +1,20 @@
|
|
1
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
|
2
|
+
"http://www.w3.org/TR/html4/strict.dtd">
|
3
|
+
|
4
|
+
<html lang="en">
|
5
|
+
<head>
|
6
|
+
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
|
7
|
+
<title>untitled</title>
|
8
|
+
<meta name="generator" content="TextMate http://macromates.com/">
|
9
|
+
<meta name="author" content="Matt Pruitt">
|
10
|
+
<!-- Date: 2009-06-17 -->
|
11
|
+
</head>
|
12
|
+
<body>
|
13
|
+
<div id="header">
|
14
|
+
<a href="not_added.html">Not Added</a>
|
15
|
+
</div>
|
16
|
+
<div id="content">
|
17
|
+
<a href="http://example.com/first_child_page.html">First Child Page</a>
|
18
|
+
</div>
|
19
|
+
</body>
|
20
|
+
</html>
|
@@ -0,0 +1,20 @@
|
|
1
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
|
2
|
+
"http://www.w3.org/TR/html4/strict.dtd">
|
3
|
+
|
4
|
+
<html lang="en">
|
5
|
+
<head>
|
6
|
+
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
|
7
|
+
<title>untitled</title>
|
8
|
+
<meta name="generator" content="TextMate http://macromates.com/">
|
9
|
+
<meta name="author" content="Matt Pruitt">
|
10
|
+
<!-- Date: 2009-06-17 -->
|
11
|
+
</head>
|
12
|
+
<body>
|
13
|
+
<div id="header">
|
14
|
+
<a href="not_added.html">Not Added</a>
|
15
|
+
</div>
|
16
|
+
<div id="content">
|
17
|
+
<a href="first_page.html">First Page</a>
|
18
|
+
</div>
|
19
|
+
</body>
|
20
|
+
</html>
|
@@ -0,0 +1,20 @@
|
|
1
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
|
2
|
+
"http://www.w3.org/TR/html4/strict.dtd">
|
3
|
+
|
4
|
+
<html lang="en">
|
5
|
+
<head>
|
6
|
+
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
|
7
|
+
<title>untitled</title>
|
8
|
+
<meta name="generator" content="TextMate http://macromates.com/">
|
9
|
+
<meta name="author" content="Matt Pruitt">
|
10
|
+
<!-- Date: 2009-06-17 -->
|
11
|
+
</head>
|
12
|
+
<body>
|
13
|
+
<div id="header">
|
14
|
+
<a href="not_added.html">Not Added</a>
|
15
|
+
</div>
|
16
|
+
<div id="content">
|
17
|
+
<a href="first_child_page_not_added.html">First Child Page Not Added</a>
|
18
|
+
</div>
|
19
|
+
</body>
|
20
|
+
</html>
|
data/test/test_helper.rb
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
require 'rubygems'
require 'test/unit'
require 'shoulda'
require 'fakeweb'

# Make lib/ and test/ requirable without installing the gem.
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
$LOAD_PATH.unshift(File.dirname(__FILE__))
require 'scraper'

class Test::Unit::TestCase
  # Stub every URL the tests fetch with a local fixture page from
  # test/fake_pages/, so the suite runs without real network access.
  FakeWeb.register_uri(:get, "http://example.com/main.html", :file => File.join(File.dirname(__FILE__), 'fake_pages/main.html'))
  FakeWeb.register_uri(:get, "http://example.com/first_page.html", :file => File.join(File.dirname(__FILE__), 'fake_pages/first_page.html'))
  FakeWeb.register_uri(:get, "http://example.com/first_child_page.html", :file => File.join(File.dirname(__FILE__), 'fake_pages/first_child_page.html'))
  FakeWeb.register_uri(:get, "http://example.com/not_added.html", :file => File.join(File.dirname(__FILE__), 'fake_pages/not_added.html'))
end
|
16
|
+
|
data/test/test_link.rb
ADDED
@@ -0,0 +1,63 @@
|
|
1
|
+
require 'test_helper'

# Unit tests for Scrape::Link. Network fetches are served from the
# FakeWeb fixtures registered in test_helper.
class TestLink < Test::Unit::TestCase
  include Scrape

  context "initialization" do
    setup do
      @link = Link.new('http://example.com')
    end

    should "set the url" do
      assert_equal('http://example.com', @link.url)
    end

    should "set the title" do
      # Title defaults to '' and is otherwise taken from the second argument.
      assert_equal('', @link.title)
      assert_equal('title', Link.new('http://example.com', 'title').title)
    end

    should "set the visited flag" do
      assert_equal(false, @link.visited)
    end
  end

  context "scraping" do
    setup do
      @link = Link.new('http://example.com/main.html')
      @results = @link.scrape!
    end

    should "set the visited flag to true" do
      assert(@link.visited, "Link was not visited")
    end

    should "return an array of links on the page" do
      # With no div filter, anchors from both #header and #content appear.
      assert_not_nil(@results)
      assert(@results.is_a?(Array))
      assert(@results.include?(Link.new('http://example.com/first_page.html')))
      assert(@results.include?(Link.new('http://example.com/not_added.html')))
    end
  end

  context "scraping inside a div" do
    setup do
      @link = Link.new('http://example.com/main.html')
      @results = @link.scrape!('#content')
    end

    should "return an array of links on the page" do
      assert_not_nil(@results)
      assert(@results.is_a?(Array))
      assert(@results.include?(Link.new('http://example.com/first_page.html')))
    end

    should "not return links not in the div" do
      # not_added.html is linked from #header, outside the #content filter.
      assert(!@results.include?(Link.new('http://example.com/not_added.html')), "Includes a link outside of the correct div.")
    end
  end

  should "be equal to another link with the same url" do
    assert(Link.new('http://example.com') == Link.new('http://example.com'))
  end
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
require 'test_helper'

# Integration-style tests for Scraper: a full recursive crawl over the
# FakeWeb fixture pages registered in test_helper.
class TestScraper < Test::Unit::TestCase
  include Scrape

  context "initialization" do
    setup do
      @scraper = Scraper.new('http://example.com')
    end

    should "set the url" do
      assert_equal('http://example.com', @scraper.url)
    end
  end

  context "scraping" do
    setup do
      @scraper = Scraper.new('http://example.com/main.html')
      @results = @scraper.scrape('#content')
    end

    should "Include a list of links on the pages." do
      # The crawl follows main -> first_page -> first_child_page and also
      # keeps the starting page itself in the result set.
      assert(@results.include?(Link.new('http://example.com/first_page.html')))
      assert(@results.include?(Link.new('http://example.com/first_child_page.html')))
      assert(@results.include?(Link.new('http://example.com/main.html')))
    end

    should "Not include any links outside of the content div" do
      # not_added.html only appears in #header divs, which the '#content'
      # filter excludes on every crawled page.
      assert(!@results.include?(Link.new('http://example.com/not_added.html')))
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,79 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: guitsaru-scraper
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Matt Pruitt
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2009-06-17 00:00:00 -07:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: hpricot
|
17
|
+
type: :runtime
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 0.6.161
|
24
|
+
version:
|
25
|
+
description:
|
26
|
+
email: guitsaru@gmail.com
|
27
|
+
executables: []
|
28
|
+
|
29
|
+
extensions: []
|
30
|
+
|
31
|
+
extra_rdoc_files:
|
32
|
+
- LICENSE
|
33
|
+
- README.rdoc
|
34
|
+
files:
|
35
|
+
- .document
|
36
|
+
- .gitignore
|
37
|
+
- LICENSE
|
38
|
+
- README.rdoc
|
39
|
+
- Rakefile
|
40
|
+
- VERSION
|
41
|
+
- lib/scraper.rb
|
42
|
+
- lib/scraper/link.rb
|
43
|
+
- test/fake_pages/first_child_page.html
|
44
|
+
- test/fake_pages/first_page.html
|
45
|
+
- test/fake_pages/main.html
|
46
|
+
- test/fake_pages/not_added.html
|
47
|
+
- test/test_helper.rb
|
48
|
+
- test/test_link.rb
|
49
|
+
- test/test_scraper.rb
|
50
|
+
has_rdoc: false
|
51
|
+
homepage: http://github.com/guitsaru/scraper
|
52
|
+
post_install_message:
|
53
|
+
rdoc_options:
|
54
|
+
- --charset=UTF-8
|
55
|
+
require_paths:
|
56
|
+
- lib
|
57
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: "0"
|
62
|
+
version:
|
63
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
64
|
+
requirements:
|
65
|
+
- - ">="
|
66
|
+
- !ruby/object:Gem::Version
|
67
|
+
version: "0"
|
68
|
+
version:
|
69
|
+
requirements: []
|
70
|
+
|
71
|
+
rubyforge_project: scraper
|
72
|
+
rubygems_version: 1.2.0
|
73
|
+
signing_key:
|
74
|
+
specification_version: 3
|
75
|
+
summary: Collects all links on a webpage recursively.
|
76
|
+
test_files:
|
77
|
+
- test/test_helper.rb
|
78
|
+
- test/test_link.rb
|
79
|
+
- test/test_scraper.rb
|