guitsaru-scraper 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
data/.gitignore ADDED
@@ -0,0 +1,5 @@
1
+ *.sw?
2
+ .DS_Store
3
+ coverage
4
+ rdoc
5
+ pkg
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Matt Pruitt
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,7 @@
1
+ = scraper
2
+
3
+ Scraper recursively collects all links found on a webpage, optionally restricted to a given element (e.g. a specific div).
4
+
5
+ == Copyright
6
+
7
+ Copyright (c) 2009 Matt Pruitt. See LICENSE for details.
data/Rakefile ADDED
@@ -0,0 +1,59 @@
require 'rubygems'
require 'rake'

# Gem packaging tasks — only defined when the jeweler gem is installed;
# otherwise we print install instructions instead of failing the Rakefile.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "scraper"
    gem.summary = %Q{Collects all links on a webpage recursively.}
    gem.email = "guitsaru@gmail.com"
    gem.homepage = "http://github.com/guitsaru/scraper"
    gem.authors = ["Matt Pruitt"]
    gem.rubyforge_project = "scraper"
    gem.add_dependency('hpricot', '>= 0.6.161')
  end

  Jeweler::RubyforgeTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
end

# Test task: runs every test/**/test_*.rb with lib/ and test/ on the load path.
require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

# Coverage task — only defined when rcov is installed.
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    test.pattern = 'test/**/test_*.rb'
    test.verbose = true
    test.rcov_opts += ['--exclude gems']
  end
rescue LoadError
  # Stub task so `rake rcov` aborts with a helpful message instead of
  # "Don't know how to build task".
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end


task :default => :test

# RDoc generation; the doc title embeds the version read from VERSION.yml
# when that file exists, and is blank otherwise.
require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  if File.exist?('VERSION.yml')
    config = YAML.load(File.read('VERSION.yml'))
    version = "#{config[:major]}.#{config[:minor]}.#{config[:patch]}"
  else
    version = ""
  end

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "scraper #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.0
@@ -0,0 +1,43 @@
module Scrape
  # A single hyperlink discovered during a crawl. Knows how to fetch its
  # own page and return the Links found there, exactly once.
  class Link
    attr_accessor :url, :visited, :title

    # url   - the URL this link points to.
    # title - optional human-readable link text (defaults to '').
    def initialize(url, title='')
      @url = url
      @title = title
      @visited = false
    end

    # Fetches the page at @url and returns an Array of Links found on it.
    # Returns [] immediately if this link was already visited — this is
    # what stops the crawler from looping on pages that link to each other.
    # div - optional CSS selector restricting the search (e.g. '#content').
    def scrape!(div=nil)
      return [] if @visited
      @visited = true
      return get_links(div)
    end

    # Links are equal when they point at the same URL; title and the
    # visited flag are ignored so Array#uniq collapses duplicates.
    def ==(other)
      return false unless other.is_a? Link
      @url == other.url
    end

    private
    # Downloads @url, parses it with Hpricot, and turns every <a href>
    # inside `div` (the whole page when div is nil) into a Link with an
    # absolute URL.
    def get_links(div)
      links = []

      # NOTE: `url` here is the attr_reader (@url); the block-local `url`
      # below shadows it only inside the each block.
      doc = Hpricot(Net::HTTP.get(URI.parse(url)))
      doc.search("#{div} a").each do |link|
        url = link['href']
        if url =~ /^\/(.*)/
          # Root-relative href: prepend this page's scheme and host.
          components = URI::split(@url)
          # BUG FIX: the original produced the literal string ".../url"
          # ("#{components[2]}/url") instead of interpolating the href,
          # so every root-relative link resolved to http://host/url.
          url = "#{components[0] || 'http'}://#{components[2]}#{url}"
        elsif url =~ /^http:\/\//i
          url = url # already absolute — keep as-is
        else
          # Relative href: resolve against this page's directory.
          url = (File.dirname(@url) + '/' + (url || ''))
        end

        links << Link.new(url, link.inner_html)
      end

      return links.uniq
    end
  end
end
data/lib/scraper.rb ADDED
@@ -0,0 +1,23 @@
1
+ require 'hpricot'
2
+ require 'open-uri'
3
+
4
+ require File.join(File.dirname(__FILE__), '..', 'lib/scraper/link')
5
+
# Crawls a site starting from a single URL, repeatedly scraping every
# not-yet-visited link until no unvisited pages remain.
class Scraper
  include Scrape

  attr_accessor :url

  # url - the starting page for the crawl.
  def initialize(url)
    self.url = url
  end

  # Collects every Link reachable from the starting URL.
  # div - optional CSS selector limiting which part of each page is
  #       searched (passed through to Link#scrape!).
  # Returns a de-duplicated Array of Links, the start page included.
  def scrape(div=nil)
    found = [Link.new(url)]

    loop do
      pending = found.uniq.reject(&:visited)
      break if pending.empty?

      pending.each do |page|
        found.concat(page.scrape!(div))
      end
    end

    found.uniq
  end
end
@@ -0,0 +1,15 @@
1
+ <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
2
+ "http://www.w3.org/TR/html4/strict.dtd">
3
+
4
+ <html lang="en">
5
+ <head>
6
+ <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
7
+ <title>untitled</title>
8
+ <meta name="generator" content="TextMate http://macromates.com/">
9
+ <meta name="author" content="Matt Pruitt">
10
+ <!-- Date: 2009-06-17 -->
11
+ </head>
12
+ <body>
13
+ <div id="content"><a href="/main.html">Main</a></div>
14
+ </body>
15
+ </html>
@@ -0,0 +1,20 @@
1
+ <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
2
+ "http://www.w3.org/TR/html4/strict.dtd">
3
+
4
+ <html lang="en">
5
+ <head>
6
+ <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
7
+ <title>untitled</title>
8
+ <meta name="generator" content="TextMate http://macromates.com/">
9
+ <meta name="author" content="Matt Pruitt">
10
+ <!-- Date: 2009-06-17 -->
11
+ </head>
12
+ <body>
13
+ <div id="header">
14
+ <a href="not_added.html">Not Added</a>
15
+ </div>
16
+ <div id="content">
17
+ <a href="http://example.com/first_child_page.html">First Child Page</a>
18
+ </div>
19
+ </body>
20
+ </html>
@@ -0,0 +1,20 @@
1
+ <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
2
+ "http://www.w3.org/TR/html4/strict.dtd">
3
+
4
+ <html lang="en">
5
+ <head>
6
+ <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
7
+ <title>untitled</title>
8
+ <meta name="generator" content="TextMate http://macromates.com/">
9
+ <meta name="author" content="Matt Pruitt">
10
+ <!-- Date: 2009-06-17 -->
11
+ </head>
12
+ <body>
13
+ <div id="header">
14
+ <a href="not_added.html">Not Added</a>
15
+ </div>
16
+ <div id="content">
17
+ <a href="first_page.html">First Page</a>
18
+ </div>
19
+ </body>
20
+ </html>
@@ -0,0 +1,20 @@
1
+ <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
2
+ "http://www.w3.org/TR/html4/strict.dtd">
3
+
4
+ <html lang="en">
5
+ <head>
6
+ <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
7
+ <title>untitled</title>
8
+ <meta name="generator" content="TextMate http://macromates.com/">
9
+ <meta name="author" content="Matt Pruitt">
10
+ <!-- Date: 2009-06-17 -->
11
+ </head>
12
+ <body>
13
+ <div id="header">
14
+ <a href="not_added.html">Not Added</a>
15
+ </div>
16
+ <div id="content">
17
+ <a href="first_child_page_not_added.html">First Child Page Not Added</a>
18
+ </div>
19
+ </body>
20
+ </html>
@@ -0,0 +1,16 @@
require 'rubygems'
require 'test/unit'
require 'shoulda'
require 'fakeweb'

# Make lib/ and test/ requirable without installing the gem.
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
$LOAD_PATH.unshift(File.dirname(__FILE__))
require 'scraper'

class Test::Unit::TestCase
  # Stub every URL the tests touch so no real HTTP requests are made;
  # each registered URI serves a canned fixture from test/fake_pages/.
  FakeWeb.register_uri(:get, "http://example.com/main.html", :file => File.join(File.dirname(__FILE__), 'fake_pages/main.html'))
  FakeWeb.register_uri(:get, "http://example.com/first_page.html", :file => File.join(File.dirname(__FILE__), 'fake_pages/first_page.html'))
  FakeWeb.register_uri(:get, "http://example.com/first_child_page.html", :file => File.join(File.dirname(__FILE__), 'fake_pages/first_child_page.html'))
  FakeWeb.register_uri(:get, "http://example.com/not_added.html", :file => File.join(File.dirname(__FILE__), 'fake_pages/not_added.html'))
end
data/test/test_link.rb ADDED
@@ -0,0 +1,63 @@
require 'test_helper'

# Unit tests for Scrape::Link. All HTTP traffic is intercepted by the
# FakeWeb registrations in test_helper, so these run offline.
class TestLink < Test::Unit::TestCase
  include Scrape

  context "initialization" do
    setup do
      @link = Link.new('http://example.com')
    end

    should "set the url" do
      assert_equal('http://example.com', @link.url)
    end

    # The title defaults to '' and may be supplied explicitly.
    should "set the title" do
      assert_equal('', @link.title)
      assert_equal('title', Link.new('http://example.com', 'title').title)
    end

    # New links start out unvisited.
    should "set the visited flag" do
      assert_equal(false, @link.visited)
    end
  end

  context "scraping" do
    setup do
      @link = Link.new('http://example.com/main.html')
      @results = @link.scrape!
    end

    should "set the visited flag to true" do
      assert(@link.visited, "Link was not visited")
    end

    should "return an array of links on the page" do
      assert_not_nil(@results)
      assert(@results.is_a?(Array))
      assert(@results.include?(Link.new('http://example.com/first_page.html')))
      assert(@results.include?(Link.new('http://example.com/not_added.html')))
    end
  end

  # Passing a CSS selector restricts the scrape to that element.
  context "scraping inside a div" do
    setup do
      @link = Link.new('http://example.com/main.html')
      @results = @link.scrape!('#content')
    end

    should "return an array of links on the page" do
      assert_not_nil(@results)
      assert(@results.is_a?(Array))
      assert(@results.include?(Link.new('http://example.com/first_page.html')))
    end

    should "not return links not in the div" do
      assert(!@results.include?(Link.new('http://example.com/not_added.html')), "Includes a link outside of the correct div.")
    end
  end

  # Equality is based solely on the url attribute.
  should "be equal to another link with the same url" do
    assert(Link.new('http://example.com') == Link.new('http://example.com'))
  end
end
@@ -0,0 +1,32 @@
require 'test_helper'

# Integration-style tests for Scraper: a full recursive crawl over the
# FakeWeb fixture pages registered in test_helper.
class TestScraper < Test::Unit::TestCase
  include Scrape

  context "initialization" do
    setup do
      @scraper = Scraper.new('http://example.com')
    end

    should "set the url" do
      assert_equal('http://example.com', @scraper.url)
    end
  end

  context "scraping" do
    setup do
      @scraper = Scraper.new('http://example.com/main.html')
      @results = @scraper.scrape('#content')
    end

    # The crawl should follow links transitively (main -> first_page ->
    # first_child_page) and include the start page itself.
    should "Include a list of links on the pages." do
      assert(@results.include?(Link.new('http://example.com/first_page.html')))
      assert(@results.include?(Link.new('http://example.com/first_child_page.html')))
      assert(@results.include?(Link.new('http://example.com/main.html')))
    end

    # Links outside the '#content' selector must be excluded on every page.
    should "Not include any links outside of the content div" do
      assert(!@results.include?(Link.new('http://example.com/not_added.html')))
    end
  end
end
metadata ADDED
@@ -0,0 +1,79 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: guitsaru-scraper
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Matt Pruitt
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-06-17 00:00:00 -07:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: hpricot
17
+ type: :runtime
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: 0.6.161
24
+ version:
25
+ description:
26
+ email: guitsaru@gmail.com
27
+ executables: []
28
+
29
+ extensions: []
30
+
31
+ extra_rdoc_files:
32
+ - LICENSE
33
+ - README.rdoc
34
+ files:
35
+ - .document
36
+ - .gitignore
37
+ - LICENSE
38
+ - README.rdoc
39
+ - Rakefile
40
+ - VERSION
41
+ - lib/scraper.rb
42
+ - lib/scraper/link.rb
43
+ - test/fake_pages/first_child_page.html
44
+ - test/fake_pages/first_page.html
45
+ - test/fake_pages/main.html
46
+ - test/fake_pages/not_added.html
47
+ - test/test_helper.rb
48
+ - test/test_link.rb
49
+ - test/test_scraper.rb
50
+ has_rdoc: false
51
+ homepage: http://github.com/guitsaru/scraper
52
+ post_install_message:
53
+ rdoc_options:
54
+ - --charset=UTF-8
55
+ require_paths:
56
+ - lib
57
+ required_ruby_version: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: "0"
62
+ version:
63
+ required_rubygems_version: !ruby/object:Gem::Requirement
64
+ requirements:
65
+ - - ">="
66
+ - !ruby/object:Gem::Version
67
+ version: "0"
68
+ version:
69
+ requirements: []
70
+
71
+ rubyforge_project: scraper
72
+ rubygems_version: 1.2.0
73
+ signing_key:
74
+ specification_version: 3
75
+ summary: Collects all links on a webpage recursively.
76
+ test_files:
77
+ - test/test_helper.rb
78
+ - test/test_link.rb
79
+ - test/test_scraper.rb