guitsaru-scraper 0.1.0

data/.document ADDED
@@ -0,0 +1,5 @@
+ README.rdoc
+ lib/**/*.rb
+ bin/*
+ features/**/*.feature
+ LICENSE
data/.gitignore ADDED
@@ -0,0 +1,5 @@
+ *.sw?
+ .DS_Store
+ coverage
+ rdoc
+ pkg
data/LICENSE ADDED
@@ -0,0 +1,20 @@
+ Copyright (c) 2009 Matt Pruitt
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,7 @@
+ = scraper
+
+ Collects all links on a webpage recursively.
+
+ == Copyright
+
+ Copyright (c) 2009 Matt Pruitt. See LICENSE for details.
data/Rakefile ADDED
@@ -0,0 +1,59 @@
+ require 'rubygems'
+ require 'rake'
+
+ begin
+   require 'jeweler'
+   Jeweler::Tasks.new do |gem|
+     gem.name = "scraper"
+     gem.summary = %Q{Collects all links on a webpage recursively.}
+     gem.email = "guitsaru@gmail.com"
+     gem.homepage = "http://github.com/guitsaru/scraper"
+     gem.authors = ["Matt Pruitt"]
+     gem.rubyforge_project = "scraper"
+     gem.add_dependency('hpricot', '>= 0.6.161')
+   end
+
+   Jeweler::RubyforgeTasks.new
+ rescue LoadError
+   puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
+ end
+
+ require 'rake/testtask'
+ Rake::TestTask.new(:test) do |test|
+   test.libs << 'lib' << 'test'
+   test.pattern = 'test/**/test_*.rb'
+   test.verbose = true
+ end
+
+ begin
+   require 'rcov/rcovtask'
+   Rcov::RcovTask.new do |test|
+     test.libs << 'test'
+     test.pattern = 'test/**/test_*.rb'
+     test.verbose = true
+     test.rcov_opts += ['--exclude gems']
+   end
+ rescue LoadError
+   task :rcov do
+     abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
+   end
+ end
+
+
+ task :default => :test
+
+ require 'rake/rdoctask'
+ Rake::RDocTask.new do |rdoc|
+   if File.exist?('VERSION.yml')
+     config = YAML.load(File.read('VERSION.yml'))
+     version = "#{config[:major]}.#{config[:minor]}.#{config[:patch]}"
+   else
+     version = ""
+   end
+
+   rdoc.rdoc_dir = 'rdoc'
+   rdoc.title = "scraper #{version}"
+   rdoc.rdoc_files.include('README*')
+   rdoc.rdoc_files.include('lib/**/*.rb')
+ end
+
data/VERSION ADDED
@@ -0,0 +1 @@
+ 0.1.0
data/lib/scraper/link.rb ADDED
@@ -0,0 +1,54 @@
+ module Scrape
+   class Link
+     attr_accessor :url, :visited, :title
+
+     def initialize(url, title='')
+       @url = url
+       @title = title
+       @visited = false
+     end
+
+     # Fetches the page at @url and returns the links found there.
+     # Returns an empty array on repeat visits so callers never
+     # scrape the same page twice.
+     def scrape!(div=nil)
+       return [] if @visited
+       @visited = true
+       get_links(div)
+     end
+
+     # Two links are equal when they point at the same URL. eql? and
+     # hash agree with == so Array#uniq also dedupes links by URL.
+     def ==(other)
+       return false unless other.is_a? Link
+       @url == other.url
+     end
+
+     def eql?(other)
+       self == other
+     end
+
+     def hash
+       @url.hash
+     end
+
+     private
+
+     def get_links(div)
+       links = []
+
+       doc = Hpricot(Net::HTTP.get(URI.parse(@url)))
+       doc.search("#{div} a").each do |link|
+         href = link['href']
+         if href =~ /^\//
+           # Root-relative path: reuse the current page's scheme and host.
+           components = URI.split(@url)
+           href = "#{components[0] || 'http'}://#{components[2]}#{href}"
+         elsif href !~ /^http:\/\//i
+           # Relative path: resolve against the current page's directory.
+           href = File.dirname(@url) + '/' + (href || '')
+         end
+
+         links << Link.new(href, link.inner_html)
+       end
+
+       links.uniq
+     end
+   end
+ end
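
The href handling above covers three cases: root-relative paths keep the current page's scheme and host, absolute http URLs pass through untouched, and everything else is resolved against the current page's directory. A minimal standalone sketch of those rules (the URLs are hypothetical, and resolve_href is a helper invented here for illustration):

  require 'uri'

  # Mirrors the resolution logic in Scrape::Link#get_links.
  def resolve_href(base, href)
    if href =~ /^\//
      components = URI.split(base)
      "#{components[0] || 'http'}://#{components[2]}#{href}"
    elsif href =~ /^http:\/\//i
      href
    else
      File.dirname(base) + '/' + href
    end
  end

  base = 'http://example.com/docs/main.html'
  resolve_href(base, '/about.html')        # => "http://example.com/about.html"
  resolve_href(base, 'http://other.com/')  # => "http://other.com/"
  resolve_href(base, 'faq.html')           # => "http://example.com/docs/faq.html"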
data/lib/scraper.rb ADDED
@@ -0,0 +1,25 @@
+ require 'hpricot'
+ require 'open-uri'
+
+ require File.join(File.dirname(__FILE__), 'scraper', 'link')
+
+ class Scraper
+   include Scrape
+
+   attr_accessor :url
+
+   def initialize(url)
+     self.url = url
+   end
+
+   # Breadth-first crawl: keep scraping every not-yet-visited link
+   # until a pass turns up no new unvisited links.
+   def scrape(div=nil)
+     links = [Link.new(self.url)]
+     until (not_visited = links.uniq.select { |link| !link.visited }).empty?
+       not_visited.each { |link| links += link.scrape!(div) }
+     end
+
+     links.uniq
+   end
+ end
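
A minimal usage sketch (the URL and the '#content' selector are hypothetical; Scraper#scrape accepts any selector understood by Hpricot, or nil to follow every link on each page):

  require 'scraper'

  scraper = Scraper.new('http://example.com/index.html')

  # Crawl recursively, only following links found inside <div id="content">.
  scraper.scrape('#content').each do |link|
    puts "#{link.title}: #{link.url}"
  end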
data/test/fake_pages/first_child_page.html ADDED
@@ -0,0 +1,15 @@
+ <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
+   "http://www.w3.org/TR/html4/strict.dtd">
+
+ <html lang="en">
+ <head>
+   <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
+   <title>untitled</title>
+   <meta name="generator" content="TextMate http://macromates.com/">
+   <meta name="author" content="Matt Pruitt">
+   <!-- Date: 2009-06-17 -->
+ </head>
+ <body>
+   <div id="content"><a href="/main.html">Main</a></div>
+ </body>
+ </html>
data/test/fake_pages/first_page.html ADDED
@@ -0,0 +1,20 @@
+ <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
+   "http://www.w3.org/TR/html4/strict.dtd">
+
+ <html lang="en">
+ <head>
+   <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
+   <title>untitled</title>
+   <meta name="generator" content="TextMate http://macromates.com/">
+   <meta name="author" content="Matt Pruitt">
+   <!-- Date: 2009-06-17 -->
+ </head>
+ <body>
+   <div id="header">
+     <a href="not_added.html">Not Added</a>
+   </div>
+   <div id="content">
+     <a href="http://example.com/first_child_page.html">First Child Page</a>
+   </div>
+ </body>
+ </html>
data/test/fake_pages/not_added.html ADDED
@@ -0,0 +1,20 @@
+ <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
+   "http://www.w3.org/TR/html4/strict.dtd">
+
+ <html lang="en">
+ <head>
+   <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
+   <title>untitled</title>
+   <meta name="generator" content="TextMate http://macromates.com/">
+   <meta name="author" content="Matt Pruitt">
+   <!-- Date: 2009-06-17 -->
+ </head>
+ <body>
+   <div id="header">
+     <a href="not_added.html">Not Added</a>
+   </div>
+   <div id="content">
+     <a href="first_child_page_not_added.html">First Child Page Not Added</a>
+   </div>
+ </body>
+ </html>
data/test/fake_pages/main.html ADDED
@@ -0,0 +1,20 @@
+ <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
+   "http://www.w3.org/TR/html4/strict.dtd">
+
+ <html lang="en">
+ <head>
+   <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
+   <title>untitled</title>
+   <meta name="generator" content="TextMate http://macromates.com/">
+   <meta name="author" content="Matt Pruitt">
+   <!-- Date: 2009-06-17 -->
+ </head>
+ <body>
+   <div id="header">
+     <a href="not_added.html">Not Added</a>
+   </div>
+   <div id="content">
+     <a href="first_page.html">First Page</a>
+   </div>
+ </body>
+ </html>
data/test/test_helper.rb ADDED
@@ -0,0 +1,16 @@
+ require 'rubygems'
+ require 'test/unit'
+ require 'shoulda'
+ require 'fakeweb'
+
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
+ require 'scraper'
+
+ class Test::Unit::TestCase
+   FakeWeb.register_uri(:get, "http://example.com/main.html", :file => File.join(File.dirname(__FILE__), 'fake_pages/main.html'))
+   FakeWeb.register_uri(:get, "http://example.com/first_page.html", :file => File.join(File.dirname(__FILE__), 'fake_pages/first_page.html'))
+   FakeWeb.register_uri(:get, "http://example.com/first_child_page.html", :file => File.join(File.dirname(__FILE__), 'fake_pages/first_child_page.html'))
+   FakeWeb.register_uri(:get, "http://example.com/not_added.html", :file => File.join(File.dirname(__FILE__), 'fake_pages/not_added.html'))
+ end
+
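
FakeWeb works by intercepting Net::HTTP at the library level, so the Net::HTTP.get call inside Scrape::Link#get_links receives the registered fixture files instead of touching the network. A minimal sketch of the mechanism, using an inline body rather than a fixture file (the URL is one of the fixtures registered above):

  require 'rubygems'
  require 'fakeweb'
  require 'net/http'
  require 'uri'

  FakeWeb.register_uri(:get, 'http://example.com/main.html',
                       :body => '<a href="first_page.html">First Page</a>')

  # Served from the stub; no real HTTP request is made.
  puts Net::HTTP.get(URI.parse('http://example.com/main.html'))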
data/test/test_link.rb ADDED
@@ -0,0 +1,63 @@
+ require 'test_helper'
+
+ class TestLink < Test::Unit::TestCase
+   include Scrape
+
+   context "initialization" do
+     setup do
+       @link = Link.new('http://example.com')
+     end
+
+     should "set the url" do
+       assert_equal('http://example.com', @link.url)
+     end
+
+     should "set the title" do
+       assert_equal('', @link.title)
+       assert_equal('title', Link.new('http://example.com', 'title').title)
+     end
+
+     should "set the visited flag" do
+       assert_equal(false, @link.visited)
+     end
+   end
+
+   context "scraping" do
+     setup do
+       @link = Link.new('http://example.com/main.html')
+       @results = @link.scrape!
+     end
+
+     should "set the visited flag to true" do
+       assert(@link.visited, "Link was not visited")
+     end
+
+     should "return an array of links on the page" do
+       assert_not_nil(@results)
+       assert(@results.is_a?(Array))
+       assert(@results.include?(Link.new('http://example.com/first_page.html')))
+       assert(@results.include?(Link.new('http://example.com/not_added.html')))
+     end
+   end
+
+   context "scraping inside a div" do
+     setup do
+       @link = Link.new('http://example.com/main.html')
+       @results = @link.scrape!('#content')
+     end
+
+     should "return an array of links on the page" do
+       assert_not_nil(@results)
+       assert(@results.is_a?(Array))
+       assert(@results.include?(Link.new('http://example.com/first_page.html')))
+     end
+
+     should "not return links not in the div" do
+       assert(!@results.include?(Link.new('http://example.com/not_added.html')), "Includes a link outside of the correct div.")
+     end
+   end
+
+   should "be equal to another link with the same url" do
+     assert(Link.new('http://example.com') == Link.new('http://example.com'))
+   end
+ end
data/test/test_scraper.rb ADDED
@@ -0,0 +1,32 @@
+ require 'test_helper'
+
+ class TestScraper < Test::Unit::TestCase
+   include Scrape
+
+   context "initialization" do
+     setup do
+       @scraper = Scraper.new('http://example.com')
+     end
+
+     should "set the url" do
+       assert_equal('http://example.com', @scraper.url)
+     end
+   end
+
+   context "scraping" do
+     setup do
+       @scraper = Scraper.new('http://example.com/main.html')
+       @results = @scraper.scrape('#content')
+     end
+
+     should "include links from all of the scraped pages" do
+       assert(@results.include?(Link.new('http://example.com/first_page.html')))
+       assert(@results.include?(Link.new('http://example.com/first_child_page.html')))
+       assert(@results.include?(Link.new('http://example.com/main.html')))
+     end
+
+     should "not include any links outside of the content div" do
+       assert(!@results.include?(Link.new('http://example.com/not_added.html')))
+     end
+   end
+ end
metadata ADDED
@@ -0,0 +1,79 @@
+ --- !ruby/object:Gem::Specification
+ name: guitsaru-scraper
+ version: !ruby/object:Gem::Version
+   version: 0.1.0
+ platform: ruby
+ authors:
+ - Matt Pruitt
+ autorequire:
+ bindir: bin
+ cert_chain: []
+
+ date: 2009-06-17 00:00:00 -07:00
+ default_executable:
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: hpricot
+   type: :runtime
+   version_requirement:
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 0.6.161
+     version:
+ description:
+ email: guitsaru@gmail.com
+ executables: []
+
+ extensions: []
+
+ extra_rdoc_files:
+ - LICENSE
+ - README.rdoc
+ files:
+ - .document
+ - .gitignore
+ - LICENSE
+ - README.rdoc
+ - Rakefile
+ - VERSION
+ - lib/scraper.rb
+ - lib/scraper/link.rb
+ - test/fake_pages/first_child_page.html
+ - test/fake_pages/first_page.html
+ - test/fake_pages/main.html
+ - test/fake_pages/not_added.html
+ - test/test_helper.rb
+ - test/test_link.rb
+ - test/test_scraper.rb
+ has_rdoc: false
+ homepage: http://github.com/guitsaru/scraper
+ post_install_message:
+ rdoc_options:
+ - --charset=UTF-8
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: "0"
+   version:
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: "0"
+   version:
+ requirements: []
+
+ rubyforge_project: scraper
+ rubygems_version: 1.2.0
+ signing_key:
+ specification_version: 3
+ summary: Collects all links on a webpage recursively.
+ test_files:
+ - test/test_helper.rb
+ - test/test_link.rb
+ - test/test_scraper.rb