guitsaru-scraper 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/VERSION CHANGED
@@ -1 +1 @@
- 0.1.1
+ 0.1.2
data/lib/scraper/link.rb CHANGED
@@ -7,10 +7,10 @@ module Scrape
  @visited = false
  end

- def scrape!(div=nil)
+ def scrape!(options = {})
  return [] if @visited
  @visited = true
- return get_links(div)
+ return get_links(options)
  end

  def ==(other)
@@ -27,24 +27,52 @@ module Scrape
  end

  private
- def get_links(div)
+ def get_links(options = {})
+ div = nil
+ ignore = []
+
+ if options[:div]
+ div = options[:div]
+ end
+
+ if options[:ignore]
+ ignore = options[:ignore]
+ end
+
  links = []

- doc = Hpricot(Net::HTTP.get(URI.parse(url)))
+ doc = Hpricot(Net::HTTP.get(URI.parse(@url)))
  doc.search("#{div} a").each do |link|
  url = link['href']
  if url =~ /^\/(.*)/
  components = URI::split(@url)
  url = "#{components[0] || 'http'}://#{components[2]}#{url}"
- elsif url =~ /^http:\/\//i
- url = url
+ elsif url =~ /^https?:\/\//i
+ url = url
+ elsif url =~ /file:\/\//
+ next
  elsif url =~ /^#/
  url = @url.gsub(/#.*/, '').gsub(/\/$/, '') + url
  else
  url = (File.dirname(@url) + '/' + (url || ''))
  end

- links << Link.new(url, link.inner_html)
+ # Don't add this link if it matches a pattern in ignore
+ skip = false
+ ignore.each { |pattern| skip = true if url =~ pattern }
+ skip = true if options[:domain] && !url.include?(options[:domain])
+
+ if !skip
+ new_link = Link.new(url, link.inner_html.strip)
+
+ # Don't visit anchors, visit the main page instead.
+ if url =~ /(https?:\/\/.*)#(.*$)/i
+ links << Link.new($1, $2)
+ new_link.visited = true
+ end
+
+ links << new_link
+ end
  end

  return links.uniq
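
In short, Link#scrape! now takes an options hash in place of the old positional div argument. A minimal sketch of the interface, mirroring the new tests (the URL and patterns below come from the test suite and are illustrative):

    # Scrape only links inside #content, skip query-string URLs,
    # and stay on example.com (:div, :ignore, and :domain are the
    # options handled by get_links above).
    link = Link.new('http://example.com/main.html')
    links = link.scrape!(:div => '#content', :ignore => [/\?/], :domain => 'example.com')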
data/lib/scraper.rb CHANGED
@@ -8,14 +8,20 @@ class Scraper

  attr_accessor :url

- def initialize(url)
- self.url = url
+ # Scrapes a web page, collecting all links on the page and scraping each new link.
+ # Possible options
+ # options[:div] - The container div with the links
+ # options[:domain] - The domain to collect links from, all other domains are ignored
+ # options[:ignore] - An Array of regexes. Any links matching one will be ignored.
+ def initialize(url, options = {})
+ @url = url
+ @options = options
  end

- def scrape(div=nil)
+ def scrape(options = {})
  links = [Link.new(self.url)]
- until (not_visited = links.uniq.select { |link| !link.visited}).empty?
- not_visited.each { |link| links += link.scrape!(options.merge(@options)) }
+ until (not_visited = links.uniq.select { |link| !link.visited }).empty?
+ not_visited.each { |link| links += link.scrape!(options.merge(@options)) }
  end

  return links.uniq
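
With the new constructor, per-instance defaults are stored in @options and merged over the per-call options on every scrape (instance options win on conflicting keys, since options.merge(@options) is used). A small usage sketch based on the documented options:

    # Constructor options act as defaults for every scrape call on this instance.
    scraper = Scraper.new('http://example.com/main.html', :domain => 'example.com')
    # The :div and :ignore options passed here are merged with the :domain default above.
    links = scraper.scrape(:div => '#content', :ignore => [/\?/])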
data/scraper.gemspec CHANGED
@@ -2,11 +2,11 @@

  Gem::Specification.new do |s|
  s.name = %q{scraper}
- s.version = "0.1.1"
+ s.version = "0.1.2"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["Matt Pruitt"]
- s.date = %q{2009-06-17}
+ s.date = %q{2009-06-18}
  s.email = %q{guitsaru@gmail.com}
  s.extra_rdoc_files = [
  "LICENSE",
@@ -24,6 +24,7 @@ Gem::Specification.new do |s|
  "scraper.gemspec",
  "test/fake_pages/first_child_page.html",
  "test/fake_pages/first_page.html",
+ "test/fake_pages/google.html",
  "test/fake_pages/main.html",
  "test/fake_pages/not_added.html",
  "test/test_helper.rb",
@@ -13,6 +13,7 @@
  <div id="content">
  <a href="/main.html">Main</a>
  <a href="#content">Content</a>
+ <a href="http://google.com">Google</a>
  </div>
  </body>
  </html>
@@ -14,7 +14,7 @@
  <a href="not_added.html">Not Added</a>
  </div>
  <div id="content">
- <a href="http://example.com/first_child_page.html">First Child Page</a>
+ <a href="http://example.com/first_child_page.html#content2">First Child Page</a>
  </div>
  </body>
  </html>
@@ -0,0 +1,19 @@
+ <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
+ "http://www.w3.org/TR/html4/strict.dtd">
+
+ <html lang="en">
+ <head>
+ <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
+ <title>untitled</title>
+ <meta name="generator" content="TextMate http://macromates.com/">
+ <meta name="author" content="Matt Pruitt">
+ <!-- Date: 2009-06-17 -->
+ </head>
+ <body>
+ <div id="header">
+ <a href="not_added.html">Not Added</a>
+ </div>
+ <div id="content">
+ </div>
+ </body>
+ </html>
@@ -15,6 +15,8 @@
  </div>
  <div id="content">
  <a href="first_page.html">First Page</a>
+ <a href="file://fileserver/file.pdf">A File</a>
+ <a href="main.html?action=edit">Edit</a>
  </div>
  </body>
  </html>
data/test/test_helper.rb CHANGED
@@ -12,5 +12,6 @@ class Test::Unit::TestCase
  FakeWeb.register_uri(:get, "http://example.com/first_page.html", :file => File.join(File.dirname(__FILE__), 'fake_pages/first_page.html'))
  FakeWeb.register_uri(:get, "http://example.com/first_child_page.html", :file => File.join(File.dirname(__FILE__), 'fake_pages/first_child_page.html'))
  FakeWeb.register_uri(:get, "http://example.com/not_added.html", :file => File.join(File.dirname(__FILE__), 'fake_pages/not_added.html'))
+ FakeWeb.register_uri(:get, "http://google.com", :file => File.join(File.dirname(__FILE__), 'fake_pages/google.html'))
  end

data/test/test_link.rb CHANGED
@@ -37,19 +37,40 @@ class TestLink < Test::Unit::TestCase
  assert(@results.is_a?(Array))
  assert(@results.include?(Link.new('http://example.com/first_page.html')))
  assert(@results.include?(Link.new('http://example.com/not_added.html')))
+ assert(@results.include?(Link.new('http://example.com/main.html?action=edit')))
+ assert(!@results.include?(Link.new('http://example.com/first_child_page.html/file://fileserver/file.pdf')))
  end
  end

  context "scraping inside a div" do
  setup do
  @link = Link.new('http://example.com/main.html')
- @results = @link.scrape!('#content')
+ @results = @link.scrape!(:div => '#content')
  end

  should "return an array of links on the page" do
  assert_not_nil(@results)
  assert(@results.is_a?(Array))
  assert(@results.include?(Link.new('http://example.com/first_page.html')))
+ assert(@results.include?(Link.new('http://example.com/main.html?action=edit')))
+ end
+
+ should "not return links not in the div" do
+ assert(!@results.include?(Link.new('http://example.com/not_added.html')), "Includes a link outside of the correct div.")
+ end
+ end
+
+ context "scraping with ignore options" do
+ setup do
+ @link = Link.new('http://example.com/main.html')
+ @results = @link.scrape!(:div => '#content', :ignore => [/\?/])
+ end
+
+ should "return an array of links on the page" do
+ assert_not_nil(@results)
+ assert(@results.is_a?(Array))
+ assert(@results.include?(Link.new('http://example.com/first_page.html')))
+ assert(!@results.include?(Link.new('http://example.com/main.html?action=edit')))
  end

  should "not return links not in the div" do
data/test/test_scraper.rb CHANGED
@@ -16,18 +16,43 @@ class TestScraper < Test::Unit::TestCase
  context "scraping" do
  setup do
  @scraper = Scraper.new('http://example.com/main.html')
- @results = @scraper.scrape('#content')
+ @results = @scraper.scrape(:div => '#content')
  end

  should "Include a list of links on the pages." do
  assert(@results.include?(Link.new('http://example.com/first_page.html')))
  assert(@results.include?(Link.new('http://example.com/first_child_page.html')))
  assert(@results.include?(Link.new('http://example.com/first_child_page.html#content')))
+ assert(@results.include?(Link.new('http://example.com/first_child_page.html#content2')))
  assert(@results.include?(Link.new('http://example.com/main.html')))
+ assert(@results.include?(Link.new('http://google.com')))
  end

  should "Not include any links outside of the content div" do
  assert(!@results.include?(Link.new('http://example.com/not_added.html')))
  end
  end
+
+ context "scraping within domain" do
+ setup do
+ @scraper = Scraper.new('http://example.com/main.html', :domain => 'example.com')
+ @results = @scraper.scrape(:div => '#content')
+ end
+
+ should "Include a list of links on the pages." do
+ assert(@results.include?(Link.new('http://example.com/first_page.html')))
+ assert(@results.include?(Link.new('http://example.com/first_child_page.html')))
+ assert(@results.include?(Link.new('http://example.com/first_child_page.html#content')))
+ assert(@results.include?(Link.new('http://example.com/first_child_page.html#content2')))
+ assert(@results.include?(Link.new('http://example.com/main.html')))
+ end
+
+ should "Not include any links outside of the content div" do
+ assert(!@results.include?(Link.new('http://example.com/not_added.html')))
+ end
+
+ should "Not include any links outside of the domain" do
+ assert(!@results.include?(Link.new('http://google.com')))
+ end
+ end
  end
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: guitsaru-scraper
  version: !ruby/object:Gem::Version
- version: 0.1.1
+ version: 0.1.2
  platform: ruby
  authors:
  - Matt Pruitt
@@ -9,7 +9,7 @@ autorequire:
  bindir: bin
  cert_chain: []

- date: 2009-06-17 00:00:00 -07:00
+ date: 2009-06-18 00:00:00 -07:00
  default_executable:
  dependencies:
  - !ruby/object:Gem::Dependency
@@ -43,6 +43,7 @@ files:
  - scraper.gemspec
  - test/fake_pages/first_child_page.html
  - test/fake_pages/first_page.html
+ - test/fake_pages/google.html
  - test/fake_pages/main.html
  - test/fake_pages/not_added.html
  - test/test_helper.rb