guitsaru-scraper 0.1.1 → 0.1.2

data/VERSION CHANGED
@@ -1 +1 @@
-0.1.1
+0.1.2
data/lib/scraper/link.rb CHANGED
@@ -7,10 +7,10 @@ module Scrape
     @visited = false
   end
 
-  def scrape!(div=nil)
+  def scrape!(options = {})
     return [] if @visited
     @visited = true
-    return get_links(div)
+    return get_links(options)
   end
 
   def ==(other)
@@ -27,24 +27,52 @@ module Scrape
   end
 
   private
-  def get_links(div)
+  def get_links(options = {})
+    div = nil
+    ignore = []
+
+    if options[:div]
+      div = options[:div]
+    end
+
+    if options[:ignore]
+      ignore = options[:ignore]
+    end
+
     links = []
 
-    doc = Hpricot(Net::HTTP.get(URI.parse(url)))
+    doc = Hpricot(Net::HTTP.get(URI.parse(@url)))
     doc.search("#{div} a").each do |link|
       url = link['href']
       if url =~ /^\/(.*)/
         components = URI::split(@url)
         url = "#{components[0] || 'http'}://#{components[2]}#{url}"
-      elsif url =~ /^http:\/\//i
-        url = url
+      elsif url =~ /^https?:\/\//i
+        url = url
+      elsif url =~ /file:\/\//
+        next
      elsif url =~ /^#/
        url = @url.gsub(/#.*/, '').gsub(/\/$/, '') + url
      else
        url = (File.dirname(@url) + '/' + (url || ''))
      end
 
-      links << Link.new(url, link.inner_html)
+      # Don't add this link if it matches a pattern in ignore
+      skip = false
+      ignore.each { |pattern| skip = true if url =~ pattern }
+      skip = true if options[:domain] && !url.include?(options[:domain])
+
+      if !skip
+        new_link = Link.new(url, link.inner_html.strip)
+
+        # Don't visit anchors, visit the main page instead.
+        if url =~ /(https?:\/\/.*)#(.*$)/i
+          links << Link.new($1, $2)
+          new_link.visited = true
+        end
+
+        links << new_link
+      end
    end
 
    return links.uniq
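
With this change, Link#scrape! takes an options hash instead of a bare div selector: :div limits link collection to a container, :ignore drops URLs matching any of the given regexes, and :domain drops URLs outside a domain. A minimal usage sketch based on the diff above and the tests further down (the URL is one of the test fixtures):

    link = Link.new('http://example.com/main.html')

    # Collect only links inside the #content div, skipping any URL that matches /\?/.
    links = link.scrape!(:div => '#content', :ignore => [/\?/])

    # A second call returns [] because the link has already been marked visited.
    link.scrape!(:div => '#content')  # => []
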
data/lib/scraper.rb CHANGED
@@ -8,14 +8,20 @@ class Scraper
 
   attr_accessor :url
 
-  def initialize(url)
-    self.url = url
+  # Scrapes a web page, collecting all links on the page and scraping each new link.
+  # Possible options
+  #   options[:div] - The container div with the links
+  #   options[:domain] - The domain to collect links from, all other domains are ignored
+  #   options[:ignore] - An Array of regexes. Any links matching one will be ignored.
+  def initialize(url, options = {})
+    @url = url
+    @options = options
  end
 
-  def scrape(div=nil)
+  def scrape(options = {})
    links = [Link.new(self.url)]
-    until (not_visited = links.uniq.select { |link| !link.visited}).empty?
-      not_visited.each { |link| links += link.scrape!(div) }
+    until (not_visited = links.uniq.select { |link| !link.visited }).empty?
+      not_visited.each { |link| links += link.scrape!(options.merge(@options)) }
    end
 
    return links.uniq
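
Scraper now accepts the same options at construction time, and each Link is scraped with options.merge(@options), so options given to Scraper.new override those passed to scrape. A minimal sketch based on the tests further down:

    # Crawl the site, following only links inside the #content div and staying on example.com.
    scraper = Scraper.new('http://example.com/main.html', :domain => 'example.com')
    links = scraper.scrape(:div => '#content')

    # Links outside the :domain option are never collected.
    links.include?(Link.new('http://google.com'))  # => false
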
data/scraper.gemspec CHANGED
@@ -2,11 +2,11 @@
 
 Gem::Specification.new do |s|
   s.name = %q{scraper}
-  s.version = "0.1.1"
+  s.version = "0.1.2"
 
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Matt Pruitt"]
-  s.date = %q{2009-06-17}
+  s.date = %q{2009-06-18}
   s.email = %q{guitsaru@gmail.com}
   s.extra_rdoc_files = [
     "LICENSE",
@@ -24,6 +24,7 @@ Gem::Specification.new do |s|
     "scraper.gemspec",
     "test/fake_pages/first_child_page.html",
     "test/fake_pages/first_page.html",
+    "test/fake_pages/google.html",
     "test/fake_pages/main.html",
     "test/fake_pages/not_added.html",
     "test/test_helper.rb",
data/test/fake_pages/first_child_page.html CHANGED
@@ -13,6 +13,7 @@
   <div id="content">
     <a href="/main.html">Main</a>
     <a href="#content">Content</a>
+    <a href="http://google.com">Google</a>
   </div>
 </body>
 </html>
data/test/fake_pages/first_page.html CHANGED
@@ -14,7 +14,7 @@
     <a href="not_added.html">Not Added</a>
   </div>
   <div id="content">
-    <a href="http://example.com/first_child_page.html">First Child Page</a>
+    <a href="http://example.com/first_child_page.html#content2">First Child Page</a>
   </div>
 </body>
 </html>
data/test/fake_pages/google.html ADDED
@@ -0,0 +1,19 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
+  "http://www.w3.org/TR/html4/strict.dtd">
+
+<html lang="en">
+<head>
+  <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
+  <title>untitled</title>
+  <meta name="generator" content="TextMate http://macromates.com/">
+  <meta name="author" content="Matt Pruitt">
+  <!-- Date: 2009-06-17 -->
+</head>
+<body>
+  <div id="header">
+    <a href="not_added.html">Not Added</a>
+  </div>
+  <div id="content">
+  </div>
+</body>
+</html>
data/test/fake_pages/main.html CHANGED
@@ -15,6 +15,8 @@
   </div>
   <div id="content">
     <a href="first_page.html">First Page</a>
+    <a href="file://fileserver/file.pdf">A File</a>
+    <a href="main.html?action=edit">Edit</a>
   </div>
 </body>
 </html>
data/test/test_helper.rb CHANGED
@@ -12,5 +12,6 @@ class Test::Unit::TestCase
     FakeWeb.register_uri(:get, "http://example.com/first_page.html", :file => File.join(File.dirname(__FILE__), 'fake_pages/first_page.html'))
     FakeWeb.register_uri(:get, "http://example.com/first_child_page.html", :file => File.join(File.dirname(__FILE__), 'fake_pages/first_child_page.html'))
     FakeWeb.register_uri(:get, "http://example.com/not_added.html", :file => File.join(File.dirname(__FILE__), 'fake_pages/not_added.html'))
+    FakeWeb.register_uri(:get, "http://google.com", :file => File.join(File.dirname(__FILE__), 'fake_pages/google.html'))
   end
 
data/test/test_link.rb CHANGED
@@ -37,19 +37,40 @@ class TestLink < Test::Unit::TestCase
       assert(@results.is_a?(Array))
       assert(@results.include?(Link.new('http://example.com/first_page.html')))
       assert(@results.include?(Link.new('http://example.com/not_added.html')))
+      assert(@results.include?(Link.new('http://example.com/main.html?action=edit')))
+      assert(!@results.include?(Link.new('http://example.com/first_child_page.html/file://fileserver/file.pdf')))
     end
   end
 
   context "scraping inside a div" do
     setup do
       @link = Link.new('http://example.com/main.html')
-      @results = @link.scrape!('#content')
+      @results = @link.scrape!(:div => '#content')
     end
 
     should "return an array of links on the page" do
       assert_not_nil(@results)
       assert(@results.is_a?(Array))
       assert(@results.include?(Link.new('http://example.com/first_page.html')))
+      assert(@results.include?(Link.new('http://example.com/main.html?action=edit')))
+    end
+
+    should "not return links not in the div" do
+      assert(!@results.include?(Link.new('http://example.com/not_added.html')), "Includes a link outside of the correct div.")
+    end
+  end
+
+  context "scraping with ignore options" do
+    setup do
+      @link = Link.new('http://example.com/main.html')
+      @results = @link.scrape!(:div => '#content', :ignore => [/\?/])
+    end
+
+    should "return an array of links on the page" do
+      assert_not_nil(@results)
+      assert(@results.is_a?(Array))
+      assert(@results.include?(Link.new('http://example.com/first_page.html')))
+      assert(!@results.include?(Link.new('http://example.com/main.html?action=edit')))
     end
 
     should "not return links not in the div" do
data/test/test_scraper.rb CHANGED
@@ -16,18 +16,43 @@ class TestScraper < Test::Unit::TestCase
   context "scraping" do
     setup do
       @scraper = Scraper.new('http://example.com/main.html')
-      @results = @scraper.scrape('#content')
+      @results = @scraper.scrape(:div => '#content')
     end
 
     should "Include a list of links on the pages." do
       assert(@results.include?(Link.new('http://example.com/first_page.html')))
       assert(@results.include?(Link.new('http://example.com/first_child_page.html')))
       assert(@results.include?(Link.new('http://example.com/first_child_page.html#content')))
+      assert(@results.include?(Link.new('http://example.com/first_child_page.html#content2')))
       assert(@results.include?(Link.new('http://example.com/main.html')))
+      assert(@results.include?(Link.new('http://google.com')))
     end
 
     should "Not include any links outside of the content div" do
       assert(!@results.include?(Link.new('http://example.com/not_added.html')))
     end
   end
+
+  context "scraping within domain" do
+    setup do
+      @scraper = Scraper.new('http://example.com/main.html', :domain => 'example.com')
+      @results = @scraper.scrape(:div => '#content')
+    end
+
+    should "Include a list of links on the pages." do
+      assert(@results.include?(Link.new('http://example.com/first_page.html')))
+      assert(@results.include?(Link.new('http://example.com/first_child_page.html')))
+      assert(@results.include?(Link.new('http://example.com/first_child_page.html#content')))
+      assert(@results.include?(Link.new('http://example.com/first_child_page.html#content2')))
+      assert(@results.include?(Link.new('http://example.com/main.html')))
+    end
+
+    should "Not include any links outside of the content div" do
+      assert(!@results.include?(Link.new('http://example.com/not_added.html')))
+    end
+
+    should "Not include any links outside of the domain" do
+      assert(!@results.include?(Link.new('http://google.com')))
+    end
+  end
 end
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: guitsaru-scraper
 version: !ruby/object:Gem::Version
-  version: 0.1.1
+  version: 0.1.2
 platform: ruby
 authors:
 - Matt Pruitt
@@ -9,7 +9,7 @@ autorequire:
 bindir: bin
 cert_chain: []
 
-date: 2009-06-17 00:00:00 -07:00
+date: 2009-06-18 00:00:00 -07:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -43,6 +43,7 @@ files:
 - scraper.gemspec
 - test/fake_pages/first_child_page.html
 - test/fake_pages/first_page.html
+- test/fake_pages/google.html
 - test/fake_pages/main.html
 - test/fake_pages/not_added.html
 - test/test_helper.rb