guitsaru-scraper 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/VERSION +1 -1
- data/lib/scraper/link.rb +11 -1
- data/scraper.gemspec +57 -0
- data/test/fake_pages/first_child_page.html +4 -1
- data/test/test_scraper.rb +1 -0
- metadata +2 -1
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.0
|
1
|
+
0.1.1
|
data/lib/scraper/link.rb
CHANGED
@@ -18,6 +18,14 @@ module Scrape
|
|
18
18
|
@url == other.url
|
19
19
|
end
|
20
20
|
|
21
|
+
def eql?(other)
|
22
|
+
return self == other
|
23
|
+
end
|
24
|
+
|
25
|
+
def hash
|
26
|
+
@url.hash
|
27
|
+
end
|
28
|
+
|
21
29
|
private
|
22
30
|
def get_links(div)
|
23
31
|
links = []
|
@@ -27,9 +35,11 @@ module Scrape
|
|
27
35
|
url = link['href']
|
28
36
|
if url =~ /^\/(.*)/
|
29
37
|
components = URI::split(@url)
|
30
|
-
url = "#{components[0] || 'http'}://#{components[2]}
|
38
|
+
url = "#{components[0] || 'http'}://#{components[2]}#{url}"
|
31
39
|
elsif url =~ /^http:\/\//i
|
32
40
|
url = url
|
41
|
+
elsif url =~ /^#/
|
42
|
+
url = @url.gsub(/#.*/, '').gsub(/\/$/, '') + url
|
33
43
|
else
|
34
44
|
url = (File.dirname(@url) + '/' + (url || ''))
|
35
45
|
end
|
data/scraper.gemspec
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
Gem::Specification.new do |s|
|
4
|
+
s.name = %q{scraper}
|
5
|
+
s.version = "0.1.1"
|
6
|
+
|
7
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
8
|
+
s.authors = ["Matt Pruitt"]
|
9
|
+
s.date = %q{2009-06-17}
|
10
|
+
s.email = %q{guitsaru@gmail.com}
|
11
|
+
s.extra_rdoc_files = [
|
12
|
+
"LICENSE",
|
13
|
+
"README.rdoc"
|
14
|
+
]
|
15
|
+
s.files = [
|
16
|
+
".document",
|
17
|
+
".gitignore",
|
18
|
+
"LICENSE",
|
19
|
+
"README.rdoc",
|
20
|
+
"Rakefile",
|
21
|
+
"VERSION",
|
22
|
+
"lib/scraper.rb",
|
23
|
+
"lib/scraper/link.rb",
|
24
|
+
"scraper.gemspec",
|
25
|
+
"test/fake_pages/first_child_page.html",
|
26
|
+
"test/fake_pages/first_page.html",
|
27
|
+
"test/fake_pages/main.html",
|
28
|
+
"test/fake_pages/not_added.html",
|
29
|
+
"test/test_helper.rb",
|
30
|
+
"test/test_link.rb",
|
31
|
+
"test/test_scraper.rb"
|
32
|
+
]
|
33
|
+
s.homepage = %q{http://github.com/guitsaru/scraper}
|
34
|
+
s.rdoc_options = ["--charset=UTF-8"]
|
35
|
+
s.require_paths = ["lib"]
|
36
|
+
s.rubyforge_project = %q{scraper}
|
37
|
+
s.rubygems_version = %q{1.3.4}
|
38
|
+
s.summary = %q{Collects all links on a webpage recursively.}
|
39
|
+
s.test_files = [
|
40
|
+
"test/test_helper.rb",
|
41
|
+
"test/test_link.rb",
|
42
|
+
"test/test_scraper.rb"
|
43
|
+
]
|
44
|
+
|
45
|
+
if s.respond_to? :specification_version then
|
46
|
+
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
47
|
+
s.specification_version = 3
|
48
|
+
|
49
|
+
if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
|
50
|
+
s.add_runtime_dependency(%q<hpricot>, [">= 0.6.161"])
|
51
|
+
else
|
52
|
+
s.add_dependency(%q<hpricot>, [">= 0.6.161"])
|
53
|
+
end
|
54
|
+
else
|
55
|
+
s.add_dependency(%q<hpricot>, [">= 0.6.161"])
|
56
|
+
end
|
57
|
+
end
|
data/test/test_scraper.rb
CHANGED
@@ -22,6 +22,7 @@ class TestScraper < Test::Unit::TestCase
|
|
22
22
|
should "Include a list of links on the pages." do
|
23
23
|
assert(@results.include?(Link.new('http://example.com/first_page.html')))
|
24
24
|
assert(@results.include?(Link.new('http://example.com/first_child_page.html')))
|
25
|
+
assert(@results.include?(Link.new('http://example.com/first_child_page.html#content')))
|
25
26
|
assert(@results.include?(Link.new('http://example.com/main.html')))
|
26
27
|
end
|
27
28
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: guitsaru-scraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Matt Pruitt
|
@@ -40,6 +40,7 @@ files:
|
|
40
40
|
- VERSION
|
41
41
|
- lib/scraper.rb
|
42
42
|
- lib/scraper/link.rb
|
43
|
+
- scraper.gemspec
|
43
44
|
- test/fake_pages/first_child_page.html
|
44
45
|
- test/fake_pages/first_page.html
|
45
46
|
- test/fake_pages/main.html
|