redback 0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/bin/redback +5 -0
- data/lib/redback.rb +171 -0
- metadata +46 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
---
SHA1:
  metadata.gz: ad5bb571eee56b563c4d85fc2fd00342d25c8a16
  data.tar.gz: 451b6ab74b8039579180da7cd1f423ae5c54909d
SHA512:
  metadata.gz: 0df94ad9c9cd34bd4e735a0fe25f9750d71762538f7a4c1a1e27e5e491b2e1cd8f6a5827cedc660231a5e254a07b86afad626b91cb26abe740af19875a0bd0b9
  data.tar.gz: c3b5f59eb773157c74666e694dda602e2a7fc6ec1664f0957397ee654461bec54b4d7a13058a06b2ab439c67a27f2b493b88a245d86290075dfa6f438dc03107
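These digests cover the two archives packed inside the .gem file, and a client can recompute and compare them. A minimal verification sketch in Ruby, assuming data.tar.gz has already been unpacked from the .gem into the current directory (the path is illustrative, not part of the package):

require 'digest'

# Hypothetical local copy of the archive unpacked from redback-0.1.gem.
archive  = 'data.tar.gz'
expected = 'c3b5f59eb773157c74666e694dda602e2a7fc6ec1664f0957397ee654461bec54b4d7a13058a06b2ab439c67a27f2b493b88a245d86290075dfa6f438dc03107'

# Recompute the SHA512 of the local file and compare against the published value.
actual = Digest::SHA512.file(archive).hexdigest
puts(actual == expected ? 'checksum OK' : 'checksum mismatch')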
data/bin/redback
ADDED
data/lib/redback.rb
ADDED
@@ -0,0 +1,171 @@
require 'rubygems'
require 'hpricot'
require 'net/http'
require 'parallel'

class Redback

  def initialize(url, &each_site)
    if url =~ /^(([a-zA-Z]|[a-zA-Z][a-zA-Z0-9\-]*[a-zA-Z0-9])\.)*([A-Za-z]|[A-Za-z][A-Za-z0-9\-]*[A-Za-z0-9])$/
      url = 'http://' + url
    end

    @uri = URI.parse(url)

    @pages_hit = 0

    @visited  = []
    @to_visit = []

    @each_site = each_site

    @options = {
      :ignore_hash         => true,
      :ignore_query_string => false,
      :search_in_comments  => false,
      :threads             => 4,
      :num_pages           => 1000
    }

    crawl_page(url)
    spider
  end

  def queue_link(url)
    @to_visit << url
  end

  def crawl_page(url, limit = 10)
    # Don't crawl a page twice
    return if @visited.include? url

    # Let's not hit this again
    @visited << url

    begin
      uri = URI.parse(URI.encode(url.to_s.strip))
    rescue
      return
    end

    headers = {
      "User-Agent" => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.43 Safari/537.31",
      "Accept-Charset" => "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
      "Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
    }

    begin
      req = Net::HTTP::Get.new(uri.path, headers)
      response = Net::HTTP.start(uri.host, uri.port) { |http| http.request(req) }

      case response
      when Net::HTTPRedirection
        return crawl_page(response['location'], limit - 1)
      when Net::HTTPSuccess
        doc = Hpricot(response.body)
      end
    rescue
      return
    end

    @pages_hit += 1

    @each_site.call url

    find_links(doc, url) do |link|
      next if @visited.include? link
      next if @to_visit.include? link

      @to_visit << link
    end
  end

  def find_links(doc, url)
    return unless doc.respond_to? 'search'

    begin
      uri = URI.parse(URI.encode(url.to_s.strip))
    rescue
      return
    end

    hrefs = []

    # Looks like a valid document! Let's parse it for links
    doc.search("//a[@href]").each do |e|
      hrefs << e.get_attribute("href")
    end

    if @options[:search_in_comments]
      # Let's also look for commented-out URIs
      doc.search("//comment()").each do |e|
        e.to_html.scan(/https?:\/\/[^\s\"]*/) { |url| hrefs << url; }
      end
    end

    hrefs.each do |href|
      # Skip mailto links
      next if href =~ /^mailto:/

      # If we're dealing with a host-relative URL (e.g. <img src="/foo/bar.jpg">), absolutify it.
      if href.to_s =~ /^\//
        href = uri.scheme + "://" + uri.host + href.to_s
      end

      # If we're dealing with a path-relative URL, make it relative to the current directory.
      unless href.to_s =~ /[a-z]+:\/\//
        # Take everything up to the final / in the path to be the current directory.
        if uri.path =~ /\//
          /^(.*)\//.match(uri.path)
          path = $1
        # If we're on the homepage, then we don't need a path.
        else
          path = ""
        end

        href = uri.scheme + "://" + uri.host + path + "/" + href.to_s
      end

      # At this point, we should have an absolute URL regardless of
      # its original format.

      # Strip hash links
      if ( @options[:ignore_hash] )
        href.gsub!(/(#.*?)$/, '')
      end

      # Strip query strings
      if ( @options[:ignore_query_string] )
        href.gsub!(/(\?.*?)$/, '')
      end

      begin
        href_uri = URI.parse(href)
      rescue
        # No harm in this — if we can't parse it as a URI, it probably isn't one (`javascript:` links, etc.) and we can safely ignore it.
        next
      end

      next if href_uri.host != uri.host
      next unless href_uri.scheme =~ /^https?$/

      yield href
    end
  end

  def spider(&block)
    Parallel.in_threads(@options[:threads]) { |thread_number|
      # We've crawled too many pages
      next if @pages_hit > @options[:num_pages] && @options[:num_pages] >= 0

      while @to_visit.length > 0 do
        begin
          url = @to_visit.pop
        end while ( @visited.include? url )

        crawl_page(url, block)
      end
    }
  end
end
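The five lines of data/bin/redback are not shown in this diff, but the library-level entry point is clear from the constructor above: Redback.new takes a start URL and a block, begins crawling immediately from initialize, and calls the block once for each unique same-host page it fetches. A minimal usage sketch (the target URL is illustrative):

require 'redback'

# Collect each unique same-host URL as Redback reports it.
found = []
Redback.new("http://example.com") do |page_url|
  found << page_url
end

puts found.length
puts found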
metadata
ADDED
@@ -0,0 +1,46 @@
--- !ruby/object:Gem::Specification
name: redback
version: !ruby/object:Gem::Version
  version: '0.1'
platform: ruby
authors:
- Rob Miller
autorequire:
bindir: bin
cert_chain: []
date: 2013-04-12 00:00:00.000000000 Z
dependencies: []
description: Fetches a URL you give it and recursively searches for all URLs it can
  find, building up a list of unique URLs on the same hostname.
email: rob@bigfish.co.uk
executables:
- redback
extensions: []
extra_rdoc_files: []
files:
- bin/redback
- lib/redback.rb
homepage: https://github.com/robmiller/redback
licenses: []
metadata: {}
post_install_message:
rdoc_options: []
require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
  - - '>='
    - !ruby/object:Gem::Version
      version: '0'
required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - - '>='
    - !ruby/object:Gem::Version
      version: '0'
requirements: []
rubyforge_project:
rubygems_version: 2.0.3
signing_key:
specification_version: 4
summary: Spiders a website, pulling out a list of unique URLs.
test_files: []
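One detail worth noting: dependencies: [] means the gemspec declares no runtime dependencies, even though lib/redback.rb requires hpricot and parallel. With this metadata, those gems have to be installed alongside redback by hand; a sketch of a consuming Gemfile (the pinning and source are illustrative, not part of the package):

source 'https://rubygems.org'

gem 'redback', '0.1'
# Needed by lib/redback.rb but not declared in this gemspec:
gem 'hpricot'
gem 'parallel'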