redback 0.1
- checksums.yaml +7 -0
- data/bin/redback +5 -0
- data/lib/redback.rb +171 -0
- metadata +46 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: ad5bb571eee56b563c4d85fc2fd00342d25c8a16
+  data.tar.gz: 451b6ab74b8039579180da7cd1f423ae5c54909d
+SHA512:
+  metadata.gz: 0df94ad9c9cd34bd4e735a0fe25f9750d71762538f7a4c1a1e27e5e491b2e1cd8f6a5827cedc660231a5e254a07b86afad626b91cb26abe740af19875a0bd0b9
+  data.tar.gz: c3b5f59eb773157c74666e694dda602e2a7fc6ec1664f0957397ee654461bec54b4d7a13058a06b2ab439c67a27f2b493b88a245d86290075dfa6f438dc03107
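These digests cover the two archives packed inside the .gem file. As a rough illustration (not part of the gem itself), they could be checked from Ruby after unpacking the package; the sketch below assumes redback-0.1.gem has been extracted with `tar -xf redback-0.1.gem`, which leaves metadata.gz, data.tar.gz and checksums.yaml.gz in the current directory.

    require 'digest'
    require 'yaml'
    require 'zlib'

    # checksums.yaml ships gzipped inside the package.
    expected = nil
    Zlib::GzipReader.open('checksums.yaml.gz') do |gz|
      expected = YAML.safe_load(gz.read)
    end

    # Compare each archive's SHA512 digest against the recorded value.
    %w[metadata.gz data.tar.gz].each do |file|
      actual = Digest::SHA512.file(file).hexdigest
      puts "#{file}: #{actual == expected['SHA512'][file] ? 'OK' : 'MISMATCH'}"
    end

The SHA1 entries can be checked the same way with Digest::SHA1.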
data/bin/redback
ADDED
data/lib/redback.rb
ADDED
@@ -0,0 +1,171 @@
+require 'rubygems'
+require 'hpricot'
+require 'net/http'
+require 'parallel'
+
+class Redback
+
+  def initialize(url, &each_site)
+    if url =~ /^(([a-zA-Z]|[a-zA-Z][a-zA-Z0-9\-]*[a-zA-Z0-9])\.)*([A-Za-z]|[A-Za-z][A-Za-z0-9\-]*[A-Za-z0-9])$/
+      url = 'http://' + url
+    end
+
+    @uri = URI.parse(url)
+
+    @pages_hit = 0
+
+    @visited = []
+    @to_visit = []
+
+    @each_site = each_site
+
+    @options = {
+      :ignore_hash => true,
+      :ignore_query_string => false,
+      :search_in_comments => false,
+      :threads => 4,
+      :num_pages => 1000
+    }
+
+    crawl_page(url)
+    spider
+  end
+
+  def queue_link(url)
+    @to_visit << url
+  end
+
+  def crawl_page(url, limit = 10)
+    # Don't crawl a page twice
+    return if @visited.include? url
+
+    # Let's not hit this again
+    @visited << url
+
+    begin
+      uri = URI.parse(URI.encode(url.to_s.strip))
+    rescue
+      return
+    end
+
+    headers = {
+      "User-Agent" => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.43 Safari/537.31",
+      "Accept-Charset" => "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
+      "Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
+    }
+
+    begin
+      req = Net::HTTP::Get.new(uri.path, headers)
+      response = Net::HTTP.start(uri.host, uri.port) { |http| http.request(req) }
+
+      case response
+      when Net::HTTPRedirection
+        return crawl_page(response['location'], limit - 1)
+      when Net::HTTPSuccess
+        doc = Hpricot(response.body)
+      end
+    rescue
+      return
+    end
+
+    @pages_hit += 1
+
+    @each_site.call url
+
+    find_links(doc, url) do |link|
+      next if @visited.include? link
+      next if @to_visit.include? link
+
+      @to_visit << link
+    end
+  end
+
+  def find_links(doc, url)
+    return unless doc.respond_to? 'search'
+
+    begin
+      uri = URI.parse(URI.encode(url.to_s.strip))
+    rescue
+      return
+    end
+
+    hrefs = []
+
+    # Looks like a valid document! Let's parse it for links
+    doc.search("//a[@href]").each do |e|
+      hrefs << e.get_attribute("href")
+    end
+
+    if @options[:search_in_comments]
+      # Let's also look for commented-out URIs
+      doc.search("//comment()").each do |e|
+        e.to_html.scan(/https?:\/\/[^\s\"]*/) { |url| hrefs << url; }
+      end
+    end
+
+    hrefs.each do |href|
+      # Skip mailto links
+      next if href =~ /^mailto:/
+
+      # If we're dealing with a host-relative URL (e.g. <img src="/foo/bar.jpg">), absolutify it.
+      if href.to_s =~ /^\//
+        href = uri.scheme + "://" + uri.host + href.to_s
+      end
+
+      # If we're dealing with a path-relative URL, make it relative to the current directory.
+      unless href.to_s =~ /[a-z]+:\/\//
+        # Take everything up to the final / in the path to be the current directory.
+        if uri.path =~ /\//
+          /^(.*)\//.match(uri.path)
+          path = $1
+        # If we're on the homepage, then we don't need a path.
+        else
+          path = ""
+        end
+
+        href = uri.scheme + "://" + uri.host + path + "/" + href.to_s
+      end
+
+      # At this point, we should have an absolute URL regardless of
+      # its original format.
+
+      # Strip hash links
+      if ( @options[:ignore_hash] )
+        href.gsub!(/(#.*?)$/, '')
+      end
+
+      # Strip query strings
+      if ( @options[:ignore_query_string] )
+        href.gsub!(/(\?.*?)$/, '')
+      end
+
+      begin
+        href_uri = URI.parse(href)
+      rescue
+        # No harm in this -- if we can't parse it as a URI, it probably isn't one (`javascript:` links, etc.) and we can safely ignore it.
+        next
+      end
+
+      next if href_uri.host != uri.host
+      next unless href_uri.scheme =~ /^https?$/
+
+      yield href
+    end
+  end
+
+  def spider(&block)
+    Parallel.in_threads(@options[:threads]) { |thread_number|
+      # We've crawled too many pages
+      next if @pages_hit > @options[:num_pages] && @options[:num_pages] >= 0
+
+      while @to_visit.length > 0 do
+        begin
+          url = @to_visit.pop
+        end while ( @visited.include? url )
+
+        crawl_page(url, block)
+      end
+    }
+  end
+end
+
metadata
ADDED
@@ -0,0 +1,46 @@
+--- !ruby/object:Gem::Specification
+name: redback
+version: !ruby/object:Gem::Version
+  version: '0.1'
+platform: ruby
+authors:
+- Rob Miller
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2013-04-12 00:00:00.000000000 Z
+dependencies: []
+description: Fetches a URL you give it and recursively searches for all URLs it can
+  find, building up a list of unique URLs on the same hostname.
+email: rob@bigfish.co.uk
+executables:
+- redback
+extensions: []
+extra_rdoc_files: []
+files:
+- bin/redback
+- lib/redback.rb
+homepage: https://github.com/robmiller/redback
+licenses: []
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.0.3
+signing_key:
+specification_version: 4
+summary: Spiders a website, pulling out a list of unique URLs.
+test_files: []
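For context, a minimal usage sketch of the library (not shipped with the gem; the target URL is illustrative). Redback#initialize in lib/redback.rb above takes a URL and a block, and the block is called once for each unique same-host page the spider manages to fetch:

    require 'redback'

    found = []

    # The block runs for every unique URL successfully fetched on the same host.
    Redback.new("http://example.com") do |url|
      found << url
      puts url
    end

    puts "#{found.length} unique URLs found"

Note that the crawl happens inside the constructor itself (initialize calls crawl_page and then spider), so Redback.new only returns once the spider has finished or reached the :num_pages limit.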