redback 0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (4) hide show
  1. checksums.yaml +7 -0
  2. data/bin/redback +5 -0
  3. data/lib/redback.rb +171 -0
  4. metadata +46 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: ad5bb571eee56b563c4d85fc2fd00342d25c8a16
4
+ data.tar.gz: 451b6ab74b8039579180da7cd1f423ae5c54909d
5
+ SHA512:
6
+ metadata.gz: 0df94ad9c9cd34bd4e735a0fe25f9750d71762538f7a4c1a1e27e5e491b2e1cd8f6a5827cedc660231a5e254a07b86afad626b91cb26abe740af19875a0bd0b9
7
+ data.tar.gz: c3b5f59eb773157c74666e694dda602e2a7fc6ec1664f0957397ee654461bec54b4d7a13058a06b2ab439c67a27f2b493b88a245d86290075dfa6f438dc03107
data/bin/redback ADDED
@@ -0,0 +1,5 @@
1
#!/usr/bin/env ruby

require_relative '../lib/redback.rb'

# Command-line entry point: crawl the site named by the first argument and
# print each URL the crawler reports, one per line.
abort 'Usage: redback <url-or-hostname>' if ARGV.empty?

# The argument must be parenthesised. Written as `Redback.new ARGV[0] { ... }`
# the brace block binds to `ARGV[0]` instead of `Redback.new`, so the crawler
# would never receive the callback.
Redback.new(ARGV[0]) { |url| puts url }
data/lib/redback.rb ADDED
@@ -0,0 +1,171 @@
1
+ require 'rubygems'
2
+ require 'hpricot'
3
+ require 'net/http'
4
+ require 'parallel'
5
+
6
# Crawls a website starting from one URL and reports every page it fetches.
#
# Constructing an instance starts the crawl straight away: the start page is
# fetched first, then the queued links are worked through by a pool of
# threads. Each URL that was fetched is passed to the block given to +new+.
# Only links whose host matches the host of the page they were found on are
# queued.
class Redback
  # A bare hostname such as "example.com". Anchored with \A/\z (rather than
  # ^/$) so that only a string consisting entirely of a hostname matches.
  HOSTNAME = /\A(([a-zA-Z]|[a-zA-Z][a-zA-Z0-9\-]*[a-zA-Z0-9])\.)*([A-Za-z]|[A-Za-z][A-Za-z0-9\-]*[A-Za-z0-9])\z/

  # Headers sent with every request.
  REQUEST_HEADERS = {
    "User-Agent" => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.43 Safari/537.31",
    "Accept-Charset" => "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
    "Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
  }.freeze

  # Parser used to percent-escape URLs that do not parse as written.
  # URI.encode/URI.escape no longer exist in Ruby 3, so go through a parser
  # object, preferring the RFC 2396 one when the constant is defined.
  URI_ESCAPER = defined?(URI::RFC2396_PARSER) ? URI::RFC2396_PARSER : URI::DEFAULT_PARSER

  # Seconds an idle worker sleeps while other workers are still fetching
  # pages that may add links to the queue.
  IDLE_WAIT = 0.05

  # url       - absolute URL, or a bare hostname (which gets "http://" prepended).
  # each_site - optional block called with the URL of every fetched page.
  #
  # Raises URI::InvalidURIError if the start URL cannot be parsed.
  def initialize(url, &each_site)
    url = 'http://' + url if url =~ HOSTNAME

    @uri = URI.parse(url)

    @pages_hit = 0
    @in_flight = 0

    @visited = []
    @to_visit = []

    # Guards @visited, @to_visit, @pages_hit and @in_flight, which are
    # shared between the worker threads started by #spider.
    @lock = Mutex.new

    @each_site = each_site

    @options = {
      :ignore_hash => true,
      :ignore_query_string => false,
      :search_in_comments => false,
      :threads => 4,
      :num_pages => 1000
    }

    crawl_page(url)
    spider
  end

  # Adds a URL to the queue of pages still to be crawled.
  def queue_link(url)
    @lock.synchronize { @to_visit << url }
  end

  # Fetches one page, reports it to the callback and queues its links.
  #
  # url   - the page to fetch; URLs already visited are skipped.
  # limit - how many further redirects may be followed from this page.
  #
  # Network and parse failures are swallowed (the crawl is best-effort) and
  # the method simply returns. A response that is neither a redirect nor a
  # success is still counted and reported, but yields no links.
  def crawl_page(url, limit = 10)
    # Tolerate callers that pass a non-integer (e.g. nil) as the limit.
    limit = 10 unless limit.is_a?(Integer)

    # Don't crawl a page twice.
    return unless mark_visited(url)

    uri = parse_uri(url)
    return unless uri.is_a?(URI::HTTP) && uri.host

    begin
      response = fetch(uri)
      doc = Hpricot(response.body) if response.is_a?(Net::HTTPSuccess)
    rescue StandardError
      return
    end

    if response.is_a?(Net::HTTPRedirection)
      return if limit <= 0

      target = redirect_target(uri, response['location'])
      return if target.nil?

      return crawl_page(target, limit - 1)
    end

    @lock.synchronize { @pages_hit += 1 }

    @each_site.call(url) if @each_site

    find_links(doc, url) do |link|
      @lock.synchronize do
        @to_visit << link unless @visited.include?(link) || @to_visit.include?(link)
      end
    end
  end

  # Yields the absolute form of every same-host http(s) link found in +doc+.
  #
  # doc - a parsed document; anything that does not respond to +search+
  #       (for example nil) produces no links.
  # url - the URL the document was fetched from, used to resolve relative
  #       links and to decide which host counts as "the same".
  #
  # Fragments are removed when :ignore_hash is set and query strings when
  # :ignore_query_string is set. Links that cannot be parsed, or that use
  # another scheme (mailto:, javascript:, ...), are skipped.
  def find_links(doc, url)
    return unless doc.respond_to? 'search'

    page_uri = parse_uri(url)
    return if page_uri.nil?

    hrefs = []

    doc.search("//a[@href]").each do |anchor|
      hrefs << anchor.get_attribute("href")
    end

    if @options[:search_in_comments]
      # Let's also look for commented-out URIs.
      doc.search("//comment()").each do |comment|
        comment.to_html.scan(/https?:\/\/[^\s\"]*/) { |found| hrefs << found }
      end
    end

    hrefs.each do |href|
      target = resolve_link(page_uri, href)
      next if target.nil?

      next unless %w[http https].include?(target.scheme.to_s.downcase)
      next unless same_host?(target, page_uri)

      target.fragment = nil if @options[:ignore_hash]
      target.query = nil if @options[:ignore_query_string]

      yield target.to_s
    end
  end

  # Works through the queue with @options[:threads] worker threads until it
  # is drained or about @options[:num_pages] pages have been fetched (pages
  # already being fetched when the limit is hit still complete). A negative
  # :num_pages disables the limit.
  #
  # The block parameter is accepted for compatibility and is not used.
  def spider(&block)
    Parallel.in_threads(@options[:threads]) do |_thread_number|
      loop do
        break if page_limit_reached?

        url = checkout_url

        if url
          begin
            crawl_page(url)
          ensure
            @lock.synchronize { @in_flight -= 1 }
          end
        elsif queue_drained?
          break
        else
          # Another worker is mid-fetch and may still queue links.
          sleep IDLE_WAIT
        end
      end
    end
  end

  private

  # Records +url+ as visited. Returns false when it had been seen before.
  def mark_visited(url)
    @lock.synchronize do
      next false if @visited.include?(url)

      @visited << url
      true
    end
  end

  # Parses +url+, percent-escaping it only if it does not parse as written
  # (escaping first would double-encode URLs that are already valid).
  # Returns nil when it cannot be parsed either way.
  def parse_uri(url)
    text = url.to_s.strip

    begin
      URI.parse(text)
    rescue URI::InvalidURIError
      begin
        URI.parse(URI_ESCAPER.escape(text))
      rescue URI::InvalidURIError
        nil
      end
    end
  end

  # Performs the GET request. Uses the full request URI (path plus query,
  # "/" when the path is empty) and enables TLS for https URLs.
  def fetch(uri)
    request = Net::HTTP::Get.new(uri.request_uri, REQUEST_HEADERS)

    Net::HTTP.start(uri.host, uri.port, :use_ssl => uri.scheme == 'https') do |http|
      http.request(request)
    end
  end

  # Resolves a Location header against the page that issued it.
  # Returns nil when the header is missing or unusable.
  def redirect_target(uri, location)
    return if location.nil? || location.strip.empty?

    resolved = resolve_link(uri, location)
    resolved && resolved.to_s
  end

  # Resolves +href+ relative to +page_uri+; nil if that is not possible.
  def resolve_link(page_uri, href)
    link = parse_uri(href)
    return if link.nil?

    page_uri.merge(link)
  rescue URI::Error, ArgumentError
    nil
  end

  # True when both URIs name the same host (hostnames are case-insensitive).
  def same_host?(target, page_uri)
    return false if target.host.nil?

    target.host.downcase == page_uri.host.to_s.downcase
  end

  # True once the configured page budget has been used up.
  def page_limit_reached?
    max_pages = @options[:num_pages]
    max_pages >= 0 && @lock.synchronize { @pages_hit } >= max_pages
  end

  # Takes the next queued URL (nil if none) and counts it as in flight.
  def checkout_url
    @lock.synchronize do
      url = @to_visit.pop
      @in_flight += 1 if url
      url
    end
  end

  # True when nothing is queued and no worker is fetching a page.
  def queue_drained?
    @lock.synchronize { @to_visit.empty? && @in_flight.zero? }
  end
end
171
+
metadata ADDED
@@ -0,0 +1,46 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: redback
3
+ version: !ruby/object:Gem::Version
4
+ version: '0.1'
5
+ platform: ruby
6
+ authors:
7
+ - Rob Miller
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-04-12 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: Fetches a URL you give it and recursively searches for all URLs it can
14
+ find, building up a list of unique URLs on the same hostname.
15
+ email: rob@bigfish.co.uk
16
+ executables:
17
+ - redback
18
+ extensions: []
19
+ extra_rdoc_files: []
20
+ files:
21
+ - bin/redback
22
+ - lib/redback.rb
23
+ homepage: https://github.com/robmiller/redback
24
+ licenses: []
25
+ metadata: {}
26
+ post_install_message:
27
+ rdoc_options: []
28
+ require_paths:
29
+ - lib
30
+ required_ruby_version: !ruby/object:Gem::Requirement
31
+ requirements:
32
+ - - '>='
33
+ - !ruby/object:Gem::Version
34
+ version: '0'
35
+ required_rubygems_version: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - '>='
38
+ - !ruby/object:Gem::Version
39
+ version: '0'
40
+ requirements: []
41
+ rubyforge_project:
42
+ rubygems_version: 2.0.3
43
+ signing_key:
44
+ specification_version: 4
45
+ summary: Spiders a website, pulling out a list of unique URLs.
46
+ test_files: []