redback 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (4) hide show
  1. checksums.yaml +7 -0
  2. data/bin/redback +5 -0
  3. data/lib/redback.rb +171 -0
  4. metadata +46 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: ad5bb571eee56b563c4d85fc2fd00342d25c8a16
4
+ data.tar.gz: 451b6ab74b8039579180da7cd1f423ae5c54909d
5
+ SHA512:
6
+ metadata.gz: 0df94ad9c9cd34bd4e735a0fe25f9750d71762538f7a4c1a1e27e5e491b2e1cd8f6a5827cedc660231a5e254a07b86afad626b91cb26abe740af19875a0bd0b9
7
+ data.tar.gz: c3b5f59eb773157c74666e694dda602e2a7fc6ec1664f0957397ee654461bec54b4d7a13058a06b2ab439c67a27f2b493b88a245d86290075dfa6f438dc03107
data/bin/redback ADDED
@@ -0,0 +1,5 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require_relative '../lib/redback.rb'
4
+
5
# Command-line entry point: crawl the site given as the first argument and
# print every URL that was fetched, one per line.
abort 'Usage: redback <url>' unless ARGV[0]

# The argument is parenthesised on purpose. Written as
# `Redback.new ARGV[0] { ... }`, the braces bind to `ARGV[0]` instead of to
# `Redback.new`, so the crawler would never receive its callback.
Redback.new(ARGV[0]) { |url| puts url }
data/lib/redback.rb ADDED
@@ -0,0 +1,171 @@
1
+ require 'rubygems'
2
+ require 'hpricot'
3
+ require 'net/http'
4
+ require 'parallel'
5
+
6
# Spiders a website: fetches a start URL, reports every page it requests to
# the block given to the constructor, and keeps following links that stay on
# the same host until the queue is empty or the page limit is reached.
#
# Crawling starts immediately when the object is constructed:
#
#   Redback.new('example.com') { |url| puts url }
class Redback
  # A bare hostname such as "example.com" (no scheme, no path). Anchored with
  # \A and \z so a multi-line string cannot match on one of its lines.
  BARE_HOSTNAME = /\A(([a-zA-Z]|[a-zA-Z][a-zA-Z0-9\-]*[a-zA-Z0-9])\.)*([A-Za-z]|[A-Za-z][A-Za-z0-9\-]*[A-Za-z0-9])\z/

  # Characters that get percent-encoded before a page URL is parsed. This is
  # the set the removed URI.encode used to escape, except that "%" and "#"
  # are left alone so already-encoded URLs are not encoded a second time.
  UNSAFE_URL_CHARS = %r{[^\-_.!~*'()a-zA-Z\d;/?:@&=+$,\[\]%#]}

  # Headers sent with every request.
  REQUEST_HEADERS = {
    "User-Agent" => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.43 Safari/537.31",
    "Accept-Charset" => "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
    "Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
  }.freeze

  # Default crawl settings; any of them can be overridden through the
  # +options+ argument of #initialize.
  #
  # :ignore_hash         - strip "#fragment" from discovered links
  # :ignore_query_string - strip "?query" from discovered links
  # :search_in_comments  - also harvest http(s) URLs found in HTML comments
  # :threads             - number of worker threads used by #spider
  # :num_pages           - maximum number of pages to fetch (negative = unlimited)
  DEFAULT_OPTIONS = {
    :ignore_hash => true,
    :ignore_query_string => false,
    :search_in_comments => false,
    :threads => 4,
    :num_pages => 1000
  }.freeze

  # Starts crawling straight away and returns once the crawl has finished.
  #
  # url       - start URL; a bare hostname gets "http://" prepended.
  # options   - optional Hash overriding entries of DEFAULT_OPTIONS.
  # each_site - block called with the URL of every page that was requested.
  #
  # Raises URI::InvalidURIError if +url+ cannot be parsed.
  def initialize(url, options = {}, &each_site)
    url = "http://#{url}" if url =~ BARE_HOSTNAME

    @uri = URI.parse(url)

    @pages_hit = 0
    @visited = []
    @to_visit = []
    @each_site = each_site
    @options = DEFAULT_OPTIONS.merge(options)

    # Guards @visited, @to_visit and @pages_hit, which the worker threads
    # started by #spider all read and modify.
    @lock = Mutex.new

    crawl_page(url)
    spider
  end

  # Unconditionally adds +url+ to the queue of pages still to be crawled.
  def queue_link(url)
    @lock.synchronize { @to_visit << url }
  end

  # Fetches +url+ (at most once per crawl), reports it to the callback and
  # queues every same-host link found in the response.
  #
  # url   - the page to fetch.
  # limit - how many further redirects may be followed from here.
  #
  # Network and parse failures are deliberately swallowed: crawling is
  # best-effort and one broken page must not stop the whole run.
  def crawl_page(url, limit = 10)
    # Don't crawl a page twice.
    return unless claim(url)

    uri = parse_url(url)
    return unless uri.respond_to?(:request_uri) && uri.host

    begin
      response = fetch(uri)
      doc = Hpricot(response.body) if response.is_a?(Net::HTTPSuccess)
    rescue StandardError
      return
    end

    if response.is_a?(Net::HTTPRedirection)
      return follow_redirect(uri, response['location'], limit)
    end

    @lock.synchronize { @pages_hit += 1 }
    @each_site.call(url) if @each_site

    find_links(doc, url) { |link| enqueue_unseen(link) }
  end

  # Yields the absolute form of every link in +doc+ that points at the same
  # host as +url+ over http or https.
  #
  # doc - parsed document; anything that does not respond to `search` is
  #       ignored.
  # url - the URL +doc+ was fetched from, used to resolve relative links.
  #
  # Fragments and query strings are stripped according to the options.
  # Links that cannot be parsed as URIs are skipped.
  def find_links(doc, url)
    return unless doc.respond_to? 'search'

    base = parse_url(url)
    return unless base && base.absolute? && base.host

    hrefs = doc.search("//a[@href]").map { |e| e.get_attribute("href") }

    if @options[:search_in_comments]
      # Let's also look for commented-out URIs.
      doc.search("//comment()").each do |e|
        hrefs.concat(e.to_html.scan(/https?:\/\/[^\s\"]*/))
      end
    end

    hrefs.each do |href|
      link = absolutize(href, base)
      yield link if link
    end
  end

  # Works through the queue with several threads until it is empty or the
  # page limit has been reached. A block is accepted for backward
  # compatibility but is not used.
  def spider(&block)
    Parallel.in_threads(@options[:threads]) do |_thread_number|
      loop do
        # We've crawled enough pages.
        break if page_limit_reached?

        url = @lock.synchronize { @to_visit.pop }
        break if url.nil?

        crawl_page(url)
      end
    end
  end

  private

  # Marks +url+ as visited. Returns true if this caller is the first to
  # claim it, false if it had been visited already.
  def claim(url)
    @lock.synchronize do
      next false if @visited.include?(url)

      @visited << url
      true
    end
  end

  # Queues +link+ unless it was visited or is queued already.
  def enqueue_unseen(link)
    @lock.synchronize do
      @to_visit << link unless @visited.include?(link) || @to_visit.include?(link)
    end
  end

  # True once the configured number of pages has been fetched; a negative
  # :num_pages disables the limit.
  def page_limit_reached?
    max = @options[:num_pages]
    max >= 0 && @lock.synchronize { @pages_hit >= max }
  end

  # Parses a page URL, percent-encoding whitespace, non-ASCII bytes and other
  # unsafe characters first. Returns a URI, or nil if it still cannot be
  # parsed.
  def parse_url(url)
    raw = url.to_s.strip
    escaped = raw.b.gsub(UNSAFE_URL_CHARS) { |byte| format('%%%02X', byte.ord) }
    URI.parse(escaped.force_encoding(Encoding::UTF_8))
  rescue URI::InvalidURIError
    nil
  end

  # Performs a GET for +uri+, using TLS for https URLs. The request target
  # includes the query string and is "/" when the URL has no path.
  def fetch(uri)
    request = Net::HTTP::Get.new(uri.request_uri, REQUEST_HEADERS)
    Net::HTTP.start(uri.host, uri.port, :use_ssl => uri.scheme == 'https') do |http|
      http.request(request)
    end
  end

  # Crawls the target of a redirect received for +from+, resolving a relative
  # Location header against it. Gives up when the redirect budget is used up
  # or the header is missing or unusable.
  def follow_redirect(from, location, limit)
    return if location.nil? || limit <= 0

    target = resolve(from, parse_url(location))
    crawl_page(target.to_s, limit - 1) if target
  end

  # Resolves the URI +reference+ against the absolute URI +base+. Returns nil
  # when that is not possible.
  def resolve(base, reference)
    reference && base.merge(reference)
  rescue URI::Error
    nil
  end

  # Turns a raw href into an absolute URL string, or returns nil when the
  # link should not be crawled: unparsable, not http(s) (mailto:,
  # javascript:, tel:, ...) or on a different host than +base+.
  def absolutize(href, base)
    target = base.merge(URI.parse(href.to_s.strip))

    return unless %w[http https].include?(target.scheme)
    return unless target.host.to_s.downcase == base.host.to_s.downcase

    target.fragment = nil if @options[:ignore_hash]
    target.query = nil if @options[:ignore_query_string]

    target.to_s
  rescue URI::Error
    # No harm in this: if we can't parse it as a URI, it probably isn't one
    # and we can safely ignore it.
    nil
  end
end
171
+
metadata ADDED
@@ -0,0 +1,46 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: redback
3
+ version: !ruby/object:Gem::Version
4
+ version: '0.1'
5
+ platform: ruby
6
+ authors:
7
+ - Rob Miller
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-04-12 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: Fetches a URL you give it and recursively searches for all URLs it can
14
+ find, building up a list of unique URLs on the same hostname.
15
+ email: rob@bigfish.co.uk
16
+ executables:
17
+ - redback
18
+ extensions: []
19
+ extra_rdoc_files: []
20
+ files:
21
+ - bin/redback
22
+ - lib/redback.rb
23
+ homepage: https://github.com/robmiller/redback
24
+ licenses: []
25
+ metadata: {}
26
+ post_install_message:
27
+ rdoc_options: []
28
+ require_paths:
29
+ - lib
30
+ required_ruby_version: !ruby/object:Gem::Requirement
31
+ requirements:
32
+ - - '>='
33
+ - !ruby/object:Gem::Version
34
+ version: '0'
35
+ required_rubygems_version: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - '>='
38
+ - !ruby/object:Gem::Version
39
+ version: '0'
40
+ requirements: []
41
+ rubyforge_project:
42
+ rubygems_version: 2.0.3
43
+ signing_key:
44
+ specification_version: 4
45
+ summary: Spiders a website, pulling out a list of unique URLs.
46
+ test_files: []