debugher 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +18 -0
- data/Gemfile +14 -0
- data/LICENSE +22 -0
- data/README.md +32 -0
- data/README.rdoc +0 -0
- data/Rakefile +18 -0
- data/debugher.gemspec +25 -0
- data/lib/debugher.rb +473 -0
- data/lib/debugher/version.rb +3 -0
- data/test/test_debugher.rb +195 -0
- metadata +108 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
# Bundler manifest used when developing the debugher gem.
source 'https://rubygems.org'

# Specify your gem's dependencies in debugher.gemspec
gemspec

# Libraries loaded by lib/debugher.rb.
gem 'nokogiri'
gem 'addressable'
gem 'robots'

# Tooling needed only for development and test runs.
group :development, :test do
  gem 'rspec'
  gem 'rack-test'
  gem 'simplecov', :require => false
end
|
data/LICENSE
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2012 Peter Roome
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
# Debugher
|
2
|
+
|
3
|
+
A handy set of methods for getting various bits of information about a web page.
|
4
|
+
|
5
|
+
This is used by the Rakkit Debugger to output what information we can gather about various pages on an ad hoc basis.
|
6
|
+
The library is also used by the Rakkit spider to process and index pages across the web.
|
7
|
+
|
8
|
+
## Installation
|
9
|
+
|
10
|
+
Add this line to your application's Gemfile:
|
11
|
+
|
12
|
+
gem 'debugher'
|
13
|
+
|
14
|
+
And then execute:
|
15
|
+
|
16
|
+
$ bundle
|
17
|
+
|
18
|
+
Or install it yourself as:
|
19
|
+
|
20
|
+
$ gem install debugher
|
21
|
+
|
22
|
+
## Usage
|
23
|
+
|
24
|
+
TODO: Write usage instructions here
|
25
|
+
|
26
|
+
## Contributing
|
27
|
+
|
28
|
+
1. Fork it
|
29
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
30
|
+
3. Commit your changes (`git commit -am 'Added some feature'`)
|
31
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
32
|
+
5. Create new Pull Request
|
data/README.rdoc
ADDED
File without changes
|
data/Rakefile
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
#!/usr/bin/env rake
|
2
|
+
require "bundler/gem_tasks"
|
3
|
+
require 'rake/testtask'
|
4
|
+
require 'rdoc/task'
|
5
|
+
|
6
|
+
# Test task: puts the test/ directory on the load path.
Rake::TestTask.new { |t| t.libs << 'test' }

desc "Run tests"
task :default => :test

# Documentation task: README.rdoc is the main page, and every Ruby file
# under lib/ is included.
Rake::RDocTask.new do |rd|
  rd.main = "README.rdoc"
  rd.rdoc_files.include("README.rdoc", "lib/**/*.rb")
end

desc "Generate documentation"
task :rdoc
|
data/debugher.gemspec
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
require File.expand_path('../lib/debugher/version', __FILE__)
|
3
|
+
|
4
|
+
# Gem specification for debugher: metadata, packaged files and runtime
# dependencies.
Gem::Specification.new do |gem|
  gem.authors       = ["Peter Roome"]
  gem.email         = ["pete@wearepandr.com"]
  gem.description   = %q{
    A handy set of methods for getting various bits of information about a web page.
    This is used by the Rakkit Debugger to output what information we can gather about various pages on an adhoc basis.
    The library is also used by the Rakkit spider to process and index pages across the web.
  }
  gem.summary       = %q{Methods for the Rakkit Debugger.}
  gem.homepage      = ""

  # Split the git listing on the input record separator ($/, a newline by
  # default). The previous $\ is the *output* separator, which is nil by
  # default and made split break on any whitespace, mangling file names
  # that contain spaces.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.name          = "debugher"
  gem.require_paths = ["lib"]
  gem.version       = Debugher::VERSION

  gem.add_dependency 'nokogiri'
  gem.add_dependency "addressable"
  gem.add_dependency 'robots'
end
|
data/lib/debugher.rb
ADDED
@@ -0,0 +1,473 @@
|
|
1
|
+
require "debugher/version"
|
2
|
+
|
3
|
+
module Debugher
|
4
|
+
require 'robots'
|
5
|
+
require 'nokogiri'
|
6
|
+
require 'addressable/uri'
|
7
|
+
require 'cgi'
|
8
|
+
|
9
|
+
class Debugger
|
10
|
+
# File extensions treated as music links. Frozen because the same array is
# used as the default argument of music_from_feed and music_from_html.
FILE_TYPES = ['.mp3', '.m4a', '.MP3'].freeze
|
11
|
+
attr_accessor :url
|
12
|
+
|
13
|
+
# pass a url as a string to initialize
|
14
|
+
def initialize(url)
  # Flush stdout immediately so progress output is not buffered.
  $stdout.sync = true

  # URI::HTTPS is a subclass of URI::HTTP, so is_a? accepts both schemes.
  # Comparing the class with == URI::HTTP rejected https URLs and turned
  # them into "http://https://...".
  @url = URI.parse(url).is_a?(URI::HTTP) ? url : "http://#{url}"
  @uri = URI.parse(@url)

  # Fetch straight away; nil when robots.txt disallows it or the fetch failed.
  @opened_url = open_url
end
|
21
|
+
|
22
|
+
# Fetch @uri if robots.txt allows our user agent.
#
# Returns the opened response object, or nil when the URL is disallowed by
# robots.txt or the request raised an error (which is logged to stderr).
def open_url
  ua = Debugger.user_agent
  @robot = Robots.new(ua)
  return nil unless @robot.allowed?(@uri)

  begin
    # URI#open comes from open-uri. Kernel#open no longer opens URLs on
    # Ruby 3, so go through the URI object instead.
    @uri.open("User-Agent" => ua,
              "From" => "hello@rakkit.com",
              "Referer" => "http://rakkit.com")
  rescue StandardError => e
    # Most likely a 404 error. StandardError (not Exception) so that
    # interrupts and exit requests are not swallowed.
    $stderr.puts "Unable to open url: #{url} - #{e}"
    nil
  end
end
|
39
|
+
|
40
|
+
# Get the response code of the page
|
41
|
+
#
|
42
|
+
# Example:
|
43
|
+
# >> Debugger.new("http://rakkit.com").response_code
|
44
|
+
# => 200 OK
|
45
|
+
# Status of the fetched page as "<code> <message>", e.g. "200 OK".
# Returns nil when the page could not be opened (open_url returned nil).
def response_code
  return nil if @opened_url.nil?

  @opened_url.status.join(" ")
end
|
48
|
+
|
49
|
+
# Return the fetched URL
|
50
|
+
#
|
51
|
+
# Example:
|
52
|
+
# >> Debugger.new("rakkit.com").fetched_url
|
53
|
+
# => http://rakkit.com
|
54
|
+
# String form of the URL this debugger was built for.
def fetched_url
  "#{@uri}"
end
|
57
|
+
|
58
|
+
# Get the canonical url of the page
|
59
|
+
#
|
60
|
+
# Example:
|
61
|
+
# >> Debugger.new("http://rakkit.com").canonical_url
|
62
|
+
# => http://rakkit.com/
|
63
|
+
# Root URL of the fetched page: scheme and host with path "/" and no query.
#
# Returns the URL as a String, or nil if it could not be built (the error
# is printed to stdout).
def canonical_url
  # Work on a copy. Assigning path/query on @uri itself rewrote the
  # instance's URL, so fetched_url and later fetches saw the site root.
  canonical_uri = @uri.dup
  canonical_uri.path = ''
  canonical_uri.query = nil
  (canonical_uri + "/").to_s
rescue StandardError => e
  puts "CANONICAL ERROR: #{e}"
  puts @uri.inspect.to_s
  nil
end
|
75
|
+
|
76
|
+
# Loads the Nokogiri HTML document if it hasn't already been loaded
|
77
|
+
# Parsed HTML document for the fetched page, built on first use and cached.
def page
  @page = Nokogiri::HTML(@opened_url) unless @page
  @page
end
|
80
|
+
|
81
|
+
# Get the RSS Feed URL
|
82
|
+
#
|
83
|
+
# Example:
|
84
|
+
# >> Debugger.new("http://wearepandr.com").rss_feed_url
|
85
|
+
# => http://wearepandr.com/feed
|
86
|
+
# href of the first RSS <link> on the page, made absolute against
# canonical_url when relative. Returns "" when the page has no RSS link.
def rss_feed_url
  matches = page.search("link[@type='application/rss+xml']")
  href = nil
  href = matches.first['href'] unless matches.length == 0

  if Debugger.relative?(href)
    href = Debugger.stitch_to_make_absolute(canonical_url, href)
  end
  href.to_s
end
|
93
|
+
|
94
|
+
# Get the Atom Feed URL
|
95
|
+
#
|
96
|
+
# Example:
|
97
|
+
# >> Debugger.new("http://wearepandr.com").atom_feed_url
|
98
|
+
# => http://wearepandr.com/feed
|
99
|
+
# href of the first Atom <link> on the page, made absolute against
# canonical_url when relative. Returns "" when the page has no Atom link.
def atom_feed_url
  matches = page.search("link[@type='application/atom+xml']")
  href = nil
  href = matches.first['href'] unless matches.length == 0

  if Debugger.relative?(href)
    href = Debugger.stitch_to_make_absolute(canonical_url, href)
  end
  href.to_s
end
|
106
|
+
|
107
|
+
# Get the FEED URL, no matter if it's the Atom URL or the RSS URL
|
108
|
+
#
|
109
|
+
# Example:
|
110
|
+
# >> Debugger.new("http://wearepandr.com").feed_url
|
111
|
+
# => http://wearepandr.com/feed
|
112
|
+
# The page's feed URL: the RSS feed when there is one, otherwise the Atom
# feed. Returns nil when the page advertises neither.
#
# rss_feed_url and atom_feed_url already return absolute URLs (or ""), so
# no further relative-URL handling is needed here, and each is looked up
# at most once.
def feed_url
  rss = rss_feed_url
  return rss unless rss == ''

  atom = atom_feed_url
  atom == '' ? nil : atom
end
|
126
|
+
|
127
|
+
# Return some meta info about the page
|
128
|
+
#
|
129
|
+
# Example:
|
130
|
+
# >> Debugger.new("http://wearepandr.com").scrape_info
|
131
|
+
# => {:response_code => "200 OK",
|
132
|
+
# :fetched_url => "http://wearepandr.com",
|
133
|
+
# :canonical_url => "http://wearepandr.com/",
|
134
|
+
# :feed_url => "http://wearepandr.com/feed"}
|
135
|
+
# Summary of the fetch as a Hash with :response_code, :fetched_url,
# :canonical_url and :feed_url, evaluated in that order.
def scrape_info
  info = {}
  info[:response_code] = response_code
  info[:fetched_url]   = fetched_url
  info[:canonical_url] = canonical_url
  info[:feed_url]      = feed_url
  info
end
|
141
|
+
|
142
|
+
# Get the page title
|
143
|
+
#
|
144
|
+
# Example:
|
145
|
+
# >> Debugger.new("http://wearepandr.com").title
|
146
|
+
# => Web Design Norwich and Norwich Ruby on Rails Web Development in Norfolk | PANDR
|
147
|
+
# Text of the page's first <title> element with surrounding whitespace
# removed. Returns nil when the page has no <title> or it is blank.
def title
  title_node = page.css('title')[0]
  # Pages without a <title> used to raise NoMethodError on nil here.
  return nil if title_node.nil?

  text = title_node.inner_html.strip
  text == '' ? nil : text
end
|
152
|
+
|
153
|
+
# Get the page description
|
154
|
+
#
|
155
|
+
# Example:
|
156
|
+
# >> Debugger.new("http://wearepandr.com").description
|
157
|
+
# => A custom Web Design Norwich and Norwich Ruby on Rails Web Development agency based in Norfolk, UK
|
158
|
+
# Content of the page's description meta tag with surrounding whitespace
# removed, or nil when it is missing or blank.
def description
  content = page.css("meta[name='description']/@content").inner_html.strip
  return nil if content == ''

  content
end
|
163
|
+
|
164
|
+
# Get the page meta data in a hash, title and description.
|
165
|
+
#
|
166
|
+
# Example:
|
167
|
+
# >> Debugger.new("http://wearepandr.com").meta_data
|
168
|
+
# => {:title => "Web Design Norwich and Norwich Ruby on Rails Web Development in Norfolk | PANDR",
|
169
|
+
# :description => "A custom Web Design Norwich and Norwich Ruby on Rails Web Development agency based in Norfolk, UK"}
|
170
|
+
# Page meta data as a Hash with :title and :description.
def meta_data
  data = {}
  data[:title] = title
  data[:description] = description
  data
end
|
174
|
+
|
175
|
+
# Get the music links from the feed found on the page
|
176
|
+
#
|
177
|
+
# Example:
|
178
|
+
# >> Debugger.new("http://wearepandr.com").music_from_feed
|
179
|
+
# => ["http://wearepandr.com/track_1.mp3", "http://wearepandr.com/track_2.mp3", "http://wearepandr.com/track_3.mp3"]
|
180
|
+
#
|
181
|
+
# Arguments:
|
182
|
+
# file_types: [Array]
|
183
|
+
# Collect music links from the enclosures of the page's feed.
#
# file_types - Array of 4-character extensions (dot included) to keep.
#
# Returns an Array of unique enclosure URLs in document order; empty when
# the page has no feed or the feed has no matching channel enclosures.
def music_from_feed(file_types=FILE_TYPES)
  feed_location = feed_url
  return [] if feed_location.nil?

  # URI#open (open-uri) instead of Kernel#open, which no longer opens URLs
  # on Ruby 3 and treats a leading "|" as a shell command.
  @feed ||= Nokogiri::XML(URI.parse(feed_location).open)
  @feed.encoding = 'utf-8'

  # One pass over every channel/item enclosure. The old per-item loop ran
  # this same absolute XPath once per item, rescanning the whole document
  # each time; the result after uniq is identical.
  urls = @feed.search("//channel/item/enclosure").map { |enclosure| enclosure['url'] }
  matching = urls.select { |enclosure_url| file_types.include?(enclosure_url.to_s[-4, 4]) }
  matching.uniq.compact
end
|
206
|
+
|
207
|
+
# Get the music links from the page html
|
208
|
+
#
|
209
|
+
# Example:
|
210
|
+
# >> Debugger.new("http://wearepandr.com").music_from_html
|
211
|
+
# => ["http://wearepandr.com/track_1.mp3", "http://wearepandr.com/track_2.mp3", "http://wearepandr.com/track_3.mp3"]
|
212
|
+
#
|
213
|
+
# Arguments:
|
214
|
+
# file_types: [Array]
|
215
|
+
# Collect music links from the anchors on the page.
#
# file_types - Array of 4-character extensions (dot included) to keep.
#
# Returns an Array of unique href values whose last four characters are
# one of file_types.
def music_from_html(file_types=FILE_TYPES)
  hrefs = page_links.map { |anchor| anchor['href'] }
  wanted = hrefs.select { |href| file_types.include?(href.to_s[-4, 4]) }
  wanted.uniq.compact
end
|
228
|
+
|
229
|
+
# Get the soundcloud music links from the page html
|
230
|
+
#
|
231
|
+
# Example:
|
232
|
+
# >> Debugger.new("http://wearepandr.com").music_from_soundcloud
|
233
|
+
# => ["http://api.soundcloud.com/playlists/2153957", "http://api.soundcloud.com/playlists/2153958"]
|
234
|
+
# Collect Soundcloud API URLs from the page's <iframe> and <param> embeds.
#
# Returns an Array of unique URLs; embeds that are not Soundcloud players
# are skipped.
def music_from_soundcloud
  # URI#open (open-uri) instead of Kernel#open, which no longer opens URLs
  # on Ruby 3.
  @html_url ||= Nokogiri::HTML(@uri.open)

  links = @html_url.search("//iframe", "//param").map do |embed|
    # iframes carry the player URL in src, <param> tags in value.
    Debugger.get_soundcloud_url(embed["src"] || embed["value"])
  end
  links.uniq.compact
end
|
244
|
+
|
245
|
+
# Get the internal page links from the page
|
246
|
+
#
|
247
|
+
# Example:
|
248
|
+
# >> Debugger.new("http://wearepandr.com").internal_links
|
249
|
+
# => ["http://wearepandr.com/about", "http://wearepandr.com/blog"]
|
250
|
+
# Collect the links on the page that point at the same host as the page.
#
# Fragments ("#...") are stripped; empty and mailto: links are skipped.
# Relative links are resolved against the fetched URL. Previously they were
# passed to Debugger.make_absolute without a base URL, which cannot reach
# this instance's canonical_url and so returned nil for every relative
# link, silently dropping them.
#
# Returns an Array of unique absolute URL Strings.
def internal_links
  current_host = @uri.host
  links = []

  page_links.each do |link|
    href = link['href']
    next if href.nil?

    # Remove anchors from links
    new_link = href.split("#")[0]
    next if new_link.nil? || new_link.strip.empty? || Debugger.mailto_link?(new_link)

    if Debugger.relative?(new_link)
      begin
        # URI#merge returns a new URI and leaves @uri untouched.
        new_link = @uri.merge(new_link).to_s
      rescue URI::Error
        # Not resolvable as a URI reference; skip it.
        next
      end
    end

    # Keep only links that are still on the current site.
    links << new_link if current_host == Addressable::URI.parse(new_link).host
  end

  links.uniq
end
|
278
|
+
|
279
|
+
# Get all the links from the page
|
280
|
+
#
|
281
|
+
# Example:
|
282
|
+
# >> Debugger.new("http://wearepandr.com").page_links
|
283
|
+
# => ["http://wearepandr.com/about", "http://google.com", "http://yahoo.com"]
|
284
|
+
# All <a> elements on the page, as the node set returned by the parsed
# document. The document is fetched and parsed on first use, then cached
# in @html_url.
def page_links
  # URI#open (open-uri) instead of Kernel#open, which no longer opens URLs
  # on Ruby 3.
  @html_url ||= Nokogiri::HTML(@uri.open)
  @html_url.search("//a")
end
|
290
|
+
|
291
|
+
# Get the host of the fetched URL
|
292
|
+
#
|
293
|
+
# Example:
|
294
|
+
# >> Debugger.new("http://wearepandr.com").host
|
295
|
+
# => wearepandr.com
|
296
|
+
# Host part of the fetched URL, as parsed by Addressable.
def host
  parsed = Addressable::URI.parse(@uri)
  parsed.host
end
|
299
|
+
|
300
|
+
# Get the pages content type
|
301
|
+
#
|
302
|
+
# Example:
|
303
|
+
# >> Debugger.new("http://wearepandr.com").content_type
|
304
|
+
# => text/html
|
305
|
+
# Content type reported for the fetched page, e.g. "text/html".
# Returns nil when the page could not be opened (open_url returned nil).
def content_type
  return nil if @opened_url.nil?

  @opened_url.content_type
end
|
308
|
+
|
309
|
+
# Get the pages charset
|
310
|
+
#
|
311
|
+
# Example:
|
312
|
+
# >> Debugger.new("http://wearepandr.com").charset
|
313
|
+
# => utf-8
|
314
|
+
# Charset reported for the fetched page, e.g. "utf-8".
# Returns nil when the page could not be opened (open_url returned nil).
def charset
  return nil if @opened_url.nil?

  @opened_url.charset
end
|
317
|
+
|
318
|
+
# Get the pages content encoding
|
319
|
+
#
|
320
|
+
# Example:
|
321
|
+
# >> Debugger.new("http://wearepandr.com").content_encoding
|
322
|
+
# => []
|
323
|
+
# Content encodings reported for the fetched page (the test suite expects
# [] for a page that reports none).
# Returns nil when the page could not be opened (open_url returned nil).
def content_encoding
  return nil if @opened_url.nil?

  @opened_url.content_encoding
end
|
326
|
+
|
327
|
+
# Get the pages last modified date
|
328
|
+
#
|
329
|
+
# Example:
|
330
|
+
# >> Debugger.new("http://wearepandr.com").last_modified
|
331
|
+
# => nil (when the response reports no last-modified date)
|
332
|
+
# Last-modified value reported for the fetched page.
# Returns nil when the response reports none, or when the page could not
# be opened (open_url returned nil).
def last_modified
  return nil if @opened_url.nil?

  @opened_url.last_modified
end
|
335
|
+
|
336
|
+
# Get the user agent
|
337
|
+
#
|
338
|
+
# Example:
|
339
|
+
# >> Debugger.user_agent("PANDR")
|
340
|
+
# => PANDR/V0.0.1
|
341
|
+
#
|
342
|
+
# Arguments:
|
343
|
+
# ua: (String)
|
344
|
+
# Build the User-Agent string: "<ua>/V<gem version>".
#
# ua - product name to advertise (defaults to "Rakkit").
def self.user_agent(ua="Rakkit")
  [ua, "/V", Debugher::VERSION].join
end
|
347
|
+
|
348
|
+
# Get the current version
|
349
|
+
#
|
350
|
+
# Example:
|
351
|
+
# >> Debugger.version
|
352
|
+
# => V0.0.1
|
353
|
+
# Gem version prefixed with "V".
def self.version
  "V" + Debugher::VERSION.to_s
end
|
356
|
+
|
357
|
+
# Check if a URL is relative or not
|
358
|
+
#
|
359
|
+
# Example:
|
360
|
+
# >> Debugger.relative?("http://wearepandr.com")
|
361
|
+
# => false
|
362
|
+
#
|
363
|
+
# Arguments:
|
364
|
+
# url: (String)
|
365
|
+
# Whether +url+ is a relative URL according to Addressable.
#
# Returns false for nil and for anything Addressable cannot parse.
def self.relative?(url)
  # A local variable: the parse result used to be stored in a class-level
  # instance variable, which leaked state between unrelated calls.
  parsed = Addressable::URI.parse(url)
  parsed.nil? ? false : parsed.relative?
rescue StandardError
  false
end
|
373
|
+
|
374
|
+
# Make a URL absolute
|
375
|
+
#
|
376
|
+
# Example:
|
377
|
+
# >> Debugger.make_absolute("/about", "http://wearepandr.com")
|
378
|
+
# => http://wearepandr.com/about
|
379
|
+
#
|
380
|
+
# Arguments:
|
381
|
+
# url: (String)
|
382
|
+
# base_url: (String)
|
383
|
+
# Make +url+ absolute by attaching it to the root of +base_url+.
#
# url      - URL String; returned unchanged when it is not relative.
# base_url - String URL of the site the relative URL belongs to. A missing
#            http(s) scheme is filled in with "http://".
#
# Returns the absolute URL String, or nil when a relative URL cannot be
# resolved (no base_url given, or the base is not a usable URL). Failures
# are reported on stderr/stdout as before.
def self.make_absolute(url, base_url=nil)
  return url unless Debugger.relative?(url)

  begin
    # There is no instance here to ask for a canonical URL, so a relative
    # URL without an explicit base cannot be resolved.
    raise ArgumentError, "no base_url given for a relative URL" if base_url.nil?

    # Derive the site root directly from the base URL. This used to build
    # a whole Debugger (robots.txt check plus page fetch) just to read its
    # canonical_url.
    base_uri = URI.parse(base_url)
    base_uri = URI.parse("http://#{base_url}") unless base_uri.is_a?(URI::HTTP)
    base_uri.path = ''
    base_uri.query = nil
    root = (base_uri + "/").to_s

    Debugger.stitch_to_make_absolute(root, url)
  rescue StandardError => e
    # Report the original url; it used to be set to nil before logging.
    $stderr.puts "Debugger Error: #{url} - #{e}"
    puts "ERROR: Could not make this URL absolute. Set to nil."
    nil
  end
end
|
401
|
+
|
402
|
+
# Stitch two strings together to make a single absolute url
|
403
|
+
#
|
404
|
+
# Example:
|
405
|
+
# >> Debugger.stitch_to_make_absolute("http://wearepandr.com/", "/about")
|
406
|
+
# => http://wearepandr.com/about
|
407
|
+
#
|
408
|
+
# Arguments:
|
409
|
+
# canonical_url: (String)
|
410
|
+
# path: (String)
|
411
|
+
# Join a canonical (root) URL and a path into one absolute URL.
#
# canonical_url - root URL String, with or without a trailing "/".
# path          - path String, with or without a leading "/".
#
# Exactly one "/" separates the two parts, so "feed" and "/feed" both give
# ".../feed" (a path without a leading slash used to be glued straight
# onto the host).
def self.stitch_to_make_absolute(canonical_url, path)
  separator = path.start_with?("/") ? "" : "/"
  canonical_url.chomp("/") + separator + path
end
|
414
|
+
|
415
|
+
# Check if a string is a mailto link
|
416
|
+
#
|
417
|
+
# Example:
|
418
|
+
# >> Debugger.mailto_link?("mailto:pete@wearepandr.com")
|
419
|
+
# => true
|
420
|
+
#
|
421
|
+
# Arguments:
|
422
|
+
# url: (String)
|
423
|
+
# Whether +url+ is a mailto: link.
#
# The scheme is matched with its colon and without regard to case, so
# "MAILTO:a@b.c" counts while a page such as "mailtoday.html" does not.
# Returns true or false (false for nil).
def self.mailto_link?(url)
  !(url =~ /\Amailto:/i).nil?
end
|
426
|
+
|
427
|
+
# Extract the URL element of a soundcloud embed in order to grab the link to the track.
|
428
|
+
#
|
429
|
+
# Example:
|
430
|
+
# >> Debugger.get_soundcloud_url("https://w.soundcloud.com/player/?url=http%3A%2F%2Fapi.soundcloud.com%2Ftracks%2F59422468")
|
431
|
+
# => http://api.soundcloud.com/tracks/59422468
|
432
|
+
#
|
433
|
+
# Arguments:
|
434
|
+
# url: (String)
|
435
|
+
# Extract the track/playlist URL from a Soundcloud player embed URL.
#
# url - String URL of the embed; the target is carried in its "url" query
#       parameter.
#
# Returns the decoded target URL when it is a Soundcloud API URL, otherwise
# nil. URLs without a query string or without a "url" parameter return nil
# quietly; input that cannot be parsed is reported on stderr.
def self.get_soundcloud_url(url)
  query = URI.parse(url).query
  return nil if query.nil?

  # Look the parameter up by its exact name. The previous prefix match on
  # the first three characters also accepted keys like "urlfoo".
  pair = URI.decode_www_form(query).assoc("url")
  return nil if pair.nil?

  target = pair[1]
  Debugger.soundcloud_url?(target) ? target : nil
rescue StandardError
  $stderr.puts "Bad URL - Soundcloud URL's don't cause errors so safe to assume it's not a Soundcloud link."
  nil
end
|
448
|
+
|
449
|
+
# Check if a string is a Soundcloud URL
|
450
|
+
#
|
451
|
+
# Example:
|
452
|
+
# >> Debugger.soundcloud_url?("http://api.soundcloud.com/tracks/59422468")
|
453
|
+
# => true
|
454
|
+
#
|
455
|
+
# Arguments:
|
456
|
+
# url: (String)
|
457
|
+
# Whether +url+ mentions the Soundcloud API host.
def self.soundcloud_url?(url)
  !url.index("api.soundcloud.com").nil?
end
|
460
|
+
|
461
|
+
# Check if a url is a valid url
|
462
|
+
#
|
463
|
+
# Example:
|
464
|
+
# >> Debugger.valid_url?("http://wearepandr.com")
|
465
|
+
# => true
|
466
|
+
#
|
467
|
+
# Arguments:
|
468
|
+
# url: (String)
|
469
|
+
# Whether +url+ as a whole is a parsable URL with a scheme.
#
# The entire string must parse. The previous check searched for a URL
# anywhere inside the string with the obsolete URI.regexp, so text that
# merely contained a URL passed. Returns true or false (false for nil).
def self.valid_url?(url)
  return false if url.nil?

  !URI.parse(url).scheme.nil?
rescue URI::InvalidURIError
  false
end
|
472
|
+
end
|
473
|
+
end
|
@@ -0,0 +1,195 @@
|
|
1
|
+
require './lib/debugher'
|
2
|
+
require 'test/unit'
|
3
|
+
require 'rack/test'
|
4
|
+
|
5
|
+
ENV['RACK_ENV'] = 'test'
|
6
|
+
|
7
|
+
# Tests for Debugher::Debugger.
#
# The instance-method tests construct Debugger objects for public web
# sites. Assertions use Test::Unit's (expected, actual) argument order so
# that failure messages label the values correctly, and nil results are
# checked with assert_nil.
class DebugherTest < Test::Unit::TestCase
  include Rack::Test::Methods
  include Debugher

  def test_initialize
    page = Debugger.new("http://wearepandr.com/")

    assert_equal "http://wearepandr.com/", page.url
  end

  def test_rss_feed_url
    page = Debugger.new("http://funtofunky.wordpress.com/")
    assert_equal "http://funtofunky.wordpress.com/feed/", page.rss_feed_url

    page = Debugger.new("http://blog.iso50.com/")
    assert_equal "http://blog.iso50.com/feed/", page.rss_feed_url
  end

  def test_atom_feed_url
    page = Debugger.new("http://wearepandr.com/")
    assert_equal "http://wearepandr.com/feed", page.atom_feed_url

    page = Debugger.new("http://thefourohfive.com/")
    assert_equal "http://thefourohfive.com/feed", page.atom_feed_url
  end

  def test_feed_url
    # Atom Feed
    page = Debugger.new("http://wearepandr.com/")
    assert_equal "http://wearepandr.com/feed", page.feed_url

    # RSS Feed
    page = Debugger.new("http://funtofunky.wordpress.com")
    assert_equal "http://funtofunky.wordpress.com/feed/", page.feed_url
  end

  def test_scrape_info
    page = Debugger.new("http://rakkit.com/about")
    scrape_info = page.scrape_info

    assert_equal '200 OK', scrape_info[:response_code]
    assert_equal 'http://rakkit.com/about', scrape_info[:fetched_url]
    assert_equal 'http://rakkit.com/', scrape_info[:canonical_url]
    assert_nil scrape_info[:feed_url]
  end

  def test_meta_data
    page = Debugger.new("http://rakkit.com")
    meta = page.meta_data

    assert_equal 'The latest new music from websites, artists and labels you love | Rakkit', meta[:title]
    assert_equal 'The Social link between new music and the fans.', meta[:description]
  end

  def test_music_from_feed
    page = Debugger.new("http://blog.iso50.com")
    music_links = page.music_from_feed

    assert music_links.kind_of?(Array)
  end

  def test_music_from_html
    page = Debugger.new("http://blog.iso50.com")
    music_links = page.music_from_html

    assert music_links.kind_of?(Array)
  end

  def test_music_from_soundcloud
    page = Debugger.new("http://funtofunky.wordpress.com/")
    music_links = page.music_from_soundcloud

    assert music_links.kind_of?(Array)
  end

  # Despite its name, this exercises internal_links.
  def test_page_links
    page = Debugger.new("http://funtofunky.wordpress.com/")
    internal_links = page.internal_links

    assert internal_links.kind_of?(Array)
  end

  def test_valid_url?
    assert_equal true, Debugger.valid_url?("http://funtofunky.wordpress.com/")
    assert_equal false, Debugger.valid_url?("blah blah blah")
  end

  def test_host
    page = Debugger.new("http://funtofunky.wordpress.com/")
    assert_equal "funtofunky.wordpress.com", page.host
  end

  def test_content_type
    page = Debugger.new("http://wearepandr.com")
    assert_equal "text/html", page.content_type
  end

  def test_charset
    page = Debugger.new("http://wearepandr.com")
    assert_equal "utf-8", page.charset
  end

  def test_content_encoding
    # Need to find better examples of this
    page = Debugger.new("http://wearepandr.com")
    assert_equal [], page.content_encoding
  end

  def test_last_modified
    # Need to find better examples of this
    page = Debugger.new("http://wearepandr.com")
    assert_nil page.last_modified
  end

  # Self Methods
  #
  def test_user_agent
    assert_equal "Rakkit/V#{Debugher::VERSION}", Debugger.user_agent
    assert_equal "PANDR/V#{Debugher::VERSION}", Debugger.user_agent("PANDR")
  end

  def test_version
    # Enough of a test that we're getting the Version #
    assert_equal "V#{Debugher::VERSION}", Debugger.version
  end

  def test_mail_to_link?
    assert_equal false, Debugger.mailto_link?("http://wearepandr.com")
    assert_equal true, Debugger.mailto_link?("mailto:pete@wearepandr.com")
  end

  def test_relative?
    assert_equal true, Debugger.relative?("/")
    assert_equal true, Debugger.relative?("/about")
    assert_equal false, Debugger.relative?("http://wearepandr.com")
    assert_equal false, Debugger.relative?("http://wearepandr.com/")
    assert_equal false, Debugger.relative?("http://staff.wearepandr.com")
  end

  def test_make_absolute
    absolute = Debugger.make_absolute("/about", "http://blog.iso50.com")
    assert_equal "http://blog.iso50.com/about", absolute

    absolute = Debugger.make_absolute("/about", "http://blog.iso50.com/")
    assert_equal "http://blog.iso50.com/about", absolute
  end

  def test_get_soundcloud_url
    soundcloud_embed = "https://w.soundcloud.com/player/?url=http%3A%2F%2Fapi.soundcloud.com%2Ftracks%2F59422468"
    assert_equal "http://api.soundcloud.com/tracks/59422468", Debugger.get_soundcloud_url(soundcloud_embed)

    soundcloud_embed = "https://w.soundcloud.com/player/?url=http%3A%2F%2Fapi.soundcloud.com%2Fplaylists%2F2153957"
    assert_equal "http://api.soundcloud.com/playlists/2153957", Debugger.get_soundcloud_url(soundcloud_embed)

    soundcloud_embed = "http://wearepandr.com"
    assert_nil Debugger.get_soundcloud_url(soundcloud_embed)
  end

  def test_soundcloud_url?
    assert_equal false, Debugger.soundcloud_url?("http://wearepandr.com")
    assert_equal true, Debugger.soundcloud_url?("http://api.soundcloud.com/playlists/2153957")

    # A further addition to the method could be to test that there
    # is a unique id on the end of the url.
  end
end
|
metadata
ADDED
@@ -0,0 +1,108 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: debugher
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Peter Roome
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-09-16 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: nokogiri
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: addressable
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
type: :runtime
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: robots
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
type: :runtime
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
description: ! "\n A handy set of methods for getting various
|
63
|
+
bits of information about a web page.\n This is used by
|
64
|
+
the Rakkit Debugger to output what information we can gather about various pages
|
65
|
+
on an adhoc basis.\n The library is also used by the Rakkit
|
66
|
+
spider to process and index pages across the web.\n "
|
67
|
+
email:
|
68
|
+
- pete@wearepandr.com
|
69
|
+
executables: []
|
70
|
+
extensions: []
|
71
|
+
extra_rdoc_files: []
|
72
|
+
files:
|
73
|
+
- .gitignore
|
74
|
+
- Gemfile
|
75
|
+
- LICENSE
|
76
|
+
- README.md
|
77
|
+
- README.rdoc
|
78
|
+
- Rakefile
|
79
|
+
- debugher.gemspec
|
80
|
+
- lib/debugher.rb
|
81
|
+
- lib/debugher/version.rb
|
82
|
+
- test/test_debugher.rb
|
83
|
+
homepage: ''
|
84
|
+
licenses: []
|
85
|
+
post_install_message:
|
86
|
+
rdoc_options: []
|
87
|
+
require_paths:
|
88
|
+
- lib
|
89
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
90
|
+
none: false
|
91
|
+
requirements:
|
92
|
+
- - ! '>='
|
93
|
+
- !ruby/object:Gem::Version
|
94
|
+
version: '0'
|
95
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
96
|
+
none: false
|
97
|
+
requirements:
|
98
|
+
- - ! '>='
|
99
|
+
- !ruby/object:Gem::Version
|
100
|
+
version: '0'
|
101
|
+
requirements: []
|
102
|
+
rubyforge_project:
|
103
|
+
rubygems_version: 1.8.19
|
104
|
+
signing_key:
|
105
|
+
specification_version: 3
|
106
|
+
summary: Methods for the Rakkit Debugger.
|
107
|
+
test_files:
|
108
|
+
- test/test_debugher.rb
|