debugher 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +18 -0
- data/Gemfile +14 -0
- data/LICENSE +22 -0
- data/README.md +32 -0
- data/README.rdoc +0 -0
- data/Rakefile +18 -0
- data/debugher.gemspec +25 -0
- data/lib/debugher.rb +473 -0
- data/lib/debugher/version.rb +3 -0
- data/test/test_debugher.rb +195 -0
- metadata +108 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
source 'https://rubygems.org'
|
2
|
+
|
3
|
+
# Specify your gem's dependencies in debugher.gemspec
|
4
|
+
gemspec
|
5
|
+
|
6
|
+
gem 'nokogiri'
|
7
|
+
gem "addressable"
|
8
|
+
gem 'robots'
|
9
|
+
|
10
|
+
group :development, :test do
|
11
|
+
gem 'rspec'
|
12
|
+
gem 'rack-test'
|
13
|
+
gem 'simplecov', :require => false
|
14
|
+
end
|
data/LICENSE
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2012 Peter Roome
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
# Debugher
|
2
|
+
|
3
|
+
A handy set of methods for getting various bits of information about a web page.
|
4
|
+
|
5
|
+
This is used by the Rakkit Debugger to output what information we can gather about various pages on an adhoc basis.
|
6
|
+
The library is also used by the Rakkit spider to process and index pages across the web.
|
7
|
+
|
8
|
+
## Installation
|
9
|
+
|
10
|
+
Add this line to your application's Gemfile:
|
11
|
+
|
12
|
+
gem 'debugher'
|
13
|
+
|
14
|
+
And then execute:
|
15
|
+
|
16
|
+
$ bundle
|
17
|
+
|
18
|
+
Or install it yourself as:
|
19
|
+
|
20
|
+
$ gem install debugher
|
21
|
+
|
22
|
+
## Usage
|
23
|
+
|
24
|
+
TODO: Write usage instructions here
|
25
|
+
|
26
|
+
## Contributing
|
27
|
+
|
28
|
+
1. Fork it
|
29
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
30
|
+
3. Commit your changes (`git commit -am 'Added some feature'`)
|
31
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
32
|
+
5. Create new Pull Request
|
data/README.rdoc
ADDED
File without changes
|
data/Rakefile
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
#!/usr/bin/env rake
|
2
|
+
require "bundler/gem_tasks"
|
3
|
+
require 'rake/testtask'
|
4
|
+
require 'rdoc/task'
|
5
|
+
|
6
|
+
Rake::TestTask.new do |t|
|
7
|
+
t.libs << 'test'
|
8
|
+
end
|
9
|
+
|
10
|
+
desc "Run tests"
|
11
|
+
task :default => :test
|
12
|
+
|
13
|
+
Rake::RDocTask.new do |rd|
|
14
|
+
rd.main = "README.rdoc"
|
15
|
+
rd.rdoc_files.include("README.rdoc", "lib/**/*.rb")
|
16
|
+
end
|
17
|
+
desc "Generate documentation"
|
18
|
+
task :rdoc
|
data/debugher.gemspec
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
require File.expand_path('../lib/debugher/version', __FILE__)
|
3
|
+
|
4
|
+
Gem::Specification.new do |gem|
|
5
|
+
gem.authors = ["Peter Roome"]
|
6
|
+
gem.email = ["pete@wearepandr.com"]
|
7
|
+
gem.description = %q{
|
8
|
+
A handy set of methods for getting various bits of information about a web page.
|
9
|
+
This is used by the Rakkit Debugger to output what information we can gather about various pages on an adhoc basis.
|
10
|
+
The library is also used by the Rakkit spider to process and index pages across the web.
|
11
|
+
}
|
12
|
+
gem.summary = %q{Methods for the Rakkit Debugger.}
|
13
|
+
gem.homepage = ""
|
14
|
+
|
15
|
+
gem.files = `git ls-files`.split($\)
|
16
|
+
gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
|
17
|
+
gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
|
18
|
+
gem.name = "debugher"
|
19
|
+
gem.require_paths = ["lib"]
|
20
|
+
gem.version = Debugher::VERSION
|
21
|
+
|
22
|
+
gem.add_dependency 'nokogiri'
|
23
|
+
gem.add_dependency "addressable"
|
24
|
+
gem.add_dependency 'robots'
|
25
|
+
end
|
data/lib/debugher.rb
ADDED
@@ -0,0 +1,473 @@
|
|
1
|
+
require "debugher/version"
|
2
|
+
|
3
|
+
module Debugher
|
4
|
+
require 'robots'
|
5
|
+
require 'nokogiri'
|
6
|
+
require 'addressable/uri'
|
7
|
+
require 'cgi'
|
8
|
+
|
9
|
+
class Debugger
|
10
|
+
FILE_TYPES = ['.mp3', '.m4a', '.MP3']
|
11
|
+
attr_accessor :url
|
12
|
+
|
13
|
+
# pass a url as a string to initialize
|
14
|
+
def initialize(url)
  # Disable stdout buffering (note: this is a process-wide setting).
  $stdout.sync = true
  # URI::HTTPS is a subclass of URI::HTTP, so is_a? accepts both schemes.
  # An exact class comparison would treat "https://..." as scheme-less and
  # produce "http://https://...". Anything else gets an http:// prefix.
  @url = URI.parse(url).is_a?(URI::HTTP) ? url : "http://#{url}"
  @uri = URI.parse(@url)
  # Fetch the page up front; open_url returns nil when the fetch is not
  # allowed or fails.
  @opened_url = open_url
end
|
21
|
+
|
22
|
+
def open_url
  # Fetches @uri with the crawler's identifying headers.
  # Returns the opened handle, or nil when robots.txt disallows the URL or
  # the request raises.
  ua = Debugger.user_agent
  @robot = Robots.new(ua)
  return nil unless @robot.allowed?(@uri)

  begin
    open(@uri,
         "User-Agent" => ua,
         "From" => "hello@rakkit.com",
         "Referer" => "http://rakkit.com")
  rescue StandardError => e
    # StandardError (not Exception) so that interrupts and exit requests
    # still propagate. Most likely a 404 error.
    $stderr.puts "Unable to open url: #{url} - #{e}"
    nil
  end
end
|
39
|
+
|
40
|
+
# Get the response code of the page
|
41
|
+
#
|
42
|
+
# Example:
|
43
|
+
# >> Debugger.new("http://rakkit.com").response_code
|
44
|
+
# => 200 OK
|
45
|
+
def response_code
  # @opened_url is nil when open_url could not (or was not allowed to)
  # fetch the page; report nil instead of raising NoMethodError.
  return nil if @opened_url.nil?

  # status is a [code, message] pair, e.g. ["200", "OK"] => "200 OK".
  @opened_url.status.join(" ")
end
|
48
|
+
|
49
|
+
# Return the fetched URL
|
50
|
+
#
|
51
|
+
# Example:
|
52
|
+
# >> Debugger.new("rakkit.com").fetched_url
|
53
|
+
# => http://rakkit.com
|
54
|
+
def fetched_url
  # String form of the URI this instance was built for.
  fetched = @uri
  fetched.to_s
end
|
57
|
+
|
58
|
+
# Get the canonical url of the page
|
59
|
+
#
|
60
|
+
# Example:
|
61
|
+
# >> Debugger.new("http://rakkit.com").canonical_url
|
62
|
+
# => http://rakkit.com/
|
63
|
+
def canonical_url
  # Returns the site root ("scheme://host[:port]/") of the fetched URL as a
  # String, or nil if it cannot be built.
  #
  # Work on a copy: assigning path/query on @uri itself would rewrite the
  # URL that fetched_url reports and that later fetches use.
  root = @uri.dup
  root.path = ''
  root.query = nil
  (root + "/").to_s
rescue StandardError => e
  puts "CANONICAL ERROR: #{e}"
  puts @uri.inspect.to_s
  nil
end
|
75
|
+
|
76
|
+
# loads the Nokogiri HTML document if it hasn't already been loaded
|
77
|
+
def page
  # Parse the fetched response with Nokogiri once and reuse the document.
  return @page if @page

  @page = Nokogiri::HTML(@opened_url)
end
|
80
|
+
|
81
|
+
# Get the RSS Feed URL
|
82
|
+
#
|
83
|
+
# Example:
|
84
|
+
# >> Debugger.new("http://wearepandr.com").rss_feed_url
|
85
|
+
# => http://wearepandr.com/feed
|
86
|
+
def rss_feed_url
  # href of the first RSS <link> on the page, made absolute when relative.
  # Returns "" when the page declares no RSS feed.
  rss_links = page.search("link[@type='application/rss+xml']")
  href = nil
  href = rss_links.first['href'] unless rss_links.length == 0

  if Debugger.relative?(href)
    href = Debugger.stitch_to_make_absolute(canonical_url, href)
  end
  href.to_s
end
|
93
|
+
|
94
|
+
# Get the Atom Feed URL
|
95
|
+
#
|
96
|
+
# Example:
|
97
|
+
# >> Debugger.new("http://wearepandr.com").atom_feed_url
|
98
|
+
# => http://wearepandr.com/feed
|
99
|
+
def atom_feed_url
  # href of the first Atom <link> on the page, made absolute when relative.
  # Returns "" when the page declares no Atom feed.
  atom_links = page.search("link[@type='application/atom+xml']")
  href = nil
  href = atom_links.first['href'] unless atom_links.length == 0

  if Debugger.relative?(href)
    href = Debugger.stitch_to_make_absolute(canonical_url, href)
  end
  href.to_s
end
|
106
|
+
|
107
|
+
# Get the FEED URL, no matter if it's the Atom URL or the RSS URL
|
108
|
+
#
|
109
|
+
# Example:
|
110
|
+
# >> Debugger.new("http://wearepandr.com").feed_url
|
111
|
+
# => http://wearepandr.com/feed
|
112
|
+
def feed_url
  # Returns the RSS feed URL when the page has one, otherwise the Atom feed
  # URL, otherwise nil.
  #
  # Each *_feed_url call re-queries the parsed page, so look each one up at
  # most once. Both already return absolute URLs (or ""), so no further
  # relative-URL handling is needed here.
  rss = rss_feed_url
  return rss unless rss == ''

  atom = atom_feed_url
  atom == '' ? nil : atom
end
|
126
|
+
|
127
|
+
# Return some meta info about the page
|
128
|
+
#
|
129
|
+
# Example:
|
130
|
+
# >> Debugger.new("http://wearepandr.com").scrape_info
|
131
|
+
# => {:response_code => "200 OK",
|
132
|
+
# :fetched_url => "http://wearepandr.com",
|
133
|
+
# :canonical_url => "http://wearepandr.com/",
|
134
|
+
# :feed_url => "http://wearepandr.com/feed"}
|
135
|
+
def scrape_info
  # Collect the fetch summary into a hash, evaluating the readers in the
  # same order as the keys appear.
  info = {}
  info[:response_code] = response_code
  info[:fetched_url]   = fetched_url
  info[:canonical_url] = canonical_url
  info[:feed_url]      = feed_url
  info
end
|
141
|
+
|
142
|
+
# Get the page title
|
143
|
+
#
|
144
|
+
# Example:
|
145
|
+
# >> Debugger.new("http://wearepandr.com").title
|
146
|
+
# => Web Design Norwich and Norwich Ruby on Rails Web Development in Norfolk | PANDR
|
147
|
+
def title
  # Returns the stripped contents of the first <title> element, or nil when
  # the page has no <title> or it is blank.
  node = page.css('title').first
  # No <title> element at all: report nil rather than raising NoMethodError.
  return nil if node.nil?

  text = node.inner_html.strip
  text.empty? ? nil : text
end
|
152
|
+
|
153
|
+
# Get the page description
|
154
|
+
#
|
155
|
+
# Example:
|
156
|
+
# >> Debugger.new("http://wearepandr.com").description
|
157
|
+
# => A custom Web Design Norwich and Norwich Ruby on Rails Web Development agency based in Norfolk, UK
|
158
|
+
def description
  # Content of the description <meta> tag, stripped; nil when blank/absent.
  content = page.css("meta[name='description']/@content").inner_html
  trimmed = content.strip
  return nil if trimmed == ''

  trimmed
end
|
163
|
+
|
164
|
+
# Get the page meta data in a hash, title and description.
|
165
|
+
#
|
166
|
+
# Example:
|
167
|
+
# >> Debugger.new("http://wearepandr.com").meta_data
|
168
|
+
# => {:title => "Web Design Norwich and Norwich Ruby on Rails Web Development in Norfolk | PANDR",
|
169
|
+
# :description => "A custom Web Design Norwich and Norwich Ruby on Rails Web Development agency based in Norfolk, UK"}
|
170
|
+
def meta_data
  # Bundle the page title and description into one hash.
  data = {}
  data[:title] = title
  data[:description] = description
  data
end
|
174
|
+
|
175
|
+
# Get the music links from the feed found on the page
|
176
|
+
#
|
177
|
+
# Example:
|
178
|
+
# >> Debugger.new("http://wearepandr.com").music_from_feed
|
179
|
+
# => ["http://wearepandr.com/track_1.mp3", "http://wearepandr.com/track_2.mp3", "http://wearepandr.com/track_3.mp3"]
|
180
|
+
#
|
181
|
+
# Arguments:
|
182
|
+
# file_types: [Array]
|
183
|
+
def music_from_feed(file_types=FILE_TYPES)
  # Returns the unique enclosure URLs in the page's feed whose last four
  # characters are listed in file_types; [] when the page has no feed.
  feed_location = feed_url
  return [] if feed_location.nil?

  @feed ||= Nokogiri::XML(open(feed_location))
  @feed.encoding = 'utf-8'

  # "//channel/item/enclosure" is an absolute XPath, so it matches every
  # enclosure in the document no matter which node it is run from. Running
  # it once is equivalent to running it per item, without the repeated
  # scans. Feeds without <channel> simply yield no matches.
  urls = @feed.search("//channel/item/enclosure").map { |enclosure| enclosure['url'] }
  urls.select { |url| file_types.include?(url.to_s[-4, 4]) }.uniq.compact
end
|
206
|
+
|
207
|
+
# Get the music links from the page html
|
208
|
+
#
|
209
|
+
# Example:
|
210
|
+
# >> Debugger.new("http://wearepandr.com").music_from_html
|
211
|
+
# => ["http://wearepandr.com/track_1.mp3", "http://wearepandr.com/track_2.mp3", "http://wearepandr.com/track_3.mp3"]
|
212
|
+
#
|
213
|
+
# Arguments:
|
214
|
+
# file_types: [Array]
|
215
|
+
def music_from_html(file_types=FILE_TYPES)
  # Unique hrefs of page anchors whose last four characters are listed in
  # file_types.
  hrefs = page_links.map { |anchor| anchor['href'] }
  matching = hrefs.select { |href| file_types.include?(href.to_s[-4, 4]) }
  matching.uniq.compact
end
|
228
|
+
|
229
|
+
# Get the soundcloud music links from the page html
|
230
|
+
#
|
231
|
+
# Example:
|
232
|
+
# >> Debugger.new("http://wearepandr.com").music_from_soundcloud
|
233
|
+
# => ["http://api.soundcloud.com/playlists/2153957", "http://api.soundcloud.com/playlists/2153958"]
|
234
|
+
def music_from_soundcloud
  # Unique Soundcloud API URLs extracted from the page's <iframe>/<param>
  # embeds; embeds that are not Soundcloud players contribute nothing.
  @html_url ||= Nokogiri::HTML(open(@uri))
  embeds = @html_url.search("//iframe", "//param")
  found = embeds.map do |embed|
    Debugger.get_soundcloud_url(embed["src"] || embed["value"])
  end
  found.uniq.compact
end
|
244
|
+
|
245
|
+
# Get the internal page links from the page
|
246
|
+
#
|
247
|
+
# Example:
|
248
|
+
# >> Debugger.new("http://wearepandr.com").internal_links
|
249
|
+
# => ["http://wearepandr.com/about", "http://wearepandr.com/blog"]
|
250
|
+
def internal_links
  # Returns the unique absolute URLs of anchors on the page that point at
  # the same host as the fetched URL. Fragments are stripped; blank and
  # mailto links are ignored.
  current_host = @uri.host
  anchors = page_links
  # Site root used to resolve relative hrefs. Merging "/" onto @uri returns
  # a new URI and leaves @uri untouched.
  base = (@uri + "/").to_s
  links = []

  anchors.each do |anchor|
    href = anchor['href']
    next if href.nil?

    # Remove anchors from links
    target = href.split("#")[0]
    next if target.nil? || target.strip.empty? || Debugger.mailto_link?(target)

    # Resolve relative links against the site root here. Calling
    # Debugger.make_absolute without a base URL cannot resolve them, which
    # silently dropped every relative link.
    target = Debugger.stitch_to_make_absolute(base, target) if Debugger.relative?(target)

    begin
      # Check to see if the URL is still from the current site
      links << target if current_host == Addressable::URI.parse(target).host
    rescue Addressable::URI::InvalidURIError
      # A single malformed href should not abort the whole scan.
      next
    end
  end

  links.uniq.compact
end
|
278
|
+
|
279
|
+
# Get all the links from the page
|
280
|
+
#
|
281
|
+
# Example:
|
282
|
+
# >> Debugger.new("http://wearepandr.com").page_links
|
283
|
+
# => ["http://wearepandr.com/about", "http://google.com", "http://yahoo.com"]
|
284
|
+
def page_links
  # All <a> elements of the page; the document is fetched and parsed on
  # first use and cached in @html_url.
  @html_url = Nokogiri::HTML(open(@uri)) unless @html_url
  @html_url.search("//a")
end
|
290
|
+
|
291
|
+
# Get the host of the page
|
292
|
+
#
|
293
|
+
# Example:
|
294
|
+
# >> Debugger.new("http://wearepandr.com").host
|
295
|
+
# => wearepandr.com
|
296
|
+
def host
  # Host component of the fetched URL, as parsed by Addressable.
  parsed = Addressable::URI.parse(@uri)
  parsed.host
end
|
299
|
+
|
300
|
+
# Get the pages content type
|
301
|
+
#
|
302
|
+
# Example:
|
303
|
+
# >> Debugger.new("http://wearepandr.com").content_type
|
304
|
+
# => text/html
|
305
|
+
def content_type
  # @opened_url is nil when the page could not be fetched; report nil
  # instead of raising NoMethodError.
  return nil if @opened_url.nil?

  @opened_url.content_type
end
|
308
|
+
|
309
|
+
# Get the pages charset
|
310
|
+
#
|
311
|
+
# Example:
|
312
|
+
# >> Debugger.new("http://wearepandr.com").charset
|
313
|
+
# => utf-8
|
314
|
+
def charset
  # @opened_url is nil when the page could not be fetched; report nil
  # instead of raising NoMethodError.
  return nil if @opened_url.nil?

  @opened_url.charset
end
|
317
|
+
|
318
|
+
# Get the pages content encoding
|
319
|
+
#
|
320
|
+
# Example:
|
321
|
+
# >> Debugger.new("http://wearepandr.com").content_encoding
|
322
|
+
# => []
|
323
|
+
def content_encoding
  # @opened_url is nil when the page could not be fetched; report nil
  # instead of raising NoMethodError.
  return nil if @opened_url.nil?

  @opened_url.content_encoding
end
|
326
|
+
|
327
|
+
# Get the pages last modified date
|
328
|
+
#
|
329
|
+
# Example:
|
330
|
+
# >> Debugger.new("http://wearepandr.com").last_modified
|
331
|
+
# =>
|
332
|
+
def last_modified
  # @opened_url is nil when the page could not be fetched; report nil
  # instead of raising NoMethodError.
  return nil if @opened_url.nil?

  @opened_url.last_modified
end
|
335
|
+
|
336
|
+
# Get the user agent
|
337
|
+
#
|
338
|
+
# Example:
|
339
|
+
# >> Debugger.user_agent("PANDR")
|
340
|
+
# => PANDR/V0.1
|
341
|
+
#
|
342
|
+
# Arguments:
|
343
|
+
# ua: (String)
|
344
|
+
def self.user_agent(ua="Rakkit")
  # "<ua>/V<gem version>", e.g. "Rakkit/V0.0.1".
  format("%s/V%s", ua, Debugher::VERSION)
end
|
347
|
+
|
348
|
+
# Get the current version
|
349
|
+
#
|
350
|
+
# Example:
|
351
|
+
# >> Debugger.version
|
352
|
+
# => V0.1
|
353
|
+
def self.version
  # Gem version prefixed with "V".
  ["V", Debugher::VERSION].join
end
|
356
|
+
|
357
|
+
# Check if a URL is relative or not
|
358
|
+
#
|
359
|
+
# Example:
|
360
|
+
# >> Debugger.relative?("http://wearepandr.com")
|
361
|
+
# => false
|
362
|
+
#
|
363
|
+
# Arguments:
|
364
|
+
# url: (String)
|
365
|
+
def self.relative?(url)
  # True when Addressable parses url as a relative reference. Anything
  # that cannot be parsed (including nil) counts as not relative.
  #
  # The parsed URI is kept local: storing it in a class-level instance
  # variable shared state between unrelated calls for no benefit.
  Addressable::URI.parse(url).relative?
rescue StandardError
  false
end
|
373
|
+
|
374
|
+
# Make a URL absolute
|
375
|
+
#
|
376
|
+
# Example:
|
377
|
+
# >> Debugger.make_absolute("/about", "http://wearepandr.com")
|
378
|
+
# => http://wearepandr.com/about
|
379
|
+
#
|
380
|
+
# Arguments:
|
381
|
+
# url: (String)
|
382
|
+
# base_url: (String)
|
383
|
+
def self.make_absolute(url, base_url=nil)
  # Returns url unchanged when it is already absolute. Otherwise stitches it
  # onto the site root of base_url. Returns nil (after logging) when the
  # URL cannot be made absolute, including when no base_url is given.
  return url unless Debugger.relative?(url)

  begin
    # There is no instance here to take a canonical URL from, so a relative
    # URL without a base cannot be resolved.
    raise ArgumentError, "no base URL given" if base_url.nil?

    # Derive the site root straight from the base URL string instead of
    # building a Debugger, which would fetch the page over the network.
    # Scheme-less bases get an http:// prefix, as in Debugger#initialize.
    base = URI.parse(base_url)
    base = URI.parse("http://#{base_url}") unless base.is_a?(URI::HTTP)
    base.path = ''
    base.query = nil

    Debugger.stitch_to_make_absolute((base + "/").to_s, url)
  rescue StandardError => e
    # Log before discarding the URL so the message names the offender.
    $stderr.puts "Debugger Error: #{url} - #{e}"
    puts "ERROR: Could not make this URL absolute. Set to nil."
    nil
  end
end
|
401
|
+
|
402
|
+
# Stitch two strings together to make a single absolute url
|
403
|
+
#
|
404
|
+
# Example:
|
405
|
+
# >> Debugger.stitch_to_make_absolute("http://wearepandr.com/", "/about")
|
406
|
+
# => http://wearepandr.com/about
|
407
|
+
#
|
408
|
+
# Arguments:
|
409
|
+
# canonical_url: (String)
|
410
|
+
# path: (String)
|
411
|
+
def self.stitch_to_make_absolute(canonical_url, path)
  # Joins a site root and a path into one URL with exactly one "/" between
  # them. An empty path returns the root without its trailing slash.
  base = canonical_url.chomp("/")
  return base + path if path.empty? || path.start_with?("/")

  # A path such as "feed" would otherwise be glued onto the host
  # ("http://a.comfeed"); supply the missing separator.
  "#{base}/#{path}"
end
|
414
|
+
|
415
|
+
# Check if a string is a mailto link
|
416
|
+
#
|
417
|
+
# Example:
|
418
|
+
# >> Debugger.mailto_link?("mailto:pete@wearepandr.com")
|
419
|
+
# => true
|
420
|
+
#
|
421
|
+
# Arguments:
|
422
|
+
# url: (String)
|
423
|
+
def self.mailto_link?(url)
  # True when url uses the mailto: scheme. The scheme is matched
  # case-insensitively and must include the colon, so relative paths that
  # merely begin with "mailto" (e.g. "mailtoday.html") are not mistaken
  # for mail links. Always returns true or false.
  !(url.to_s =~ /\Amailto:/i).nil?
end
|
426
|
+
|
427
|
+
# Extract the URL element of a soundcloud embed in order to grab the link to the track.
|
428
|
+
#
|
429
|
+
# Example:
|
430
|
+
# >> Debugger.get_soundcloud_url("https://w.soundcloud.com/player/?url=http%3A%2F%2Fapi.soundcloud.com%2Ftracks%2F59422468")
|
431
|
+
# => http://api.soundcloud.com/tracks/59422468
|
432
|
+
#
|
433
|
+
# Arguments:
|
434
|
+
# url: (String)
|
435
|
+
def self.get_soundcloud_url(url)
  # Extracts the decoded value of the "url" query parameter from a
  # Soundcloud player embed URL. Returns it when it is a Soundcloud API URL,
  # otherwise nil.
  #
  # Embeds with no URL, no query string or no "url" parameter are ordinary
  # non-Soundcloud embeds, so they return nil quietly instead of going
  # through an exception.
  return nil if url.nil?

  query = URI.parse(url).query
  return nil if query.nil?

  # Match the parameter name exactly ("url="), not any name that merely
  # starts with "url".
  pair = query.split("&").find { |param| param.start_with?("url=") }
  return nil if pair.nil?

  candidate = CGI.unescape(pair[4..-1])
  Debugger.soundcloud_url?(candidate) ? candidate : nil
rescue URI::InvalidURIError
  $stderr.puts "Bad URL - Soundcloud URL's don't cause errors so safe to assume it's not a Soundcloud link."
  nil
end
|
448
|
+
|
449
|
+
# Check if a string is a Soundcloud URL
|
450
|
+
#
|
451
|
+
# Example:
|
452
|
+
# >> Debugger.soundcloud_url?("http://api.soundcloud.com/tracks/59422468")
|
453
|
+
# => true
|
454
|
+
#
|
455
|
+
# Arguments:
|
456
|
+
# url: (String)
|
457
|
+
def self.soundcloud_url?(url)
  # True when the string mentions the Soundcloud API host anywhere.
  !url.index("api.soundcloud.com").nil?
end
|
460
|
+
|
461
|
+
# Check if a url is a valid url
|
462
|
+
#
|
463
|
+
# Example:
|
464
|
+
# >> Debugger.valid_url?("http://wearepandr.com")
|
465
|
+
# => true
|
466
|
+
#
|
467
|
+
# Arguments:
|
468
|
+
# url: (String)
|
469
|
+
def self.valid_url?(url)
  # True when the whole string is a URI. The pattern from URI::regexp is
  # unanchored, so on its own it accepts any text that merely contains a
  # URI (e.g. "see http://x.com now"). Anchor it to the full string.
  !(url =~ /\A#{URI::regexp}\z/).nil?
end
|
472
|
+
end
|
473
|
+
end
|
@@ -0,0 +1,195 @@
|
|
1
|
+
require './lib/debugher'
|
2
|
+
require 'test/unit'
|
3
|
+
require 'rack/test'
|
4
|
+
|
5
|
+
ENV['RACK_ENV'] = 'test'
|
6
|
+
|
7
|
+
class DebugherTest < Test::Unit::TestCase
|
8
|
+
include Rack::Test::Methods
|
9
|
+
include Debugher
|
10
|
+
|
11
|
+
def test_initialize
|
12
|
+
@page = Debugger.new("http://wearepandr.com/")
|
13
|
+
|
14
|
+
assert_equal @page.url, "http://wearepandr.com/"
|
15
|
+
end
|
16
|
+
|
17
|
+
def test_rss_feed_url
|
18
|
+
@page = Debugger.new("http://funtofunky.wordpress.com/")
|
19
|
+
assert_equal @page.rss_feed_url, "http://funtofunky.wordpress.com/feed/"
|
20
|
+
|
21
|
+
@page = Debugger.new("http://blog.iso50.com/")
|
22
|
+
assert_equal @page.rss_feed_url, "http://blog.iso50.com/feed/"
|
23
|
+
end
|
24
|
+
|
25
|
+
def test_atom_feed_url
|
26
|
+
@page = Debugger.new("http://wearepandr.com/")
|
27
|
+
assert_equal @page.atom_feed_url, "http://wearepandr.com/feed"
|
28
|
+
|
29
|
+
@page = Debugger.new("http://thefourohfive.com/")
|
30
|
+
assert_equal @page.atom_feed_url, "http://thefourohfive.com/feed"
|
31
|
+
end
|
32
|
+
|
33
|
+
def test_feed_url
|
34
|
+
# Atom Feed
|
35
|
+
@page = Debugger.new("http://wearepandr.com/")
|
36
|
+
assert_equal @page.feed_url, "http://wearepandr.com/feed"
|
37
|
+
|
38
|
+
# RSS Feed
|
39
|
+
@page = Debugger.new("http://funtofunky.wordpress.com")
|
40
|
+
assert_equal @page.feed_url, "http://funtofunky.wordpress.com/feed/"
|
41
|
+
end
|
42
|
+
|
43
|
+
def test_scrape_info
|
44
|
+
@page = Debugger.new("http://rakkit.com/about")
|
45
|
+
@scrape_info = @page.scrape_info
|
46
|
+
|
47
|
+
assert_equal '200 OK', @scrape_info[:response_code]
|
48
|
+
assert_equal 'http://rakkit.com/about', @scrape_info[:fetched_url]
|
49
|
+
assert_equal 'http://rakkit.com/', @scrape_info[:canonical_url]
|
50
|
+
assert_equal nil, @scrape_info[:feed_url]
|
51
|
+
end
|
52
|
+
|
53
|
+
def test_meta_data
|
54
|
+
@page = Debugger.new("http://rakkit.com")
|
55
|
+
@meta = @page.meta_data
|
56
|
+
|
57
|
+
assert_equal 'The latest new music from websites, artists and labels you love | Rakkit', @meta[:title]
|
58
|
+
assert_equal 'The Social link between new music and the fans.', @meta[:description]
|
59
|
+
end
|
60
|
+
|
61
|
+
def test_music_from_feed
|
62
|
+
@page = Debugger.new("http://blog.iso50.com")
|
63
|
+
@music_links = @page.music_from_feed
|
64
|
+
|
65
|
+
assert @music_links.kind_of?(Array)
|
66
|
+
end
|
67
|
+
|
68
|
+
def test_music_from_html
|
69
|
+
@page = Debugger.new("http://blog.iso50.com")
|
70
|
+
@music_links = @page.music_from_html
|
71
|
+
|
72
|
+
assert @music_links.kind_of?(Array)
|
73
|
+
end
|
74
|
+
|
75
|
+
def test_music_from_soundcloud
|
76
|
+
@page = Debugger.new("http://funtofunky.wordpress.com/")
|
77
|
+
@music_links = @page.music_from_soundcloud
|
78
|
+
|
79
|
+
assert @music_links.kind_of?(Array)
|
80
|
+
end
|
81
|
+
|
82
|
+
def test_page_links
|
83
|
+
@page = Debugger.new("http://funtofunky.wordpress.com/")
|
84
|
+
@internal_links = @page.internal_links
|
85
|
+
|
86
|
+
assert @internal_links.kind_of?(Array)
|
87
|
+
end
|
88
|
+
|
89
|
+
def test_valid_url?
|
90
|
+
@valid_url = Debugger.valid_url?("http://funtofunky.wordpress.com/")
|
91
|
+
assert_equal @valid_url, true
|
92
|
+
|
93
|
+
@valid_url = Debugger.valid_url?("blah blah blah")
|
94
|
+
assert_equal @valid_url, false
|
95
|
+
end
|
96
|
+
|
97
|
+
def test_host
|
98
|
+
@page = Debugger.new("http://funtofunky.wordpress.com/")
|
99
|
+
assert_equal @page.host, "funtofunky.wordpress.com"
|
100
|
+
end
|
101
|
+
|
102
|
+
def test_content_type
|
103
|
+
@page = Debugger.new("http://wearepandr.com")
|
104
|
+
assert_equal @page.content_type, "text/html"
|
105
|
+
end
|
106
|
+
|
107
|
+
def test_charset
|
108
|
+
@page = Debugger.new("http://wearepandr.com")
|
109
|
+
assert_equal @page.charset, "utf-8"
|
110
|
+
end
|
111
|
+
|
112
|
+
def test_content_encoding
|
113
|
+
# Need to find better examples of this
|
114
|
+
@page = Debugger.new("http://wearepandr.com")
|
115
|
+
assert_equal @page.content_encoding, []
|
116
|
+
end
|
117
|
+
|
118
|
+
def test_last_modified
|
119
|
+
# Need to find better examples of this
|
120
|
+
@page = Debugger.new("http://wearepandr.com")
|
121
|
+
assert_equal @page.last_modified, nil
|
122
|
+
end
|
123
|
+
|
124
|
+
# Self Methods
|
125
|
+
#
|
126
|
+
def test_user_agent
|
127
|
+
@ua = Debugger.user_agent
|
128
|
+
assert_equal @ua, "Rakkit/V#{Debugher::VERSION}"
|
129
|
+
|
130
|
+
@ua = Debugger.user_agent("PANDR")
|
131
|
+
assert_equal @ua, "PANDR/V#{Debugher::VERSION}"
|
132
|
+
end
|
133
|
+
|
134
|
+
def test_version
|
135
|
+
@version = Debugger.version
|
136
|
+
|
137
|
+
# Enough of a test that we're getting the Version #
|
138
|
+
assert_equal @version, "V#{Debugher::VERSION}"
|
139
|
+
end
|
140
|
+
|
141
|
+
def test_mail_to_link?
|
142
|
+
@url = "http://wearepandr.com"
|
143
|
+
assert_equal Debugger.mailto_link?(@url), false
|
144
|
+
|
145
|
+
@url = "mailto:pete@wearepandr.com"
|
146
|
+
assert_equal Debugger.mailto_link?(@url), true
|
147
|
+
end
|
148
|
+
|
149
|
+
def test_relative?
|
150
|
+
@url = "/"
|
151
|
+
assert_equal Debugger.relative?(@url), true
|
152
|
+
|
153
|
+
@url = "/about"
|
154
|
+
assert_equal Debugger.relative?(@url), true
|
155
|
+
|
156
|
+
@url = "http://wearepandr.com"
|
157
|
+
assert_equal Debugger.relative?(@url), false
|
158
|
+
|
159
|
+
@url = "http://wearepandr.com/"
|
160
|
+
assert_equal Debugger.relative?(@url), false
|
161
|
+
|
162
|
+
@url = "http://staff.wearepandr.com"
|
163
|
+
assert_equal Debugger.relative?(@url), false
|
164
|
+
end
|
165
|
+
|
166
|
+
def test_make_absolute
|
167
|
+
@absolute = Debugger.make_absolute("/about", "http://blog.iso50.com")
|
168
|
+
assert_equal @absolute, "http://blog.iso50.com/about"
|
169
|
+
|
170
|
+
@absolute = Debugger.make_absolute("/about", "http://blog.iso50.com/")
|
171
|
+
assert_equal @absolute, "http://blog.iso50.com/about"
|
172
|
+
end
|
173
|
+
|
174
|
+
def test_get_soundcloud_url
|
175
|
+
@soundcloud_embed = "https://w.soundcloud.com/player/?url=http%3A%2F%2Fapi.soundcloud.com%2Ftracks%2F59422468"
|
176
|
+
assert_equal Debugger.get_soundcloud_url(@soundcloud_embed), "http://api.soundcloud.com/tracks/59422468"
|
177
|
+
|
178
|
+
@soundcloud_embed = "https://w.soundcloud.com/player/?url=http%3A%2F%2Fapi.soundcloud.com%2Fplaylists%2F2153957"
|
179
|
+
assert_equal Debugger.get_soundcloud_url(@soundcloud_embed), "http://api.soundcloud.com/playlists/2153957"
|
180
|
+
|
181
|
+
@soundcloud_embed = "http://wearepandr.com"
|
182
|
+
assert_equal Debugger.get_soundcloud_url(@soundcloud_embed), nil
|
183
|
+
end
|
184
|
+
|
185
|
+
def test_soundcloud_url?
|
186
|
+
@url = "http://wearepandr.com"
|
187
|
+
assert_equal Debugger.soundcloud_url?(@url), false
|
188
|
+
|
189
|
+
@url = "http://api.soundcloud.com/playlists/2153957"
|
190
|
+
assert_equal Debugger.soundcloud_url?(@url), true
|
191
|
+
|
192
|
+
# A further addition to the method could be to test that there
|
193
|
+
# is a unique id on the end of the url.
|
194
|
+
end
|
195
|
+
end
|
metadata
ADDED
@@ -0,0 +1,108 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: debugher
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Peter Roome
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-09-16 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: nokogiri
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: addressable
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
type: :runtime
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: robots
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
type: :runtime
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
description: ! "\n A handy set of methods for getting various
|
63
|
+
bits of information about a web page.\n This is used by
|
64
|
+
the Rakkit Debugger to output what information we can gather about various pages
|
65
|
+
on an adhoc basis.\n The library is also used by the Rakkit
|
66
|
+
spider to process and index pages across the web.\n "
|
67
|
+
email:
|
68
|
+
- pete@wearepandr.com
|
69
|
+
executables: []
|
70
|
+
extensions: []
|
71
|
+
extra_rdoc_files: []
|
72
|
+
files:
|
73
|
+
- .gitignore
|
74
|
+
- Gemfile
|
75
|
+
- LICENSE
|
76
|
+
- README.md
|
77
|
+
- README.rdoc
|
78
|
+
- Rakefile
|
79
|
+
- debugher.gemspec
|
80
|
+
- lib/debugher.rb
|
81
|
+
- lib/debugher/version.rb
|
82
|
+
- test/test_debugher.rb
|
83
|
+
homepage: ''
|
84
|
+
licenses: []
|
85
|
+
post_install_message:
|
86
|
+
rdoc_options: []
|
87
|
+
require_paths:
|
88
|
+
- lib
|
89
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
90
|
+
none: false
|
91
|
+
requirements:
|
92
|
+
- - ! '>='
|
93
|
+
- !ruby/object:Gem::Version
|
94
|
+
version: '0'
|
95
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
96
|
+
none: false
|
97
|
+
requirements:
|
98
|
+
- - ! '>='
|
99
|
+
- !ruby/object:Gem::Version
|
100
|
+
version: '0'
|
101
|
+
requirements: []
|
102
|
+
rubyforge_project:
|
103
|
+
rubygems_version: 1.8.19
|
104
|
+
signing_key:
|
105
|
+
specification_version: 3
|
106
|
+
summary: Methods for the Rakkit Debugger.
|
107
|
+
test_files:
|
108
|
+
- test/test_debugher.rb
|