metainspector 1.16.1 → 1.17.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +17 -11
- data/lib/meta_inspector.rb +10 -3
- data/lib/meta_inspector/deprecations.rb +19 -0
- data/lib/meta_inspector/document.rb +81 -0
- data/lib/meta_inspector/exception_log.rb +29 -0
- data/lib/meta_inspector/exceptionable.rb +11 -0
- data/lib/meta_inspector/parser.rb +178 -0
- data/lib/meta_inspector/request.rb +55 -0
- data/lib/meta_inspector/url.rb +76 -0
- data/lib/meta_inspector/version.rb +1 -1
- data/spec/document_spec.rb +97 -0
- data/spec/exception_log_spec.rb +59 -0
- data/spec/meta_inspector_spec.rb +9 -0
- data/spec/parser_spec.rb +374 -0
- data/spec/redirections_spec.rb +20 -3
- data/spec/request_spec.rb +64 -0
- data/spec/url_spec.rb +74 -0
- metadata +18 -7
- data/lib/meta_inspector/scraper.rb +0 -283
- data/spec/metainspector_spec.rb +0 -547
data/spec/redirections_spec.rb
CHANGED
@@ -9,7 +9,7 @@ describe MetaInspector do
|
|
9
9
|
m = MetaInspector.new("http://facebook.com")
|
10
10
|
m.title.should be_nil
|
11
11
|
m.should_not be_ok
|
12
|
-
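m.errors.first.should == "Scraping exception: redirection forbidden: http://facebook.com/ -> https://www.facebook.com/"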
|
12
|
+
m.exceptions.first.message.should == "redirection forbidden: http://facebook.com/ -> https://www.facebook.com/"
|
13
13
|
end
|
14
14
|
|
15
15
|
it "allows safe redirections when :allow_redirections => :safe" do
|
@@ -30,14 +30,14 @@ describe MetaInspector do
|
|
30
30
|
m = MetaInspector.new("https://unsafe-facebook.com")
|
31
31
|
m.title.should be_nil
|
32
32
|
m.should_not be_ok
|
33
|
-
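m.errors.first.should == "Scraping exception: redirection forbidden: https://unsafe-facebook.com/ -> http://unsafe-facebook.com/"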
|
33
|
+
m.exceptions.first.message.should == "redirection forbidden: https://unsafe-facebook.com/ -> http://unsafe-facebook.com/"
|
34
34
|
end
|
35
35
|
|
36
36
|
it "disallows unsafe redirections when :allow_redirections => :safe" do
|
37
37
|
m = MetaInspector.new("https://unsafe-facebook.com", :allow_redirections => :safe)
|
38
38
|
m.title.should be_nil
|
39
39
|
m.should_not be_ok
|
40
|
-
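m.errors.first.should == "Scraping exception: redirection forbidden: https://unsafe-facebook.com/ -> http://unsafe-facebook.com/"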
|
40
|
+
m.exceptions.first.message.should == "redirection forbidden: https://unsafe-facebook.com/ -> http://unsafe-facebook.com/"
|
41
41
|
end
|
42
42
|
|
43
43
|
it "allows unsafe redirections when :allow_redirections => :all" do
|
@@ -46,5 +46,22 @@ describe MetaInspector do
|
|
46
46
|
m.should be_ok
|
47
47
|
end
|
48
48
|
end
|
49
|
+
|
50
|
+
describe "Redirections should update the base_uri" do
|
51
|
+
it "updates the base_uri on safe redirections" do
|
52
|
+
m = MetaInspector.new("http://facebook.com", :allow_redirections => :safe)
|
53
|
+
# Check for the title to make sure the request happens
|
54
|
+
m.title.should == "Hello From Facebook"
|
55
|
+
m.url.should == "https://www.facebook.com/"
|
56
|
+
end
|
57
|
+
|
58
|
+
it "updates the base_uri on all redirections" do
|
59
|
+
m = MetaInspector.new("http://facebook.com", :allow_redirections => :all)
|
60
|
+
# Check for the title to make sure the request happens
|
61
|
+
m.title.should == "Hello From Facebook"
|
62
|
+
|
63
|
+
m.url.should == "https://www.facebook.com/"
|
64
|
+
end
|
65
|
+
end
|
49
66
|
end
|
50
67
|
end
|
data/spec/request_spec.rb
ADDED
@@ -0,0 +1,64 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
require File.join(File.dirname(__FILE__), "/spec_helper")
|
4
|
+
|
5
|
+
describe MetaInspector::Request do
|
6
|
+
|
7
|
+
describe "read" do
|
8
|
+
it "should return the content of the page" do
|
9
|
+
page_request = MetaInspector::Request.new(url('http://pagerankalert.com'))
|
10
|
+
|
11
|
+
page_request.read[0..14].should == "<!DOCTYPE html>"
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
15
|
+
describe "content_type" do
|
16
|
+
it "should return the correct content type of the url for html pages" do
|
17
|
+
page_request = MetaInspector::Request.new(url('http://pagerankalert.com'))
|
18
|
+
|
19
|
+
page_request.content_type.should == "text/html"
|
20
|
+
end
|
21
|
+
|
22
|
+
it "should return the correct content type of the url for non html pages" do
|
23
|
+
image_request = MetaInspector::Request.new(url('http://pagerankalert.com/image.png'))
|
24
|
+
|
25
|
+
image_request.content_type.should == "image/png"
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
29
|
+
describe 'exception handling' do
|
30
|
+
before(:each) do
|
31
|
+
FakeWeb.allow_net_connect = true
|
32
|
+
end
|
33
|
+
|
34
|
+
after(:each) do
|
35
|
+
FakeWeb.allow_net_connect = false
|
36
|
+
end
|
37
|
+
|
38
|
+
it "should handle timeouts" do
|
39
|
+
impatient = MetaInspector::Request.new(url('http://example.com'), timeout: 0.0000000000001)
|
40
|
+
|
41
|
+
expect {
|
42
|
+
impatient.read.should be_nil
|
43
|
+
}.to change { impatient.exceptions.size }
|
44
|
+
|
45
|
+
impatient.exceptions.first.class.should == Timeout::Error
|
46
|
+
end
|
47
|
+
|
48
|
+
it "should handle socket errors" do
|
49
|
+
nowhere = MetaInspector::Request.new(url('http://caca232dsdsaer3sdsd-asd343.org'))
|
50
|
+
|
51
|
+
expect {
|
52
|
+
nowhere.read.should be_nil
|
53
|
+
}.to change { nowhere.exceptions.size }
|
54
|
+
|
55
|
+
nowhere.exceptions.first.class.should == SocketError
|
56
|
+
end
|
57
|
+
end
|
58
|
+
|
59
|
+
private
|
60
|
+
|
61
|
+
def url(initial_url)
|
62
|
+
MetaInspector::URL.new(initial_url)
|
63
|
+
end
|
64
|
+
end
|
data/spec/url_spec.rb
ADDED
@@ -0,0 +1,74 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
require File.join(File.dirname(__FILE__), "/spec_helper")
|
4
|
+
|
5
|
+
describe MetaInspector::URL do
|
6
|
+
it "should normalize URLs" do
|
7
|
+
MetaInspector::URL.new('http://example.com').url.should == 'http://example.com/'
|
8
|
+
end
|
9
|
+
|
10
|
+
it 'should accept an URL with a scheme' do
|
11
|
+
MetaInspector::URL.new('http://example.com/').url.should == 'http://example.com/'
|
12
|
+
end
|
13
|
+
|
14
|
+
it "should use http:// as a default scheme" do
|
15
|
+
MetaInspector::URL.new('example.com').url.should == 'http://example.com/'
|
16
|
+
end
|
17
|
+
|
18
|
+
it "should accept an URL with international characters" do
|
19
|
+
MetaInspector::URL.new('http://international.com/olé').url.should == 'http://international.com/ol%C3%A9'
|
20
|
+
end
|
21
|
+
|
22
|
+
it "should return the scheme" do
|
23
|
+
MetaInspector::URL.new('http://example.com').scheme.should == 'http'
|
24
|
+
MetaInspector::URL.new('https://example.com').scheme.should == 'https'
|
25
|
+
MetaInspector::URL.new('example.com').scheme.should == 'http'
|
26
|
+
end
|
27
|
+
|
28
|
+
it "should return the host" do
|
29
|
+
MetaInspector::URL.new('http://example.com').host.should == 'example.com'
|
30
|
+
MetaInspector::URL.new('https://example.com').host.should == 'example.com'
|
31
|
+
MetaInspector::URL.new('example.com').host.should == 'example.com'
|
32
|
+
end
|
33
|
+
|
34
|
+
it "should return the root url" do
|
35
|
+
MetaInspector::URL.new('http://example.com').root_url.should == 'http://example.com/'
|
36
|
+
MetaInspector::URL.new('https://example.com').root_url.should == 'https://example.com/'
|
37
|
+
MetaInspector::URL.new('example.com').root_url.should == 'http://example.com/'
|
38
|
+
MetaInspector::URL.new('http://example.com/faqs').root_url.should == 'http://example.com/'
|
39
|
+
end
|
40
|
+
|
41
|
+
describe "url=" do
|
42
|
+
it "should update the url" do
|
43
|
+
url = MetaInspector::URL.new('http://first.com/')
|
44
|
+
|
45
|
+
url.url = 'http://second.com/'
|
46
|
+
url.url.should == 'http://second.com/'
|
47
|
+
end
|
48
|
+
|
49
|
+
it "should add the missing scheme and normalize" do
|
50
|
+
url = MetaInspector::URL.new('http://first.com/')
|
51
|
+
|
52
|
+
url.url = 'second.com'
|
53
|
+
url.url.should == 'http://second.com/'
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
57
|
+
describe "exception handling" do
|
58
|
+
it "should handle URI::InvalidURIError" do
|
59
|
+
expect {
|
60
|
+
@malformed = MetaInspector::URL.new('javascript://')
|
61
|
+
}.to_not raise_error
|
62
|
+
|
63
|
+
@malformed.exceptions.first.class.should == URI::InvalidURIError
|
64
|
+
end
|
65
|
+
|
66
|
+
it "should handle URI::InvalidComponentError" do
|
67
|
+
expect {
|
68
|
+
@malformed = MetaInspector::URL.new('mailto:email(at)example.com')
|
69
|
+
}.to_not raise_error
|
70
|
+
|
71
|
+
@malformed.exceptions.first.class.should == URI::InvalidComponentError
|
72
|
+
end
|
73
|
+
end
|
74
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: metainspector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.16.1
|
4
|
+
version: 1.17.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jaime Iniesta
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-10-
|
11
|
+
date: 2013-10-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -138,12 +138,20 @@ files:
|
|
138
138
|
- README.md
|
139
139
|
- Rakefile
|
140
140
|
- lib/meta_inspector.rb
|
141
|
-
- lib/meta_inspector/scraper.rb
|
141
|
+
- lib/meta_inspector/deprecations.rb
|
142
|
+
- lib/meta_inspector/document.rb
|
143
|
+
- lib/meta_inspector/exception_log.rb
|
144
|
+
- lib/meta_inspector/exceptionable.rb
|
145
|
+
- lib/meta_inspector/parser.rb
|
146
|
+
- lib/meta_inspector/request.rb
|
147
|
+
- lib/meta_inspector/url.rb
|
142
148
|
- lib/meta_inspector/version.rb
|
143
149
|
- lib/metainspector.rb
|
144
150
|
- meta_inspector.gemspec
|
145
151
|
- samples/basic_scraping.rb
|
146
152
|
- samples/spider.rb
|
153
|
+
- spec/document_spec.rb
|
154
|
+
- spec/exception_log_spec.rb
|
147
155
|
- spec/fixtures/alazan.com.response
|
148
156
|
- spec/fixtures/alazan_websolution.response
|
149
157
|
- spec/fixtures/charset_000.response
|
@@ -171,9 +179,12 @@ files:
|
|
171
179
|
- spec/fixtures/unsafe_https.facebook.com.response
|
172
180
|
- spec/fixtures/wordpress_site.response
|
173
181
|
- spec/fixtures/youtube.response
|
174
|
-
- spec/metainspector_spec.rb
|
182
|
+
- spec/meta_inspector_spec.rb
|
183
|
+
- spec/parser_spec.rb
|
175
184
|
- spec/redirections_spec.rb
|
185
|
+
- spec/request_spec.rb
|
176
186
|
- spec/spec_helper.rb
|
187
|
+
- spec/url_spec.rb
|
177
188
|
homepage: http://jaimeiniesta.github.io/metainspector/
|
178
189
|
licenses: []
|
179
190
|
metadata: {}
|
@@ -183,17 +194,17 @@ require_paths:
|
|
183
194
|
- lib
|
184
195
|
required_ruby_version: !ruby/object:Gem::Requirement
|
185
196
|
requirements:
|
186
|
-
- - '>='
|
197
|
+
- - ! '>='
|
187
198
|
- !ruby/object:Gem::Version
|
188
199
|
version: '0'
|
189
200
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
190
201
|
requirements:
|
191
|
-
- - '>='
|
202
|
+
- - ! '>='
|
192
203
|
- !ruby/object:Gem::Version
|
193
204
|
version: '0'
|
194
205
|
requirements: []
|
195
206
|
rubyforge_project:
|
196
|
-
rubygems_version: 2.
|
207
|
+
rubygems_version: 2.0.5
|
197
208
|
signing_key:
|
198
209
|
specification_version: 4
|
199
210
|
summary: MetaInspector is a ruby gem for web scraping purposes, that returns a hash
|
@@ -1,283 +0,0 @@
|
|
1
|
-
# -*- encoding: utf-8 -*-
|
2
|
-
|
3
|
-
require 'open-uri'
|
4
|
-
require 'open_uri_redirections'
|
5
|
-
require 'addressable/uri'
|
6
|
-
require 'nokogiri'
|
7
|
-
require 'hashie/rash'
|
8
|
-
require 'timeout'
|
9
|
-
|
10
|
-
# MetaInspector provides an easy way to scrape web pages and get its elements
|
11
|
-
module MetaInspector
|
12
|
-
class Scraper
|
13
|
-
attr_reader :url, :scheme, :host, :root_url, :errors, :content_type, :timeout, :html_content_only
|
14
|
-
attr_reader :allow_redirections, :verbose
|
15
|
-
|
16
|
-
# Initializes a new instance of MetaInspector, setting the URL to the one given
|
17
|
-
# Options:
|
18
|
-
# => timeout: defaults to 20 seconds
|
19
|
-
# => html_content_type_only: if an exception should be raised if request content-type is not text/html. Defaults to false
|
20
|
-
# => allow_redirections: when :safe, allows HTTP => HTTPS redirections. When :all, it also allows HTTPS => HTTP
|
21
|
-
# => document: the html of the url as a string
|
22
|
-
# => verbose: if the errors should be logged to the screen
|
23
|
-
def initialize(url, options = {})
|
24
|
-
options = defaults.merge(options)
|
25
|
-
|
26
|
-
@url = with_default_scheme(normalize_url(url))
|
27
|
-
@scheme = URI.parse(@url).scheme
|
28
|
-
@host = URI.parse(@url).host
|
29
|
-
@root_url = "#{@scheme}://#{@host}/"
|
30
|
-
@timeout = options[:timeout]
|
31
|
-
@data = Hashie::Rash.new
|
32
|
-
@errors = []
|
33
|
-
@html_content_only = options[:html_content_only]
|
34
|
-
@allow_redirections = options[:allow_redirections]
|
35
|
-
@verbose = options[:verbose]
|
36
|
-
@document = options[:document]
|
37
|
-
end
|
38
|
-
|
39
|
-
# Returns the parsed document title, from the content of the <title> tag.
|
40
|
-
# This is not the same as the meta_title tag
|
41
|
-
def title
|
42
|
-
@title ||= parsed_document.css('title').inner_text rescue nil
|
43
|
-
end
|
44
|
-
|
45
|
-
# A description getter that first checks for a meta description and if not present will
|
46
|
-
# guess by looking at the first paragraph with more than 120 characters
|
47
|
-
def description
|
48
|
-
meta_description.nil? ? secondary_description : meta_description
|
49
|
-
end
|
50
|
-
|
51
|
-
# Links found on the page, as absolute URLs
|
52
|
-
def links
|
53
|
-
@links ||= parsed_links.map{ |l| absolutify_url(unrelativize_url(l)) }.compact.uniq
|
54
|
-
end
|
55
|
-
|
56
|
-
# Internal links found on the page, as absolute URLs
|
57
|
-
def internal_links
|
58
|
-
@internal_links ||= links.select {|link| host_from_url(link) == host }
|
59
|
-
end
|
60
|
-
|
61
|
-
# External links found on the page, as absolute URLs
|
62
|
-
def external_links
|
63
|
-
@external_links ||= links.select {|link| host_from_url(link) != host }
|
64
|
-
end
|
65
|
-
|
66
|
-
# Images found on the page, as absolute URLs
|
67
|
-
def images
|
68
|
-
@images ||= parsed_images.map{ |i| absolutify_url(i) }
|
69
|
-
end
|
70
|
-
|
71
|
-
# Returns the parsed image from Facebook's open graph property tags
|
72
|
-
# Most all major websites now define this property and is usually very relevant
|
73
|
-
# See doc at http://developers.facebook.com/docs/opengraph/
|
74
|
-
def image
|
75
|
-
meta_og_image || meta_twitter_image
|
76
|
-
end
|
77
|
-
|
78
|
-
# Returns the parsed document meta rss link
|
79
|
-
def feed
|
80
|
-
@feed ||= (parsed_feed('rss') || parsed_feed('atom'))
|
81
|
-
end
|
82
|
-
|
83
|
-
# Returns the charset from the meta tags, looking for it in the following order:
|
84
|
-
# <meta charset='utf-8' />
|
85
|
-
# <meta http-equiv="Content-Type" content="text/html; charset=windows-1252" />
|
86
|
-
def charset
|
87
|
-
@charset ||= (charset_from_meta_charset || charset_from_content_type)
|
88
|
-
end
|
89
|
-
|
90
|
-
# Returns all parsed data as a nested Hash
|
91
|
-
def to_hash
|
92
|
-
scrape_meta_data
|
93
|
-
|
94
|
-
{
|
95
|
-
'url' => url,
|
96
|
-
'title' => title,
|
97
|
-
'links' => links,
|
98
|
-
'internal_links' => internal_links,
|
99
|
-
'external_links' => external_links,
|
100
|
-
'images' => images,
|
101
|
-
'charset' => charset,
|
102
|
-
'feed' => feed,
|
103
|
-
'content_type' => content_type
|
104
|
-
}.merge @data.to_hash
|
105
|
-
end
|
106
|
-
|
107
|
-
# Returns the whole parsed document
|
108
|
-
def parsed_document
|
109
|
-
@parsed_document ||= Nokogiri::HTML(document)
|
110
|
-
rescue Exception => e
|
111
|
-
add_fatal_error "Parsing exception: #{e.message}"
|
112
|
-
end
|
113
|
-
|
114
|
-
# Returns the original, unparsed document
|
115
|
-
def document
|
116
|
-
@document ||= if html_content_only && content_type != "text/html"
|
117
|
-
raise "The url provided contains #{content_type} content instead of text/html content" and nil
|
118
|
-
else
|
119
|
-
request.read
|
120
|
-
end
|
121
|
-
rescue Exception => e
|
122
|
-
add_fatal_error "Scraping exception: #{e.message}"
|
123
|
-
end
|
124
|
-
|
125
|
-
# Returns the content_type of the fetched document
|
126
|
-
def content_type
|
127
|
-
@content_type ||= request.content_type
|
128
|
-
end
|
129
|
-
|
130
|
-
# Returns true if there are no errors
|
131
|
-
def ok?
|
132
|
-
errors.empty?
|
133
|
-
end
|
134
|
-
|
135
|
-
private
|
136
|
-
|
137
|
-
def defaults
|
138
|
-
{
|
139
|
-
:timeout => 20,
|
140
|
-
:html_content_only => false,
|
141
|
-
:verbose => false
|
142
|
-
}
|
143
|
-
end
|
144
|
-
|
145
|
-
# Scrapers for all meta_tags in the form of "meta_name" are automatically defined. This has been tested for
|
146
|
-
# meta name: keywords, description, robots, generator
|
147
|
-
# meta http-equiv: content-language, Content-Type
|
148
|
-
#
|
149
|
-
# It will first try with meta name="..." and if nothing found,
|
150
|
-
# with meta http-equiv="...", substituting "_" by "-"
|
151
|
-
# TODO: define respond_to? to return true on the meta_name methods
|
152
|
-
def method_missing(method_name)
|
153
|
-
if method_name.to_s =~ /^meta_(.*)/
|
154
|
-
key = $1
|
155
|
-
|
156
|
-
#special treatment for opengraph (og:) and twitter card (twitter:) tags
|
157
|
-
key.gsub!("_",":") if key =~ /^og_(.*)/ || key =~ /^twitter_(.*)/
|
158
|
-
|
159
|
-
scrape_meta_data
|
160
|
-
|
161
|
-
@data.meta.name && (@data.meta.name[key.downcase]) || (@data.meta.property && @data.meta.property[key.downcase])
|
162
|
-
else
|
163
|
-
super
|
164
|
-
end
|
165
|
-
end
|
166
|
-
|
167
|
-
# Makes the request to the server
|
168
|
-
def request
|
169
|
-
Timeout::timeout(timeout) { @request ||= open(url, {:allow_redirections => allow_redirections}) }
|
170
|
-
|
171
|
-
rescue TimeoutError
|
172
|
-
add_fatal_error 'Timeout!!!'
|
173
|
-
rescue SocketError
|
174
|
-
add_fatal_error 'Socket error: The url provided does not exist or is temporarily unavailable'
|
175
|
-
rescue Exception => e
|
176
|
-
add_fatal_error "Scraping exception: #{e.message}"
|
177
|
-
end
|
178
|
-
|
179
|
-
# Scrapes all meta tags found
|
180
|
-
def scrape_meta_data
|
181
|
-
unless @data.meta
|
182
|
-
@data.meta!.name!
|
183
|
-
@data.meta!.property!
|
184
|
-
parsed_document.xpath("//meta").each do |element|
|
185
|
-
get_meta_name_or_property(element)
|
186
|
-
end
|
187
|
-
end
|
188
|
-
end
|
189
|
-
|
190
|
-
# Store meta tag value, looking at meta name or meta property
|
191
|
-
def get_meta_name_or_property(element)
|
192
|
-
name_or_property = element.attributes["name"] ? "name" : (element.attributes["property"] ? "property" : nil)
|
193
|
-
content_or_value = element.attributes["content"] ? "content" : (element.attributes["value"] ? "value" : nil)
|
194
|
-
|
195
|
-
if !name_or_property.nil? && !content_or_value.nil?
|
196
|
-
@data.meta.name[element.attributes[name_or_property].value.downcase] = element.attributes[content_or_value].value
|
197
|
-
end
|
198
|
-
end
|
199
|
-
|
200
|
-
def parsed_feed(format)
|
201
|
-
feed = parsed_document.search("//link[@type='application/#{format}+xml']").first
|
202
|
-
feed ? absolutify_url(feed.attributes['href'].value) : nil
|
203
|
-
end
|
204
|
-
|
205
|
-
def parsed_links
|
206
|
-
@parsed_links ||= cleanup_nokogiri_values(parsed_document.search("//a/@href"))
|
207
|
-
end
|
208
|
-
|
209
|
-
def parsed_images
|
210
|
-
@parsed_images ||= cleanup_nokogiri_values(parsed_document.search('//img/@src'))
|
211
|
-
end
|
212
|
-
|
213
|
-
# Takes a nokogiri search result, strips the values, rejects the empty ones, and removes duplicates
|
214
|
-
def cleanup_nokogiri_values(results)
|
215
|
-
results.map { |a| a.value.strip }.reject { |s| s.empty? }.uniq
|
216
|
-
end
|
217
|
-
|
218
|
-
# Stores the error for later inspection
|
219
|
-
def add_fatal_error(error)
|
220
|
-
warn error if verbose
|
221
|
-
@errors << error
|
222
|
-
end
|
223
|
-
|
224
|
-
# Normalize url to deal with characters that should be encodes, add trailing slash, convert to downcase...
|
225
|
-
def normalize_url(url)
|
226
|
-
Addressable::URI.parse(url).normalize.to_s
|
227
|
-
end
|
228
|
-
|
229
|
-
# Adds 'http' as default scheme, if there if none
|
230
|
-
def with_default_scheme(url)
|
231
|
-
URI.parse(url).scheme.nil? ? 'http://' + url : url
|
232
|
-
end
|
233
|
-
|
234
|
-
# Convert a relative url like "/users" to an absolute one like "http://example.com/users"
|
235
|
-
# Respecting already absolute URLs like the ones starting with http:, ftp:, telnet:, mailto:, javascript: ...
|
236
|
-
def absolutify_url(uri)
|
237
|
-
if uri =~ /^\w*\:/i
|
238
|
-
normalize_url(uri)
|
239
|
-
else
|
240
|
-
Addressable::URI.join(base_url, uri).normalize.to_s
|
241
|
-
end
|
242
|
-
rescue URI::InvalidURIError, Addressable::URI::InvalidURIError => e
|
243
|
-
add_fatal_error "Link parsing exception: #{e.message}" and nil
|
244
|
-
end
|
245
|
-
|
246
|
-
# Returns the base url to absolutify relative links. This can be the one set on a <base> tag,
|
247
|
-
# or the url of the document if no <base> tag was found.
|
248
|
-
def base_url
|
249
|
-
base_href || @url
|
250
|
-
end
|
251
|
-
|
252
|
-
# Returns the value of the href attribute on the <base /> tag, if it exists
|
253
|
-
def base_href
|
254
|
-
parsed_document.search('base').first.attributes['href'].value rescue nil
|
255
|
-
end
|
256
|
-
|
257
|
-
# Convert a protocol-relative url to its full form, depending on the scheme of the page that contains it
|
258
|
-
def unrelativize_url(url)
|
259
|
-
url =~ /^\/\// ? "#{scheme}://#{url[2..-1]}" : url
|
260
|
-
end
|
261
|
-
|
262
|
-
# Extracts the host from a given URL
|
263
|
-
def host_from_url(url)
|
264
|
-
URI.parse(url).host
|
265
|
-
rescue URI::InvalidURIError, URI::InvalidComponentError, Addressable::URI::InvalidURIError => e
|
266
|
-
add_fatal_error "Link parsing exception: #{e.message}" and nil
|
267
|
-
end
|
268
|
-
|
269
|
-
# Look for the first <p> block with 120 characters or more
|
270
|
-
def secondary_description
|
271
|
-
first_long_paragraph = parsed_document.search('//p[string-length() >= 120]').first
|
272
|
-
first_long_paragraph ? first_long_paragraph.text : ''
|
273
|
-
end
|
274
|
-
|
275
|
-
def charset_from_meta_charset
|
276
|
-
parsed_document.css("meta[charset]")[0].attributes['charset'].value rescue nil
|
277
|
-
end
|
278
|
-
|
279
|
-
def charset_from_content_type
|
280
|
-
parsed_document.css("meta[http-equiv='Content-Type']")[0].attributes['content'].value.split(";")[1].split("=")[1] rescue nil
|
281
|
-
end
|
282
|
-
end
|
283
|
-
end
|