content_urls 0.1.6 → 0.1.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +8 -8
- data/README.rdoc +2 -0
- data/content_urls.gemspec +2 -2
- data/lib/content_urls/parsers/html_parser.rb +0 -2
- data/lib/content_urls/version.rb +1 -1
- data/lib/content_urls.rb +49 -3
- data/spec/content_urls_spec.rb +54 -0
- data/spec/html_parser_spec.rb +20 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
MTEwNGE1ZjIwZTdjNDA1ZGJjODM0N2Q4ZWEyNmRlMGFiMDVjOTc4Yg==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
NzVlMDU0ZDRlMDBiMTNkZDBmYzg5YTJiYjEwNjc1M2U4ZmQ1MDQ2Yg==
|
7
7
|
!binary "U0hBNTEy":
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
NWEzNGVmZjFlOWVhZDdjOTNlN2IyNjgzNjA0OThhNWMwNzRjNDNhNjc4NTZi
|
10
|
+
NjI0N2NjYzUwYTRkNjYzNmEzM2RiMmI3ZDZkM2NiYWUxZGNmNzc1NzU0ZjBh
|
11
|
+
ZDY5ZTAxMjUzMWM3YTZiOTg3ZWVkMTE4MDRhYzY3MjI5ZTk3ZDA=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
MTczNGZhNDE1MDZhZWJkZTEwNjg1NDdlZDFlMzVjODRiMzg4NjE3ZTc0ZWI5
|
14
|
+
NDdkNWQ2OWFhNmU3ODI0ZmM4NmEzNGM2MzIxZmVkODRmOGZiYWJlZTJkNDhl
|
15
|
+
NjgzNDZjMzVhY2QwMDQ3MjIxYzk3OGZlOWVhNmFlZGM4ZGE5YmI=
|
data/README.rdoc
CHANGED
@@ -27,6 +27,8 @@ ContentUrls was developed to address two use cases:
|
|
27
27
|
* url() notation
|
28
28
|
* JavaScript content
|
29
29
|
* URI module's REGEXP
|
30
|
+
* Can convert relative URLs to absolute URLs by providing resource URL
|
31
|
+
* Can convert relative URLs to absolute URLs when base URL found in HTML content
|
30
32
|
|
31
33
|
== Examples
|
32
34
|
=== Find URLs in an HTML document
|
data/content_urls.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "content_urls"
|
8
|
-
s.version = "0.1.6"
|
8
|
+
s.version = "0.1.7"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Dennis Sutch"]
|
12
|
-
s.date = "2013-07-
|
12
|
+
s.date = "2013-07-18"
|
13
13
|
s.description = "Parses various file types (HTML, CSS, JavaScript, ...) for URLs and provides methods for iterating through URLs and changing URLs."
|
14
14
|
s.email = "dennis@sutch.com"
|
15
15
|
s.extra_rdoc_files = [
|
@@ -62,8 +62,6 @@ class ContentUrls
|
|
62
62
|
# - should href URL be changed?
|
63
63
|
# - should relative URLs be modified using base?
|
64
64
|
# - how should rewritten relative URLs be handled?
|
65
|
-
base = doc.search('//head/base/@href') # base URI for resolving relative URIs
|
66
|
-
base = nil if base && base.to_s.strip.empty?
|
67
65
|
|
68
66
|
@@parser_definition.each do |type, definition|
|
69
67
|
doc.search(definition[:xpath]).each do |obj|
|
data/lib/content_urls/version.rb
CHANGED
data/lib/content_urls.rb
CHANGED
@@ -9,6 +9,9 @@ class ContentUrls
|
|
9
9
|
#
|
10
10
|
# @param [String] content the content.
|
11
11
|
# @param [String] type the media type of the content.
|
12
|
+
# @param [Hash] opts the options for manipulating returned URLs
|
13
|
+
# @option opts [Boolean] :use_base_url (false) if a base URL is found in the content, this option indicates whether the base URL will be used to change each relative URL to an absolute URL (note: the base URL is ignored if determined to be relative)
|
14
|
+
# @option opts [String] :content_url the URL from which content was retrieved; will be used to change each relative URL to an absolute URL (note: :use_base_url option takes precedence over :content_url option; content URL will be ignored if determined to be relative)
|
12
15
|
# @return [Array] the unique URLs found in the content.
|
13
16
|
#
|
14
17
|
# @example Parse HTML code for URLs
|
@@ -18,7 +21,14 @@ class ContentUrls
|
|
18
21
|
# end
|
19
22
|
# # => "Found URL: index.html"
|
20
23
|
#
|
21
|
-
# @example Parse
|
24
|
+
# @example Parse HTML code for URLs, changing each to an absolute URL based on the address of the original resource
|
25
|
+
# content = '<html><a href="index.html">Home</a></html>'
|
26
|
+
# ContentUrls.urls(content, 'text/html', content_url: 'http://www.example.com/sample.html').each do |url|
|
27
|
+
# puts "Found URL: #{url}"
|
28
|
+
# end
|
29
|
+
# # => "Found URL: http://www.example.com/index.html"
|
30
|
+
#
|
31
|
+
# @example Parse content obtained from a robot
|
22
32
|
# response = Net::HTTP.get_response(URI('http://example.com/sample-1'))
|
23
33
|
# puts "URLs found at http://example.com/sample-1:"
|
24
34
|
# ContentUrls.urls(response.body, response.content_type).each do |url|
|
@@ -26,14 +36,50 @@ class ContentUrls
|
|
26
36
|
# end
|
27
37
|
# # => [a list of URLs found in the content located at http://example.com/sample-1]
|
28
38
|
#
|
29
|
-
def self.urls(content, type)
|
39
|
+
def self.urls(content, type, options = {})
|
40
|
+
options = {
|
41
|
+
:use_base_url => false,
|
42
|
+
:content_url => nil,
|
43
|
+
}.merge(options)
|
30
44
|
urls = []
|
31
45
|
if (parser = get_parser(type))
|
32
|
-
|
46
|
+
base = base_url(content, type) if options[:use_base_url]
|
47
|
+
base = '' if URI(base || '').relative?
|
48
|
+
if options[:content_url]
|
49
|
+
content_url = URI(options[:content_url]) rescue ''
|
50
|
+
content_url = '' if URI(content_url).relative?
|
51
|
+
base = URI.join(content_url, base)
|
52
|
+
end
|
53
|
+
if URI(base).relative?
|
54
|
+
parser.urls(content).each { |url| urls << url }
|
55
|
+
else
|
56
|
+
parser.urls(content).each { |url| urls << URI.join( base, url).to_s }
|
57
|
+
end
|
33
58
|
end
|
34
59
|
urls
|
35
60
|
end
|
36
61
|
|
62
|
+
# Returns base URL found in the content, if available.
|
63
|
+
#
|
64
|
+
# @param [String] content the content.
|
65
|
+
# @param [String] type the media type of the content.
|
66
|
+
# @return [String] the base URL found in the content.
|
67
|
+
#
|
68
|
+
# @example Parse HTML code for base URL
|
69
|
+
# content = '<html><head><base href="/home/">'
|
70
|
+
# puts "Found base URL: #{ContentUrls.base_url(content, 'text/html')}"
|
71
|
+
# # => "Found base URL: /home/"
|
72
|
+
#
|
73
|
+
def self.base_url(content, type)
|
74
|
+
base = nil
|
75
|
+
if (parser = get_parser(type))
|
76
|
+
if (parser.respond_to?(:base))
|
77
|
+
base = parser.base(content)
|
78
|
+
end
|
79
|
+
end
|
80
|
+
base
|
81
|
+
end
|
82
|
+
|
37
83
|
# Rewrites each URL in the content by calling the supplied block with each URL.
|
38
84
|
#
|
39
85
|
# @param [String] content the HTML content.
|
data/spec/content_urls_spec.rb
CHANGED
@@ -27,3 +27,57 @@ describe ContentUrls.register_parser('some_parser_class', %r{^(content/test)\b})
|
|
27
27
|
ContentUrls.get_parser('content/test').should eq 'some_parser_class'
|
28
28
|
end
|
29
29
|
end
|
30
|
+
|
31
|
+
describe ContentUrls do
|
32
|
+
it "should return relative URLs as absolute when requested" do
|
33
|
+
|
34
|
+
html_base_sample =<<BASE_SAMPLE
|
35
|
+
<html>
|
36
|
+
<head>
|
37
|
+
<base href='http://www.example.com/sample/'>
|
38
|
+
<title>HTML base Sample</title>
|
39
|
+
</head>
|
40
|
+
<body>
|
41
|
+
<h1>HTML base Sample</h1>
|
42
|
+
<a href='about.html'>about</a>
|
43
|
+
</body>
|
44
|
+
</html>
|
45
|
+
BASE_SAMPLE
|
46
|
+
|
47
|
+
urls = ContentUrls.urls(html_base_sample, 'text/html', use_base_url: true)
|
48
|
+
urls[0].should eq 'http://www.example.com/sample/about.html'
|
49
|
+
|
50
|
+
urls = ContentUrls.urls(html_base_sample, 'text/html', content_url: 'https://www2.example.com/test/index.html')
|
51
|
+
urls[0].should eq 'https://www2.example.com/test/about.html'
|
52
|
+
|
53
|
+
urls = ContentUrls.urls(html_base_sample, 'text/html', use_base_url: true, content_url: 'https://www2.example.com/test/index.html')
|
54
|
+
urls[0].should eq 'http://www.example.com/sample/about.html'
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
describe ContentUrls do
|
59
|
+
it "should not change absolute URLs when requested to make absolute URLs from relative URLs" do
|
60
|
+
|
61
|
+
html_base_sample =<<BASE_SAMPLE
|
62
|
+
<html>
|
63
|
+
<head>
|
64
|
+
<base href='http://www2.example.com/sample/'>
|
65
|
+
<title>HTML base Sample</title>
|
66
|
+
</head>
|
67
|
+
<body>
|
68
|
+
<h1>HTML base Sample</h1>
|
69
|
+
<a href='http://www.example.com/about.html'>about</a>
|
70
|
+
</body>
|
71
|
+
</html>
|
72
|
+
BASE_SAMPLE
|
73
|
+
|
74
|
+
urls = ContentUrls.urls(html_base_sample, 'text/html', use_base_url: true)
|
75
|
+
urls[0].should eq 'http://www.example.com/about.html'
|
76
|
+
|
77
|
+
urls = ContentUrls.urls(html_base_sample, 'text/html', content_url: 'https://www2.example.com/test/index.html')
|
78
|
+
urls[0].should eq 'http://www.example.com/about.html'
|
79
|
+
|
80
|
+
urls = ContentUrls.urls(html_base_sample, 'text/html', use_base_url: true, content_url: 'https://www2.example.com/test/index.html')
|
81
|
+
urls[0].should eq 'http://www.example.com/about.html'
|
82
|
+
end
|
83
|
+
end
|
data/spec/html_parser_spec.rb
CHANGED
@@ -334,13 +334,32 @@ SAMPLE_13
|
|
334
334
|
end
|
335
335
|
end
|
336
336
|
|
337
|
+
describe ContentUrls::HtmlParser do
|
338
|
+
it "should parse the HTML and return nil when no 'base' URL" do
|
339
|
+
|
340
|
+
html_missing_base_sample =<<MISSING_BASE_SAMPLE
|
341
|
+
<html>
|
342
|
+
<head>
|
343
|
+
<title>HTML no base Sample</title>
|
344
|
+
</head>
|
345
|
+
<body>
|
346
|
+
<h1>HTML no base Sample</h1>
|
347
|
+
</body>
|
348
|
+
</html>
|
349
|
+
MISSING_BASE_SAMPLE
|
350
|
+
|
351
|
+
url = ContentUrls::HtmlParser.base(html_missing_base_sample)
|
352
|
+
url.should eq nil
|
353
|
+
end
|
354
|
+
end
|
355
|
+
|
337
356
|
describe ContentUrls::HtmlParser do
|
338
357
|
it "should parse the HTML and return the 'base' URL and no other URLs" do
|
339
358
|
|
340
359
|
html_base_sample =<<BASE_SAMPLE
|
341
360
|
<html>
|
342
361
|
<head>
|
343
|
-
<base href='/en/'
|
362
|
+
<base href='/en/'>
|
344
363
|
<title>HTML base Sample</title>
|
345
364
|
</head>
|
346
365
|
<body>
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: content_urls
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.6
|
4
|
+
version: 0.1.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dennis Sutch
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-07-
|
11
|
+
date: 2013-07-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|