content_urls 0.1.6 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/README.rdoc +2 -0
- data/content_urls.gemspec +2 -2
- data/lib/content_urls/parsers/html_parser.rb +0 -2
- data/lib/content_urls/version.rb +1 -1
- data/lib/content_urls.rb +49 -3
- data/spec/content_urls_spec.rb +54 -0
- data/spec/html_parser_spec.rb +20 -1
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,15 +1,15 @@
|
|
|
1
1
|
---
|
|
2
2
|
!binary "U0hBMQ==":
|
|
3
3
|
metadata.gz: !binary |-
|
|
4
|
-
|
|
4
|
+
MTEwNGE1ZjIwZTdjNDA1ZGJjODM0N2Q4ZWEyNmRlMGFiMDVjOTc4Yg==
|
|
5
5
|
data.tar.gz: !binary |-
|
|
6
|
-
|
|
6
|
+
NzVlMDU0ZDRlMDBiMTNkZDBmYzg5YTJiYjEwNjc1M2U4ZmQ1MDQ2Yg==
|
|
7
7
|
!binary "U0hBNTEy":
|
|
8
8
|
metadata.gz: !binary |-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
9
|
+
NWEzNGVmZjFlOWVhZDdjOTNlN2IyNjgzNjA0OThhNWMwNzRjNDNhNjc4NTZi
|
|
10
|
+
NjI0N2NjYzUwYTRkNjYzNmEzM2RiMmI3ZDZkM2NiYWUxZGNmNzc1NzU0ZjBh
|
|
11
|
+
ZDY5ZTAxMjUzMWM3YTZiOTg3ZWVkMTE4MDRhYzY3MjI5ZTk3ZDA=
|
|
12
12
|
data.tar.gz: !binary |-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
13
|
+
MTczNGZhNDE1MDZhZWJkZTEwNjg1NDdlZDFlMzVjODRiMzg4NjE3ZTc0ZWI5
|
|
14
|
+
NDdkNWQ2OWFhNmU3ODI0ZmM4NmEzNGM2MzIxZmVkODRmOGZiYWJlZTJkNDhl
|
|
15
|
+
NjgzNDZjMzVhY2QwMDQ3MjIxYzk3OGZlOWVhNmFlZGM4ZGE5YmI=
|
data/README.rdoc
CHANGED
|
@@ -27,6 +27,8 @@ ContentUrls was developed to address two use cases:
|
|
|
27
27
|
* url() notation
|
|
28
28
|
* JavaScript content
|
|
29
29
|
* URI module's REGEXP
|
|
30
|
+
* Can convert relative URLs to absolute URLs by providing resource URL
|
|
31
|
+
* Can convert relative URLs to absolute URLs when base URL found in HTML content
|
|
30
32
|
|
|
31
33
|
== Examples
|
|
32
34
|
=== Find URLs in an HTML document
|
data/content_urls.gemspec
CHANGED
|
@@ -5,11 +5,11 @@
|
|
|
5
5
|
|
|
6
6
|
Gem::Specification.new do |s|
|
|
7
7
|
s.name = "content_urls"
|
|
8
|
-
s.version = "0.1.6"
|
|
8
|
+
s.version = "0.1.7"
|
|
9
9
|
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
|
11
11
|
s.authors = ["Dennis Sutch"]
|
|
12
|
-
s.date = "2013-07-
|
|
12
|
+
s.date = "2013-07-18"
|
|
13
13
|
s.description = "Parses various file types (HTML, CSS, JavaScript, ...) for URLs and provides methods for iterating through URLs and changing URLs."
|
|
14
14
|
s.email = "dennis@sutch.com"
|
|
15
15
|
s.extra_rdoc_files = [
|
|
@@ -62,8 +62,6 @@ class ContentUrls
|
|
|
62
62
|
# - should href URL be changed?
|
|
63
63
|
# - should relative URLs be modified using base?
|
|
64
64
|
# - how should rewritten relative URLs be handled?
|
|
65
|
-
base = doc.search('//head/base/@href') # base URI for resolving relative URIs
|
|
66
|
-
base = nil if base && base.to_s.strip.empty?
|
|
67
65
|
|
|
68
66
|
@@parser_definition.each do |type, definition|
|
|
69
67
|
doc.search(definition[:xpath]).each do |obj|
|
data/lib/content_urls/version.rb
CHANGED
data/lib/content_urls.rb
CHANGED
|
@@ -9,6 +9,9 @@ class ContentUrls
|
|
|
9
9
|
#
|
|
10
10
|
# @param [String] content the content.
|
|
11
11
|
# @param [String] type the media type of the content.
|
|
12
|
+
# @param [Hash] opts the options for manipulating returned URLs
|
|
13
|
+
# @option opts [String] :use_base_url (false) if base URL is found in content, this option indicates whether base URL will be used to change each relative URL to an absolute URL (note: base URL ignored if determined to be relative)
|
|
14
|
+
# @option opts [String] :content_url the URL from which content was retrieved; will be used to change each relative URL to an absolute URL (note: :use_base_url option takes precedence over :content_url option; content URL will be ignored if determined to be relative)
|
|
12
15
|
# @return [Array] the unique URLs found in the content.
|
|
13
16
|
#
|
|
14
17
|
# @example Parse HTML code for URLs
|
|
@@ -18,7 +21,14 @@ class ContentUrls
|
|
|
18
21
|
# end
|
|
19
22
|
# # => "Found URL: index.html"
|
|
20
23
|
#
|
|
21
|
-
# @example Parse
|
|
24
|
+
# @example Parse HTML code for URLs, changing each to an absolute URL based on the address of the original resource
|
|
25
|
+
# content = '<html><a href="index.html">Home</a></html>'
|
|
26
|
+
# ContentUrls.urls(content, 'text/html', content_url: 'http://www.example.com/sample.html').each do |url|
|
|
27
|
+
# puts "Found URL: #{url}"
|
|
28
|
+
# end
|
|
29
|
+
# # => "Found URL: http://www.example.com/index.html"
|
|
30
|
+
#
|
|
31
|
+
# @example Parse content obtained from a robot
|
|
22
32
|
# response = Net::HTTP.get_response(URI('http://example.com/sample-1'))
|
|
23
33
|
# puts "URLs found at http://example.com/sample-1:"
|
|
24
34
|
# ContentUrls.urls(response.body, response.content_type).each do |url|
|
|
@@ -26,14 +36,50 @@ class ContentUrls
|
|
|
26
36
|
# end
|
|
27
37
|
# # => [a list of URLs found in the content located at http://example.com/sample-1]
|
|
28
38
|
#
|
|
29
|
-
def self.urls(content, type)
|
|
39
|
+
def self.urls(content, type, options = {})
|
|
40
|
+
options = {
|
|
41
|
+
:use_base_url => false,
|
|
42
|
+
:content_url => nil,
|
|
43
|
+
}.merge(options)
|
|
30
44
|
urls = []
|
|
31
45
|
if (parser = get_parser(type))
|
|
32
|
-
|
|
46
|
+
base = base_url(content, type) if options[:use_base_url]
|
|
47
|
+
base = '' if URI(base || '').relative?
|
|
48
|
+
if options[:content_url]
|
|
49
|
+
content_url = URI(options[:content_url]) rescue ''
|
|
50
|
+
content_url = '' if URI(content_url).relative?
|
|
51
|
+
base = URI.join(content_url, base)
|
|
52
|
+
end
|
|
53
|
+
if URI(base).relative?
|
|
54
|
+
parser.urls(content).each { |url| urls << url }
|
|
55
|
+
else
|
|
56
|
+
parser.urls(content).each { |url| urls << URI.join( base, url).to_s }
|
|
57
|
+
end
|
|
33
58
|
end
|
|
34
59
|
urls
|
|
35
60
|
end
|
|
36
61
|
|
|
62
|
+
# Returns base URL found in the content, if available.
|
|
63
|
+
#
|
|
64
|
+
# @param [String] content the content.
|
|
65
|
+
# @param [String] type the media type of the content.
|
|
66
|
+
# @return [String] the base URL found in the content.
|
|
67
|
+
#
|
|
68
|
+
# @example Parse HTML code for base URL
|
|
69
|
+
# content = '<html><head><base href="/home/">'
|
|
70
|
+
# puts "Found base URL: #{ContentUrls.base_url(content, 'text/html')}"
|
|
71
|
+
# # => "Found base URL: /home/"
|
|
72
|
+
#
|
|
73
|
+
def self.base_url(content, type)
|
|
74
|
+
base = nil
|
|
75
|
+
if (parser = get_parser(type))
|
|
76
|
+
if (parser.respond_to?(:base))
|
|
77
|
+
base = parser.base(content)
|
|
78
|
+
end
|
|
79
|
+
end
|
|
80
|
+
base
|
|
81
|
+
end
|
|
82
|
+
|
|
37
83
|
# Rewrites each URL in the content by calling the supplied block with each URL.
|
|
38
84
|
#
|
|
39
85
|
# @param [String] content the HTML content.
|
data/spec/content_urls_spec.rb
CHANGED
|
@@ -27,3 +27,57 @@ describe ContentUrls.register_parser('some_parser_class', %r{^(content/test)\b})
|
|
|
27
27
|
ContentUrls.get_parser('content/test').should eq 'some_parser_class'
|
|
28
28
|
end
|
|
29
29
|
end
|
|
30
|
+
|
|
31
|
+
describe ContentUrls do
|
|
32
|
+
it "should return relative URLs as absolute when requested" do
|
|
33
|
+
|
|
34
|
+
html_base_sample =<<BASE_SAMPLE
|
|
35
|
+
<html>
|
|
36
|
+
<head>
|
|
37
|
+
<base href='http://www.example.com/sample/'>
|
|
38
|
+
<title>HTML base Sample</title>
|
|
39
|
+
</head>
|
|
40
|
+
<body>
|
|
41
|
+
<h1>HTML base Sample</h1>
|
|
42
|
+
<a href='about.html'>about</a>
|
|
43
|
+
</body>
|
|
44
|
+
</html>
|
|
45
|
+
BASE_SAMPLE
|
|
46
|
+
|
|
47
|
+
urls = ContentUrls.urls(html_base_sample, 'text/html', use_base_url: true)
|
|
48
|
+
urls[0].should eq 'http://www.example.com/sample/about.html'
|
|
49
|
+
|
|
50
|
+
urls = ContentUrls.urls(html_base_sample, 'text/html', content_url: 'https://www2.example.com/test/index.html')
|
|
51
|
+
urls[0].should eq 'https://www2.example.com/test/about.html'
|
|
52
|
+
|
|
53
|
+
urls = ContentUrls.urls(html_base_sample, 'text/html', use_base_url: true, content_url: 'https://www2.example.com/test/index.html')
|
|
54
|
+
urls[0].should eq 'http://www.example.com/sample/about.html'
|
|
55
|
+
end
|
|
56
|
+
end
|
|
57
|
+
|
|
58
|
+
describe ContentUrls do
|
|
59
|
+
it "should not change absolute URLs when requested to make absolute URLs from relative URLs" do
|
|
60
|
+
|
|
61
|
+
html_base_sample =<<BASE_SAMPLE
|
|
62
|
+
<html>
|
|
63
|
+
<head>
|
|
64
|
+
<base href='http://www2.example.com/sample/'>
|
|
65
|
+
<title>HTML base Sample</title>
|
|
66
|
+
</head>
|
|
67
|
+
<body>
|
|
68
|
+
<h1>HTML base Sample</h1>
|
|
69
|
+
<a href='http://www.example.com/about.html'>about</a>
|
|
70
|
+
</body>
|
|
71
|
+
</html>
|
|
72
|
+
BASE_SAMPLE
|
|
73
|
+
|
|
74
|
+
urls = ContentUrls.urls(html_base_sample, 'text/html', use_base_url: true)
|
|
75
|
+
urls[0].should eq 'http://www.example.com/about.html'
|
|
76
|
+
|
|
77
|
+
urls = ContentUrls.urls(html_base_sample, 'text/html', content_url: 'https://www2.example.com/test/index.html')
|
|
78
|
+
urls[0].should eq 'http://www.example.com/about.html'
|
|
79
|
+
|
|
80
|
+
urls = ContentUrls.urls(html_base_sample, 'text/html', use_base_url: true, content_url: 'https://www2.example.com/test/index.html')
|
|
81
|
+
urls[0].should eq 'http://www.example.com/about.html'
|
|
82
|
+
end
|
|
83
|
+
end
|
data/spec/html_parser_spec.rb
CHANGED
|
@@ -334,13 +334,32 @@ SAMPLE_13
|
|
|
334
334
|
end
|
|
335
335
|
end
|
|
336
336
|
|
|
337
|
+
describe ContentUrls::HtmlParser do
|
|
338
|
+
it "should parse the HTML and return nil when no 'base' URL" do
|
|
339
|
+
|
|
340
|
+
html_missing_base_sample =<<MISSING_BASE_SAMPLE
|
|
341
|
+
<html>
|
|
342
|
+
<head>
|
|
343
|
+
<title>HTML no base Sample</title>
|
|
344
|
+
</head>
|
|
345
|
+
<body>
|
|
346
|
+
<h1>HTML no base Sample</h1>
|
|
347
|
+
</body>
|
|
348
|
+
</html>
|
|
349
|
+
MISSING_BASE_SAMPLE
|
|
350
|
+
|
|
351
|
+
url = ContentUrls::HtmlParser.base(html_missing_base_sample)
|
|
352
|
+
url.should eq nil
|
|
353
|
+
end
|
|
354
|
+
end
|
|
355
|
+
|
|
337
356
|
describe ContentUrls::HtmlParser do
|
|
338
357
|
it "should parse the HTML and return the 'base' URL and no other URLs" do
|
|
339
358
|
|
|
340
359
|
html_base_sample =<<BASE_SAMPLE
|
|
341
360
|
<html>
|
|
342
361
|
<head>
|
|
343
|
-
<base href='/en/'
|
|
362
|
+
<base href='/en/'>
|
|
344
363
|
<title>HTML base Sample</title>
|
|
345
364
|
</head>
|
|
346
365
|
<body>
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: content_urls
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.6
|
|
4
|
+
version: 0.1.7
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Dennis Sutch
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2013-07-
|
|
11
|
+
date: 2013-07-18 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: nokogiri
|