content_urls 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +15 -0
- data/.document +5 -5
- data/.rspec +1 -1
- data/Gemfile +13 -13
- data/LICENSE.txt +20 -20
- data/README.rdoc +73 -63
- data/Rakefile +42 -42
- data/VERSION +1 -1
- data/content_urls-0.1.0.gem +0 -0
- data/content_urls.gemspec +5 -4
- data/lib/content_urls.rb +107 -107
- data/lib/content_urls/parsers/css_parser.rb +126 -126
- data/lib/content_urls/parsers/html_parser.rb +172 -150
- data/lib/content_urls/parsers/java_script_parser.rb +64 -64
- data/lib/content_urls/version.rb +3 -3
- data/spec/content_urls_spec.rb +29 -29
- data/spec/css_parser_spec.rb +34 -34
- data/spec/html_parser_spec.rb +358 -318
- data/spec/java_script_parser_spec.rb +31 -31
- data/spec/spec_helper.rb +12 -12
- metadata +6 -23
checksums.yaml
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
---
|
2
|
+
!binary "U0hBMQ==":
|
3
|
+
metadata.gz: !binary |-
|
4
|
+
ODJhMDkzODQ3NDAzMWI5MGMzOWYzZDEzZDNkNTE4YTVlZjFmMjVmNA==
|
5
|
+
data.tar.gz: !binary |-
|
6
|
+
YWE0MGZkNmRkMGE1YWU4ODk5ZWI5YzQ2OGFkMmM5YjU4ZWRlY2ZlMw==
|
7
|
+
!binary "U0hBNTEy":
|
8
|
+
metadata.gz: !binary |-
|
9
|
+
MDJiOTE3YjlmMTI4NzY2Y2Q3ODI3NTNjYzMyMzRiYTkwNDJjODJjMTgwZjFi
|
10
|
+
NGFmYmUxY2NiZDVjOTM4ZjU3NDQ0MmFlNjk5ODVmNmJjZmJhMzQ3ZmMzZWQz
|
11
|
+
MzM1NDlhOGM2NDkwMTI0ZjEwYmE1ZGFkOTVlMmIyNzA4ODI0YjI=
|
12
|
+
data.tar.gz: !binary |-
|
13
|
+
OTEyZDY5ZjBjODVmODgwMDU4NzkwYzMyOGM5MmI2OTBiMGJkYjQwYTU2Y2M5
|
14
|
+
NDgxY2IzM2M1YzkxNjEwNzdlNDgyNTU1Y2U0YmYzZTUzY2U0NzAzNmZkMWI0
|
15
|
+
OWU5ZDU0NWU3MDU3YmFkNTBhNDFmZWJjM2Q2Y2NhMWE5YmU4MDQ=
|
data/.document
CHANGED
@@ -1,5 +1,5 @@
|
|
1
|
-
lib/**/*.rb
|
2
|
-
bin/*
|
3
|
-
-
|
4
|
-
features/**/*.feature
|
5
|
-
LICENSE.txt
|
1
|
+
lib/**/*.rb
|
2
|
+
bin/*
|
3
|
+
-
|
4
|
+
features/**/*.feature
|
5
|
+
LICENSE.txt
|
data/.rspec
CHANGED
@@ -1 +1 @@
|
|
1
|
-
#--color
|
1
|
+
#--color
|
data/Gemfile
CHANGED
@@ -1,13 +1,13 @@
|
|
1
|
-
source "http://rubygems.org"
|
2
|
-
|
3
|
-
gem "nokogiri"
|
4
|
-
|
5
|
-
group :development do
|
6
|
-
gem "rspec", "~> 2.8.0"
|
7
|
-
gem "yard", "~> 0.7"
|
8
|
-
gem "rdoc", "~> 3.12"
|
9
|
-
gem "bundler"
|
10
|
-
gem "jeweler", "~> 1.8.4"
|
11
|
-
gem "rcov", "0.9.9"
|
12
|
-
gem "rake", "~> 0.9.2.2"
|
13
|
-
end
|
1
|
+
source "http://rubygems.org"
|
2
|
+
|
3
|
+
gem "nokogiri"
|
4
|
+
|
5
|
+
group :development do
|
6
|
+
gem "rspec", "~> 2.8.0"
|
7
|
+
gem "yard", "~> 0.7"
|
8
|
+
gem "rdoc", "~> 3.12"
|
9
|
+
gem "bundler"
|
10
|
+
gem "jeweler", "~> 1.8.4"
|
11
|
+
gem "rcov", "0.9.9"
|
12
|
+
gem "rake", "~> 0.9.2.2"
|
13
|
+
end
|
data/LICENSE.txt
CHANGED
@@ -1,20 +1,20 @@
|
|
1
|
-
Copyright (c) 2012 Dennis Sutch
|
2
|
-
|
3
|
-
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
-
a copy of this software and associated documentation files (the
|
5
|
-
"Software"), to deal in the Software without restriction, including
|
6
|
-
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
-
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
-
permit persons to whom the Software is furnished to do so, subject to
|
9
|
-
the following conditions:
|
10
|
-
|
11
|
-
The above copyright notice and this permission notice shall be
|
12
|
-
included in all copies or substantial portions of the Software.
|
13
|
-
|
14
|
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
-
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
-
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
-
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
-
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
-
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
-
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
1
|
+
Copyright (c) 2012 Dennis Sutch
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.rdoc
CHANGED
@@ -1,63 +1,73 @@
|
|
1
|
-
=
|
2
|
-
|
3
|
-
Find and rewrite URLs in different types of content.
|
4
|
-
|
5
|
-
ContentUrls was developed to address two use cases:
|
6
|
-
* Find each URL in content retrieved from a website in order to spider and find all content on the website.
|
7
|
-
* Rewrite each URL in content retrieved from a website in order to make a working local copy of the website.
|
8
|
-
|
9
|
-
== Features
|
10
|
-
* Three types of content: HTML, CSS and JavaScript
|
11
|
-
* HTML content
|
12
|
-
* <a> tag href attribute
|
13
|
-
* <area> tag href attribute
|
14
|
-
* <body> tag background attribute
|
15
|
-
* <embed> tag src attribute
|
16
|
-
* <
|
17
|
-
* <
|
18
|
-
* <
|
19
|
-
* <
|
20
|
-
* <
|
21
|
-
*
|
22
|
-
*
|
23
|
-
*
|
24
|
-
* CSS content
|
25
|
-
|
26
|
-
*
|
27
|
-
*
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
*
|
48
|
-
*
|
49
|
-
*
|
50
|
-
*
|
51
|
-
*
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
*
|
57
|
-
*
|
58
|
-
*
|
59
|
-
*
|
60
|
-
*
|
61
|
-
|
62
|
-
|
63
|
-
|
1
|
+
= ContentUrls
|
2
|
+
|
3
|
+
Find and rewrite URLs in different types of content.
|
4
|
+
|
5
|
+
ContentUrls was developed to address two use cases:
|
6
|
+
* Find each URL in content retrieved from a website in order to spider and find all content on the website.
|
7
|
+
* Rewrite each URL in content retrieved from a website in order to make a working local copy of the website.
|
8
|
+
|
9
|
+
== Features
|
10
|
+
* Three types of content: HTML, CSS and JavaScript
|
11
|
+
* HTML content
|
12
|
+
* <a> tag href attribute
|
13
|
+
* <area> tag href attribute
|
14
|
+
* <body> tag background attribute
|
15
|
+
* <embed> tag src attribute
|
16
|
+
* <frame> tag src attribute
|
17
|
+
* <iframe> tag src attribute
|
18
|
+
* <img> tag src attribute
|
19
|
+
* <link> tag href attribute
|
20
|
+
* <meta> tag content attribute containing URL
|
21
|
+
* <object> tag data attribute
|
22
|
+
* <script> tag src attribute
|
23
|
+
* style attribute of any tag (parsed as CSS content)
|
24
|
+
* body of <style> tag (parsed as CSS content)
|
25
|
+
* body of <script> tag when type or language attribute identifies JavaScript (parsed as JavaScript content)
|
26
|
+
* CSS content
|
27
|
+
* url() notation
|
28
|
+
* JavaScript content
|
29
|
+
* URI module's REGEXP
|
30
|
+
|
31
|
+
== Examples
|
32
|
+
=== Find URLs in an HTML document
|
33
|
+
Provide the HTML content and the content type and obtain an array of unique URLs.
|
34
|
+
ContentUrls.urls(html, 'text/html').each do |url|
|
35
|
+
puts "Found URL: #{url}"
|
36
|
+
end
|
37
|
+
|
38
|
+
=== Rewrite URLs in an HTML document
|
39
|
+
Provide the HTML content, the content type, and a block to rewrite each URL's extension.
|
40
|
+
rewritten_html = ContentUrls.rewrite_each_url(html, 'text/html') {|url| url.sub(/\.htm$/, '.html')}
|
41
|
+
|
42
|
+
== Requirements
|
43
|
+
* nokogiri
|
44
|
+
|
45
|
+
== Development
|
46
|
+
To test and develop this gem, additional requirements are:
|
47
|
+
* bundler
|
48
|
+
* jeweler
|
49
|
+
* rake
|
50
|
+
* rcov
|
51
|
+
* rdoc
|
52
|
+
* rspec
|
53
|
+
* yard
|
54
|
+
|
55
|
+
== Goals for ContentUrls
|
56
|
+
* Include support for:
|
57
|
+
* Acrobat (.pdf)
|
58
|
+
* Flash (.swf)
|
59
|
+
* Microsoft Office (.doc, .xls, .ppt)
|
60
|
+
* text (regular expression for URLs)
|
61
|
+
* Capture links retrieved from a headless web browser which executes the code (JavaScript, etc.)
|
62
|
+
|
63
|
+
== Contributing to content_urls
|
64
|
+
* Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet.
|
65
|
+
* Check out the issue tracker to make sure someone hasn't already requested it and/or contributed it.
|
66
|
+
* Fork the project.
|
67
|
+
* Start a feature/bugfix branch.
|
68
|
+
* Commit and push until you are happy with your contribution.
|
69
|
+
* Make sure to add tests for it. This is important so I don't unintentionally break it in a future version.
|
70
|
+
* Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or if changing them is otherwise necessary, that is fine, but please isolate the change to its own commit so I can cherry-pick around it.
|
71
|
+
|
72
|
+
== Copyright
|
73
|
+
Copyright (c) 2012 Dennis Sutch. See LICENSE.txt for further details.
|
data/Rakefile
CHANGED
@@ -1,42 +1,42 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
require 'rubygems'
|
4
|
-
require 'bundler'
|
5
|
-
begin
|
6
|
-
Bundler.setup(:default, :development)
|
7
|
-
rescue Bundler::BundlerError => e
|
8
|
-
$stderr.puts e.message
|
9
|
-
$stderr.puts "Run `bundle install` to install missing gems"
|
10
|
-
exit e.status_code
|
11
|
-
end
|
12
|
-
require 'rake'
|
13
|
-
|
14
|
-
require 'jeweler'
|
15
|
-
Jeweler::Tasks.new do |gem|
|
16
|
-
# gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
|
17
|
-
gem.name = "content_urls"
|
18
|
-
gem.homepage = "http://github.com/sutch/content_urls"
|
19
|
-
gem.license = "MIT"
|
20
|
-
gem.summary = %Q{Find and rewrite URLs in different types of content.}
|
21
|
-
gem.description = %Q{Parses various file types (HTML, CSS, JavaScript, ...) for URLs and provides methods for iterating through URLs and changing URLs.}
|
22
|
-
gem.email = "dennis@sutch.com"
|
23
|
-
gem.authors = ["Dennis Sutch"]
|
24
|
-
# dependencies defined in Gemfile
|
25
|
-
end
|
26
|
-
Jeweler::RubygemsDotOrgTasks.new
|
27
|
-
|
28
|
-
require 'rspec/core'
|
29
|
-
require 'rspec/core/rake_task'
|
30
|
-
RSpec::Core::RakeTask.new(:spec) do |spec|
|
31
|
-
spec.pattern = FileList['spec/**/*_spec.rb']
|
32
|
-
end
|
33
|
-
|
34
|
-
RSpec::Core::RakeTask.new(:rcov) do |spec|
|
35
|
-
spec.pattern = 'spec/**/*_spec.rb'
|
36
|
-
spec.rcov = true
|
37
|
-
end
|
38
|
-
|
39
|
-
task :default => :spec
|
40
|
-
|
41
|
-
require 'yard'
|
42
|
-
YARD::Rake::YardocTask.new
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
require 'bundler'
|
5
|
+
begin
|
6
|
+
Bundler.setup(:default, :development)
|
7
|
+
rescue Bundler::BundlerError => e
|
8
|
+
$stderr.puts e.message
|
9
|
+
$stderr.puts "Run `bundle install` to install missing gems"
|
10
|
+
exit e.status_code
|
11
|
+
end
|
12
|
+
require 'rake'
|
13
|
+
|
14
|
+
require 'jeweler'
|
15
|
+
Jeweler::Tasks.new do |gem|
|
16
|
+
# gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
|
17
|
+
gem.name = "content_urls"
|
18
|
+
gem.homepage = "http://github.com/sutch/content_urls"
|
19
|
+
gem.license = "MIT"
|
20
|
+
gem.summary = %Q{Find and rewrite URLs in different types of content.}
|
21
|
+
gem.description = %Q{Parses various file types (HTML, CSS, JavaScript, ...) for URLs and provides methods for iterating through URLs and changing URLs.}
|
22
|
+
gem.email = "dennis@sutch.com"
|
23
|
+
gem.authors = ["Dennis Sutch"]
|
24
|
+
# dependencies defined in Gemfile
|
25
|
+
end
|
26
|
+
Jeweler::RubygemsDotOrgTasks.new
|
27
|
+
|
28
|
+
require 'rspec/core'
|
29
|
+
require 'rspec/core/rake_task'
|
30
|
+
RSpec::Core::RakeTask.new(:spec) do |spec|
|
31
|
+
spec.pattern = FileList['spec/**/*_spec.rb']
|
32
|
+
end
|
33
|
+
|
34
|
+
RSpec::Core::RakeTask.new(:rcov) do |spec|
|
35
|
+
spec.pattern = 'spec/**/*_spec.rb'
|
36
|
+
spec.rcov = true
|
37
|
+
end
|
38
|
+
|
39
|
+
task :default => :spec
|
40
|
+
|
41
|
+
require 'yard'
|
42
|
+
YARD::Rake::YardocTask.new
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.
|
1
|
+
0.1.1
|
Binary file
|
data/content_urls.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "content_urls"
|
8
|
-
s.version = "0.1.
|
8
|
+
s.version = "0.1.1"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Dennis Sutch"]
|
12
|
-
s.date = "
|
12
|
+
s.date = "2013-07-10"
|
13
13
|
s.description = "Parses various file types (HTML, CSS, JavaScript, ...) for URLs and provides methods for iterating through URLs and changing URLs."
|
14
14
|
s.email = "dennis@sutch.com"
|
15
15
|
s.extra_rdoc_files = [
|
@@ -24,6 +24,7 @@ Gem::Specification.new do |s|
|
|
24
24
|
"README.rdoc",
|
25
25
|
"Rakefile",
|
26
26
|
"VERSION",
|
27
|
+
"content_urls-0.1.0.gem",
|
27
28
|
"content_urls.gemspec",
|
28
29
|
"lib/content_urls.rb",
|
29
30
|
"lib/content_urls/parsers/css_parser.rb",
|
@@ -39,11 +40,11 @@ Gem::Specification.new do |s|
|
|
39
40
|
s.homepage = "http://github.com/sutch/content_urls"
|
40
41
|
s.licenses = ["MIT"]
|
41
42
|
s.require_paths = ["lib"]
|
42
|
-
s.rubygems_version = "
|
43
|
+
s.rubygems_version = "2.0.3"
|
43
44
|
s.summary = "Find and rewrite URLs in different types of content."
|
44
45
|
|
45
46
|
if s.respond_to? :specification_version then
|
46
|
-
s.specification_version =
|
47
|
+
s.specification_version = 4
|
47
48
|
|
48
49
|
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
49
50
|
s.add_runtime_dependency(%q<nokogiri>, [">= 0"])
|
data/lib/content_urls.rb
CHANGED
@@ -1,107 +1,107 @@
|
|
1
|
-
require 'content_urls/version'
|
2
|
-
require 'uri'
|
3
|
-
|
4
|
-
# +ContentUrls+ parses various file types (HTML, CSS, JavaScript, ...) for URLs and provides methods for iterating through URLs and changing URLs.
|
5
|
-
#
|
6
|
-
class ContentUrls
|
7
|
-
|
8
|
-
# Returns the URLs found in the content.
|
9
|
-
#
|
10
|
-
# @param [String] content the content.
|
11
|
-
# @param [String] type the media type of the content.
|
12
|
-
# @return [Array] the unique URLs found in the content.
|
13
|
-
#
|
14
|
-
# @example Parse HTML code for URLs
|
15
|
-
# content = '<html><a href="index.html">Home</a></html>'
|
16
|
-
# ContentUrls.urls(content, 'text/html').each do |url|
|
17
|
-
# puts "Found URL: #{url}"
|
18
|
-
# end
|
19
|
-
# # => "Found URL: index.html"
|
20
|
-
#
|
21
|
-
# @example Parse content obtained from a robot
|
22
|
-
# response = Net::HTTP.get_response(URI('http://example.com/sample-1'))
|
23
|
-
# puts "URLs found at http://example.com/sample-1:"
|
24
|
-
# ContentUrls.urls(response.body, response.content_type).each do |url|
|
25
|
-
# puts " #{url}"
|
26
|
-
# end
|
27
|
-
# # => [a list of URLs found in the content located at http://example.com/sample-1]
|
28
|
-
#
|
29
|
-
def self.urls(content, type)
|
30
|
-
urls = []
|
31
|
-
if (parser = get_parser(type))
|
32
|
-
parser.new(content).urls.each { |url| urls << url }
|
33
|
-
end
|
34
|
-
urls
|
35
|
-
end
|
36
|
-
|
37
|
-
# Rewrites each URL in the content by calling the supplied block with each URL.
|
38
|
-
#
|
39
|
-
# @param [String] content the HTML content.
|
40
|
-
# @param [String] type the media type of the content.
|
41
|
-
# @returns [string] content the rewritten content.
|
42
|
-
#
|
43
|
-
# @example Rewrite URLs in HTML code
|
44
|
-
# content = '<html><a href="index.htm">Home</a></html>'
|
45
|
-
# content = ContentUrls.rewrite_each_url(content, 'text/html') {|url| 'gone.html'}
|
46
|
-
# puts "Rewritten: #{content}"
|
47
|
-
# # => "Rewritten: <html><a href="gone.html">Home</a></html>"
|
48
|
-
#
|
49
|
-
def self.rewrite_each_url(content, type, &block)
|
50
|
-
if (parser = get_parser(type))
|
51
|
-
parser.rewrite_each_url(content) do |url|
|
52
|
-
replacement = yield url
|
53
|
-
(replacement.nil? ? url : replacement)
|
54
|
-
end
|
55
|
-
end
|
56
|
-
content
|
57
|
-
end
|
58
|
-
|
59
|
-
# Convert a relative URL to an absolute URL using base_url (for example, the content's original location or an HTML document's href attribute of the base tag).
|
60
|
-
#
|
61
|
-
# @example Obtain absolute URL of "../index.html" of page obtained from "http://example.com/one/two/sample.html"
|
62
|
-
# puts ContentUrls.to_absolute("../index.html", "http://example.com/folder/sample.html")
|
63
|
-
# # => "http://example.com/index.html"
|
64
|
-
#
|
65
|
-
def self.to_absolute(url, base_url)
|
66
|
-
return nil if url.nil?
|
67
|
-
|
68
|
-
url = URI.encode(URI.decode(url.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))) # remove anchor
|
69
|
-
absolute = URI(base_url).merge(url)
|
70
|
-
absolute.path = '/' if absolute.path.empty?
|
71
|
-
absolute.to_s
|
72
|
-
end
|
73
|
-
|
74
|
-
protected
|
75
|
-
|
76
|
-
@@type_parser = Hash.new { |hash, key| hash[key] = [] } # mapping of type regex to parser class
|
77
|
-
|
78
|
-
# Register a parser implementation class for one or more content type regular expressions
|
79
|
-
def self.register_parser(parser_class, *type_regexes)
|
80
|
-
type_regexes.each do |regex|
|
81
|
-
@@type_parser[regex].push parser_class
|
82
|
-
end
|
83
|
-
end
|
84
|
-
|
85
|
-
# Return parser for a file type or nil if content type not recognized
|
86
|
-
def self.get_parser(type)
|
87
|
-
@@type_parser.each_pair do |regex, parser|
|
88
|
-
if type =~ regex
|
89
|
-
return parser.first
|
90
|
-
end
|
91
|
-
end
|
92
|
-
return nil
|
93
|
-
end
|
94
|
-
|
95
|
-
# Parser implementations
|
96
|
-
# - each implementation's urls method should return unique URLs
|
97
|
-
|
98
|
-
require 'content_urls/parsers/html_parser'
|
99
|
-
register_parser ContentUrls::HtmlParser, %r{^(text/html)\b}, %r{^(application/xhtml+xml)\b}
|
100
|
-
|
101
|
-
require 'content_urls/parsers/css_parser'
|
102
|
-
register_parser ContentUrls::CssParser, %r{^(text/css)\b}
|
103
|
-
|
104
|
-
require 'content_urls/parsers/java_script_parser'
|
105
|
-
register_parser ContentUrls::JavaScriptParser, %r{^(application/x-javascript)\b}, %r{^(application/javascript)\b}, %r{^(text/javascript)\b}
|
106
|
-
|
107
|
-
end
|
1
|
+
require 'content_urls/version'
|
2
|
+
require 'uri'
|
3
|
+
|
4
|
+
# +ContentUrls+ parses various file types (HTML, CSS, JavaScript, ...) for URLs and provides methods for iterating through URLs and changing URLs.
|
5
|
+
#
|
6
|
+
class ContentUrls
|
7
|
+
|
8
|
+
# Returns the URLs found in the content.
|
9
|
+
#
|
10
|
+
# @param [String] content the content.
|
11
|
+
# @param [String] type the media type of the content.
|
12
|
+
# @return [Array] the unique URLs found in the content.
|
13
|
+
#
|
14
|
+
# @example Parse HTML code for URLs
|
15
|
+
# content = '<html><a href="index.html">Home</a></html>'
|
16
|
+
# ContentUrls.urls(content, 'text/html').each do |url|
|
17
|
+
# puts "Found URL: #{url}"
|
18
|
+
# end
|
19
|
+
# # => "Found URL: index.html"
|
20
|
+
#
|
21
|
+
# @example Parse content obtained from a robot
|
22
|
+
# response = Net::HTTP.get_response(URI('http://example.com/sample-1'))
|
23
|
+
# puts "URLs found at http://example.com/sample-1:"
|
24
|
+
# ContentUrls.urls(response.body, response.content_type).each do |url|
|
25
|
+
# puts " #{url}"
|
26
|
+
# end
|
27
|
+
# # => [a list of URLs found in the content located at http://example.com/sample-1]
|
28
|
+
#
|
29
|
+
def self.urls(content, type)
|
30
|
+
urls = []
|
31
|
+
if (parser = get_parser(type))
|
32
|
+
parser.new(content).urls.each { |url| urls << url }
|
33
|
+
end
|
34
|
+
urls
|
35
|
+
end
|
36
|
+
|
37
|
+
# Rewrites each URL in the content by calling the supplied block with each URL.
|
38
|
+
#
|
39
|
+
# @param [String] content the HTML content.
|
40
|
+
# @param [String] type the media type of the content.
|
41
|
+
# @return [String] the rewritten content.
|
42
|
+
#
|
43
|
+
# @example Rewrite URLs in HTML code
|
44
|
+
# content = '<html><a href="index.htm">Home</a></html>'
|
45
|
+
# content = ContentUrls.rewrite_each_url(content, 'text/html') {|url| 'gone.html'}
|
46
|
+
# puts "Rewritten: #{content}"
|
47
|
+
# # => "Rewritten: <html><a href="gone.html">Home</a></html>"
|
48
|
+
#
|
49
|
+
def self.rewrite_each_url(content, type, &block)
|
50
|
+
if (parser = get_parser(type))
|
51
|
+
parser.rewrite_each_url(content) do |url|
|
52
|
+
replacement = yield url
|
53
|
+
(replacement.nil? ? url : replacement)
|
54
|
+
end
|
55
|
+
end
|
56
|
+
content
|
57
|
+
end
|
58
|
+
|
59
|
+
# Convert a relative URL to an absolute URL using base_url (for example, the content's original location or an HTML document's href attribute of the base tag).
|
60
|
+
#
|
61
|
+
# @example Obtain absolute URL of "../index.html" of page obtained from "http://example.com/folder/sample.html"
|
62
|
+
# puts ContentUrls.to_absolute("../index.html", "http://example.com/folder/sample.html")
|
63
|
+
# # => "http://example.com/index.html"
|
64
|
+
#
|
65
|
+
def self.to_absolute(url, base_url)
|
66
|
+
return nil if url.nil?
|
67
|
+
|
68
|
+
url = URI.encode(URI.decode(url.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))) # remove anchor
|
69
|
+
absolute = URI(base_url).merge(url)
|
70
|
+
absolute.path = '/' if absolute.path.empty?
|
71
|
+
absolute.to_s
|
72
|
+
end
|
73
|
+
|
74
|
+
protected
|
75
|
+
|
76
|
+
@@type_parser = Hash.new { |hash, key| hash[key] = [] } # mapping of type regex to parser class
|
77
|
+
|
78
|
+
# Register a parser implementation class for one or more content type regular expressions
|
79
|
+
def self.register_parser(parser_class, *type_regexes)
|
80
|
+
type_regexes.each do |regex|
|
81
|
+
@@type_parser[regex].push parser_class
|
82
|
+
end
|
83
|
+
end
|
84
|
+
|
85
|
+
# Return parser for a file type or nil if content type not recognized
|
86
|
+
def self.get_parser(type)
|
87
|
+
@@type_parser.each_pair do |regex, parser|
|
88
|
+
if type =~ regex
|
89
|
+
return parser.first
|
90
|
+
end
|
91
|
+
end
|
92
|
+
return nil
|
93
|
+
end
|
94
|
+
|
95
|
+
# Parser implementations
|
96
|
+
# - each implementation's urls method should return unique URLs
|
97
|
+
|
98
|
+
require 'content_urls/parsers/html_parser'
|
99
|
+
register_parser ContentUrls::HtmlParser, %r{^(text/html)\b}, %r{^(application/xhtml+xml)\b}
|
100
|
+
|
101
|
+
require 'content_urls/parsers/css_parser'
|
102
|
+
register_parser ContentUrls::CssParser, %r{^(text/css)\b}
|
103
|
+
|
104
|
+
require 'content_urls/parsers/java_script_parser'
|
105
|
+
register_parser ContentUrls::JavaScriptParser, %r{^(application/x-javascript)\b}, %r{^(application/javascript)\b}, %r{^(text/javascript)\b}
|
106
|
+
|
107
|
+
end
|