feedtools 0.2.26 → 0.2.27
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +232 -216
- data/db/migration.rb +2 -0
- data/db/schema.mysql.sql +2 -0
- data/db/schema.postgresql.sql +3 -1
- data/db/schema.sqlite.sql +3 -1
- data/lib/feed_tools.rb +37 -14
- data/lib/feed_tools/database_feed_cache.rb +13 -2
- data/lib/feed_tools/feed.rb +430 -104
- data/lib/feed_tools/feed_item.rb +533 -268
- data/lib/feed_tools/helpers/generic_helper.rb +1 -1
- data/lib/feed_tools/helpers/html_helper.rb +78 -116
- data/lib/feed_tools/helpers/retrieval_helper.rb +33 -3
- data/lib/feed_tools/helpers/uri_helper.rb +46 -54
- data/lib/feed_tools/monkey_patch.rb +27 -1
- data/lib/feed_tools/vendor/html5/History.txt +10 -0
- data/lib/feed_tools/vendor/html5/Manifest.txt +117 -0
- data/lib/feed_tools/vendor/html5/README +45 -0
- data/lib/feed_tools/vendor/html5/Rakefile.rb +33 -0
- data/lib/feed_tools/vendor/html5/bin/html5 +217 -0
- data/lib/feed_tools/vendor/html5/lib/core_ext/string.rb +17 -0
- data/lib/feed_tools/vendor/html5/lib/html5.rb +13 -0
- data/lib/feed_tools/vendor/html5/lib/html5/constants.rb +1046 -0
- data/lib/feed_tools/vendor/html5/lib/html5/filters/base.rb +10 -0
- data/lib/feed_tools/vendor/html5/lib/html5/filters/inject_meta_charset.rb +82 -0
- data/lib/feed_tools/vendor/html5/lib/html5/filters/iso639codes.rb +752 -0
- data/lib/feed_tools/vendor/html5/lib/html5/filters/optionaltags.rb +198 -0
- data/lib/feed_tools/vendor/html5/lib/html5/filters/rfc2046.rb +30 -0
- data/lib/feed_tools/vendor/html5/lib/html5/filters/rfc3987.rb +89 -0
- data/lib/feed_tools/vendor/html5/lib/html5/filters/sanitizer.rb +15 -0
- data/lib/feed_tools/vendor/html5/lib/html5/filters/validator.rb +830 -0
- data/lib/feed_tools/vendor/html5/lib/html5/filters/whitespace.rb +36 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser.rb +248 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/after_body_phase.rb +46 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/after_frameset_phase.rb +33 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/after_head_phase.rb +50 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/before_head_phase.rb +41 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_body_phase.rb +613 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_caption_phase.rb +69 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_cell_phase.rb +78 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_column_group_phase.rb +55 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_frameset_phase.rb +57 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_head_phase.rb +138 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_row_phase.rb +89 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_select_phase.rb +85 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_table_body_phase.rb +86 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_table_phase.rb +115 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/initial_phase.rb +133 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/phase.rb +154 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/root_element_phase.rb +41 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/trailing_end_phase.rb +35 -0
- data/lib/feed_tools/vendor/html5/lib/html5/inputstream.rb +648 -0
- data/lib/feed_tools/vendor/html5/lib/html5/liberalxmlparser.rb +158 -0
- data/lib/feed_tools/vendor/html5/lib/html5/sanitizer.rb +188 -0
- data/lib/feed_tools/vendor/html5/lib/html5/serializer.rb +2 -0
- data/lib/feed_tools/vendor/html5/lib/html5/serializer/htmlserializer.rb +179 -0
- data/lib/feed_tools/vendor/html5/lib/html5/serializer/xhtmlserializer.rb +20 -0
- data/lib/feed_tools/vendor/html5/lib/html5/sniffer.rb +45 -0
- data/lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb +966 -0
- data/lib/feed_tools/vendor/html5/lib/html5/treebuilders.rb +24 -0
- data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/base.rb +334 -0
- data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/hpricot.rb +231 -0
- data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/rexml.rb +209 -0
- data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/simpletree.rb +185 -0
- data/lib/feed_tools/vendor/html5/lib/html5/treewalkers.rb +26 -0
- data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/base.rb +162 -0
- data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/hpricot.rb +48 -0
- data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/rexml.rb +48 -0
- data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/simpletree.rb +48 -0
- data/lib/feed_tools/vendor/html5/lib/html5/version.rb +3 -0
- data/lib/feed_tools/vendor/html5/testdata/encoding/chardet/test_big5.txt +51 -0
- data/lib/feed_tools/vendor/html5/testdata/encoding/test-yahoo-jp.dat +10 -0
- data/lib/feed_tools/vendor/html5/testdata/encoding/tests1.dat +394 -0
- data/lib/feed_tools/vendor/html5/testdata/encoding/tests2.dat +81 -0
- data/lib/feed_tools/vendor/html5/testdata/sanitizer/tests1.dat +416 -0
- data/lib/feed_tools/vendor/html5/testdata/serializer/core.test +104 -0
- data/lib/feed_tools/vendor/html5/testdata/serializer/injectmeta.test +65 -0
- data/lib/feed_tools/vendor/html5/testdata/serializer/optionaltags.test +900 -0
- data/lib/feed_tools/vendor/html5/testdata/serializer/options.test +60 -0
- data/lib/feed_tools/vendor/html5/testdata/serializer/whitespace.test +51 -0
- data/lib/feed_tools/vendor/html5/testdata/sites/google-results.htm +1 -0
- data/lib/feed_tools/vendor/html5/testdata/sites/python-ref-import.htm +1 -0
- data/lib/feed_tools/vendor/html5/testdata/sites/web-apps-old.htm +1 -0
- data/lib/feed_tools/vendor/html5/testdata/sites/web-apps.htm +34275 -0
- data/lib/feed_tools/vendor/html5/testdata/sniffer/htmlOrFeed.json +43 -0
- data/lib/feed_tools/vendor/html5/testdata/tokenizer/contentModelFlags.test +48 -0
- data/lib/feed_tools/vendor/html5/testdata/tokenizer/entities.test +2339 -0
- data/lib/feed_tools/vendor/html5/testdata/tokenizer/escapeFlag.test +21 -0
- data/lib/feed_tools/vendor/html5/testdata/tokenizer/test1.test +172 -0
- data/lib/feed_tools/vendor/html5/testdata/tokenizer/test2.test +129 -0
- data/lib/feed_tools/vendor/html5/testdata/tokenizer/test3.test +367 -0
- data/lib/feed_tools/vendor/html5/testdata/tokenizer/test4.test +198 -0
- data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests1.dat +1950 -0
- data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests2.dat +773 -0
- data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests3.dat +270 -0
- data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests4.dat +60 -0
- data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests5.dat +175 -0
- data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests6.dat +196 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/attributes.test +1035 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/base-href-attribute.test +787 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/base-target-attribute.test +35 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/blockquote-cite-attribute.test +7 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/classattribute.test +152 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/contenteditableattribute.test +59 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/contextmenuattribute.test +115 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/dirattribute.test +59 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/draggableattribute.test +63 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/html-xmlns-attribute.test +23 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/idattribute.test +115 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/inputattributes.test +2795 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/irrelevantattribute.test +63 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/langattribute.test +5579 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/li-value-attribute.test +7 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/link-href-attribute.test +7 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/link-hreflang-attribute.test +7 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/link-rel-attribute.test +271 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/ol-start-attribute.test +7 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/starttags.test +375 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/style-scoped-attribute.test +7 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/tabindexattribute.test +79 -0
- data/lib/feed_tools/vendor/html5/tests/preamble.rb +72 -0
- data/lib/feed_tools/vendor/html5/tests/test_encoding.rb +35 -0
- data/lib/feed_tools/vendor/html5/tests/test_lxp.rb +279 -0
- data/lib/feed_tools/vendor/html5/tests/test_parser.rb +68 -0
- data/lib/feed_tools/vendor/html5/tests/test_sanitizer.rb +142 -0
- data/lib/feed_tools/vendor/html5/tests/test_serializer.rb +68 -0
- data/lib/feed_tools/vendor/html5/tests/test_sniffer.rb +27 -0
- data/lib/feed_tools/vendor/html5/tests/test_stream.rb +62 -0
- data/lib/feed_tools/vendor/html5/tests/test_tokenizer.rb +94 -0
- data/lib/feed_tools/vendor/html5/tests/test_treewalkers.rb +135 -0
- data/lib/feed_tools/vendor/html5/tests/test_validator.rb +31 -0
- data/lib/feed_tools/vendor/html5/tests/tokenizer_test_parser.rb +63 -0
- data/lib/feed_tools/vendor/uri.rb +781 -0
- data/lib/feed_tools/version.rb +1 -1
- data/rakefile +27 -6
- data/test/unit/atom_test.rb +298 -210
- data/test/unit/helper_test.rb +7 -12
- data/test/unit/rdf_test.rb +51 -1
- data/test/unit/rss_test.rb +13 -3
- metadata +239 -116
- data/lib/feed_tools/vendor/htree.rb +0 -97
- data/lib/feed_tools/vendor/htree/container.rb +0 -10
- data/lib/feed_tools/vendor/htree/context.rb +0 -67
- data/lib/feed_tools/vendor/htree/display.rb +0 -27
- data/lib/feed_tools/vendor/htree/doc.rb +0 -149
- data/lib/feed_tools/vendor/htree/elem.rb +0 -262
- data/lib/feed_tools/vendor/htree/encoder.rb +0 -163
- data/lib/feed_tools/vendor/htree/equality.rb +0 -218
- data/lib/feed_tools/vendor/htree/extract_text.rb +0 -37
- data/lib/feed_tools/vendor/htree/fstr.rb +0 -33
- data/lib/feed_tools/vendor/htree/gencode.rb +0 -97
- data/lib/feed_tools/vendor/htree/htmlinfo.rb +0 -672
- data/lib/feed_tools/vendor/htree/inspect.rb +0 -108
- data/lib/feed_tools/vendor/htree/leaf.rb +0 -94
- data/lib/feed_tools/vendor/htree/loc.rb +0 -367
- data/lib/feed_tools/vendor/htree/modules.rb +0 -48
- data/lib/feed_tools/vendor/htree/name.rb +0 -124
- data/lib/feed_tools/vendor/htree/output.rb +0 -207
- data/lib/feed_tools/vendor/htree/parse.rb +0 -409
- data/lib/feed_tools/vendor/htree/raw_string.rb +0 -124
- data/lib/feed_tools/vendor/htree/regexp-util.rb +0 -15
- data/lib/feed_tools/vendor/htree/rexml.rb +0 -130
- data/lib/feed_tools/vendor/htree/scan.rb +0 -166
- data/lib/feed_tools/vendor/htree/tag.rb +0 -111
- data/lib/feed_tools/vendor/htree/template.rb +0 -909
- data/lib/feed_tools/vendor/htree/text.rb +0 -115
- data/lib/feed_tools/vendor/htree/traverse.rb +0 -465
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
require File.join(File.dirname(__FILE__), 'preamble')
|
|
2
|
+
|
|
3
|
+
require 'html5/html5parser'
|
|
4
|
+
require 'html5/treewalkers'
|
|
5
|
+
require 'html5/treebuilders'
|
|
6
|
+
|
|
7
|
+
# Tree implementations exercised by the tree walker tests, keyed by name.
# Each entry pairs the builder looked up in HTML5::TreeBuilders with the
# walker looked up in HTML5::TreeWalkers under the same name.
$tree_types_to_test =
  %w(simpletree rexml hpricot).inject({}) do |types, tree_name|
    types[tree_name] = {
      :builder => HTML5::TreeBuilders[tree_name],
      :walker  => HTML5::TreeWalkers[tree_name]
    }
    types
  end

# Announce which tree implementations are about to be tested.
puts 'Testing tree walkers: ' + $tree_types_to_test.keys.join(', ')
|
|
20
|
+
|
|
21
|
+
# Runs the tree-construction fixtures through every builder/walker pair in
# $tree_types_to_test: each input is parsed with the builder, the resulting
# tree is walked back into tokens, the tokens are rendered as an indented
# text dump, and that dump is compared with the fixture's expected output.
class TestTreeWalkers < Test::Unit::TestCase
  include HTML5::TestSupport

  # Yields every token of +tokens+ in order, except that each run of
  # adjacent :Characters / :SpaceCharacters tokens is collapsed into a
  # single :Characters token whose :data is the concatenation of the run.
  def concatenateCharacterTokens(tokens)
    pending = nil
    tokens.each do |token|
      if [:Characters, :SpaceCharacters].include?(token[:type])
        if pending.nil?
          pending = {:type => :Characters, :data => token[:data]}
        else
          pending[:data] += token[:data]
        end
      else
        # A non-character token ends the current run; flush it first.
        unless pending.nil?
          yield pending
          pending = nil
        end
        yield token
      end
    end
    yield pending unless pending.nil?
  end

  # Renders a token stream as the indented, one-node-per-line text format
  # that the fixtures are compared against. Returns a single String.
  def convertTokens(tokens)
    output = []
    indent = 0
    concatenateCharacterTokens(tokens) do |token|
      case token[:type]
      when :StartTag, :EmptyTag
        output << "#{' '*indent}<#{token[:name]}>"
        indent += 2
        token[:data].to_a.sort.each do |name, value|
          next if name == 'xmlns'
          output << "#{' '*indent}#{name}=\"#{value}\""
        end
        # Empty tags have no matching :EndTag, so undo the indent here.
        indent -= 2 if token[:type] == :EmptyTag
      when :EndTag
        indent -= 2
      when :Comment
        output << "#{' '*indent}<!-- #{token[:data]} -->"
      when :Doctype
        # String#any? only exists on Ruby 1.8 (String stopped including
        # Enumerable in 1.9); an explicit emptiness check works everywhere.
        if token[:name] && !token[:name].empty?
          output << "#{' '*indent}<!DOCTYPE #{token[:name]}>"
        else
          output << "#{' '*indent}<!DOCTYPE >"
        end
      when :Characters, :SpaceCharacters
        output << "#{' '*indent}\"#{token[:data]}\""
      end
    end
    output.join("\n")
  end

  # Define one test method per (fixture file, test case, tree type).
  html5_test_files('tree-construction').each do |test_file|

    test_name = File.basename(test_file).sub('.dat', '')
    next if test_name == 'tests5' # TODO

    TestData.new(test_file, %w(data errors document-fragment document)).
      each_with_index do |(input, errors, inner_html, expected), index|

      # Strip the "| " gutter from every line of the expected dump.
      expected = expected.gsub("\n| ","\n")[2..-1]

      $tree_types_to_test.each do |tree_name, tree_class|

        define_method "test_#{test_name}_#{index}_#{tree_name}" do

          parser = HTML5::HTMLParser.new(:tree => tree_class[:builder])

          if inner_html
            parser.parse_fragment(input, inner_html)
          else
            parser.parse(input)
          end

          document = parser.tree.get_document

          begin
            output = sortattrs(convertTokens(tree_class[:walker].new(document)))
            expected = sortattrs(expected)
            assert_equal expected, output, [
              '', 'Input:', input,
              '', 'Expected:', expected,
              '', 'Received:', output
            ].join("\n")
          rescue NotImplementedError
            # Amnesty for those that confess...
          end
        end
      end
    end
  end

  # Checks the raw tokens each walker emits for a minimal document. Only
  # as many tokens as +expected+ lists are compared; a missing token shows
  # up as a comparison against nil.
  def test_all_tokens
    expected = [
      {:data => [], :type => :StartTag, :name => 'html'},
      {:data => [], :type => :StartTag, :name => 'head'},
      {:data => [], :type => :EndTag,   :name => 'head'},
      {:data => [], :type => :StartTag, :name => 'body'},
      {:data => [], :type => :EndTag,   :name => 'body'},
      {:data => [], :type => :EndTag,   :name => 'html'}]
    $tree_types_to_test.each_value do |tree_class|
      parser = HTML5::HTMLParser.new(:tree => tree_class[:builder])
      document = parser.parse("<html></html>")
      output = tree_class[:walker].new(document)
      expected.zip(output) do |expected_token, output_token|
        assert_equal(expected_token, output_token)
      end
    end
  end

end
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
#!/usr/bin/env ruby -wKU
|
|
2
|
+
|
|
3
|
+
require File.join(File.dirname(__FILE__), 'preamble')
|
|
4
|
+
|
|
5
|
+
require 'html5'
|
|
6
|
+
require 'html5/filters/validator'
|
|
7
|
+
|
|
8
|
+
# Generates one test method per case found in the validator fixture files.
# Each case is parsed with HTMLConformanceChecker as the tokenizer and the
# reported error codes are checked against the case's expectations.
class TestValidator < Test::Unit::TestCase
  # Parses test['input'] and asserts on the collected error codes:
  # * 'fail-if'     - the given code must NOT have been reported
  # * 'fail-unless' - the given code MUST have been reported
  # The code is taken from index 1 of each entry in parser.errors
  # (presumably [position, code, ...] -- confirm against the parser).
  def run_validator_test(test)
    parser = HTML5::HTMLParser.new(:tokenizer => HTMLConformanceChecker)
    parser.parse(test['input'])
    error_codes = parser.errors.collect { |error| error[1] }
    if test.has_key?('fail-if')
      assert !error_codes.include?(test['fail-if'])
    end
    if test.has_key?('fail-unless')
      assert error_codes.include?(test['fail-unless'])
    end
  end

  html5_test_files('validator').each do |filename|
    # The block form of File.open closes the handle once the fixture has
    # been read; the previous bare open() call never closed it.
    tests = File.open(filename) { |file| JSON.load(file) }
    test_name = File.basename(filename).sub(".test", "")
    tests['tests'].each_with_index do |test, index|
      define_method "test_#{test_name}_#{index}" do
        run_validator_test(test)
      end
    end
  end
end
|
|
31
|
+
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
require 'html5/constants'
|
|
2
|
+
|
|
3
|
+
# Flattens the token stream produced by a tokenizer into the plain array
# format used by the tokenizer fixtures: arrays such as
# ["StartTag", name, data] for regular tokens and the bare string
# "ParseError" for errors.
class TokenizerTestParser
  # tokenizer - any object whose #each yields token hashes carrying a
  #             :type key (plus :name / :data etc. depending on the type).
  def initialize(tokenizer)
    @tokenizer = tokenizer
  end

  # Consumes the whole tokenizer and returns the flattened output array.
  # Every token is dispatched to the method named "process<type>".
  def parse
    @outputTokens = []

    @tokenizer.each do |token|
      send("process#{token[:type]}", token)
    end

    @outputTokens
  end

  def processDoctype(token)
    @outputTokens.push(["DOCTYPE", token[:name], token[:publicId],
      token[:systemId], token[:correct]])
  end

  def processStartTag(token)
    @outputTokens.push(["StartTag", token[:name], token[:data]])
  end

  # A self-closing tag on a non-void element is reported as a parse error
  # followed by an ordinary start tag.
  def processEmptyTag(token)
    unless HTML5::VOID_ELEMENTS.include?(token[:name])
      @outputTokens.push("ParseError")
    end
    @outputTokens.push(["StartTag", token[:name], token[:data]])
  end

  # An end tag that carries data (attributes) is reported as a parse error
  # before the end tag itself.
  def processEndTag(token)
    processParseError(token) unless token[:data].empty?
    @outputTokens.push(["EndTag", token[:name]])
  end

  def processComment(token)
    @outputTokens.push(["Comment", token[:data]])
  end

  # Whitespace-only and regular character tokens are reported identically.
  def processCharacters(token)
    @outputTokens.push(["Character", token[:data]])
  end

  alias processSpaceCharacters processCharacters

  # NOTE(review): #parse dispatches to "process" + type, so this name is
  # only reachable if called directly -- confirm whether an EOF token type
  # exists and what it is called.
  def process_eof(token)
  end

  def processParseError(token)
    @outputTokens.push("ParseError")
  end
end
|
|
@@ -0,0 +1,781 @@
|
|
|
1
|
+
module FeedTools
|
|
2
|
+
# This is an implementation of a URI parser based on RFC 3986.
|
|
3
|
+
class URI
|
|
4
|
+
# Raised if something other than a uri is supplied.
|
|
5
|
+
# Error type for the case where the supplied value is not a uri.
class InvalidURIError < StandardError; end
|
|
7
|
+
# Raised if an invalid method option is supplied.
|
|
8
|
+
# Error type for an unrecognised key in a method's options hash.
class InvalidOptionError < StandardError; end
|
|
10
|
+
|
|
11
|
+
# Returns a URI object based on the parsed string.
|
|
12
|
+
# Splits +uri_string+ into scheme, authority, path, query and fragment
# with a single regular expression, then splits the authority further
# into userinfo, host and port. Returns nil for nil input, and returns
# the argument itself when it is already an instance of this class.
def self.parse(uri_string)
  return nil if uri_string.nil?
  return uri_string if uri_string.kind_of?(self)

  components = uri_string.scan(
    /^(([^:\/?#]+):)?(\/\/([^\/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?/
  )[0]
  return nil if components.nil?

  scheme, authority, path, query, fragment =
    components.values_at(1, 3, 4, 6, 8)

  userinfo = host = port = nil
  unless authority.nil?
    userinfo_pattern = /^([^\[\]]*)@/
    port_pattern = /:([^:@\[\]]*?)$/
    userinfo = authority.scan(userinfo_pattern).flatten[0]
    # The host is whatever remains once userinfo and port are removed.
    host = authority.gsub(userinfo_pattern, "").gsub(port_pattern, "")
    port = authority.scan(port_pattern).flatten[0]
  end
  # A trailing ":" with nothing after it counts as "no port".
  port = nil if port == ""

  # WARNING: Not standards-compliant, but follows the theme of Postel's
  # law.
  #
  # Special case for the "feed:" pseudo-protocol wrapped around an http
  # URI (e.g. "feed://http://..."). Read literally, such a URI has "http"
  # as its host and a blank port. Instead, the authority is cleared and
  # the embedded URI is kept as an opaque path.
  if scheme == "feed" && host == "http"
    userinfo = host = port = nil
    path = authority + path
  end

  URI.new(scheme, userinfo, host, port, path, query, fragment)
end
|
|
57
|
+
|
|
58
|
+
# Converts a path to a file protocol URI. If the path supplied is
|
|
59
|
+
# relative, it will be returned as a relative URI. If the path supplied
|
|
60
|
+
# is actually a URI, it will return the parsed URI.
|
|
61
|
+
# Accepts a filesystem path or a uri string. Paths starting with "/" or
# with a drive letter ("C:\" / "C:/") are prefixed to become file uris;
# file uris are then cleaned up and passed through #normalize, while
# anything else is simply parsed. Returns nil for nil input.
def self.convert_path(path)
  return nil if path.nil?

  candidate = path.strip
  # Absolute POSIX-style path.
  candidate = "file://" + candidate if candidate[0..0] == "/"
  # Windows drive-letter path.
  candidate = "file:///" + candidate if candidate =~ /^[a-zA-Z]:[\\\/]/
  # Force exactly three slashes after the scheme.
  candidate = candidate.gsub(/^file:\/*/i, "file:///")

  return self.parse(candidate) unless candidate =~ /^file:/i

  # Adjust windows-style uris: "C|" becomes "C:" and backslashes become
  # forward slashes.
  candidate = candidate.gsub(/^file:\/\/\/([a-zA-Z])\|/i, 'file:///\1:')
  candidate = candidate.gsub(/\\/, '/')
  self.parse(candidate).normalize
end
|
|
84
|
+
|
|
85
|
+
# Joins several uris together.
|
|
86
|
+
# Parses every argument that is not already an instance of this class,
# duplicates the first one and merges each remaining uri into that copy
# in order via #merge!. Returns the merged copy.
def self.join(*uris)
  parsed = uris.collect do |candidate|
    candidate.kind_of?(self) ? candidate : self.parse(candidate.to_s)
  end
  base, *others = parsed
  joined = base.dup
  others.each { |other| joined.merge!(other) }
  joined
end
|
|
96
|
+
|
|
97
|
+
# Correctly escapes a uri.
|
|
98
|
+
# Returns a String form of +uri+ in which path, query and fragment have
# each been passed through normalize_escaping; scheme, userinfo, host and
# the explicitly specified port are carried over unchanged.
def self.escape(uri)
  parsed = uri.kind_of?(self) ? uri : self.parse(uri.to_s)
  escaped_path, escaped_query, escaped_fragment =
    [parsed.path, parsed.query, parsed.fragment].collect do |component|
      self.normalize_escaping(component)
    end
  escaped = URI.new(
    parsed.scheme, parsed.userinfo, parsed.host, parsed.specified_port,
    escaped_path, escaped_query, escaped_fragment
  )
  escaped.to_s
end
|
|
110
|
+
|
|
111
|
+
# Extracts uris from an arbitrary body of text.
|
|
112
|
+
# Scans +text+ for things that look like uris and returns them.
#
# Three patterns are tried and their results combined without
# duplicates: bare "scheme:..." tokens, href="..." attributes inside
# markup tags, and Textile-style "label":target links.
#
# Options:
#   :base  - a uri (object or string); when given, every extracted uri is
#            joined onto it with +.
#   :parse - when true, uri objects are returned instead of strings.
#
# Raises InvalidOptionError if any other option key is supplied.
def self.extract(text, options={})
  defaults = {:base => nil, :parse => false}
  options = defaults.merge(options)
  raise InvalidOptionError unless (options.keys - defaults.keys).empty?

  # This regular expression needs to be less forgiving or else it would
  # match virtually all text. Which isn't exactly what we're going for.
  extract_regex = /((([a-z\+]+):)[^ \n\<\>\"\\]+[\w\/])/
  extracted_uris =
    text.scan(extract_regex).collect { |match| match[0] }

  sgml_extract_regex = /<[^>]+href=\"([^\"]+?)\"[^>]*>/
  sgml_extracted_uris =
    text.scan(sgml_extract_regex).collect { |match| match[0] }
  extracted_uris.concat(sgml_extracted_uris - extracted_uris)

  textile_extract_regex = /\".+?\":([^ ]+\/[^ ]+)[ \,\.\;\:\?\!\<\>\"]/i
  textile_extracted_uris =
    text.scan(textile_extract_regex).collect { |match| match[0] }
  extracted_uris.concat(textile_extracted_uris - extracted_uris)

  base_uri = nil
  unless options[:base].nil?
    base_uri = options[:base] if options[:base].kind_of?(self)
    base_uri = self.parse(options[:base].to_s) if base_uri.nil?
  end

  parsed_uris = []
  extracted_uris.each do |uri_string|
    begin
      if base_uri.nil?
        parsed_uris << self.parse(uri_string)
      else
        parsed_uris << (base_uri + self.parse(uri_string))
      end
    rescue StandardError
      # Best effort: a candidate that cannot be parsed or joined is
      # skipped. Only StandardError is rescued so that interrupts and
      # exit requests still propagate.
      nil
    end
  end

  # Discard matches whose "scheme" is one of a fixed set of known false
  # positives.
  parsed_uris.reject! do |uri|
    (uri.scheme =~ /T\d+/ ||
      uri.scheme == "xmlns" ||
      uri.scheme == "xml" ||
      uri.scheme == "thr" ||
      uri.scheme == "this" ||
      uri.scheme == "float" ||
      uri.scheme == "user" ||
      uri.scheme == "username" ||
      uri.scheme == "out")
  end

  if options[:parse]
    return parsed_uris
  else
    return parsed_uris.collect { |uri| uri.to_s }
  end
end
|
|
163
|
+
|
|
164
|
+
# Creates a new uri object from component parts. Passing nil for
|
|
165
|
+
# any of these parameters is acceptable.
|
|
166
|
+
# Hands the seven component values, in declaration order, to
# assign_components.
def initialize(scheme, userinfo, host, port, path, query, fragment)
  components = [scheme, userinfo, host, port, path, query, fragment]
  assign_components(*components)
end
|
|
169
|
+
|
|
170
|
+
# Returns the scheme (protocol) for this URI.
|
|
171
|
+
# A scheme that is nil or contains only whitespace is reported as nil;
# otherwise the stored value is returned untouched (not stripped).
def scheme
  blank = @scheme.nil? || @scheme.strip == ""
  blank ? nil : @scheme
end
|
|
175
|
+
|
|
176
|
+
# Returns the username and password segment of this URI.
|
|
177
|
+
# Plain reader for @userinfo; the value is returned exactly as stored.
def userinfo
  @userinfo
end
|
|
180
|
+
|
|
181
|
+
# Returns the host for this URI.
|
|
182
|
+
# Plain reader for @host; the value is returned exactly as stored.
def host
  @host
end
|
|
185
|
+
|
|
186
|
+
# Returns the authority segment of this URI.
|
|
187
|
+
# Builds "userinfo@host:port" from the parts that are present and caches
# the result in @authority. The port is included only when one was
# explicitly specified. Returns nil (without caching) when there is no
# host.
def authority
  return @authority if defined?(@authority) && !@authority.nil?
  return nil if self.host.nil?

  @authority = ""
  @authority << "#{self.userinfo}@" unless self.userinfo.nil?
  @authority << self.host
  @authority << ":#{self.specified_port}" unless self.specified_port.nil?
  @authority
end
|
|
201
|
+
|
|
202
|
+
# Returns the user for this URI.
|
|
203
|
+
# The user is the part of userinfo before the first ":", with surrounding
# whitespace removed. When userinfo contains no ":" at all, the whole
# userinfo is the user. Returns nil when there is no userinfo.
#
# The result is cached in @user once it is non-nil.
def user
  if !defined?(@user) || @user.nil?
    @user = nil
    return @user if @userinfo.nil?
    # [^:]* stops at the first colon so that user and password never
    # overlap, and the optional ":?" lets a colon-less userinfo match
    # instead of producing nil (which previously raised on .strip).
    @user = @userinfo.strip[/^([^:]*):?/, 1]
    @user = @user.strip unless @user.nil?
  end
  return @user
end
|
|
211
|
+
|
|
212
|
+
# Returns the password for this URI.
|
|
213
|
+
# The password is everything after the first ":" in userinfo, with
# surrounding whitespace removed. Returns nil when there is no userinfo
# or when userinfo contains no ":" (previously that case raised
# NoMethodError on nil).
#
# The result is cached in @password once it is non-nil.
def password
  if !defined?(@password) || @password.nil?
    @password = nil
    return @password if @userinfo.nil?
    # String#[] with a capture index returns nil when nothing matches.
    @password = @userinfo.strip[/:(.*)$/, 1]
    @password = @password.strip unless @password.nil?
  end
  return @password
end
|
|
221
|
+
|
|
222
|
+
# Returns an array of known ip-based schemes. These schemes typically
|
|
223
|
+
# use a similar URI form:
|
|
224
|
+
# //<user>:<password>@<host>:<port>/<url-path>
|
|
225
|
+
# The list is simply the keys of scheme_mapping.
def self.ip_based_schemes
  scheme_mapping.keys
end
|
|
228
|
+
|
|
229
|
+
# Returns a hash of common IP-based schemes and their default port
|
|
230
|
+
# numbers. Adding new schemes to this hash, as necessary, will allow
|
|
231
|
+
# for better URI normalization.
|
|
232
|
+
# Keys are lowercase scheme names, values are Integer port numbers. The
# hash is built on first use and the same object is returned afterwards,
# so entries added by callers persist.
def self.scheme_mapping
  @protocol_mapping ||= {
    "http"     => 80,
    "https"    => 443,
    "ftp"      => 21,
    "tftp"     => 69,
    "ssh"      => 22,
    "svn+ssh"  => 22,
    "telnet"   => 23,
    "nntp"     => 119,
    "gopher"   => 70,
    "wais"     => 210,
    "prospero" => 1525
  }
end
|
|
250
|
+
|
|
251
|
+
# Returns the port number for this URI. This method will normalize to the
|
|
252
|
+
# default port for the URI's scheme if the port isn't explicitly specified
|
|
253
|
+
# in the URI.
|
|
254
|
+
# When @port converts to a non-zero integer, that integer is stored back
# into @port and returned. Otherwise @port is replaced by the
# scheme_mapping entry for the stripped, lowercased scheme, or by nil
# when there is no scheme; an unknown scheme also yields nil.
def port
  numeric = @port.to_i
  if numeric != 0
    @port = numeric
  elsif self.scheme.nil?
    @port = nil
  else
    @port = self.class.scheme_mapping[self.scheme.strip.downcase]
  end
  @port
end
|
|
267
|
+
|
|
268
|
+
# Returns the port number that was actually specified in the URI string.
|
|
269
|
+
# Converts @specified_port to an Integer via to_s.to_i. Returns nil when
# the variable is unset or nil, or when the conversion yields 0 (for
# example an empty or non-numeric string).
def specified_port
  @specified_port = nil unless defined?(@specified_port)
  return nil if @specified_port.nil?

  number = @specified_port.to_s.to_i
  number == 0 ? nil : number
end
|
|
279
|
+
|
|
280
|
+
# Returns the path for this URI.
|
|
281
|
+
# Plain reader for @path; the value is returned exactly as stored.
def path
  @path
end
|
|
284
|
+
|
|
285
|
+
# Returns the query string for this URI.
|
|
286
|
+
# Plain reader for @query; the value is returned exactly as stored.
def query
  @query
end
|
|
289
|
+
|
|
290
|
+
# Returns the fragment for this URI.
|
|
291
|
+
# Plain reader for @fragment; the value is returned exactly as stored.
def fragment
  @fragment
end
|
|
294
|
+
|
|
295
|
+
# Returns true if the URI uses an IP-based protocol.
|
|
296
|
+
# False when there is no scheme; otherwise true exactly when the
# stripped, lowercased scheme appears in the class's ip_based_schemes.
def ip_based?
  return false if self.scheme.nil?

  known_schemes = self.class.ip_based_schemes
  known_schemes.include?(self.scheme.strip.downcase)
end
|
|
300
|
+
|
|
301
|
+
# Returns true if this URI is known to be relative.
|
|
302
|
+
# A uri counts as relative exactly when #scheme reports nil.
def relative?
  current_scheme = self.scheme
  current_scheme.nil?
end
|
|
305
|
+
|
|
306
|
+
# Returns true if this URI is known to be absolute.
|
|
307
|
+
# Logical opposite of relative?.
def absolute?
  relative? ? false : true
end
|
|
310
|
+
|
|
311
|
+
# Joins two URIs together.
|
|
312
|
+
def +(uri)
  # Resolves +uri+ (the reference) against self (the base), following the
  # reference-resolution algorithm of RFC 3986, section 5.2.2.
  #
  # uri - A URI object of this class, or any object whose to_s output can be
  #       handed to URI.parse.
  #
  # Returns a new URI object. An empty reference yields a copy of self.
  uri = URI.parse(uri.to_s) unless uri.kind_of?(self.class)
  return self.dup if uri.to_s == ""

  if uri.scheme != nil
    # The reference carries its own scheme: every component except the
    # fragment handling below comes from the reference.
    joined_scheme = uri.scheme
    joined_userinfo = uri.userinfo
    joined_host = uri.host
    joined_port = uri.specified_port
    joined_path = self.class.normalize_path(uri.path)
    joined_query = uri.query
  elsif uri.authority != nil
    # The reference has an authority but no scheme: only the scheme is
    # inherited from the base.
    joined_scheme = self.scheme
    joined_userinfo = uri.userinfo
    joined_host = uri.host
    joined_port = uri.specified_port
    joined_path = self.class.normalize_path(uri.path)
    joined_query = uri.query
  else
    # The reference has neither scheme nor authority: both come from the
    # base, and the path/query are merged.
    joined_scheme = self.scheme
    joined_userinfo = self.userinfo
    joined_host = self.host
    joined_port = self.specified_port
    if uri.path == nil || uri.path == ""
      joined_path = self.path
      joined_query = (uri.query != nil) ? uri.query : self.query
    else
      if uri.path[0..0] == "/"
        joined_path = self.class.normalize_path(uri.path)
      else
        base_path = self.path.nil? ? "" : self.path.dup
        base_path = self.class.normalize_path(base_path)
        # RFC 3986, section 5.2.3 (merge paths):
        if base_path == "" && self.authority != nil
          # A base with an authority and an empty path merges as
          # "/" + reference path; otherwise the result would glue the
          # reference path directly onto the host.
          base_path = "/"
        else
          # Drop everything after the right-most "/" of the base path, or
          # the entire base path when it contains no "/" at all.
          base_path = base_path.sub(/[^\/]*\z/, "")
        end
        joined_path = self.class.normalize_path(base_path + uri.path)
      end
      joined_query = uri.query
    end
  end
  # The fragment always comes from the reference.
  joined_fragment = uri.fragment

  return URI.new(
    joined_scheme,
    joined_userinfo,
    joined_host,
    joined_port,
    joined_path,
    joined_query,
    joined_fragment
  )
end
|
|
380
|
+
|
|
381
|
+
# Merges two URIs together.
|
|
382
|
+
def merge(uri)
  # Named alias for the + operator.
  self.+(uri)
end
|
|
385
|
+
|
|
386
|
+
# Destructive form of merge.
|
|
387
|
+
def merge!(uri)
  # Compute the merged URI first, then overwrite this object's state with it.
  merged = self.merge(uri)
  replace_self(merged)
end
|
|
390
|
+
|
|
391
|
+
# Returns a normalized URI object.
|
|
392
|
+
#
|
|
393
|
+
# NOTE: This method does not attempt to conform to specifications. It
|
|
394
|
+
# exists largely to correct other people's failures to read the
|
|
395
|
+
# specifications, and also to deal with caching issues since several
|
|
396
|
+
# different URIs may represent the same resource and should not be
|
|
397
|
+
# cached multiple times.
|
|
398
|
+
def normalize
  # Builds a new URI whose components have been normalized; self is not
  # modified. Takes no arguments.
  #
  # Returns a new URI object. For "feed:" URIs that wrap an HTTP URI, the
  # result is the normalized form of the wrapped URI.

  # --- scheme ---------------------------------------------------------------
  normalized_scheme = nil
  normalized_scheme = self.scheme.strip.downcase if self.scheme != nil
  normalized_scheme = "svn+ssh" if normalized_scheme == "ssh+svn"
  if normalized_scheme == "feed" && self.to_s =~ /^feed:\/*http:\/*/
    # Unwrap "feed:http://..." style URIs and normalize the inner URI.
    return self.class.parse(
      self.to_s.scan(/^feed:\/*(http:\/*.*)/).flatten[0]).normalize
  end

  # --- userinfo -------------------------------------------------------------
  normalized_userinfo = nil
  normalized_userinfo = self.userinfo.strip if self.userinfo != nil

  # --- host -----------------------------------------------------------------
  normalized_host = nil
  normalized_host = self.host.strip.downcase if self.host != nil
  if normalized_host != nil
    begin
      normalized_host = URI::IDNA.to_ascii(normalized_host)
    rescue StandardError, NotImplementedError
      # Best effort: keep the host as-is when the IDNA conversion fails or is
      # unavailable. Interrupts and exit requests are deliberately not caught.
    end
  end

  # Normalize IPv4 addresses that were written in the alternative notations
  # accepted by inet_addr().
  if normalized_host != nil && normalized_host.strip =~ /^\d+$/
    # Decimal IPv4 address: a single integer holding all four octets.
    decimal = normalized_host.to_i
    if decimal < (256 ** 4)
      octets = [24, 16, 8, 0].map { |shift| (decimal >> shift) & 0xFF }
      normalized_host = octets.join(".")
    end
  elsif (normalized_host != nil && normalized_host.strip =~
      /^0+[0-7]{3}\.0+[0-7]{3}\.0+[0-7]{3}\.0+[0-7]{3}$/)
    # Octal IPv4 address. The separators must be literal dots; an unescaped
    # "." would accept any character and break the split below.
    octets = normalized_host.split('.').map do |octet_string|
      octet_string.to_i(8)
    end
    normalized_host = octets.join(".")
  elsif (normalized_host != nil && normalized_host.strip =~
      /^0x[0-9a-f]{2}\.0x[0-9a-f]{2}\.0x[0-9a-f]{2}\.0x[0-9a-f]{2}$/i)
    # Hexadecimal IPv4 address; skip the "0x" prefix of each octet.
    octets = normalized_host.split('.').map do |octet_string|
      octet_string[2...4].to_i(16)
    end
    normalized_host = octets.join(".")
  end

  # --- port -----------------------------------------------------------------
  # Drop the port when it equals the one mapped to the scheme.
  normalized_port = self.port
  if self.class.scheme_mapping[normalized_scheme] == normalized_port
    normalized_port = nil
  end

  # --- path -----------------------------------------------------------------
  normalized_path = nil
  normalized_path = self.path.strip if self.path != nil
  if normalized_scheme != nil && normalized_host == nil
    if self.class.ip_based_schemes.include?(normalized_scheme) &&
        normalized_path =~ /[\w\.]+/
      # No host was given for an IP-based scheme: treat the path as the host,
      # appending ".com" when it contains no dot.
      normalized_host = normalized_path
      normalized_path = nil
      unless normalized_host =~ /\./
        normalized_host = normalized_host + ".com"
      end
    end
  end
  if normalized_path == nil &&
      normalized_scheme != nil &&
      normalized_host != nil
    normalized_path = "/"
  end
  if normalized_path != nil
    normalized_path = self.class.normalize_path(normalized_path)
    normalized_path = self.class.normalize_escaping(normalized_path)
  end
  if normalized_path == ""
    if ["http", "https", "ftp", "tftp"].include?(normalized_scheme)
      normalized_path = "/"
    end
  end
  if normalized_path != nil
    # Undo the escaping of a few characters in the path.
    normalized_path.gsub!(/%3B/, ";")
    normalized_path.gsub!(/%3A/, ":")
    normalized_path.gsub!(/%40/, "@")
    normalized_path.gsub!(/%2B/, "+")
  end

  # --- query ----------------------------------------------------------------
  normalized_query = nil
  normalized_query = self.query.strip if self.query != nil
  normalized_query = self.class.normalize_escaping(normalized_query)
  if normalized_query != nil
    # Undo the escaping of a few characters in the query.
    normalized_query.gsub!(/%3D/, "=")
    normalized_query.gsub!(/%26/, "&")
    normalized_query.gsub!(/%2B/, "+")
  end

  # --- fragment -------------------------------------------------------------
  normalized_fragment = nil
  normalized_fragment = self.fragment.strip if self.fragment != nil
  normalized_fragment = self.class.normalize_escaping(normalized_fragment)

  return URI.new(
    normalized_scheme,
    normalized_userinfo,
    normalized_host,
    normalized_port,
    normalized_path,
    normalized_query,
    normalized_fragment
  )
end
|
|
511
|
+
|
|
512
|
+
# Destructively normalizes this URI object.
|
|
513
|
+
def normalize!
  # Compute the normalized URI first, then overwrite this object's state.
  normalized = self.normalize
  replace_self(normalized)
end
|
|
516
|
+
|
|
517
|
+
# Creates a URI suitable for display to users. If semantic attacks are
|
|
518
|
+
# likely, the application should try to detect these and warn the user.
|
|
519
|
+
# See RFC 3986 section 7.6 for more information.
|
|
520
|
+
def display_uri
  # Builds a normalized copy of this URI whose host has been converted with
  # URI::IDNA.to_unicode, for presentation to users.
  #
  # Returns a new URI object. When the conversion fails or is unavailable,
  # the normalized URI is returned with its host unchanged.
  displayed = self.normalize
  begin
    displayed.instance_variable_set("@host",
      URI::IDNA.to_unicode(displayed.host))
  rescue StandardError, NotImplementedError
    # Best effort: keep the normalized host. NotImplementedError is what
    # URI::IDNA raises without libidn; it is not a StandardError, so it is
    # listed explicitly. Interrupts and exit requests are not swallowed.
  end
  return displayed
end
|
|
529
|
+
|
|
530
|
+
# Returns true if the URI objects are equal. This method normalizes
|
|
531
|
+
# both URIs before doing the comparison, and allows comparison against
|
|
532
|
+
# strings.
|
|
533
|
+
def ===(uri)
  # Obtain the normalized string form of the other operand: directly when it
  # can normalize itself, otherwise by parsing its string representation.
  other_string =
    if uri.respond_to?(:normalize)
      uri.normalize.to_s
    else
      begin
        URI.parse(uri.to_s).normalize.to_s
      rescue Exception
        # Anything that cannot be parsed and normalized is simply not equal.
        return false
      end
    end
  self.normalize.to_s == other_string
end
|
|
546
|
+
|
|
547
|
+
# Returns true if the URI objects are equal. This method normalizes
|
|
548
|
+
# both URIs before doing the comparison.
|
|
549
|
+
def ==(uri)
  # Only objects of this class can be equal; both sides are compared in
  # normalized string form.
  uri.kind_of?(self.class) && self.normalize.to_s == uri.normalize.to_s
end
|
|
553
|
+
|
|
554
|
+
# Returns true if the URI objects are equal. This method does NOT
|
|
555
|
+
# normalize either URI before doing the comparison.
|
|
556
|
+
def eql?(uri)
  # Only objects of this class can be equal; the raw (non-normalized) string
  # forms are compared.
  uri.kind_of?(self.class) && self.to_s == uri.to_s
end
|
|
560
|
+
|
|
561
|
+
# Clones the URI object.
|
|
562
|
+
def dup
  # Copies a single component, passing nil through untouched.
  copy_of = lambda { |component| component.nil? ? nil : component.dup }

  duplicate = URI.new(
    copy_of.call(self.scheme),
    copy_of.call(self.userinfo),
    copy_of.call(self.host),
    self.port,
    copy_of.call(self.path),
    copy_of.call(self.query),
    copy_of.call(self.fragment)
  )
  # The constructor was handed the value of #port, so restore the port
  # exactly as it was specified on this object.
  @specified_port = nil unless defined?(@specified_port)
  duplicate.instance_variable_set("@specified_port", @specified_port)
  duplicate
end
|
|
589
|
+
|
|
590
|
+
# Returns the assembled URI as a string.
|
|
591
|
+
def to_s
  # Append each component that is present, with its delimiter, in order:
  # scheme, authority, path, query, fragment.
  assembled = ""
  assembled << "#{self.scheme}:" unless self.scheme.nil?
  assembled << "//#{self.authority}" unless self.authority.nil?
  assembled << self.path unless self.path.nil?
  assembled << "?#{self.query}" unless self.query.nil?
  assembled << "##{self.fragment}" unless self.fragment.nil?
  assembled
end
|
|
610
|
+
|
|
611
|
+
# Returns a string representation of the URI object's state.
|
|
612
|
+
def inspect
|
|
613
|
+
sprintf("#<%s:%#0x URL:%s>", self.class.to_s, self.object_id, self.to_s)
|
|
614
|
+
end
|
|
615
|
+
|
|
616
|
+
# This module handles internationalized domain names. When Ruby has an
|
|
617
|
+
# implementation of nameprep, stringprep, punycode, etc, this
|
|
618
|
+
# module should contain an actual implementation of IDNA instead of
|
|
619
|
+
# raising NotImplementedError if libidn can't be used.
|
|
620
|
+
module IDNA
  # Converts +label+ to its ASCII representation through the libidn
  # bindings. Returns nil for a nil label and raises NotImplementedError when
  # the bindings are not available.
  def self.to_ascii(label)
    return nil if label.nil?
    unless self.use_libidn?
      raise NotImplementedError,
        "There is no available pure-ruby implementation. " +
        "Install libidn bindings."
    end
    return IDN::Idna.toASCII(label)
  end

  # Converts +label+ to its unicode representation through the libidn
  # bindings. Returns nil for a nil label and raises NotImplementedError when
  # the bindings are not available.
  def self.to_unicode(label)
    return nil if label.nil?
    unless self.use_libidn?
      raise NotImplementedError,
        "There is no available pure-ruby implementation. " +
        "Install libidn bindings."
    end
    return IDN::Idna.toUnicode(label)
  end

  # NOTE: a bare `private` does not change the visibility of methods defined
  # with `def self.`, so use_libidn? remains callable from outside.
  private
  # Reports whether the libidn bindings could be loaded. The answer is
  # computed on the first call and remembered afterwards.
  def self.use_libidn?
    return @use_libidn if defined?(@use_libidn) && !@use_libidn.nil?
    # Both libraries are optional, so a failed require is ignored.
    ['rubygems', 'idn'].each do |library|
      begin
        require library
      rescue LoadError
      end
    end
    @use_libidn = !!(defined?(IDN::Idna))
    return @use_libidn
  end
end
|
|
662
|
+
|
|
663
|
+
private
|
|
664
|
+
# Resolves paths to their simplest form.
|
|
665
|
+
def self.normalize_path(path)
  # Resolves "." and ".." segments in +path+ by rewriting it repeatedly until
  # a pass makes no further change.
  #
  # path - The path String, or nil.
  #
  # Returns a new String (the argument is not modified), or nil when +path+
  # is nil.
  return nil if path.nil?
  normalized_path = path.dup
  loop do
    previous_state = normalized_path.dup
    # Collapse "/./" and a trailing "/.".
    normalized_path.gsub!(/\/\.\//, "/")
    normalized_path.gsub!(/\/\.$/, "/")
    # Remove a "<parent>/../" pair. The parent is taken from the path itself,
    # so it must be escaped before being interpolated into a pattern:
    # segments such as "a+b" or "a(b" would otherwise fail to match or raise
    # RegexpError. A nil parent interpolates as "", as it did before.
    parent = normalized_path.scan(/\/([^\/]+)\/\.\.\//).flatten[0]
    if parent != "." && parent != ".."
      normalized_path.gsub!(
        /\/#{Regexp.escape(parent.to_s)}\/\.\.\//, "/")
    end
    # Same for a trailing "<parent>/..".
    parent = normalized_path.scan(/\/([^\/]+)\/\.\.$/).flatten[0]
    if parent != "." && parent != ".."
      normalized_path.gsub!(
        /\/#{Regexp.escape(parent.to_s)}\/\.\.$/, "/")
    end
    # Strip leading dot segments that have no parent to resolve against.
    normalized_path.gsub!(/^\.\.?\/?/, "")
    normalized_path.gsub!(/^\/\.\.?\//, "/")
    break if previous_state == normalized_path
  end
  return normalized_path
end
|
|
686
|
+
|
|
687
|
+
# Normalizes percent escaping of characters
|
|
688
|
+
def self.normalize_escaping(escaped_section)
  # Normalizes the percent-escaping of one URI section: every escape is
  # decoded, then every byte that is neither unreserved nor "/" is escaped
  # again as an upper-case "%XX" sequence.
  #
  # escaped_section - The section String, or nil.
  #
  # Returns a new String, or nil when +escaped_section+ is nil.
  return nil if escaped_section.nil?
  normalized_section = escaped_section.dup
  # Decode every percent escape into the byte it stands for.
  normalized_section.gsub!(/%[0-9a-f]{2}/i) do |sequence|
    sequence[1..3].to_i(16).chr
  end
  if URI::IDNA.send(:use_libidn?)
    normalized_section =
      IDN::Stringprep.nfkc_normalize(normalized_section)
  end
  new_section = ""
  # Walk the section byte by byte. each_byte always yields Integers, so this
  # does not depend on what String#[] returns for a single index.
  normalized_section.each_byte do |byte|
    if self.unreserved?(byte) || byte == 0x2F # 0x2F is "/"
      new_section << byte.chr
    else
      # Always emit two hex digits; bytes below 0x10 would otherwise come out
      # as an invalid one-digit escape such as "%A".
      new_section << sprintf("%%%02X", byte)
    end
  end
  return new_section
end
|
|
710
|
+
|
|
711
|
+
# Returns true if the specified character is unreserved.
|
|
712
|
+
def self.unreserved?(character)
  # Reduce the argument to a one-character string: the first character of a
  # String, or the result of #chr for anything else that supports it.
  # Anything else stays nil and is therefore never found in the list.
  candidate =
    if character.kind_of?(String)
      character[0..0]
    elsif character.respond_to?(:chr)
      character.chr
    end
  self.unreserved.include?(candidate)
end
|
|
718
|
+
|
|
719
|
+
# Returns a list of unreserved characters.
|
|
720
|
+
def self.unreserved
  # Built on first use and kept in a class-level instance variable.
  if !defined?(@unreserved) || @unreserved.nil?
    punctuation = ["-", ".", "_", "~"]
    letters = ("a".."z").to_a
    digits = ("0".."9").to_a
    uppercase = letters.map { |letter| letter.upcase }
    @unreserved = (punctuation + letters + uppercase + digits).sort
  end
  @unreserved
end
|
|
734
|
+
|
|
735
|
+
# Assigns the specified components to the appropriate instance variables.
|
|
736
|
+
# Used in destructive operations to avoid code repetition.
|
|
737
|
+
def assign_components(scheme, userinfo, host, port, path, query, fragment)
  # Stores the given URI components in the matching instance variables.
  #
  # scheme, userinfo, host, path, query, fragment - stored as given.
  # port - nil, an Integer, or a String of digits. It is kept in string form
  #        in @specified_port and as an Integer in @port (nil when absent or
  #        zero).
  #
  # Raises InvalidURIError when every component is nil, when the port is not
  # numeric, or when a scheme is given together with an empty host and an
  # empty path.
  components = [scheme, userinfo, host, port, path, query, fragment]
  if components.all? { |component| component == nil }
    raise InvalidURIError, "All parameters were nil."
  end
  @scheme = scheme
  @userinfo = userinfo
  @host = host
  @specified_port = port.to_s
  @port = port
  # Integer covers every integer on all Ruby versions; Fixnum no longer
  # exists on current Rubies.
  @port = @port.to_s if @port.kind_of?(Integer)
  if @port != nil && !(@port =~ /^\d+$/)
    raise InvalidURIError,
      "Invalid port number: #{@port.inspect}"
  end
  @port = @port.to_i
  @port = nil if @port == 0
  @path = path
  @query = query
  @fragment = fragment
  if @scheme != nil && @host == "" && @path == ""
    raise InvalidURIError,
      "Absolute URI missing hierarchical segment."
  end
end
|
|
762
|
+
|
|
763
|
+
# Replaces the internal state of self with the specified URI's state.
|
|
764
|
+
# Used in destructive operations to avoid code repetition.
|
|
765
|
+
def replace_self(uri)
  # Clear these three first (presumably values derived from the components
  # replaced below; confirm against their readers).
  @authority = @user = @password = nil

  # Take over the plain components through the other URI's readers.
  @scheme, @userinfo, @host = uri.scheme, uri.userinfo, uri.host
  @path, @query, @fragment = uri.path, uri.query, uri.fragment

  # Copy the port exactly as it was specified and derive @port from it.
  @specified_port = uri.instance_variable_get("@specified_port")
  @port = @specified_port.to_s.to_i
  self
end
|
|
780
|
+
end
|
|
781
|
+
end
|