feedtools 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +11 -0
- data/lib/feed_tools.rb +2496 -810
- data/lib/feed_tools/vendor/builder.rb +2 -0
- data/lib/feed_tools/vendor/builder/blankslate.rb +2 -0
- data/lib/feed_tools/vendor/builder/xmlbase.rb +2 -1
- data/lib/feed_tools/vendor/builder/xmlevents.rb +2 -0
- data/lib/feed_tools/vendor/builder/xmlmarkup.rb +4 -2
- data/lib/feed_tools/vendor/htree.rb +97 -0
- data/lib/feed_tools/vendor/htree/container.rb +10 -0
- data/lib/feed_tools/vendor/htree/context.rb +67 -0
- data/lib/feed_tools/vendor/htree/display.rb +27 -0
- data/lib/feed_tools/vendor/htree/doc.rb +149 -0
- data/lib/feed_tools/vendor/htree/elem.rb +262 -0
- data/lib/feed_tools/vendor/htree/encoder.rb +163 -0
- data/lib/feed_tools/vendor/htree/equality.rb +218 -0
- data/lib/feed_tools/vendor/htree/extract_text.rb +37 -0
- data/lib/feed_tools/vendor/htree/fstr.rb +33 -0
- data/lib/feed_tools/vendor/htree/gencode.rb +97 -0
- data/lib/feed_tools/vendor/htree/htmlinfo.rb +672 -0
- data/lib/feed_tools/vendor/htree/inspect.rb +108 -0
- data/lib/feed_tools/vendor/htree/leaf.rb +94 -0
- data/lib/feed_tools/vendor/htree/loc.rb +367 -0
- data/lib/feed_tools/vendor/htree/modules.rb +48 -0
- data/lib/feed_tools/vendor/htree/name.rb +124 -0
- data/lib/feed_tools/vendor/htree/output.rb +207 -0
- data/lib/feed_tools/vendor/htree/parse.rb +407 -0
- data/lib/feed_tools/vendor/htree/raw_string.rb +124 -0
- data/lib/feed_tools/vendor/htree/regexp-util.rb +15 -0
- data/lib/feed_tools/vendor/htree/rexml.rb +130 -0
- data/lib/feed_tools/vendor/htree/scan.rb +166 -0
- data/lib/feed_tools/vendor/htree/tag.rb +111 -0
- data/lib/feed_tools/vendor/htree/template.rb +909 -0
- data/lib/feed_tools/vendor/htree/text.rb +115 -0
- data/lib/feed_tools/vendor/htree/traverse.rb +465 -0
- data/rakefile +1 -1
- data/test/rss_test.rb +97 -0
- metadata +30 -1
@@ -0,0 +1,48 @@
|
|
1
|
+
# :stopdoc:
|
2
|
+
# Skeleton of the HTree namespace.  It only declares the classes and
# modules and wires up their mixin relationships; no behaviour is
# defined here.
module HTree
  # Names and namespace contexts.
  class Name
    include HTree
  end

  class Context
    include HTree
  end

  # Start/end tags.
  module Tag
    include HTree
  end

  class STag
    include Tag
  end

  class ETag
    include Tag
  end

  # Tree nodes: containers (Doc, Elem) and leaves (everything else).
  module Node
    include HTree
  end

  module Container
    include Node
  end

  class Doc
    include Container
  end

  class Elem
    include Container
  end

  module Leaf
    include Node
  end

  class Text
    include Leaf
  end

  class XMLDecl
    include Leaf
  end

  class DocType
    include Leaf
  end

  class ProcIns
    include Leaf
  end

  class Comment
    include Leaf
  end

  class BogusETag
    include Leaf
  end

  # Traversal mixins.  Each node class gets its own Trav module, which is
  # then mixed into the node class itself.
  module Traverse
  end

  module Container::Trav
    include Traverse
  end

  module Leaf::Trav
    include Traverse
  end

  class Doc
    module Trav
      include Container::Trav
    end
    include Trav
  end

  class Elem
    module Trav
      include Container::Trav
    end
    include Trav
  end

  class Text
    module Trav
      include Leaf::Trav
    end
    include Trav
  end

  class XMLDecl
    module Trav
      include Leaf::Trav
    end
    include Trav
  end

  class DocType
    module Trav
      include Leaf::Trav
    end
    include Trav
  end

  class ProcIns
    module Trav
      include Leaf::Trav
    end
    include Trav
  end

  class Comment
    module Trav
      include Leaf::Trav
    end
    include Trav
  end

  class BogusETag
    module Trav
      include Leaf::Trav
    end
    include Trav
  end

  # Locations: every node class gets a nested Loc class that derives from
  # Location and mixes in the node's Trav plus the matching Loc module.
  class Location
    include HTree
  end

  module Container::Loc
  end

  module Leaf::Loc
  end

  class Doc
    class Loc < Location
      include Trav, Container::Loc
    end
  end

  class Elem
    class Loc < Location
      include Trav, Container::Loc
    end
  end

  class Text
    class Loc < Location
      include Trav, Leaf::Loc
    end
  end

  class XMLDecl
    class Loc < Location
      include Trav, Leaf::Loc
    end
  end

  class DocType
    class Loc < Location
      include Trav, Leaf::Loc
    end
  end

  class ProcIns
    class Loc < Location
      include Trav, Leaf::Loc
    end
  end

  class Comment
    class Loc < Location
      include Trav, Leaf::Loc
    end
  end

  class BogusETag
    class Loc < Location
      include Trav, Leaf::Loc
    end
  end

  # Error raised by HTree code.
  class Error < StandardError
  end
end
|
48
|
+
# :startdoc:
|
@@ -0,0 +1,124 @@
|
|
1
|
+
# :stopdoc:
|
2
|
+
require 'htree/scan' # for Pat::Nmtoken
|
3
|
+
require 'htree/context'
|
4
|
+
|
5
|
+
module HTree # :nodoc:
|
6
|
+
# Name represents an element name or an attribute name.
|
7
|
+
# It consists of a namespace prefix, a namespace URI and a local name.
|
8
|
+
class Name # :nodoc:
  # How the accepted spellings decompose:
  #
  #   element name                    prefix  uri     localname
  #   {u}n, n with xmlns=u            nil     'u'     'n'
  #   p{u}n, p:n with xmlns:p=u       'p'     'u'     'n'
  #   n with xmlns=''                 nil     ''      'n'
  #
  #   attribute name
  #   xmlns=                          'xmlns' nil     nil
  #   xmlns:n=                        'xmlns' nil     'n'
  #   p{u}n=, p:n= with xmlns:p=u     'p'     'u'     'n'
  #   n=                              nil     ''      'n'

  # Builds the Name for an element.  +name+ may be written "{u}n",
  # "p{u}n", "p:n" or "n"; +context+ is asked (via namespace_uri) for the
  # URI bound to a prefix, or to nil for the default namespace.
  def Name.parse_element_name(name, context)
    braced = /\{(.*)\}/.match(name)
    if braced
      # "{u}n" means "use default namespace",
      # "p{u}n" means "use the specified prefix p"
      prefix = braced.pre_match
      return Name.new(prefix == '' ? nil : prefix, braced[1], braced.post_match)
    end

    colon = /:/.match(name)
    if colon && !context.namespace_uri(colon.pre_match).empty?
      return Name.new(colon.pre_match, context.namespace_uri(colon.pre_match), colon.post_match)
    end

    unless context.namespace_uri(nil).empty?
      return Name.new(nil, context.namespace_uri(nil), name)
    end

    Name.new(nil, '', name)
  end

  # Builds the Name for an attribute.  "xmlns" and "xmlns:n" become
  # namespace-declaration names (prefix 'xmlns', no URI); a name without
  # a resolvable prefix gets the empty namespace URI.
  def Name.parse_attribute_name(name, context)
    return Name.new('xmlns', nil, nil) if name == 'xmlns'

    declaration = /\Axmlns:/.match(name)
    return Name.new('xmlns', nil, declaration.post_match) if declaration

    braced = /\{(.*)\}/.match(name)
    if braced
      prefix = braced.pre_match
      return Name.new(prefix == '' ? nil : prefix, braced[1], braced.post_match)
    end

    colon = /:/.match(name)
    if colon && !context.namespace_uri(colon.pre_match).empty?
      return Name.new(colon.pre_match, context.namespace_uri(colon.pre_match), colon.post_match)
    end

    Name.new(nil, '', name)
  end

  NameCache = {}

  # Interning constructor: the same (prefix, uri, local name, class)
  # combination always yields the same object.  The strings stored in a
  # new instance are frozen copies of the arguments.
  def self.new(namespace_prefix, namespace_uri, local_name)
    key = [namespace_prefix, namespace_uri, local_name, self]
    return NameCache[key] if NameCache.key?(key)

    3.times {|i| key[i] = key[i].dup.freeze if key[i] }
    NameCache[key] = super(key[0], key[1], key[2])
  end

  # Stores the three components and validates them; raises HTree::Error
  # when the prefix or local name is not an Nmtoken, when an xmlns name
  # carries a URI, or when a non-xmlns name has a non-String URI.
  def initialize(namespace_prefix, namespace_uri, local_name)
    @namespace_prefix = namespace_prefix
    @namespace_uri = namespace_uri
    @local_name = local_name

    prefix_ok = !@namespace_prefix || /\A#{Pat::Nmtoken}\z/o =~ @namespace_prefix
    raise HTree::Error, "invalid namespace prefix: #{@namespace_prefix.inspect}" unless prefix_ok

    local_ok = !@local_name || /\A#{Pat::Nmtoken}\z/o =~ @local_name
    raise HTree::Error, "invalid local name: #{@local_name.inspect}" unless local_ok

    if @namespace_prefix == 'xmlns'
      unless @namespace_uri.nil?
        raise HTree::Error, "Name object for xmlns:* must not have namespace URI: #{@namespace_uri.inspect}"
      end
    elsif !(String === @namespace_uri)
      raise HTree::Error, "invalid namespace URI: #{@namespace_uri.inspect}"
    end
  end
  attr_reader :namespace_prefix, :namespace_uri, :local_name

  # True for the names of namespace declarations (xmlns, xmlns:*).
  def xmlns?
    @namespace_prefix == 'xmlns' && @namespace_uri.nil?
  end

  # "{uri}local" when a non-empty namespace URI is set, otherwise a copy
  # of the local name.
  def universal_name
    return @local_name.dup if !@namespace_uri || @namespace_uri.empty?

    "{#{@namespace_uri}}#{@local_name}"
  end

  # "prefix:local" when both a non-empty URI and a prefix are present;
  # otherwise a copy of the local name, or "xmlns" when there is none.
  def qualified_name
    in_namespace = @namespace_uri && !@namespace_uri.empty?
    if in_namespace && @namespace_prefix
      "#{@namespace_prefix}:#{@local_name}"
    elsif in_namespace || @local_name
      @local_name.dup
    else
      "xmlns"
    end
  end

  # "prefix{uri}local" / "{uri}local" when a non-empty URI is set;
  # otherwise a copy of the local name, or "xmlns" when there is none.
  def to_s
    in_namespace = @namespace_uri && !@namespace_uri.empty?
    if in_namespace && @namespace_prefix
      "#{@namespace_prefix}{#{@namespace_uri}}#{@local_name}"
    elsif in_namespace
      "{#{@namespace_uri}}#{@local_name}"
    elsif @local_name
      @local_name.dup
    else
      "xmlns"
    end
  end
end
|
123
|
+
end
|
124
|
+
# :startdoc:
|
@@ -0,0 +1,207 @@
|
|
1
|
+
# :stopdoc:
|
2
|
+
require 'htree/encoder'
|
3
|
+
require 'htree/doc'
|
4
|
+
require 'htree/elem'
|
5
|
+
require 'htree/leaf'
|
6
|
+
require 'htree/text'
|
7
|
+
|
8
|
+
module HTree # :nodoc:
|
9
|
+
|
10
|
+
class Text # :nodoc:
  # Character references substituted when text is written out.  Each
  # markup-significant character maps to its predefined entity; mapping
  # them to themselves would leave the output unescaped.
  ChRef = {
    '>' => '&gt;',
    '<' => '&lt;',
    '"' => '&quot;',
  }

  # Writes the text as element content, escaping "<" and ">".
  # Ampersands in @rcdata are passed through untouched.
  def output(out, context)
    out.output_text @rcdata.gsub(/[<>]/) {|s| ChRef[s] }
  end

  # Returns @rcdata escaped for use between double quotes in an
  # attribute value: "<", ">" and '"' are replaced by entities.
  def to_attvalue_content
    @rcdata.gsub(/[<>"]/) {|s| ChRef[s] }
  end

  # Writes the text as a double-quoted attribute value.
  def output_attvalue(out, context)
    out.output_string '"'
    out.output_text to_attvalue_content
    out.output_string '"'
  end
end
|
31
|
+
|
32
|
+
class Name # :nodoc:
  # Writes the name as it appears in markup: "xmlns" / "xmlns:local" for
  # namespace declarations, the qualified name otherwise.
  def output(out, context)
    # xxx: validate namespace prefix
    rendered =
      if !xmlns?
        qualified_name
      elsif @local_name
        "xmlns:#{@local_name}"
      else
        "xmlns"
      end
    out.output_string rendered
  end

  # Writes name="value", where the quoted value is produced by
  # +text+.output_attvalue.
  def output_attribute(text, out, context)
    output(out, context)
    out.output_string('=')
    text.output_attvalue(out, context)
  end
end
|
52
|
+
|
53
|
+
class Doc # :nodoc:
  # Writes every child in order.  Children that respond to
  # output_prolog_xmldecl / output_prolog_doctypedecl are emitted through
  # those methods, and only the first of each kind is written; all other
  # children go through #output.  The +context+ argument is ignored in
  # favour of DefaultContext.
  def output(out, context)
    context = DefaultContext # discard outer context
    xmldecl_seen = false
    doctype_seen = false
    @children.each do |child|
      if child.respond_to?(:output_prolog_xmldecl)
        child.output_prolog_xmldecl(out, context) unless xmldecl_seen # xxx: encoding?
        xmldecl_seen = true
      elsif child.respond_to?(:output_prolog_doctypedecl)
        child.output_prolog_doctypedecl(out, context) unless doctype_seen
        doctype_seen = true
      else
        child.output(out, context)
      end
    end
  end
end
|
71
|
+
|
72
|
+
class Elem # :nodoc:
  # Writes the element: a single empty-element tag when @empty is set,
  # otherwise start tag, children and end tag.  Children are written with
  # the context returned by the start tag; the end tag is written with
  # the outer context.
  def output(out, context)
    return @stag.output_emptytag(out, context) if @empty

    inner_context = @stag.output_stag(out, context)
    @children.each do |child|
      child.output(out, inner_context)
    end
    @stag.output_etag(out, context)
  end
end
|
83
|
+
|
84
|
+
class STag # :nodoc:
  # Writes each non-xmlns attribute preceded by a space, then lets the
  # tag's own context write its namespace declarations.  Returns whatever
  # @context.output_namespaces returns.
  def output_attributes(out, context)
    @attributes.each do |aname, text|
      unless aname.xmlns?
        out.output_string ' '
        aname.output_attribute(text, out, context)
      end
    end
    @context.output_namespaces(out, context)
  end

  # Writes the tag as an empty-element tag; returns the value of
  # output_attributes.
  def output_emptytag(out, context)
    output_opening_markup(out, context, "\n/>")
  end

  # Writes the tag as a start tag; returns the value of
  # output_attributes.
  def output_stag(out, context)
    output_opening_markup(out, context, "\n>")
  end

  # Writes the matching end tag.
  def output_etag(out, context)
    out.output_string '</'
    @name.output(out, context)
    out.output_string "\n>"
  end

  private

  # Shared body of output_emptytag / output_stag: "<", name, attributes,
  # then +closer+.  Returns the value of output_attributes.
  def output_opening_markup(out, context, closer)
    out.output_string '<'
    @name.output(out, context)
    attributes_result = output_attributes(out, context)
    out.output_string closer
    attributes_result
  end
end
|
116
|
+
|
117
|
+
class Context # :nodoc:
  # Writes the namespace declarations needed when this context is nested
  # inside +outer_context+:
  # * a prefix the outer context reports as nil is collected and handed
  #   to out.output_xmlns in one call;
  # * a prefix bound to a different URI outside is re-declared inline;
  # * a prefix bound to the same URI outside produces no output.
  # Returns outer_context.subst_namespaces(@namespaces).
  def output_namespaces(out, outer_context)
    undeclared = {}
    @namespaces.each do |prefix, uri|
      inherited = outer_context.namespace_uri(prefix)
      if inherited.nil?
        undeclared[prefix] = uri
        next
      end
      next if inherited == uri

      out.output_string(prefix ? " xmlns:#{prefix}=" : " xmlns=")
      Text.new(uri).output_attvalue(out, outer_context)
    end
    out.output_xmlns(undeclared) unless undeclared.empty?
    outer_context.subst_namespaces(@namespaces)
  end
end
|
139
|
+
|
140
|
+
class BogusETag # :nodoc:
  # An unmatched end tag writes nothing; the method exists so that the
  # node can be output like any other child.
  def output(out, context); end
end
|
145
|
+
|
146
|
+
class XMLDecl # :nodoc:
  # Writes nothing when output as an ordinary node; the declaration is
  # written through output_prolog_xmldecl instead.
  def output(out, context); end

  # Writes the XML declaration: version always, encoding when set,
  # standalone ("yes"/"no") when it is not nil.
  def output_prolog_xmldecl(out, context)
    out.output_string "<?xml version=\"#{@version}\""
    out.output_string " encoding=\"#{@encoding}\"" if @encoding
    unless @standalone.nil?
      flag = @standalone ? 'yes' : 'no'
      out.output_string " standalone=\"#{flag}\""
    end
    out.output_string "?>"
  end
end
|
162
|
+
|
163
|
+
class DocType # :nodoc:
  # Writes nothing when output as an ordinary node; the declaration is
  # written through output_prolog_doctypedecl instead.
  def output(out, context); end

  # Builds the external-ID part of the declaration: PUBLIC "..." when a
  # public identifier is set, SYSTEM otherwise, followed by the quoted
  # system identifier when one is set.
  def generate_content # :nodoc:
    pieces = [@public_identifier ? "PUBLIC \"#{@public_identifier}\"" : "SYSTEM"]
    # Although a system identifier is not omissible in XML,
    # we cannot output it if it is not given.
    if @system_identifier
      # Fall back to single quotes when the identifier itself contains a
      # double quote.
      quote = /"/ =~ @system_identifier ? "'" : '"'
      pieces << "#{quote}#{@system_identifier}#{quote}"
    end
    pieces.join(' ')
  end

  # Writes the complete <!DOCTYPE ...> declaration.
  def output_prolog_doctypedecl(out, context)
    declaration = "<!DOCTYPE #{@root_element_name} #{generate_content}>"
    out.output_string declaration
  end
end
|
191
|
+
|
192
|
+
class ProcIns # :nodoc:
  # Writes the processing instruction: "<?target", then " content" when
  # content is set, then "?>".
  def output(out, context)
    out.output_string("<?#{@target}")
    if @content
      out.output_string(" #{@content}")
    end
    out.output_string("?>")
  end
end
|
199
|
+
|
200
|
+
class Comment # :nodoc:
  # Writes the comment text wrapped in <!-- and -->.
  def output(out, context)
    markup = "<!--#{@content}-->"
    out.output_string(markup)
  end
end
|
205
|
+
|
206
|
+
end
|
207
|
+
# :startdoc:
|
@@ -0,0 +1,407 @@
|
|
1
|
+
# :stopdoc:
|
2
|
+
require 'htree/scan'
|
3
|
+
require 'htree/htmlinfo'
|
4
|
+
require 'htree/text'
|
5
|
+
require 'htree/tag'
|
6
|
+
require 'htree/leaf'
|
7
|
+
require 'htree/doc'
|
8
|
+
require 'htree/elem'
|
9
|
+
require 'htree/raw_string'
|
10
|
+
require 'htree/context'
|
11
|
+
require 'htree/encoder'
|
12
|
+
require 'htree/fstr'
|
13
|
+
|
14
|
+
module HTree # :nodoc:
|
15
|
+
# HTree.parse parses <i>input</i> and returns a document tree
|
16
|
+
# represented by HTree::Doc.
|
17
|
+
#
|
18
|
+
# <i>input</i> should be a String or
|
19
|
+
# an object which respond to read or open method.
|
20
|
+
# For example, IO, StringIO, Pathname, URI::HTTP and URI::FTP are acceptable.
|
21
|
+
# Note that the URIs need open-uri.
|
22
|
+
#
|
23
|
+
# HTree.parse guesses <i>input</i> is HTML or not and XML or not.
|
24
|
+
#
|
25
|
+
# If it is guessed as HTML, the default namespace in the result is set to http://www.w3.org/1999/xhtml
|
26
|
+
# regardless of whether <i>input</i> has an XML namespace declaration, and even if it is pre-XML HTML.
|
27
|
+
#
|
28
|
+
# If it is guessed as HTML and not XML, all element and attribute names are downcased.
|
29
|
+
#
|
30
|
+
# If opened file or read content has charset method,
|
31
|
+
# HTree.parse decode it according to $KCODE before parsing.
|
32
|
+
# Otherwise HTree.parse assumes the character encoding of the content is
|
33
|
+
# compatible to $KCODE.
|
34
|
+
# Note that the charset method is provided by URI::HTTP with open-uri.
|
35
|
+
# Parses +input+ with XML mode not forced (parse_as is called with
# is_xml = false), inside HTree.with_frozen_string_hash.
def HTree.parse(input)
  HTree.with_frozen_string_hash do
    parse_as(input, false)
  end
end
|
40
|
+
|
41
|
+
# HTree.parse_xml parses <i>input</i> as XML and
|
42
|
+
# return a document tree represented by HTree::Doc.
|
43
|
+
#
|
44
|
+
# It behaves almost the same as HTree.parse, but it assumes <i>input</i> is XML
|
45
|
+
# even if no XML declaration.
|
46
|
+
# The assumption causes following differences.
|
47
|
+
# * doesn't downcase element name.
|
48
|
+
# * The content of <script> and <style> element is PCDATA, not CDATA.
|
49
|
+
# Parses +input+ with XML mode forced (parse_as is called with
# is_xml = true), inside HTree.with_frozen_string_hash.
def HTree.parse_xml(input)
  HTree.with_frozen_string_hash do
    parse_as(input, true)
  end
end
|
54
|
+
|
55
|
+
# Reads +input+ (a String, something that responds to read, or something
# that responds to open), converts it to the internal charset when the
# source reports a different one, scans it into tokens and builds a Doc.
# Raises SecurityError for tainted input when $SAFE is 1 or higher.
def HTree.parse_as(input, is_xml)
  raise SecurityError, "input tainted" if input.tainted? && 1 <= $SAFE

  input_charset = nil
  if input.respond_to? :read # IO, StringIO
    input = input.read.untaint
    # The charset is looked up on the content that was just read.
    input_charset = input.charset if input.respond_to? :charset
  elsif input.respond_to? :open # Pathname, URI with open-uri
    input.open do |f|
      input = f.read.untaint
      input_charset = f.charset if f.respond_to? :charset
    end
  end

  if input_charset && input_charset != Encoder.internal_charset
    input = Iconv.conv(Encoder.internal_charset, input_charset, input)
  end

  tokens = []
  # The scanner reports back what it decided the input is.
  is_xml, is_html = HTree.scan(input, is_xml) {|token| tokens << token }
  context = is_html ? HTMLContext : DefaultContext

  paired = parse_pairs(tokens, is_xml, is_html)
  repaired = fix_structure_list(paired, is_xml, is_html)
  Doc.new(repaired.map {|s| build_node(s, is_xml, is_html, context) })
end
|
83
|
+
|
84
|
+
# Groups a flat token list into nested structures by pairing start tags
# with end tags.  Each stack frame is [tag name, raw start tag, children].
# An end tag closes the nearest open element of the same name (implicitly
# closing anything opened after it); an end tag with no open partner is
# recorded as [:bogus_etag, raw].  Elements still open at the end are
# closed without an end tag.  Returns the top-level child list.
def HTree.parse_pairs(tokens, is_xml, is_html)
  fold_case = !is_xml && is_html
  stack = [[nil, nil, []]]

  tokens.each do |token|
    case token[0]
    when :stag
      stag_raw_string = token[1]
      stagname = stag_raw_string[Pat::Name]
      stagname = stagname.downcase if fold_case
      stack << [HTree.frozen_string(stagname), stag_raw_string, []]
    when :etag
      etag_raw_string = token[1]
      etagname = etag_raw_string[Pat::Name]
      etagname = etagname.downcase if fold_case
      etagname = HTree.frozen_string(etagname)

      # Innermost open element carrying the same name, if any.
      matched_frame = stack.reverse.find {|frame| frame[0] == etagname }

      if matched_frame
        # Close everything opened after the matching element.
        until stack.last.equal?(matched_frame)
          _, open_raw, kids = stack.pop
          stack.last[2] << [:elem, open_raw, kids]
        end
        _, open_raw, kids = stack.pop
        stack.last[2] << [:elem, open_raw, kids, etag_raw_string]
      else
        stack.last[2] << [:bogus_etag, etag_raw_string]
      end
    else
      stack.last[2] << token
    end
  end

  # Whatever is still open gets closed without an end tag.
  until stack.length <= 1
    _, open_raw, kids = stack.pop
    stack.last[2] << [:elem, open_raw, kids]
  end
  stack.first[2]
end
|
128
|
+
|
129
|
+
# Runs fix_element over every :elem structure in the list.  Structures
# that fix_element hands back as not belonging inside the element are
# put at the front of the work list and processed next.  The argument
# itself is not modified; a new list is returned.
def HTree.fix_structure_list(structure_list, is_xml, is_html)
  fixed = []
  pending = structure_list.dup
  until pending.empty?
    structure = pending.shift
    unless structure[0] == :elem
      fixed << structure
      next
    end
    elem, expelled = fix_element(structure, [], [], is_xml, is_html)
    fixed << elem
    pending = expelled + pending
  end
  fixed
end
|
144
|
+
|
145
|
+
# Repairs one :elem structure and returns a pair:
# [fixed element, structures that must follow it as siblings].
#
# * An element that has an explicit end tag keeps all its children
#   (fixed recursively) and expels nothing.
# * An element whose content model is :EMPTY keeps no children; all of
#   them are expelled.
# * Otherwise children are kept up to the first child element whose tag
#   may not appear inside this element; that child and everything after
#   it are expelled.
#
# +excluded_tags+ / +included_tags+ accumulate the exclusions and
# inclusions inherited from enclosing elements.
def HTree.fix_element(elem, excluded_tags, included_tags, is_xml, is_html)
  stag_raw_string = elem[1]
  children = elem[2]
  etag_raw_string = elem[3]
  if etag_raw_string
    return [:elem, stag_raw_string, fix_structure_list(children, is_xml, is_html), etag_raw_string], []
  end

  fold_case = !is_xml && is_html
  tagname = stag_raw_string[Pat::Name]
  tagname = tagname.downcase if fold_case

  content_model = ElementContent[tagname]
  return [:elem, stag_raw_string, []], children if content_model == :EMPTY

  possible_tags = content_model == :CDATA ? [] : content_model
  if possible_tags
    excluded_tags2 = ElementExclusions[tagname]
    included_tags2 = ElementInclusions[tagname]
    excluded_tags |= excluded_tags2 if excluded_tags2
    included_tags |= included_tags2 if included_tags2
    containable_tags = (possible_tags | included_tags) - excluded_tags
    uncontainable_tags = ElementContent.keys - containable_tags
  else
    # If the tagname is unknown, it is assumed that any element
    # except excluded can be contained.
    uncontainable_tags = excluded_tags
  end

  fixed_children = []
  rest = children
  until rest.empty?
    candidate = rest[0]
    unless candidate[0] == :elem
      fixed_children << rest.shift
      next
    end

    candidate_tagname = candidate[1][Pat::Name]
    candidate_tagname = candidate_tagname.downcase if fold_case
    # Stop at the first child that cannot live here; it stays at the
    # head of +rest+ and is expelled with everything after it.
    break if uncontainable_tags.include? candidate_tagname

    rest.shift
    fixed_child, expelled = fix_element(candidate, excluded_tags, included_tags, is_xml, is_html)
    fixed_children << fixed_child
    rest = expelled + rest
  end
  return [:elem, stag_raw_string, fixed_children], rest
end
|
196
|
+
|
197
|
+
# Turns one parsed structure (an array whose first item is its kind) into
# the corresponding node object.  For :elem, children are built
# recursively with the context of the element's own start tag.  Raises
# Exception for an unknown kind, which indicates an internal bug.
def HTree.build_node(structure, is_xml, is_html, inherited_context=DefaultContext)
  case structure[0]
  when :text_pcdata then Text.parse_pcdata(structure[1])
  when :elem
    stag_rawstring = structure[1]
    children = structure[2]
    etag_rawstring = structure[3]
    etag = etag_rawstring && ETag.parse(etag_rawstring, is_xml, is_html)
    stag = STag.parse(stag_rawstring, true, is_xml, is_html, inherited_context)
    if children.empty? && !etag
      Elem.new!(stag)
    else
      child_nodes = children.map {|c| build_node(c, is_xml, is_html, stag.context) }
      Elem.new!(stag, child_nodes, etag)
    end
  when :emptytag
    Elem.new!(STag.parse(structure[1], false, is_xml, is_html, inherited_context))
  when :bogus_etag then BogusETag.parse(structure[1], is_xml, is_html)
  when :xmldecl then XMLDecl.parse(structure[1])
  when :doctype then DocType.parse(structure[1], is_xml, is_html)
  when :procins then ProcIns.parse(structure[1])
  when :comment then Comment.parse(structure[1])
  when :text_cdata_content then Text.parse_cdata_content(structure[1])
  when :text_cdata_section then Text.parse_cdata_section(structure[1])
  else
    raise Exception, "[bug] unknown structure: #{structure.inspect}"
  end
end
|
232
|
+
|
233
|
+
# Parses the raw text of a start tag (+is_stag+ true) or an
# empty-element tag (+is_stag+ false) into an STag.
#
# The strict patterns are tried first, then the lenient "Invalid"
# patterns; HTree::Error is raised when neither matches.  Attributes are
# collected as [name, value] pairs, with name nil for an attribute
# written as a bare value.  When the input is HTML and not XML, the tag
# name and attribute names are downcased.  The name of a bare attribute
# is looked up in OmittedAttrName for the tag, falling back to the
# (downcased) value, or to the value itself when the tag has no entry.
def STag.parse(raw_string, is_stag, is_xml, is_html, inherited_context=DefaultContext)
  attrs = []
  if (is_stag ? /\A#{Pat::ValidStartTag_C}\z/o : /\A#{Pat::ValidEmptyTag_C}\z/o) =~ raw_string
    qname = Regexp.last_match(1)
    Regexp.last_match(2).scan(Pat::ValidAttr_C) {
      m = Regexp.last_match
      attrs << (m[5] ? [nil, m[5]] : [m[1], m[2] || m[3] || m[4]])
    }
  elsif (is_stag ? /\A#{Pat::InvalidStartTag_C}\z/o : /\A#{Pat::InvalidEmptyTag_C}\z/o) =~ raw_string
    qname = Regexp.last_match(1)
    last_attr = Regexp.last_match(3)
    Regexp.last_match(2).scan(Pat::InvalidAttr1_C) {
      m = Regexp.last_match
      attrs << (m[5] ? [nil, m[5]] : [m[1], m[2] || m[3] || m[4]])
    }
    if last_attr
      /#{Pat::InvalidAttr1End_C}/o =~ last_attr
      attrs << [Regexp.last_match(1), Regexp.last_match(2) || Regexp.last_match(3)]
    end
  else
    raise HTree::Error, "cannot recognize as start tag or empty tag: #{raw_string.inspect}"
  end

  fold_case = !is_xml && is_html
  qname = qname.downcase if fold_case

  attrs.map! do |aname, aval|
    if aname
      [fold_case ? aname.downcase : aname, Text.parse_pcdata(aval)]
    else
      val2name = OmittedAttrName[qname]
      derived_name =
        if val2name
          lowered = aval.downcase
          val2name.fetch(lowered, lowered)
        else
          aval
        end
      [derived_name, Text.new(aval)]
    end
  end

  stag = STag.new(qname, attrs, inherited_context)
  stag.raw_string = raw_string
  stag
end
|
275
|
+
|
276
|
+
# Parses the raw text of an end tag.  The tag name is downcased when the
# input is HTML and not XML.  Raises HTree::Error when +raw_string+ does
# not match the end-tag pattern.
def ETag.parse(raw_string, is_xml, is_html)
  matched = /\A#{Pat::EndTag_C}\z/o.match(raw_string)
  raise HTree::Error, "cannot recognize as end tag: #{raw_string.inspect}" unless matched

  qname = matched[1]
  qname = qname.downcase if !is_xml && is_html

  etag = self.new(qname)
  etag.raw_string = raw_string
  etag
end
|
288
|
+
|
289
|
+
# Parses the raw text of an end tag that has no matching start tag.  The
# tag name is downcased when the input is HTML and not XML.  Raises
# HTree::Error when +raw_string+ does not match the end-tag pattern.
def BogusETag.parse(raw_string, is_xml, is_html)
  matched = /\A#{Pat::EndTag_C}\z/o.match(raw_string)
  raise HTree::Error, "cannot recognize as end tag: #{raw_string.inspect}" unless matched

  qname = matched[1]
  qname = qname.downcase if !is_xml && is_html

  bogus = self.new(qname)
  bogus.raw_string = raw_string
  bogus
end
|
301
|
+
|
302
|
+
# Builds a Text node from raw PCDATA, repairing sloppy ampersands so the
# stored text contains only terminated references:
# * a reference that already ends in ";" is kept as is;
# * a numeric reference without ";" gets the ";" appended;
# * a bare "&" becomes "&amp;";
# * a name without ";" is completed to "&name;" when it matches
#   NamedCharactersPattern, otherwise its ampersand is escaped
#   ("&amp;name").
# The original input is kept on the node as raw_string.
def Text.parse_pcdata(raw_string)
  fixed = raw_string.gsub(/&(?:(?:#[0-9]+|#x[0-9a-fA-F]+|([A-Za-z][A-Za-z0-9]*));?)?/o) {|s|
    # Capture the entity name first: the regexp tests in the `when`
    # clauses below overwrite the match variables.
    name = $1
    case s
    when /;\z/
      s
    when /\A&#/
      "#{s};"
    when '&'
      '&amp;'
    else
      if NamedCharactersPattern =~ name
        "&#{name};"
      else
        "&amp;#{name}"
      end
    end
  }
  # Reuse the caller's string object when nothing had to be repaired.
  fixed = raw_string if fixed == raw_string
  result = Text.new_internal(fixed)
  result.raw_string = raw_string
  result
end
|
325
|
+
|
326
|
+
# Builds a Text node from CDATA element content: the raw string is
# passed to Text.new as is and also kept on the node as raw_string.
def Text.parse_cdata_content(raw_string)
  text = Text.new(raw_string)
  text.raw_string = raw_string
  text
end
|
331
|
+
|
332
|
+
# Builds a Text node from a CDATA section; the text is the first capture
# group of Pat::CDATA_C.  Raises HTree::Error when +raw_string+ does not
# match that pattern.
def Text.parse_cdata_section(raw_string)
  section = /\A#{Pat::CDATA_C}\z/o.match(raw_string)
  raise HTree::Error, "cannot recognize as CDATA section: #{raw_string.inspect}" unless section

  text = Text.new(section[1])
  text.raw_string = raw_string
  text
end
|
343
|
+
|
344
|
+
# Parses an XML declaration into an XMLDecl.  Version, encoding and
# standalone are each taken from one of two alternative capture groups
# of Pat::XmlDecl_C; standalone becomes true for "yes", false for "no"
# and nil otherwise.  Raises HTree::Error when +raw_string+ does not
# match.
def XMLDecl.parse(raw_string)
  decl = /\A#{Pat::XmlDecl_C}\z/o.match(raw_string)
  raise HTree::Error, "cannot recognize as XML declaration: #{raw_string.inspect}" unless decl

  version = decl[1] || decl[2]
  encoding = decl[3] || decl[4]
  # Anything other than "yes"/"no" (including absence) maps to nil.
  standalone = { 'yes' => true, 'no' => false }[decl[5] || decl[6]]

  xmldecl = XMLDecl.new(version, encoding, standalone)
  xmldecl.raw_string = raw_string
  xmldecl
end
|
364
|
+
|
365
|
+
# Parses a document type declaration into a DocType.  The root element
# name is the first capture group of Pat::DocType_C; the public and
# system identifiers are each taken from one of two alternative groups.
# The root element name is downcased when the input is HTML and not XML.
# Raises HTree::Error when +raw_string+ does not match.
def DocType.parse(raw_string, is_xml, is_html)
  unless /\A#{Pat::DocType_C}\z/o =~ raw_string
    # The message names the construct that actually failed to parse.
    raise HTree::Error, "cannot recognize as document type declaration: #{raw_string.inspect}"
  end

  root_element_name = $1
  public_identifier = $2 || $3
  system_identifier = $4 || $5

  root_element_name = root_element_name.downcase if !is_xml && is_html

  result = DocType.new(root_element_name, public_identifier, system_identifier)
  result.raw_string = raw_string
  result
end
|
380
|
+
|
381
|
+
# Parses a processing instruction into a ProcIns; target and content are
# the first and second capture groups of Pat::XmlProcIns_C.  Raises
# HTree::Error when +raw_string+ does not match.
def ProcIns.parse(raw_string)
  pi = /\A#{Pat::XmlProcIns_C}\z/o.match(raw_string)
  raise HTree::Error, "cannot recognize as processing instruction: #{raw_string.inspect}" unless pi

  procins = ProcIns.new(pi[1], pi[2])
  procins.raw_string = raw_string
  procins
end
|
393
|
+
|
394
|
+
# Parses a comment into a Comment; the text is the first capture group
# of Pat::Comment_C.  Raises HTree::Error when +raw_string+ does not
# match.
def Comment.parse(raw_string)
  matched = /\A#{Pat::Comment_C}\z/o.match(raw_string)
  raise HTree::Error, "cannot recognize as comment: #{raw_string.inspect}" unless matched

  comment = Comment.new(matched[1])
  comment.raw_string = raw_string
  comment
end
|
405
|
+
|
406
|
+
end
|
407
|
+
# :startdoc:
|