htree 0.7.0
Sign up to get free protection for your applications and to get access to all the features.
- data.tar.gz.sig +4 -0
- data/Makefile +20 -0
- data/Manifest +58 -0
- data/README +61 -0
- data/Rakefile +37 -0
- data/htree.gemspec +32 -0
- data/init.rb +1 -0
- data/install.rb +112 -0
- data/lib/htree.rb +97 -0
- data/lib/htree/container.rb +8 -0
- data/lib/htree/context.rb +69 -0
- data/lib/htree/display.rb +46 -0
- data/lib/htree/doc.rb +149 -0
- data/lib/htree/elem.rb +262 -0
- data/lib/htree/encoder.rb +217 -0
- data/lib/htree/equality.rb +219 -0
- data/lib/htree/extract_text.rb +37 -0
- data/lib/htree/fstr.rb +32 -0
- data/lib/htree/gencode.rb +193 -0
- data/lib/htree/htmlinfo.rb +672 -0
- data/lib/htree/inspect.rb +108 -0
- data/lib/htree/leaf.rb +92 -0
- data/lib/htree/loc.rb +369 -0
- data/lib/htree/modules.rb +49 -0
- data/lib/htree/name.rb +122 -0
- data/lib/htree/output.rb +212 -0
- data/lib/htree/parse.rb +410 -0
- data/lib/htree/raw_string.rb +127 -0
- data/lib/htree/regexp-util.rb +19 -0
- data/lib/htree/rexml.rb +131 -0
- data/lib/htree/scan.rb +176 -0
- data/lib/htree/tag.rb +113 -0
- data/lib/htree/template.rb +961 -0
- data/lib/htree/text.rb +115 -0
- data/lib/htree/traverse.rb +497 -0
- data/test-all.rb +5 -0
- data/test/assign.html +1 -0
- data/test/template.html +4 -0
- data/test/test-attr.rb +67 -0
- data/test/test-charset.rb +79 -0
- data/test/test-context.rb +29 -0
- data/test/test-display_xml.rb +45 -0
- data/test/test-elem-new.rb +101 -0
- data/test/test-encoder.rb +53 -0
- data/test/test-equality.rb +55 -0
- data/test/test-extract_text.rb +18 -0
- data/test/test-gencode.rb +27 -0
- data/test/test-leaf.rb +25 -0
- data/test/test-loc.rb +60 -0
- data/test/test-namespace.rb +147 -0
- data/test/test-output.rb +133 -0
- data/test/test-parse.rb +115 -0
- data/test/test-raw_string.rb +17 -0
- data/test/test-rexml.rb +70 -0
- data/test/test-scan.rb +153 -0
- data/test/test-security.rb +37 -0
- data/test/test-subnode.rb +142 -0
- data/test/test-template.rb +313 -0
- data/test/test-text.rb +43 -0
- data/test/test-traverse.rb +69 -0
- metadata +166 -0
- metadata.gz.sig +1 -0
@@ -0,0 +1,49 @@
|
|
1
|
+
# Declares the HTree namespace together with its classes and mix-in
# modules.  Only inheritance and inclusion relationships are established
# here; no methods are defined in this file.
module HTree
  # Name and Context both mix in HTree itself.
  class Name; include HTree; end
  class Context; include HTree; end

  # :stopdoc:
  # Start tags and end tags share the Tag mix-in.
  module Tag; include HTree; end
  class STag; include Tag; end
  class ETag; include Tag; end
  # :startdoc:

  # Node is mixed into every tree node through Container or Leaf.
  module Node; include HTree; end
  module Container; include Node; end
  class Doc; include Container; end
  class Elem; include Container; end
  module Leaf; include Node; end
  class Text; include Leaf; end
  class XMLDecl; include Leaf; end
  class DocType; include Leaf; end
  class ProcIns; include Leaf; end
  class Comment; include Leaf; end
  class BogusETag; include Leaf; end

  # Every node class gets its own Trav module, which is layered on top of
  # the Trav module of its kind (Container or Leaf) and then mixed into
  # the class itself.
  module Traverse; end
  module Container::Trav; include Traverse; end
  module Leaf::Trav; include Traverse; end
  class Doc; module Trav; include Container::Trav; end; include Trav; end
  class Elem; module Trav; include Container::Trav; end; include Trav; end
  class Text; module Trav; include Leaf::Trav; end; include Trav; end
  class XMLDecl; module Trav; include Leaf::Trav; end; include Trav; end
  class DocType; module Trav; include Leaf::Trav; end; include Trav; end
  class ProcIns; module Trav; include Leaf::Trav; end; include Trav; end
  class Comment; module Trav; include Leaf::Trav; end; include Trav; end
  class BogusETag; module Trav; include Leaf::Trav; end; include Trav; end

  # Every node class also gets a Loc class derived from Location.  It mixes
  # in the node class's own Trav module and the Loc module of its kind.
  class Location; include HTree; end
  module Container::Loc; end
  module Leaf::Loc; end
  class Doc; class Loc < Location; include Trav, Container::Loc; end; end
  class Elem; class Loc < Location; include Trav, Container::Loc; end; end
  class Text; class Loc < Location; include Trav, Leaf::Loc; end; end
  class XMLDecl; class Loc < Location; include Trav, Leaf::Loc; end; end
  class DocType; class Loc < Location; include Trav, Leaf::Loc; end; end
  class ProcIns; class Loc < Location; include Trav, Leaf::Loc; end; end
  class Comment; class Loc < Location; include Trav, Leaf::Loc; end; end
  class BogusETag; class Loc < Location; include Trav, Leaf::Loc; end; end

  # Error raised by HTree code.
  class Error < StandardError; end
end
|
49
|
+
|
data/lib/htree/name.rb
ADDED
@@ -0,0 +1,122 @@
|
|
1
|
+
require 'htree/scan' # for Pat::Nmtoken
|
2
|
+
require 'htree/context'
|
3
|
+
|
4
|
+
module HTree
|
5
|
+
# Name represents an element name or an attribute name.
|
6
|
+
# It consists of a namespace prefix, a namespace URI and a local name.
|
7
|
+
class Name
|
8
|
+
=begin
|
9
|
+
element name prefix uri localname
|
10
|
+
{u}n, n with xmlns=u nil 'u' 'n'
|
11
|
+
p{u}n, p:n with xmlns:p=u 'p' 'u' 'n'
|
12
|
+
n with xmlns='' nil '' 'n'
|
13
|
+
|
14
|
+
attribute name
|
15
|
+
xmlns= 'xmlns' nil nil
|
16
|
+
xmlns:n= 'xmlns' nil 'n'
|
17
|
+
p{u}n=, p:n= with xmlns:p=u 'p' 'u' 'n'
|
18
|
+
n= nil '' 'n'
|
19
|
+
=end
|
20
|
+
# Builds the Name of an element from its textual form.
#
# name is one of "{uri}local", "prefix{uri}local", "prefix:local" or
# "local".  context must respond to namespace_uri(prefix); it is asked
# for the URI bound to a prefix, and with nil for the default namespace.
#
# A prefix whose URI is reported as nil or '' is treated as undeclared.
# In that case the whole of name (colon included) is used as the local
# name, in the default namespace if there is one and in no namespace
# otherwise.
def Name.parse_element_name(name, context)
  if /\{(.*)\}/ =~ name
    # "{u}n" means "use default namespace",
    # "p{u}n" means "use the specified prefix p"
    prefix, uri, local = $`, $1, $'
    return Name.new(prefix == '' ? nil : prefix, uri, local)
  end

  if /:/ =~ name
    prefix, local = $`, $'
    uri = context.namespace_uri(prefix)
    # namespace_uri may answer nil for an unknown prefix; do not call
    # empty? on it blindly.
    return Name.new(prefix, uri, local) if uri && !uri.empty?
  end

  default_uri = context.namespace_uri(nil)
  if default_uri && !default_uri.empty?
    Name.new(nil, default_uri, name)
  else
    Name.new(nil, '', name)
  end
end
|
33
|
+
|
34
|
+
# Builds the Name of an attribute from its textual form.
#
# "xmlns" and "xmlns:p" yield namespace-declaration names (prefix
# 'xmlns', no URI).  "{uri}local" and "prefix{uri}local" carry their URI
# explicitly.  "prefix:local" is resolved through
# context.namespace_uri(prefix).  Anything else, including a prefix whose
# URI is reported as nil or '', becomes a name in no namespace with the
# whole of name as its local name; the default namespace is never
# applied to attributes here.
def Name.parse_attribute_name(name, context)
  return Name.new('xmlns', nil, nil) if name == 'xmlns'
  return Name.new('xmlns', nil, $') if /\Axmlns:/ =~ name

  if /\{(.*)\}/ =~ name
    prefix, uri, local = $`, $1, $'
    return Name.new(prefix == '' ? nil : prefix, uri, local)
  end

  if /:/ =~ name
    prefix, local = $`, $'
    uri = context.namespace_uri(prefix)
    # namespace_uri may answer nil for an unknown prefix; do not call
    # empty? on it blindly.
    return Name.new(prefix, uri, local) if uri && !uri.empty?
  end

  Name.new(nil, '', name)
end
|
50
|
+
|
51
|
+
# Every Name created so far, keyed by
# [namespace_prefix, namespace_uri, local_name, receiving class].
NameCache = {}

# Returns the cached Name for the given triple, creating and caching it on
# first use, so equal triples requested on the same class share one object.
# A newly created Name (and its cache key) holds frozen copies of the
# argument strings rather than the arguments themselves.
def Name.new(namespace_prefix, namespace_uri, local_name)
  lookup_key = [namespace_prefix, namespace_uri, local_name, self]
  NameCache.fetch(lookup_key) do
    parts = lookup_key[0, 3].map {|part| part ? part.dup.freeze : part }
    NameCache[parts + [self]] = super(*parts)
  end
end
|
59
|
+
|
60
|
+
# Stores the three components and validates them.
#
# Raises HTree::Error when
# * the prefix or the local name is given but is not an Nmtoken,
# * the prefix is 'xmlns' and a namespace URI is given, or
# * the prefix is not 'xmlns' and the namespace URI is not a String.
def initialize(namespace_prefix, namespace_uri, local_name)
  @namespace_prefix = namespace_prefix
  @namespace_uri = namespace_uri
  @local_name = local_name

  nmtoken = /\A#{Pat::Nmtoken}\z/o

  if @namespace_prefix && nmtoken !~ @namespace_prefix
    raise HTree::Error, "invalid namespace prefix: #{@namespace_prefix.inspect}"
  end

  if @local_name && nmtoken !~ @local_name
    raise HTree::Error, "invalid local name: #{@local_name.inspect}"
  end

  if @namespace_prefix == 'xmlns'
    # xmlns and xmlns:* declare namespaces; they belong to none themselves.
    if @namespace_uri != nil
      raise HTree::Error, "Name object for xmlns:* must not have namespace URI: #{@namespace_uri.inspect}"
    end
  elsif !(String === @namespace_uri)
    raise HTree::Error, "invalid namespace URI: #{@namespace_uri.inspect}"
  end
end
|
80
|
+
attr_reader :namespace_prefix, :namespace_uri, :local_name
|
81
|
+
|
82
|
+
# True when this name is a namespace declaration: the prefix is 'xmlns'
# and there is no namespace URI.
def xmlns?
  @namespace_uri.nil? && @namespace_prefix == 'xmlns'
end
|
85
|
+
|
86
|
+
# Returns the name in "{uri}local" form, or just the local name when the
# namespace URI is nil or empty.
#
# A bare xmlns declaration has neither a URI nor a local name; "xmlns" is
# returned for it, matching what qualified_name and to_s return, instead
# of attempting to dup nil.
def universal_name
  if @namespace_uri && !@namespace_uri.empty?
    "{#{@namespace_uri}}#{@local_name}"
  elsif @local_name
    @local_name.dup
  else
    "xmlns"
  end
end
|
93
|
+
|
94
|
+
# Returns the name as "prefix:local" when it has both a non-empty
# namespace URI and a prefix, otherwise a copy of the local name.  A name
# with neither URI nor local name (a bare xmlns declaration) yields "xmlns".
def qualified_name
  has_uri = @namespace_uri && !@namespace_uri.empty?
  return "#{@namespace_prefix}:#{@local_name}" if has_uri && @namespace_prefix
  return @local_name.dup if has_uri || @local_name
  "xmlns"
end
|
107
|
+
|
108
|
+
# Returns "prefix{uri}local" or "{uri}local" when the namespace URI is
# non-empty, otherwise a copy of the local name.  A name with neither URI
# nor local name (a bare xmlns declaration) yields "xmlns".
def to_s
  has_uri = @namespace_uri && !@namespace_uri.empty?
  unless has_uri
    return @local_name ? @local_name.dup : "xmlns"
  end
  braced = "{#{@namespace_uri}}#{@local_name}"
  @namespace_prefix ? "#{@namespace_prefix}#{braced}" : braced
end
|
121
|
+
end
|
122
|
+
end
|
data/lib/htree/output.rb
ADDED
@@ -0,0 +1,212 @@
|
|
1
|
+
require 'htree/encoder'
|
2
|
+
require 'htree/doc'
|
3
|
+
require 'htree/elem'
|
4
|
+
require 'htree/leaf'
|
5
|
+
require 'htree/text'
|
6
|
+
|
7
|
+
module HTree
|
8
|
+
# :stopdoc:
|
9
|
+
|
10
|
+
class Text
  # Character references written in place of markup-significant
  # characters.  '&' is not listed: the methods below escape @rcdata,
  # which presumably already stores '&' as a reference (see
  # Text.parse_pcdata) -- TODO confirm.
  ChRef = {
    '>' => '&gt;',
    '<' => '&lt;',
    '"' => '&quot;',
  }

  # Writes the text as element content, with '<' and '>' replaced by
  # character references.
  def output(out, context=nil)
    out.output_text @rcdata.gsub(/[<>]/) {|s| ChRef[s] }
  end

  # Returns the text escaped for use inside a double-quoted attribute
  # value: '<', '>' and '"' are replaced by character references.
  def to_attvalue_content
    @rcdata.gsub(/[<>"]/) {|s| ChRef[s] }
  end

  # Writes the text as a double-quoted attribute value.
  def output_attvalue(out, context)
    out.output_string '"'
    out.output_text to_attvalue_content
    out.output_string '"'
  end

  # Writes the text verbatim (the result of to_s), as the content of a
  # CDATA element.  Raises ArgumentError when the text contains '</',
  # which cannot be written without escaping.
  def output_cdata(out)
    str = self.to_s
    if %r{</} =~ str
      raise ArgumentError, "CDATA cannot contain '</': #{str.inspect}"
    end
    out.output_string(str)
  end
end
|
39
|
+
|
40
|
+
class Name
  # Writes the name as it appears in a tag: "xmlns" or "xmlns:local" for
  # a namespace declaration, the qualified name otherwise.
  def output(out, context)
    # xxx: validate namespace prefix
    text =
      if !xmlns?
        qualified_name
      elsif @local_name
        "xmlns:#{@local_name}"
      else
        "xmlns"
      end
    out.output_string text
  end

  # Writes an attribute as name, '=' and the quoted value produced by
  # text.output_attvalue.
  def output_attribute(text, out, context)
    output(out, context)
    out.output_string('=')
    text.output_attvalue(out, context)
  end
end
|
60
|
+
|
61
|
+
class Doc
  # Writes every child in order.  Children that respond to
  # output_prolog_xmldecl are written through that method, but only the
  # first of them; later ones are skipped entirely.
  def output(out, context)
    decl_seen = false
    @children.each do |child|
      unless child.respond_to?(:output_prolog_xmldecl)
        child.output(out, context)
        next
      end
      child.output_prolog_xmldecl(out, context) unless decl_seen # xxx: encoding?
      decl_seen = true
    end
  end
end
|
74
|
+
|
75
|
+
class Elem
  # Writes the element.
  #
  # XHTML script and style elements are always written as a start tag,
  # their children as CDATA content, and an end tag.  Other elements are
  # written as an empty tag when @empty is set, and otherwise as start
  # tag, children and end tag.  Children are written with the context
  # returned by the start tag.
  def output(out, context)
    # The dots are escaped so that only the exact XHTML namespace URI
    # selects the CDATA form.
    if %r{\A\{http://www\.w3\.org/1999/xhtml\}(?:script|style)\z} =~ @stag.element_name.universal_name
      children_context = @stag.output_stag(out, context)
      out.output_cdata_content(@children, children_context)
      @stag.output_etag(out, context)
    elsif @empty
      @stag.output_emptytag(out, context)
    else
      children_context = @stag.output_stag(out, context)
      @children.each {|n| n.output(out, children_context) }
      @stag.output_etag(out, context)
    end
  end
end
|
90
|
+
|
91
|
+
class STag
  # Writes every attribute that is not a namespace declaration, each
  # preceded by a space, then lets this tag's context write the namespace
  # declarations.  Returns whatever @context.output_namespaces returns,
  # which the callers use as the context for child nodes.
  def output_attributes(out, context)
    @attributes.each do |attr_name, attr_text|
      unless attr_name.xmlns?
        out.output_string ' '
        attr_name.output_attribute(attr_text, out, context)
      end
    end
    @context.output_namespaces(out, context)
  end

  # Writes the tag in empty-element form: name, attributes, a newline,
  # a slash if out decides to emit one, and '>'.  Returns the context
  # produced by output_attributes.
  def output_emptytag(out, context)
    out.output_string '<'
    @name.output(out, context)
    inner_context = output_attributes(out, context)
    out.output_string "\n"
    out.output_slash_if_xml
    out.output_string ">"
    inner_context
  end

  # Writes the start tag; the closing '>' is preceded by a newline.
  # Returns the context produced by output_attributes.
  def output_stag(out, context)
    out.output_string '<'
    @name.output(out, context)
    inner_context = output_attributes(out, context)
    out.output_string "\n>"
    inner_context
  end

  # Writes the matching end tag; the closing '>' is preceded by a newline.
  def output_etag(out, context)
    out.output_string '</'
    @name.output(out, context)
    out.output_string "\n>"
  end
end
|
125
|
+
|
126
|
+
class Context
  # Writes the namespace declarations needed when a tag with this context
  # appears inside outer_context.
  #
  # A prefix that outer_context binds to a different URI is declared
  # inline as xmlns / xmlns:prefix.  Prefixes that outer_context does not
  # know at all (namespace_uri answers nil) are collected and handed to
  # out.output_xmlns in one call.  Prefixes already bound to the same URI
  # are not written.
  #
  # Returns outer_context.subst_namespaces(@namespaces).
  def output_namespaces(out, outer_context)
    undeclared = {}
    @namespaces.each do |prefix, uri|
      known_uri = outer_context.namespace_uri(prefix)
      if known_uri == nil
        undeclared[prefix] = uri
        next
      end
      next if known_uri == uri
      out.output_string(prefix ? " xmlns:#{prefix}=" : " xmlns=")
      Text.new(uri).output_attvalue(out, outer_context)
    end
    out.output_xmlns(undeclared) unless undeclared.empty?
    outer_context.subst_namespaces(@namespaces)
  end
end
|
148
|
+
|
149
|
+
class BogusETag
  # A bogus end tag is dropped from the output: nothing is written.
  def output(out, context)
    nil
  end
end
|
154
|
+
|
155
|
+
class XMLDecl
  # Writes nothing.  The declaration is emitted through
  # output_prolog_xmldecl instead (Doc#output calls it).
  def output(out, context)
    nil
  end

  # Writes the XML declaration: the version always, the encoding when one
  # is set, and standalone ("yes"/"no") when it is not nil.
  def output_prolog_xmldecl(out, context)
    out.output_string "<?xml version=\"#{@version}\""
    out.output_string " encoding=\"#{@encoding}\"" if @encoding
    unless @standalone == nil
      out.output_string " standalone=\"#{@standalone ? 'yes' : 'no'}\""
    end
    out.output_string "?>"
  end
end
|
171
|
+
|
172
|
+
class DocType
  # Writes the document type declaration.
  def output(out, context)
    out.output_string "<!DOCTYPE #{@root_element_name} #{generate_content}>"
  end

  # Builds the external identifier part of the declaration: PUBLIC with
  # the quoted public identifier, or SYSTEM when there is none, followed
  # by the system identifier if one is set.
  def generate_content # :nodoc:
    external_id = @public_identifier ? "PUBLIC \"#{@public_identifier}\"" : "SYSTEM"
    # Although a system identifier is not omissible in XML,
    # we cannot output it if it is not given.
    return external_id unless @system_identifier
    # Use single quotes only when the identifier itself contains '"'.
    quote = /"/ =~ @system_identifier ? "'" : '"'
    "#{external_id} #{quote}#{@system_identifier}#{quote}"
  end
end
|
196
|
+
|
197
|
+
class ProcIns
  # Writes the processing instruction; the content, when present, is
  # separated from the target by one space.
  def output(out, context)
    out.output_string "<?#{@target}"
    if @content
      out.output_string " #{@content}"
    end
    out.output_string "?>"
  end
end
|
204
|
+
|
205
|
+
class Comment
  # Writes the comment with its delimiters; the content is written as is.
  def output(out, context)
    out.output_string('<!--' + @content.to_s + '-->')
  end
end
|
210
|
+
|
211
|
+
# :startdoc:
|
212
|
+
end
|
data/lib/htree/parse.rb
ADDED
@@ -0,0 +1,410 @@
|
|
1
|
+
require 'htree/scan'
|
2
|
+
require 'htree/htmlinfo'
|
3
|
+
require 'htree/text'
|
4
|
+
require 'htree/tag'
|
5
|
+
require 'htree/leaf'
|
6
|
+
require 'htree/doc'
|
7
|
+
require 'htree/elem'
|
8
|
+
require 'htree/raw_string'
|
9
|
+
require 'htree/context'
|
10
|
+
require 'htree/encoder'
|
11
|
+
require 'htree/fstr'
|
12
|
+
|
13
|
+
module HTree
|
14
|
+
# HTree.parse parses <i>input</i> and returns a document tree
# represented by HTree::Doc.
#
# <i>input</i> should be a String or
# an object which responds to the read or open method.
# For example, IO, StringIO, Pathname, URI::HTTP and URI::FTP are acceptable.
# Note that the URIs need open-uri.
#
# HTree.parse guesses whether <i>input</i> is HTML and whether it is XML.
#
# If it is guessed to be HTML, the default namespace in the result is set to
# http://www.w3.org/1999/xhtml, whether or not <i>input</i> has an XML
# namespace declaration, and even if it is pre-XML HTML.
#
# If it is guessed to be HTML and not XML, all element and attribute names
# are downcased.
#
# If the opened file or the read content has a charset method,
# HTree.parse decodes it according to $KCODE before parsing.
# Otherwise HTree.parse assumes the character encoding of the content is
# compatible with $KCODE.
# Note that the charset method is provided by URI::HTTP with open-uri.
def HTree.parse(input)
  HTree.with_frozen_string_hash do
    parse_as(input, false)
  end
end
|
39
|
+
|
40
|
+
# HTree.parse_xml parses <i>input</i> as XML and
# returns a document tree represented by HTree::Doc.
#
# It behaves almost the same as HTree.parse, but it assumes <i>input</i>
# is XML even if there is no XML declaration.
# That assumption causes the following differences:
# * element names are not downcased.
# * the content of <script> and <style> elements is PCDATA, not CDATA.
def HTree.parse_xml(input)
  HTree.with_frozen_string_hash do
    parse_as(input, true)
  end
end
|
53
|
+
|
54
|
+
# :stopdoc:
|
55
|
+
|
56
|
+
# Shared implementation of HTree.parse and HTree.parse_xml.
#
# Reads input (directly, through read, or through open), converts it to
# the encoder's internal charset when a differing charset is reported,
# scans it into tokens and builds an HTree::Doc from them.  is_xml is
# passed to HTree.scan, and the is_xml value scan returns is used from
# then on.
#
# Raises SecurityError when input is tainted and $SAFE is 1 or higher.
def HTree.parse_as(input, is_xml)
  raise SecurityError, "input tainted" if input.tainted? && 1 <= $SAFE

  input_charset = nil
  if input.respond_to? :read # IO, StringIO
    input = input.read.untaint
    # NOTE(review): charset is looked up on the content just read, not on
    # the object it was read from -- confirm this is intended.
    input_charset = input.charset if input.respond_to? :charset
  elsif input.respond_to? :open # Pathname, URI with open-uri
    input.open do |io|
      input = io.read.untaint
      input_charset = io.charset if io.respond_to? :charset
    end
  end

  needs_conversion = input_charset && input_charset != Encoder.internal_charset
  input = Iconv.conv(Encoder.internal_charset, input_charset, input) if needs_conversion

  tokens = []
  is_xml, is_html = HTree.scan(input, is_xml) {|token| tokens << token }
  context = is_html ? HTMLContext : DefaultContext
  paired = parse_pairs(tokens, is_xml, is_html)
  structure_list = fix_structure_list(paired, is_xml, is_html)
  Doc.new(structure_list.map {|s| build_node(s, is_xml, is_html, context) })
end
|
84
|
+
|
85
|
+
# Groups a flat token list into nested structures by pairing start tags
# with end tags.
#
# The result is a list whose items are either tokens passed through
# unchanged, [:bogus_etag, raw_etag] for an end tag with no open start tag
# of the same name, or [:elem, raw_stag, children] /
# [:elem, raw_stag, children, raw_etag].  The four-item form is used only
# for an element closed by its own end tag.  Elements still open when an
# enclosing element is closed, or when the input ends, get the three-item
# form.
#
# Tag names are compared downcased when the input is HTML and not XML.
def HTree.parse_pairs(tokens, is_xml, is_html)
  fold_case = !is_xml && is_html
  # Each entry is [tag name, raw start tag, children]; the first entry is
  # a sentinel that collects the top-level items.
  open_elems = [[nil, nil, []]]

  tokens.each do |token|
    case token[0]
    when :stag
      raw_stag = token[1]
      name = raw_stag[Pat::Name]
      name = name.downcase if fold_case
      open_elems << [HTree.frozen_string(name), raw_stag, []]
    when :etag
      raw_etag = token[1]
      name = raw_etag[Pat::Name]
      name = name.downcase if fold_case
      name = HTree.frozen_string(name)
      # Innermost open element with the same name, if any.
      opener = open_elems.reverse.find {|entry| entry[0] == name }
      if opener
        # Everything opened after it is closed implicitly.
        until opener.equal? open_elems.last
          _, raw_stag, kids = open_elems.pop
          open_elems.last[2] << [:elem, raw_stag, kids]
        end
        _, raw_stag, kids = open_elems.pop
        open_elems.last[2] << [:elem, raw_stag, kids, raw_etag]
      else
        open_elems.last[2] << [:bogus_etag, raw_etag]
      end
    else
      open_elems.last[2] << token
    end
  end

  # Close whatever is still open at the end of the input.
  while 1 < open_elems.length
    _, raw_stag, kids = open_elems.pop
    open_elems.last[2] << [:elem, raw_stag, kids]
  end
  open_elems[0][2]
end
|
129
|
+
|
130
|
+
# Applies fix_element to every :elem item of structure_list and returns
# the corrected list.  Items that fix_element hands back as not belonging
# inside an element are put back at the front of the work list, so they
# are processed next as siblings.  structure_list itself is not modified.
def HTree.fix_structure_list(structure_list, is_xml, is_html)
  fixed = []
  pending = structure_list.dup
  until pending.empty?
    item = pending.shift
    unless item[0] == :elem
      fixed << item
      next
    end
    elem, leftovers = fix_element(item, [], [], is_xml, is_html)
    fixed << elem
    pending = leftovers + pending
  end
  fixed
end
|
145
|
+
|
146
|
+
# Corrects the children of one :elem structure and returns a pair
# [fixed element, items that must become following siblings].
#
# * An element that has an end tag keeps all its children (each fixed
#   recursively); nothing is handed back.
# * An element whose ElementContent entry is :EMPTY keeps no children; all
#   of them are handed back.
# * Otherwise children are kept up to the first child element that this
#   element cannot contain; that child and everything after it are handed
#   back.
#
# excluded_tags and included_tags are the exclusions and inclusions
# accumulated from the enclosing elements.
def HTree.fix_element(elem, excluded_tags, included_tags, is_xml, is_html)
  _, stag_raw_string, children, etag_raw_string = elem
  if etag_raw_string
    return [[:elem, stag_raw_string, fix_structure_list(children, is_xml, is_html), etag_raw_string], []]
  end

  fold_case = !is_xml && is_html
  tagname = stag_raw_string[Pat::Name]
  tagname = tagname.downcase if fold_case
  content_model = ElementContent[tagname]
  return [[:elem, stag_raw_string, []], children] if content_model == :EMPTY

  possible_tags = content_model == :CDATA ? [] : content_model
  if possible_tags
    own_exclusions = ElementExclusions[tagname]
    own_inclusions = ElementInclusions[tagname]
    excluded_tags |= own_exclusions if own_exclusions
    included_tags |= own_inclusions if own_inclusions
    containable = (possible_tags | included_tags) - excluded_tags
    uncontainable = ElementContent.keys - containable
  else
    # If the tagname is unknown, it is assumed that any element
    # except excluded can be contained.
    uncontainable = excluded_tags
  end

  fixed_children = []
  rest = children
  until rest.empty?
    unless rest[0][0] == :elem
      fixed_children << rest.shift
      next
    end
    child = rest.shift
    child_name = child[1][Pat::Name]
    child_name = child_name.downcase if fold_case
    if uncontainable.include? child_name
      # This child ends the element; put it back so it is handed back too.
      rest.unshift child
      break
    end
    fixed_child, leftovers = fix_element(child, excluded_tags, included_tags, is_xml, is_html)
    fixed_children << fixed_child
    rest = leftovers + rest
  end
  [[:elem, stag_raw_string, fixed_children], rest]
end
|
197
|
+
|
198
|
+
# Converts one structure (as produced by parse_pairs / fix_structure_list)
# into a node object by dispatching on its first item.
#
# For :elem the children are built recursively with the context of the
# parsed start tag.  An element with no children and no end tag, whose
# name is in the XHTML namespace and whose ElementContent entry is :EMPTY,
# is built from the start tag alone.
#
# Raises Exception for an unknown structure kind; that indicates a bug.
def HTree.build_node(structure, is_xml, is_html, inherited_context=DefaultContext)
  raw = structure[1]
  case structure[0]
  when :elem
    _, stag_rawstring, children, etag_rawstring = structure
    etag = etag_rawstring && ETag.parse(etag_rawstring, is_xml, is_html)
    stag = STag.parse(stag_rawstring, true, is_xml, is_html, inherited_context)
    bare_empty = children.empty? && !etag &&
      stag.element_name.namespace_uri == 'http://www.w3.org/1999/xhtml' &&
      HTree::ElementContent[stag.element_name.local_name] == :EMPTY
    if bare_empty
      Elem.new!(stag)
    else
      built_children = children.map {|c| build_node(c, is_xml, is_html, stag.context) }
      Elem.new!(stag, built_children, etag)
    end
  when :emptytag
    Elem.new!(STag.parse(raw, false, is_xml, is_html, inherited_context))
  when :text_pcdata then Text.parse_pcdata(raw)
  when :text_cdata_content then Text.parse_cdata_content(raw)
  when :text_cdata_section then Text.parse_cdata_section(raw)
  when :bogus_etag then BogusETag.parse(raw, is_xml, is_html)
  when :xmldecl then XMLDecl.parse(raw)
  when :doctype then DocType.parse(raw, is_xml, is_html)
  when :procins then ProcIns.parse(raw)
  when :comment then Comment.parse(raw)
  else
    raise Exception, "[bug] unknown structure: #{structure.inspect}"
  end
end
|
235
|
+
|
236
|
+
# Parses the raw text of a start tag (is_stag true) or an empty-element
# tag (is_stag false) into an STag.
#
# The text is first tried against the valid tag pattern and then against
# the invalid (tolerant) one; HTree::Error is raised when neither matches.
# When the input is HTML and not XML, the element name and the attribute
# names are downcased.  An attribute written as a bare value gets its name
# from OmittedAttrName for the element when there is an entry, and
# otherwise uses the value itself as its name.
def STag.parse(raw_string, is_stag, is_xml, is_html, inherited_context=DefaultContext)
  attrs = []
  valid_tag = is_stag ? /\A#{Pat::ValidStartTag_C}\z/o : /\A#{Pat::ValidEmptyTag_C}\z/o
  if valid_tag =~ raw_string
    qname, attr_text = $1, $2
    attr_text.scan(Pat::ValidAttr_C) do
      attrs << ($5 ? [nil, $5] : [$1, $2 || $3 || $4])
    end
  elsif (is_stag ? /\A#{Pat::InvalidStartTag_C}\z/o : /\A#{Pat::InvalidEmptyTag_C}\z/o) =~ raw_string
    qname, attr_text, last_attr = $1, $2, $3
    attr_text.scan(Pat::InvalidAttr1_C) do
      attrs << ($5 ? [nil, $5] : [$1, $2 || $3 || $4])
    end
    if last_attr
      /#{Pat::InvalidAttr1End_C}/o =~ last_attr
      attrs << [$1, $2 || $3]
    end
  else
    raise HTree::Error, "cannot recognize as start tag or empty tag: #{raw_string.inspect}"
  end

  fold_case = !is_xml && is_html
  qname = qname.downcase if fold_case

  attrs.map! do |aname, aval|
    if aname
      [fold_case ? aname.downcase : aname, Text.parse_pcdata(aval)]
    else
      # Minimized attribute: only the value was written.
      val2name = OmittedAttrName[qname]
      if val2name
        lowered = aval.downcase
        aname = val2name.fetch(lowered, lowered)
      else
        aname = aval
      end
      [aname, Text.new(aval)]
    end
  end

  stag = STag.new(qname, attrs, inherited_context)
  stag.raw_string = raw_string
  stag
end
|
278
|
+
|
279
|
+
# Parses the raw text of an end tag into an ETag that remembers
# raw_string.  The name is downcased when the input is HTML and not XML.
# Raises HTree::Error when raw_string is not an end tag.
def ETag.parse(raw_string, is_xml, is_html)
  if /\A#{Pat::EndTag_C}\z/o !~ raw_string
    raise HTree::Error, "cannot recognize as end tag: #{raw_string.inspect}"
  end

  qname = (!is_xml && is_html) ? $1.downcase : $1

  etag = new(qname)
  etag.raw_string = raw_string
  etag
end
|
291
|
+
|
292
|
+
# Parses the raw text of an unmatched end tag into a BogusETag that
# remembers raw_string.  The name is downcased when the input is HTML and
# not XML.  Raises HTree::Error when raw_string is not an end tag.
def BogusETag.parse(raw_string, is_xml, is_html)
  if /\A#{Pat::EndTag_C}\z/o !~ raw_string
    raise HTree::Error, "cannot recognize as end tag: #{raw_string.inspect}"
  end

  qname = (!is_xml && is_html) ? $1.downcase : $1

  bogus = new(qname)
  bogus.raw_string = raw_string
  bogus
end
|
304
|
+
|
305
|
+
# Builds a Text from parsed character data, normalizing its ampersands.
#
# * References that already end in ';' are kept as they are.
# * Numeric references without the ';' get one appended.
# * A named reference without the ';' gets one appended when the name
#   matches NamedCharactersPattern.
# * Any other '&' (a bare one, or one followed by an unknown name) is
#   written as '&amp;' so that it cannot be mistaken for a reference.
#
# The resulting Text remembers raw_string.
def Text.parse_pcdata(raw_string)
  fixed = raw_string.gsub(/&(?:(?:#[0-9]+|#x[0-9a-fA-F]+|([A-Za-z][A-Za-z0-9]*));?)?/o) {|s|
    # Save the capture first: the regexps in the when clauses overwrite $1.
    name = $1
    case s
    when /;\z/
      s
    when /\A&#/
      "#{s};"
    when '&'
      '&amp;'
    else
      if NamedCharactersPattern =~ name
        "&#{name};"
      else
        "&amp;#{name}"
      end
    end
  }
  # Reuse the original object when nothing had to be changed.
  fixed = raw_string if fixed == raw_string
  result = Text.new_internal(fixed)
  result.raw_string = raw_string
  result
end
|
328
|
+
|
329
|
+
# Builds a Text from the content of a CDATA element.  raw_string is
# passed to Text.new unchanged and also remembered as the raw string.
def Text.parse_cdata_content(raw_string)
  text = Text.new(raw_string)
  text.raw_string = raw_string
  text
end
|
334
|
+
|
335
|
+
# Builds a Text from a CDATA section.  The text is the first group
# captured by Pat::CDATA_C; raw_string is remembered as the raw string.
# Raises HTree::Error when raw_string is not a CDATA section.
def Text.parse_cdata_section(raw_string)
  if /\A#{Pat::CDATA_C}\z/o !~ raw_string
    raise HTree::Error, "cannot recognize as CDATA section: #{raw_string.inspect}"
  end

  text = Text.new($1)
  text.raw_string = raw_string
  text
end
|
346
|
+
|
347
|
+
# Parses an XML declaration into an XMLDecl that remembers raw_string.
# The version and the encoding each come from one of two capture groups of
# Pat::XmlDecl_C.  standalone becomes true for 'yes', false for 'no' and
# nil otherwise.
# Raises HTree::Error when raw_string is not an XML declaration.
def XMLDecl.parse(raw_string)
  if /\A#{Pat::XmlDecl_C}\z/o !~ raw_string
    raise HTree::Error, "cannot recognize as XML declaration: #{raw_string.inspect}"
  end

  version = $1 || $2
  encoding = $3 || $4
  standalone = { 'yes' => true, 'no' => false }[$5 || $6]

  decl = XMLDecl.new(version, encoding, standalone)
  decl.raw_string = raw_string
  decl
end
|
367
|
+
|
368
|
+
# Parses a document type declaration into a DocType that remembers
# raw_string.  The public and the system identifier each come from one of
# two capture groups of Pat::DocType_C.  The root element name is
# downcased when the input is HTML and not XML.
# Raises HTree::Error when raw_string is not a document type declaration.
def DocType.parse(raw_string, is_xml, is_html)
  unless /\A#{Pat::DocType_C}\z/o =~ raw_string
    # The message used to say "XML declaration", copied from XMLDecl.parse.
    raise HTree::Error, "cannot recognize as document type declaration: #{raw_string.inspect}"
  end

  root_element_name = $1
  public_identifier = $2 || $3
  system_identifier = $4 || $5

  root_element_name = root_element_name.downcase if !is_xml && is_html

  result = DocType.new(root_element_name, public_identifier, system_identifier)
  result.raw_string = raw_string
  result
end
|
383
|
+
|
384
|
+
# Parses a processing instruction into a ProcIns that remembers
# raw_string.  The target and the content are the first and second groups
# captured by Pat::XmlProcIns_C.
# Raises HTree::Error when raw_string is not a processing instruction.
def ProcIns.parse(raw_string)
  if /\A#{Pat::XmlProcIns_C}\z/o !~ raw_string
    raise HTree::Error, "cannot recognize as processing instruction: #{raw_string.inspect}"
  end

  procins = ProcIns.new($1, $2)
  procins.raw_string = raw_string
  procins
end
|
396
|
+
|
397
|
+
# Parses a comment into a Comment that remembers raw_string.  The content
# is the first group captured by Pat::Comment_C.
# Raises HTree::Error when raw_string is not a comment.
def Comment.parse(raw_string)
  if /\A#{Pat::Comment_C}\z/o !~ raw_string
    raise HTree::Error, "cannot recognize as comment: #{raw_string.inspect}"
  end

  comment = Comment.new($1)
  comment.raw_string = raw_string
  comment
end
|
408
|
+
|
409
|
+
# :startdoc:
|
410
|
+
end
|