htree 0.7.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (62) hide show
  1. data.tar.gz.sig +4 -0
  2. data/Makefile +20 -0
  3. data/Manifest +58 -0
  4. data/README +61 -0
  5. data/Rakefile +37 -0
  6. data/htree.gemspec +32 -0
  7. data/init.rb +1 -0
  8. data/install.rb +112 -0
  9. data/lib/htree.rb +97 -0
  10. data/lib/htree/container.rb +8 -0
  11. data/lib/htree/context.rb +69 -0
  12. data/lib/htree/display.rb +46 -0
  13. data/lib/htree/doc.rb +149 -0
  14. data/lib/htree/elem.rb +262 -0
  15. data/lib/htree/encoder.rb +217 -0
  16. data/lib/htree/equality.rb +219 -0
  17. data/lib/htree/extract_text.rb +37 -0
  18. data/lib/htree/fstr.rb +32 -0
  19. data/lib/htree/gencode.rb +193 -0
  20. data/lib/htree/htmlinfo.rb +672 -0
  21. data/lib/htree/inspect.rb +108 -0
  22. data/lib/htree/leaf.rb +92 -0
  23. data/lib/htree/loc.rb +369 -0
  24. data/lib/htree/modules.rb +49 -0
  25. data/lib/htree/name.rb +122 -0
  26. data/lib/htree/output.rb +212 -0
  27. data/lib/htree/parse.rb +410 -0
  28. data/lib/htree/raw_string.rb +127 -0
  29. data/lib/htree/regexp-util.rb +19 -0
  30. data/lib/htree/rexml.rb +131 -0
  31. data/lib/htree/scan.rb +176 -0
  32. data/lib/htree/tag.rb +113 -0
  33. data/lib/htree/template.rb +961 -0
  34. data/lib/htree/text.rb +115 -0
  35. data/lib/htree/traverse.rb +497 -0
  36. data/test-all.rb +5 -0
  37. data/test/assign.html +1 -0
  38. data/test/template.html +4 -0
  39. data/test/test-attr.rb +67 -0
  40. data/test/test-charset.rb +79 -0
  41. data/test/test-context.rb +29 -0
  42. data/test/test-display_xml.rb +45 -0
  43. data/test/test-elem-new.rb +101 -0
  44. data/test/test-encoder.rb +53 -0
  45. data/test/test-equality.rb +55 -0
  46. data/test/test-extract_text.rb +18 -0
  47. data/test/test-gencode.rb +27 -0
  48. data/test/test-leaf.rb +25 -0
  49. data/test/test-loc.rb +60 -0
  50. data/test/test-namespace.rb +147 -0
  51. data/test/test-output.rb +133 -0
  52. data/test/test-parse.rb +115 -0
  53. data/test/test-raw_string.rb +17 -0
  54. data/test/test-rexml.rb +70 -0
  55. data/test/test-scan.rb +153 -0
  56. data/test/test-security.rb +37 -0
  57. data/test/test-subnode.rb +142 -0
  58. data/test/test-template.rb +313 -0
  59. data/test/test-text.rb +43 -0
  60. data/test/test-traverse.rb +69 -0
  61. metadata +166 -0
  62. metadata.gz.sig +1 -0
@@ -0,0 +1,49 @@
1
# Skeleton of the HTree object model.
#
# Every class and module of the library is declared here with a minimal
# body so that the mixin and inheritance relationships are fixed in one
# place.
module HTree
  class Name
    include HTree
  end

  class Context
    include HTree
  end

  # :stopdoc:
  module Tag
    include HTree
  end

  class STag
    include Tag
  end

  class ETag
    include Tag
  end
  # :startdoc:

  # Node kinds: containers (Doc, Elem) and leaves (everything else).
  module Node
    include HTree
  end

  module Container
    include Node
  end

  class Doc
    include Container
  end

  class Elem
    include Container
  end

  module Leaf
    include Node
  end

  class Text
    include Leaf
  end

  class XMLDecl
    include Leaf
  end

  class DocType
    include Leaf
  end

  class ProcIns
    include Leaf
  end

  class Comment
    include Leaf
  end

  class BogusETag
    include Leaf
  end

  # Traversal mixins: each node class gets its own Trav module, which is
  # built on Container::Trav or Leaf::Trav and mixed into the class itself.
  module Traverse
  end

  module Container::Trav
    include Traverse
  end

  module Leaf::Trav
    include Traverse
  end

  class Doc
    module Trav
      include Container::Trav
    end
    include Trav
  end

  class Elem
    module Trav
      include Container::Trav
    end
    include Trav
  end

  class Text
    module Trav
      include Leaf::Trav
    end
    include Trav
  end

  class XMLDecl
    module Trav
      include Leaf::Trav
    end
    include Trav
  end

  class DocType
    module Trav
      include Leaf::Trav
    end
    include Trav
  end

  class ProcIns
    module Trav
      include Leaf::Trav
    end
    include Trav
  end

  class Comment
    module Trav
      include Leaf::Trav
    end
    include Trav
  end

  class BogusETag
    module Trav
      include Leaf::Trav
    end
    include Trav
  end

  # Location classes: each node class gets a Loc subclass of Location that
  # mixes in the node's Trav module and Container::Loc or Leaf::Loc.
  class Location
    include HTree
  end

  module Container::Loc
  end

  module Leaf::Loc
  end

  class Doc
    class Loc < Location
      include Trav, Container::Loc
    end
  end

  class Elem
    class Loc < Location
      include Trav, Container::Loc
    end
  end

  class Text
    class Loc < Location
      include Trav, Leaf::Loc
    end
  end

  class XMLDecl
    class Loc < Location
      include Trav, Leaf::Loc
    end
  end

  class DocType
    class Loc < Location
      include Trav, Leaf::Loc
    end
  end

  class ProcIns
    class Loc < Location
      include Trav, Leaf::Loc
    end
  end

  class Comment
    class Loc < Location
      include Trav, Leaf::Loc
    end
  end

  class BogusETag
    class Loc < Location
      include Trav, Leaf::Loc
    end
  end

  # Error raised by the library itself.
  class Error < StandardError
  end
end
49
+
@@ -0,0 +1,122 @@
1
+ require 'htree/scan' # for Pat::Nmtoken
2
+ require 'htree/context'
3
+
4
+ module HTree
5
+ # Name represents an element name or an attribute name.
6
+ # It consists of a namespace prefix, a namespace URI and a local name.
7
+ class Name
8
+ =begin
9
+ element name prefix uri localname
10
+ {u}n, n with xmlns=u nil 'u' 'n'
11
+ p{u}n, p:n with xmlns:p=u 'p' 'u' 'n'
12
+ n with xmlns='' nil '' 'n'
13
+
14
+ attribute name
15
+ xmlns= 'xmlns' nil nil
16
+ xmlns:n= 'xmlns' nil 'n'
17
+ p{u}n=, p:n= with xmlns:p=u 'p' 'u' 'n'
18
+ n= nil '' 'n'
19
+ =end
20
# Builds the Name for an element name string.
#
# Accepted spellings, tried in this order:
# * "{u}n" or "p{u}n" - the URI is given explicitly; an empty text before
#   the brace means no prefix.
# * "p:n" - used only when context maps the prefix p to a non-empty URI.
# * anything else - placed in the default namespace of context if that is
#   non-empty, otherwise in no namespace.
def Name.parse_element_name(name, context)
  if (braced = /\{(.*)\}/.match(name))
    prefix = braced.pre_match
    prefix = nil if prefix == ''
    return Name.new(prefix, braced[1], braced.post_match)
  end

  if (colon = /:/.match(name))
    prefixed_uri = context.namespace_uri(colon.pre_match)
    unless prefixed_uri.empty?
      return Name.new(colon.pre_match, prefixed_uri, colon.post_match)
    end
  end

  default_uri = context.namespace_uri(nil)
  if default_uri.empty?
    Name.new(nil, '', name)
  else
    Name.new(nil, default_uri, name)
  end
end
33
+
34
# Builds the Name for an attribute name string.
#
# "xmlns" and "xmlns:n" become namespace-declaration names (prefix
# 'xmlns', no URI).  "{u}n" / "p{u}n" carry their URI explicitly.  "p:n"
# is resolved through context when the prefix maps to a non-empty URI.
# Every other name belongs to no namespace; the default namespace is not
# consulted here.
def Name.parse_attribute_name(name, context)
  return Name.new('xmlns', nil, nil) if name == 'xmlns'

  if (declaration = /\Axmlns:/.match(name))
    Name.new('xmlns', nil, declaration.post_match)
  elsif (braced = /\{(.*)\}/.match(name))
    prefix = braced.pre_match
    Name.new(prefix == '' ? nil : prefix, braced[1], braced.post_match)
  else
    colon = /:/.match(name)
    uri = colon && context.namespace_uri(colon.pre_match)
    if colon && !uri.empty?
      Name.new(colon.pre_match, uri, colon.post_match)
    else
      Name.new(nil, '', name)
    end
  end
end
50
+
51
+ NameCache = {}
52
# Interning constructor.
#
# Returns the cached instance for an equal (prefix, URI, local name,
# class) tuple when there is one.  Otherwise the non-nil components are
# copied and frozen, a new instance is built from the copies and stored
# in NameCache.
def Name.new(namespace_prefix, namespace_uri, local_name)
  lookup = [namespace_prefix, namespace_uri, local_name, self]
  NameCache.fetch(lookup) do
    frozen_parts = lookup[0, 3].map { |part| part && part.dup.freeze }
    NameCache[frozen_parts + [self]] = super(*frozen_parts)
  end
end
59
+
60
# Stores the three name components and validates them.
#
# Raises HTree::Error when
# * the prefix or the local name is given but is not an Nmtoken,
# * the prefix is 'xmlns' and a namespace URI is given, or
# * the prefix is not 'xmlns' and the namespace URI is not a String.
def initialize(namespace_prefix, namespace_uri, local_name)
  @namespace_prefix = namespace_prefix
  @namespace_uri = namespace_uri
  @local_name = local_name

  if namespace_prefix && /\A#{Pat::Nmtoken}\z/o !~ namespace_prefix
    raise HTree::Error, "invalid namespace prefix: #{namespace_prefix.inspect}"
  end

  if local_name && /\A#{Pat::Nmtoken}\z/o !~ local_name
    raise HTree::Error, "invalid local name: #{local_name.inspect}"
  end

  if namespace_prefix == 'xmlns'
    # Namespace declarations themselves live in no namespace.
    return if namespace_uri.nil?
    raise HTree::Error, "Name object for xmlns:* must not have namespace URI: #{namespace_uri.inspect}"
  end

  return if namespace_uri.is_a?(String)
  raise HTree::Error, "invalid namespace URI: #{namespace_uri.inspect}"
end
80
+ attr_reader :namespace_prefix, :namespace_uri, :local_name
81
+
82
# True when this name is a namespace declaration: the prefix is 'xmlns'
# and there is no namespace URI.
def xmlns?
  @namespace_uri.nil? && @namespace_prefix == 'xmlns'
end
85
+
86
# Returns "{uri}local" when the name has a non-empty namespace URI,
# otherwise a copy of the local name.
def universal_name
  uri = @namespace_uri
  return @local_name.dup if !uri || uri.empty?

  "{#{uri}}#{@local_name}"
end
93
+
94
# Returns the name as written in markup: "prefix:local" when there is a
# non-empty namespace URI and a prefix, a copy of the local name when
# there is one, and "xmlns" when there is neither a namespace nor a
# local name.
def qualified_name
  namespaced = @namespace_uri && !@namespace_uri.empty?
  return "#{@namespace_prefix}:#{@local_name}" if namespaced && @namespace_prefix
  return @local_name.dup if namespaced

  @local_name ? @local_name.dup : "xmlns"
end
107
+
108
# Returns the name in the brace notation accepted by the parse methods:
# "prefix{uri}local" or "{uri}local" when there is a non-empty namespace
# URI, otherwise a copy of the local name, or "xmlns" when there is no
# local name either.
def to_s
  namespaced = @namespace_uri && !@namespace_uri.empty?
  return "#{@namespace_prefix}{#{@namespace_uri}}#{@local_name}" if namespaced && @namespace_prefix
  return "{#{@namespace_uri}}#{@local_name}" if namespaced

  @local_name ? @local_name.dup : "xmlns"
end
121
+ end
122
+ end
@@ -0,0 +1,212 @@
1
+ require 'htree/encoder'
2
+ require 'htree/doc'
3
+ require 'htree/elem'
4
+ require 'htree/leaf'
5
+ require 'htree/text'
6
+
7
+ module HTree
8
+ # :stopdoc:
9
+
10
class Text
  # Character references for the characters escaped on output.
  ChRef = {
    '>' => '&gt;',
    '<' => '&lt;',
    '"' => '&quot;',
  }

  # Writes the text as element content; only '<' and '>' are replaced.
  def output(out, context=nil)
    escaped = @rcdata.gsub(/[<>]/) { |char| ChRef[char] }
    out.output_text escaped
  end

  # Returns the text escaped for use inside a double-quoted attribute
  # value: '<', '>' and '"' are replaced.
  def to_attvalue_content
    @rcdata.gsub(/[<>"]/) { |char| ChRef[char] }
  end

  # Writes the text as a double-quoted attribute value.
  def output_attvalue(out, context)
    quote = '"'
    out.output_string quote
    out.output_text to_attvalue_content
    out.output_string '"'
  end

  # Writes the text verbatim as CDATA content.
  # Raises ArgumentError if the text contains "</".
  def output_cdata(out)
    plain = to_s
    raise ArgumentError, "CDATA cannot contain '</': #{plain.inspect}" if %r{</} =~ plain

    out.output_string(plain)
  end
end
39
+
40
class Name
  # Writes the name as it appears in a tag.  Namespace declarations are
  # written as "xmlns" or "xmlns:local"; every other name is written as
  # its qualified name.
  def output(out, context)
    # xxx: validate namespace prefix
    rendered =
      if !xmlns?
        qualified_name
      elsif @local_name
        "xmlns:#{@local_name}"
      else
        "xmlns"
      end
    out.output_string rendered
  end

  # Writes name="value", where the value is produced by
  # text.output_attvalue.
  def output_attribute(text, out, context)
    output(out, context)
    out.output_string '='
    text.output_attvalue(out, context)
  end
end
60
+
61
class Doc
  # Writes every child in order.  Children that respond to
  # output_prolog_xmldecl are written through that method, and only the
  # first such child is written; the rest are skipped.
  def output(out, context)
    xmldecl_written = false
    @children.each do |child|
      unless child.respond_to?(:output_prolog_xmldecl)
        child.output(out, context)
        next
      end
      next if xmldecl_written

      child.output_prolog_xmldecl(out, context) # xxx: encoding?
      xmldecl_written = true
    end
  end
end
74
+
75
class Elem
  # Writes the element.
  #
  # XHTML script and style elements have their children written through
  # out.output_cdata_content.  Elements flagged as empty are written as an
  # empty tag.  Every other element is written as start tag, children and
  # end tag; the children receive the context returned by output_stag.
  def output(out, context)
    # The dots of the namespace URI are escaped so that only the exact
    # XHTML namespace matches, not look-alike URIs.
    if %r{\A\{http://www\.w3\.org/1999/xhtml\}(?:script|style)\z} =~ @stag.element_name.universal_name
      children_context = @stag.output_stag(out, context)
      out.output_cdata_content(@children, children_context)
      @stag.output_etag(out, context)
    elsif @empty
      @stag.output_emptytag(out, context)
    else
      children_context = @stag.output_stag(out, context)
      @children.each {|n| n.output(out, children_context) }
      @stag.output_etag(out, context)
    end
  end
end
90
+
91
class STag
  # Writes every attribute that is not a namespace declaration, each
  # preceded by a space, then lets the tag's own context write the
  # namespace declarations.  Returns what output_namespaces returns,
  # which the callers use as the context for the element's children.
  def output_attributes(out, context)
    @attributes.each do |attr_name, attr_value|
      unless attr_name.xmlns?
        out.output_string ' '
        attr_name.output_attribute(attr_value, out, context)
      end
    end
    @context.output_namespaces(out, context)
  end

  # Writes the tag as an empty-element tag.  The line break is placed
  # inside the tag, before the closing delimiter.
  def output_emptytag(out, context)
    out.output_string '<'
    @name.output(out, context)
    inner_context = output_attributes(out, context)
    out.output_string "\n"
    out.output_slash_if_xml
    out.output_string ">"
    inner_context
  end

  # Writes the tag as a start tag and returns the children's context.
  def output_stag(out, context)
    out.output_string '<'
    @name.output(out, context)
    inner_context = output_attributes(out, context)
    out.output_string "\n>"
    inner_context
  end

  # Writes the end tag that corresponds to this start tag.
  def output_etag(out, context)
    out.output_string '</'
    @name.output(out, context)
    out.output_string "\n>"
  end
end
125
+
126
class Context
  # Writes the namespace declarations needed to move from outer_context
  # to this context.
  #
  # A prefix for which outer_context reports nil is collected and handed
  # to out.output_xmlns in one call.  A prefix bound to a different URI
  # in outer_context is written as an xmlns attribute.  Prefixes bound to
  # the same URI produce nothing.  Returns outer_context with this
  # context's namespaces substituted.
  def output_namespaces(out, outer_context)
    undeclared = {}
    @namespaces.each do |prefix, uri|
      inherited_uri = outer_context.namespace_uri(prefix)
      if inherited_uri.nil?
        undeclared[prefix] = uri
        next
      end
      next if inherited_uri == uri

      out.output_string(prefix ? " xmlns:#{prefix}=" : " xmlns=")
      Text.new(uri).output_attvalue(out, outer_context)
    end
    out.output_xmlns(undeclared) unless undeclared.empty?
    outer_context.subst_namespaces(@namespaces)
  end
end
148
+
149
class BogusETag
  # A bogus end tag contributes nothing to the output.
  def output(out, context); end
end
154
+
155
class XMLDecl
  # Writes nothing: the declaration is only emitted through
  # output_prolog_xmldecl.
  def output(out, context); end

  # Writes the XML declaration.  The encoding pseudo-attribute is written
  # only when an encoding is set, and standalone only when it is not nil.
  def output_prolog_xmldecl(out, context)
    pseudo_attributes = ["<?xml version=\"#{@version}\""]
    pseudo_attributes << " encoding=\"#{@encoding}\"" if @encoding
    unless @standalone.nil?
      pseudo_attributes << " standalone=\"#{@standalone ? 'yes' : 'no'}\""
    end
    pseudo_attributes.each { |piece| out.output_string piece }
    out.output_string "?>"
  end
end
171
+
172
class DocType
  # Writes the document type declaration.
  def output(out, context)
    declaration = "<!DOCTYPE #{@root_element_name} #{generate_content}>"
    out.output_string declaration
  end

  # Returns the external identifier part of the declaration: PUBLIC with
  # the public identifier, or the bare keyword SYSTEM, followed by the
  # quoted system identifier when one is set.
  def generate_content # :nodoc:
    keyword = @public_identifier ? "PUBLIC \"#{@public_identifier}\"" : "SYSTEM"
    # Although a system identifier is not omissible in XML,
    # we cannot output it if it is not given.
    return keyword unless @system_identifier

    # Fall back to single quotes when the identifier contains a double quote.
    quote = /"/ =~ @system_identifier ? "'" : '"'
    "#{keyword} #{quote}#{@system_identifier}#{quote}"
  end
end
196
+
197
class ProcIns
  # Writes the processing instruction; the content and the space before
  # it are written only when content is set.
  def output(out, context)
    opening = "<?#{@target}"
    out.output_string opening
    if @content
      out.output_string " #{@content}"
    end
    out.output_string "?>"
  end
end
204
+
205
class Comment
  # Writes the comment with its delimiters; the content is not escaped.
  def output(out, context)
    markup = "<!--#{@content}-->"
    out.output_string markup
  end
end
210
+
211
+ # :startdoc:
212
+ end
@@ -0,0 +1,410 @@
1
+ require 'htree/scan'
2
+ require 'htree/htmlinfo'
3
+ require 'htree/text'
4
+ require 'htree/tag'
5
+ require 'htree/leaf'
6
+ require 'htree/doc'
7
+ require 'htree/elem'
8
+ require 'htree/raw_string'
9
+ require 'htree/context'
10
+ require 'htree/encoder'
11
+ require 'htree/fstr'
12
+
13
+ module HTree
14
+ # HTree.parse parses <i>input</i> and returns a document tree
15
+ # represented by HTree::Doc.
16
+ #
17
+ # <i>input</i> should be a String or
18
+ # an object which responds to the read or open method.
19
+ # For example, IO, StringIO, Pathname, URI::HTTP and URI::FTP are acceptable.
20
+ # Note that the URIs need open-uri.
21
+ #
22
+ # HTree.parse guesses <i>input</i> is HTML or not and XML or not.
23
+ #
24
+ # If it is guessed as HTML, the default namespace in the result is set to http://www.w3.org/1999/xhtml
25
+ # regardless of whether <i>input</i> has an XML namespace declaration, even if it is pre-XML HTML.
26
+ #
27
+ # If it is guessed as HTML and not XML, all element and attribute names are downcased.
28
+ #
29
+ # If opened file or read content has charset method,
30
+ # HTree.parse decodes it according to $KCODE before parsing.
31
+ # Otherwise HTree.parse assumes the character encoding of the content is
32
+ # compatible to $KCODE.
33
+ # Note that the charset method is provided by URI::HTTP with open-uri.
34
def HTree.parse(input)
  # is_xml starts out false here; the scanner decides whether the input
  # is XML and/or HTML.
  HTree.with_frozen_string_hash do
    parse_as(input, false)
  end
end
39
+
40
+ # HTree.parse_xml parses <i>input</i> as XML and
41
+ # returns a document tree represented by HTree::Doc.
42
+ #
43
+ # It behaves almost the same as HTree.parse but it assumes <i>input</i> is XML
44
+ # even if there is no XML declaration.
45
+ # The assumption causes the following differences.
46
+ # * doesn't downcase element name.
47
+ # * The content of <script> and <style> element is PCDATA, not CDATA.
48
def HTree.parse_xml(input)
  # Same pipeline as HTree.parse, but the input is declared to be XML.
  HTree.with_frozen_string_hash do
    parse_as(input, true)
  end
end
53
+
54
+ # :stopdoc:
55
+
56
# Reads input, converts it to the internal charset when its charset is
# known, scans it into tokens and builds the document tree.
#
# Raises SecurityError when input is tainted and $SAFE is 1 or higher.
def HTree.parse_as(input, is_xml)
  raise SecurityError, "input tainted" if input.tainted? && 1 <= $SAFE

  source_charset = nil
  if input.respond_to? :read # IO, StringIO
    input = input.read.untaint
    # Here the charset is looked up on the content that was read.
    source_charset = input.charset if input.respond_to? :charset
  elsif input.respond_to? :open # Pathname, URI with open-uri
    input.open do |stream|
      input = stream.read.untaint
      # Here the charset is looked up on the opened stream.
      source_charset = stream.charset if stream.respond_to? :charset
    end
  end

  if source_charset && source_charset != Encoder.internal_charset
    input = Iconv.conv(Encoder.internal_charset, source_charset, input)
  end

  tokens = []
  # The scanner reports back whether the input turned out to be XML/HTML.
  is_xml, is_html = HTree.scan(input, is_xml) { |token| tokens << token }
  root_context = is_html ? HTMLContext : DefaultContext

  paired = parse_pairs(tokens, is_xml, is_html)
  repaired = fix_structure_list(paired, is_xml, is_html)
  Doc.new(repaired.map { |structure| build_node(structure, is_xml, is_html, root_context) })
end
84
+
85
# Pairs start tags with end tags and nests the tokens accordingly.
#
# Returns the list of top-level structures.  An element is represented as
# [:elem, raw start tag, children] and carries the raw end tag as a
# fourth item only when an end tag closed it explicitly.  An end tag with
# no open start tag of the same name becomes [:bogus_etag, raw end tag].
# All other tokens are passed through unchanged.
def HTree.parse_pairs(tokens, is_xml, is_html)
  downcase_names = !is_xml && is_html
  # Each entry is [tag name, raw start tag, children]; the bottom entry
  # stands for the document itself.
  stack = [[nil, nil, []]]

  tokens.each do |token|
    case token[0]
    when :stag
      stag_raw_string = token[1]
      stagname = stag_raw_string[Pat::Name]
      stagname = stagname.downcase if downcase_names
      stack << [HTree.frozen_string(stagname), stag_raw_string, []]
    when :etag
      etag_raw_string = token[1]
      etagname = etag_raw_string[Pat::Name]
      etagname = etagname.downcase if downcase_names
      etagname = HTree.frozen_string(etagname)

      # Find the innermost open element with the same name.
      matched_elem = nil
      stack.reverse_each do |open_elem|
        if open_elem[0] == etagname
          matched_elem = open_elem
          break
        end
      end

      if matched_elem
        # Elements opened after the matched one are closed implicitly,
        # without an end tag.
        until matched_elem.equal? stack.last
          _, unclosed_stag, unclosed_children = stack.pop
          stack.last[2] << [:elem, unclosed_stag, unclosed_children]
        end
        _, closed_stag, closed_children = stack.pop
        stack.last[2] << [:elem, closed_stag, closed_children, etag_raw_string]
      else
        stack.last[2] << [:bogus_etag, etag_raw_string]
      end
    else
      stack.last[2] << token
    end
  end

  # Whatever is still open at the end of input is closed implicitly.
  while 1 < stack.length
    _, unclosed_stag, unclosed_children = stack.pop
    stack.last[2] << [:elem, unclosed_stag, unclosed_children]
  end
  stack[0][2]
end
129
+
130
# Repairs a list of sibling structures.
#
# Every element is passed through fix_element; the structures that
# fix_element hands back as not belonging to the element are processed
# next, ahead of the remaining siblings.  Non-element structures are
# copied as they are.  The given list itself is not modified.
def HTree.fix_structure_list(structure_list, is_xml, is_html)
  fixed = []
  pending = structure_list.dup
  until pending.empty?
    structure = pending.shift
    unless structure[0] == :elem
      fixed << structure
      next
    end

    repaired, spilled = fix_element(structure, [], [], is_xml, is_html)
    fixed << repaired
    pending = spilled + pending
  end
  fixed
end
145
+
146
# Repairs one element structure.
#
# Returns a pair [fixed element, rest], where rest holds the structures
# that were nested in the element by parse_pairs but cannot be its
# children and therefore have to follow it as siblings.
#
# * An element closed by an explicit end tag keeps all of its children
#   (they are repaired recursively) and rest is empty.
# * An element whose content model is :EMPTY gets no children; all of
#   them are returned as rest.
# * Otherwise children are consumed until the first child element that
#   the element cannot contain; that child and everything after it are
#   returned as rest.
#
# excluded_tags and included_tags accumulate the exclusions and
# inclusions of the enclosing elements.  The arrays inside elem are left
# unmodified.
def HTree.fix_element(elem, excluded_tags, included_tags, is_xml, is_html)
  stag_raw_string = elem[1]
  children = elem[2]
  if etag_raw_string = elem[3]
    return [:elem, stag_raw_string, fix_structure_list(children, is_xml, is_html), etag_raw_string], []
  end

  downcase_names = !is_xml && is_html
  tagname = stag_raw_string[Pat::Name]
  tagname = tagname.downcase if downcase_names
  content_model = ElementContent[tagname]
  return [:elem, stag_raw_string, []], children if content_model == :EMPTY

  possible_tags = content_model == :CDATA ? [] : content_model
  if possible_tags
    excluded_tags2 = ElementExclusions[tagname]
    included_tags2 = ElementInclusions[tagname]
    excluded_tags |= excluded_tags2 if excluded_tags2
    included_tags |= included_tags2 if included_tags2
    containable_tags = (possible_tags | included_tags) - excluded_tags
    uncontainable_tags = ElementContent.keys - containable_tags
  else
    # If the tagname is unknown, it is assumed that any element
    # except excluded can be contained.
    uncontainable_tags = excluded_tags
  end

  fixed_children = []
  # Work on a copy so that shifting below does not alter the caller's array.
  rest = children.dup
  until rest.empty?
    if rest[0][0] == :elem
      child = rest.shift
      child_tagname = child[1][Pat::Name]
      child_tagname = child_tagname.downcase if downcase_names
      if uncontainable_tags.include? child_tagname
        # This child ends the element; put it back for the caller.
        rest.unshift child
        break
      end
      fixed_child, rest2 = fix_element(child, excluded_tags, included_tags, is_xml, is_html)
      fixed_children << fixed_child
      rest = rest2 + rest
    else
      fixed_children << rest.shift
    end
  end
  return [:elem, stag_raw_string, fixed_children], rest
end
197
+
198
# Converts one structure produced by the repair pass into a node object.
#
# Elements are built recursively; their children are parsed in the
# context of the element's start tag.  An element with no children and
# no end tag, in the XHTML namespace and with content model :EMPTY, is
# built from its start tag alone.
def HTree.build_node(structure, is_xml, is_html, inherited_context=DefaultContext)
  case structure[0]
  when :text_pcdata then Text.parse_pcdata(structure[1])
  when :elem
    _, stag_rawstring, children, etag_rawstring = structure
    etag = etag_rawstring && ETag.parse(etag_rawstring, is_xml, is_html)
    stag = STag.parse(stag_rawstring, true, is_xml, is_html, inherited_context)
    bare_empty_element =
      children.empty? && !etag &&
      stag.element_name.namespace_uri == 'http://www.w3.org/1999/xhtml' &&
      HTree::ElementContent[stag.element_name.local_name] == :EMPTY
    if bare_empty_element
      Elem.new!(stag)
    else
      child_nodes = children.map { |child| build_node(child, is_xml, is_html, stag.context) }
      Elem.new!(stag, child_nodes, etag)
    end
  when :emptytag
    Elem.new!(STag.parse(structure[1], false, is_xml, is_html, inherited_context))
  when :bogus_etag then BogusETag.parse(structure[1], is_xml, is_html)
  when :xmldecl then XMLDecl.parse(structure[1])
  when :doctype then DocType.parse(structure[1], is_xml, is_html)
  when :procins then ProcIns.parse(structure[1])
  when :comment then Comment.parse(structure[1])
  when :text_cdata_content then Text.parse_cdata_content(structure[1])
  when :text_cdata_section then Text.parse_cdata_section(structure[1])
  else
    raise Exception, "[bug] unknown structure: #{structure.inspect}"
  end
end
235
+
236
# Parses the raw text of a start tag (is_stag true) or an empty-element
# tag (is_stag false) into an STag.
#
# The strict patterns are tried first and the lenient "invalid" patterns
# second; HTree::Error is raised when neither matches.  For HTML that is
# not XML the tag name and the attribute names are downcased.  An
# attribute written as a bare value gets its name from OmittedAttrName
# when the element has an entry there, and otherwise is named after the
# value itself.
def STag.parse(raw_string, is_stag, is_xml, is_html, inherited_context=DefaultContext)
  downcase_names = !is_xml && is_html
  attrs = []

  if (tag = (is_stag ? /\A#{Pat::ValidStartTag_C}\z/o : /\A#{Pat::ValidEmptyTag_C}\z/o).match(raw_string))
    qname = tag[1]
    tag[2].scan(Pat::ValidAttr_C) do
      # Group 5 is set when the attribute is a bare value without a name.
      attr = Regexp.last_match
      attrs << (attr[5] ? [nil, attr[5]] : [attr[1], attr[2] || attr[3] || attr[4]])
    end
  elsif (tag = (is_stag ? /\A#{Pat::InvalidStartTag_C}\z/o : /\A#{Pat::InvalidEmptyTag_C}\z/o).match(raw_string))
    qname = tag[1]
    last_attr = tag[3]
    tag[2].scan(Pat::InvalidAttr1_C) do
      attr = Regexp.last_match
      attrs << (attr[5] ? [nil, attr[5]] : [attr[1], attr[2] || attr[3] || attr[4]])
    end
    if last_attr
      /#{Pat::InvalidAttr1End_C}/o =~ last_attr
      attrs << [Regexp.last_match(1), Regexp.last_match(2) || Regexp.last_match(3)]
    end
  else
    raise HTree::Error, "cannot recognize as start tag or empty tag: #{raw_string.inspect}"
  end

  qname = qname.downcase if downcase_names

  attrs.map! do |aname, aval|
    if aname
      [downcase_names ? aname.downcase : aname, Text.parse_pcdata(aval)]
    else
      val2name = OmittedAttrName[qname]
      inferred_name =
        if val2name
          aval_downcase = aval.downcase
          val2name.fetch(aval_downcase, aval_downcase)
        else
          aval
        end
      [inferred_name, Text.new(aval)]
    end
  end

  stag = STag.new(qname, attrs, inherited_context)
  stag.raw_string = raw_string
  stag
end
278
+
279
# Parses the raw text of an end tag.  The tag name is downcased for HTML
# that is not XML.  Raises HTree::Error when the text is not an end tag.
def ETag.parse(raw_string, is_xml, is_html)
  matched = /\A#{Pat::EndTag_C}\z/o.match(raw_string)
  raise HTree::Error, "cannot recognize as end tag: #{raw_string.inspect}" unless matched

  qname = matched[1]
  qname = qname.downcase if is_html && !is_xml

  etag = new(qname)
  etag.raw_string = raw_string
  etag
end
291
+
292
# Parses the raw text of an end tag that has no matching start tag.  The
# tag name is downcased for HTML that is not XML.  Raises HTree::Error
# when the text is not an end tag.
def BogusETag.parse(raw_string, is_xml, is_html)
  matched = /\A#{Pat::EndTag_C}\z/o.match(raw_string)
  raise HTree::Error, "cannot recognize as end tag: #{raw_string.inspect}" unless matched

  qname = matched[1]
  qname = qname.downcase if is_html && !is_xml

  bogus = new(qname)
  bogus.raw_string = raw_string
  bogus
end
304
+
305
# Builds a Text from raw PCDATA, normalizing every '&' on the way:
# * a reference that already ends with ';' is kept,
# * a numeric reference without ';' gets one appended,
# * a lone '&' becomes '&amp;',
# * a name without ';' becomes a reference when it matches
#   NamedCharactersPattern, and otherwise its '&' is escaped.
# When nothing had to be changed the raw string object itself is used.
def Text.parse_pcdata(raw_string)
  fixed = raw_string.gsub(/&(?:(?:#[0-9]+|#x[0-9a-fA-F]+|([A-Za-z][A-Za-z0-9]*));?)?/o) do |ref|
    # Grab the captured name before the tests below overwrite $~.
    name = $1
    if /;\z/ =~ ref
      ref
    elsif /\A&#/ =~ ref
      "#{ref};"
    elsif ref == '&'
      '&amp;'
    elsif NamedCharactersPattern =~ name
      "&#{name};"
    else
      "&amp;#{name}"
    end
  end

  text = Text.new_internal(fixed == raw_string ? raw_string : fixed)
  text.raw_string = raw_string
  text
end
328
+
329
# Builds a Text from the raw content of a CDATA element; the content is
# passed to Text.new as it is.
def Text.parse_cdata_content(raw_string)
  text = Text.new(raw_string)
  text.raw_string = raw_string
  text
end
334
+
335
# Builds a Text from the raw text of a CDATA section, using the part
# captured by Pat::CDATA_C as the content.  Raises HTree::Error when the
# text is not a CDATA section.
def Text.parse_cdata_section(raw_string)
  section = /\A#{Pat::CDATA_C}\z/o.match(raw_string)
  raise HTree::Error, "cannot recognize as CDATA section: #{raw_string.inspect}" unless section

  text = Text.new(section[1])
  text.raw_string = raw_string
  text
end
346
+
347
# Parses the raw text of an XML declaration.  standalone becomes true for
# 'yes', false for 'no' and nil otherwise.  Raises HTree::Error when the
# text is not an XML declaration.
def XMLDecl.parse(raw_string)
  decl = /\A#{Pat::XmlDecl_C}\z/o.match(raw_string)
  raise HTree::Error, "cannot recognize as XML declaration: #{raw_string.inspect}" unless decl

  # Each value is captured by one of two alternative groups.
  version = decl[1] || decl[2]
  encoding = decl[3] || decl[4]
  standalone = { 'yes' => true, 'no' => false }[decl[5] || decl[6]]

  xmldecl = XMLDecl.new(version, encoding, standalone)
  xmldecl.raw_string = raw_string
  xmldecl
end
367
+
368
# Parses the raw text of a document type declaration.  The root element
# name is downcased for HTML that is not XML.  Raises HTree::Error when
# the text is not a document type declaration.
def DocType.parse(raw_string, is_xml, is_html)
  unless /\A#{Pat::DocType_C}\z/o =~ raw_string
    raise HTree::Error, "cannot recognize as document type declaration: #{raw_string.inspect}"
  end

  root_element_name = $1
  # Each identifier is captured by one of two alternative groups.
  public_identifier = $2 || $3
  system_identifier = $4 || $5

  root_element_name = root_element_name.downcase if !is_xml && is_html

  result = DocType.new(root_element_name, public_identifier, system_identifier)
  result.raw_string = raw_string
  result
end
383
+
384
# Parses the raw text of a processing instruction into its target and
# content.  Raises HTree::Error when the text is not a processing
# instruction.
def ProcIns.parse(raw_string)
  instruction = /\A#{Pat::XmlProcIns_C}\z/o.match(raw_string)
  unless instruction
    raise HTree::Error, "cannot recognize as processing instruction: #{raw_string.inspect}"
  end

  procins = ProcIns.new(instruction[1], instruction[2])
  procins.raw_string = raw_string
  procins
end
396
+
397
# Parses the raw text of a comment, using the part captured by
# Pat::Comment_C as the content.  Raises HTree::Error when the text is
# not a comment.
def Comment.parse(raw_string)
  matched = /\A#{Pat::Comment_C}\z/o.match(raw_string)
  raise HTree::Error, "cannot recognize as comment: #{raw_string.inspect}" unless matched

  comment = Comment.new(matched[1])
  comment.raw_string = raw_string
  comment
end
408
+
409
+ # :startdoc:
410
+ end