htree 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. data.tar.gz.sig +4 -0
  2. data/Makefile +20 -0
  3. data/Manifest +58 -0
  4. data/README +61 -0
  5. data/Rakefile +37 -0
  6. data/htree.gemspec +32 -0
  7. data/init.rb +1 -0
  8. data/install.rb +112 -0
  9. data/lib/htree.rb +97 -0
  10. data/lib/htree/container.rb +8 -0
  11. data/lib/htree/context.rb +69 -0
  12. data/lib/htree/display.rb +46 -0
  13. data/lib/htree/doc.rb +149 -0
  14. data/lib/htree/elem.rb +262 -0
  15. data/lib/htree/encoder.rb +217 -0
  16. data/lib/htree/equality.rb +219 -0
  17. data/lib/htree/extract_text.rb +37 -0
  18. data/lib/htree/fstr.rb +32 -0
  19. data/lib/htree/gencode.rb +193 -0
  20. data/lib/htree/htmlinfo.rb +672 -0
  21. data/lib/htree/inspect.rb +108 -0
  22. data/lib/htree/leaf.rb +92 -0
  23. data/lib/htree/loc.rb +369 -0
  24. data/lib/htree/modules.rb +49 -0
  25. data/lib/htree/name.rb +122 -0
  26. data/lib/htree/output.rb +212 -0
  27. data/lib/htree/parse.rb +410 -0
  28. data/lib/htree/raw_string.rb +127 -0
  29. data/lib/htree/regexp-util.rb +19 -0
  30. data/lib/htree/rexml.rb +131 -0
  31. data/lib/htree/scan.rb +176 -0
  32. data/lib/htree/tag.rb +113 -0
  33. data/lib/htree/template.rb +961 -0
  34. data/lib/htree/text.rb +115 -0
  35. data/lib/htree/traverse.rb +497 -0
  36. data/test-all.rb +5 -0
  37. data/test/assign.html +1 -0
  38. data/test/template.html +4 -0
  39. data/test/test-attr.rb +67 -0
  40. data/test/test-charset.rb +79 -0
  41. data/test/test-context.rb +29 -0
  42. data/test/test-display_xml.rb +45 -0
  43. data/test/test-elem-new.rb +101 -0
  44. data/test/test-encoder.rb +53 -0
  45. data/test/test-equality.rb +55 -0
  46. data/test/test-extract_text.rb +18 -0
  47. data/test/test-gencode.rb +27 -0
  48. data/test/test-leaf.rb +25 -0
  49. data/test/test-loc.rb +60 -0
  50. data/test/test-namespace.rb +147 -0
  51. data/test/test-output.rb +133 -0
  52. data/test/test-parse.rb +115 -0
  53. data/test/test-raw_string.rb +17 -0
  54. data/test/test-rexml.rb +70 -0
  55. data/test/test-scan.rb +153 -0
  56. data/test/test-security.rb +37 -0
  57. data/test/test-subnode.rb +142 -0
  58. data/test/test-template.rb +313 -0
  59. data/test/test-text.rb +43 -0
  60. data/test/test-traverse.rb +69 -0
  61. metadata +166 -0
  62. metadata.gz.sig +1 -0
@@ -0,0 +1,49 @@
1
# Skeleton of the HTree constant hierarchy.
#
# Only the classes/modules and their mixin and inheritance relationships are
# declared here; no methods are defined in this block.
module HTree
  class Name
    include HTree
  end

  class Context
    include HTree
  end

  # :stopdoc:
  module Tag
    include HTree
  end

  class STag
    include Tag
  end

  class ETag
    include Tag
  end
  # :startdoc:

  # Node hierarchy: Container nodes (Doc, Elem) and Leaf nodes.
  module Node
    include HTree
  end

  module Container
    include Node
  end

  class Doc
    include Container
  end

  class Elem
    include Container
  end

  module Leaf
    include Node
  end

  class Text
    include Leaf
  end

  class XMLDecl
    include Leaf
  end

  class DocType
    include Leaf
  end

  class ProcIns
    include Leaf
  end

  class Comment
    include Leaf
  end

  class BogusETag
    include Leaf
  end

  # Traversal mixins: every node class gets its own Trav module which is
  # built on Container::Trav or Leaf::Trav and is included into the class.
  module Traverse
  end

  module Container::Trav
    include Traverse
  end

  module Leaf::Trav
    include Traverse
  end

  class Doc
    module Trav
      include Container::Trav
    end
    include Trav
  end

  class Elem
    module Trav
      include Container::Trav
    end
    include Trav
  end

  class Text
    module Trav
      include Leaf::Trav
    end
    include Trav
  end

  class XMLDecl
    module Trav
      include Leaf::Trav
    end
    include Trav
  end

  class DocType
    module Trav
      include Leaf::Trav
    end
    include Trav
  end

  class ProcIns
    module Trav
      include Leaf::Trav
    end
    include Trav
  end

  class Comment
    module Trav
      include Leaf::Trav
    end
    include Trav
  end

  class BogusETag
    module Trav
      include Leaf::Trav
    end
    include Trav
  end

  # Location classes: every node class gets a nested Loc subclass of
  # Location that mixes in the node's Trav module and the matching
  # Container::Loc or Leaf::Loc module.
  class Location
    include HTree
  end

  module Container::Loc
  end

  module Leaf::Loc
  end

  class Doc
    class Loc < Location
      include Trav, Container::Loc
    end
  end

  class Elem
    class Loc < Location
      include Trav, Container::Loc
    end
  end

  class Text
    class Loc < Location
      include Trav, Leaf::Loc
    end
  end

  class XMLDecl
    class Loc < Location
      include Trav, Leaf::Loc
    end
  end

  class DocType
    class Loc < Location
      include Trav, Leaf::Loc
    end
  end

  class ProcIns
    class Loc < Location
      include Trav, Leaf::Loc
    end
  end

  class Comment
    class Loc < Location
      include Trav, Leaf::Loc
    end
  end

  class BogusETag
    class Loc < Location
      include Trav, Leaf::Loc
    end
  end

  class Error < StandardError
  end
end
49
+
@@ -0,0 +1,122 @@
1
+ require 'htree/scan' # for Pat::Nmtoken
2
+ require 'htree/context'
3
+
4
+ module HTree
5
+ # Name represents an element name or an attribute name.
6
+ # It consists of a namespace prefix, a namespace URI and a local name.
7
class Name
  # Cache of Name objects keyed by [prefix, uri, local name, class], so that
  # equal arguments to Name.new yield the very same object.
  NameCache = {}

  attr_reader :namespace_prefix, :namespace_uri, :local_name

  # How the three components are filled in:
  #
  #   element name                  prefix   uri  localname
  #   {u}n, n with xmlns=u          nil      'u'  'n'
  #   p{u}n, p:n with xmlns:p=u     'p'      'u'  'n'
  #   n with xmlns=''               nil      ''   'n'
  #
  #   attribute name
  #   xmlns=                        'xmlns'  nil  nil
  #   xmlns:n=                      'xmlns'  nil  'n'
  #   p{u}n=, p:n= with xmlns:p=u   'p'      'u'  'n'
  #   n=                            nil      ''   'n'

  # Builds the Name for the element name string +name+.
  # +context+ must respond to namespace_uri(prefix); it is consulted for
  # "p:n" names and for the default namespace (prefix nil).
  def Name.parse_element_name(name, context)
    braced = /\{(.*)\}/.match(name)
    if braced
      # "{u}n" means "use default namespace",
      # "p{u}n" means "use the specified prefix p"
      prefix = braced.pre_match
      return Name.new(prefix == '' ? nil : prefix, braced[1], braced.post_match)
    end

    colon = /:/.match(name)
    if colon && !context.namespace_uri(colon.pre_match).empty?
      return Name.new(colon.pre_match, context.namespace_uri(colon.pre_match), colon.post_match)
    end

    return Name.new(nil, '', name) if context.namespace_uri(nil).empty?
    Name.new(nil, context.namespace_uri(nil), name)
  end

  # Builds the Name for the attribute name string +name+.
  # Unlike element names, an unprefixed attribute never takes the default
  # namespace, and "xmlns" / "xmlns:n" produce namespace declaration names.
  def Name.parse_attribute_name(name, context)
    return Name.new('xmlns', nil, nil) if name == 'xmlns'

    declaration = /\Axmlns:/.match(name)
    return Name.new('xmlns', nil, declaration.post_match) if declaration

    braced = /\{(.*)\}/.match(name)
    if braced
      prefix = braced.pre_match
      return Name.new(prefix == '' ? nil : prefix, braced[1], braced.post_match)
    end

    colon = /:/.match(name)
    if colon && !context.namespace_uri(colon.pre_match).empty?
      return Name.new(colon.pre_match, context.namespace_uri(colon.pre_match), colon.post_match)
    end

    Name.new(nil, '', name)
  end

  # Returns the cached Name for the given components, creating and caching
  # it (with frozen copies of the component strings) on first use.
  def Name.new(namespace_prefix, namespace_uri, local_name)
    lookup = [namespace_prefix, namespace_uri, local_name, self]
    NameCache.fetch(lookup) do
      parts = lookup[0, 3].map {|part| part ? part.dup.freeze : part }
      NameCache[parts + [self]] = super(parts[0], parts[1], parts[2])
    end
  end

  # Stores the components and validates them.
  # Raises HTree::Error when the prefix or local name does not match
  # Pat::Nmtoken, when an xmlns name carries a namespace URI, or when a
  # non-xmlns name has a namespace URI that is not a String.
  def initialize(namespace_prefix, namespace_uri, local_name)
    @namespace_prefix = namespace_prefix
    @namespace_uri = namespace_uri
    @local_name = local_name

    if @namespace_prefix
      unless /\A#{Pat::Nmtoken}\z/o =~ @namespace_prefix
        raise HTree::Error, "invalid namespace prefix: #{@namespace_prefix.inspect}"
      end
    end

    if @local_name
      unless /\A#{Pat::Nmtoken}\z/o =~ @local_name
        raise HTree::Error, "invalid local name: #{@local_name.inspect}"
      end
    end

    if @namespace_prefix == 'xmlns'
      unless @namespace_uri.nil?
        raise HTree::Error, "Name object for xmlns:* must not have namespace URI: #{@namespace_uri.inspect}"
      end
    elsif !@namespace_uri.is_a?(String)
      raise HTree::Error, "invalid namespace URI: #{@namespace_uri.inspect}"
    end
  end

  # True for the names of namespace declarations ("xmlns" and "xmlns:n").
  def xmlns?
    @namespace_uri.nil? && @namespace_prefix == 'xmlns'
  end

  # "{uri}local" when the namespace URI is non-empty, otherwise a copy of
  # the local name.
  def universal_name
    return @local_name.dup unless @namespace_uri && !@namespace_uri.empty?
    "{#{@namespace_uri}}#{@local_name}"
  end

  # "prefix:local" when both a non-empty namespace URI and a prefix are
  # present; otherwise a copy of the local name, or "xmlns" when there is
  # neither a namespace URI nor a local name.
  def qualified_name
    namespaced = @namespace_uri && !@namespace_uri.empty?
    return "#{@namespace_prefix}:#{@local_name}" if namespaced && @namespace_prefix
    return @local_name.dup if namespaced || @local_name
    "xmlns"
  end

  # "prefix{uri}local" or "{uri}local" for namespaced names; otherwise a
  # copy of the local name, or "xmlns" when there is no local name.
  def to_s
    namespaced = @namespace_uri && !@namespace_uri.empty?
    if namespaced && @namespace_prefix
      "#{@namespace_prefix}{#{@namespace_uri}}#{@local_name}"
    elsif namespaced
      "{#{@namespace_uri}}#{@local_name}"
    elsif @local_name
      @local_name.dup
    else
      "xmlns"
    end
  end
end
122
+ end
@@ -0,0 +1,212 @@
1
+ require 'htree/encoder'
2
+ require 'htree/doc'
3
+ require 'htree/elem'
4
+ require 'htree/leaf'
5
+ require 'htree/text'
6
+
7
+ module HTree
8
+ # :stopdoc:
9
+
10
class Text
  # Character references substituted for markup characters on output.
  ChRef = {
    '<' => '&lt;',
    '>' => '&gt;',
    '"' => '&quot;',
  }

  # Passes the text to out.output_text with only "<" and ">" replaced by
  # character references. +context+ is not used.
  def output(out, context=nil)
    escaped = @rcdata.gsub(/[<>]/) {|ch| ChRef[ch] }
    out.output_text escaped
  end

  # Returns the text with "<", ">" and '"' replaced by character references,
  # i.e. in a form usable between double quotes.
  def to_attvalue_content
    @rcdata.gsub(/[<>"]/) {|ch| ChRef[ch] }
  end

  # Writes the text to +out+ as a double-quoted attribute value.
  def output_attvalue(out, context)
    out.output_string('"')
    out.output_text(to_attvalue_content)
    out.output_string('"')
  end

  # Writes to_s verbatim to +out+.
  # Raises ArgumentError if the string contains "</".
  def output_cdata(out)
    content = self.to_s
    raise ArgumentError, "CDATA cannot contain '</': #{content.inspect}" if %r{</} =~ content
    out.output_string(content)
  end
end
39
+
40
class Name
  # Writes the name to +out+: "xmlns" or "xmlns:local" for namespace
  # declaration names, the qualified name otherwise. +context+ is not used.
  def output(out, context)
    # xxx: validate namespace prefix
    rendered =
      if !xmlns?
        qualified_name
      elsif @local_name
        "xmlns:#{@local_name}"
      else
        "xmlns"
      end
    out.output_string rendered
  end

  # Writes name="value" to +out+; +text+ supplies the quoted value through
  # its output_attvalue method.
  def output_attribute(text, out, context)
    output(out, context)
    out.output_string('=')
    text.output_attvalue(out, context)
  end
end
60
+
61
class Doc
  # Writes every child to +out+ in order.
  # Children responding to output_prolog_xmldecl are written through that
  # method, and only the first such child is written at all.
  def output(out, context)
    xmldecl_seen = false
    @children.each do |node|
      unless node.respond_to? :output_prolog_xmldecl
        node.output(out, context)
        next
      end
      node.output_prolog_xmldecl(out, context) unless xmldecl_seen # xxx: encoding?
      xmldecl_seen = true
    end
  end
end
74
+
75
class Elem
  # Writes the element to +out+.
  #
  # XHTML script and style elements have their children written through
  # out.output_cdata_content; an element flagged empty is written as a
  # single empty tag; any other element is written as start tag, children
  # and end tag. Children receive the context returned by output_stag.
  def output(out, context)
    universal = @stag.element_name.universal_name
    if %r{\A\{http://www.w3.org/1999/xhtml\}(?:script|style)\z} =~ universal
      out.output_cdata_content(@children, @stag.output_stag(out, context))
    elsif @empty
      return @stag.output_emptytag(out, context)
    else
      inner_context = @stag.output_stag(out, context)
      @children.each {|child| child.output(out, inner_context) }
    end
    @stag.output_etag(out, context)
  end
end
90
+
91
class STag
  # Writes every attribute that is not a namespace declaration, each
  # preceded by a space, then lets the tag's own context write the namespace
  # declarations. Returns whatever output_namespaces returns, which the
  # callers below hand back as the context for the element's children.
  def output_attributes(out, context)
    @attributes.each do |aname, text|
      unless aname.xmlns?
        out.output_string(' ')
        aname.output_attribute(text, out, context)
      end
    end
    @context.output_namespaces(out, context)
  end

  # Writes the tag in empty-element form. The newline is emitted inside the
  # tag, before the optional slash and the closing ">".
  def output_emptytag(out, context)
    out.output_string('<')
    @name.output(out, context)
    inner_context = output_attributes(out, context)
    out.output_string("\n")
    out.output_slash_if_xml
    out.output_string(">")
    inner_context
  end

  # Writes the tag as a start tag and returns the context for the children.
  def output_stag(out, context)
    out.output_string('<')
    @name.output(out, context)
    inner_context = output_attributes(out, context)
    out.output_string("\n>")
    inner_context
  end

  # Writes the end tag that corresponds to this start tag.
  def output_etag(out, context)
    out.output_string('</')
    @name.output(out, context)
    out.output_string("\n>")
  end
end
125
+
126
class Context
  # Writes the namespace declarations needed to get from +outer_context+ to
  # this context.
  #
  # A binding whose URI differs from the one known to +outer_context+ is
  # written as an xmlns attribute. Bindings for which +outer_context+
  # returns nil are collected and handed to out.output_xmlns in one call.
  # Returns outer_context.subst_namespaces applied to this context's
  # bindings.
  def output_namespaces(out, outer_context)
    undeclared = {}
    @namespaces.each do |prefix, uri|
      outer_uri = outer_context.namespace_uri(prefix)
      if outer_uri.nil?
        undeclared[prefix] = uri
        next
      end
      next if outer_uri == uri
      out.output_string(prefix ? " xmlns:#{prefix}=" : " xmlns=")
      Text.new(uri).output_attvalue(out, outer_context)
    end
    out.output_xmlns(undeclared) unless undeclared.empty?
    outer_context.subst_namespaces(@namespaces)
  end
end
148
+
149
class BogusETag
  # An unmatched end tag contributes nothing to the output;
  # both arguments are ignored.
  def output(out, context)
    nil
  end
end
154
+
155
class XMLDecl
  # Writes nothing: the declaration is emitted through
  # output_prolog_xmldecl instead.
  def output(out, context)
    nil
  end

  # Writes the XML declaration to +out+. The encoding pseudo-attribute is
  # written only when set, and standalone only when it is not nil.
  def output_prolog_xmldecl(out, context)
    out.output_string "<?xml version=\"#{@version}\""
    out.output_string " encoding=\"#{@encoding}\"" if @encoding
    unless @standalone.nil?
      out.output_string " standalone=\"#{@standalone ? 'yes' : 'no'}\""
    end
    out.output_string "?>"
  end
end
171
+
172
class DocType
  # Writes the document type declaration to +out+. +context+ is not used.
  def output(out, context)
    out.output_string "<!DOCTYPE #{@root_element_name} #{generate_content}>"
  end

  # Builds the external identifier part of the declaration:
  # PUBLIC "id" or SYSTEM, followed by the quoted system identifier if any.
  def generate_content # :nodoc:
    parts = [@public_identifier ? "PUBLIC \"#{@public_identifier}\"" : "SYSTEM"]
    # Although a system identifier is not omissible in XML,
    # we cannot output it if it is not given.
    if @system_identifier
      # Use single quotes only when the identifier contains a double quote.
      quote = /"/ !~ @system_identifier ? '"' : "'"
      parts << "#{quote}#{@system_identifier}#{quote}"
    end
    parts.join(' ')
  end
end
196
+
197
class ProcIns
  # Writes the processing instruction to +out+; the content, when present,
  # is separated from the target by a single space. +context+ is not used.
  def output(out, context)
    out.output_string("<?#{@target}")
    if @content
      out.output_string(" #{@content}")
    end
    out.output_string("?>")
  end
end
204
+
205
class Comment
  # Writes the comment, wrapped in its delimiters, to +out+ in one call.
  # +context+ is not used.
  def output(out, context)
    markup = "<!--#{@content}-->"
    out.output_string(markup)
  end
end
210
+
211
+ # :startdoc:
212
+ end
@@ -0,0 +1,410 @@
1
+ require 'htree/scan'
2
+ require 'htree/htmlinfo'
3
+ require 'htree/text'
4
+ require 'htree/tag'
5
+ require 'htree/leaf'
6
+ require 'htree/doc'
7
+ require 'htree/elem'
8
+ require 'htree/raw_string'
9
+ require 'htree/context'
10
+ require 'htree/encoder'
11
+ require 'htree/fstr'
12
+
13
+ module HTree
14
+ # HTree.parse parses <i>input</i> and returns a document tree
15
+ # represented by HTree::Doc.
16
+ #
17
+ # <i>input</i> should be a String or
18
+ # an object which respond to read or open method.
19
+ # For example, IO, StringIO, Pathname, URI::HTTP and URI::FTP are acceptable.
20
+ # Note that the URIs need open-uri.
21
+ #
22
+ # HTree.parse guesses <i>input</i> is HTML or not and XML or not.
23
+ #
24
+ # If it is guessed as HTML, the default namespace in the result is set to http://www.w3.org/1999/xhtml
25
+ # regardless of whether <i>input</i> has an XML namespace declaration, and even if it is pre-XML HTML.
26
+ #
27
+ # If it is guessed as HTML and not XML, all element and attribute names are downcased.
28
+ #
29
+ # If opened file or read content has charset method,
30
+ # HTree.parse decodes it according to $KCODE before parsing.
31
+ # Otherwise HTree.parse assumes the character encoding of the content is
32
+ # compatible to $KCODE.
33
+ # Note that the charset method is provided by URI::HTTP with open-uri.
34
def HTree.parse(input)
  # is_xml is false here: parse_as passes it on to HTree.scan.
  HTree.with_frozen_string_hash do
    parse_as(input, false)
  end
end
39
+
40
+ # HTree.parse_xml parses <i>input</i> as XML and
41
+ # returns a document tree represented by HTree::Doc.
42
+ #
43
+ # It behaves almost the same as HTree.parse but it assumes <i>input</i> is XML
44
+ # even if no XML declaration.
45
+ # The assumption causes following differences.
46
+ # * doesn't downcase element name.
47
+ # * The content of <script> and <style> element is PCDATA, not CDATA.
48
def HTree.parse_xml(input)
  # Same as HTree.parse except that is_xml is forced to true.
  HTree.with_frozen_string_hash do
    parse_as(input, true)
  end
end
53
+
54
+ # :stopdoc:
55
+
56
# Reads +input+, converts it to the internal charset when its charset is
# known, scans it into tokens and builds an HTree::Doc from them.
#
# +input+ is a String, an object responding to +read+, or an object
# responding to +open+. +is_xml+ is passed to HTree.scan, whose result
# supplies the is_xml/is_html flags used for the rest of the parse.
#
# Raises SecurityError when +input+ is tainted and $SAFE is 1 or higher.
def HTree.parse_as(input, is_xml)
  input_charset = nil
  if input.tainted? && 1 <= $SAFE
    raise SecurityError, "input tainted"
  end
  if input.respond_to? :read # IO, StringIO
    source = input
    input = source.read.untaint
    # The charset may be attached to the string that was read (presumably
    # the case for URI#read with open-uri -- confirm) or only to the object
    # it was read from (e.g. a stream opened with open-uri). Prefer the
    # former and fall back to the latter, which used to be ignored.
    if input.respond_to? :charset
      input_charset = input.charset
    elsif source.respond_to? :charset
      input_charset = source.charset
    end
  elsif input.respond_to? :open # Pathname, URI with open-uri
    input.open {|f|
      input = f.read.untaint
      input_charset = f.charset if f.respond_to? :charset
    }
  end
  if input_charset && input_charset != Encoder.internal_charset
    input = Iconv.conv(Encoder.internal_charset, input_charset, input)
  end

  tokens = []
  is_xml, is_html = HTree.scan(input, is_xml) {|token|
    tokens << token
  }
  context = is_html ? HTMLContext : DefaultContext
  structure_list = parse_pairs(tokens, is_xml, is_html)
  structure_list = fix_structure_list(structure_list, is_xml, is_html)
  nodes = structure_list.map {|s| build_node(s, is_xml, is_html, context) }
  Doc.new(nodes)
end
84
+
85
# Pairs start tags with end tags and returns the resulting nested list of
# structures.
#
# Each open element is a frame [tag name, start tag string, children] on a
# stack whose bottom frame collects the top-level structures. An end tag
# closes the nearest open element with the same name, implicitly closing
# (without an end tag) everything opened after it; an end tag matching no
# open element becomes a :bogus_etag structure. Elements still open at the
# end of input are closed without an end tag.
def HTree.parse_pairs(tokens, is_xml, is_html)
  fold_case = !is_xml && is_html
  stack = [[nil, nil, []]]
  tokens.each do |token|
    case token[0]
    when :stag
      stag_raw = token[1]
      stagname = stag_raw[Pat::Name]
      stagname = stagname.downcase if fold_case
      stack << [HTree.frozen_string(stagname), stag_raw, []]
    when :etag
      etag_raw = token[1]
      etagname = etag_raw[Pat::Name]
      etagname = etagname.downcase if fold_case
      etagname = HTree.frozen_string(etagname)
      # Innermost open element carrying the same name, if any.
      matched = stack.reverse.find {|frame| frame[0] == etagname }
      if matched
        until matched.equal? stack.last
          _, unclosed_raw, unclosed_children = stack.pop
          stack.last[2] << [:elem, unclosed_raw, unclosed_children]
        end
        _, closed_raw, closed_children = stack.pop
        stack.last[2] << [:elem, closed_raw, closed_children, etag_raw]
      else
        stack.last[2] << [:bogus_etag, etag_raw]
      end
    else
      stack.last[2] << token
    end
  end
  until stack.length <= 1
    _, unclosed_raw, unclosed_children = stack.pop
    stack.last[2] << [:elem, unclosed_raw, unclosed_children]
  end
  stack[0][2]
end
129
+
130
# Returns a copy of +structure_list+ in which every :elem structure has
# been run through fix_element. Structures that fix_element hands back as
# not belonging to the element are processed next, as following siblings.
def HTree.fix_structure_list(structure_list, is_xml, is_html)
  fixed = []
  pending = structure_list.dup
  until pending.empty?
    head = pending.shift
    unless head[0] == :elem
      fixed << head
      next
    end
    repaired, spilled = fix_element(head, [], [], is_xml, is_html)
    fixed << repaired
    pending = spilled + pending
  end
  fixed
end
145
+
146
# Repairs one :elem structure and returns [fixed element, rest], where
# +rest+ holds the structures that turned out not to belong inside it.
#
# An element that has an end tag keeps all its children (fixed
# recursively). Without an end tag, an element whose ElementContent entry
# is :EMPTY gives up all its children; otherwise children are kept up to
# the first child element whose tag the element cannot contain.
def HTree.fix_element(elem, excluded_tags, included_tags, is_xml, is_html)
  _, stag_raw_string, children, etag_raw_string = elem
  if etag_raw_string
    return [:elem, stag_raw_string, fix_structure_list(children, is_xml, is_html), etag_raw_string], []
  end

  fold_case = !is_xml && is_html
  tagname = stag_raw_string[Pat::Name]
  tagname = tagname.downcase if fold_case
  content_model = ElementContent[tagname]
  return [:elem, stag_raw_string, []], children if content_model == :EMPTY

  possible_tags = content_model == :CDATA ? [] : content_model
  if possible_tags
    extra_excluded = ElementExclusions[tagname]
    extra_included = ElementInclusions[tagname]
    excluded_tags |= extra_excluded if extra_excluded
    included_tags |= extra_included if extra_included
    containable_tags = (possible_tags | included_tags) - excluded_tags
    uncontainable_tags = ElementContent.keys - containable_tags
  else
    # If the tagname is unknown, it is assumed that any element
    # except excluded can be contained.
    uncontainable_tags = excluded_tags
  end

  fixed_children = []
  rest = children
  until rest.empty?
    child = rest[0]
    unless child[0] == :elem
      fixed_children << rest.shift
      next
    end
    child_tagname = child[1][Pat::Name]
    child_tagname = child_tagname.downcase if fold_case
    # The first uncontainable child ends this element; it stays in +rest+.
    break if uncontainable_tags.include? child_tagname
    rest.shift
    fixed_child, leftover = fix_element(child, excluded_tags, included_tags, is_xml, is_html)
    fixed_children << fixed_child
    rest = leftover + rest
  end
  return [:elem, stag_raw_string, fixed_children], rest
end
197
+
198
# Converts one structure (an array whose first item is a kind symbol) into
# the corresponding node object by calling the matching parse method.
# Element children are built recursively with the start tag's context.
# Raises Exception for an unknown kind.
def HTree.build_node(structure, is_xml, is_html, inherited_context=DefaultContext)
  raw = structure[1]
  case structure[0]
  when :elem
    _, stag_rawstring, children, etag_rawstring = structure
    etag = etag_rawstring && ETag.parse(etag_rawstring, is_xml, is_html)
    stag = STag.parse(stag_rawstring, true, is_xml, is_html, inherited_context)
    # Only a childless XHTML element without end tag whose content model is
    # :EMPTY is built from the start tag alone.
    bare_empty = children.empty? && !etag &&
      stag.element_name.namespace_uri == 'http://www.w3.org/1999/xhtml' &&
      HTree::ElementContent[stag.element_name.local_name] == :EMPTY
    return Elem.new!(stag) if bare_empty
    Elem.new!(stag,
              children.map {|c| build_node(c, is_xml, is_html, stag.context) },
              etag)
  when :emptytag
    Elem.new!(STag.parse(raw, false, is_xml, is_html, inherited_context))
  when :bogus_etag
    BogusETag.parse(raw, is_xml, is_html)
  when :text_pcdata
    Text.parse_pcdata(raw)
  when :text_cdata_content
    Text.parse_cdata_content(raw)
  when :text_cdata_section
    Text.parse_cdata_section(raw)
  when :xmldecl
    XMLDecl.parse(raw)
  when :doctype
    DocType.parse(raw, is_xml, is_html)
  when :procins
    ProcIns.parse(raw)
  when :comment
    Comment.parse(raw)
  else
    raise Exception, "[bug] unknown structure: #{structure.inspect}"
  end
end
235
+
236
# Parses the raw text of a start tag (+is_stag+ true) or an empty-element
# tag (+is_stag+ false) into an STag that keeps +raw_string+.
#
# The valid tag pattern is tried first, then the invalid-tag pattern.
# Names are downcased for non-XML HTML. An attribute given without a name
# is named through OmittedAttrName for the element, or after its own value
# when the element has no entry there.
# Raises HTree::Error when neither pattern matches.
def STag.parse(raw_string, is_stag, is_xml, is_html, inherited_context=DefaultContext)
  attrs = []
  if (is_stag ? /\A#{Pat::ValidStartTag_C}\z/o : /\A#{Pat::ValidEmptyTag_C}\z/o) =~ raw_string
    tag = Regexp.last_match
    qname = tag[1]
    tag[2].scan(Pat::ValidAttr_C) {
      attr = Regexp.last_match
      attrs << (attr[5] ? [nil, attr[5]] : [attr[1], attr[2] || attr[3] || attr[4]])
    }
  elsif (is_stag ? /\A#{Pat::InvalidStartTag_C}\z/o : /\A#{Pat::InvalidEmptyTag_C}\z/o) =~ raw_string
    tag = Regexp.last_match
    qname = tag[1]
    trailing_attr = tag[3]
    tag[2].scan(Pat::InvalidAttr1_C) {
      attr = Regexp.last_match
      attrs << (attr[5] ? [nil, attr[5]] : [attr[1], attr[2] || attr[3] || attr[4]])
    }
    if trailing_attr
      /#{Pat::InvalidAttr1End_C}/o =~ trailing_attr
      attrs << [Regexp.last_match(1), Regexp.last_match(2) || Regexp.last_match(3)]
    end
  else
    raise HTree::Error, "cannot recognize as start tag or empty tag: #{raw_string.inspect}"
  end

  fold_case = !is_xml && is_html
  qname = qname.downcase if fold_case

  attrs.map! do |aname, aval|
    if aname
      [fold_case ? aname.downcase : aname, Text.parse_pcdata(aval)]
    else
      val2name = OmittedAttrName[qname]
      if val2name
        lowered = aval.downcase
        aname = val2name.fetch(lowered, lowered)
      else
        aname = aval
      end
      [aname, Text.new(aval)]
    end
  end

  stag = STag.new(qname, attrs, inherited_context)
  stag.raw_string = raw_string
  stag
end
278
+
279
# Parses the raw text of an end tag into an ETag that keeps +raw_string+.
# The name is downcased for non-XML HTML.
# Raises HTree::Error when +raw_string+ does not match Pat::EndTag_C.
def ETag.parse(raw_string, is_xml, is_html)
  matched = /\A#{Pat::EndTag_C}\z/o.match(raw_string)
  raise HTree::Error, "cannot recognize as end tag: #{raw_string.inspect}" unless matched

  qname = matched[1]
  qname = qname.downcase if is_html && !is_xml

  etag = new(qname)
  etag.raw_string = raw_string
  etag
end
291
+
292
# Parses the raw text of an unmatched end tag into a BogusETag that keeps
# +raw_string+. The name is downcased for non-XML HTML.
# Raises HTree::Error when +raw_string+ does not match Pat::EndTag_C.
def BogusETag.parse(raw_string, is_xml, is_html)
  matched = /\A#{Pat::EndTag_C}\z/o.match(raw_string)
  raise HTree::Error, "cannot recognize as end tag: #{raw_string.inspect}" unless matched

  qname = matched[1]
  qname = qname.downcase if is_html && !is_xml

  bogus = new(qname)
  bogus.raw_string = raw_string
  bogus
end
304
+
305
# Builds a Text from PCDATA, normalising the ampersands in it:
# * a reference that already ends in ";" is kept,
# * a numeric reference without ";" gets one appended,
# * a name matching NamedCharactersPattern gets ";" appended,
# * any other "&" is replaced by "&amp;".
# The resulting Text keeps +raw_string+.
def Text.parse_pcdata(raw_string)
  fixed = raw_string.gsub(/&(?:(?:#[0-9]+|#x[0-9a-fA-F]+|([A-Za-z][A-Za-z0-9]*));?)?/o) do |ref|
    # Capture the name first: the matches below overwrite the match data.
    name = Regexp.last_match(1)
    if /;\z/ =~ ref
      ref
    elsif /\A&#/ =~ ref
      "#{ref};"
    elsif ref == '&'
      '&amp;'
    elsif NamedCharactersPattern =~ name
      "&#{name};"
    else
      "&amp;#{name}"
    end
  end
  # When nothing changed, reuse the original string object.
  fixed = raw_string if fixed == raw_string
  text = Text.new_internal(fixed)
  text.raw_string = raw_string
  text
end
328
+
329
# Builds a Text directly from +raw_string+ via Text.new and records
# +raw_string+ on it.
def Text.parse_cdata_content(raw_string)
  text = Text.new(raw_string)
  text.raw_string = raw_string
  text
end
334
+
335
# Builds a Text from the first capture of Pat::CDATA_C and records
# +raw_string+ on it.
# Raises HTree::Error when +raw_string+ does not match Pat::CDATA_C.
def Text.parse_cdata_section(raw_string)
  matched = /\A#{Pat::CDATA_C}\z/o.match(raw_string)
  raise HTree::Error, "cannot recognize as CDATA section: #{raw_string.inspect}" unless matched

  text = Text.new(matched[1])
  text.raw_string = raw_string
  text
end
346
+
347
# Parses an XML declaration into an XMLDecl that keeps +raw_string+.
# standalone becomes true for 'yes', false for 'no' and nil otherwise.
# Raises HTree::Error when +raw_string+ does not match Pat::XmlDecl_C.
def XMLDecl.parse(raw_string)
  matched = /\A#{Pat::XmlDecl_C}\z/o.match(raw_string)
  raise HTree::Error, "cannot recognize as XML declaration: #{raw_string.inspect}" unless matched

  # Each value has two capture groups; whichever matched is used.
  version = matched[1] || matched[2]
  encoding = matched[3] || matched[4]
  standalone = { 'yes' => true, 'no' => false }[matched[5] || matched[6]]

  decl = XMLDecl.new(version, encoding, standalone)
  decl.raw_string = raw_string
  decl
end
367
+
368
# Parses a document type declaration into a DocType that keeps
# +raw_string+. The root element name is downcased for non-XML HTML.
# Raises HTree::Error when +raw_string+ does not match Pat::DocType_C.
def DocType.parse(raw_string, is_xml, is_html)
  unless /\A#{Pat::DocType_C}\z/o =~ raw_string
    # The message used to say "XML declaration", copied from XMLDecl.parse.
    raise HTree::Error, "cannot recognize as document type declaration: #{raw_string.inspect}"
  end

  root_element_name = $1
  # Each identifier has two capture groups; whichever matched is used.
  public_identifier = $2 || $3
  system_identifier = $4 || $5

  root_element_name = root_element_name.downcase if !is_xml && is_html

  result = DocType.new(root_element_name, public_identifier, system_identifier)
  result.raw_string = raw_string
  result
end
383
+
384
# Parses a processing instruction into a ProcIns (target and content are
# the two captures of Pat::XmlProcIns_C) that keeps +raw_string+.
# Raises HTree::Error when +raw_string+ does not match the pattern.
def ProcIns.parse(raw_string)
  matched = /\A#{Pat::XmlProcIns_C}\z/o.match(raw_string)
  raise HTree::Error, "cannot recognize as processing instruction: #{raw_string.inspect}" unless matched

  procins = ProcIns.new(matched[1], matched[2])
  procins.raw_string = raw_string
  procins
end
396
+
397
# Parses a comment into a Comment holding the first capture of
# Pat::Comment_C; the result keeps +raw_string+.
# Raises HTree::Error when +raw_string+ does not match the pattern.
def Comment.parse(raw_string)
  matched = /\A#{Pat::Comment_C}\z/o.match(raw_string)
  raise HTree::Error, "cannot recognize as comment: #{raw_string.inspect}" unless matched

  comment = Comment.new(matched[1])
  comment.raw_string = raw_string
  comment
end
408
+
409
+ # :startdoc:
410
+ end