feedtools 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (37) hide show
  1. data/CHANGELOG +11 -0
  2. data/lib/feed_tools.rb +2496 -810
  3. data/lib/feed_tools/vendor/builder.rb +2 -0
  4. data/lib/feed_tools/vendor/builder/blankslate.rb +2 -0
  5. data/lib/feed_tools/vendor/builder/xmlbase.rb +2 -1
  6. data/lib/feed_tools/vendor/builder/xmlevents.rb +2 -0
  7. data/lib/feed_tools/vendor/builder/xmlmarkup.rb +4 -2
  8. data/lib/feed_tools/vendor/htree.rb +97 -0
  9. data/lib/feed_tools/vendor/htree/container.rb +10 -0
  10. data/lib/feed_tools/vendor/htree/context.rb +67 -0
  11. data/lib/feed_tools/vendor/htree/display.rb +27 -0
  12. data/lib/feed_tools/vendor/htree/doc.rb +149 -0
  13. data/lib/feed_tools/vendor/htree/elem.rb +262 -0
  14. data/lib/feed_tools/vendor/htree/encoder.rb +163 -0
  15. data/lib/feed_tools/vendor/htree/equality.rb +218 -0
  16. data/lib/feed_tools/vendor/htree/extract_text.rb +37 -0
  17. data/lib/feed_tools/vendor/htree/fstr.rb +33 -0
  18. data/lib/feed_tools/vendor/htree/gencode.rb +97 -0
  19. data/lib/feed_tools/vendor/htree/htmlinfo.rb +672 -0
  20. data/lib/feed_tools/vendor/htree/inspect.rb +108 -0
  21. data/lib/feed_tools/vendor/htree/leaf.rb +94 -0
  22. data/lib/feed_tools/vendor/htree/loc.rb +367 -0
  23. data/lib/feed_tools/vendor/htree/modules.rb +48 -0
  24. data/lib/feed_tools/vendor/htree/name.rb +124 -0
  25. data/lib/feed_tools/vendor/htree/output.rb +207 -0
  26. data/lib/feed_tools/vendor/htree/parse.rb +407 -0
  27. data/lib/feed_tools/vendor/htree/raw_string.rb +124 -0
  28. data/lib/feed_tools/vendor/htree/regexp-util.rb +15 -0
  29. data/lib/feed_tools/vendor/htree/rexml.rb +130 -0
  30. data/lib/feed_tools/vendor/htree/scan.rb +166 -0
  31. data/lib/feed_tools/vendor/htree/tag.rb +111 -0
  32. data/lib/feed_tools/vendor/htree/template.rb +909 -0
  33. data/lib/feed_tools/vendor/htree/text.rb +115 -0
  34. data/lib/feed_tools/vendor/htree/traverse.rb +465 -0
  35. data/rakefile +1 -1
  36. data/test/rss_test.rb +97 -0
  37. metadata +30 -1
@@ -0,0 +1,48 @@
1
+ # :stopdoc:
2
# Skeleton of the HTree class hierarchy: only the classes, the mix-in
# modules and their inclusion relationships are declared here.
module HTree
  # Names and namespace contexts.
  class Name
    include HTree
  end

  class Context
    include HTree
  end

  # Tags.
  module Tag
    include HTree
  end

  class STag
    include Tag
  end

  class ETag
    include Tag
  end

  # Mix-ins shared by the node classes, their traversal modules and
  # their location classes.
  module Node
    include HTree
  end

  module Traverse
  end

  class Location
    include HTree
  end

  module Container
    include Node

    module Trav
      include Traverse
    end

    module Loc
    end
  end

  module Leaf
    include Node

    module Trav
      include Traverse
    end

    module Loc
    end
  end

  # Container nodes.
  class Doc
    include Container

    module Trav
      include Container::Trav
    end
    include Trav

    class Loc < Location
      include Trav, Container::Loc
    end
  end

  class Elem
    include Container

    module Trav
      include Container::Trav
    end
    include Trav

    class Loc < Location
      include Trav, Container::Loc
    end
  end

  # Leaf nodes.
  class Text
    include Leaf

    module Trav
      include Leaf::Trav
    end
    include Trav

    class Loc < Location
      include Trav, Leaf::Loc
    end
  end

  class XMLDecl
    include Leaf

    module Trav
      include Leaf::Trav
    end
    include Trav

    class Loc < Location
      include Trav, Leaf::Loc
    end
  end

  class DocType
    include Leaf

    module Trav
      include Leaf::Trav
    end
    include Trav

    class Loc < Location
      include Trav, Leaf::Loc
    end
  end

  class ProcIns
    include Leaf

    module Trav
      include Leaf::Trav
    end
    include Trav

    class Loc < Location
      include Trav, Leaf::Loc
    end
  end

  class Comment
    include Leaf

    module Trav
      include Leaf::Trav
    end
    include Trav

    class Loc < Location
      include Trav, Leaf::Loc
    end
  end

  class BogusETag
    include Leaf

    module Trav
      include Leaf::Trav
    end
    include Trav

    class Loc < Location
      include Trav, Leaf::Loc
    end
  end

  class Error < StandardError
  end
end
48
+ # :startdoc:
@@ -0,0 +1,124 @@
1
+ # :stopdoc:
2
+ require 'htree/scan' # for Pat::Nmtoken
3
+ require 'htree/context'
4
+
5
+ module HTree # :nodoc:
6
# Name represents an element name or an attribute name.
# It consists of a namespace prefix, a namespace URI and a local name.
#
# The table lists, for each written form, the stored prefix, URI and
# local name:
#
#   element name                   prefix   uri   localname
#   {u}n, n with xmlns=u           nil      'u'   'n'
#   p{u}n, p:n with xmlns:p=u      'p'      'u'   'n'
#   n with xmlns=''                nil      ''    'n'
#
#   attribute name
#   xmlns=                         'xmlns'  nil   nil
#   xmlns:n=                       'xmlns'  nil   'n'
#   p{u}n=, p:n= with xmlns:p=u    'p'      'u'   'n'
#   n=                             nil      ''    'n'
class Name # :nodoc:
  # Builds the Name for the element name +name+.
  #
  # +context+ must respond to +namespace_uri(prefix)+; it is asked for
  # the URI bound to the prefix found before ":" and, failing that, for
  # the URI of the default namespace (prefix +nil+).
  def Name.parse_element_name(name, context)
    if /\{(.*)\}/ =~ name
      # "{u}n" means "use default namespace",
      # "p{u}n" means "use the specified prefix p"
      $` == '' ? Name.new(nil, $1, $') : Name.new($`, $1, $')
    elsif /:/ =~ name && !context.namespace_uri($`).empty?
      Name.new($`, context.namespace_uri($`), $')
    elsif !context.namespace_uri(nil).empty?
      Name.new(nil, context.namespace_uri(nil), name)
    else
      Name.new(nil, '', name)
    end
  end

  # Builds the Name for the attribute name +name+.
  #
  # "xmlns" and "xmlns:*" become namespace-declaration names (prefix
  # 'xmlns', URI nil).  Unlike element names, an unprefixed attribute
  # name never takes the default namespace: its URI is ''.
  def Name.parse_attribute_name(name, context)
    if name == 'xmlns'
      Name.new('xmlns', nil, nil)
    elsif /\Axmlns:/ =~ name
      Name.new('xmlns', nil, $')
    elsif /\{(.*)\}/ =~ name
      case $`
      when ''; Name.new(nil, $1, $')
      else Name.new($`, $1, $')
      end
    elsif /:/ =~ name && !context.namespace_uri($`).empty?
      Name.new($`, context.namespace_uri($`), $')
    else
      Name.new(nil, '', name)
    end
  end

  # Instances are interned here, keyed by
  # [namespace_prefix, namespace_uri, local_name, receiving class].
  NameCache = {}

  # Returns the cached instance for the given components, creating and
  # caching it on first use.  The string components are duplicated and
  # frozen before the new instance is built.
  def Name.new(namespace_prefix, namespace_uri, local_name)
    key = [namespace_prefix, namespace_uri, local_name, self]
    NameCache.fetch(key) {
      0.upto(2) {|i| key[i] = key[i].dup.freeze if key[i] }
      NameCache[key] = super(key[0], key[1], key[2])
    }
  end

  # Raises HTree::Error when
  # * the prefix or the local name is given but is not a Pat::Nmtoken,
  # * the prefix is 'xmlns' and a namespace URI is given, or
  # * the prefix is not 'xmlns' and the namespace URI is not a String.
  def initialize(namespace_prefix, namespace_uri, local_name)
    @namespace_prefix = namespace_prefix
    @namespace_uri = namespace_uri
    @local_name = local_name
    if @namespace_prefix && /\A#{Pat::Nmtoken}\z/o !~ @namespace_prefix
      raise HTree::Error, "invalid namespace prefix: #{@namespace_prefix.inspect}"
    end
    if @local_name && /\A#{Pat::Nmtoken}\z/o !~ @local_name
      raise HTree::Error, "invalid local name: #{@local_name.inspect}"
    end
    if @namespace_prefix == 'xmlns'
      unless @namespace_uri == nil
        raise HTree::Error, "Name object for xmlns:* must not have namespace URI: #{@namespace_uri.inspect}"
      end
    else
      unless String === @namespace_uri
        raise HTree::Error, "invalid namespace URI: #{@namespace_uri.inspect}"
      end
    end
  end
  attr_reader :namespace_prefix, :namespace_uri, :local_name

  # True for the names of namespace declarations (xmlns and xmlns:*).
  def xmlns?
    @namespace_prefix == 'xmlns' && @namespace_uri == nil
  end

  # Returns "{uri}local" when a non-empty namespace URI is set and a
  # copy of the local name otherwise.
  #
  # A bare "xmlns" name has no local name; "xmlns" is returned for it,
  # as qualified_name and to_s do, instead of calling dup on nil.
  def universal_name
    if @namespace_uri && !@namespace_uri.empty?
      "{#{@namespace_uri}}#{@local_name}"
    elsif @local_name
      @local_name.dup
    else
      "xmlns"
    end
  end

  # Returns "prefix:local" when both a non-empty URI and a prefix are
  # set, the local name when there is one, and "xmlns" otherwise.
  def qualified_name
    if @namespace_uri && !@namespace_uri.empty?
      if @namespace_prefix
        "#{@namespace_prefix}:#{@local_name}"
      else
        @local_name.dup
      end
    elsif @local_name
      @local_name.dup
    else
      "xmlns"
    end
  end

  # Returns the "p{u}n" / "{u}n" notation accepted by the parse
  # methods when a non-empty URI is set, the local name when there is
  # one, and "xmlns" otherwise.
  def to_s
    if @namespace_uri && !@namespace_uri.empty?
      if @namespace_prefix
        "#{@namespace_prefix}{#{@namespace_uri}}#{@local_name}"
      else
        "{#{@namespace_uri}}#{@local_name}"
      end
    elsif @local_name
      @local_name.dup
    else
      "xmlns"
    end
  end
end
123
+ end
124
+ # :startdoc:
@@ -0,0 +1,207 @@
1
+ # :stopdoc:
2
+ require 'htree/encoder'
3
+ require 'htree/doc'
4
+ require 'htree/elem'
5
+ require 'htree/leaf'
6
+ require 'htree/text'
7
+
8
+ module HTree # :nodoc:
9
+
10
class Text # :nodoc:
  # Character references substituted for markup-significant characters.
  ChRef = {
    '>' => '&gt;',
    '<' => '&lt;',
    '"' => '&quot;',
  }

  # Hands @rcdata to out.output_text with "<" and ">" replaced by
  # their character references.  +context+ is not used.
  def output(out, context)
    escaped = @rcdata.gsub(/[<>]/) do |char|
      ChRef[char]
    end
    out.output_text(escaped)
  end

  # Returns @rcdata with "<", ">" and the double quote replaced by
  # their character references.
  def to_attvalue_content
    @rcdata.gsub(/[<>"]/) do |char|
      ChRef[char]
    end
  end

  # Emits the text as a double-quoted attribute value.
  def output_attvalue(out, context)
    quote = '"'
    out.output_string(quote)
    out.output_text(to_attvalue_content)
    out.output_string(quote)
  end
end
31
+
32
class Name # :nodoc:
  # Writes the name to +out+: "xmlns" or "xmlns:local" for a namespace
  # declaration, the qualified name otherwise.  +context+ is not used.
  def output(out, context)
    # xxx: validate namespace prefix
    rendered =
      if !xmlns?
        qualified_name
      elsif @local_name
        "xmlns:#{@local_name}"
      else
        "xmlns"
      end
    out.output_string(rendered)
  end

  # Writes name="value", where the value part is produced by
  # text.output_attvalue.
  def output_attribute(text, out, context)
    output(out, context)
    out.output_string('=')
    text.output_attvalue(out, context)
  end
end
52
+
53
class Doc # :nodoc:
  # Writes every child to +out+ using DefaultContext.
  #
  # Children responding to output_prolog_xmldecl or
  # output_prolog_doctypedecl are written through that method, and only
  # the first child of each kind is written; all other children are
  # written through their own output method.
  def output(out, context)
    context = DefaultContext # discard outer context
    seen_xmldecl = false
    seen_doctype = false
    @children.each do |child|
      if child.respond_to?(:output_prolog_xmldecl)
        child.output_prolog_xmldecl(out, context) if !seen_xmldecl # xxx: encoding?
        seen_xmldecl = true
      elsif child.respond_to?(:output_prolog_doctypedecl)
        child.output_prolog_doctypedecl(out, context) if !seen_doctype
        seen_doctype = true
      else
        child.output(out, context)
      end
    end
  end
end
71
+
72
class Elem # :nodoc:
  # Writes the element to +out+: a single empty-element tag when @empty
  # is set, otherwise the start tag, every child and the end tag.
  # The children are written with the context returned by
  # @stag.output_stag.
  def output(out, context)
    return @stag.output_emptytag(out, context) if @empty

    inner_context = @stag.output_stag(out, context)
    @children.each do |child|
      child.output(out, inner_context)
    end
    @stag.output_etag(out, context)
  end
end
83
+
84
class STag # :nodoc:
  # Writes every attribute that is not a namespace declaration, each
  # preceded by a space, then lets @context write the namespace
  # declarations.  Returns what @context.output_namespaces returns.
  def output_attributes(out, context)
    @attributes.each do |attr_name, attr_value|
      unless attr_name.xmlns?
        out.output_string(' ')
        attr_name.output_attribute(attr_value, out, context)
      end
    end
    @context.output_namespaces(out, context)
  end

  # Writes the tag as an empty-element tag; the line break is placed
  # inside the tag, before "/>".  Returns the value of
  # output_attributes.
  def output_emptytag(out, context)
    out.output_string('<')
    @name.output(out, context)
    inner_context = output_attributes(out, context)
    out.output_string("\n/>")
    inner_context
  end

  # Writes the tag as a start tag; the line break is placed inside the
  # tag, before ">".  Returns the value of output_attributes.
  def output_stag(out, context)
    out.output_string('<')
    @name.output(out, context)
    inner_context = output_attributes(out, context)
    out.output_string("\n>")
    inner_context
  end

  # Writes the end tag that corresponds to this start tag.
  def output_etag(out, context)
    out.output_string('</')
    @name.output(out, context)
    out.output_string("\n>")
  end
end
116
+
117
class Context # :nodoc:
  # Writes the namespace declarations that distinguish this context
  # from +outer_context+.
  #
  # A prefix whose URI differs from the one +outer_context+ reports is
  # written as an xmlns / xmlns:prefix attribute.  Prefixes for which
  # +outer_context+ reports nil are collected and handed to
  # out.output_xmlns in one call.  Returns
  # outer_context.subst_namespaces(@namespaces).
  def output_namespaces(out, outer_context)
    undeclared = {}
    @namespaces.each do |prefix, uri|
      outer_uri = outer_context.namespace_uri(prefix)
      if outer_uri == nil
        undeclared[prefix] = uri
        next
      end
      next if outer_uri == uri

      attr_head = prefix ? " xmlns:#{prefix}=" : " xmlns="
      out.output_string(attr_head)
      Text.new(uri).output_attvalue(out, outer_context)
    end
    out.output_xmlns(undeclared) unless undeclared.empty?
    outer_context.subst_namespaces(@namespaces)
  end
end
139
+
140
class BogusETag # :nodoc:
  # Intentionally writes nothing.
  def output(out, context)
    nil
  end
end
145
+
146
class XMLDecl # :nodoc:
  # Intentionally writes nothing; the declaration is emitted by
  # output_prolog_xmldecl instead.
  def output(out, context)
    nil
  end

  # Writes the XML declaration.  The encoding pseudo-attribute is
  # written only when @encoding is set, and the standalone one only
  # when @standalone is not nil.
  def output_prolog_xmldecl(out, context)
    out.output_string("<?xml version=\"#{@version}\"")
    out.output_string(" encoding=\"#{@encoding}\"") if @encoding
    unless @standalone == nil
      out.output_string(" standalone=\"#{@standalone ? 'yes' : 'no'}\"")
    end
    out.output_string("?>")
  end
end
162
+
163
class DocType # :nodoc:
  # Intentionally writes nothing; the declaration is emitted by
  # output_prolog_doctypedecl instead.
  def output(out, context)
    nil
  end

  # Returns the external-identifier part of the declaration:
  # PUBLIC "..." when a public identifier is set and SYSTEM otherwise,
  # followed by the quoted system identifier when there is one.
  # The system identifier is quoted with single quotes only when it
  # contains a double quote.
  def generate_content # :nodoc:
    external_id = @public_identifier ? "PUBLIC \"#{@public_identifier}\"" : "SYSTEM"
    # Although a system identifier is not omissible in XML,
    # we cannot output it if it is not given.
    return external_id unless @system_identifier

    quote = (/"/ =~ @system_identifier) ? "'" : '"'
    "#{external_id} #{quote}#{@system_identifier}#{quote}"
  end

  # Writes the complete <!DOCTYPE ...> declaration.
  def output_prolog_doctypedecl(out, context)
    declaration = "<!DOCTYPE #{@root_element_name} #{generate_content}>"
    out.output_string(declaration)
  end
end
191
+
192
class ProcIns # :nodoc:
  # Writes the processing instruction; the content, when present, is
  # separated from the target by one space.
  def output(out, context)
    out.output_string("<?#{@target}")
    if @content
      out.output_string(" #{@content}")
    end
    out.output_string("?>")
  end
end
199
+
200
class Comment # :nodoc:
  # Writes the comment, wrapping @content in the comment delimiters.
  def output(out, context)
    markup = "<!--#{@content}-->"
    out.output_string(markup)
  end
end
205
+
206
+ end
207
+ # :startdoc:
@@ -0,0 +1,407 @@
1
+ # :stopdoc:
2
+ require 'htree/scan'
3
+ require 'htree/htmlinfo'
4
+ require 'htree/text'
5
+ require 'htree/tag'
6
+ require 'htree/leaf'
7
+ require 'htree/doc'
8
+ require 'htree/elem'
9
+ require 'htree/raw_string'
10
+ require 'htree/context'
11
+ require 'htree/encoder'
12
+ require 'htree/fstr'
13
+
14
+ module HTree # :nodoc:
15
# HTree.parse parses <i>input</i> and returns a document tree
# represented by HTree::Doc.
#
# <i>input</i> should be a String or
# an object which responds to the read or open method.
# For example, IO, StringIO, Pathname, URI::HTTP and URI::FTP are acceptable.
# Note that the URIs need open-uri.
#
# HTree.parse guesses whether <i>input</i> is HTML and whether it is XML.
#
# If it is guessed to be HTML, the default namespace in the result is set to
# http://www.w3.org/1999/xhtml whether or not <i>input</i> has an XML
# namespace declaration, and even if it is pre-XML HTML.
#
# If it is guessed to be HTML and not XML, all element and attribute names
# are downcased.
#
# If the opened file or the read content has a charset method,
# HTree.parse decodes it according to $KCODE before parsing.
# Otherwise HTree.parse assumes the character encoding of the content is
# compatible with $KCODE.
# Note that the charset method is provided by URI::HTTP with open-uri.
def HTree.parse(input)
  HTree.with_frozen_string_hash do
    parse_as(input, false)
  end
end
40
+
41
# HTree.parse_xml parses <i>input</i> as XML and
# returns a document tree represented by HTree::Doc.
#
# It behaves almost the same as HTree.parse, but it assumes that
# <i>input</i> is XML even if there is no XML declaration.
# This assumption causes the following differences:
# * element names are not downcased.
# * The content of <script> and <style> elements is PCDATA, not CDATA.
def HTree.parse_xml(input)
  HTree.with_frozen_string_hash do
    parse_as(input, true)
  end
end
54
+
55
# Shared implementation of HTree.parse and HTree.parse_xml.
#
# Obtains the text from +input+ (through +read+ or +open+ when it
# responds to one of them), converts it with Iconv when a charset
# other than Encoder.internal_charset was reported, scans it into
# tokens and builds an HTree::Doc from them.  +is_xml+ is handed to
# HTree.scan; the flags returned by the scan drive the later steps.
#
# Raises SecurityError when +input+ is tainted and $SAFE is 1 or more.
def HTree.parse_as(input, is_xml)
  raise SecurityError, "input tainted" if input.tainted? && 1 <= $SAFE

  source_charset = nil
  if input.respond_to?(:read) # IO, StringIO
    input = input.read.untaint
    # here the charset is asked of the content that was just read
    source_charset = input.charset if input.respond_to?(:charset)
  elsif input.respond_to?(:open) # Pathname, URI with open-uri
    input.open do |io|
      input = io.read.untaint
      source_charset = io.charset if io.respond_to?(:charset)
    end
  end
  needs_conversion = source_charset && source_charset != Encoder.internal_charset
  input = Iconv.conv(Encoder.internal_charset, source_charset, input) if needs_conversion

  tokens = []
  is_xml, is_html = HTree.scan(input, is_xml) {|token| tokens.push(token) }
  context = is_html ? HTMLContext : DefaultContext
  paired = parse_pairs(tokens, is_xml, is_html)
  fixed = fix_structure_list(paired, is_xml, is_html)
  nodes = fixed.map do |structure|
    build_node(structure, is_xml, is_html, context)
  end
  Doc.new(nodes)
end
83
+
84
# Pairs start tags with end tags and returns the resulting list of
# nested structures.
#
# An element is represented as [:elem, raw start tag, children] with
# the raw end tag appended as a fourth item when a matching end tag
# was found.  An end tag that matches no open element becomes
# [:bogus_etag, raw end tag]; every other token is passed through
# unchanged.  Tag names are downcased for comparison when the input is
# HTML and not XML.
def HTree.parse_pairs(tokens, is_xml, is_html)
  fold_case = !is_xml && is_html
  # Each frame is [tag name, raw start tag, children]; the bottom
  # frame collects the top-level structures.
  open_elems = [[nil, nil, []]]
  tokens.each do |token|
    case token[0]
    when :stag
      raw_stag = token[1]
      name = raw_stag[Pat::Name]
      name = name.downcase if fold_case
      name = HTree.frozen_string(name)
      open_elems.push([name, raw_stag, []])
    when :etag
      raw_etag = token[1]
      name = raw_etag[Pat::Name]
      name = name.downcase if fold_case
      name = HTree.frozen_string(name)
      # innermost open element with the same name, if any
      opener = nil
      open_elems.reverse_each do |frame|
        if frame[0] == name
          opener = frame
          break
        end
      end
      if opener
        # elements opened after the matching start tag are closed
        # without an end tag
        until opener.equal?(open_elems.last)
          _, unclosed_stag, unclosed_children = open_elems.pop
          open_elems.last[2] << [:elem, unclosed_stag, unclosed_children]
        end
        _, matched_stag, matched_children = open_elems.pop
        open_elems.last[2] << [:elem, matched_stag, matched_children, raw_etag]
      else
        open_elems.last[2] << [:bogus_etag, raw_etag]
      end
    else
      open_elems.last[2] << token
    end
  end
  # whatever is still open at the end of input is closed without an end tag
  while 1 < open_elems.length
    _, unclosed_stag, unclosed_children = open_elems.pop
    open_elems.last[2] << [:elem, unclosed_stag, unclosed_children]
  end
  open_elems[0][2]
end
128
+
129
# Returns a new list in which every :elem structure of
# +structure_list+ has been passed through fix_element.  Structures
# that fix_element hands back as not belonging to the element are
# processed next, ahead of the remaining input.  +structure_list+
# itself is not modified.
def HTree.fix_structure_list(structure_list, is_xml, is_html)
  fixed = []
  pending = structure_list.dup
  until pending.empty?
    structure = pending.shift
    unless structure[0] == :elem
      fixed << structure
      next
    end
    fixed_elem, leftover = fix_element(structure, [], [], is_xml, is_html)
    fixed << fixed_elem
    pending = leftover + pending
  end
  fixed
end
144
+
145
# Repairs the nesting of one :elem structure.
#
# Returns a pair: the fixed element and the list of structures that
# were recorded as children of +elem+ but cannot stay inside it.
#
# * An element with an end tag keeps all its children (they are fixed
#   recursively) and hands nothing back.
# * An element whose ElementContent entry is :EMPTY keeps no children;
#   all of them are handed back.
# * Otherwise children are kept up to the first child element whose
#   tag may not be contained; that child and everything after it are
#   handed back.
#
# +excluded_tags+ and +included_tags+ accumulate the
# ElementExclusions / ElementInclusions of the enclosing elements.
# Neither +elem+ nor its children array is modified.
def HTree.fix_element(elem, excluded_tags, included_tags, is_xml, is_html)
  stag_raw_string = elem[1]
  children = elem[2]
  if etag_raw_string = elem[3]
    return [:elem, stag_raw_string, fix_structure_list(children, is_xml, is_html), etag_raw_string], []
  end

  fold_case = !is_xml && is_html
  tagname = stag_raw_string[Pat::Name]
  tagname = tagname.downcase if fold_case
  content_model = ElementContent[tagname]
  return [:elem, stag_raw_string, []], children if content_model == :EMPTY

  possible_tags = content_model == :CDATA ? [] : content_model
  if possible_tags
    excluded_tags2 = ElementExclusions[tagname]
    included_tags2 = ElementInclusions[tagname]
    excluded_tags |= excluded_tags2 if excluded_tags2
    included_tags |= included_tags2 if included_tags2
    containable_tags = (possible_tags | included_tags) - excluded_tags
    uncontainable_tags = ElementContent.keys - containable_tags
  else
    # If the tagname is unknown, it is assumed that any element
    # except excluded can be contained.
    uncontainable_tags = excluded_tags
  end

  fixed_children = []
  # Work on a copy: shift/unshift below must not alter the caller's
  # structure (fix_structure_list takes the same precaution).
  rest = children.dup
  until rest.empty?
    if rest[0][0] == :elem
      child = rest.shift
      child_tagname = child[1][Pat::Name]
      child_tagname = child_tagname.downcase if fold_case
      if uncontainable_tags.include? child_tagname
        # this child ends the element; hand it back with the rest
        rest.unshift child
        break
      else
        fixed_child, rest2 = fix_element(child, excluded_tags, included_tags, is_xml, is_html)
        fixed_children << fixed_child
        rest = rest2 + rest
      end
    else
      fixed_children << rest.shift
    end
  end
  return [:elem, stag_raw_string, fixed_children], rest
end
196
+
197
# Converts one structure, as produced by parse_pairs and
# fix_structure_list, into the corresponding node object.
#
# The first item of +structure+ selects the node class.  For :elem the
# children are built recursively with the context of the parsed start
# tag; an element with neither children nor an end tag is built from
# its start tag alone.  Raises Exception for an unknown structure
# kind.
def HTree.build_node(structure, is_xml, is_html, inherited_context=DefaultContext)
  case structure[0]
  when :text_pcdata
    Text.parse_pcdata(structure[1])
  when :elem
    _, raw_stag, children, raw_etag = structure
    etag = raw_etag && ETag.parse(raw_etag, is_xml, is_html)
    stag = STag.parse(raw_stag, true, is_xml, is_html, inherited_context)
    if children.empty? && !etag
      Elem.new!(stag)
    else
      child_nodes = children.map do |child|
        build_node(child, is_xml, is_html, stag.context)
      end
      Elem.new!(stag, child_nodes, etag)
    end
  when :emptytag
    empty_stag = STag.parse(structure[1], false, is_xml, is_html, inherited_context)
    Elem.new!(empty_stag)
  when :bogus_etag
    BogusETag.parse(structure[1], is_xml, is_html)
  when :xmldecl
    XMLDecl.parse(structure[1])
  when :doctype
    DocType.parse(structure[1], is_xml, is_html)
  when :procins
    ProcIns.parse(structure[1])
  when :comment
    Comment.parse(structure[1])
  when :text_cdata_content
    Text.parse_cdata_content(structure[1])
  when :text_cdata_section
    Text.parse_cdata_section(structure[1])
  else
    raise Exception, "[bug] unknown structure: #{structure.inspect}"
  end
end
232
+
233
# Parses +raw_string+ as a start tag (+is_stag+ true) or as an
# empty-element tag (+is_stag+ false) and returns an STag whose
# raw_string is set to +raw_string+.
#
# The well-formed pattern is tried first and the lenient "invalid" one
# second; HTree::Error is raised when neither matches.  For HTML that
# is not XML the tag name and the attribute names are downcased.  An
# attribute given as a bare value takes its name from
# OmittedAttrName for the tag, falling back to the downcased value,
# or to the value itself when the tag has no entry.
def STag.parse(raw_string, is_stag, is_xml, is_html, inherited_context=DefaultContext)
  fold_case = !is_xml && is_html
  attrs = []
  if (is_stag ? /\A#{Pat::ValidStartTag_C}\z/o : /\A#{Pat::ValidEmptyTag_C}\z/o) =~ raw_string
    tag = Regexp.last_match
    qname = tag[1]
    tag[2].scan(Pat::ValidAttr_C) do
      attr = Regexp.last_match
      attrs << (attr[5] ? [nil, attr[5]] : [attr[1], attr[2] || attr[3] || attr[4]])
    end
  elsif (is_stag ? /\A#{Pat::InvalidStartTag_C}\z/o : /\A#{Pat::InvalidEmptyTag_C}\z/o) =~ raw_string
    tag = Regexp.last_match
    qname = tag[1]
    last_attr = tag[3]
    tag[2].scan(Pat::InvalidAttr1_C) do
      attr = Regexp.last_match
      attrs << (attr[5] ? [nil, attr[5]] : [attr[1], attr[2] || attr[3] || attr[4]])
    end
    if last_attr
      # the global match variables are kept here so that a failed
      # match still yields nils rather than an error
      /#{Pat::InvalidAttr1End_C}/o =~ last_attr
      attrs << [$1, $2 || $3]
    end
  else
    raise HTree::Error, "cannot recognize as start tag or empty tag: #{raw_string.inspect}"
  end

  qname = qname.downcase if fold_case

  attrs.map! do |aname, aval|
    if aname
      aname = aname.downcase if fold_case
      [aname, Text.parse_pcdata(aval)]
    else
      val2name = OmittedAttrName[qname]
      if val2name
        lowered = aval.downcase
        aname = val2name.fetch(lowered, lowered)
      else
        aname = aval
      end
      [aname, Text.new(aval)]
    end
  end

  stag = STag.new(qname, attrs, inherited_context)
  stag.raw_string = raw_string
  stag
end
275
+
276
# Parses +raw_string+ as an end tag and returns a new instance of the
# receiver with raw_string set.  The tag name is downcased for HTML
# that is not XML.  Raises HTree::Error when +raw_string+ does not
# match Pat::EndTag_C.
def ETag.parse(raw_string, is_xml, is_html)
  matched = /\A#{Pat::EndTag_C}\z/o.match(raw_string)
  if matched.nil?
    raise HTree::Error, "cannot recognize as end tag: #{raw_string.inspect}"
  end

  qname = matched[1]
  qname = qname.downcase if !is_xml && is_html

  etag = new(qname)
  etag.raw_string = raw_string
  etag
end
288
+
289
# Parses +raw_string+, an end tag without a matching start tag, and
# returns a new instance of the receiver with raw_string set.  The tag
# name is downcased for HTML that is not XML.  Raises HTree::Error
# when +raw_string+ does not match Pat::EndTag_C.
def BogusETag.parse(raw_string, is_xml, is_html)
  matched = /\A#{Pat::EndTag_C}\z/o.match(raw_string)
  if matched.nil?
    raise HTree::Error, "cannot recognize as end tag: #{raw_string.inspect}"
  end

  qname = matched[1]
  qname = qname.downcase if !is_xml && is_html

  bogus = new(qname)
  bogus.raw_string = raw_string
  bogus
end
301
+
302
# Builds a Text from parsed character data, repairing sloppy
# references on the way:
# * a reference that already ends with ";" is kept as it is,
# * a numeric reference without ";" gets one appended,
# * a lone "&" becomes "&amp;",
# * a name without ";" is completed when NamedCharactersPattern
#   matches it; otherwise its "&" is escaped.
# The result's raw_string is set to the unmodified +raw_string+.
def Text.parse_pcdata(raw_string)
  fixed = raw_string.gsub(/&(?:(?:#[0-9]+|#x[0-9a-fA-F]+|([A-Za-z][A-Za-z0-9]*));?)?/o) do |ref|
    # fetch the capture first: the matches below overwrite it
    name = $1
    if /;\z/ =~ ref
      ref
    elsif /\A&#/ =~ ref
      "#{ref};"
    elsif '&' == ref
      '&amp;'
    elsif NamedCharactersPattern =~ name
      "&#{name};"
    else
      "&amp;#{name}"
    end
  end
  # reuse the original object when nothing had to be repaired
  fixed = raw_string if fixed == raw_string
  text = Text.new_internal(fixed)
  text.raw_string = raw_string
  text
end
325
+
326
# Builds a Text from +raw_string+ taken literally and records
# +raw_string+ as its raw_string.
def Text.parse_cdata_content(raw_string)
  text = Text.new(raw_string)
  text.raw_string = raw_string
  text
end
331
+
332
# Builds a Text from the first capture of Pat::CDATA_C matched against
# +raw_string+ and records +raw_string+ as its raw_string.
# Raises HTree::Error when +raw_string+ does not match.
def Text.parse_cdata_section(raw_string)
  section = /\A#{Pat::CDATA_C}\z/o.match(raw_string)
  if section.nil?
    raise HTree::Error, "cannot recognize as CDATA section: #{raw_string.inspect}"
  end

  text = Text.new(section[1])
  text.raw_string = raw_string
  text
end
343
+
344
# Parses +raw_string+ as an XML declaration and returns an XMLDecl
# with raw_string set.  The version, the encoding and the standalone
# flag are each read from a pair of alternative captures of
# Pat::XmlDecl_C; standalone becomes true for 'yes', false for 'no'
# and nil otherwise.  Raises HTree::Error when +raw_string+ does not
# match.
def XMLDecl.parse(raw_string)
  decl = /\A#{Pat::XmlDecl_C}\z/o.match(raw_string)
  if decl.nil?
    raise HTree::Error, "cannot recognize as XML declaration: #{raw_string.inspect}"
  end

  version = decl[1] || decl[2]
  encoding = decl[3] || decl[4]
  standalone =
    case decl[5] || decl[6]
    when 'yes' then true
    when 'no' then false
    else nil
    end

  xmldecl = XMLDecl.new(version, encoding, standalone)
  xmldecl.raw_string = raw_string
  xmldecl
end
364
+
365
# Parses +raw_string+ as a document type declaration and returns a
# DocType with raw_string set.  The public and the system identifier
# are each read from a pair of alternative captures of
# Pat::DocType_C.  The root element name is downcased for HTML that
# is not XML.  Raises HTree::Error when +raw_string+ does not match.
def DocType.parse(raw_string, is_xml, is_html)
  unless /\A#{Pat::DocType_C}\z/o =~ raw_string
    # the message used to say "XML declaration", copied from XMLDecl.parse
    raise HTree::Error, "cannot recognize as document type declaration: #{raw_string.inspect}"
  end

  root_element_name = $1
  public_identifier = $2 || $3
  system_identifier = $4 || $5

  root_element_name = root_element_name.downcase if !is_xml && is_html

  result = DocType.new(root_element_name, public_identifier, system_identifier)
  result.raw_string = raw_string
  result
end
380
+
381
# Parses +raw_string+ as a processing instruction and returns a
# ProcIns built from the first two captures of Pat::XmlProcIns_C
# (target and content), with raw_string set.
# Raises HTree::Error when +raw_string+ does not match.
def ProcIns.parse(raw_string)
  instruction = /\A#{Pat::XmlProcIns_C}\z/o.match(raw_string)
  if instruction.nil?
    raise HTree::Error, "cannot recognize as processing instruction: #{raw_string.inspect}"
  end

  procins = ProcIns.new(instruction[1], instruction[2])
  procins.raw_string = raw_string
  procins
end
393
+
394
# Parses +raw_string+ as a comment and returns a Comment built from
# the first capture of Pat::Comment_C, with raw_string set.
# Raises HTree::Error when +raw_string+ does not match.
def Comment.parse(raw_string)
  matched = /\A#{Pat::Comment_C}\z/o.match(raw_string)
  if matched.nil?
    raise HTree::Error, "cannot recognize as comment: #{raw_string.inspect}"
  end

  comment = Comment.new(matched[1])
  comment.raw_string = raw_string
  comment
end
405
+
406
+ end
407
+ # :startdoc: