feedtools 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. data/CHANGELOG +11 -0
  2. data/lib/feed_tools.rb +2496 -810
  3. data/lib/feed_tools/vendor/builder.rb +2 -0
  4. data/lib/feed_tools/vendor/builder/blankslate.rb +2 -0
  5. data/lib/feed_tools/vendor/builder/xmlbase.rb +2 -1
  6. data/lib/feed_tools/vendor/builder/xmlevents.rb +2 -0
  7. data/lib/feed_tools/vendor/builder/xmlmarkup.rb +4 -2
  8. data/lib/feed_tools/vendor/htree.rb +97 -0
  9. data/lib/feed_tools/vendor/htree/container.rb +10 -0
  10. data/lib/feed_tools/vendor/htree/context.rb +67 -0
  11. data/lib/feed_tools/vendor/htree/display.rb +27 -0
  12. data/lib/feed_tools/vendor/htree/doc.rb +149 -0
  13. data/lib/feed_tools/vendor/htree/elem.rb +262 -0
  14. data/lib/feed_tools/vendor/htree/encoder.rb +163 -0
  15. data/lib/feed_tools/vendor/htree/equality.rb +218 -0
  16. data/lib/feed_tools/vendor/htree/extract_text.rb +37 -0
  17. data/lib/feed_tools/vendor/htree/fstr.rb +33 -0
  18. data/lib/feed_tools/vendor/htree/gencode.rb +97 -0
  19. data/lib/feed_tools/vendor/htree/htmlinfo.rb +672 -0
  20. data/lib/feed_tools/vendor/htree/inspect.rb +108 -0
  21. data/lib/feed_tools/vendor/htree/leaf.rb +94 -0
  22. data/lib/feed_tools/vendor/htree/loc.rb +367 -0
  23. data/lib/feed_tools/vendor/htree/modules.rb +48 -0
  24. data/lib/feed_tools/vendor/htree/name.rb +124 -0
  25. data/lib/feed_tools/vendor/htree/output.rb +207 -0
  26. data/lib/feed_tools/vendor/htree/parse.rb +407 -0
  27. data/lib/feed_tools/vendor/htree/raw_string.rb +124 -0
  28. data/lib/feed_tools/vendor/htree/regexp-util.rb +15 -0
  29. data/lib/feed_tools/vendor/htree/rexml.rb +130 -0
  30. data/lib/feed_tools/vendor/htree/scan.rb +166 -0
  31. data/lib/feed_tools/vendor/htree/tag.rb +111 -0
  32. data/lib/feed_tools/vendor/htree/template.rb +909 -0
  33. data/lib/feed_tools/vendor/htree/text.rb +115 -0
  34. data/lib/feed_tools/vendor/htree/traverse.rb +465 -0
  35. data/rakefile +1 -1
  36. data/test/rss_test.rb +97 -0
  37. metadata +30 -1
@@ -0,0 +1,48 @@
1
+ # :stopdoc:
2
# Declares the HTree class and module hierarchy.  Only the inheritance and
# mix-in relationships are set up here; no methods are defined.
module HTree
  # Names and namespace contexts.
  class Name
    include HTree
  end
  class Context
    include HTree
  end

  # Tags.
  module Tag
    include HTree
  end
  class STag
    include Tag
  end
  class ETag
    include Tag
  end

  # Node kinds: containers (Doc, Elem) and leaves (all the others).
  module Node
    include HTree
  end
  module Container
    include Node
  end
  module Leaf
    include Node
  end
  class Doc
    include Container
  end
  class Elem
    include Container
  end
  class Text
    include Leaf
  end
  class XMLDecl
    include Leaf
  end
  class DocType
    include Leaf
  end
  class ProcIns
    include Leaf
  end
  class Comment
    include Leaf
  end
  class BogusETag
    include Leaf
  end

  # Each node family paired with the concrete node classes that belong to it.
  node_families = [
    [Container, [Doc, Elem]],
    [Leaf, [Text, XMLDecl, DocType, ProcIns, Comment, BogusETag]],
  ]

  # Traversal mix-ins: every node class X gets its own X::Trav module, which
  # includes its family's Trav module and is itself included into X.
  module Traverse
  end
  module Container::Trav
    include Traverse
  end
  module Leaf::Trav
    include Traverse
  end
  node_families.each do |family, node_classes|
    node_classes.each do |node_class|
      trav = Module.new
      trav.send(:include, family.const_get(:Trav))
      node_class.const_set(:Trav, trav)
      node_class.send(:include, trav)
    end
  end

  # Locations: every node class X gets X::Loc, a subclass of Location that
  # mixes in X::Trav and its family's Loc module.
  class Location
    include HTree
  end
  module Container::Loc
  end
  module Leaf::Loc
  end
  node_families.each do |family, node_classes|
    node_classes.each do |node_class|
      loc = Class.new(Location)
      loc.send(:include, node_class.const_get(:Trav), family.const_get(:Loc))
      node_class.const_set(:Loc, loc)
    end
  end

  # Error raised by HTree.
  class Error < StandardError
  end
end
48
+ # :startdoc:
@@ -0,0 +1,124 @@
1
+ # :stopdoc:
2
+ require 'htree/scan' # for Pat::Nmtoken
3
+ require 'htree/context'
4
+
5
+ module HTree # :nodoc:
6
# Name is the name of an element or of an attribute.
# It is made of a namespace prefix, a namespace URI and a local name.
class Name # :nodoc:
  # How the textual forms map onto the three components:
  #
  #   element name                   prefix   uri   localname
  #   {u}n, n with xmlns=u           nil      'u'   'n'
  #   p{u}n, p:n with xmlns:p=u      'p'      'u'   'n'
  #   n with xmlns=''                nil      ''    'n'
  #
  #   attribute name
  #   xmlns=                         'xmlns'  nil   nil
  #   xmlns:n=                       'xmlns'  nil   'n'
  #   p{u}n=, p:n= with xmlns:p=u    'p'      'u'   'n'
  #   n=                             nil      ''    'n'

  # Builds the Name of an element from its textual form, resolving a
  # "prefix:" or the default namespace through context.namespace_uri.
  def Name.parse_element_name(name, context)
    if /\{(.*)\}/ =~ name
      # "{u}n" means "use default namespace",
      # "p{u}n" means "use the specified prefix p"
      braces = $~
      prefix = braces.pre_match
      prefix = nil if prefix == ''
      return Name.new(prefix, braces[1], braces.post_match)
    end

    if /:/ =~ name
      prefix, local = $`, $'
      uri = context.namespace_uri(prefix)
      return Name.new(prefix, uri, local) unless uri.empty?
    end

    default_uri = context.namespace_uri(nil)
    return Name.new(nil, default_uri, name) unless default_uri.empty?

    Name.new(nil, '', name)
  end

  # Builds the Name of an attribute from its textual form.  Unlike element
  # names, an unprefixed attribute never picks up the default namespace.
  def Name.parse_attribute_name(name, context)
    return Name.new('xmlns', nil, nil) if name == 'xmlns'
    return Name.new('xmlns', nil, $') if /\Axmlns:/ =~ name

    if /\{(.*)\}/ =~ name
      braces = $~
      prefix = braces.pre_match
      prefix = nil if prefix == ''
      return Name.new(prefix, braces[1], braces.post_match)
    end

    if /:/ =~ name
      prefix, local = $`, $'
      uri = context.namespace_uri(prefix)
      return Name.new(prefix, uri, local) unless uri.empty?
    end

    Name.new(nil, '', name)
  end

  # Every Name ever built, keyed by [prefix, uri, local name, class].
  NameCache = {}

  # Returns the cached Name for the given components, creating (and caching)
  # it on first use.  The stored components are frozen copies of the arguments.
  def Name.new(namespace_prefix, namespace_uri, local_name)
    lookup = [namespace_prefix, namespace_uri, local_name, self]
    return NameCache[lookup] if NameCache.has_key?(lookup)
    parts = lookup[0, 3].map {|part| part ? part.dup.freeze : part }
    NameCache[parts + [self]] = super(*parts)
  end

  # Stores the components and validates them; raises HTree::Error when the
  # prefix or local name is not an Nmtoken, when an xmlns name carries a URI,
  # or when any other name has a non-String URI.
  def initialize(namespace_prefix, namespace_uri, local_name)
    @namespace_prefix = namespace_prefix
    @namespace_uri = namespace_uri
    @local_name = local_name

    if namespace_prefix
      unless /\A#{Pat::Nmtoken}\z/o =~ namespace_prefix
        raise HTree::Error, "invalid namespace prefix: #{@namespace_prefix.inspect}"
      end
    end

    if local_name
      unless /\A#{Pat::Nmtoken}\z/o =~ local_name
        raise HTree::Error, "invalid local name: #{@local_name.inspect}"
      end
    end

    if namespace_prefix == 'xmlns'
      unless namespace_uri.nil?
        raise HTree::Error, "Name object for xmlns:* must not have namespace URI: #{@namespace_uri.inspect}"
      end
    elsif !(String === namespace_uri)
      raise HTree::Error, "invalid namespace URI: #{@namespace_uri.inspect}"
    end
  end
  attr_reader :namespace_prefix, :namespace_uri, :local_name

  # True for the names of namespace declarations (xmlns and xmlns:*).
  def xmlns?
    @namespace_prefix == 'xmlns' && @namespace_uri.nil?
  end

  # "{uri}local" when the name has a non-empty namespace URI, otherwise a
  # copy of the local name.
  def universal_name
    uri = @namespace_uri
    return @local_name.dup if !uri || uri.empty?
    "{#{uri}}#{@local_name}"
  end

  # "prefix:local" when the name has both a non-empty URI and a prefix;
  # otherwise a copy of the local name, or "xmlns" when there is none.
  def qualified_name
    uri = @namespace_uri
    if !uri || uri.empty?
      return @local_name ? @local_name.dup : "xmlns"
    end
    return @local_name.dup unless @namespace_prefix
    "#{@namespace_prefix}:#{@local_name}"
  end

  # "prefix{uri}local" or "{uri}local" when the name has a non-empty URI;
  # otherwise a copy of the local name, or "xmlns" when there is none.
  def to_s
    uri = @namespace_uri
    if !uri || uri.empty?
      return @local_name ? @local_name.dup : "xmlns"
    end
    return "{#{uri}}#{@local_name}" unless @namespace_prefix
    "#{@namespace_prefix}{#{uri}}#{@local_name}"
  end
end
123
+ end
124
+ # :startdoc:
@@ -0,0 +1,207 @@
1
+ # :stopdoc:
2
+ require 'htree/encoder'
3
+ require 'htree/doc'
4
+ require 'htree/elem'
5
+ require 'htree/leaf'
6
+ require 'htree/text'
7
+
8
+ module HTree # :nodoc:
9
+
10
class Text # :nodoc:
  # Replacement text for the characters escaped on output.
  ChRef = {
    '>' => '&gt;',
    '<' => '&lt;',
    '"' => '&quot;',
  }

  # Writes the text as element content, escaping '<' and '>'.
  def output(out, context)
    escaped = @rcdata.gsub(/[<>]/) {|char| ChRef[char] }
    out.output_text(escaped)
  end

  # The text as it must appear inside a double-quoted attribute value:
  # '<', '>' and '"' are escaped.
  def to_attvalue_content
    @rcdata.gsub(/[<>"]/) {|char| ChRef[char] }
  end

  # Writes the text as a double-quoted attribute value.
  def output_attvalue(out, context)
    quote = '"'
    out.output_string(quote)
    out.output_text(to_attvalue_content)
    out.output_string(quote)
  end
end
31
+
32
class Name # :nodoc:
  # Writes the name as it appears in a tag: "xmlns" or "xmlns:local" for a
  # namespace declaration, the qualified name otherwise.
  def output(out, context)
    # xxx: validate namespace prefix
    text =
      if !xmlns?
        qualified_name
      elsif @local_name
        "xmlns:#{@local_name}"
      else
        "xmlns"
      end
    out.output_string text
  end

  # Writes name="value" for the attribute whose value is +text+.
  def output_attribute(text, out, context)
    output(out, context)
    out.output_string('=')
    text.output_attvalue(out, context)
  end
end
52
+
53
class Doc # :nodoc:
  # Writes every child in order.  Only the first XML declaration and the
  # first document type declaration are written; later ones are dropped.
  def output(out, context)
    context = DefaultContext # discard outer context
    xmldecl_seen = false
    doctype_seen = false
    @children.each do |child|
      if child.respond_to?(:output_prolog_xmldecl)
        child.output_prolog_xmldecl(out, context) if !xmldecl_seen # xxx: encoding?
        xmldecl_seen = true
        next
      end
      if child.respond_to?(:output_prolog_doctypedecl)
        child.output_prolog_doctypedecl(out, context) if !doctype_seen
        doctype_seen = true
        next
      end
      child.output(out, context)
    end
  end
end
71
+
72
class Elem # :nodoc:
  # Writes the element: a single empty-element tag when @empty is set,
  # otherwise the start tag, the children and the end tag.  The children
  # are written with the context returned by the start tag.
  def output(out, context)
    return @stag.output_emptytag(out, context) if @empty

    inner_context = @stag.output_stag(out, context)
    @children.each do |child|
      child.output(out, inner_context)
    end
    @stag.output_etag(out, context)
  end
end
83
+
84
class STag # :nodoc:
  # Writes every attribute except the namespace declarations, each preceded
  # by a space, then lets the tag's own context write the declarations.
  # Returns whatever @context.output_namespaces returns.
  def output_attributes(out, context)
    @attributes.each do |attr_name, value|
      unless attr_name.xmlns?
        out.output_string(' ')
        attr_name.output_attribute(value, out, context)
      end
    end
    @context.output_namespaces(out, context)
  end

  # Writes the tag as an empty-element tag.  The closing "/>" goes on a new
  # line.  Returns the value of output_attributes.
  def output_emptytag(out, context)
    out.output_string('<')
    @name.output(out, context)
    inner_context = output_attributes(out, context)
    out.output_string("\n/>")
    inner_context
  end

  # Writes the tag as a start tag.  The closing ">" goes on a new line.
  # Returns the value of output_attributes.
  def output_stag(out, context)
    out.output_string('<')
    @name.output(out, context)
    inner_context = output_attributes(out, context)
    out.output_string("\n>")
    inner_context
  end

  # Writes the end tag matching this start tag.
  def output_etag(out, context)
    out.output_string('</')
    @name.output(out, context)
    out.output_string("\n>")
  end
end
116
+
117
class Context # :nodoc:
  # Writes the namespace declarations needed to move from outer_context to
  # this context.  A prefix bound to a different URI in the outer context is
  # written as an xmlns attribute; a prefix for which the outer context
  # returns nil is collected and handed to out.output_xmlns.
  # Returns outer_context.subst_namespaces(@namespaces).
  def output_namespaces(out, outer_context)
    undeclared = {}
    @namespaces.each do |prefix, uri|
      outer_uri = outer_context.namespace_uri(prefix)
      if outer_uri.nil?
        undeclared[prefix] = uri
        next
      end
      next if outer_uri == uri
      out.output_string(prefix ? " xmlns:#{prefix}=" : " xmlns=")
      Text.new(uri).output_attvalue(out, outer_context)
    end
    out.output_xmlns(undeclared) if !undeclared.empty?
    outer_context.subst_namespaces(@namespaces)
  end
end
139
+
140
class BogusETag # :nodoc:
  # A bogus end tag is never written: this method does nothing.
  def output(out, context)
    nil
  end
end
145
+
146
class XMLDecl # :nodoc:
  # Writes nothing: the declaration is emitted through
  # output_prolog_xmldecl instead.
  def output(out, context)
    nil
  end

  # Writes the XML declaration.  The encoding and standalone
  # pseudo-attributes appear only when they are set.
  def output_prolog_xmldecl(out, context)
    out.output_string "<?xml version=\"#{@version}\""
    out.output_string " encoding=\"#{@encoding}\"" if @encoding
    unless @standalone.nil?
      answer = @standalone ? 'yes' : 'no'
      out.output_string " standalone=\"#{answer}\""
    end
    out.output_string "?>"
  end
end
162
+
163
class DocType # :nodoc:
  # Writes nothing: the declaration is emitted through
  # output_prolog_doctypedecl instead.
  def output(out, context)
    nil
  end

  # The external-ID part of the declaration: PUBLIC "..." or SYSTEM,
  # followed by the quoted system identifier when there is one.
  def generate_content # :nodoc:
    parts = [@public_identifier ? "PUBLIC \"#{@public_identifier}\"" : "SYSTEM"]
    # Although a system identifier is not omissible in XML,
    # we cannot output it if it is not given.
    if @system_identifier
      # Use single quotes only when the identifier contains a double quote.
      quote = @system_identifier.include?('"') ? "'" : '"'
      parts << "#{quote}#{@system_identifier}#{quote}"
    end
    parts.join(' ')
  end

  # Writes the complete <!DOCTYPE ...> declaration.
  def output_prolog_doctypedecl(out, context)
    declaration = "<!DOCTYPE #{@root_element_name} #{generate_content}>"
    out.output_string declaration
  end
end
191
+
192
class ProcIns # :nodoc:
  # Writes the processing instruction; the content, when there is one, is
  # separated from the target by a single space.
  def output(out, context)
    out.output_string("<?#{@target}")
    if @content
      out.output_string(" #{@content}")
    end
    out.output_string("?>")
  end
end
199
+
200
class Comment # :nodoc:
  # Writes the comment, delimiters included.
  def output(out, context)
    markup = "<!--#{@content}-->"
    out.output_string(markup)
  end
end
205
+
206
+ end
207
+ # :startdoc:
@@ -0,0 +1,407 @@
1
+ # :stopdoc:
2
+ require 'htree/scan'
3
+ require 'htree/htmlinfo'
4
+ require 'htree/text'
5
+ require 'htree/tag'
6
+ require 'htree/leaf'
7
+ require 'htree/doc'
8
+ require 'htree/elem'
9
+ require 'htree/raw_string'
10
+ require 'htree/context'
11
+ require 'htree/encoder'
12
+ require 'htree/fstr'
13
+
14
+ module HTree # :nodoc:
15
# HTree.parse parses <i>input</i> and returns a document tree
# represented by HTree::Doc.
#
# <i>input</i> should be a String or
# an object which responds to the read or open method.
# For example, IO, StringIO, Pathname, URI::HTTP and URI::FTP are acceptable.
# Note that the URIs need open-uri.
#
# HTree.parse guesses whether <i>input</i> is HTML and whether it is XML.
#
# If it is guessed to be HTML, the default namespace of the result is set to
# http://www.w3.org/1999/xhtml, whether or not <i>input</i> has an XML
# namespace declaration, and even if it is pre-XML HTML.
#
# If it is guessed to be HTML and not XML, all element and attribute names
# are downcased.
#
# If the opened file or the read content has a charset method,
# HTree.parse decodes it according to $KCODE before parsing.
# Otherwise HTree.parse assumes that the character encoding of the content
# is compatible with $KCODE.
# Note that the charset method is provided by URI::HTTP with open-uri.
def HTree.parse(input)
  HTree.with_frozen_string_hash do
    parse_as(input, false)
  end
end
40
+
41
# HTree.parse_xml parses <i>input</i> as XML and
# returns a document tree represented by HTree::Doc.
#
# It behaves almost the same as HTree.parse, but it assumes that
# <i>input</i> is XML even if there is no XML declaration.
# This assumption causes the following differences:
# * element names are not downcased.
# * the content of <script> and <style> elements is PCDATA, not CDATA.
def HTree.parse_xml(input)
  HTree.with_frozen_string_hash do
    parse_as(input, true)
  end
end
54
+
55
# Shared implementation of HTree.parse and HTree.parse_xml.
#
# Reads the text out of +input+ (through read or open when available),
# converts it with Iconv when a charset different from
# Encoder.internal_charset is reported, scans it into tokens and builds the
# HTree::Doc.  +is_xml+ is passed to HTree.scan, which returns the final
# is_xml / is_html verdicts.
#
# Raises SecurityError when +input+ is tainted and $SAFE is 1 or more.
def HTree.parse_as(input, is_xml)
  raise SecurityError, "input tainted" if input.tainted? && 1 <= $SAFE

  input_charset = nil
  if input.respond_to?(:read) # IO, StringIO
    input = input.read.untaint
    # The charset is looked up on the content that was just read.
    input_charset = input.charset if input.respond_to?(:charset)
  elsif input.respond_to?(:open) # Pathname, URI with open-uri
    input.open do |file|
      input = file.read.untaint
      input_charset = file.charset if file.respond_to?(:charset)
    end
  end

  internal_charset_differs = input_charset && input_charset != Encoder.internal_charset
  if internal_charset_differs
    input = Iconv.conv(Encoder.internal_charset, input_charset, input)
  end

  tokens = []
  is_xml, is_html = HTree.scan(input, is_xml) {|token| tokens << token }

  if is_html
    context = HTMLContext
  else
    context = DefaultContext
  end

  paired = parse_pairs(tokens, is_xml, is_html)
  repaired = fix_structure_list(paired, is_xml, is_html)
  Doc.new(repaired.map {|structure| build_node(structure, is_xml, is_html, context) })
end
83
+
84
# Pairs start tags with end tags and returns the resulting list of nested
# structures.
#
# Each open element is a stack frame [tag name, start tag string, children].
# An end tag closes the nearest open element of the same name, and every
# element opened after that one is closed without an end tag.  An end tag
# with no matching open element becomes [:bogus_etag, raw].  A closed
# element is [:elem, start tag string, children] plus the end tag string
# when it had one.  All other tokens are passed through unchanged.
def HTree.parse_pairs(tokens, is_xml, is_html)
  lowercase = !is_xml && is_html
  stack = [[nil, nil, []]] # bottom frame collects the top-level structures

  tokens.each do |token|
    case token[0]
    when :stag
      raw = token[1]
      name = raw[Pat::Name]
      name = name.downcase if lowercase
      stack.push([HTree.frozen_string(name), raw, []])
    when :etag
      raw = token[1]
      name = raw[Pat::Name]
      name = name.downcase if lowercase
      name = HTree.frozen_string(name)

      # Innermost open element with the same name, if any.
      opener = stack.reverse.find {|frame| frame[0] == name }
      if opener
        # Close everything opened after it, without an end tag.
        until opener.equal?(stack.last)
          unclosed = stack.pop
          stack.last[2] << [:elem, unclosed[1], unclosed[2]]
        end
        stack.pop
        stack.last[2] << [:elem, opener[1], opener[2], raw]
      else
        stack.last[2] << [:bogus_etag, raw]
      end
    else
      stack.last[2] << token
    end
  end

  # Elements still open at the end of input are closed without an end tag.
  until stack.length <= 1
    unclosed = stack.pop
    stack.last[2] << [:elem, unclosed[1], unclosed[2]]
  end
  stack[0][2]
end
128
+
129
# Runs fix_element over every :elem structure of +structure_list+ and
# returns the repaired list.  Whatever fix_element hands back as not
# belonging to the element is put at the front of the remaining items and
# processed next.  Non-element structures are kept as they are.
def HTree.fix_structure_list(structure_list, is_xml, is_html)
  fixed = []
  pending = structure_list.dup
  until pending.empty?
    item = pending.shift
    unless item[0] == :elem
      fixed << item
      next
    end
    element, spilled = fix_element(item, [], [], is_xml, is_html)
    fixed << element
    pending = spilled + pending
  end
  fixed
end
144
+
145
# Repairs one :elem structure and returns a pair
# [repaired element, structures that do not belong inside it].
#
# * An element that has an end tag keeps all its children (repaired
#   recursively) and gives nothing back.
# * An element without an end tag whose content model is :EMPTY keeps no
#   children; all of them are given back.
# * Any other element without an end tag keeps its children up to the first
#   child element it cannot contain; that child and everything after it are
#   given back.
#
# +excluded_tags+ and +included_tags+ are the exclusions and inclusions
# accumulated from the enclosing elements.
def HTree.fix_element(elem, excluded_tags, included_tags, is_xml, is_html)
  stag_raw_string = elem[1]
  children = elem[2]
  etag_raw_string = elem[3]

  if etag_raw_string
    repaired = fix_structure_list(children, is_xml, is_html)
    return [:elem, stag_raw_string, repaired, etag_raw_string], []
  end

  lowercase = !is_xml && is_html
  tagname = stag_raw_string[Pat::Name]
  tagname = tagname.downcase if lowercase
  content_model = ElementContent[tagname]

  return [:elem, stag_raw_string, []], children if content_model == :EMPTY

  possible_tags = content_model == :CDATA ? [] : content_model
  if possible_tags
    own_exclusions = ElementExclusions[tagname]
    own_inclusions = ElementInclusions[tagname]
    excluded_tags |= own_exclusions if own_exclusions
    included_tags |= own_inclusions if own_inclusions
    containable_tags = (possible_tags | included_tags) - excluded_tags
    uncontainable_tags = ElementContent.keys - containable_tags
  else
    # If the tagname is unknown, it is assumed that any element
    # except excluded can be contained.
    uncontainable_tags = excluded_tags
  end

  fixed_children = []
  rest = children # same array: the shifts below consume elem[2] itself
  until rest.empty?
    head = rest[0]
    unless head[0] == :elem
      fixed_children << rest.shift
      next
    end

    child_tagname = head[1][Pat::Name]
    child_tagname = child_tagname.downcase if lowercase
    # The first element that cannot be contained ends this element.
    break if uncontainable_tags.include?(child_tagname)

    rest.shift
    fixed_child, spilled = fix_element(head, excluded_tags, included_tags, is_xml, is_html)
    fixed_children << fixed_child
    rest = spilled + rest
  end
  return [:elem, stag_raw_string, fixed_children], rest
end
196
+
197
# Turns one structure, as produced by parse_pairs / fix_structure_list,
# into the corresponding node object.  Child structures of an element are
# built recursively with the context of its start tag.
#
# Raises Exception for a structure kind that is not handled here.
def HTree.build_node(structure, is_xml, is_html, inherited_context=DefaultContext)
  kind = structure[0]
  raw = structure[1]
  case kind
  when :elem
    child_structures = structure[2]
    etag_raw = structure[3]
    etag = etag_raw && ETag.parse(etag_raw, is_xml, is_html)
    stag = STag.parse(raw, true, is_xml, is_html, inherited_context)
    # No children and no end tag: build the element from the start tag alone.
    return Elem.new!(stag) if child_structures.empty? && !etag
    child_nodes = child_structures.map {|child| build_node(child, is_xml, is_html, stag.context) }
    Elem.new!(stag, child_nodes, etag)
  when :emptytag
    Elem.new!(STag.parse(raw, false, is_xml, is_html, inherited_context))
  when :bogus_etag
    BogusETag.parse(raw, is_xml, is_html)
  when :text_pcdata
    Text.parse_pcdata(raw)
  when :text_cdata_content
    Text.parse_cdata_content(raw)
  when :text_cdata_section
    Text.parse_cdata_section(raw)
  when :xmldecl
    XMLDecl.parse(raw)
  when :doctype
    DocType.parse(raw, is_xml, is_html)
  when :procins
    ProcIns.parse(raw)
  when :comment
    Comment.parse(raw)
  else
    raise Exception, "[bug] unknown structure: #{structure.inspect}"
  end
end
232
+
233
# Parses the raw text of a start tag (+is_stag+ true) or of an empty-element
# tag (+is_stag+ false) into an STag.
#
# The valid-tag pattern is tried first, then the invalid-tag pattern.
# For HTML that is not XML, the element name and the attribute names are
# downcased.  An attribute given as a bare value gets its name from
# OmittedAttrName when the element is listed there, and is otherwise named
# after the value itself.
#
# Raises HTree::Error when neither pattern matches.
def STag.parse(raw_string, is_stag, is_xml, is_html, inherited_context=DefaultContext)
  lowercase = !is_xml && is_html
  attrs = []

  valid_tag = is_stag ? /\A#{Pat::ValidStartTag_C}\z/o : /\A#{Pat::ValidEmptyTag_C}\z/o
  if valid_tag =~ raw_string
    qname, attr_text = $1, $2
    attr_text.scan(Pat::ValidAttr_C) do
      attrs << ($5 ? [nil, $5] : [$1, $2 || $3 || $4])
    end
  else
    invalid_tag = is_stag ? /\A#{Pat::InvalidStartTag_C}\z/o : /\A#{Pat::InvalidEmptyTag_C}\z/o
    unless invalid_tag =~ raw_string
      raise HTree::Error, "cannot recognize as start tag or empty tag: #{raw_string.inspect}"
    end
    # Captured before scan below replaces the match variables.
    qname, attr_text, last_attr = $1, $2, $3
    attr_text.scan(Pat::InvalidAttr1_C) do
      attrs << ($5 ? [nil, $5] : [$1, $2 || $3 || $4])
    end
    if last_attr
      /#{Pat::InvalidAttr1End_C}/o =~ last_attr
      attrs << [$1, $2 || $3]
    end
  end

  qname = qname.downcase if lowercase

  attrs.map! do |aname, aval|
    if aname
      aname = aname.downcase if lowercase
      [aname, Text.parse_pcdata(aval)]
    else
      # Attribute written as a bare value: work out its name.
      val2name = OmittedAttrName[qname]
      if val2name
        folded = aval.downcase
        aname = val2name.fetch(folded, folded)
      else
        aname = aval
      end
      [aname, Text.new(aval)]
    end
  end

  stag = STag.new(qname, attrs, inherited_context)
  stag.raw_string = raw_string
  stag
end
275
+
276
# Parses the raw text of an end tag into an ETag.  The name is downcased
# for HTML that is not XML.  Raises HTree::Error when the text does not
# match Pat::EndTag_C.
def ETag.parse(raw_string, is_xml, is_html)
  matched = /\A#{Pat::EndTag_C}\z/o.match(raw_string)
  if matched.nil?
    raise HTree::Error, "cannot recognize as end tag: #{raw_string.inspect}"
  end

  qname = matched[1]
  qname = qname.downcase if !is_xml && is_html

  etag = new(qname)
  etag.raw_string = raw_string
  etag
end
288
+
289
# Parses the raw text of an end tag that has no matching start tag into a
# BogusETag.  The name is downcased for HTML that is not XML.  Raises
# HTree::Error when the text does not match Pat::EndTag_C.
def BogusETag.parse(raw_string, is_xml, is_html)
  matched = /\A#{Pat::EndTag_C}\z/o.match(raw_string)
  if matched.nil?
    raise HTree::Error, "cannot recognize as end tag: #{raw_string.inspect}"
  end

  qname = matched[1]
  qname = qname.downcase if !is_xml && is_html

  bogus = new(qname)
  bogus.raw_string = raw_string
  bogus
end
301
+
302
# Builds a Text from parsed character data, repairing its references:
# * a reference already ending in ';' is kept as it is,
# * a numeric reference without ';' gets one appended,
# * a lone '&' becomes '&amp;',
# * '&name' without ';' becomes '&name;' when NamedCharactersPattern
#   matches the name, and '&amp;name' otherwise.
def Text.parse_pcdata(raw_string)
  reference = /&(?:(?:#[0-9]+|#x[0-9a-fA-F]+|([A-Za-z][A-Za-z0-9]*));?)?/o
  fixed = raw_string.gsub(reference) do |ref|
    entity_name = $1 # saved first: the matches below replace $1
    if /;\z/ =~ ref
      ref
    elsif /\A&#/ =~ ref
      "#{ref};"
    elsif ref == '&'
      '&amp;'
    elsif NamedCharactersPattern =~ entity_name
      "&#{entity_name};"
    else
      "&amp;#{entity_name}"
    end
  end
  # Nothing was repaired: use the original string object itself.
  fixed = raw_string if fixed == raw_string
  text = Text.new_internal(fixed)
  text.raw_string = raw_string
  text
end
325
+
326
# Builds a Text from the content of a CDATA element; the raw string is
# passed to Text.new unchanged.
def Text.parse_cdata_content(raw_string)
  text = Text.new(raw_string)
  text.raw_string = raw_string
  text
end
331
+
332
# Builds a Text from a CDATA section, using the first capture of
# Pat::CDATA_C as the content.  Raises HTree::Error when the text does not
# match that pattern.
def Text.parse_cdata_section(raw_string)
  matched = /\A#{Pat::CDATA_C}\z/o.match(raw_string)
  if matched.nil?
    raise HTree::Error, "cannot recognize as CDATA section: #{raw_string.inspect}"
  end

  text = Text.new(matched[1])
  text.raw_string = raw_string
  text
end
343
+
344
# Parses the raw text of an XML declaration into an XMLDecl.
# standalone becomes true for 'yes', false for 'no' and nil otherwise.
# Raises HTree::Error when the text does not match Pat::XmlDecl_C.
def XMLDecl.parse(raw_string)
  matched = /\A#{Pat::XmlDecl_C}\z/o.match(raw_string)
  if matched.nil?
    raise HTree::Error, "cannot recognize as XML declaration: #{raw_string.inspect}"
  end

  # Each value is taken from the first non-nil capture of its pair.
  version = matched[1] || matched[2]
  encoding = matched[3] || matched[4]
  standalone = { 'yes' => true, 'no' => false }[matched[5] || matched[6]]

  decl = XMLDecl.new(version, encoding, standalone)
  decl.raw_string = raw_string
  decl
end
364
+
365
# Parses the raw text of a document type declaration into a DocType.
#
# The root element name is the first capture of Pat::DocType_C; the public
# and system identifiers are each taken from the first non-nil capture of
# their pair.  The root element name is downcased for HTML that is not XML.
#
# Raises HTree::Error when the text does not match Pat::DocType_C.
def DocType.parse(raw_string, is_xml, is_html)
  unless /\A#{Pat::DocType_C}\z/o =~ raw_string
    # The message used to say "XML declaration", copied from XMLDecl.parse.
    raise HTree::Error, "cannot recognize as document type declaration: #{raw_string.inspect}"
  end

  root_element_name = $1
  public_identifier = $2 || $3
  system_identifier = $4 || $5

  root_element_name = root_element_name.downcase if !is_xml && is_html

  result = DocType.new(root_element_name, public_identifier, system_identifier)
  result.raw_string = raw_string
  result
end
380
+
381
# Parses the raw text of a processing instruction into a ProcIns, taking
# the target and the content from the two captures of Pat::XmlProcIns_C.
# Raises HTree::Error when the text does not match that pattern.
def ProcIns.parse(raw_string)
  matched = /\A#{Pat::XmlProcIns_C}\z/o.match(raw_string)
  if matched.nil?
    raise HTree::Error, "cannot recognize as processing instruction: #{raw_string.inspect}"
  end

  instruction = ProcIns.new(matched[1], matched[2])
  instruction.raw_string = raw_string
  instruction
end
393
+
394
# Parses the raw text of a comment into a Comment, using the first capture
# of Pat::Comment_C as the content.  Raises HTree::Error when the text does
# not match that pattern.
def Comment.parse(raw_string)
  matched = /\A#{Pat::Comment_C}\z/o.match(raw_string)
  if matched.nil?
    raise HTree::Error, "cannot recognize as comment: #{raw_string.inspect}"
  end

  comment = Comment.new(matched[1])
  comment.raw_string = raw_string
  comment
end
405
+
406
+ end
407
+ # :startdoc: