rubysl-rexml 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (179) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +17 -0
  3. data/.travis.yml +8 -0
  4. data/Gemfile +4 -0
  5. data/LICENSE +25 -0
  6. data/README.md +29 -0
  7. data/Rakefile +1 -0
  8. data/lib/rexml/attlistdecl.rb +62 -0
  9. data/lib/rexml/attribute.rb +185 -0
  10. data/lib/rexml/cdata.rb +67 -0
  11. data/lib/rexml/child.rb +96 -0
  12. data/lib/rexml/comment.rb +80 -0
  13. data/lib/rexml/doctype.rb +271 -0
  14. data/lib/rexml/document.rb +230 -0
  15. data/lib/rexml/dtd/attlistdecl.rb +10 -0
  16. data/lib/rexml/dtd/dtd.rb +51 -0
  17. data/lib/rexml/dtd/elementdecl.rb +17 -0
  18. data/lib/rexml/dtd/entitydecl.rb +56 -0
  19. data/lib/rexml/dtd/notationdecl.rb +39 -0
  20. data/lib/rexml/element.rb +1227 -0
  21. data/lib/rexml/encoding.rb +71 -0
  22. data/lib/rexml/encodings/CP-1252.rb +103 -0
  23. data/lib/rexml/encodings/EUC-JP.rb +35 -0
  24. data/lib/rexml/encodings/ICONV.rb +22 -0
  25. data/lib/rexml/encodings/ISO-8859-1.rb +7 -0
  26. data/lib/rexml/encodings/ISO-8859-15.rb +72 -0
  27. data/lib/rexml/encodings/SHIFT-JIS.rb +37 -0
  28. data/lib/rexml/encodings/SHIFT_JIS.rb +1 -0
  29. data/lib/rexml/encodings/UNILE.rb +34 -0
  30. data/lib/rexml/encodings/US-ASCII.rb +30 -0
  31. data/lib/rexml/encodings/UTF-16.rb +35 -0
  32. data/lib/rexml/encodings/UTF-8.rb +18 -0
  33. data/lib/rexml/entity.rb +166 -0
  34. data/lib/rexml/formatters/default.rb +109 -0
  35. data/lib/rexml/formatters/pretty.rb +138 -0
  36. data/lib/rexml/formatters/transitive.rb +56 -0
  37. data/lib/rexml/functions.rb +382 -0
  38. data/lib/rexml/instruction.rb +70 -0
  39. data/lib/rexml/light/node.rb +196 -0
  40. data/lib/rexml/namespace.rb +47 -0
  41. data/lib/rexml/node.rb +75 -0
  42. data/lib/rexml/output.rb +24 -0
  43. data/lib/rexml/parent.rb +166 -0
  44. data/lib/rexml/parseexception.rb +51 -0
  45. data/lib/rexml/parsers/baseparser.rb +503 -0
  46. data/lib/rexml/parsers/lightparser.rb +60 -0
  47. data/lib/rexml/parsers/pullparser.rb +196 -0
  48. data/lib/rexml/parsers/sax2parser.rb +238 -0
  49. data/lib/rexml/parsers/streamparser.rb +46 -0
  50. data/lib/rexml/parsers/treeparser.rb +97 -0
  51. data/lib/rexml/parsers/ultralightparser.rb +56 -0
  52. data/lib/rexml/parsers/xpathparser.rb +698 -0
  53. data/lib/rexml/quickpath.rb +266 -0
  54. data/lib/rexml/rexml.rb +32 -0
  55. data/lib/rexml/sax2listener.rb +97 -0
  56. data/lib/rexml/source.rb +251 -0
  57. data/lib/rexml/streamlistener.rb +92 -0
  58. data/lib/rexml/syncenumerator.rb +33 -0
  59. data/lib/rexml/text.rb +344 -0
  60. data/lib/rexml/undefinednamespaceexception.rb +8 -0
  61. data/lib/rexml/validation/relaxng.rb +559 -0
  62. data/lib/rexml/validation/validation.rb +155 -0
  63. data/lib/rexml/validation/validationexception.rb +9 -0
  64. data/lib/rexml/xmldecl.rb +119 -0
  65. data/lib/rexml/xmltokens.rb +18 -0
  66. data/lib/rexml/xpath.rb +66 -0
  67. data/lib/rexml/xpath_parser.rb +792 -0
  68. data/lib/rubysl/rexml.rb +1 -0
  69. data/lib/rubysl/rexml/version.rb +5 -0
  70. data/rubysl-rexml.gemspec +23 -0
  71. data/spec/attribute/clone_spec.rb +10 -0
  72. data/spec/attribute/element_spec.rb +22 -0
  73. data/spec/attribute/equal_value_spec.rb +17 -0
  74. data/spec/attribute/hash_spec.rb +12 -0
  75. data/spec/attribute/initialize_spec.rb +28 -0
  76. data/spec/attribute/inspect_spec.rb +19 -0
  77. data/spec/attribute/namespace_spec.rb +23 -0
  78. data/spec/attribute/node_type_spec.rb +9 -0
  79. data/spec/attribute/prefix_spec.rb +17 -0
  80. data/spec/attribute/remove_spec.rb +19 -0
  81. data/spec/attribute/to_s_spec.rb +13 -0
  82. data/spec/attribute/to_string_spec.rb +14 -0
  83. data/spec/attribute/value_spec.rb +14 -0
  84. data/spec/attribute/write_spec.rb +22 -0
  85. data/spec/attribute/xpath_spec.rb +19 -0
  86. data/spec/attributes/add_spec.rb +6 -0
  87. data/spec/attributes/append_spec.rb +6 -0
  88. data/spec/attributes/delete_all_spec.rb +30 -0
  89. data/spec/attributes/delete_spec.rb +26 -0
  90. data/spec/attributes/each_attribute_spec.rb +24 -0
  91. data/spec/attributes/each_spec.rb +24 -0
  92. data/spec/attributes/element_reference_spec.rb +18 -0
  93. data/spec/attributes/element_set_spec.rb +25 -0
  94. data/spec/attributes/get_attribute_ns_spec.rb +13 -0
  95. data/spec/attributes/get_attribute_spec.rb +28 -0
  96. data/spec/attributes/initialize_spec.rb +18 -0
  97. data/spec/attributes/length_spec.rb +6 -0
  98. data/spec/attributes/namespaces_spec.rb +5 -0
  99. data/spec/attributes/prefixes_spec.rb +23 -0
  100. data/spec/attributes/shared/add.rb +17 -0
  101. data/spec/attributes/shared/length.rb +12 -0
  102. data/spec/attributes/size_spec.rb +6 -0
  103. data/spec/attributes/to_a_spec.rb +20 -0
  104. data/spec/cdata/clone_spec.rb +9 -0
  105. data/spec/cdata/initialize_spec.rb +24 -0
  106. data/spec/cdata/shared/to_s.rb +11 -0
  107. data/spec/cdata/to_s_spec.rb +6 -0
  108. data/spec/cdata/value_spec.rb +6 -0
  109. data/spec/document/add_element_spec.rb +30 -0
  110. data/spec/document/add_spec.rb +60 -0
  111. data/spec/document/clone_spec.rb +19 -0
  112. data/spec/document/doctype_spec.rb +14 -0
  113. data/spec/document/encoding_spec.rb +21 -0
  114. data/spec/document/expanded_name_spec.rb +15 -0
  115. data/spec/document/new_spec.rb +37 -0
  116. data/spec/document/node_type_spec.rb +7 -0
  117. data/spec/document/root_spec.rb +11 -0
  118. data/spec/document/stand_alone_spec.rb +18 -0
  119. data/spec/document/version_spec.rb +13 -0
  120. data/spec/document/write_spec.rb +38 -0
  121. data/spec/document/xml_decl_spec.rb +14 -0
  122. data/spec/element/add_attribute_spec.rb +40 -0
  123. data/spec/element/add_attributes_spec.rb +21 -0
  124. data/spec/element/add_element_spec.rb +38 -0
  125. data/spec/element/add_namespace_spec.rb +23 -0
  126. data/spec/element/add_text_spec.rb +23 -0
  127. data/spec/element/attribute_spec.rb +16 -0
  128. data/spec/element/attributes_spec.rb +18 -0
  129. data/spec/element/cdatas_spec.rb +23 -0
  130. data/spec/element/clone_spec.rb +28 -0
  131. data/spec/element/comments_spec.rb +20 -0
  132. data/spec/element/delete_attribute_spec.rb +38 -0
  133. data/spec/element/delete_element_spec.rb +50 -0
  134. data/spec/element/delete_namespace_spec.rb +24 -0
  135. data/spec/element/document_spec.rb +17 -0
  136. data/spec/element/each_element_with_attribute_spec.rb +34 -0
  137. data/spec/element/each_element_with_text_spec.rb +30 -0
  138. data/spec/element/get_text_spec.rb +17 -0
  139. data/spec/element/has_attributes_spec.rb +16 -0
  140. data/spec/element/has_elements_spec.rb +17 -0
  141. data/spec/element/has_text_spec.rb +15 -0
  142. data/spec/element/inspect_spec.rb +26 -0
  143. data/spec/element/instructions_spec.rb +20 -0
  144. data/spec/element/namespace_spec.rb +26 -0
  145. data/spec/element/namespaces_spec.rb +31 -0
  146. data/spec/element/new_spec.rb +34 -0
  147. data/spec/element/next_element_spec.rb +18 -0
  148. data/spec/element/node_type_spec.rb +7 -0
  149. data/spec/element/prefixes_spec.rb +22 -0
  150. data/spec/element/previous_element_spec.rb +19 -0
  151. data/spec/element/raw_spec.rb +23 -0
  152. data/spec/element/root_spec.rb +27 -0
  153. data/spec/element/text_spec.rb +45 -0
  154. data/spec/element/texts_spec.rb +15 -0
  155. data/spec/element/whitespace_spec.rb +22 -0
  156. data/spec/node/each_recursive_spec.rb +20 -0
  157. data/spec/node/find_first_recursive_spec.rb +24 -0
  158. data/spec/node/index_in_parent_spec.rb +14 -0
  159. data/spec/node/next_sibling_node_spec.rb +20 -0
  160. data/spec/node/parent_spec.rb +20 -0
  161. data/spec/node/previous_sibling_node_spec.rb +20 -0
  162. data/spec/shared/each_element.rb +35 -0
  163. data/spec/shared/elements_to_a.rb +35 -0
  164. data/spec/text/append_spec.rb +9 -0
  165. data/spec/text/clone_spec.rb +9 -0
  166. data/spec/text/comparison_spec.rb +24 -0
  167. data/spec/text/empty_spec.rb +11 -0
  168. data/spec/text/indent_text_spec.rb +23 -0
  169. data/spec/text/inspect_spec.rb +7 -0
  170. data/spec/text/new_spec.rb +48 -0
  171. data/spec/text/node_type_spec.rb +7 -0
  172. data/spec/text/normalize_spec.rb +7 -0
  173. data/spec/text/read_with_substitution_spec.rb +12 -0
  174. data/spec/text/to_s_spec.rb +17 -0
  175. data/spec/text/unnormalize_spec.rb +7 -0
  176. data/spec/text/value_spec.rb +36 -0
  177. data/spec/text/wrap_spec.rb +20 -0
  178. data/spec/text/write_with_substitution_spec.rb +32 -0
  179. metadata +385 -0
@@ -0,0 +1,51 @@
1
module REXML
  # Error raised for malformed input. Besides the message it can carry the
  # source being read, the parser that was running, and an underlying
  # exception that triggered this one.
  class ParseException < RuntimeError
    attr_accessor :source, :parser, :continued_exception

    # message::   description of the problem
    # source::    optional object being parsed; #to_s reads its +buffer+ and
    #             #position / #line / #context read its +current_line+
    # parser::    optional parser, only stored
    # exception:: optional underlying exception, quoted by #to_s
    def initialize( message, source=nil, parser=nil, exception=nil )
      super(message)
      @source = source
      @parser = parser
      @continued_exception = exception
    end

    # Builds the full report: the quoted underlying exception (if any), this
    # exception's message, then line/position and the head of the source
    # buffer (if a source was given).
    def to_s
      err = "".dup

      # Quote the original exception, if there was one
      if @continued_exception
        err << @continued_exception.inspect
        err << "\n"
        # backtrace is nil for an exception object that was never raised
        err << Array(@continued_exception.backtrace).join("\n")
        err << "\n...\n"
      end

      # Append this exception's own message
      err << super

      # Add contextual information
      if @source
        err << "\nLine: #{line}\n"
        err << "Position: #{position}\n"
        err << "Last 80 unconsumed characters:\n"
        # 0..80 is an inclusive range, so up to 81 characters are shown
        err << @source.buffer[0..80].gsub(/\n/, ' ')
      end

      err
    end

    # First element of the source's +current_line+, or nil when unavailable.
    def position
      info = current_line_info
      info[0] if info
    end

    # Third element of the source's +current_line+, or nil when unavailable.
    def line
      info = current_line_info
      info[2] if info
    end

    # The source's +current_line+ value, or nil when there is no source or
    # the source does not provide one.
    def context
      current_line_info
    end

    private

    # Returns @source.current_line, or nil when there is no source or the
    # source does not respond to +current_line+.
    def current_line_info
      return nil unless @source && @source.respond_to?(:current_line)
      @source.current_line
    end
  end
end
@@ -0,0 +1,503 @@
1
+ require 'rexml/parseexception'
2
+ require 'rexml/undefinednamespaceexception'
3
+ require 'rexml/source'
4
+ require 'set'
5
+
6
+ module REXML
7
+ module Parsers
8
+ # = Using the Pull Parser
9
+ # <em>This API is experimental, and subject to change.</em>
10
+ # parser = PullParser.new( "<a>text<b att='val'/>txet</a>" )
11
+ # while parser.has_next?
12
+ # res = parser.next
13
+ # puts res[1]['att'] if res.start_tag? and res[0] == 'b'
14
+ # end
15
+ # See the PullEvent class for information on the content of the results.
16
+ # The data is identical to the arguments passed for the various events to
17
+ # the StreamListener API.
18
+ #
19
+ # Notice that:
20
+ # parser = PullParser.new( "<a>BAD DOCUMENT" )
21
+ # while parser.has_next?
22
+ # res = parser.next
23
+ # raise res[1] if res.error?
24
+ # end
25
+ #
26
+ # Nat Price gave me some good ideas for the API.
27
+ class BaseParser
28
+ NCNAME_STR= '[\w:][\-\w\d.]*'
29
+ NAME_STR= "(?:(#{NCNAME_STR}):)?(#{NCNAME_STR})"
30
+ UNAME_STR= "(?:#{NCNAME_STR}:)?#{NCNAME_STR}"
31
+
32
+ NAMECHAR = '[\-\w\d\.:]'
33
+ NAME = "([\\w:]#{NAMECHAR}*)"
34
+ NMTOKEN = "(?:#{NAMECHAR})+"
35
+ NMTOKENS = "#{NMTOKEN}(\\s+#{NMTOKEN})*"
36
+ REFERENCE = "(?:&#{NAME};|&#\\d+;|&#x[0-9a-fA-F]+;)"
37
+ REFERENCE_RE = /#{REFERENCE}/
38
+
39
+ DOCTYPE_START = /\A\s*<!DOCTYPE\s/um
40
+ DOCTYPE_PATTERN = /\s*<!DOCTYPE\s+(.*?)(\[|>)/um
41
+ ATTRIBUTE_PATTERN = /\s*(#{NAME_STR})\s*=\s*(["'])(.*?)\4/um
42
+ COMMENT_START = /\A<!--/u
43
+ COMMENT_PATTERN = /<!--(.*?)-->/um
44
+ CDATA_START = /\A<!\[CDATA\[/u
45
+ CDATA_END = /^\s*\]\s*>/um
46
+ CDATA_PATTERN = /<!\[CDATA\[(.*?)\]\]>/um
47
+ XMLDECL_START = /\A<\?xml\s/u;
48
+ XMLDECL_PATTERN = /<\?xml\s+(.*?)\?>/um
49
+ INSTRUCTION_START = /\A<\?/u
50
+ INSTRUCTION_PATTERN = /<\?(.*?)(\s+.*?)?\?>/um
51
+ TAG_MATCH = /^<((?>#{NAME_STR}))\s*((?>\s+#{UNAME_STR}\s*=\s*(["']).*?\5)*)\s*(\/)?>/um
52
+ CLOSE_MATCH = /^\s*<\/(#{NAME_STR})\s*>/um
53
+
54
# Patterns that extract the quoted value of each pseudo-attribute from the
# body of an XML declaration. Whitespace around "=" is optional in all three.
VERSION = /\bversion\s*=\s*["'](.*?)['"]/um
ENCODING = /\bencoding\s*=\s*["'](.*?)['"]/um
# Uses \s* after "=" (was \s, which demanded exactly one whitespace character
# and therefore never matched the usual standalone="yes" spelling).
STANDALONE = /\bstandalone\s*=\s*["'](.*?)['"]/um
57
+
58
+ ENTITY_START = /^\s*<!ENTITY/
59
+ IDENTITY = /^([!\*\w\-]+)(\s+#{NCNAME_STR})?(\s+["'](.*?)['"])?(\s+['"](.*?)["'])?/u
60
+ ELEMENTDECL_START = /^\s*<!ELEMENT/um
61
+ ELEMENTDECL_PATTERN = /^\s*(<!ELEMENT.*?)>/um
62
+ SYSTEMENTITY = /^\s*(%.*?;)\s*$/um
63
+ ENUMERATION = "\\(\\s*#{NMTOKEN}(?:\\s*\\|\\s*#{NMTOKEN})*\\s*\\)"
64
+ NOTATIONTYPE = "NOTATION\\s+\\(\\s*#{NAME}(?:\\s*\\|\\s*#{NAME})*\\s*\\)"
65
+ ENUMERATEDTYPE = "(?:(?:#{NOTATIONTYPE})|(?:#{ENUMERATION}))"
66
+ ATTTYPE = "(CDATA|ID|IDREF|IDREFS|ENTITY|ENTITIES|NMTOKEN|NMTOKENS|#{ENUMERATEDTYPE})"
67
+ ATTVALUE = "(?:\"((?:[^<&\"]|#{REFERENCE})*)\")|(?:'((?:[^<&']|#{REFERENCE})*)')"
68
+ DEFAULTDECL = "(#REQUIRED|#IMPLIED|(?:(#FIXED\\s+)?#{ATTVALUE}))"
69
+ ATTDEF = "\\s+#{NAME}\\s+#{ATTTYPE}\\s+#{DEFAULTDECL}"
70
+ ATTDEF_RE = /#{ATTDEF}/
71
+ ATTLISTDECL_START = /^\s*<!ATTLIST/um
72
+ ATTLISTDECL_PATTERN = /^\s*<!ATTLIST\s+#{NAME}(?:#{ATTDEF})*\s*>/um
73
+ NOTATIONDECL_START = /^\s*<!NOTATION/um
74
+ PUBLIC = /^\s*<!NOTATION\s+(\w[\-\w]*)\s+(PUBLIC)\s+(["'])(.*?)\3(?:\s+(["'])(.*?)\5)?\s*>/um
75
+ SYSTEM = /^\s*<!NOTATION\s+(\w[\-\w]*)\s+(SYSTEM)\s+(["'])(.*?)\3\s*>/um
76
+
77
+ TEXT_PATTERN = /\A([^<]*)/um
78
+
79
+ # Entity constants
80
+ PUBIDCHAR = "\x20\x0D\x0Aa-zA-Z0-9\\-()+,./:=?;!*@$_%#"
81
+ SYSTEMLITERAL = %Q{((?:"[^"]*")|(?:'[^']*'))}
82
+ PUBIDLITERAL = %Q{("[#{PUBIDCHAR}']*"|'[#{PUBIDCHAR}]*')}
83
+ EXTERNALID = "(?:(?:(SYSTEM)\\s+#{SYSTEMLITERAL})|(?:(PUBLIC)\\s+#{PUBIDLITERAL}\\s+#{SYSTEMLITERAL}))"
84
+ NDATADECL = "\\s+NDATA\\s+#{NAME}"
85
+ PEREFERENCE = "%#{NAME};"
86
+ ENTITYVALUE = %Q{((?:"(?:[^%&"]|#{PEREFERENCE}|#{REFERENCE})*")|(?:'([^%&']|#{PEREFERENCE}|#{REFERENCE})*'))}
87
+ PEDEF = "(?:#{ENTITYVALUE}|#{EXTERNALID})"
88
+ ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
89
+ PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
90
+ GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
91
+ ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um
92
+
93
+ EREFERENCE = /&(?!#{NAME};)/
94
+
95
+ DEFAULT_ENTITIES = {
96
+ 'gt' => [/&gt;/, '&gt;', '>', />/],
97
+ 'lt' => [/&lt;/, '&lt;', '<', /</],
98
+ 'quot' => [/&quot;/, '&quot;', '"', /"/],
99
+ "apos" => [/&apos;/, "&apos;", "'", /'/]
100
+ }
101
+
102
+
103
+ ######################################################################
104
+ # These are patterns to identify common markup errors, to make the
105
+ # error messages more informative.
106
+ ######################################################################
107
+ MISSING_ATTRIBUTE_QUOTES = /^<#{NAME_STR}\s+#{NAME_STR}\s*=\s*[^"']/um
108
+
109
# Creates a parser for +source+. All state setup is delegated to #stream=.
def initialize(source)
  self.stream = source
end
112
+
113
# Registers +listener+. On the first registration, #pull is wrapped on this
# instance so that every event it returns is also handed to each listener's
# +receive+ method before being returned to the caller.
def add_listener( listener )
  unless defined?(@listeners) && @listeners
    @listeners = []
    # Evaluated against this instance only: the original #pull is kept as
    # _old_pull and a forwarding #pull is defined in its place.
    instance_eval <<-EOL
      alias :_old_pull :pull
      def pull
        event = _old_pull
        @listeners.each do |listener|
          listener.receive event
        end
        event
      end
    EOL
  end
  @listeners << listener
end
129
+
130
+ attr_reader :source
131
+
132
# Points the parser at a new +source+ (wrapped via SourceFactory.create_from)
# and resets all per-document parsing state.
def stream=( source )
  @source = SourceFactory.create_from( source )
  @closed = @document_status = nil
  @tags, @stack, @entities, @nsstack = [], [], [], []
end
141
+
142
# Returns the source's +position+ when the source provides one, else 0.
def position
  # FIXME
  return 0 unless @source.respond_to?(:position)
  @source.position
end
150
+
151
# True when the source is exhausted and no events are queued on the stack.
def empty?
  @source.empty? && @stack.empty?
end
155
+
156
# True while the source still has input or events remain queued on the
# stack; the logical negation of #empty?.
def has_next?
  !@source.empty? || !@stack.empty?
end
160
+
161
# Puts +token+ back at the front of the pending-event stack so that it is
# the next event returned. Any number of events may be pushed back.
def unshift token
  @stack.insert(0, token)
end
166
+
167
# Looks ahead at the event +depth+ positions from the head of the stack
# (0 is the next event) without removing it. A +depth+ of -1 pulls events
# until the parser is empty and returns the last one. Every event pulled to
# satisfy the request is kept on the stack, so a deep peek buffers that much
# of the document in memory. Raises for any +depth+ below -1.
def peek depth=0
  raise %Q[Illegal argument "#{depth}"] if depth < -1
  pulled = []
  if depth == -1
    pulled << pull until empty?
  else
    needed = depth + 1
    pulled << pull while @stack.size + pulled.size < needed
  end
  @stack += pulled unless pulled.empty?
  @stack[depth]
end
186
+
187
+ # Returns the next event. This is a +PullEvent+ object.
188
+ def pull
# Produces the next parse event as an Array whose first element is the event
# type Symbol (:start_element, :end_element, :text, :comment, :cdata,
# :processing_instruction, :xmldecl, :start_doctype, :end_doctype,
# :entitydecl, :attlistdecl, :notationdecl, :elementdecl, :externalentity,
# :end_document) followed by that event's data.
# Raises REXML::ParseException (or UndefinedNamespaceException) on bad input;
# any other exception raised while handling element content is wrapped in a
# ParseException.
# A self-closing tag set @closed on the previous call: emit its :end_element now.
189
+ if @closed
190
+ x, @closed = @closed, nil
191
+ return [ :end_element, x ]
192
+ end
193
+ return [ :end_document ] if empty?
# Events pushed back (or queued by an earlier branch) are served first.
194
+ return @stack.shift if @stack.size > 0
195
+ #STDERR.puts @source.encoding
196
+ @source.read if @source.buffer.size<2
197
+ #STDERR.puts "BUFFER = #{@source.buffer.inspect}"
# Prolog handling: @document_status stays nil until a DOCTYPE or the first
# non-prolog construct is seen.
198
+ if @document_status == nil
199
+ #@source.consume( /^\s*/um )
200
+ word = @source.match( /^((?:\s+)|(?:<[^>]*>))/um )
201
+ word = word[1] unless word.nil?
202
+ #STDERR.puts "WORD = #{word.inspect}"
203
+ case word
204
+ when COMMENT_START
205
+ return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ]
206
+ when XMLDECL_START
207
+ #STDERR.puts "XMLDECL"
208
+ results = @source.match( XMLDECL_PATTERN, true )[1]
209
+ version = VERSION.match( results )
210
+ version = version[1] unless version.nil?
211
+ encoding = ENCODING.match(results)
212
+ encoding = encoding[1] unless encoding.nil?
213
+ @source.encoding = encoding
214
+ standalone = STANDALONE.match(results)
215
+ standalone = standalone[1] unless standalone.nil?
216
+ return [ :xmldecl, version, encoding, standalone ]
217
+ when INSTRUCTION_START
218
+ return [ :processing_instruction, *@source.match(INSTRUCTION_PATTERN, true)[1,2] ]
219
+ when DOCTYPE_START
220
+ md = @source.match( DOCTYPE_PATTERN, true )
221
+ @nsstack.unshift(curr_ns=Set.new)
222
+ identity = md[1]
223
+ close = md[2]
224
+ identity =~ IDENTITY
225
+ name = $1
226
+ raise REXML::ParseException.new("DOCTYPE is missing a name") if name.nil?
227
+ pub_sys = $2.nil? ? nil : $2.strip
228
+ long_name = $4.nil? ? nil : $4.strip
229
+ uri = $6.nil? ? nil : $6.strip
230
+ args = [ :start_doctype, name, pub_sys, long_name, uri ]
# DOCTYPE_PATTERN captures either ">" (no internal subset) or "[" (subset follows).
231
+ if close == ">"
232
+ @document_status = :after_doctype
233
+ @source.read if @source.buffer.size<2
234
+ md = @source.match(/^\s*/um, true)
# Queue the matching :end_doctype so it is returned by the next call.
235
+ @stack << [ :end_doctype ]
236
+ else
237
+ @document_status = :in_doctype
238
+ end
239
+ return args
# Leading whitespace: nothing is returned here; control falls through to the
# content handling below.
240
+ when /^\s+/
241
+ else
242
+ @document_status = :after_doctype
243
+ @source.read if @source.buffer.size<2
244
+ md = @source.match(/\s*/um, true)
245
+ end
246
+ end
# Inside the DOCTYPE internal subset: dispatch on the next declaration.
247
+ if @document_status == :in_doctype
248
+ md = @source.match(/\s*(.*?>)/um)
249
+ case md[1]
250
+ when SYSTEMENTITY
251
+ match = @source.match( SYSTEMENTITY, true )[1]
252
+ return [ :externalentity, match ]
253
+
254
+ when ELEMENTDECL_START
255
+ return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ]
256
+
257
+ when ENTITY_START
# compact drops the unmatched capture groups; slot 0 (the whole match) is
# then overwritten with the event type.
258
+ match = @source.match( ENTITYDECL, true ).to_a.compact
259
+ match[0] = :entitydecl
260
+ ref = false
261
+ if match[1] == '%'
262
+ ref = true
263
+ match.delete_at 1
264
+ end
265
+ # Now we have to sort out what kind of entity reference this is
266
+ if match[2] == 'SYSTEM'
267
+ # External reference
268
+ match[3] = match[3][1..-2] # PUBID
269
+ match.delete_at(4) if match.size > 4 # Chop out NDATA decl
270
+ # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
271
+ elsif match[2] == 'PUBLIC'
272
+ # External reference
273
+ match[3] = match[3][1..-2] # PUBID
274
+ match[4] = match[4][1..-2] # HREF
275
+ # match is [ :entity, name, PUBLIC, pubid, href ]
276
+ else
277
+ match[2] = match[2][1..-2]
278
+ match.pop if match.size == 4
279
+ # match is [ :entity, name, value ]
280
+ end
281
+ match << '%' if ref
282
+ return match
283
+ when ATTLISTDECL_START
284
+ md = @source.match( ATTLISTDECL_PATTERN, true )
285
+ raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
286
+ element = md[1]
287
+ contents = md[0]
288
+
289
+ pairs = {}
290
+ values = md[0].scan( ATTDEF_RE )
291
+ values.each do |attdef|
292
+ unless attdef[3] == "#IMPLIED"
293
+ attdef.compact!
294
+ val = attdef[3]
295
+ val = attdef[4] if val == "#FIXED "
296
+ pairs[attdef[0]] = val
# A defaulted xmlns:prefix attribute declares that prefix for the document.
297
+ if attdef[0] =~ /^xmlns:(.*)/
298
+ @nsstack[0] << $1
299
+ end
300
+ end
301
+ end
302
+ return [ :attlistdecl, element, pairs, contents ]
303
+ when NOTATIONDECL_START
304
+ md = nil
305
+ if @source.match( PUBLIC )
306
+ md = @source.match( PUBLIC, true )
307
+ vals = [md[1],md[2],md[4],md[6]]
308
+ elsif @source.match( SYSTEM )
309
+ md = @source.match( SYSTEM, true )
310
+ vals = [md[1],md[2],nil,md[4]]
311
+ else
312
+ raise REXML::ParseException.new( "error parsing notation: no matching pattern", @source )
313
+ end
314
+ return [ :notationdecl, *vals ]
# CDATA_END ("]" then ">") is reused here to recognise the end of the
# internal subset.
315
+ when CDATA_END
316
+ @document_status = :after_doctype
317
+ @source.match( CDATA_END, true )
318
+ return [ :end_doctype ]
319
+ end
320
+ end
# Document content: tags, comments, CDATA, processing instructions and text.
321
+ begin
322
+ if @source.buffer[0] == ?<
323
+ if @source.buffer[1] == ?/
324
+ @nsstack.shift
325
+ last_tag = @tags.pop
326
+ #md = @source.match_to_consume( '>', CLOSE_MATCH)
327
+ md = @source.match( CLOSE_MATCH, true )
328
+ raise REXML::ParseException.new( "Missing end tag for "+
329
+ "'#{last_tag}' (got \"#{md[1]}\")",
330
+ @source) unless last_tag == md[1]
331
+ return [ :end_element, last_tag ]
332
+ elsif @source.buffer[1] == ?!
333
+ md = @source.match(/\A(\s*[^>]*>)/um)
334
+ #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
335
+ raise REXML::ParseException.new("Malformed node", @source) unless md
# The third character distinguishes "<!-" (comment) from other "<!" markup.
336
+ if md[0][2] == ?-
337
+ md = @source.match( COMMENT_PATTERN, true )
338
+ return [ :comment, md[1] ] if md
339
+ else
340
+ md = @source.match( CDATA_PATTERN, true )
341
+ return [ :cdata, md[1] ] if md
342
+ end
343
+ raise REXML::ParseException.new( "Declarations can only occur "+
344
+ "in the doctype declaration.", @source)
345
+ elsif @source.buffer[1] == ??
346
+ md = @source.match( INSTRUCTION_PATTERN, true )
347
+ return [ :processing_instruction, md[1], md[2] ] if md
348
+ raise REXML::ParseException.new( "Bad instruction declaration",
349
+ @source)
350
+ else
351
+ # Get the next tag
352
+ md = @source.match(TAG_MATCH, true)
353
+ unless md
354
+ # Check for missing attribute quotes
355
+ raise REXML::ParseException.new("missing attribute quote", @source) if @source.match(MISSING_ATTRIBUTE_QUOTES )
356
+ raise REXML::ParseException.new("malformed XML: missing tag start", @source)
357
+ end
358
+ attributes = {}
# prefixes collects every namespace prefix used by this tag; curr_ns collects
# the prefixes this tag itself declares.
359
+ prefixes = Set.new
360
+ prefixes << md[2] if md[2]
361
+ @nsstack.unshift(curr_ns=Set.new)
362
+ if md[4].size > 0
363
+ attrs = md[4].scan( ATTRIBUTE_PATTERN )
364
+ raise REXML::ParseException.new( "error parsing attributes: [#{attrs.join ', '}], excess = \"#$'\"", @source) if $' and $'.strip.size > 0
# Per ATTRIBUTE_PATTERN: a = qualified name, b = prefix, c = local name,
# d = quote character, e = value.
365
+ attrs.each { |a,b,c,d,e|
366
+ if b == "xmlns"
367
+ if c == "xml"
# NOTE(review): d is the quote-character capture of ATTRIBUTE_PATTERN, not
# the attribute value (e) — this comparison looks like it was meant to use
# e; confirm.
368
+ if d != "http://www.w3.org/XML/1998/namespace"
369
+ msg = "The 'xml' prefix must not be bound to any other namespace "+
370
+ "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
371
+ raise REXML::ParseException.new( msg, @source, self )
372
+ end
373
+ elsif c == "xmlns"
374
+ msg = "The 'xmlns' prefix must not be declared "+
375
+ "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
376
+ raise REXML::ParseException.new( msg, @source, self)
377
+ end
378
+ curr_ns << c
379
+ elsif b
380
+ prefixes << b unless b == "xml"
381
+ end
382
+ attributes[a] = e
383
+ }
384
+ end
385
+
386
+ # Verify that all of the prefixes have been defined
387
+ for prefix in prefixes
388
+ unless @nsstack.find{|k| k.member?(prefix)}
389
+ raise UndefinedNamespaceException.new(prefix,@source,self)
390
+ end
391
+ end
392
+
# md[6] is the "/" of a self-closing tag: remember the name so the next call
# emits :end_element, and drop this tag's namespace scope immediately.
393
+ if md[6]
394
+ @closed = md[1]
395
+ @nsstack.shift
396
+ else
397
+ @tags.push( md[1] )
398
+ end
399
+ return [ :start_element, md[1], attributes ]
400
+ end
401
+ else
402
+ md = @source.match( TEXT_PATTERN, true )
403
+ if md[0].length == 0
404
+ @source.match( /(\s+)/, true )
405
+ end
406
+ #STDERR.puts "GOT #{md[1].inspect}" unless md[0].length == 0
407
+ #return [ :text, "" ] if md[0].length == 0
408
+ # unnormalized = Text::unnormalize( md[1], self )
409
+ # return PullEvent.new( :text, md[1], unnormalized )
410
+ return [ :text, md[1] ]
411
+ end
412
+ rescue REXML::UndefinedNamespaceException
413
+ raise
414
+ rescue REXML::ParseException
415
+ raise
# Anything else is wrapped so callers only see ParseException.
# NOTE(review): rescuing Exception also captures non-StandardError
# exceptions; NameError is already covered by it.
416
+ rescue Exception, NameError => error
417
+ raise REXML::ParseException.new( "Exception parsing",
418
+ @source, self, (error ? error : $!) )
419
+ end
# NOTE(review): every branch of the begin block above appears to return or
# raise, so this fallback looks unreachable — confirm.
420
+ return [ :dummy ]
421
+ end
422
+
423
# Resolves the entity named +reference+: looks it up in +entities+ first,
# then in DEFAULT_ENTITIES (using the literal replacement stored at index 2),
# and returns the unnormalized value, or nil when the name is unknown.
def entity( reference, entities )
  value = entities && entities[ reference ]
  unless value
    default = DEFAULT_ENTITIES[ reference ]
    value = default[2] if default
  end
  unnormalize( value, entities ) if value
end
432
+
433
# Escapes +input+ for output and returns the escaped copy (+input+ itself is
# not modified).
#
# input::         the string to escape
# entities::      optional Hash of entity name => value; every occurrence of
#                 a value is replaced by "&name;"
# entity_filter:: optional collection of entity names to leave unreplaced
#
# Bare "&" characters become "&amp;", and the characters covered by
# DEFAULT_ENTITIES (> < " ') become their entity references.
def normalize( input, entities=nil, entity_filter=nil )
  # dup rather than clone: clone keeps the frozen flag, which would make the
  # in-place substitutions below fail on frozen input.
  copy = input.dup
  # Doing it like this rather than in a loop improves the speed
  copy.gsub!( EREFERENCE, '&amp;' )
  if entities
    entities.each do |key, value|
      # The filter lists entity names, so it is checked against the key.
      next if entity_filter and entity_filter.include?(key)
      copy.gsub!( value, "&#{key};" )
    end
  end
  copy.gsub!( EREFERENCE, '&amp;' )
  DEFAULT_ENTITIES.each do |key, value|
    copy.gsub!( value[3], value[1] )
  end
  copy
end
448
+
449
# Expands entity and character references in +string+ and returns the
# expanded copy (+string+ itself is not modified).
#
# string::   the text to expand
# entities:: optional Hash of entity name => value, consulted (via #entity)
#            before DEFAULT_ENTITIES
# filter::   optional collection of entity names that must be left as-is
#
# CR LF and lone CR are converted to LF first. Numeric references (decimal
# and hexadecimal) are replaced by the corresponding character; "&amp;" is
# expanded last.
def unnormalize( string, entities=nil, filter=nil )
  # dup rather than clone: clone keeps the frozen flag, which would make the
  # in-place substitutions below fail on frozen input.
  rv = string.dup
  rv.gsub!( /\r\n?/, "\n" )
  matches = rv.scan( REFERENCE_RE )
  return rv if matches.empty?

  # Numeric character references: "x41" is rewritten to "0x41" so that
  # Integer() parses it as hexadecimal.
  rv.gsub!( /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) {
    code = $1
    code = "0#{code}" if code.start_with?("x")
    [Integer(code)].pack('U*')
  }

  # Numeric references produce a nil name capture; keep named ones only.
  names = matches.map { |m| m[0] }.compact
  unless names.empty?
    names.each do |name|
      next if filter and filter.include?(name)
      value = entity( name, entities )
      next unless value
      # Escape the name (it may contain ".") and use the block form so the
      # value is inserted literally, without back-reference interpretation.
      rv.gsub!( /&#{Regexp.escape(name)};/ ) { value }
    end
    names.each do |name|
      next if filter and filter.include?(name)
      er = DEFAULT_ENTITIES[name]
      rv.gsub!( er[0], er[2] ) if er
    end
    rv.gsub!( /&amp;/, '&' )
  end
  rv
end
481
+ end
482
+ end
483
+ end
484
+
485
+ =begin
486
+ case event[0]
487
+ when :start_element
488
+ when :text
489
+ when :end_element
490
+ when :processing_instruction
491
+ when :cdata
492
+ when :comment
493
+ when :xmldecl
494
+ when :start_doctype
495
+ when :end_doctype
496
+ when :externalentity
497
+ when :elementdecl
498
+ when :entity
499
+ when :attlistdecl
500
+ when :notationdecl
501
+ when :end_doctype
502
+ end
503
+ =end