rubysl-rexml 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (179) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +17 -0
  3. data/.travis.yml +8 -0
  4. data/Gemfile +4 -0
  5. data/LICENSE +25 -0
  6. data/README.md +29 -0
  7. data/Rakefile +1 -0
  8. data/lib/rexml/attlistdecl.rb +62 -0
  9. data/lib/rexml/attribute.rb +185 -0
  10. data/lib/rexml/cdata.rb +67 -0
  11. data/lib/rexml/child.rb +96 -0
  12. data/lib/rexml/comment.rb +80 -0
  13. data/lib/rexml/doctype.rb +271 -0
  14. data/lib/rexml/document.rb +230 -0
  15. data/lib/rexml/dtd/attlistdecl.rb +10 -0
  16. data/lib/rexml/dtd/dtd.rb +51 -0
  17. data/lib/rexml/dtd/elementdecl.rb +17 -0
  18. data/lib/rexml/dtd/entitydecl.rb +56 -0
  19. data/lib/rexml/dtd/notationdecl.rb +39 -0
  20. data/lib/rexml/element.rb +1227 -0
  21. data/lib/rexml/encoding.rb +71 -0
  22. data/lib/rexml/encodings/CP-1252.rb +103 -0
  23. data/lib/rexml/encodings/EUC-JP.rb +35 -0
  24. data/lib/rexml/encodings/ICONV.rb +22 -0
  25. data/lib/rexml/encodings/ISO-8859-1.rb +7 -0
  26. data/lib/rexml/encodings/ISO-8859-15.rb +72 -0
  27. data/lib/rexml/encodings/SHIFT-JIS.rb +37 -0
  28. data/lib/rexml/encodings/SHIFT_JIS.rb +1 -0
  29. data/lib/rexml/encodings/UNILE.rb +34 -0
  30. data/lib/rexml/encodings/US-ASCII.rb +30 -0
  31. data/lib/rexml/encodings/UTF-16.rb +35 -0
  32. data/lib/rexml/encodings/UTF-8.rb +18 -0
  33. data/lib/rexml/entity.rb +166 -0
  34. data/lib/rexml/formatters/default.rb +109 -0
  35. data/lib/rexml/formatters/pretty.rb +138 -0
  36. data/lib/rexml/formatters/transitive.rb +56 -0
  37. data/lib/rexml/functions.rb +382 -0
  38. data/lib/rexml/instruction.rb +70 -0
  39. data/lib/rexml/light/node.rb +196 -0
  40. data/lib/rexml/namespace.rb +47 -0
  41. data/lib/rexml/node.rb +75 -0
  42. data/lib/rexml/output.rb +24 -0
  43. data/lib/rexml/parent.rb +166 -0
  44. data/lib/rexml/parseexception.rb +51 -0
  45. data/lib/rexml/parsers/baseparser.rb +503 -0
  46. data/lib/rexml/parsers/lightparser.rb +60 -0
  47. data/lib/rexml/parsers/pullparser.rb +196 -0
  48. data/lib/rexml/parsers/sax2parser.rb +238 -0
  49. data/lib/rexml/parsers/streamparser.rb +46 -0
  50. data/lib/rexml/parsers/treeparser.rb +97 -0
  51. data/lib/rexml/parsers/ultralightparser.rb +56 -0
  52. data/lib/rexml/parsers/xpathparser.rb +698 -0
  53. data/lib/rexml/quickpath.rb +266 -0
  54. data/lib/rexml/rexml.rb +32 -0
  55. data/lib/rexml/sax2listener.rb +97 -0
  56. data/lib/rexml/source.rb +251 -0
  57. data/lib/rexml/streamlistener.rb +92 -0
  58. data/lib/rexml/syncenumerator.rb +33 -0
  59. data/lib/rexml/text.rb +344 -0
  60. data/lib/rexml/undefinednamespaceexception.rb +8 -0
  61. data/lib/rexml/validation/relaxng.rb +559 -0
  62. data/lib/rexml/validation/validation.rb +155 -0
  63. data/lib/rexml/validation/validationexception.rb +9 -0
  64. data/lib/rexml/xmldecl.rb +119 -0
  65. data/lib/rexml/xmltokens.rb +18 -0
  66. data/lib/rexml/xpath.rb +66 -0
  67. data/lib/rexml/xpath_parser.rb +792 -0
  68. data/lib/rubysl/rexml.rb +1 -0
  69. data/lib/rubysl/rexml/version.rb +5 -0
  70. data/rubysl-rexml.gemspec +23 -0
  71. data/spec/attribute/clone_spec.rb +10 -0
  72. data/spec/attribute/element_spec.rb +22 -0
  73. data/spec/attribute/equal_value_spec.rb +17 -0
  74. data/spec/attribute/hash_spec.rb +12 -0
  75. data/spec/attribute/initialize_spec.rb +28 -0
  76. data/spec/attribute/inspect_spec.rb +19 -0
  77. data/spec/attribute/namespace_spec.rb +23 -0
  78. data/spec/attribute/node_type_spec.rb +9 -0
  79. data/spec/attribute/prefix_spec.rb +17 -0
  80. data/spec/attribute/remove_spec.rb +19 -0
  81. data/spec/attribute/to_s_spec.rb +13 -0
  82. data/spec/attribute/to_string_spec.rb +14 -0
  83. data/spec/attribute/value_spec.rb +14 -0
  84. data/spec/attribute/write_spec.rb +22 -0
  85. data/spec/attribute/xpath_spec.rb +19 -0
  86. data/spec/attributes/add_spec.rb +6 -0
  87. data/spec/attributes/append_spec.rb +6 -0
  88. data/spec/attributes/delete_all_spec.rb +30 -0
  89. data/spec/attributes/delete_spec.rb +26 -0
  90. data/spec/attributes/each_attribute_spec.rb +24 -0
  91. data/spec/attributes/each_spec.rb +24 -0
  92. data/spec/attributes/element_reference_spec.rb +18 -0
  93. data/spec/attributes/element_set_spec.rb +25 -0
  94. data/spec/attributes/get_attribute_ns_spec.rb +13 -0
  95. data/spec/attributes/get_attribute_spec.rb +28 -0
  96. data/spec/attributes/initialize_spec.rb +18 -0
  97. data/spec/attributes/length_spec.rb +6 -0
  98. data/spec/attributes/namespaces_spec.rb +5 -0
  99. data/spec/attributes/prefixes_spec.rb +23 -0
  100. data/spec/attributes/shared/add.rb +17 -0
  101. data/spec/attributes/shared/length.rb +12 -0
  102. data/spec/attributes/size_spec.rb +6 -0
  103. data/spec/attributes/to_a_spec.rb +20 -0
  104. data/spec/cdata/clone_spec.rb +9 -0
  105. data/spec/cdata/initialize_spec.rb +24 -0
  106. data/spec/cdata/shared/to_s.rb +11 -0
  107. data/spec/cdata/to_s_spec.rb +6 -0
  108. data/spec/cdata/value_spec.rb +6 -0
  109. data/spec/document/add_element_spec.rb +30 -0
  110. data/spec/document/add_spec.rb +60 -0
  111. data/spec/document/clone_spec.rb +19 -0
  112. data/spec/document/doctype_spec.rb +14 -0
  113. data/spec/document/encoding_spec.rb +21 -0
  114. data/spec/document/expanded_name_spec.rb +15 -0
  115. data/spec/document/new_spec.rb +37 -0
  116. data/spec/document/node_type_spec.rb +7 -0
  117. data/spec/document/root_spec.rb +11 -0
  118. data/spec/document/stand_alone_spec.rb +18 -0
  119. data/spec/document/version_spec.rb +13 -0
  120. data/spec/document/write_spec.rb +38 -0
  121. data/spec/document/xml_decl_spec.rb +14 -0
  122. data/spec/element/add_attribute_spec.rb +40 -0
  123. data/spec/element/add_attributes_spec.rb +21 -0
  124. data/spec/element/add_element_spec.rb +38 -0
  125. data/spec/element/add_namespace_spec.rb +23 -0
  126. data/spec/element/add_text_spec.rb +23 -0
  127. data/spec/element/attribute_spec.rb +16 -0
  128. data/spec/element/attributes_spec.rb +18 -0
  129. data/spec/element/cdatas_spec.rb +23 -0
  130. data/spec/element/clone_spec.rb +28 -0
  131. data/spec/element/comments_spec.rb +20 -0
  132. data/spec/element/delete_attribute_spec.rb +38 -0
  133. data/spec/element/delete_element_spec.rb +50 -0
  134. data/spec/element/delete_namespace_spec.rb +24 -0
  135. data/spec/element/document_spec.rb +17 -0
  136. data/spec/element/each_element_with_attribute_spec.rb +34 -0
  137. data/spec/element/each_element_with_text_spec.rb +30 -0
  138. data/spec/element/get_text_spec.rb +17 -0
  139. data/spec/element/has_attributes_spec.rb +16 -0
  140. data/spec/element/has_elements_spec.rb +17 -0
  141. data/spec/element/has_text_spec.rb +15 -0
  142. data/spec/element/inspect_spec.rb +26 -0
  143. data/spec/element/instructions_spec.rb +20 -0
  144. data/spec/element/namespace_spec.rb +26 -0
  145. data/spec/element/namespaces_spec.rb +31 -0
  146. data/spec/element/new_spec.rb +34 -0
  147. data/spec/element/next_element_spec.rb +18 -0
  148. data/spec/element/node_type_spec.rb +7 -0
  149. data/spec/element/prefixes_spec.rb +22 -0
  150. data/spec/element/previous_element_spec.rb +19 -0
  151. data/spec/element/raw_spec.rb +23 -0
  152. data/spec/element/root_spec.rb +27 -0
  153. data/spec/element/text_spec.rb +45 -0
  154. data/spec/element/texts_spec.rb +15 -0
  155. data/spec/element/whitespace_spec.rb +22 -0
  156. data/spec/node/each_recursive_spec.rb +20 -0
  157. data/spec/node/find_first_recursive_spec.rb +24 -0
  158. data/spec/node/index_in_parent_spec.rb +14 -0
  159. data/spec/node/next_sibling_node_spec.rb +20 -0
  160. data/spec/node/parent_spec.rb +20 -0
  161. data/spec/node/previous_sibling_node_spec.rb +20 -0
  162. data/spec/shared/each_element.rb +35 -0
  163. data/spec/shared/elements_to_a.rb +35 -0
  164. data/spec/text/append_spec.rb +9 -0
  165. data/spec/text/clone_spec.rb +9 -0
  166. data/spec/text/comparison_spec.rb +24 -0
  167. data/spec/text/empty_spec.rb +11 -0
  168. data/spec/text/indent_text_spec.rb +23 -0
  169. data/spec/text/inspect_spec.rb +7 -0
  170. data/spec/text/new_spec.rb +48 -0
  171. data/spec/text/node_type_spec.rb +7 -0
  172. data/spec/text/normalize_spec.rb +7 -0
  173. data/spec/text/read_with_substitution_spec.rb +12 -0
  174. data/spec/text/to_s_spec.rb +17 -0
  175. data/spec/text/unnormalize_spec.rb +7 -0
  176. data/spec/text/value_spec.rb +36 -0
  177. data/spec/text/wrap_spec.rb +20 -0
  178. data/spec/text/write_with_substitution_spec.rb +32 -0
  179. metadata +385 -0
@@ -0,0 +1,51 @@
1
module REXML
  # Error raised for input that cannot be parsed.
  #
  # Besides the message it can carry the source being read, the parser that
  # raised it, and an earlier exception that is being re-reported
  # (+continued_exception+). #to_s folds all of that into one report.
  class ParseException < RuntimeError
    attr_accessor :source, :parser, :continued_exception

    # message::   text describing the failure
    # source::    object being parsed; #to_s reads its +buffer+ and
    #             +current_line+ (optional)
    # parser::    parser that raised the error (optional, only stored)
    # exception:: underlying exception being wrapped (optional)
    def initialize( message, source=nil, parser=nil, exception=nil )
      super(message)
      @source = source
      @parser = parser
      @continued_exception = exception
    end

    # Builds the full report: the wrapped exception (if any) with its
    # backtrace, then this exception's message, then line/position details
    # and the leading slice (+[0..80]+) of the source buffer with newlines
    # replaced by spaces.
    def to_s
      err = "".dup

      # Quote the original exception, if there was one
      if @continued_exception
        err << @continued_exception.inspect
        err << "\n"
        # An exception that was constructed but never raised has no backtrace.
        trace = @continued_exception.backtrace
        err << trace.join("\n") if trace
        err << "\n...\n"
      end

      # Get the error message
      err << super

      # Add contextual information
      if @source
        err << "\nLine: #{line}\n"
        err << "Position: #{position}\n"
        err << "Last 80 unconsumed characters:\n"
        err << @source.buffer[0..80].gsub(/\n/, ' ')
      end

      err
    end

    # First element of the source's +current_line+ result, or nil when there
    # is no source or the source does not provide +current_line+.
    def position
      info = current_line_info
      info[0] if info
    end

    # Third element of the source's +current_line+ result, or nil when there
    # is no source or the source does not provide +current_line+.
    def line
      info = current_line_info
      info[2] if info
    end

    # The raw +current_line+ result of the source, or nil without a source.
    def context
      @source.current_line if @source
    end

    private

    # Fetches +current_line+ from the source when one is attached and it
    # responds to that message; nil otherwise.
    def current_line_info
      return nil unless @source && @source.respond_to?(:current_line)
      @source.current_line
    end
  end
end
@@ -0,0 +1,503 @@
1
+ require 'rexml/parseexception'
2
+ require 'rexml/undefinednamespaceexception'
3
+ require 'rexml/source'
4
+ require 'set'
5
+
6
+ module REXML
7
+ module Parsers
8
+ # = Using the Pull Parser
9
+ # <em>This API is experimental, and subject to change.</em>
10
+ # parser = PullParser.new( "<a>text<b att='val'/>txet</a>" )
11
+ # while parser.has_next?
12
+ # res = parser.next
13
+ # puts res[1]['att'] if res.start_tag? and res[0] == 'b'
14
+ # end
15
+ # See the PullEvent class for information on the content of the results.
16
+ # The data is identical to the arguments passed for the various events to
17
+ # the StreamListener API.
18
+ #
19
+ # Notice that:
20
+ # parser = PullParser.new( "<a>BAD DOCUMENT" )
21
+ # while parser.has_next?
22
+ # res = parser.next
23
+ # raise res[1] if res.error?
24
+ # end
25
+ #
26
+ # Nat Price gave me some good ideas for the API.
27
+ class BaseParser
28
+ NCNAME_STR= '[\w:][\-\w\d.]*'
29
+ NAME_STR= "(?:(#{NCNAME_STR}):)?(#{NCNAME_STR})"
30
+ UNAME_STR= "(?:#{NCNAME_STR}:)?#{NCNAME_STR}"
31
+
32
+ NAMECHAR = '[\-\w\d\.:]'
33
+ NAME = "([\\w:]#{NAMECHAR}*)"
34
+ NMTOKEN = "(?:#{NAMECHAR})+"
35
+ NMTOKENS = "#{NMTOKEN}(\\s+#{NMTOKEN})*"
36
+ REFERENCE = "(?:&#{NAME};|&#\\d+;|&#x[0-9a-fA-F]+;)"
37
+ REFERENCE_RE = /#{REFERENCE}/
38
+
39
+ DOCTYPE_START = /\A\s*<!DOCTYPE\s/um
40
+ DOCTYPE_PATTERN = /\s*<!DOCTYPE\s+(.*?)(\[|>)/um
41
+ ATTRIBUTE_PATTERN = /\s*(#{NAME_STR})\s*=\s*(["'])(.*?)\4/um
42
+ COMMENT_START = /\A<!--/u
43
+ COMMENT_PATTERN = /<!--(.*?)-->/um
44
+ CDATA_START = /\A<!\[CDATA\[/u
45
+ CDATA_END = /^\s*\]\s*>/um
46
+ CDATA_PATTERN = /<!\[CDATA\[(.*?)\]\]>/um
47
+ XMLDECL_START = /\A<\?xml\s/u;
48
+ XMLDECL_PATTERN = /<\?xml\s+(.*?)\?>/um
49
+ INSTRUCTION_START = /\A<\?/u
50
+ INSTRUCTION_PATTERN = /<\?(.*?)(\s+.*?)?\?>/um
51
+ TAG_MATCH = /^<((?>#{NAME_STR}))\s*((?>\s+#{UNAME_STR}\s*=\s*(["']).*?\5)*)\s*(\/)?>/um
52
+ CLOSE_MATCH = /^\s*<\/(#{NAME_STR})\s*>/um
53
+
54
# Patterns that extract the quoted value of each pseudo-attribute from the
# body of an XML declaration. Whitespace on either side of '=' is optional
# for all three.
VERSION = /\bversion\s*=\s*["'](.*?)['"]/um
ENCODING = /\bencoding\s*=\s*["'](.*?)['"]/um
STANDALONE = /\bstandalone\s*=\s*["'](.*?)['"]/um
57
+
58
+ ENTITY_START = /^\s*<!ENTITY/
59
+ IDENTITY = /^([!\*\w\-]+)(\s+#{NCNAME_STR})?(\s+["'](.*?)['"])?(\s+['"](.*?)["'])?/u
60
+ ELEMENTDECL_START = /^\s*<!ELEMENT/um
61
+ ELEMENTDECL_PATTERN = /^\s*(<!ELEMENT.*?)>/um
62
+ SYSTEMENTITY = /^\s*(%.*?;)\s*$/um
63
+ ENUMERATION = "\\(\\s*#{NMTOKEN}(?:\\s*\\|\\s*#{NMTOKEN})*\\s*\\)"
64
+ NOTATIONTYPE = "NOTATION\\s+\\(\\s*#{NAME}(?:\\s*\\|\\s*#{NAME})*\\s*\\)"
65
+ ENUMERATEDTYPE = "(?:(?:#{NOTATIONTYPE})|(?:#{ENUMERATION}))"
66
+ ATTTYPE = "(CDATA|ID|IDREF|IDREFS|ENTITY|ENTITIES|NMTOKEN|NMTOKENS|#{ENUMERATEDTYPE})"
67
+ ATTVALUE = "(?:\"((?:[^<&\"]|#{REFERENCE})*)\")|(?:'((?:[^<&']|#{REFERENCE})*)')"
68
+ DEFAULTDECL = "(#REQUIRED|#IMPLIED|(?:(#FIXED\\s+)?#{ATTVALUE}))"
69
+ ATTDEF = "\\s+#{NAME}\\s+#{ATTTYPE}\\s+#{DEFAULTDECL}"
70
+ ATTDEF_RE = /#{ATTDEF}/
71
+ ATTLISTDECL_START = /^\s*<!ATTLIST/um
72
+ ATTLISTDECL_PATTERN = /^\s*<!ATTLIST\s+#{NAME}(?:#{ATTDEF})*\s*>/um
73
+ NOTATIONDECL_START = /^\s*<!NOTATION/um
74
+ PUBLIC = /^\s*<!NOTATION\s+(\w[\-\w]*)\s+(PUBLIC)\s+(["'])(.*?)\3(?:\s+(["'])(.*?)\5)?\s*>/um
75
+ SYSTEM = /^\s*<!NOTATION\s+(\w[\-\w]*)\s+(SYSTEM)\s+(["'])(.*?)\3\s*>/um
76
+
77
+ TEXT_PATTERN = /\A([^<]*)/um
78
+
79
+ # Entity constants
80
+ PUBIDCHAR = "\x20\x0D\x0Aa-zA-Z0-9\\-()+,./:=?;!*@$_%#"
81
+ SYSTEMLITERAL = %Q{((?:"[^"]*")|(?:'[^']*'))}
82
+ PUBIDLITERAL = %Q{("[#{PUBIDCHAR}']*"|'[#{PUBIDCHAR}]*')}
83
+ EXTERNALID = "(?:(?:(SYSTEM)\\s+#{SYSTEMLITERAL})|(?:(PUBLIC)\\s+#{PUBIDLITERAL}\\s+#{SYSTEMLITERAL}))"
84
+ NDATADECL = "\\s+NDATA\\s+#{NAME}"
85
+ PEREFERENCE = "%#{NAME};"
86
+ ENTITYVALUE = %Q{((?:"(?:[^%&"]|#{PEREFERENCE}|#{REFERENCE})*")|(?:'([^%&']|#{PEREFERENCE}|#{REFERENCE})*'))}
87
+ PEDEF = "(?:#{ENTITYVALUE}|#{EXTERNALID})"
88
+ ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
89
+ PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
90
+ GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
91
+ ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um
92
+
93
+ EREFERENCE = /&(?!#{NAME};)/
94
+
95
+ DEFAULT_ENTITIES = {
96
+ 'gt' => [/&gt;/, '&gt;', '>', />/],
97
+ 'lt' => [/&lt;/, '&lt;', '<', /</],
98
+ 'quot' => [/&quot;/, '&quot;', '"', /"/],
99
+ "apos" => [/&apos;/, "&apos;", "'", /'/]
100
+ }
101
+
102
+
103
+ ######################################################################
104
+ # These are patterns to identify common markup errors, to make the
105
+ # error messages more informative.
106
+ ######################################################################
107
+ MISSING_ATTRIBUTE_QUOTES = /^<#{NAME_STR}\s+#{NAME_STR}\s*=\s*[^"']/um
108
+
109
# Builds a parser over +source+ by handing it to #stream=.
def initialize(source)
  self.stream = source
end
112
+
113
# Registers +listener+ so that it is sent +receive(event)+ for every event
# returned by #pull.
#
# On the first registration the object's own #pull is kept as +_old_pull+
# and replaced, on this instance only, by a wrapper that forwards each event
# to all registered listeners before returning it.
def add_listener( listener )
  unless defined?(@listeners) && @listeners
    @listeners = []
    singleton_class.send(:alias_method, :_old_pull, :pull)
    define_singleton_method(:pull) do
      event = _old_pull
      @listeners.each { |subscriber| subscriber.receive event }
      event
    end
  end
  @listeners << listener
end
129
+
130
+ attr_reader :source
131
+
132
# Points the parser at a new input. +source+ is converted with
# SourceFactory.create_from and all parsing state (pending end tag, document
# status, open tags, queued events, entities, namespace stack) is reset.
def stream=( source )
  @source = SourceFactory.create_from( source )
  @closed = @document_status = nil
  @tags, @stack, @entities = [], [], []
  @nsstack = []
end
141
+
142
# Current position as reported by the source, or 0 when the source does not
# respond to +position+.
def position
  # FIXME
  return 0 unless @source.respond_to? :position
  @source.position
end
150
+
151
# True when the source has nothing left to read and no events are queued
# on the stack, i.e. there are no more events.
def empty?
  @source.empty? && @stack.empty?
end
155
+
156
# True while the source still has input or events are queued on the stack.
# Always the opposite of #empty?.
def has_next?
  !@source.empty? || !@stack.empty?
end
160
+
161
# Puts +token+ back at the front of the event queue, so it is the next
# event handed out. Any number of events may be pushed back this way.
def unshift token
  @stack.insert(0, token)
end
166
+
167
# Looks ahead at the event +depth+ places from the head of the queue
# (0 is the next event) without removing it.
#
# Events are pulled until enough of them are available and are then kept on
# the stack, so peeking far ahead parses (and buffers) that much of the
# document. With +depth+ == -1, events are pulled until #empty? reports true
# and the last queued event is returned.
#
# Raises a RuntimeError for any +depth+ below -1.
def peek depth=0
  raise %Q[Illegal argument "#{depth}"] if depth < -1
  pulled = []
  if depth == -1
    pulled << pull until empty?
  else
    wanted = depth + 1
    pulled << pull while @stack.size + pulled.size < wanted
  end
  @stack += pulled unless pulled.empty?
  @stack[depth]
end
186
+
187
# Returns the next event from the stream as an Array whose first element is
# the event type (+:start_element+, +:text+, +:end_element+, +:comment+,
# +:xmldecl+, +:start_doctype+, ...) followed by that event's data. Returns
# +[ :end_document ]+ once the parser is empty.
#
# Raises REXML::ParseException for malformed input. Any other StandardError
# raised while reading element content is wrapped in a ParseException that
# receives the original error as its fourth argument.
def pull
  # A self-closing tag was just reported: emit its end event first.
  if @closed
    x, @closed = @closed, nil
    return [ :end_element, x ]
  end
  return [ :end_document ] if empty?
  # Events queued by #unshift / #peek are served before reading more input.
  return @stack.shift if @stack.size > 0
  @source.read if @source.buffer.size<2

  # --- Prolog: nothing but whitespace/declarations has been seen yet ---
  if @document_status == nil
    word = @source.match( /^((?:\s+)|(?:<[^>]*>))/um )
    word = word[1] unless word.nil?
    case word
    when COMMENT_START
      md = @source.match( COMMENT_PATTERN, true )
      raise REXML::ParseException.new( "Unclosed comment", @source ) if md.nil?
      return [ :comment, md[1] ]
    when XMLDECL_START
      md = @source.match( XMLDECL_PATTERN, true )
      raise REXML::ParseException.new( "Malformed XML declaration", @source ) if md.nil?
      results = md[1]
      version = VERSION.match( results )
      version = version[1] unless version.nil?
      encoding = ENCODING.match(results)
      encoding = encoding[1] unless encoding.nil?
      @source.encoding = encoding
      standalone = STANDALONE.match(results)
      standalone = standalone[1] unless standalone.nil?
      return [ :xmldecl, version, encoding, standalone ]
    when INSTRUCTION_START
      md = @source.match( INSTRUCTION_PATTERN, true )
      raise REXML::ParseException.new( "Bad instruction declaration", @source ) if md.nil?
      return [ :processing_instruction, *md[1,2] ]
    when DOCTYPE_START
      md = @source.match( DOCTYPE_PATTERN, true )
      @nsstack.unshift(Set.new)
      identity = md[1]
      close = md[2]
      id = IDENTITY.match( identity )
      name = id && id[1]
      raise REXML::ParseException.new("DOCTYPE is missing a name") if name.nil?
      pub_sys = id[2] && id[2].strip
      long_name = id[4] && id[4].strip
      uri = id[6] && id[6].strip
      args = [ :start_doctype, name, pub_sys, long_name, uri ]
      if close == ">"
        # No internal subset: queue the matching :end_doctype right away.
        @document_status = :after_doctype
        @source.read if @source.buffer.size<2
        @source.match(/^\s*/um, true)
        @stack << [ :end_doctype ]
      else
        @document_status = :in_doctype
      end
      return args
    when /^\s+/
      # Leading whitespace: status is left untouched and the whitespace is
      # reported by the text branch below.
    else
      @document_status = :after_doctype
      @source.read if @source.buffer.size<2
      @source.match(/\s*/um, true)
    end
  end

  # --- Internal DTD subset ---
  if @document_status == :in_doctype
    md = @source.match(/\s*(.*?>)/um)
    # Without any '>' left, the subset can never be terminated.
    if md.nil?
      raise REXML::ParseException.new( "Malformed DOCTYPE: unterminated declaration", @source )
    end
    case md[1]
    when SYSTEMENTITY
      match = @source.match( SYSTEMENTITY, true )[1]
      return [ :externalentity, match ]

    when ELEMENTDECL_START
      return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ]

    when ENTITY_START
      entity_md = @source.match( ENTITYDECL, true )
      raise REXML::ParseException.new( "Bad ENTITY declaration!", @source ) if entity_md.nil?
      match = entity_md.to_a.compact
      match[0] = :entitydecl
      ref = false
      if match[1] == '%'
        ref = true
        match.delete_at 1
      end
      # Now we have to sort out what kind of entity reference this is
      if match[2] == 'SYSTEM'
        # External reference
        match[3] = match[3][1..-2] # PUBID
        match.delete_at(4) if match.size > 4 # Chop out NDATA decl
        # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
      elsif match[2] == 'PUBLIC'
        # External reference
        match[3] = match[3][1..-2] # PUBID
        match[4] = match[4][1..-2] # HREF
        # match is [ :entity, name, PUBLIC, pubid, href ]
      else
        match[2] = match[2][1..-2]
        match.pop if match.size == 4
        # match is [ :entity, name, value ]
      end
      match << '%' if ref
      return match
    when ATTLISTDECL_START
      md = @source.match( ATTLISTDECL_PATTERN, true )
      raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
      element = md[1]
      contents = md[0]

      pairs = {}
      values = md[0].scan( ATTDEF_RE )
      values.each do |attdef|
        unless attdef[3] == "#IMPLIED"
          attdef.compact!
          val = attdef[3]
          val = attdef[4] if val == "#FIXED "
          pairs[attdef[0]] = val
          if attdef[0] =~ /^xmlns:(.*)/
            @nsstack[0] << $1
          end
        end
      end
      return [ :attlistdecl, element, pairs, contents ]
    when NOTATIONDECL_START
      md = nil
      if @source.match( PUBLIC )
        md = @source.match( PUBLIC, true )
        vals = [md[1],md[2],md[4],md[6]]
      elsif @source.match( SYSTEM )
        md = @source.match( SYSTEM, true )
        vals = [md[1],md[2],nil,md[4]]
      else
        raise REXML::ParseException.new( "error parsing notation: no matching pattern", @source )
      end
      return [ :notationdecl, *vals ]
    when CDATA_END
      # "]>" closes the internal subset.
      @document_status = :after_doctype
      @source.match( CDATA_END, true )
      return [ :end_doctype ]
    end
  end

  # --- Element content ---
  begin
    if @source.buffer[0] == ?<
      if @source.buffer[1] == ?/
        @nsstack.shift
        last_tag = @tags.pop
        md = @source.match( CLOSE_MATCH, true )
        if md.nil?
          raise REXML::ParseException.new( "Missing end tag for '#{last_tag}'", @source )
        end
        raise REXML::ParseException.new( "Missing end tag for "+
          "'#{last_tag}' (got \"#{md[1]}\")",
          @source) unless last_tag == md[1]
        return [ :end_element, last_tag ]
      elsif @source.buffer[1] == ?!
        md = @source.match(/\A(\s*[^>]*>)/um)
        raise REXML::ParseException.new("Malformed node", @source) unless md
        if md[0][2] == ?-
          md = @source.match( COMMENT_PATTERN, true )
          return [ :comment, md[1] ] if md
        else
          md = @source.match( CDATA_PATTERN, true )
          return [ :cdata, md[1] ] if md
        end
        raise REXML::ParseException.new( "Declarations can only occur "+
          "in the doctype declaration.", @source)
      elsif @source.buffer[1] == ??
        md = @source.match( INSTRUCTION_PATTERN, true )
        return [ :processing_instruction, md[1], md[2] ] if md
        raise REXML::ParseException.new( "Bad instruction declaration",
          @source)
      else
        # Get the next tag
        md = @source.match(TAG_MATCH, true)
        unless md
          # Check for missing attribute quotes
          raise REXML::ParseException.new("missing attribute quote", @source) if @source.match(MISSING_ATTRIBUTE_QUOTES )
          raise REXML::ParseException.new("malformed XML: missing tag start", @source)
        end
        attributes = {}
        prefixes = Set.new
        prefixes << md[2] if md[2]
        curr_ns = Set.new
        @nsstack.unshift(curr_ns)
        if md[4].size > 0
          attrs = md[4].scan( ATTRIBUTE_PATTERN )
          raise REXML::ParseException.new( "error parsing attributes: [#{attrs.join ', '}], excess = \"#$'\"", @source) if $' and $'.strip.size > 0
          # Each scan result is [ full name, prefix, local part, quote, value ].
          attrs.each { |attr_name, prefix, local_part, _quote, value|
            if prefix == "xmlns"
              if local_part == "xml"
                # Compare the attribute VALUE (not the quote character) with
                # the reserved namespace name.
                if value != "http://www.w3.org/XML/1998/namespace"
                  msg = "The 'xml' prefix must not be bound to any other namespace "+
                  "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
                  raise REXML::ParseException.new( msg, @source, self )
                end
              elsif local_part == "xmlns"
                msg = "The 'xmlns' prefix must not be declared "+
                "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
                raise REXML::ParseException.new( msg, @source, self)
              end
              curr_ns << local_part
            elsif prefix
              prefixes << prefix unless prefix == "xml"
            end
            attributes[attr_name] = value
          }
        end

        # Verify that all of the prefixes have been defined
        prefixes.each do |prefix|
          unless @nsstack.find{|k| k.member?(prefix)}
            raise UndefinedNamespaceException.new(prefix,@source,self)
          end
        end

        if md[6]
          # Self-closing tag: remember it so the next call emits :end_element.
          @closed = md[1]
          @nsstack.shift
        else
          @tags.push( md[1] )
        end
        return [ :start_element, md[1], attributes ]
      end
    else
      md = @source.match( TEXT_PATTERN, true )
      if md[0].length == 0
        @source.match( /(\s+)/, true )
      end
      return [ :text, md[1] ]
    end
  rescue REXML::UndefinedNamespaceException
    raise
  rescue REXML::ParseException
    raise
  rescue StandardError => error
    # Only ordinary errors are wrapped; signals and exit requests propagate.
    raise REXML::ParseException.new( "Exception parsing",
      @source, self, error )
  end
  return [ :dummy ]
end
422
+
423
# Looks up the replacement text for the entity named +reference+.
#
# +entities+ (may be nil) is consulted first; the built-in DEFAULT_ENTITIES
# table is the fallback. A value that is found is passed through
# #unnormalize before being returned; nil is returned for an unknown name.
def entity( reference, entities )
  value = entities && entities[ reference ]
  unless value
    default = DEFAULT_ENTITIES[ reference ]
    value = default[2] if default
  end
  unnormalize( value, entities ) if value
end
432
+
433
# Escapes all possible entities in a copy of +input+ (which is not
# modified and may be frozen).
#
# input::         the text to escape
# entities::      optional collection of name/value pairs; every occurrence
#                 of a value is replaced by the reference <tt>&name;</tt>
# entity_filter:: optional collection of entity names that must NOT be
#                 substituted
#
# Ampersands that do not start a named reference become <tt>&amp;</tt>, and
# the characters covered by DEFAULT_ENTITIES are replaced by their
# references. Returns the escaped copy.
def normalize( input, entities=nil, entity_filter=nil )
  # dup rather than clone: a clone of a frozen string is itself frozen.
  copy = input.dup
  # Doing it like this rather than in a loop improves the speed
  copy.gsub!( EREFERENCE, '&amp;' )
  if entities
    entities.each do |key, value|
      # The filter is matched against the entity's name.
      next if entity_filter && entity_filter.include?( key )
      copy.gsub!( value, "&#{key};" )
    end
  end
  copy.gsub!( EREFERENCE, '&amp;' )
  DEFAULT_ENTITIES.each_value do |patterns|
    copy.gsub!( patterns[3], patterns[1] )
  end
  copy
end
448
+
449
# Unescapes all possible entities in a copy of +string+ (which is not
# modified and may be frozen).
#
# string::   the text to unescape
# entities:: optional table of entity names to values, consulted (through
#            #entity) before the built-in DEFAULT_ENTITIES
# filter::   optional collection of entity names that must be left escaped
#
# Line endings (CR LF and lone CR) are converted to LF first. Numeric
# character references are decoded next, then named references, and
# <tt>&amp;</tt> is replaced last. Returns the unescaped copy.
def unnormalize( string, entities=nil, filter=nil )
  # dup rather than clone: a clone of a frozen string is itself frozen.
  rv = string.dup
  rv.gsub!( /\r\n?/, "\n" )
  matches = rv.scan( REFERENCE_RE )
  return rv if matches.size == 0
  rv.gsub!( /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) {
    m = $1
    m = "0#{m}" if m[0] == ?x
    [Integer(m)].pack('U*')
  }
  # Keep only the named references; numeric ones captured no name.
  matches.collect!{|x|x[0]}.compact!
  if matches.size > 0
    matches.each do |entity_reference|
      unless filter and filter.include?(entity_reference)
        entity_value = entity( entity_reference, entities )
        if entity_value
          # Names may contain regex metacharacters such as '.', and the
          # value must be inserted literally: a replacement *string* would
          # have its backslash sequences interpreted.
          re = /&#{Regexp.escape(entity_reference)};/
          rv.gsub!( re ) { entity_value }
        end
      end
    end
    matches.each do |entity_reference|
      unless filter and filter.include?(entity_reference)
        er = DEFAULT_ENTITIES[entity_reference]
        rv.gsub!( er[0], er[2] ) if er
      end
    end
    rv.gsub!( /&amp;/, '&' )
  end
  rv
end
481
+ end
482
+ end
483
+ end
484
+
485
+ =begin
486
+ case event[0]
487
+ when :start_element
488
+ when :text
489
+ when :end_element
490
+ when :processing_instruction
491
+ when :cdata
492
+ when :comment
493
+ when :xmldecl
494
+ when :start_doctype
495
+ when :end_doctype
496
+ when :externalentity
497
+ when :elementdecl
498
+ when :entity
499
+ when :attlistdecl
500
+ when :notationdecl
501
+ when :end_doctype
502
+ end
503
+ =end