rubysl-rexml 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +17 -0
- data/.travis.yml +8 -0
- data/Gemfile +4 -0
- data/LICENSE +25 -0
- data/README.md +29 -0
- data/Rakefile +1 -0
- data/lib/rexml/attlistdecl.rb +62 -0
- data/lib/rexml/attribute.rb +185 -0
- data/lib/rexml/cdata.rb +67 -0
- data/lib/rexml/child.rb +96 -0
- data/lib/rexml/comment.rb +80 -0
- data/lib/rexml/doctype.rb +271 -0
- data/lib/rexml/document.rb +230 -0
- data/lib/rexml/dtd/attlistdecl.rb +10 -0
- data/lib/rexml/dtd/dtd.rb +51 -0
- data/lib/rexml/dtd/elementdecl.rb +17 -0
- data/lib/rexml/dtd/entitydecl.rb +56 -0
- data/lib/rexml/dtd/notationdecl.rb +39 -0
- data/lib/rexml/element.rb +1227 -0
- data/lib/rexml/encoding.rb +71 -0
- data/lib/rexml/encodings/CP-1252.rb +103 -0
- data/lib/rexml/encodings/EUC-JP.rb +35 -0
- data/lib/rexml/encodings/ICONV.rb +22 -0
- data/lib/rexml/encodings/ISO-8859-1.rb +7 -0
- data/lib/rexml/encodings/ISO-8859-15.rb +72 -0
- data/lib/rexml/encodings/SHIFT-JIS.rb +37 -0
- data/lib/rexml/encodings/SHIFT_JIS.rb +1 -0
- data/lib/rexml/encodings/UNILE.rb +34 -0
- data/lib/rexml/encodings/US-ASCII.rb +30 -0
- data/lib/rexml/encodings/UTF-16.rb +35 -0
- data/lib/rexml/encodings/UTF-8.rb +18 -0
- data/lib/rexml/entity.rb +166 -0
- data/lib/rexml/formatters/default.rb +109 -0
- data/lib/rexml/formatters/pretty.rb +138 -0
- data/lib/rexml/formatters/transitive.rb +56 -0
- data/lib/rexml/functions.rb +382 -0
- data/lib/rexml/instruction.rb +70 -0
- data/lib/rexml/light/node.rb +196 -0
- data/lib/rexml/namespace.rb +47 -0
- data/lib/rexml/node.rb +75 -0
- data/lib/rexml/output.rb +24 -0
- data/lib/rexml/parent.rb +166 -0
- data/lib/rexml/parseexception.rb +51 -0
- data/lib/rexml/parsers/baseparser.rb +503 -0
- data/lib/rexml/parsers/lightparser.rb +60 -0
- data/lib/rexml/parsers/pullparser.rb +196 -0
- data/lib/rexml/parsers/sax2parser.rb +238 -0
- data/lib/rexml/parsers/streamparser.rb +46 -0
- data/lib/rexml/parsers/treeparser.rb +97 -0
- data/lib/rexml/parsers/ultralightparser.rb +56 -0
- data/lib/rexml/parsers/xpathparser.rb +698 -0
- data/lib/rexml/quickpath.rb +266 -0
- data/lib/rexml/rexml.rb +32 -0
- data/lib/rexml/sax2listener.rb +97 -0
- data/lib/rexml/source.rb +251 -0
- data/lib/rexml/streamlistener.rb +92 -0
- data/lib/rexml/syncenumerator.rb +33 -0
- data/lib/rexml/text.rb +344 -0
- data/lib/rexml/undefinednamespaceexception.rb +8 -0
- data/lib/rexml/validation/relaxng.rb +559 -0
- data/lib/rexml/validation/validation.rb +155 -0
- data/lib/rexml/validation/validationexception.rb +9 -0
- data/lib/rexml/xmldecl.rb +119 -0
- data/lib/rexml/xmltokens.rb +18 -0
- data/lib/rexml/xpath.rb +66 -0
- data/lib/rexml/xpath_parser.rb +792 -0
- data/lib/rubysl/rexml.rb +1 -0
- data/lib/rubysl/rexml/version.rb +5 -0
- data/rubysl-rexml.gemspec +23 -0
- data/spec/attribute/clone_spec.rb +10 -0
- data/spec/attribute/element_spec.rb +22 -0
- data/spec/attribute/equal_value_spec.rb +17 -0
- data/spec/attribute/hash_spec.rb +12 -0
- data/spec/attribute/initialize_spec.rb +28 -0
- data/spec/attribute/inspect_spec.rb +19 -0
- data/spec/attribute/namespace_spec.rb +23 -0
- data/spec/attribute/node_type_spec.rb +9 -0
- data/spec/attribute/prefix_spec.rb +17 -0
- data/spec/attribute/remove_spec.rb +19 -0
- data/spec/attribute/to_s_spec.rb +13 -0
- data/spec/attribute/to_string_spec.rb +14 -0
- data/spec/attribute/value_spec.rb +14 -0
- data/spec/attribute/write_spec.rb +22 -0
- data/spec/attribute/xpath_spec.rb +19 -0
- data/spec/attributes/add_spec.rb +6 -0
- data/spec/attributes/append_spec.rb +6 -0
- data/spec/attributes/delete_all_spec.rb +30 -0
- data/spec/attributes/delete_spec.rb +26 -0
- data/spec/attributes/each_attribute_spec.rb +24 -0
- data/spec/attributes/each_spec.rb +24 -0
- data/spec/attributes/element_reference_spec.rb +18 -0
- data/spec/attributes/element_set_spec.rb +25 -0
- data/spec/attributes/get_attribute_ns_spec.rb +13 -0
- data/spec/attributes/get_attribute_spec.rb +28 -0
- data/spec/attributes/initialize_spec.rb +18 -0
- data/spec/attributes/length_spec.rb +6 -0
- data/spec/attributes/namespaces_spec.rb +5 -0
- data/spec/attributes/prefixes_spec.rb +23 -0
- data/spec/attributes/shared/add.rb +17 -0
- data/spec/attributes/shared/length.rb +12 -0
- data/spec/attributes/size_spec.rb +6 -0
- data/spec/attributes/to_a_spec.rb +20 -0
- data/spec/cdata/clone_spec.rb +9 -0
- data/spec/cdata/initialize_spec.rb +24 -0
- data/spec/cdata/shared/to_s.rb +11 -0
- data/spec/cdata/to_s_spec.rb +6 -0
- data/spec/cdata/value_spec.rb +6 -0
- data/spec/document/add_element_spec.rb +30 -0
- data/spec/document/add_spec.rb +60 -0
- data/spec/document/clone_spec.rb +19 -0
- data/spec/document/doctype_spec.rb +14 -0
- data/spec/document/encoding_spec.rb +21 -0
- data/spec/document/expanded_name_spec.rb +15 -0
- data/spec/document/new_spec.rb +37 -0
- data/spec/document/node_type_spec.rb +7 -0
- data/spec/document/root_spec.rb +11 -0
- data/spec/document/stand_alone_spec.rb +18 -0
- data/spec/document/version_spec.rb +13 -0
- data/spec/document/write_spec.rb +38 -0
- data/spec/document/xml_decl_spec.rb +14 -0
- data/spec/element/add_attribute_spec.rb +40 -0
- data/spec/element/add_attributes_spec.rb +21 -0
- data/spec/element/add_element_spec.rb +38 -0
- data/spec/element/add_namespace_spec.rb +23 -0
- data/spec/element/add_text_spec.rb +23 -0
- data/spec/element/attribute_spec.rb +16 -0
- data/spec/element/attributes_spec.rb +18 -0
- data/spec/element/cdatas_spec.rb +23 -0
- data/spec/element/clone_spec.rb +28 -0
- data/spec/element/comments_spec.rb +20 -0
- data/spec/element/delete_attribute_spec.rb +38 -0
- data/spec/element/delete_element_spec.rb +50 -0
- data/spec/element/delete_namespace_spec.rb +24 -0
- data/spec/element/document_spec.rb +17 -0
- data/spec/element/each_element_with_attribute_spec.rb +34 -0
- data/spec/element/each_element_with_text_spec.rb +30 -0
- data/spec/element/get_text_spec.rb +17 -0
- data/spec/element/has_attributes_spec.rb +16 -0
- data/spec/element/has_elements_spec.rb +17 -0
- data/spec/element/has_text_spec.rb +15 -0
- data/spec/element/inspect_spec.rb +26 -0
- data/spec/element/instructions_spec.rb +20 -0
- data/spec/element/namespace_spec.rb +26 -0
- data/spec/element/namespaces_spec.rb +31 -0
- data/spec/element/new_spec.rb +34 -0
- data/spec/element/next_element_spec.rb +18 -0
- data/spec/element/node_type_spec.rb +7 -0
- data/spec/element/prefixes_spec.rb +22 -0
- data/spec/element/previous_element_spec.rb +19 -0
- data/spec/element/raw_spec.rb +23 -0
- data/spec/element/root_spec.rb +27 -0
- data/spec/element/text_spec.rb +45 -0
- data/spec/element/texts_spec.rb +15 -0
- data/spec/element/whitespace_spec.rb +22 -0
- data/spec/node/each_recursive_spec.rb +20 -0
- data/spec/node/find_first_recursive_spec.rb +24 -0
- data/spec/node/index_in_parent_spec.rb +14 -0
- data/spec/node/next_sibling_node_spec.rb +20 -0
- data/spec/node/parent_spec.rb +20 -0
- data/spec/node/previous_sibling_node_spec.rb +20 -0
- data/spec/shared/each_element.rb +35 -0
- data/spec/shared/elements_to_a.rb +35 -0
- data/spec/text/append_spec.rb +9 -0
- data/spec/text/clone_spec.rb +9 -0
- data/spec/text/comparison_spec.rb +24 -0
- data/spec/text/empty_spec.rb +11 -0
- data/spec/text/indent_text_spec.rb +23 -0
- data/spec/text/inspect_spec.rb +7 -0
- data/spec/text/new_spec.rb +48 -0
- data/spec/text/node_type_spec.rb +7 -0
- data/spec/text/normalize_spec.rb +7 -0
- data/spec/text/read_with_substitution_spec.rb +12 -0
- data/spec/text/to_s_spec.rb +17 -0
- data/spec/text/unnormalize_spec.rb +7 -0
- data/spec/text/value_spec.rb +36 -0
- data/spec/text/wrap_spec.rb +20 -0
- data/spec/text/write_with_substitution_spec.rb +32 -0
- metadata +385 -0
@@ -0,0 +1,51 @@
|
|
1
|
+
module REXML
  # Raised when REXML cannot parse its input.
  #
  # Besides the message, an instance can carry the source that was being
  # read, the parser that raised it, and an earlier exception that is being
  # re-reported (+continued_exception+).
  class ParseException < RuntimeError
    attr_accessor :source, :parser, :continued_exception

    # message::   description of the failure
    # source::    object being parsed (optional); +to_s+ reads +buffer+ from
    #             it, and +position+, +line+ and +context+ read
    #             +current_line+ from it
    # parser::    the parser that raised the error (optional)
    # exception:: an underlying exception to quote in +to_s+ (optional)
    def initialize( message, source=nil, parser=nil, exception=nil )
      super(message)
      @source = source
      @parser = parser
      @continued_exception = exception
    end

    # Builds the full report: the quoted original exception (if any), this
    # exception's own message, and, when a source is attached, the line,
    # the position and an excerpt of the unconsumed buffer.
    def to_s
      # Quote the original exception, if there was one.  An exception that
      # was built but never raised has a nil backtrace, so coerce it to an
      # array before joining.
      err = if @continued_exception
        trace = Array( @continued_exception.backtrace ).join( "\n" )
        "#{@continued_exception.inspect}\n#{trace}\n...\n"
      else
        "".dup
      end

      # Get the error message
      err << super

      # Add contextual information
      if @source
        err << "\nLine: #{line}\n"
        err << "Position: #{position}\n"
        err << "Last 80 unconsumed characters:\n"
        # [0, 80] yields at most 80 characters, matching the label above.
        err << @source.buffer[0, 80].gsub(/\n/, ' ')
      end

      err
    end

    # Element 0 of the source's +current_line+ result, or nil when there is
    # no source or it reports nothing.
    def position
      info = current_line_info
      info[0] if info
    end

    # Element 2 of the source's +current_line+ result, or nil when there is
    # no source or it reports nothing.
    def line
      info = current_line_info
      info[2] if info
    end

    # The raw +current_line+ result of the source, or nil when no usable
    # source is attached.
    def context
      current_line_info
    end

    private

    # Asks the source for +current_line+ exactly once; returns nil when
    # there is no source or the source does not provide that method.
    def current_line_info
      return nil unless @source and @source.respond_to?( :current_line )
      @source.current_line
    end
  end
end
|
@@ -0,0 +1,503 @@
|
|
1
|
+
require 'rexml/parseexception'
|
2
|
+
require 'rexml/undefinednamespaceexception'
|
3
|
+
require 'rexml/source'
|
4
|
+
require 'set'
|
5
|
+
|
6
|
+
module REXML
|
7
|
+
module Parsers
|
8
|
+
# = Using the Pull Parser
|
9
|
+
# <em>This API is experimental, and subject to change.</em>
|
10
|
+
# parser = PullParser.new( "<a>text<b att='val'/>txet</a>" )
|
11
|
+
# while parser.has_next?
|
12
|
+
# res = parser.next
|
13
|
+
# puts res[1]['att'] if res.start_tag? and res[0] == 'b'
|
14
|
+
# end
|
15
|
+
# See the PullEvent class for information on the content of the results.
|
16
|
+
# The data is identical to the arguments passed for the various events to
|
17
|
+
# the StreamListener API.
|
18
|
+
#
|
19
|
+
# Notice that:
|
20
|
+
# parser = PullParser.new( "<a>BAD DOCUMENT" )
|
21
|
+
# while parser.has_next?
|
22
|
+
# res = parser.next
|
23
|
+
# raise res[1] if res.error?
|
24
|
+
# end
|
25
|
+
#
|
26
|
+
# Nat Price gave me some good ideas for the API.
|
27
|
+
class BaseParser
|
28
|
+
# Building blocks for XML names.  The *_STR values are regexp source
# fragments that are interpolated into the patterns below.
NCNAME_STR= '[\w:][\-\w\d.]*'
NAME_STR= "(?:(#{NCNAME_STR}):)?(#{NCNAME_STR})"
UNAME_STR= "(?:#{NCNAME_STR}:)?#{NCNAME_STR}"

NAMECHAR = '[\-\w\d\.:]'
NAME = "([\\w:]#{NAMECHAR}*)"
NMTOKEN = "(?:#{NAMECHAR})+"
NMTOKENS = "#{NMTOKEN}(\\s+#{NMTOKEN})*"
REFERENCE = "(?:&#{NAME};|&#\\d+;|&#x[0-9a-fA-F]+;)"
REFERENCE_RE = /#{REFERENCE}/

# Markup recognisers.  The *_START patterns classify what comes next; the
# *_PATTERN patterns capture the complete construct.
DOCTYPE_START = /\A\s*<!DOCTYPE\s/um
DOCTYPE_PATTERN = /\s*<!DOCTYPE\s+(.*?)(\[|>)/um
ATTRIBUTE_PATTERN = /\s*(#{NAME_STR})\s*=\s*(["'])(.*?)\4/um
COMMENT_START = /\A<!--/u
COMMENT_PATTERN = /<!--(.*?)-->/um
CDATA_START = /\A<!\[CDATA\[/u
CDATA_END = /^\s*\]\s*>/um
CDATA_PATTERN = /<!\[CDATA\[(.*?)\]\]>/um
XMLDECL_START = /\A<\?xml\s/u
XMLDECL_PATTERN = /<\?xml\s+(.*?)\?>/um
INSTRUCTION_START = /\A<\?/u
INSTRUCTION_PATTERN = /<\?(.*?)(\s+.*?)?\?>/um
TAG_MATCH = /^<((?>#{NAME_STR}))\s*((?>\s+#{UNAME_STR}\s*=\s*(["']).*?\5)*)\s*(\/)?>/um
CLOSE_MATCH = /^\s*<\/(#{NAME_STR})\s*>/um

# Pseudo-attributes of the XML declaration.  STANDALONE accepts any amount
# of whitespace (including none) after "=", like VERSION and ENCODING.
VERSION = /\bversion\s*=\s*["'](.*?)['"]/um
ENCODING = /\bencoding\s*=\s*["'](.*?)['"]/um
STANDALONE = /\bstandalone\s*=\s*["'](.*?)['"]/um

# DTD declarations.
ENTITY_START = /^\s*<!ENTITY/
IDENTITY = /^([!\*\w\-]+)(\s+#{NCNAME_STR})?(\s+["'](.*?)['"])?(\s+['"](.*?)["'])?/u
ELEMENTDECL_START = /^\s*<!ELEMENT/um
ELEMENTDECL_PATTERN = /^\s*(<!ELEMENT.*?)>/um
SYSTEMENTITY = /^\s*(%.*?;)\s*$/um
ENUMERATION = "\\(\\s*#{NMTOKEN}(?:\\s*\\|\\s*#{NMTOKEN})*\\s*\\)"
NOTATIONTYPE = "NOTATION\\s+\\(\\s*#{NAME}(?:\\s*\\|\\s*#{NAME})*\\s*\\)"
ENUMERATEDTYPE = "(?:(?:#{NOTATIONTYPE})|(?:#{ENUMERATION}))"
ATTTYPE = "(CDATA|ID|IDREF|IDREFS|ENTITY|ENTITIES|NMTOKEN|NMTOKENS|#{ENUMERATEDTYPE})"
ATTVALUE = "(?:\"((?:[^<&\"]|#{REFERENCE})*)\")|(?:'((?:[^<&']|#{REFERENCE})*)')"
DEFAULTDECL = "(#REQUIRED|#IMPLIED|(?:(#FIXED\\s+)?#{ATTVALUE}))"
ATTDEF = "\\s+#{NAME}\\s+#{ATTTYPE}\\s+#{DEFAULTDECL}"
ATTDEF_RE = /#{ATTDEF}/
ATTLISTDECL_START = /^\s*<!ATTLIST/um
ATTLISTDECL_PATTERN = /^\s*<!ATTLIST\s+#{NAME}(?:#{ATTDEF})*\s*>/um
NOTATIONDECL_START = /^\s*<!NOTATION/um
PUBLIC = /^\s*<!NOTATION\s+(\w[\-\w]*)\s+(PUBLIC)\s+(["'])(.*?)\3(?:\s+(["'])(.*?)\5)?\s*>/um
SYSTEM = /^\s*<!NOTATION\s+(\w[\-\w]*)\s+(SYSTEM)\s+(["'])(.*?)\3\s*>/um

TEXT_PATTERN = /\A([^<]*)/um

# Entity constants
PUBIDCHAR = "\x20\x0D\x0Aa-zA-Z0-9\\-()+,./:=?;!*@$_%#"
SYSTEMLITERAL = %Q{((?:"[^"]*")|(?:'[^']*'))}
PUBIDLITERAL = %Q{("[#{PUBIDCHAR}']*"|'[#{PUBIDCHAR}]*')}
EXTERNALID = "(?:(?:(SYSTEM)\\s+#{SYSTEMLITERAL})|(?:(PUBLIC)\\s+#{PUBIDLITERAL}\\s+#{SYSTEMLITERAL}))"
NDATADECL = "\\s+NDATA\\s+#{NAME}"
PEREFERENCE = "%#{NAME};"
ENTITYVALUE = %Q{((?:"(?:[^%&"]|#{PEREFERENCE}|#{REFERENCE})*")|(?:'([^%&']|#{PEREFERENCE}|#{REFERENCE})*'))}
PEDEF = "(?:#{ENTITYVALUE}|#{EXTERNALID})"
ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um

# An ampersand that does not start a named entity reference.
EREFERENCE = /&(?!#{NAME};)/

# The predefined entities.  Each value is
#   [ pattern matching the escaped form, escaped form,
#     literal character, pattern matching the literal character ]
DEFAULT_ENTITIES = {
  'gt' => [/&gt;/, '&gt;', '>', />/],
  'lt' => [/&lt;/, '&lt;', '<', /</],
  'quot' => [/&quot;/, '&quot;', '"', /"/],
  "apos" => [/&apos;/, "&apos;", "'", /'/]
}


######################################################################
# These are patterns to identify common markup errors, to make the
# error messages more informative.
######################################################################
MISSING_ATTRIBUTE_QUOTES = /^<#{NAME_STR}\s+#{NAME_STR}\s*=\s*[^"']/um
|
108
|
+
|
109
|
+
# Creates a parser for +source+.  The argument is handed to +stream=+,
# which wraps it and resets the parser state.
def initialize( source )
  self.stream = source
end
|
112
|
+
|
113
|
+
# Registers +listener+ so that it is sent +receive(event)+ for every event
# returned by +pull+.
#
# The first registration wraps this object's +pull+: the existing method is
# kept as +_old_pull+ and a replacement is defined that forwards each event
# to all registered listeners before returning it.
def add_listener( listener )
  unless defined?(@listeners) and @listeners
    @listeners = []
    instance_eval <<-EOL
      alias :_old_pull :pull
      def pull
        event = _old_pull
        @listeners.each do |listener|
          listener.receive event
        end
        event
      end
    EOL
  end
  @listeners << listener
end
|
129
|
+
|
130
|
+
# The source object currently being parsed; assigned by +stream=+.
attr_reader :source
|
131
|
+
|
132
|
+
# Points the parser at new input and resets all parsing state.  +source+
# is wrapped with SourceFactory.create_from.
def stream=( source )
  @source = SourceFactory.create_from( source )
  @closed = nil            # name of a self-closed tag awaiting its :end_element
  @document_status = nil   # nil, :in_doctype or :after_doctype
  @tags = []               # names of the currently open elements
  @stack = []              # events pushed back or pre-parsed by +peek+
  @entities = []
  @nsstack = []            # one Set of declared prefixes per open scope
end
|
141
|
+
|
142
|
+
# Returns the source's +position+ when the source provides one, and 0
# otherwise.
def position
  if @source.respond_to? :position
    @source.position
  else
    # FIXME
    0
  end
end
|
150
|
+
|
151
|
+
# Returns true if there are no more events: the source is exhausted and no
# events are waiting on the stack.
def empty?
  @source.empty? && @stack.empty?
end
|
155
|
+
|
156
|
+
# Returns true if there are more events.  Synonymous with !empty?
def has_next?
  !empty?
end
|
160
|
+
|
161
|
+
# Push an event back on the head of the stream.  This method
# has (theoretically) infinite depth.
def unshift token
  @stack.unshift(token)
end
|
166
|
+
|
167
|
+
# Peek at the +depth+ event in the stack.  The first element on the stack
# is at depth 0.  If +depth+ is -1, events are pulled until +empty?+ is
# true and the last event gathered is returned.
#
# Be aware that this causes the stream to be parsed up to the +depth+
# event, so you can effectively pre-parse the entire document (pull the
# entire thing into memory) using this method.
#
# Raises a RuntimeError when +depth+ is less than -1.
def peek depth=0
  raise %Q[Illegal argument "#{depth}"] if depth < -1
  temp = []
  if depth == -1
    temp.push(pull()) until empty?
  else
    # pull() hands back already-stacked events first, so they migrate into
    # +temp+ in order before anything new is parsed.
    while @stack.size+temp.size < depth+1
      temp.push(pull())
    end
  end
  @stack += temp if temp.size > 0
  @stack[depth]
end
|
186
|
+
|
187
|
+
# Returns the next event as an array: the first element is a symbol naming
# the event (:start_element, :end_element, :text, :comment, ...) and the
# remaining elements are that event's arguments.
#
# Raises REXML::ParseException (or REXML::UndefinedNamespaceException) on
# malformed input.  Any other StandardError raised while parsing element
# content is wrapped in a ParseException that carries the original error.
def pull
  # A self-closing tag is reported as :start_element by one call and as
  # :end_element by the next; @closed remembers the pending tag name.
  if @closed
    x, @closed = @closed, nil
    return [ :end_element, x ]
  end
  return [ :end_document ] if empty?
  return @stack.shift if @stack.size > 0
  @source.read if @source.buffer.size<2

  if @document_status == nil
    event = pull_prolog_event
    return event if event
  end
  if @document_status == :in_doctype
    event = pull_doctype_event
    return event if event
  end

  begin
    event = pull_content_event
    return event if event
  rescue REXML::UndefinedNamespaceException
    raise
  rescue REXML::ParseException
    raise
  rescue => error
    # Only StandardError is wrapped, so signals and exit requests still
    # propagate untouched.
    raise REXML::ParseException.new( "Exception parsing",
      @source, self, error )
  end
  return [ :dummy ]
end

# Handles input seen before any doctype or element content.  Returns an
# event array, or nil when the caller should go on to the later stages.
def pull_prolog_event
  word = @source.match( /^((?:\s+)|(?:<[^>]*>))/um )
  word = word[1] unless word.nil?
  case word
  when COMMENT_START
    md = @source.match( COMMENT_PATTERN, true )
    raise REXML::ParseException.new( "Unclosed comment", @source ) if md.nil?
    return [ :comment, md[1] ]
  when XMLDECL_START
    md = @source.match( XMLDECL_PATTERN, true )
    raise REXML::ParseException.new( "Malformed XML declaration", @source ) if md.nil?
    results = md[1]
    version = VERSION.match( results )
    version = version[1] unless version.nil?
    encoding = ENCODING.match(results)
    encoding = encoding[1] unless encoding.nil?
    @source.encoding = encoding
    standalone = STANDALONE.match(results)
    standalone = standalone[1] unless standalone.nil?
    return [ :xmldecl, version, encoding, standalone ]
  when INSTRUCTION_START
    md = @source.match( INSTRUCTION_PATTERN, true )
    raise REXML::ParseException.new( "Bad instruction declaration", @source ) if md.nil?
    return [ :processing_instruction, *md[1,2] ]
  when DOCTYPE_START
    md = @source.match( DOCTYPE_PATTERN, true )
    raise REXML::ParseException.new( "Malformed DOCTYPE", @source ) if md.nil?
    @nsstack.unshift(Set.new)
    close = md[2]
    id = IDENTITY.match( md[1] )
    name = id && id[1]
    raise REXML::ParseException.new("DOCTYPE is missing a name") if name.nil?
    pub_sys = id[2].nil? ? nil : id[2].strip
    long_name = id[4].nil? ? nil : id[4].strip
    uri = id[6].nil? ? nil : id[6].strip
    args = [ :start_doctype, name, pub_sys, long_name, uri ]
    if close == ">"
      # No internal subset: queue the matching :end_doctype right away.
      @document_status = :after_doctype
      @source.read if @source.buffer.size<2
      @source.match(/^\s*/um, true)
      @stack << [ :end_doctype ]
    else
      @document_status = :in_doctype
    end
    return args
  when /^\s+/
    # Leading whitespace: the status is left untouched and the whitespace
    # is reported by the content stage.
  else
    @document_status = :after_doctype
    @source.read if @source.buffer.size<2
    @source.match(/\s*/um, true)
  end
  nil
end
private :pull_prolog_event

# Handles one declaration inside a doctype's internal subset.  Returns an
# event array, or nil when nothing here recognises the input.
def pull_doctype_event
  md = @source.match(/\s*(.*?>)/um)
  raise REXML::ParseException.new( "Malformed DOCTYPE: unclosed declaration", @source ) if md.nil?
  case md[1]
  when SYSTEMENTITY
    match = @source.match( SYSTEMENTITY, true )[1]
    return [ :externalentity, match ]

  when ELEMENTDECL_START
    return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ]

  when ENTITY_START
    md = @source.match( ENTITYDECL, true )
    raise REXML::ParseException.new( "Bad ENTITY declaration!", @source ) if md.nil?
    match = md.to_a.compact
    match[0] = :entitydecl
    ref = false
    if match[1] == '%'
      ref = true
      match.delete_at 1
    end
    # Now we have to sort out what kind of entity reference this is
    if match[2] == 'SYSTEM'
      # External reference
      match[3] = match[3][1..-2] # PUBID
      match.delete_at(4) if match.size > 4 # Chop out NDATA decl
      # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
    elsif match[2] == 'PUBLIC'
      # External reference
      match[3] = match[3][1..-2] # PUBID
      match[4] = match[4][1..-2] # HREF
      # match is [ :entity, name, PUBLIC, pubid, href ]
    else
      match[2] = match[2][1..-2]
      match.pop if match.size == 4
      # match is [ :entity, name, value ]
    end
    match << '%' if ref
    return match
  when ATTLISTDECL_START
    md = @source.match( ATTLISTDECL_PATTERN, true )
    raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
    element = md[1]
    contents = md[0]

    pairs = {}
    values = md[0].scan( ATTDEF_RE )
    values.each do |attdef|
      unless attdef[3] == "#IMPLIED"
        attdef.compact!
        val = attdef[3]
        val = attdef[4] if val == "#FIXED "
        pairs[attdef[0]] = val
        if attdef[0] =~ /^xmlns:(.*)/
          @nsstack[0] << $1
        end
      end
    end
    return [ :attlistdecl, element, pairs, contents ]
  when NOTATIONDECL_START
    if (md = @source.match( PUBLIC, true ))
      vals = [md[1],md[2],md[4],md[6]]
    elsif (md = @source.match( SYSTEM, true ))
      vals = [md[1],md[2],nil,md[4]]
    else
      raise REXML::ParseException.new( "error parsing notation: no matching pattern", @source )
    end
    return [ :notationdecl, *vals ]
  when CDATA_END
    # "]>" closes the internal subset.
    @document_status = :after_doctype
    @source.match( CDATA_END, true )
    return [ :end_doctype ]
  end
  nil
end
private :pull_doctype_event

# Handles element content: end tags, comments, CDATA sections, processing
# instructions, start tags and text.  Returns the event array.
def pull_content_event
  if @source.buffer[0] == ?<
    if @source.buffer[1] == ?/
      @nsstack.shift
      last_tag = @tags.pop
      md = @source.match( CLOSE_MATCH, true )
      if md.nil? or last_tag != md[1]
        message = "Missing end tag for '#{last_tag}'"
        message << " (got \"#{md[1]}\")" if md
        raise REXML::ParseException.new( message, @source )
      end
      return [ :end_element, last_tag ]
    elsif @source.buffer[1] == ?!
      md = @source.match(/\A(\s*[^>]*>)/um)
      raise REXML::ParseException.new("Malformed node", @source) unless md
      if md[0][2] == ?-
        md = @source.match( COMMENT_PATTERN, true )
        return [ :comment, md[1] ] if md
      else
        md = @source.match( CDATA_PATTERN, true )
        return [ :cdata, md[1] ] if md
      end
      raise REXML::ParseException.new( "Declarations can only occur "+
        "in the doctype declaration.", @source)
    elsif @source.buffer[1] == ??
      md = @source.match( INSTRUCTION_PATTERN, true )
      return [ :processing_instruction, md[1], md[2] ] if md
      raise REXML::ParseException.new( "Bad instruction declaration",
        @source)
    else
      # Get the next tag
      md = @source.match(TAG_MATCH, true)
      unless md
        # Check for missing attribute quotes
        raise REXML::ParseException.new("missing attribute quote", @source) if @source.match(MISSING_ATTRIBUTE_QUOTES )
        raise REXML::ParseException.new("malformed XML: missing tag start", @source)
      end
      attributes = {}
      prefixes = Set.new
      prefixes << md[2] if md[2]
      @nsstack.unshift(curr_ns=Set.new)
      if md[4].size > 0
        attrs = md[4].scan( ATTRIBUTE_PATTERN )
        raise REXML::ParseException.new( "error parsing attributes: [#{attrs.join ', '}], excess = \"#$'\"", @source) if $' and $'.strip.size > 0
        # ATTRIBUTE_PATTERN captures, in order: the full attribute name,
        # its prefix, its local part, the quote character, and the value.
        attrs.each { |attr_name, prefix, local_part, _quote, value|
          if prefix == "xmlns"
            if local_part == "xml"
              if value != "http://www.w3.org/XML/1998/namespace"
                msg = "The 'xml' prefix must not be bound to any other namespace "+
                "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
                raise REXML::ParseException.new( msg, @source, self )
              end
            elsif local_part == "xmlns"
              msg = "The 'xmlns' prefix must not be declared "+
              "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
              raise REXML::ParseException.new( msg, @source, self)
            end
            curr_ns << local_part
          elsif prefix
            prefixes << prefix unless prefix == "xml"
          end
          attributes[attr_name] = value
        }
      end

      # Verify that all of the prefixes have been defined
      prefixes.each do |prefix|
        unless @nsstack.find{|k| k.member?(prefix)}
          raise REXML::UndefinedNamespaceException.new(prefix,@source,self)
        end
      end

      if md[6]
        # Self-closing tag: the matching :end_element is emitted by the
        # next call to pull.
        @closed = md[1]
        @nsstack.shift
      else
        @tags.push( md[1] )
      end
      return [ :start_element, md[1], attributes ]
    end
  else
    md = @source.match( TEXT_PATTERN, true )
    if md[0].length == 0
      @source.match( /(\s+)/, true )
    end
    return [ :text, md[1] ]
  end
end
private :pull_content_event
|
422
|
+
|
423
|
+
# Looks up the replacement text for the entity named +reference+.
#
# +entities+ (optional, indexed by name) is consulted first, then
# DEFAULT_ENTITIES.  The value found is passed through +unnormalize+ before
# it is returned.  Returns nil when the name is unknown.
def entity( reference, entities )
  value = entities[ reference ] if entities
  unless value
    default = DEFAULT_ENTITIES[ reference ]
    value = default[2] if default
  end
  unnormalize( value, entities ) if value
end
|
432
|
+
|
433
|
+
# Escapes all possible entities and returns the result as a new string;
# +input+ itself is not modified.
#
# input::         the text to escape
# entities::      optional name => value pairs; each occurrence of a value
#                 is replaced by "&name;"
# entity_filter:: optional collection of names from +entities+ that must
#                 not be substituted
def normalize( input, entities=nil, entity_filter=nil )
  # dup (not clone) so that a frozen input still yields a mutable copy
  copy = input.dup
  # Doing it like this rather than in a loop improves the speed
  copy.gsub!( EREFERENCE, '&amp;' )
  entities.each do |key, value|
    next if entity_filter and entity_filter.include?(key)
    copy.gsub!( value, "&#{key};" )
  end if entities
  copy.gsub!( EREFERENCE, '&amp;' )
  DEFAULT_ENTITIES.each_value do |value|
    copy.gsub!( value[3], value[1] )
  end
  copy
end
|
448
|
+
|
449
|
+
# Unescapes all possible entities and returns the result as a new string;
# +string+ itself is not modified.  Line endings ("\r\n" and "\r") are
# converted to "\n" first.
#
# string::   the text to unescape
# entities:: optional name => value lookup, passed on to +entity+
# filter::   optional collection of entity names to leave untouched
def unnormalize( string, entities=nil, filter=nil )
  # dup (not clone) so that a frozen input still yields a mutable copy
  rv = string.dup
  rv.gsub!( /\r\n?/, "\n" )
  matches = rv.scan( REFERENCE_RE )
  return rv if matches.size == 0
  # Numeric character references: decimal (&#65;) and hexadecimal (&#x41;)
  rv.gsub!( /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) {
    code = $1
    code = "0#{code}" if code[0] == ?x   # Integer() needs the "0x" prefix
    [Integer(code)].pack('U*')
  }
  # Keep only the names of named references; numeric ones captured nil.
  matches.collect!{|x|x[0]}.compact!
  if matches.size > 0
    matches.each do |entity_reference|
      unless filter and filter.include?(entity_reference)
        entity_value = entity( entity_reference, entities )
        if entity_value
          # String pattern plus block form: neither the entity name nor
          # its value is interpreted as regexp or backreference syntax.
          rv.gsub!( "&#{entity_reference};" ) { entity_value }
        end
      end
    end
    matches.each do |entity_reference|
      unless filter and filter.include?(entity_reference)
        er = DEFAULT_ENTITIES[entity_reference]
        rv.gsub!( er[0], er[2] ) if er
      end
    end
    rv.gsub!( /&amp;/, '&' )
  end
  rv
end
|
481
|
+
end
|
482
|
+
end
|
483
|
+
end
|
484
|
+
|
485
|
+
=begin
|
486
|
+
case event[0]
|
487
|
+
when :start_element
|
488
|
+
when :text
|
489
|
+
when :end_element
|
490
|
+
when :processing_instruction
|
491
|
+
when :cdata
|
492
|
+
when :comment
|
493
|
+
when :xmldecl
|
494
|
+
when :start_doctype
|
495
|
+
when :end_doctype
|
496
|
+
when :externalentity
|
497
|
+
when :elementdecl
|
498
|
+
when :entity
|
499
|
+
when :attlistdecl
|
500
|
+
when :notationdecl
|
501
|
+
when :end_doctype
|
502
|
+
end
|
503
|
+
=end
|