syntax_tree-xml 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,384 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SyntaxTree
4
+ module XML
5
+ class Parser
6
# Regular expression source (a bracket expression) matching one character
# that may begin an XML name. The ranges follow the NameStartChar production
# of the XML 1.0 specification, including the Latin, Greek/Cyrillic and
# supplementary-plane ranges.
NAME_START =
  "[:a-zA-Z_\u{C0}-\u{D6}\u{D8}-\u{F6}\u{F8}-\u{2FF}\u{370}-\u{37D}" \
    "\u{37F}-\u{1FFF}\u{200C}-\u{200D}\u{2070}-\u{218F}\u{2C00}-\u{2FEF}" \
    "\u{3001}-\u{D7FF}\u{F900}-\u{FDCF}\u{FDF0}-\u{FFFD}" \
    "\u{10000}-\u{EFFFF}]".freeze

# Regular expression source matching one character that may follow the first
# character of a name. NAME_START is nested inside this bracket expression,
# so the set is every NAME_START character plus "-", ".", digits and the
# additional code points listed here.
NAME_CHAR =
  "[#{NAME_START}-\\.\\d\u{00B7}\u{0300}-\u{036F}\u{203F}-\u{2040}]".freeze

# Regular expression source for a complete name: one start character followed
# by any number of name characters. It is interpolated into the tokenizer's
# patterns, so it is kept as a String rather than a Regexp.
NAME = "#{NAME_START}(?:#{NAME_CHAR})*".freeze
13
+
14
# Base class for every error raised by the parser.
class ParseError < StandardError; end

# Raised when a particular token is expected at the current position but
# something else is found. Optional grammar elements rescue this error
# internally; when nothing rescues it, it ends the parsing process.
class MissingTokenError < ParseError; end
25
+
26
# source - the XML text being parsed
# tokens - the Enumerator of tokens produced by make_tokens
attr_reader :source, :tokens

# Remembers the source text and sets up the token enumerator for it. The
# enumerator only runs its block as tokens are requested from it.
def initialize(source)
  @source = source
  @tokens = make_tokens
end

# Parses the whole source and returns the resulting document node.
def parse
  parse_document
end
36
+
37
+ private
38
+
39
# Builds the token stream for the source.
#
# Returns an Enumerator whose elements are [type, value, index, line]: the
# token type, the matched text, the character offset where the match starts
# and the line on which it starts. The last element is always an :EOF token
# whose value is nil.
#
# The tokenizer keeps a stack of states. In :outside it recognizes markup
# openers and character content. After "<", "</", "<?xml" and "<!DOCTYPE" it
# pushes :inside, where it recognizes names, strings and punctuation, skips
# whitespace without emitting a token, and pops the state again when the tag
# is closed.
#
# Raises ParseError, while the enumerator is being consumed, when nothing
# matches at the current offset.
def make_tokens
  Enumerator.new do |enum|
    index = 0
    line = 1
    state = %i[outside]

    while index < source.length
      case state.last
      in :outside
        case source[index..]
        when /\A(?: |\t|\n|\r\n)+/m
          # whitespace
          enum.yield :whitespace, $&, index, line
        when /\A<!--(.|\r?\n)*?-->/m
          # comments
          # <!-- this is a comment -->
          enum.yield :comment, $&, index, line
        when /\A<!\[CDATA\[(.|\r?\n)*?\]\]>/m
          # character data tags
          # <![CDATA[<message>Welcome!</message>]]>
          enum.yield :cdata, $&, index, line
        when /\A<!DOCTYPE/
          # document type tags
          # <!DOCTYPE
          enum.yield :doctype, $&, index, line
          state << :inside
        when /\A<!.+?>/
          # document type definition tags
          # <!ENTITY nbsp "&#xA0;">
          enum.yield :dtd, $&, index, line
        when /\A<\?xml[ \t\r\n]/
          # xml declaration opening
          # <?xml
          enum.yield :xml_decl, $&, index, line
          state << :inside
        when %r{\A</}
          # the beginning of a closing tag
          # </
          enum.yield :slash_open, $&, index, line
          state << :inside
        when /\A<\?#{NAME}.+?\?>/
          # a processing instruction
          # <?xml-stylesheet type="text/xsl" href="style.xsl" ?>
          enum.yield :processing_instruction, $&, index, line
        when /\A</
          # the beginning of an opening tag
          # <
          enum.yield :open, $&, index, line
          state << :inside
        when /\A&#{NAME};/
          # entity reference
          # &amp;
          enum.yield :entity_reference, $&, index, line
        when /\A&#(?:\d+|x[a-fA-F0-9]+);/
          # character reference
          # &#1234;
          enum.yield :character_reference, $&, index, line
        when /\A[^<&]+/
          # plain text content (may span several lines)
          # abc
          enum.yield :text, $&, index, line
        else
          raise ParseError,
                "Unexpected character at #{index}: #{source[index]}"
        end
      in :inside
        case source[index..]
        when /\A[ \t\r\n]+/
          # whitespace inside a tag is skipped, no token is emitted
        when /\A>/
          # the end of a tag
          # >
          enum.yield :close, $&, index, line
          state.pop
        when /\A\?>/
          # the end of a tag
          # ?>
          enum.yield :special_close, $&, index, line
          state.pop
        when %r{\A/>}
          # the end of a self-closing tag
          enum.yield :slash_close, $&, index, line
          state.pop
        when %r{\A/}
          # a forward slash
          # /
          enum.yield :slash, $&, index, line
        when /\A=/
          # an equals sign
          # =
          enum.yield :equals, $&, index, line
        when /\A(?:"[^<"]*"|'[^<']*')/
          # a quoted string; either quote style may contain anything except
          # "<" and its own delimiter, including newlines
          # "abc" or 'abc'
          enum.yield :string, $&, index, line
        when /\A#{NAME}/
          # a name
          # abc
          enum.yield :name, $&, index, line
        else
          raise ParseError,
                "Unexpected character at #{index}: #{source[index]}"
        end
      end

      # Every branch that reaches this point left its match in $&. Tokens are
      # yielded with the line they start on, so the line counter is advanced
      # only here, once, for all token types alike. That way multi-line text
      # and string tokens no longer leave the counter behind.
      index += $&.length
      line += $&.count("\n")
    end

    enum.yield :EOF, nil, index, line
  end
end
155
+
156
# Looks at the next token without taking it. When its type is not the
# expected one, raises MissingTokenError and leaves the enumerator where it
# was. Otherwise advances the enumerator and returns a Token carrying the
# type, the matched text and a Location built from the token's start offset
# and start line.
def consume(expected)
  type, value, index, line = tokens.peek
  raise MissingTokenError, "expected #{expected} got #{type}" if expected != type

  tokens.next

  # The end of the token is derived from its text: its length gives the end
  # offset and the newlines it contains give the end line.
  location =
    Location.new(
      start_char: index,
      end_char: index + value.length,
      start_line: line,
      end_line: line + value.count("\n")
    )

  Token.new(type: type, value: value, location: location)
end
181
+
182
# Runs the block, which is expected to try to consume some tokens, and
# returns its value. When the block raises MissingTokenError the error is
# swallowed and nil is returned instead. Any other error propagates.
def maybe
  begin
    yield
  rescue MissingTokenError
    nil
  end
end
189
+
190
# Runs the block and insists on a result: a nil value from the block is
# turned into a MissingTokenError, any other value is returned unchanged.
def atleast
  outcome = yield
  return outcome unless outcome.nil?

  raise MissingTokenError
end
198
+
199
# Calls the block over and over, collecting each value it returns, until the
# block raises MissingTokenError. Returns the values collected up to that
# point (an empty array when the very first call fails).
def many
  collected = []

  begin
    loop { collected << yield }
  rescue MissingTokenError
    # The block could not match again: this is the normal way out.
  end

  collected
end
214
+
215
# Parses a whole document: an optional prolog, an optional doctype and the
# required root element, with any number of misc nodes (comments, processing
# instructions, whitespace) before, between and after them.
#
# Returns a Document whose miscs are listed in source order. Raises
# MissingTokenError when no root element can be parsed.
def parse_document
  prolog = maybe { parse_prolog }
  leading = many { parse_misc }

  doctype = maybe { parse_doctype }
  interior = many { parse_misc }

  element = parse_element
  trailing = many { parse_misc }

  # Keep the parts in source order so that the first and the last entry
  # really are the outermost nodes. Misc nodes parsed after the root element
  # must come after it here, otherwise a document without a prolog but with
  # trailing whitespace would get a span that starts after it ends.
  parts =
    [prolog, *leading, doctype, *interior, element, *trailing].compact

  Document.new(
    prolog: prolog,
    miscs: leading + interior + trailing,
    doctype: doctype,
    element: element,
    location: parts.first.location.to(parts.last.location)
  )
end
235
+
236
# Parses the XML declaration: the :xml_decl opener, any number of
# attributes, and the :special_close terminator. Raises MissingTokenError
# when the opener or the terminator is not the next token.
def parse_prolog
  start_token = consume(:xml_decl)
  attrs = many { parse_attribute }
  end_token = consume(:special_close)
  span = start_token.location.to(end_token.location)

  Prolog.new(
    opening: start_token,
    attributes: attrs,
    closing: end_token,
    location: span
  )
end
248
+
249
# Parses a document type declaration: the :doctype opener, the document
# name, an optional external id and the closing :close token.
def parse_doctype
  start_token = consume(:doctype)
  doc_name = consume(:name)
  ext_id = maybe { parse_external_id }
  end_token = consume(:close)
  span = start_token.location.to(end_token.location)

  DocType.new(
    opening: start_token,
    name: doc_name,
    external_id: ext_id,
    closing: end_token,
    location: span
  )
end
263
+
264
# Parses an external identifier: a name token followed by one string, or by
# two strings when the name is exactly "PUBLIC" (public id first, then
# system id). The public id is nil for any other name.
def parse_external_id
  kind = consume(:name)
  public_id = kind.value == "PUBLIC" ? consume(:string) : nil
  system_id = consume(:string)
  span = kind.location.to(system_id.location)

  ExternalID.new(
    type: kind,
    public_id: public_id,
    system_id: system_id,
    location: span
  )
end
276
+
277
# Parses the children of an element and returns them as an array. Each
# child is the first alternative that matches, tried in this order: element,
# character data, reference, CDATA section, processing instruction, comment.
# Collection stops as soon as none of them matches.
def parse_content
  many do
    child =
      maybe { parse_element } || maybe { parse_chardata } ||
        maybe { parse_reference } || maybe { consume(:cdata) } ||
        maybe { consume(:processing_instruction) } ||
        maybe { consume(:comment) }

    # Nothing matched: signal the enclosing many that the content is over.
    raise MissingTokenError if child.nil?

    child
  end
end
287
+
288
# Parses an opening tag: "<", the tag name, any number of attributes and
# then either ">" (:close) or "/>" (:slash_close). Raises MissingTokenError
# when a required token is absent.
def parse_opening_tag
  start_token = consume(:open)
  tag_name = consume(:name)
  attrs = many { parse_attribute }
  end_token =
    atleast { maybe { consume(:close) } || maybe { consume(:slash_close) } }
  span = start_token.location.to(end_token.location)

  Element::OpeningTag.new(
    opening: start_token,
    name: tag_name,
    attributes: attrs,
    closing: end_token,
    location: span
  )
end
305
+
306
# Parses a closing tag: "</", the tag name and ">".
def parse_closing_tag
  start_token = consume(:slash_open)
  tag_name = consume(:name)
  end_token = consume(:close)
  span = start_token.location.to(end_token.location)

  Element::ClosingTag.new(
    opening: start_token,
    name: tag_name,
    closing: end_token,
    location: span
  )
end
318
+
319
# Parses an element. When the opening tag ends with ">" the element has
# content and a closing tag, which are parsed too. For any other ending (the
# self-closing "/>") content and closing tag are nil and the element spans
# just its opening tag.
def parse_element
  opening_tag = parse_opening_tag

  unless opening_tag.closing.value == ">"
    return(
      Element.new(
        opening_tag: opening_tag,
        content: nil,
        closing_tag: nil,
        location: opening_tag.location
      )
    )
  end

  children = parse_content
  closing_tag = parse_closing_tag

  Element.new(
    opening_tag: opening_tag,
    content: children,
    closing_tag: closing_tag,
    location: opening_tag.location.to(closing_tag.location)
  )
end
341
+
342
# Parses a reference, which is either an entity reference token or a
# character reference token, and wraps it in a Reference node that shares
# the token's location.
def parse_reference
  token =
    atleast do
      maybe { consume(:entity_reference) } || maybe { consume(:character_reference) }
    end

  Reference.new(value: token, location: token.location)
end
351
+
352
# Parses one attribute: a name, an equals sign and a quoted string, in that
# order. The attribute spans from the name to the string.
def parse_attribute
  name_token = consume(:name)
  equals_token = consume(:equals)
  string_token = consume(:string)
  span = name_token.location.to(string_token.location)

  Attribute.new(
    key: name_token,
    equals: equals_token,
    value: string_token,
    location: span
  )
end
364
+
365
# Parses character data, which is either a text token or a whitespace
# token, and wraps it in a CharData node that shares the token's location.
def parse_chardata
  token =
    atleast do
      maybe { consume(:text) } || maybe { consume(:whitespace) }
    end

  CharData.new(value: token, location: token.location)
end
371
+
372
# Parses one misc item, tried in this order: comment, processing
# instruction, whitespace. The token is wrapped in a Misc node that shares
# its location. Raises MissingTokenError when none of them is next.
def parse_misc
  token =
    atleast do
      maybe { consume(:comment) } || maybe { consume(:processing_instruction) } ||
        maybe { consume(:whitespace) }
    end

  Misc.new(value: token, location: token.location)
end
382
+ end
383
+ end
384
+ end
@@ -0,0 +1,88 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SyntaxTree
4
+ module XML
5
# A visitor that prints a tree as nested, parenthesized groups: each node
# becomes "(type child child ...)" and each token is printed through q.pp.
class PrettyPrint < Visitor
  # The printer object that receives all output.
  attr_reader :q

  def initialize(q)
    @q = q
  end

  # Visit a Token node: tokens are printed as their raw value.
  def visit_token(node)
    q.pp(node.value)
  end

  # Every other node kind is printed the same way and differs only in the
  # label that opens its group, so the visit methods are generated from the
  # list of labels: visit_document, visit_prolog, visit_doctype,
  # visit_external_id, visit_element, visit_opening_tag, visit_closing_tag,
  # visit_reference, visit_attribute, visit_char_data and visit_misc.
  %w[
    document
    prolog
    doctype
    external_id
    element
    opening_tag
    closing_tag
    reference
    attribute
    char_data
    misc
  ].each do |label|
    define_method(:"visit_#{label}") { |node| visit_node(label, node) }
  end

  private

  # Prints one node as a group: the opening parenthesis and type, then the
  # child nodes nested by two columns and separated by the printer's default
  # separator, then the closing parenthesis.
  def visit_node(type, node)
    q.group do
      q.text("(#{type}")

      q.nest(2) do
        q.breakable
        q.seplist(node.child_nodes) { |child| visit(child) }
      end

      q.breakable("")
      q.text(")")
    end
  end
end
87
+ end
88
+ end
@@ -0,0 +1,7 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SyntaxTree
4
+ module XML
5
# The version of the syntax_tree-xml gem; the gemspec reads it for spec.version.
+ VERSION = "0.1.0"
6
+ end
7
+ end
@@ -0,0 +1,61 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SyntaxTree
4
+ module XML
5
# Provides a visitor interface for visiting certain nodes. It's used
# internally to implement formatting and pretty-printing. It could also be
# used externally to visit a subset of nodes that are relevant to a certain
# task: subclass it and override only the visit_* methods of interest, every
# other node kind falls back to visiting its child nodes.
class Visitor
  # Visits a node by handing this visitor to node.accept. Returns whatever
  # accept returns, or nil when node is nil.
  def visit(node)
    node&.accept(self)
  end

  private

  # Visits every node in the list and returns the array of results.
  def visit_all(nodes)
    nodes.map { |node| visit(node) }
  end

  # The default behavior of every visit_* method: visit the node's children.
  def visit_child_nodes(node)
    visit_all(node.child_nodes)
  end

  # Visit a Token node.
  alias visit_token visit_child_nodes

  # Visit a Document node.
  alias visit_document visit_child_nodes

  # Visit a Prolog node.
  alias visit_prolog visit_child_nodes

  # Visit a Doctype node.
  alias visit_doctype visit_child_nodes

  # Visit an ExternalID node.
  alias visit_external_id visit_child_nodes

  # Visit an Element node.
  alias visit_element visit_child_nodes

  # Visit an Element::OpeningTag node.
  alias visit_opening_tag visit_child_nodes

  # Visit an Element::ClosingTag node.
  alias visit_closing_tag visit_child_nodes

  # Visit a Reference node.
  alias visit_reference visit_child_nodes

  # Visit an Attribute node.
  alias visit_attribute visit_child_nodes

  # Visit a CharData node.
  alias visit_char_data visit_child_nodes

  # Visit a Misc node.
  alias visit_misc visit_child_nodes

  # An alias copies the visibility of the method it aliases, so the visit_*
  # methods above start out private. They are the entry points a node's
  # accept method calls on the visitor, so they are made public here while
  # the two helpers stay private.
  # NOTE(review): assumes accept dispatches with an explicit receiver, e.g.
  # visitor.visit_element(self) -- confirm against the node classes.
  public :visit_token,
         :visit_document,
         :visit_prolog,
         :visit_doctype,
         :visit_external_id,
         :visit_element,
         :visit_opening_tag,
         :visit_closing_tag,
         :visit_reference,
         :visit_attribute,
         :visit_char_data,
         :visit_misc
end
60
+ end
61
+ end
@@ -0,0 +1,29 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "prettier_print"
4
+ require "syntax_tree"
5
+
6
+ require_relative "xml/nodes"
7
+ require_relative "xml/parser"
8
+ require_relative "xml/visitor"
9
+
10
+ require_relative "xml/format"
11
+ require_relative "xml/pretty_print"
12
+
13
module SyntaxTree
  # The entry points for XML sources: format, parse and read.
  module XML
    class << self
      # Parses the source and formats the resulting tree into a new string
      # using PrettierPrint with the given maximum line width.
      def format(source, maxwidth = 80)
        PrettierPrint.format(+"", maxwidth) do |q|
          parse(source).format(q)
        end
      end

      # Parses the source with a fresh Parser and returns the result.
      def parse(source)
        Parser.new(source).parse
      end

      # Returns the contents of the file at the given path.
      def read(filepath)
        File.read(filepath)
      end
    end
  end

  # Registers the XML module as the handler for the ".xml" extension.
  register_handler(".xml", XML)
end
@@ -0,0 +1,33 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "lib/syntax_tree/xml/version"
4
+
5
# Gem specification for syntax_tree-xml.
Gem::Specification.new do |spec|
  spec.name = "syntax_tree-xml"
  spec.version = SyntaxTree::XML::VERSION
  spec.authors = ["Kevin Newton"]
  spec.email = ["kddnewton@gmail.com"]

  spec.summary = "Syntax Tree support for XML"
  spec.homepage = "https://github.com/ruby-syntax-tree/syntax_tree-xml"
  spec.license = "MIT"
  spec.metadata = { "rubygems_mfa_required" => "true" }

  # Package every file tracked by git (listed from this file's directory)
  # except those under test/, spec/ or features/.
  tracked = Dir.chdir(__dir__) { `git ls-files -z`.split("\x0") }
  spec.files = tracked.reject { |path| path.match(%r{^(test|spec|features)/}) }

  # Executables are the base names of the packaged files under exe/.
  spec.bindir = "exe"
  spec.executables = spec.files.grep(%r{^exe/}).map { |path| File.basename(path) }
  spec.require_paths = %w[lib]

  spec.add_dependency "prettier_print"
  spec.add_dependency "syntax_tree", ">= 2.0.1"

  spec.add_development_dependency "bundler"
  spec.add_development_dependency "minitest"
  spec.add_development_dependency "rake"
  spec.add_development_dependency "simplecov"
end