syntax_tree-xml 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,384 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SyntaxTree
4
+ module XML
5
+ class Parser
6
+ NAME_START = # regex character class source for the first character of a name
7
+ "[:a-zA-Z_\u{2070}-\u{218F}\u{2C00}-\u{2FEF}\u{3001}-\u{D7FF}\u{F900}-\u{FDCF}\u{FDF0}-\u{FFFD}]"
8
+
9
+ NAME_CHAR = # character class source for later characters: NAME_START nested inside, plus "-", ".", digits, and a few extra ranges
10
+ "[#{NAME_START}-\\.\\d\u{00B7}\u{0300}-\u{036F}\u{203F}-\u{2040}]"
11
+
12
+ NAME = "#{NAME_START}(?:#{NAME_CHAR})*" # regex source for a whole name; interpolated into the tokenizer patterns
13
+
14
+ # This is the parent class of every error that the parser raises on
15
+ # purpose, including the ones raised while tokenizing.
16
+ class ParseError < StandardError
17
+ end
18
+
19
+ # This error is raised when a token of a certain type is expected next but
20
+ # a different one is found. The maybe and many helpers rescue it, which is
21
+ # how optional and repeated elements are handled. Anywhere else it is not
22
+ # rescued and ends the parsing process.
23
+ class MissingTokenError < ParseError
24
+ end
25
+
26
+ attr_reader :source, :tokens
27
+
28
+ def initialize(source) # source: the text to parse; it is indexed and measured like a String
29
+ @source = source
30
+ @tokens = make_tokens # an Enumerator, so tokens are produced on demand as they are peeked at or taken
31
+ end
32
+
33
+ def parse # public entry point: parses the whole source and returns what parse_document builds
34
+ parse_document
35
+ end
36
+
37
+ private
38
+
39
+ def make_tokens
40
+ Enumerator.new do |enum|
41
+ index = 0
42
+ line = 1
43
+ state = %i[outside]
44
+
45
+ while index < source.length
46
+ case state.last
47
+ in :outside
48
+ case source[index..]
49
+ when /\A(?: |\t|\n|\r\n)+/m
50
+ # whitespace
51
+ enum.yield :whitespace, $&, index, line
52
+ line += $&.count("\n")
53
+ when /\A<!--(.|\r?\n)*?-->/m
54
+ # comments
55
+ # <!-- this is a comment -->
56
+ enum.yield :comment, $&, index, line
57
+ line += $&.count("\n")
58
+ when /\A<!\[CDATA\[(.|\r?\n)*?\]\]>/m
59
+ # character data tags
60
+ # <![CDATA[<message>Welcome!</message>]]>
61
+ enum.yield :cdata, $&, index, line
62
+ line += $&.count("\n")
63
+ when /\A<!DOCTYPE/
64
+ # document type tags
65
+ # <!DOCTYPE
66
+ enum.yield :doctype, $&, index, line
67
+ state << :inside
68
+ when /\A<!.+?>/
69
+ # document type definition tags
70
+ # <!ENTITY nbsp "&#xA0;">
71
+ enum.yield :dtd, $&, index, line
72
+ when /\A<\?xml[ \t\r\n]/
73
+ # xml declaration opening
74
+ # <?xml
75
+ enum.yield :xml_decl, $&, index, line
76
+ state << :inside
77
+ line += $&.count("\n")
78
+ when %r{\A</}
79
+ # the beginning of a closing tag
80
+ # </
81
+ enum.yield :slash_open, $&, index, line
82
+ state << :inside
83
+ when /\A<\?#{NAME}.+?\?>/
84
+ # a processing instruction
85
+ # <?xml-stylesheet type="text/xsl" href="style.xsl" ?>
86
+ enum.yield :processing_instruction, $&, index, line
87
+ when /\A</
88
+ # the beginning of an opening tag
89
+ # <
90
+ enum.yield :open, $&, index, line
91
+ state << :inside
92
+ when /\A&#{NAME};/
93
+ # entity reference
94
+ # &amp;
95
+ enum.yield :entity_reference, $&, index, line
96
+ when /\A&#(?:\d+|x[a-fA-F0-9]+);/
97
+ # character reference
98
+ # &#1234;
99
+ enum.yield :character_reference, $&, index, line
100
+ when /\A[^<&]+/
101
+ # plain text content
102
+ # abc
103
+ enum.yield :text, $&, index, line
104
+ else
105
+ raise ParseError,
106
+ "Unexpected character at #{index}: #{source[index]}"
107
+ end
108
+ in :inside
109
+ case source[index..]
110
+ when /\A[ \t\r\n]+/
111
+ # whitespace
112
+ line += $&.count("\n")
113
+ when /\A>/
114
+ # the end of a tag
115
+ # >
116
+ enum.yield :close, $&, index, line
117
+ state.pop
118
+ when /\A\?>/
119
+ # the end of a tag
120
+ # ?>
121
+ enum.yield :special_close, $&, index, line
122
+ state.pop
123
+ when %r{\A/>}
124
+ # the end of a self-closing tag
125
+ enum.yield :slash_close, $&, index, line
126
+ state.pop
127
+ when %r{\A/}
128
+ # a forward slash
129
+ # /
130
+ enum.yield :slash, $&, index, line
131
+ when /\A=/
132
+ # an equals sign
133
+ # =
134
+ enum.yield :equals, $&, index, line
135
+ when /\A(?:"[^<"]*"|'[<^']*')/
136
+ # a quoted string
137
+ # "abc"
138
+ enum.yield :string, $&, index, line
139
+ when /\A#{NAME}/
140
+ # a name
141
+ # abc
142
+ enum.yield :name, $&, index, line
143
+ else
144
+ raise ParseError,
145
+ "Unexpected character at #{index}: #{source[index]}"
146
+ end
147
+ end
148
+
149
+ index += $&.length
150
+ end
151
+
152
+ enum.yield :EOF, nil, index, line
153
+ end
154
+ end
155
+
156
+ # If the next token in the list of tokens matches the expected type, then
157
+ # we're going to create a new Token, advance the token enumerator, and
158
+ # return the new Token. Otherwise we're going to raise a
159
+ # MissingTokenError.
160
+ def consume(expected)
161
+ type, value, index, line = tokens.peek
162
+
163
+ if expected != type
164
+ raise MissingTokenError, "expected #{expected} got #{type}"
165
+ end
166
+
167
+ tokens.next
168
+
169
+ Token.new(
170
+ type: type,
171
+ value: value,
172
+ location:
173
+ Location.new(
174
+ start_char: index,
175
+ end_char: index + value.length,
176
+ start_line: line,
177
+ end_line: line + value.count("\n")
178
+ )
179
+ )
180
+ end
181
+
182
+ # Runs the block, which should attempt to consume some number of tokens,
183
+ # and returns its result. If the block raises a MissingTokenError, the
184
+ # error is swallowed and nil is returned instead.
185
+ def maybe
186
+ yield
187
+ rescue MissingTokenError # tokens that the block consumed before failing are not put back
188
+ end
189
+
190
+ # Runs the block and returns its result, unless that result is nil. A nil
191
+ # result means that none of the alternatives in the block matched, which is
192
+ # reported by raising a MissingTokenError like any other missing token.
193
+ def atleast
194
+ result = yield
195
+ raise MissingTokenError if result.nil? # only nil counts as "nothing matched"
196
+ result
197
+ end
198
+
199
+ # We're going to attempt to parse with the block many times. We'll stop
200
+ # parsing once we get an error back from the block.
201
+ def many
202
+ items = []
203
+
204
+ loop do
205
+ begin
206
+ items << yield
207
+ rescue MissingTokenError
208
+ break
209
+ end
210
+ end
211
+
212
+ items
213
+ end
214
+
215
+ def parse_document
216
+ prolog = maybe { parse_prolog }
217
+ miscs = many { parse_misc }
218
+
219
+ doctype = maybe { parse_doctype }
220
+ miscs += many { parse_misc }
221
+
222
+ element = parse_element
223
+ miscs += many { parse_misc }
224
+
225
+ parts = [prolog, *miscs, doctype, element].compact
226
+
227
+ Document.new(
228
+ prolog: prolog,
229
+ miscs: miscs,
230
+ doctype: doctype,
231
+ element: element,
232
+ location: parts.first.location.to(parts.last.location)
233
+ )
234
+ end
235
+
236
+ def parse_prolog # parses an xml declaration: the xml_decl opener, its attributes, and the ?> closer
237
+ opening = consume(:xml_decl)
238
+ attributes = many { parse_attribute } # zero or more key="value" pairs
239
+ closing = consume(:special_close)
240
+
241
+ Prolog.new(
242
+ opening: opening,
243
+ attributes: attributes,
244
+ closing: closing,
245
+ location: opening.location.to(closing.location)
246
+ )
247
+ end
248
+
249
+ def parse_doctype # parses a doctype: the <!DOCTYPE opener, a name, an optional external id, and the > closer
250
+ opening = consume(:doctype)
251
+ name = consume(:name)
252
+ external_id = maybe { parse_external_id } # nil when no external id follows the name
253
+ closing = consume(:close)
254
+
255
+ DocType.new(
256
+ opening: opening,
257
+ name: name,
258
+ external_id: external_id,
259
+ closing: closing,
260
+ location: opening.location.to(closing.location)
261
+ )
262
+ end
263
+
264
+ def parse_external_id # parses the external id of a doctype: a type name followed by one or two quoted strings
265
+ type = consume(:name) # any name is accepted here; only "PUBLIC" changes what follows
266
+ public_id = consume(:string) if type.value == "PUBLIC" # stays nil for every other type
267
+ system_id = consume(:string) # always required, and the last token of the external id
268
+
269
+ ExternalID.new(
270
+ type: type,
271
+ public_id: public_id,
272
+ system_id: system_id,
273
+ location: type.location.to(system_id.location)
274
+ )
275
+ end
276
+
277
+ def parse_content # parses the content between an opening and a closing tag; returns an array that may be empty
278
+ many do # keep going until no alternative matches
279
+ atleast do # turns "every alternative returned nil" into the error that stops many
280
+ maybe { parse_element } || maybe { parse_chardata } ||
281
+ maybe { parse_reference } || maybe { consume(:cdata) } ||
282
+ maybe { consume(:processing_instruction) } ||
283
+ maybe { consume(:comment) }
284
+ end
285
+ end
286
+ end
287
+
288
+ def parse_opening_tag # parses an opening tag: <, a name, any attributes, and either > or />
289
+ opening = consume(:open)
290
+ name = consume(:name)
291
+ attributes = many { parse_attribute }
292
+ closing = # a :close token for a regular tag, a :slash_close token for a self-closing one
293
+ atleast do
294
+ maybe { consume(:close) } || maybe { consume(:slash_close) }
295
+ end
296
+
297
+ Element::OpeningTag.new(
298
+ opening: opening,
299
+ name: name,
300
+ attributes: attributes,
301
+ closing: closing,
302
+ location: opening.location.to(closing.location)
303
+ )
304
+ end
305
+
306
+ def parse_closing_tag # parses a closing tag: </, a name, and >
307
+ opening = consume(:slash_open)
308
+ name = consume(:name) # not compared against the name of the opening tag here
309
+ closing = consume(:close)
310
+
311
+ Element::ClosingTag.new(
312
+ opening: opening,
313
+ name: name,
314
+ closing: closing,
315
+ location: opening.location.to(closing.location)
316
+ )
317
+ end
318
+
319
+ def parse_element
320
+ opening_tag = parse_opening_tag
321
+
322
+ if opening_tag.closing.value == ">"
323
+ content = parse_content
324
+ closing_tag = parse_closing_tag
325
+
326
+ Element.new(
327
+ opening_tag: opening_tag,
328
+ content: content,
329
+ closing_tag: closing_tag,
330
+ location: opening_tag.location.to(closing_tag.location)
331
+ )
332
+ else
333
+ Element.new(
334
+ opening_tag: opening_tag,
335
+ content: nil,
336
+ closing_tag: nil,
337
+ location: opening_tag.location
338
+ )
339
+ end
340
+ end
341
+
342
+ def parse_reference # parses an entity reference or a character reference into a Reference node
343
+ value = # whichever of the two token types comes next; raises MissingTokenError if neither does
344
+ atleast do
345
+ maybe { consume(:entity_reference) } ||
346
+ maybe { consume(:character_reference) }
347
+ end
348
+
349
+ Reference.new(value: value, location: value.location)
350
+ end
351
+
352
+ def parse_attribute # parses one attribute: a name, an equals sign, and a quoted string
353
+ key = consume(:name)
354
+ equals = consume(:equals)
355
+ value = consume(:string) # the token value still includes the surrounding quotes
356
+
357
+ Attribute.new(
358
+ key: key,
359
+ equals: equals,
360
+ value: value,
361
+ location: key.location.to(value.location)
362
+ )
363
+ end
364
+
365
+ def parse_chardata # parses a single text or whitespace token into a CharData node
366
+ value = # raises MissingTokenError when the next token is neither of the two
367
+ atleast { maybe { consume(:text) } || maybe { consume(:whitespace) } }
368
+
369
+ CharData.new(value: value, location: value.location)
370
+ end
371
+
372
+ def parse_misc # parses a single comment, processing instruction, or whitespace token into a Misc node
373
+ value = # raises MissingTokenError when the next token is none of the three
374
+ atleast do
375
+ maybe { consume(:comment) } ||
376
+ maybe { consume(:processing_instruction) } ||
377
+ maybe { consume(:whitespace) }
378
+ end
379
+
380
+ Misc.new(value: value, location: value.location)
381
+ end
382
+ end
383
+ end
384
+ end
@@ -0,0 +1,88 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SyntaxTree
4
+ module XML
5
+ class PrettyPrint < Visitor # a visitor that prints a tree as nested, parenthesized groups
6
+ attr_reader :q # the printer object that all of the output is written to
7
+
8
+ def initialize(q)
9
+ @q = q
10
+ end
11
+
12
+ # Visit a Token node. Tokens are leaves, so only the value is printed.
13
+ def visit_token(node)
14
+ q.pp(node.value)
15
+ end
16
+
17
+ # Visit a Document node, printed as a (document ...) group.
18
+ def visit_document(node)
19
+ visit_node("document", node)
20
+ end
21
+
22
+ # Visit a Prolog node, printed as a (prolog ...) group.
23
+ def visit_prolog(node)
24
+ visit_node("prolog", node)
25
+ end
26
+
27
+ # Visit a Doctype node, printed as a (doctype ...) group.
28
+ def visit_doctype(node)
29
+ visit_node("doctype", node)
30
+ end
31
+
32
+ # Visit an ExternalID node, printed as an (external_id ...) group.
33
+ def visit_external_id(node)
34
+ visit_node("external_id", node)
35
+ end
36
+
37
+ # Visit an Element node, printed as an (element ...) group.
38
+ def visit_element(node)
39
+ visit_node("element", node)
40
+ end
41
+
42
+ # Visit an Element::OpeningTag node, printed as an (opening_tag ...) group.
43
+ def visit_opening_tag(node)
44
+ visit_node("opening_tag", node)
45
+ end
46
+
47
+ # Visit an Element::ClosingTag node, printed as a (closing_tag ...) group.
48
+ def visit_closing_tag(node)
49
+ visit_node("closing_tag", node)
50
+ end
51
+
52
+ # Visit a Reference node, printed as a (reference ...) group.
53
+ def visit_reference(node)
54
+ visit_node("reference", node)
55
+ end
56
+
57
+ # Visit an Attribute node, printed as an (attribute ...) group.
58
+ def visit_attribute(node)
59
+ visit_node("attribute", node)
60
+ end
61
+
62
+ # Visit a CharData node, printed as a (char_data ...) group.
63
+ def visit_char_data(node)
64
+ visit_node("char_data", node)
65
+ end
66
+
67
+ # Visit a Misc node, printed as a (misc ...) group.
68
+ def visit_misc(node)
69
+ visit_node("misc", node)
70
+ end
71
+
72
+ private
73
+
74
+ # Prints a node as "(type", its child nodes nested by 2, and then ")".
75
+ def visit_node(type, node)
76
+ q.group do
77
+ q.text("(#{type}")
78
+ q.nest(2) do
79
+ q.breakable
80
+ q.seplist(node.child_nodes) { |child_node| visit(child_node) } # each child is printed by visiting it
81
+ end
82
+ q.breakable("") # an empty separator, so nothing is printed before ")" when the group fits on one line
83
+ q.text(")")
84
+ end
85
+ end
86
+ end
87
+ end
88
+ end
@@ -0,0 +1,7 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SyntaxTree
4
+ module XML
5
+ VERSION = "0.1.0" # the version of the gem; the gemspec reads it for spec.version
6
+ end
7
+ end
@@ -0,0 +1,61 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SyntaxTree
4
+ module XML
5
+ # Provides a visitor interface for visiting certain nodes. It's used
6
+ # internally to implement formatting and pretty-printing. It could also be
7
+ # used externally to visit a subset of nodes that are relevant to a certain
8
+ # task.
9
+ class Visitor
10
+ def visit(node)
11
+ node&.accept(self)
12
+ end
13
+
14
+ private
15
+
16
+ def visit_all(nodes)
17
+ nodes.map { |node| visit(node) }
18
+ end
19
+
20
+ def visit_child_nodes(node)
21
+ visit_all(node.child_nodes)
22
+ end
23
+
24
+ # Visit a Token node.
25
+ alias visit_token visit_child_nodes
26
+
27
+ # Visit a Document node.
28
+ alias visit_document visit_child_nodes
29
+
30
+ # Visit a Prolog node.
31
+ alias visit_prolog visit_child_nodes
32
+
33
+ # Visit a Doctype node.
34
+ alias visit_doctype visit_child_nodes
35
+
36
+ # Visit an ExternalID node.
37
+ alias visit_external_id visit_child_nodes
38
+
39
+ # Visit an Element node.
40
+ alias visit_element visit_child_nodes
41
+
42
+ # Visit an Element::OpeningTag node.
43
+ alias visit_opening_tag visit_child_nodes
44
+
45
+ # Visit an Element::ClosingTag node.
46
+ alias visit_closing_tag visit_child_nodes
47
+
48
+ # Visit a Reference node.
49
+ alias visit_reference visit_child_nodes
50
+
51
+ # Visit an Attribute node.
52
+ alias visit_attribute visit_child_nodes
53
+
54
+ # Visit a CharData node.
55
+ alias visit_char_data visit_child_nodes
56
+
57
+ # Visit a Misc node.
58
+ alias visit_misc visit_child_nodes
59
+ end
60
+ end
61
+ end
@@ -0,0 +1,29 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "prettier_print"
4
+ require "syntax_tree"
5
+
6
+ require_relative "xml/nodes"
7
+ require_relative "xml/parser"
8
+ require_relative "xml/visitor"
9
+
10
+ require_relative "xml/format"
11
+ require_relative "xml/pretty_print"
12
+
13
+ module SyntaxTree
14
+ module XML
15
+ def self.format(source, maxwidth = 80) # parses source and returns the formatted text; maxwidth is handed to PrettierPrint
16
+ PrettierPrint.format(+"", maxwidth) { |q| parse(source).format(q) } # +"" is an unfrozen buffer, needed because string literals are frozen in this file
17
+ end
18
+
19
+ def self.parse(source) # returns whatever Parser#parse builds for source
20
+ Parser.new(source).parse
21
+ end
22
+
23
+ def self.read(filepath) # returns the contents of the file at filepath
24
+ File.read(filepath)
25
+ end
26
+ end
27
+
28
+ register_handler(".xml", XML) # NOTE(review): presumably makes XML the handler for .xml files in syntax_tree; confirm against that gem
29
+ end
@@ -0,0 +1,33 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "lib/syntax_tree/xml/version"
4
+
5
+ Gem::Specification.new do |spec|
6
+ spec.name = "syntax_tree-xml"
7
+ spec.version = SyntaxTree::XML::VERSION # defined in the version file required at the top of this gemspec
8
+ spec.authors = ["Kevin Newton"]
9
+ spec.email = ["kddnewton@gmail.com"]
10
+
11
+ spec.summary = "Syntax Tree support for XML"
12
+ spec.homepage = "https://github.com/ruby-syntax-tree/syntax_tree-xml"
13
+ spec.license = "MIT"
14
+ spec.metadata = { "rubygems_mfa_required" => "true" }
15
+
16
+ spec.files = Dir.chdir(__dir__) do # the list is built by git, run from the directory of this gemspec
17
+ `git ls-files -z`.split("\x0").reject do |f| # -z separates the paths with NUL bytes
18
+ f.match(%r{^(test|spec|features)/}) # leave out everything under test/, spec/, and features/
19
+ end
20
+ end
21
+
22
+ spec.bindir = "exe"
23
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) } # every packaged file under exe/, by basename
24
+ spec.require_paths = %w[lib]
25
+
26
+ spec.add_dependency "prettier_print" # no version constraint is given for this one
27
+ spec.add_dependency "syntax_tree", ">= 2.0.1"
28
+
29
+ spec.add_development_dependency "bundler"
30
+ spec.add_development_dependency "minitest"
31
+ spec.add_development_dependency "rake"
32
+ spec.add_development_dependency "simplecov"
33
+ end