prosereflect 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/docs.yml +63 -0
  3. data/.github/workflows/links.yml +97 -0
  4. data/.gitignore +4 -0
  5. data/.rubocop_todo.yml +61 -75
  6. data/README.adoc +2 -0
  7. data/docs/Gemfile +10 -0
  8. data/docs/INDEX.adoc +45 -0
  9. data/docs/_advanced/index.adoc +15 -0
  10. data/docs/_advanced/schema.adoc +112 -0
  11. data/docs/_advanced/step-map.adoc +66 -0
  12. data/docs/_advanced/steps.adoc +88 -0
  13. data/docs/_advanced/test-builder.adoc +61 -0
  14. data/docs/_advanced/transform.adoc +92 -0
  15. data/docs/_config.yml +174 -0
  16. data/docs/_features/html-input.adoc +69 -0
  17. data/docs/_features/html-output.adoc +45 -0
  18. data/docs/_features/index.adoc +15 -0
  19. data/docs/_features/marks.adoc +86 -0
  20. data/docs/_features/node-types.adoc +124 -0
  21. data/docs/_features/user-mentions.adoc +47 -0
  22. data/docs/_guides/custom-nodes.adoc +107 -0
  23. data/docs/_guides/index.adoc +13 -0
  24. data/docs/_guides/round-trip-html.adoc +91 -0
  25. data/docs/_guides/serialization.adoc +109 -0
  26. data/docs/_pages/index.adoc +67 -0
  27. data/docs/_reference/document-api.adoc +49 -0
  28. data/docs/_reference/index.adoc +14 -0
  29. data/docs/_reference/node-api.adoc +79 -0
  30. data/docs/_reference/schema-api.adoc +95 -0
  31. data/docs/_reference/transform-api.adoc +77 -0
  32. data/docs/_understanding/document-model.adoc +65 -0
  33. data/docs/_understanding/fragment.adoc +52 -0
  34. data/docs/_understanding/index.adoc +14 -0
  35. data/docs/_understanding/resolved-position.adoc +53 -0
  36. data/docs/_understanding/slice.adoc +54 -0
  37. data/docs/lychee.toml +63 -0
  38. data/lib/prosereflect/blockquote.rb +9 -0
  39. data/lib/prosereflect/bullet_list.rb +25 -19
  40. data/lib/prosereflect/code_block.rb +1 -5
  41. data/lib/prosereflect/fragment.rb +249 -0
  42. data/lib/prosereflect/horizontal_rule.rb +9 -0
  43. data/lib/prosereflect/image.rb +9 -0
  44. data/lib/prosereflect/input/html.rb +96 -0
  45. data/lib/prosereflect/node.rb +141 -3
  46. data/lib/prosereflect/ordered_list.rb +2 -0
  47. data/lib/prosereflect/output/html.rb +227 -0
  48. data/lib/prosereflect/parser.rb +9 -0
  49. data/lib/prosereflect/resolved_pos.rb +256 -0
  50. data/lib/prosereflect/schema/attribute.rb +57 -0
  51. data/lib/prosereflect/schema/content_match.rb +656 -0
  52. data/lib/prosereflect/schema/fragment.rb +166 -0
  53. data/lib/prosereflect/schema/mark.rb +121 -0
  54. data/lib/prosereflect/schema/mark_type.rb +130 -0
  55. data/lib/prosereflect/schema/node.rb +236 -0
  56. data/lib/prosereflect/schema/node_type.rb +274 -0
  57. data/lib/prosereflect/schema/schema_main.rb +190 -0
  58. data/lib/prosereflect/schema/spec.rb +92 -0
  59. data/lib/prosereflect/schema.rb +39 -0
  60. data/lib/prosereflect/text.rb +24 -0
  61. data/lib/prosereflect/transform/attr_step.rb +157 -0
  62. data/lib/prosereflect/transform/insert_step.rb +115 -0
  63. data/lib/prosereflect/transform/mapping.rb +82 -0
  64. data/lib/prosereflect/transform/mark_step.rb +269 -0
  65. data/lib/prosereflect/transform/replace_around_step.rb +181 -0
  66. data/lib/prosereflect/transform/replace_step.rb +157 -0
  67. data/lib/prosereflect/transform/slice.rb +91 -0
  68. data/lib/prosereflect/transform/step.rb +89 -0
  69. data/lib/prosereflect/transform/step_map.rb +126 -0
  70. data/lib/prosereflect/transform/structure.rb +120 -0
  71. data/lib/prosereflect/transform/transform.rb +341 -0
  72. data/lib/prosereflect/transform.rb +26 -0
  73. data/lib/prosereflect/version.rb +1 -1
  74. data/lib/prosereflect.rb +3 -0
  75. data/spec/fixtures/documents/formatted_text.yaml +14 -0
  76. data/spec/fixtures/documents/heading_paragraph.yaml +16 -0
  77. data/spec/fixtures/documents/lists_doc.yaml +32 -0
  78. data/spec/fixtures/documents/mixed_content.yaml +40 -0
  79. data/spec/fixtures/documents/nested_doc.yaml +20 -0
  80. data/spec/fixtures/documents/simple_doc.yaml +6 -0
  81. data/spec/fixtures/documents/table_doc.yaml +32 -0
  82. data/spec/fixtures/documents/transform_test.yaml +14 -0
  83. data/spec/fixtures/schema/custom_schema.rb +37 -0
  84. data/spec/fixtures/schema/test_schema.rb +46 -0
  85. data/spec/fixtures/test_builder/helpers.rb +212 -0
  86. data/spec/prosereflect/document_spec.rb +1 -1
  87. data/spec/prosereflect/fragment_spec.rb +273 -0
  88. data/spec/prosereflect/input/html_spec.rb +197 -1
  89. data/spec/prosereflect/node_spec.rb +128 -0
  90. data/spec/prosereflect/output/whitespace_spec.rb +248 -0
  91. data/spec/prosereflect/parser/round_trip_spec.rb +472 -0
  92. data/spec/prosereflect/resolved_pos_spec.rb +74 -0
  93. data/spec/prosereflect/schema/conftest.rb +68 -0
  94. data/spec/prosereflect/schema/content_match_spec.rb +237 -0
  95. data/spec/prosereflect/schema/mark_spec.rb +274 -0
  96. data/spec/prosereflect/schema/mark_type_spec.rb +86 -0
  97. data/spec/prosereflect/schema/node_type_spec.rb +142 -0
  98. data/spec/prosereflect/schema/schema_spec.rb +194 -0
  99. data/spec/prosereflect/test_builder/marks_spec.rb +127 -0
  100. data/spec/prosereflect/transform/equivalence_spec.rb +487 -0
  101. data/spec/prosereflect/transform/mapping_spec.rb +226 -0
  102. data/spec/prosereflect/transform/replace_spec.rb +832 -0
  103. data/spec/prosereflect/transform/replace_step_spec.rb +157 -0
  104. data/spec/prosereflect/transform/slice_spec.rb +48 -0
  105. data/spec/prosereflect/transform/step_map_spec.rb +70 -0
  106. data/spec/prosereflect/transform/step_spec.rb +211 -0
  107. data/spec/prosereflect/transform/structure_spec.rb +98 -0
  108. data/spec/prosereflect/transform/transform_spec.rb +238 -0
  109. data/spec/spec_helper.rb +1 -0
  110. metadata +90 -2
@@ -0,0 +1,656 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Prosereflect
4
+ class Schema
# A single transition in the content match graph: a node type paired with
# the match state reached once a node of that type has been consumed.
class MatchEdge
  # Node type labelling this transition.
  attr_reader :type
  # Match state this transition leads to.
  attr_reader :next_match

  # @param type the node type consumed by this edge
  # @param next_match the state reached after consuming +type+
  def initialize(type:, next_match:)
    @type, @next_match = type, next_match
  end
end
14
+
15
+ # Represents a match state for node content expressions
16
+ # Parses expressions like "block+", "inline*", "(paragraph | heading){2,4}"
17
+ class ContentMatch
18
+ attr_reader :valid_end, :next_edges, :wrap_cache
19
+
# Build a match state.
#
# @param valid_end [Boolean] stored as-is and exposed through +valid_end+
# @param next_edges [Array] outgoing edges; defaults to a fresh empty array
def initialize(valid_end:, next_edges: [])
  @wrap_cache = [] # [[target_node_type, computed_wrapping]]
  @next_edges = next_edges
  @valid_end = valid_end
end
25
+
# Shared match state that is a valid end and has no outgoing edges.
# The instance is created on first use and reused afterwards.
def self.empty
  return @empty if @empty

  @empty = new(valid_end: true, next_edges: [])
end
29
+
# Follow the edge labelled with +node_type+.
#
# @return the match state behind the first edge whose type equals
#   +node_type+, or nil when no edge matches
def match_type(node_type)
  @next_edges.each do |edge|
    return edge.next_match if edge.type == node_type
  end
  nil
end
34
+
# Whether this match starts with inline content: true only when there is
# at least one edge and the first edge's type is a NodeType reporting
# +inline?+.
def inline_content?
  return false unless @next_edges.any?

  leading = @next_edges.first.type
  leading.is_a?(NodeType) && leading.inline?
end
39
+
# Default type for this match: the type of the first edge that is neither
# a text type nor a type with required attributes.
#
# @return the matching type, or nil when no edge qualifies
def default_type
  candidate = @next_edges.find do |edge|
    kind = edge.type
    !(kind.text? || kind.has_required_attrs?)
  end
  candidate&.type
end
50
+
# Whether this match and +other+ share at least one edge type.
#
# @param other an object exposing +next_edges+
# @return [Boolean]
def compatible?(other)
  @next_edges.each do |mine|
    other.next_edges.each do |theirs|
      return true if mine.type == theirs.type
    end
  end
  false
end
57
+
# Find the nodes that must be inserted in front of +after+ so that the
# combined content matches from this state.
#
# The search walks edges depth-first, only through types that are not text
# and have no required attributes, and visits each match state at most once.
#
# @param after the fragment that should follow the filler
# @param to_end [Boolean] when true, the state reached after +after+ must
#   also be a valid end
# @param start_index [Integer] index in +after+ at which matching starts
# @return a Fragment holding the filler nodes (an empty Fragment when
#   +after+ already matches without filler), or nil when no filler works
def fill_before(after:, to_end: false, start_index: 0)
  seen = [self]

  search = lambda do |match, types|
    # Match from the state under consideration (not always self), passing
    # the start index by keyword as match_fragment declares it.
    finished = match.match_fragment(after, start: start_index)
    if finished && (!to_end || finished.valid_end)
      # make_fragment answers nil for an empty type list, and nil means
      # "no fill possible" here, so return an empty Fragment explicitly.
      return types.empty? ? Fragment.new([]) : make_fragment(types)
    end

    match.next_edges.each do |edge|
      type = edge.type
      next_match = edge.next_match
      next if type.text? || type.has_required_attrs? || seen.include?(next_match)

      seen << next_match
      result = search.call(next_match, types + [type])
      return result if result
    end
    nil
  end

  search.call(self, [])
end
83
+
# Look up (or compute and remember) the wrapping for +target_node_type+.
#
# Results are kept in @wrap_cache as [type, wrapping] pairs, so a repeated
# lookup returns the stored value, including a stored nil.
#
# NOTE(review): compute_wrapping is not defined in the visible part of this
# file; confirm it is provided elsewhere.
def find_wrapping(target_node_type)
  @wrap_cache.each do |cached_type, wrapping|
    return wrapping if cached_type == target_node_type
  end

  compute_wrapping(target_node_type).tap do |wrapping|
    @wrap_cache << [target_node_type, wrapping]
  end
end
93
+
# How many outgoing edges this match state has.
#
# @return [Integer]
def edge_count
  @next_edges.size
end
98
+
# Fetch the outgoing edge at position +n+.
#
# @param n [Integer] edge index
# @return the edge stored at +n+
# @raise [Prosereflect::SchemaErrors::ContentMatchError] when +n+ is not
#   below the edge count
def edge(n)
  return @next_edges[n] unless n >= edge_count

  raise Prosereflect::SchemaErrors::ContentMatchError,
        "There's no #{n}th edge in this content match"
end
108
+
# Run the children of +fragment+ through the match graph, starting here.
#
# @param fragment an object exposing +content+ (sized) and +[]+ by index
# @param start [Integer] index of the first child to match
# @param end_index [Integer, nil] index to stop before; defaults to the
#   number of children in the fragment
# @return the state reached after the last matched child, or nil as soon as
#   a child's type has no matching edge
def match_fragment(fragment, start: 0, end_index: nil)
  limit = end_index || fragment.content.size
  state = self
  cursor = start

  while state && cursor < limit
    state = state.match_type(fragment[cursor].type)
    cursor += 1
  end

  state
end
122
+
# Parse a content expression into a match graph.
#
# A nil, empty or token-less expression yields the shared empty match.
# Otherwise the expression is parsed, turned into an NFA, converted to a
# DFA, checked for dead ends, and the DFA start state is returned.
#
# @param expression [String, nil] the content expression
# @param node_types the node type table handed to the token stream
def self.parse(expression, node_types)
  return empty if expression.nil? || expression.empty?

  stream = TokenStream.new(expression, node_types)
  return empty if stream.peek.nil?

  ast = parse_expression(stream)
  stream.error("Unexpected trailing text") unless stream.peek.nil?

  to_dfa(to_nfa(ast)).tap { |dfa| check_for_dead_ends(dfa, stream) }
end
140
+
141
+ class << self
142
+ private
143
+
# Parse one or more sequences separated by "|".
#
# @return the single sequence when there is no "|", otherwise a
#   { type: :choice, exprs: [...] } node
def parse_expression(stream)
  alternatives = [parse_sequence(stream)]
  alternatives << parse_sequence(stream) while stream.accept("|")
  return alternatives.first if alternatives.length == 1

  { type: :choice, exprs: alternatives }
end
152
+
# Parse consecutive subscripted atoms until the input ends or a ")" or "|"
# comes up next.
#
# @return the single element when only one was read, otherwise a
#   { type: :seq, exprs: [...] } node
def parse_sequence(stream)
  items = []
  loop do
    items << parse_subscript(stream)
    upcoming = stream.peek
    break if upcoming.nil? || [")", "|"].any? { |stop| upcoming == stop }
  end
  return items.first if items.length == 1

  { type: :seq, exprs: items }
end
162
+
# Parse an atom followed by any number of postfix operators.
#
# "+", "*" and "?" wrap the expression parsed so far in a :plus, :star or
# :opt node; "{" hands over to parse_range. Anything else ends the loop.
def parse_subscript(stream)
  wrappers = { "+" => :plus, "*" => :star, "?" => :opt }
  node = parse_atom(stream)

  loop do
    token = stream.peek
    if token == "{"
      node = parse_range(stream, node)
    elsif (kind = wrappers[token])
      stream.advance
      node = { type: kind, expr: node }
    else
      break
    end
  end

  node
end
184
+
# Parse a "{min}", "{min,}" or "{min,max}" repetition suffix.
#
# @return a { type: :range, min:, max:, expr: } node; max equals min when
#   no comma is present and is -1 when the upper bound is left open
def parse_range(stream, expr)
  stream.expect("{")
  lower = stream.expect_number
  upper =
    if !stream.accept(",")
      lower
    elsif stream.peek == "}"
      -1
    else
      stream.expect_number
    end
  stream.expect("}")

  { type: :range, min: lower, max: upper, expr: expr }
end
197
+
# Parse a parenthesised sub-expression or a single name.
#
# A name token is resolved through resolve_node_types; any other token
# (or the end of input) is reported through the stream's error method.
def parse_atom(stream)
  if stream.accept("(")
    inner = parse_expression(stream)
    stream.expect(")")
    return inner
  end

  token = stream.peek
  unless token && token =~ /^\w+$/
    return stream.error("Unexpected token \"#{stream.peek}\"")
  end

  stream.advance
  resolve_node_types(stream, token)
end
210
+
# Turn a name from a content expression into an AST node.
#
# The name is looked up as a node type key first; when there is no such
# key, every node type that reports membership of a group by that name is
# collected instead. An unknown name is reported through the stream.
#
# @return a :name node for a single type, or a :choice of :name nodes
def resolve_node_types(stream, name)
  registry = stream.node_types
  matches =
    if registry.key?(name)
      [registry[name]]
    else
      registry.each_value.select { |candidate| candidate.in_group?(name) }
    end

  stream.error("No node type or group \"#{name}\" found") if matches.empty?
  return { type: :name, value: matches.first } if matches.length == 1

  { type: :choice, exprs: matches.map { |found| { type: :name, value: found } } }
end
232
+
# Convert a content-expression AST into an NFA.
#
# The NFA is an array of states; each state is an array of edge hashes
# { from:, to:, term: }. An edge with a nil :term is an epsilon edge.
# State 0 is the start state and the accepting state is always the LAST
# state in the array, which is what the DFA conversion relies on.
#
# Construction uses dangling edges: compiling an expression returns the
# edges that leave it with :to still unset, and the caller connects them
# to whatever comes next. This keeps repetition of choices, groups and
# sequences correct, because the inner expression is simply compiled a
# second time from the loop state instead of being approximated by a
# single self-loop term.
#
# @param expr [Hash] AST node produced by the expression parser
# @return [Array<Array<Hash>>] the NFA states
def to_nfa(expr)
  nfa = [[]]
  dangling = compile_to_nfa(expr, 0, nfa)
  # Allocate the accepting state last so it ends up at nfa.length - 1.
  connect_edges(dangling, new_nfa_state(nfa))
  nfa
end

# Compile +expr+ starting at state +from+.
#
# @return [Array<Hash>] the dangling exit edges of the compiled expression
def compile_to_nfa(expr, from, nfa)
  case expr[:type]
  when :choice
    expr[:exprs].flat_map { |alternative| compile_to_nfa(alternative, from, nfa) }
  when :seq
    compile_sequence(expr[:exprs], from, nfa)
  when :star
    # Enter a dedicated loop state by epsilon, loop the body on it, and
    # leave through a dangling epsilon edge.
    loop_state = new_nfa_state(nfa)
    add_nfa_edge(nfa, from, loop_state)
    connect_edges(compile_to_nfa(expr[:expr], loop_state, nfa), loop_state)
    [add_nfa_edge(nfa, loop_state)]
  when :plus
    # One mandatory pass of the body into the loop state, then the same
    # body again looping on that state.
    loop_state = new_nfa_state(nfa)
    connect_edges(compile_to_nfa(expr[:expr], from, nfa), loop_state)
    connect_edges(compile_to_nfa(expr[:expr], loop_state, nfa), loop_state)
    [add_nfa_edge(nfa, loop_state)]
  when :opt
    [add_nfa_edge(nfa, from)] + compile_to_nfa(expr[:expr], from, nfa)
  when :range
    compile_range(expr[:min], expr[:max], expr[:expr], from, nfa)
  when :name
    [add_nfa_edge(nfa, from, nil, expr[:value])]
  else
    raise Prosereflect::SchemaErrors::ContentMatchError,
          "Unknown expression type #{expr[:type].inspect}"
  end
end

# Compile the elements of a sequence one after another, allocating a fresh
# state between consecutive elements.
#
# @return [Array<Hash>] the dangling exit edges of the last element
def compile_sequence(exprs, from, nfa)
  return [add_nfa_edge(nfa, from)] if exprs.empty?

  current = from
  exprs[0...-1].each do |item|
    dangling = compile_to_nfa(item, current, nfa)
    current = new_nfa_state(nfa)
    connect_edges(dangling, current)
  end
  compile_to_nfa(exprs.last, current, nfa)
end

# Compile a {min,max} repetition of +expr+; a +max+ of -1 means unbounded.
#
# @return [Array<Hash>] the dangling exit edges
def compile_range(min, max, expr, from, nfa)
  current = from

  # Mandatory repetitions: a chain of fresh states.
  min.times do
    following = new_nfa_state(nfa)
    connect_edges(compile_to_nfa(expr, current, nfa), following)
    current = following
  end

  if max == -1
    # Unbounded tail: loop on a dedicated state so the repetition cannot
    # leak into other edges that also leave +current+.
    loop_state = new_nfa_state(nfa)
    add_nfa_edge(nfa, current, loop_state)
    connect_edges(compile_to_nfa(expr, loop_state, nfa), loop_state)
    current = loop_state
  else
    # Optional repetitions: each step may be taken or skipped by epsilon.
    (min...max).each do
      following = new_nfa_state(nfa)
      add_nfa_edge(nfa, current, following)
      connect_edges(compile_to_nfa(expr, current, nfa), following)
      current = following
    end
  end

  [add_nfa_edge(nfa, current)]
end

# Append an empty state to +nfa+ and return its index.
def new_nfa_state(nfa)
  nfa << []
  nfa.length - 1
end

# Create an edge leaving +from+, register it on that state and return it.
# +to+ may be left nil for a dangling edge; a nil +term+ is an epsilon edge.
def add_nfa_edge(nfa, from, to = nil, term = nil)
  edge = { from: from, to: to, term: term }
  nfa[from] << edge
  edge
end

# Point every edge in +edges+ at state +to+.
def connect_edges(edges, to)
  edges.each do |edge|
    edge[:to] = to
  end
end
481
+
# Convert the NFA into a graph of match states by subset construction.
#
# Every distinct set of NFA states becomes one match state, memoised under
# a key built from the sorted state indices. A match state is a valid end
# when its set contains the last NFA state.
#
# @param nfa_states [Array<Array<Hash>>] NFA as an array of edge lists
# @return the match state for the epsilon closure of NFA state 0
def to_dfa(nfa_states)
  accepting = nfa_states.length - 1
  built = {}

  build = lambda do |state_set|
    label = state_set.sort.join(",")
    next built[label] if built.key?(label)

    # Pairs of [node_type, reachable NFA state indices], in discovery order.
    transitions = []
    state_set.each do |index|
      nfa_states[index].each do |edge|
        term = edge[:term]
        next unless term

        reachable = null_from(nfa_states, edge[:to])
        slot = transitions.find { |entry| entry[0] == term }
        if slot
          slot[1] |= reachable
        else
          transitions << [term, reachable]
        end
      end
    end

    dfa_state = new(valid_end: state_set.include?(accepting))
    # Register before descending so cycles resolve to this same object.
    built[label] = dfa_state

    transitions.each do |term, targets|
      successor = built[targets.sort.join(",")] || build.call(targets)
      dfa_state.next_edges << MatchEdge.new(type: term, next_match: successor)
    end

    dfa_state
  end

  build.call(null_from(nfa_states, 0))
end
527
+
# Epsilon closure of a single NFA state: the state itself plus every state
# reachable from it through edges whose :term is nil.
#
# @return [Array<Integer>] the reachable state indices, sorted
def null_from(nfa_states, state_index)
  closure = [state_index]
  pending = [state_index]

  until pending.empty?
    nfa_states[pending.pop].each do |edge|
      next unless edge[:term].nil?

      target = edge[:to]
      next if closure.include?(target)

      closure << target
      pending << target
    end
  end

  closure.sort
end
545
+
# Walk every match state reachable from +match+ and report states that can
# neither end the content nor be left through a generatable node.
#
# A state is acceptable when it is a valid end, or when at least one of
# its edges carries a type that is text or has no required attributes.
# Otherwise the problem is reported through +stream.error+.
#
# NOTE(review): text types count as generatable here, whereas default_type
# and fill_before in this file skip text types - confirm this is intended.
#
# @param match the start state of the match graph
# @param stream object whose +error+ method reports the failure
def check_for_dead_ends(match, stream)
  work = [match] # grows while scanning; each state is queued only once
  position = 0

  while position < work.length
    state = work[position]
    # Advance first so no later branch can skip the increment and spin.
    position += 1

    types = state.next_edges.map(&:type)
    state.next_edges.each do |edge|
      work << edge.next_match unless work.include?(edge.next_match)
    end

    next if state.valid_end
    next if types.any? { |type| type.text? || !type.has_required_attrs? }

    stream.error(
      "Only non-generatable nodes (#{types.map(&:name).join(', ')}) in a required " \
      "position",
    )
  end
end
584
+ end
585
+
# Build a Fragment from freshly created nodes, one per given type.
#
# @param types [Array] node types responding to +create_and_fill+
# @return a Fragment of the created nodes, or nil when +types+ is empty or
#   any type fails to produce a node
def make_fragment(types)
  return nil if types.empty?

  nodes = types.map { |type| type.create_and_fill }
  nodes.any?(&:nil?) ? nil : Fragment.new(nodes)
end
595
+
# Cursor over the tokens of a content expression.
#
# The expression is split into runs of word characters and single non-word
# characters; whitespace-only tokens are dropped.
class TokenStream
  attr_reader :string, :node_types

  # @param string [String] the content expression
  # @param node_types the node type table, kept for the parser to read
  def initialize(string, node_types)
    @string = string
    @node_types = node_types
    @pos = 0
    @tokens = tokenize(string)
  end

  # Current token without consuming it; nil at the end of input.
  def peek
    @tokens[@pos]
  end

  # Consume and return the current token; returns nil (and does not move)
  # at the end of input.
  def advance
    token = peek
    @pos += 1 unless token.nil?
    token
  end

  # Consume the current token when it equals +tok+.
  #
  # @return [Boolean] whether the token was consumed
  def accept(tok)
    return false unless peek == tok

    @pos += 1
    true
  end

  # Consume +tok+ or fail.
  #
  # @return [true]
  # @raise [Prosereflect::SchemaErrors::ContentMatchError] when the current
  #   token is not +tok+
  def expect(tok)
    return true if accept(tok)

    raise Prosereflect::SchemaErrors::ContentMatchError,
          "Expected #{tok}, got #{peek.inspect}"
  end

  # Consume an all-digit token and return it as an Integer.
  #
  # @raise [Prosereflect::SchemaErrors::ContentMatchError] when the current
  #   token is not made of digits only
  def expect_number
    candidate = peek
    return advance.to_i if /^\d+$/.match?(candidate)

    raise Prosereflect::SchemaErrors::ContentMatchError,
          "Expected number, got #{candidate.inspect}"
  end

  # Raise a ContentMatchError carrying +message+ and the full expression.
  def error(message)
    raise Prosereflect::SchemaErrors::ContentMatchError,
          "#{message} (in content expression) \"#{@string}\""
  end

  private

  # Split +string+ into tokens, discarding whitespace-only ones.
  def tokenize(string)
    string.scan(/\w+|\W/).reject { |token| /^\s+$/.match?(token) }
  end
end
654
+ end
655
+ end
656
+ end