prosereflect 0.1.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/docs.yml +63 -0
- data/.github/workflows/links.yml +97 -0
- data/.github/workflows/rake.yml +4 -0
- data/.github/workflows/release.yml +5 -0
- data/.gitignore +4 -0
- data/.rubocop.yml +19 -1
- data/.rubocop_todo.yml +119 -183
- data/CLAUDE.md +78 -0
- data/Gemfile +8 -4
- data/README.adoc +2 -0
- data/Rakefile +3 -3
- data/docs/Gemfile +10 -0
- data/docs/INDEX.adoc +45 -0
- data/docs/_advanced/index.adoc +15 -0
- data/docs/_advanced/schema.adoc +112 -0
- data/docs/_advanced/step-map.adoc +66 -0
- data/docs/_advanced/steps.adoc +88 -0
- data/docs/_advanced/test-builder.adoc +61 -0
- data/docs/_advanced/transform.adoc +92 -0
- data/docs/_config.yml +174 -0
- data/docs/_features/html-input.adoc +69 -0
- data/docs/_features/html-output.adoc +45 -0
- data/docs/_features/index.adoc +15 -0
- data/docs/_features/marks.adoc +86 -0
- data/docs/_features/node-types.adoc +124 -0
- data/docs/_features/user-mentions.adoc +47 -0
- data/docs/_guides/custom-nodes.adoc +107 -0
- data/docs/_guides/index.adoc +13 -0
- data/docs/_guides/round-trip-html.adoc +91 -0
- data/docs/_guides/serialization.adoc +109 -0
- data/docs/_pages/index.adoc +67 -0
- data/docs/_reference/document-api.adoc +49 -0
- data/docs/_reference/index.adoc +14 -0
- data/docs/_reference/node-api.adoc +79 -0
- data/docs/_reference/schema-api.adoc +95 -0
- data/docs/_reference/transform-api.adoc +77 -0
- data/docs/_understanding/document-model.adoc +65 -0
- data/docs/_understanding/fragment.adoc +52 -0
- data/docs/_understanding/index.adoc +14 -0
- data/docs/_understanding/resolved-position.adoc +53 -0
- data/docs/_understanding/slice.adoc +54 -0
- data/docs/lychee.toml +63 -0
- data/lib/prosereflect/attribute/base.rb +4 -6
- data/lib/prosereflect/attribute/bold.rb +2 -4
- data/lib/prosereflect/attribute/href.rb +1 -3
- data/lib/prosereflect/attribute/id.rb +7 -7
- data/lib/prosereflect/attribute.rb +4 -7
- data/lib/prosereflect/blockquote.rb +19 -11
- data/lib/prosereflect/bullet_list.rb +36 -29
- data/lib/prosereflect/code_block.rb +23 -27
- data/lib/prosereflect/code_block_wrapper.rb +12 -13
- data/lib/prosereflect/document.rb +14 -22
- data/lib/prosereflect/fragment.rb +249 -0
- data/lib/prosereflect/hard_break.rb +6 -6
- data/lib/prosereflect/heading.rb +14 -15
- data/lib/prosereflect/horizontal_rule.rb +23 -14
- data/lib/prosereflect/image.rb +32 -23
- data/lib/prosereflect/input/html.rb +179 -104
- data/lib/prosereflect/input.rb +7 -0
- data/lib/prosereflect/list_item.rb +11 -12
- data/lib/prosereflect/mark/base.rb +9 -11
- data/lib/prosereflect/mark/bold.rb +1 -3
- data/lib/prosereflect/mark/code.rb +1 -3
- data/lib/prosereflect/mark/italic.rb +1 -3
- data/lib/prosereflect/mark/link.rb +1 -3
- data/lib/prosereflect/mark/strike.rb +1 -3
- data/lib/prosereflect/mark/subscript.rb +1 -3
- data/lib/prosereflect/mark/superscript.rb +1 -3
- data/lib/prosereflect/mark/underline.rb +1 -3
- data/lib/prosereflect/mark.rb +9 -5
- data/lib/prosereflect/node.rb +171 -33
- data/lib/prosereflect/ordered_list.rb +17 -14
- data/lib/prosereflect/output/html.rb +279 -50
- data/lib/prosereflect/output.rb +7 -0
- data/lib/prosereflect/paragraph.rb +11 -13
- data/lib/prosereflect/parser.rb +56 -66
- data/lib/prosereflect/resolved_pos.rb +256 -0
- data/lib/prosereflect/schema/attribute.rb +57 -0
- data/lib/prosereflect/schema/content_match.rb +656 -0
- data/lib/prosereflect/schema/fragment.rb +166 -0
- data/lib/prosereflect/schema/mark.rb +121 -0
- data/lib/prosereflect/schema/mark_type.rb +130 -0
- data/lib/prosereflect/schema/node.rb +236 -0
- data/lib/prosereflect/schema/node_type.rb +274 -0
- data/lib/prosereflect/schema/schema_main.rb +190 -0
- data/lib/prosereflect/schema/spec.rb +92 -0
- data/lib/prosereflect/schema.rb +39 -0
- data/lib/prosereflect/table.rb +12 -13
- data/lib/prosereflect/table_cell.rb +13 -13
- data/lib/prosereflect/table_header.rb +17 -17
- data/lib/prosereflect/table_row.rb +12 -12
- data/lib/prosereflect/text.rb +35 -11
- data/lib/prosereflect/transform/attr_step.rb +157 -0
- data/lib/prosereflect/transform/insert_step.rb +115 -0
- data/lib/prosereflect/transform/mapping.rb +82 -0
- data/lib/prosereflect/transform/mark_step.rb +269 -0
- data/lib/prosereflect/transform/replace_around_step.rb +181 -0
- data/lib/prosereflect/transform/replace_step.rb +157 -0
- data/lib/prosereflect/transform/slice.rb +91 -0
- data/lib/prosereflect/transform/step.rb +89 -0
- data/lib/prosereflect/transform/step_map.rb +126 -0
- data/lib/prosereflect/transform/structure.rb +120 -0
- data/lib/prosereflect/transform/transform.rb +341 -0
- data/lib/prosereflect/transform.rb +26 -0
- data/lib/prosereflect/user.rb +15 -15
- data/lib/prosereflect/version.rb +1 -1
- data/lib/prosereflect.rb +30 -17
- data/prosereflect.gemspec +17 -16
- data/spec/fixtures/documents/formatted_text.yaml +14 -0
- data/spec/fixtures/documents/heading_paragraph.yaml +16 -0
- data/spec/fixtures/documents/lists_doc.yaml +32 -0
- data/spec/fixtures/documents/mixed_content.yaml +40 -0
- data/spec/fixtures/documents/nested_doc.yaml +20 -0
- data/spec/fixtures/documents/simple_doc.yaml +6 -0
- data/spec/fixtures/documents/table_doc.yaml +32 -0
- data/spec/fixtures/documents/transform_test.yaml +14 -0
- data/spec/fixtures/schema/custom_schema.rb +37 -0
- data/spec/fixtures/schema/test_schema.rb +46 -0
- data/spec/fixtures/test_builder/helpers.rb +212 -0
- data/spec/prosereflect/document_spec.rb +332 -330
- data/spec/prosereflect/fragment_spec.rb +273 -0
- data/spec/prosereflect/hard_break_spec.rb +125 -125
- data/spec/prosereflect/input/html_spec.rb +718 -522
- data/spec/prosereflect/node_spec.rb +311 -182
- data/spec/prosereflect/output/html_spec.rb +105 -105
- data/spec/prosereflect/output/whitespace_spec.rb +248 -0
- data/spec/prosereflect/paragraph_spec.rb +275 -274
- data/spec/prosereflect/parser/round_trip_spec.rb +472 -0
- data/spec/prosereflect/parser_spec.rb +185 -180
- data/spec/prosereflect/resolved_pos_spec.rb +74 -0
- data/spec/prosereflect/schema/conftest.rb +68 -0
- data/spec/prosereflect/schema/content_match_spec.rb +237 -0
- data/spec/prosereflect/schema/mark_spec.rb +274 -0
- data/spec/prosereflect/schema/mark_type_spec.rb +86 -0
- data/spec/prosereflect/schema/node_type_spec.rb +142 -0
- data/spec/prosereflect/schema/schema_spec.rb +194 -0
- data/spec/prosereflect/table_cell_spec.rb +183 -183
- data/spec/prosereflect/table_row_spec.rb +149 -149
- data/spec/prosereflect/table_spec.rb +320 -318
- data/spec/prosereflect/test_builder/marks_spec.rb +127 -0
- data/spec/prosereflect/text_spec.rb +133 -132
- data/spec/prosereflect/transform/equivalence_spec.rb +487 -0
- data/spec/prosereflect/transform/mapping_spec.rb +226 -0
- data/spec/prosereflect/transform/replace_spec.rb +832 -0
- data/spec/prosereflect/transform/replace_step_spec.rb +157 -0
- data/spec/prosereflect/transform/slice_spec.rb +48 -0
- data/spec/prosereflect/transform/step_map_spec.rb +70 -0
- data/spec/prosereflect/transform/step_spec.rb +211 -0
- data/spec/prosereflect/transform/structure_spec.rb +98 -0
- data/spec/prosereflect/transform/transform_spec.rb +238 -0
- data/spec/prosereflect/user_spec.rb +31 -28
- data/spec/prosereflect_spec.rb +28 -26
- data/spec/spec_helper.rb +7 -6
- data/spec/support/matchers.rb +6 -6
- data/spec/support/shared_examples.rb +49 -49
- metadata +96 -5
- data/spec/prosereflect/version_spec.rb +0 -11
|
@@ -0,0 +1,656 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Prosereflect
|
|
4
|
+
class Schema
|
|
5
|
+
# Represents an edge in the content match graph
|
|
6
|
+
class MatchEdge
  # Creates an edge labelled with +type+ that leads to +next_match+.
  #
  # @param type the label of the edge (the node type consumed when following it)
  # @param next_match the match state reached after following the edge
  def initialize(type:, next_match:)
    @next_match = next_match
    @type = type
  end

  # Label of this edge.
  attr_reader :type

  # State this edge leads to.
  attr_reader :next_match
end
|
|
14
|
+
|
|
15
|
+
# Represents a match state for node content expressions
|
|
16
|
+
# Parses expressions like "block+", "inline*", "(paragraph | heading){2,4}"
|
|
17
|
+
class ContentMatch
|
|
18
|
+
attr_reader :valid_end, :next_edges, :wrap_cache
|
|
19
|
+
|
|
20
|
+
# Sets up a match state.
#
# @param valid_end whether the content may stop at this state
# @param next_edges [Array] outgoing edges; a fresh empty array when omitted
def initialize(valid_end:, next_edges: [])
  @wrap_cache = [] # pairs of [target_node_type, computed_wrapping]
  @next_edges = next_edges
  @valid_end = valid_end
end
|
|
25
|
+
|
|
26
|
+
# Memoized match state that is a valid end and has no outgoing edges.
# The same instance is handed out on every call.
def self.empty
  return @empty if @empty

  @empty = new(valid_end: true, next_edges: [])
end
|
|
29
|
+
|
|
30
|
+
# Match a node type and return the next match state
|
|
31
|
+
# Follows the first outgoing edge whose type equals +node_type+.
#
# @return the state behind that edge, or nil when no edge carries the type
def match_type(node_type)
  @next_edges.each do |candidate|
    return candidate.next_match if candidate.type == node_type
  end
  nil
end
|
|
34
|
+
|
|
35
|
+
# Check if this match has inline content
|
|
36
|
+
# True when the first outgoing edge is labelled with a NodeType that
# reports itself as inline; false when there are no edges.
def inline_content?
  return false unless @next_edges.any?

  leading = @next_edges.first.type
  leading.is_a?(NodeType) && leading.inline?
end
|
|
39
|
+
|
|
40
|
+
# Get the default type for this match (first non-text type without required attrs)
|
|
41
|
+
# Picks the type of the first edge that is neither a text type nor a type
# with required attributes.
#
# @return that type, or nil when every edge is excluded
def default_type
  chosen = @next_edges.find do |edge|
    !(edge.type.text? || edge.type.has_required_attrs?)
  end
  chosen&.type
end
|
|
50
|
+
|
|
51
|
+
# Check if this content expression is compatible with another
|
|
52
|
+
# Whether this state and +other+ have at least one edge type in common.
#
# @param other an object exposing +next_edges+
# @return [Boolean]
def compatible?(other)
  @next_edges.each do |mine|
    return true if other.next_edges.any? { |theirs| mine.type == theirs.type }
  end
  false
end
|
|
57
|
+
|
|
58
|
+
# Fill in content before the given fragment
|
|
59
|
+
# Returns a Fragment if successful, nil otherwise
|
|
60
|
+
# Searches for a sequence of node types that can be inserted at this state
# so that +after+ (read from +start_index+ onwards) matches afterwards.
#
# The search walks the outgoing edges depth-first, skipping text types,
# types with required attributes and states that were already visited.
#
# @param after the fragment that has to match after the filled-in content
# @param to_end [Boolean] when true, the state reached after matching
#   +after+ must also be a valid end
# @param start_index [Integer] index in +after+ at which matching starts
# @return whatever #make_fragment builds for the found types, or nil when
#   the search fails
# NOTE(review): #make_fragment returns nil for an empty type list, so
#   "nothing needs to be filled in" is reported the same way as a failed
#   search — confirm that callers expect this.
def fill_before(after:, to_end: false, start_index: 0)
  seen = [self]

  search = lambda do |match, types|
    # Match from the state being explored (not from the state the search
    # started at) and pass the start offset as the keyword that
    # #match_fragment declares.
    finished = match.match_fragment(after, start: start_index)
    return make_fragment(types) if finished && (!to_end || finished.valid_end)

    match.next_edges.each do |edge|
      type = edge.type
      next_match = edge.next_match
      next if type.text? || type.has_required_attrs? || seen.include?(next_match)

      seen << next_match
      result = search.call(next_match, types + [type])
      return result if result
    end
    nil
  end

  search.call(self, [])
end
|
|
83
|
+
|
|
84
|
+
# Find wrapping nodes to reach the target type
|
|
85
|
+
# Cached front end for compute_wrapping: the result for each target type is
# computed once and stored in @wrap_cache (a nil result is cached as well).
# NOTE(review): compute_wrapping is not defined in the part of the file
#   visible here — confirm it exists.
#
# @param target_node_type the node type to look up
# @return the cached or freshly computed wrapping
def find_wrapping(target_node_type)
  hit = @wrap_cache.find { |(cached_type, _wrapping)| cached_type == target_node_type }
  return hit.last if hit

  compute_wrapping(target_node_type).tap do |wrapping|
    @wrap_cache << [target_node_type, wrapping]
  end
end
|
|
93
|
+
|
|
94
|
+
# Number of edges
|
|
95
|
+
# How many outgoing edges this state has.
#
# @return [Integer]
def edge_count
  @next_edges.size
end
|
|
98
|
+
|
|
99
|
+
# Get edge at index n
|
|
100
|
+
# Returns the outgoing edge at index +n+.
#
# Negative indices that fall inside the edge list keep counting from the
# end; any index without an edge (too large or too negative) raises instead
# of silently returning nil.
#
# @param n [Integer] index of the edge
# @return the edge stored at +n+
# @raise [Prosereflect::SchemaErrors::ContentMatchError] when there is no
#   edge at +n+
def edge(n)
  @next_edges.fetch(n) do
    raise Prosereflect::SchemaErrors::ContentMatchError,
          "There's no #{n}th edge in this content match"
  end
end
|
|
108
|
+
|
|
109
|
+
# Match a fragment and return the next match state
|
|
110
|
+
# Feeds the children of +fragment+ between +start+ and +end_index+ through
# the automaton, starting from this state.
#
# @param fragment object exposing +content+ (for the default end) and +[]+
# @param start [Integer] index of the first child to match
# @param end_index [Integer, nil] index just past the last child to match;
#   defaults to +fragment.content.size+
# @return the state reached, or nil as soon as a child has no matching edge
def match_fragment(fragment, start: 0, end_index: nil)
  stop = end_index || fragment.content.size
  state = self

  (start...stop).each do |position|
    break unless state

    state = state.match_type(fragment[position].type)
  end

  state
end
|
|
122
|
+
|
|
123
|
+
# Parse content expression and return ContentMatch
|
|
124
|
+
# Compiles a content expression into a match automaton.
#
# A nil or empty expression, or one that yields no tokens, produces the
# shared empty match. Otherwise the expression is parsed, turned into an
# NFA, then a DFA, and finally checked for dead ends.
#
# @param expression [String, nil] the content expression
# @param node_types the name => node type lookup handed to the token stream
# @return the start state of the resulting automaton
def self.parse(expression, node_types)
  return empty if expression.nil? || expression.empty?

  stream = TokenStream.new(expression, node_types)
  return empty if stream.peek.nil?

  expr = parse_expression(stream)
  stream.error("Unexpected trailing text") unless stream.peek.nil?

  to_dfa(to_nfa(expr)).tap { |dfa| check_for_dead_ends(dfa, stream) }
end
|
|
140
|
+
|
|
141
|
+
class << self
|
|
142
|
+
private
|
|
143
|
+
|
|
144
|
+
# Parses one or more sequences separated by "|".
#
# @return the single sequence when there is no "|", otherwise a :choice
#   node holding every alternative
def parse_expression(stream)
  alternatives = [parse_sequence(stream)]
  alternatives << parse_sequence(stream) while stream.accept("|")

  return alternatives.first if alternatives.length == 1

  { type: :choice, exprs: alternatives }
end
|
|
152
|
+
|
|
153
|
+
# Parses consecutive subscripted atoms until the input ends or a ")" or "|"
# is next.
#
# @return the single element when only one was read, otherwise a :seq node
def parse_sequence(stream)
  parts = []
  loop do
    parts << parse_subscript(stream)
    break if [nil, ")", "|"].include?(stream.peek)
  end

  return parts.first if parts.length == 1

  { type: :seq, exprs: parts }
end
|
|
162
|
+
|
|
163
|
+
# Parses an atom followed by any number of postfix operators
# ("+", "*", "?" or a "{...}" range), wrapping the expression once per
# operator, innermost first.
def parse_subscript(stream)
  node = parse_atom(stream)

  loop do
    token = stream.peek
    if token == "{"
      node = parse_range(stream, node)
    elsif (kind = { "+" => :plus, "*" => :star, "?" => :opt }[token])
      stream.advance
      node = { type: kind, expr: node }
    else
      break
    end
  end

  node
end
|
|
184
|
+
|
|
185
|
+
# Parses a "{min}", "{min,}" or "{min,max}" suffix for +expr+.
# An open upper bound ("{min,}") is recorded as max -1.
#
# @return [Hash] a :range node with :min, :max and :expr
def parse_range(stream, expr)
  stream.expect("{")
  lower = stream.expect_number
  upper =
    if !stream.accept(",")
      lower
    elsif stream.peek == "}"
      -1
    else
      stream.expect_number
    end
  stream.expect("}")

  { type: :range, min: lower, max: upper, expr: expr }
end
|
|
197
|
+
|
|
198
|
+
# Parses either a parenthesised expression or a single word token, which is
# resolved to node types through resolve_node_types. Anything else is
# reported through the stream's error method.
def parse_atom(stream)
  if stream.accept("(")
    inner = parse_expression(stream)
    stream.expect(")")
    return inner
  end

  token = stream.peek
  return stream.error("Unexpected token \"#{stream.peek}\"") unless token && /^\w+$/.match?(token)

  stream.advance
  resolve_node_types(stream, token)
end
|
|
210
|
+
|
|
211
|
+
# Turns a word from the expression into AST nodes: an exact node type name
# wins; otherwise every node type that reports membership in the group
# +name+ is collected.
#
# @return [Hash] a :name node for one match, a :choice of :name nodes for
#   several; an error is reported through the stream when nothing matches
def resolve_node_types(stream, name)
  registry = stream.node_types
  matches =
    if registry.key?(name)
      [registry[name]]
    else
      registry.each_value.select { |candidate| candidate.in_group?(name) }
    end

  stream.error("No node type or group \"#{name}\" found") if matches.empty?

  alternatives = matches.map { |node_type| { type: :name, value: node_type } }
  alternatives.length == 1 ? alternatives.first : { type: :choice, exprs: alternatives }
end
|
|
232
|
+
|
|
233
|
+
# Convert expression AST to NFA
|
|
234
|
+
# Builds a non-deterministic automaton for a parsed content expression.
#
# The automaton is an array of states; each state is an array of edge
# hashes { from:, to:, term: }. An edge with a nil :term is an epsilon
# (empty) transition. State 0 is the start state and the terminal state is
# allocated last, so it is always the final element of the array.
#
# Construction works with "dangling" edges: compiling a sub-expression adds
# its edges to the automaton but leaves the edges that leave the
# sub-expression without a target (:to is nil). The caller decides where
# they lead and patches them with connect_edges.
#
# @param expr [Hash] expression AST produced by the parser
# @return [Array<Array<Hash>>] the automaton
def to_nfa(expr)
  nfa = [[]] # state 0 is the start state
  dangling = compile_to_nfa(expr, 0, nfa)
  # Allocate the terminal state after everything else so that it ends up
  # at index nfa.length - 1.
  connect_edges(dangling, new_nfa_state(nfa))
  nfa
end

# Adds the edges for +expr+ starting at state +from+.
#
# @param expr [Hash] expression AST node
# @param from [Integer] index of the state the expression starts in
# @param nfa [Array<Array<Hash>>] automaton under construction (mutated)
# @return [Array<Hash>] the dangling edges that leave the expression
# @raise [ArgumentError] for an AST node of unknown type
def compile_to_nfa(expr, from, nfa)
  case expr[:type]
  when :choice
    expr[:exprs].flat_map { |alternative| compile_to_nfa(alternative, from, nfa) }
  when :seq
    compile_sequence(expr[:exprs], from, nfa)
  when :star
    # Zero or more: step into a loop state without consuming anything, let
    # the inner expression lead back to it, and leave from the loop state.
    loop_state = new_nfa_state(nfa)
    add_nfa_edge(nfa, from, to: loop_state)
    connect_edges(compile_to_nfa(expr[:expr], loop_state, nfa), loop_state)
    [add_nfa_edge(nfa, loop_state)]
  when :plus
    # One or more: the inner expression is compiled twice, once to reach
    # the loop state and once to repeat from it.
    loop_state = new_nfa_state(nfa)
    connect_edges(compile_to_nfa(expr[:expr], from, nfa), loop_state)
    connect_edges(compile_to_nfa(expr[:expr], loop_state, nfa), loop_state)
    [add_nfa_edge(nfa, loop_state)]
  when :opt
    # Either skip the expression with an epsilon edge or go through it.
    [add_nfa_edge(nfa, from)] + compile_to_nfa(expr[:expr], from, nfa)
  when :range
    compile_range(expr, from, nfa)
  when :name
    [add_nfa_edge(nfa, from, term: expr[:value])]
  else
    raise ArgumentError, "Unknown content expression node #{expr[:type].inspect}"
  end
end

# Chains +exprs+ one after another, allocating a fresh state between each
# pair of neighbours.
#
# @return [Array<Hash>] the dangling edges of the last element
def compile_sequence(exprs, from, nfa)
  return [add_nfa_edge(nfa, from)] if exprs.empty?

  current = from
  exprs[0...-1].each do |part|
    dangling = compile_to_nfa(part, current, nfa)
    current = new_nfa_state(nfa)
    connect_edges(dangling, current)
  end
  compile_to_nfa(exprs.last, current, nfa)
end

# Compiles a :range node (expr{min}, expr{min,} or expr{min,max}).
# A :max of -1 means "no upper bound".
#
# @return [Array<Hash>] the single dangling epsilon edge leaving the range
def compile_range(expr, from, nfa)
  current = from

  # Mandatory repetitions: a plain chain of copies.
  expr[:min].times do
    following = new_nfa_state(nfa)
    connect_edges(compile_to_nfa(expr[:expr], current, nfa), following)
    current = following
  end

  if expr[:max] == -1
    # Unbounded tail: loop on a dedicated state so the loop never leaks
    # into other edges that leave +current+.
    loop_state = new_nfa_state(nfa)
    add_nfa_edge(nfa, current, to: loop_state)
    connect_edges(compile_to_nfa(expr[:expr], loop_state, nfa), loop_state)
    current = loop_state
  else
    # Optional repetitions: each copy can be taken or skipped.
    (expr[:min]...expr[:max]).each do
      following = new_nfa_state(nfa)
      add_nfa_edge(nfa, current, to: following)
      connect_edges(compile_to_nfa(expr[:expr], current, nfa), following)
      current = following
    end
  end

  [add_nfa_edge(nfa, current)]
end

# Appends an edge to state +from+ and returns it. The edge is an epsilon
# edge unless +term+ is given, and dangling unless +to+ is given.
def add_nfa_edge(nfa, from, to: nil, term: nil)
  edge = { from: from, to: to, term: term }
  nfa[from] << edge
  edge
end

# Appends a new state without edges and returns its index.
def new_nfa_state(nfa)
  nfa << []
  nfa.length - 1
end

# Points every edge in +edges+ at state +to+.
def connect_edges(edges, to)
  edges.each do |edge|
    edge[:to] = to
  end
end
|
|
481
|
+
|
|
482
|
+
# Convert NFA to DFA using subset construction
|
|
483
|
+
# Turns the NFA into a deterministic automaton of match states.
#
# Each DFA state stands for a set of NFA states and is memoized under the
# sorted, comma-joined indices of that set. A DFA state is a valid end when
# its set contains the last NFA state.
#
# @param nfa_states [Array<Array<Hash>>] automaton as produced by to_nfa
# @return the DFA state for the epsilon closure of NFA state 0
def to_dfa(nfa_states)
  terminal_index = nfa_states.length - 1
  dfa_by_key = {}

  build = lambda do |state_set|
    key = state_set.sort.join(",")
    next dfa_by_key[key] if dfa_by_key.key?(key)

    # Group the labelled edges leaving the set: [[label, reachable_states], ...]
    transitions = []
    state_set.each do |index|
      nfa_states[index].each do |nfa_edge|
        label = nfa_edge[:term]
        next unless label

        reachable = null_from(nfa_states, nfa_edge[:to])
        slot = transitions.find { |(known, _targets)| known == label }
        if slot
          slot[1] |= reachable
        else
          transitions << [label, reachable]
        end
      end
    end

    dfa_state = new(valid_end: state_set.include?(terminal_index))
    # Register before descending so that cycles resolve to this state.
    dfa_by_key[key] = dfa_state

    transitions.each do |label, targets|
      successor = dfa_by_key[targets.sort.join(",")] || build.call(targets)
      dfa_state.next_edges << MatchEdge.new(type: label, next_match: successor)
    end

    dfa_state
  end

  build.call(null_from(nfa_states, 0))
end
|
|
527
|
+
|
|
528
|
+
# Compute epsilon closure (states reachable via null transitions)
|
|
529
|
+
# Collects +state_index+ together with every state reachable from it through
# edges whose :term is nil.
#
# @param nfa_states [Array<Array<Hash>>] the automaton
# @param state_index [Integer] state to start from
# @return [Array<Integer>] the reachable state indices, sorted
def null_from(nfa_states, state_index)
  closure = [state_index]
  pending = [state_index]

  until pending.empty?
    nfa_states[pending.pop].each do |edge|
      next unless edge[:term].nil?

      target = edge[:to]
      next if closure.include?(target)

      closure << target
      pending << target
    end
  end

  closure.sort
end
|
|
545
|
+
|
|
546
|
+
# Walks every state reachable from +match+ and reports states that can
# neither end the content nor be left through a generatable node.
#
# A state is a dead end when it is not a valid end and none of its edges is
# labelled with a type that is a text type or has no required attributes.
# Each state is visited once: +work+ doubles as the queue and as the record
# of states already scheduled.
#
# @param match the start state of the automaton
# @param stream object whose +error+ method reports the problem
# @return [nil]
def check_for_dead_ends(match, stream)
  work = [match]
  index = 0

  while index < work.length
    state = work[index]
    index += 1

    edges = state.next_edges
    edges.each do |edge|
      work << edge.next_match unless work.include?(edge.next_match)
    end

    next if state.valid_end
    # Text nodes always count as generatable; other nodes only when they
    # have no required attributes.
    next if edges.any? { |edge| edge.type.text? || !edge.type.has_required_attrs? }

    node_names = edges.map { |edge| edge.type.name }
    stream.error(
      "Only non-generatable nodes (#{node_names.join(', ')}) in a required " \
      "position",
    )
  end
end
|
|
584
|
+
end
|
|
585
|
+
|
|
586
|
+
# For creating test fragments
|
|
587
|
+
# Builds a Fragment from one freshly created node per type.
#
# @param types [Array] node types responding to +create_and_fill+
# @return a Fragment wrapping the created nodes, or nil when +types+ is
#   empty or any type fails to produce a node
def make_fragment(types)
  return if types.empty?

  built = types.map { |node_type| node_type.create_and_fill }
  built.any?(&:nil?) ? nil : Fragment.new(built)
end
|
|
595
|
+
|
|
596
|
+
# Token stream for parsing content expressions
|
|
597
|
+
class TokenStream
  attr_reader :string, :node_types

  # @param string [String] the content expression to tokenize
  # @param node_types the node type lookup exposed to the parser
  def initialize(string, node_types)
    @string = string
    @node_types = node_types
    @pos = 0
    @tokens = tokenize(string)
  end

  # Current token without consuming it; nil once the input is exhausted.
  def peek
    @tokens[@pos]
  end

  # Consumes and returns the current token; returns nil (and stays put)
  # at the end of the input.
  def advance
    token = peek
    @pos += 1 unless token.nil?
    token
  end

  # Consumes the current token when it equals +tok+.
  #
  # @return [Boolean] whether the token was consumed
  def accept(tok)
    return false unless peek == tok

    @pos += 1
    true
  end

  # Like #accept, but a mismatch raises ContentMatchError.
  #
  # @return [true]
  def expect(tok)
    return true if accept(tok)

    raise Prosereflect::SchemaErrors::ContentMatchError,
          "Expected #{tok}, got #{peek.inspect}"
  end

  # Consumes an all-digit token and returns it as an Integer; raises
  # ContentMatchError when the current token is not a number.
  def expect_number
    candidate = peek
    return advance.to_i if /^\d+$/.match?(candidate)

    raise Prosereflect::SchemaErrors::ContentMatchError,
          "Expected number, got #{candidate.inspect}"
  end

  # Raises ContentMatchError with +message+ followed by the expression.
  def error(message)
    raise Prosereflect::SchemaErrors::ContentMatchError,
          "#{message} (in content expression) \"#{@string}\""
  end

  private

  # Splits the expression into word tokens and single non-word characters,
  # dropping whitespace.
  def tokenize(string)
    string.scan(/\w+|\W/).reject { |piece| /^\s+$/.match?(piece) }
  end
end
|
|
654
|
+
end
|
|
655
|
+
end
|
|
656
|
+
end
|