prosereflect 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/docs.yml +63 -0
  3. data/.github/workflows/links.yml +97 -0
  4. data/.gitignore +4 -0
  5. data/.rubocop_todo.yml +61 -75
  6. data/README.adoc +2 -0
  7. data/docs/Gemfile +10 -0
  8. data/docs/INDEX.adoc +45 -0
  9. data/docs/_advanced/index.adoc +15 -0
  10. data/docs/_advanced/schema.adoc +112 -0
  11. data/docs/_advanced/step-map.adoc +66 -0
  12. data/docs/_advanced/steps.adoc +88 -0
  13. data/docs/_advanced/test-builder.adoc +61 -0
  14. data/docs/_advanced/transform.adoc +92 -0
  15. data/docs/_config.yml +174 -0
  16. data/docs/_features/html-input.adoc +69 -0
  17. data/docs/_features/html-output.adoc +45 -0
  18. data/docs/_features/index.adoc +15 -0
  19. data/docs/_features/marks.adoc +86 -0
  20. data/docs/_features/node-types.adoc +124 -0
  21. data/docs/_features/user-mentions.adoc +47 -0
  22. data/docs/_guides/custom-nodes.adoc +107 -0
  23. data/docs/_guides/index.adoc +13 -0
  24. data/docs/_guides/round-trip-html.adoc +91 -0
  25. data/docs/_guides/serialization.adoc +109 -0
  26. data/docs/_pages/index.adoc +67 -0
  27. data/docs/_reference/document-api.adoc +49 -0
  28. data/docs/_reference/index.adoc +14 -0
  29. data/docs/_reference/node-api.adoc +79 -0
  30. data/docs/_reference/schema-api.adoc +95 -0
  31. data/docs/_reference/transform-api.adoc +77 -0
  32. data/docs/_understanding/document-model.adoc +65 -0
  33. data/docs/_understanding/fragment.adoc +52 -0
  34. data/docs/_understanding/index.adoc +14 -0
  35. data/docs/_understanding/resolved-position.adoc +53 -0
  36. data/docs/_understanding/slice.adoc +54 -0
  37. data/docs/lychee.toml +63 -0
  38. data/lib/prosereflect/blockquote.rb +9 -0
  39. data/lib/prosereflect/bullet_list.rb +25 -19
  40. data/lib/prosereflect/code_block.rb +1 -5
  41. data/lib/prosereflect/fragment.rb +249 -0
  42. data/lib/prosereflect/horizontal_rule.rb +9 -0
  43. data/lib/prosereflect/image.rb +9 -0
  44. data/lib/prosereflect/input/html.rb +96 -0
  45. data/lib/prosereflect/node.rb +141 -3
  46. data/lib/prosereflect/ordered_list.rb +2 -0
  47. data/lib/prosereflect/output/html.rb +227 -0
  48. data/lib/prosereflect/parser.rb +9 -0
  49. data/lib/prosereflect/resolved_pos.rb +256 -0
  50. data/lib/prosereflect/schema/attribute.rb +57 -0
  51. data/lib/prosereflect/schema/content_match.rb +656 -0
  52. data/lib/prosereflect/schema/fragment.rb +166 -0
  53. data/lib/prosereflect/schema/mark.rb +121 -0
  54. data/lib/prosereflect/schema/mark_type.rb +130 -0
  55. data/lib/prosereflect/schema/node.rb +236 -0
  56. data/lib/prosereflect/schema/node_type.rb +274 -0
  57. data/lib/prosereflect/schema/schema_main.rb +190 -0
  58. data/lib/prosereflect/schema/spec.rb +92 -0
  59. data/lib/prosereflect/schema.rb +39 -0
  60. data/lib/prosereflect/text.rb +24 -0
  61. data/lib/prosereflect/transform/attr_step.rb +157 -0
  62. data/lib/prosereflect/transform/insert_step.rb +115 -0
  63. data/lib/prosereflect/transform/mapping.rb +82 -0
  64. data/lib/prosereflect/transform/mark_step.rb +269 -0
  65. data/lib/prosereflect/transform/replace_around_step.rb +181 -0
  66. data/lib/prosereflect/transform/replace_step.rb +157 -0
  67. data/lib/prosereflect/transform/slice.rb +91 -0
  68. data/lib/prosereflect/transform/step.rb +89 -0
  69. data/lib/prosereflect/transform/step_map.rb +126 -0
  70. data/lib/prosereflect/transform/structure.rb +120 -0
  71. data/lib/prosereflect/transform/transform.rb +341 -0
  72. data/lib/prosereflect/transform.rb +26 -0
  73. data/lib/prosereflect/version.rb +1 -1
  74. data/lib/prosereflect.rb +3 -0
  75. data/spec/fixtures/documents/formatted_text.yaml +14 -0
  76. data/spec/fixtures/documents/heading_paragraph.yaml +16 -0
  77. data/spec/fixtures/documents/lists_doc.yaml +32 -0
  78. data/spec/fixtures/documents/mixed_content.yaml +40 -0
  79. data/spec/fixtures/documents/nested_doc.yaml +20 -0
  80. data/spec/fixtures/documents/simple_doc.yaml +6 -0
  81. data/spec/fixtures/documents/table_doc.yaml +32 -0
  82. data/spec/fixtures/documents/transform_test.yaml +14 -0
  83. data/spec/fixtures/schema/custom_schema.rb +37 -0
  84. data/spec/fixtures/schema/test_schema.rb +46 -0
  85. data/spec/fixtures/test_builder/helpers.rb +212 -0
  86. data/spec/prosereflect/document_spec.rb +1 -1
  87. data/spec/prosereflect/fragment_spec.rb +273 -0
  88. data/spec/prosereflect/input/html_spec.rb +197 -1
  89. data/spec/prosereflect/node_spec.rb +128 -0
  90. data/spec/prosereflect/output/whitespace_spec.rb +248 -0
  91. data/spec/prosereflect/parser/round_trip_spec.rb +472 -0
  92. data/spec/prosereflect/resolved_pos_spec.rb +74 -0
  93. data/spec/prosereflect/schema/conftest.rb +68 -0
  94. data/spec/prosereflect/schema/content_match_spec.rb +237 -0
  95. data/spec/prosereflect/schema/mark_spec.rb +274 -0
  96. data/spec/prosereflect/schema/mark_type_spec.rb +86 -0
  97. data/spec/prosereflect/schema/node_type_spec.rb +142 -0
  98. data/spec/prosereflect/schema/schema_spec.rb +194 -0
  99. data/spec/prosereflect/test_builder/marks_spec.rb +127 -0
  100. data/spec/prosereflect/transform/equivalence_spec.rb +487 -0
  101. data/spec/prosereflect/transform/mapping_spec.rb +226 -0
  102. data/spec/prosereflect/transform/replace_spec.rb +832 -0
  103. data/spec/prosereflect/transform/replace_step_spec.rb +157 -0
  104. data/spec/prosereflect/transform/slice_spec.rb +48 -0
  105. data/spec/prosereflect/transform/step_map_spec.rb +70 -0
  106. data/spec/prosereflect/transform/step_spec.rb +211 -0
  107. data/spec/prosereflect/transform/structure_spec.rb +98 -0
  108. data/spec/prosereflect/transform/transform_spec.rb +238 -0
  109. data/spec/spec_helper.rb +1 -0
  110. metadata +90 -2
@@ -0,0 +1,249 @@
1
+ # frozen_string_literal: true
2
+
3
module Prosereflect
  # Fragment represents an ordered sequence of nodes.
  # Used for document content, slice content, etc.
  #
  # Positions are counted in node-size units: each child occupies
  # +node_size+ consecutive positions and #size is the sum over all children.
  class Fragment
    attr_reader :content

    # @param content [Array, #to_a, Object] an Array is stored as given (not
    #   copied), anything responding to +to_a+ is converted, and any other
    #   object is wrapped in a one-element Array
    def initialize(content = [])
      @content = if content.is_a?(Array)
                   content
                 elsif content.respond_to?(:to_a)
                   content.to_a
                 else
                   [content]
                 end
    end

    # Total size of all nodes in this fragment
    # @return [Integer]
    def size
      @content.sum { |node| node_size_of(node) }
    end

    # Check if fragment is empty
    def empty?
      @content.empty?
    end

    # Append another fragment (or a single node) to this one.
    # @return [Fragment] a new fragment; the receiver is left untouched
    def append(other)
      if other.is_a?(Fragment)
        Fragment.new(@content + other.content)
      else
        Fragment.new(@content + [other])
      end
    end

    # Cut this fragment to a range.
    #
    # Returns the nodes that overlap the half-open range [from, to).
    # Nodes that only partially overlap the range are included whole; they
    # are not trimmed.
    #
    # @param from [Integer] start position (inclusive)
    # @param to [Integer, nil] end position (exclusive); defaults to #size
    # @return [Fragment]
    def cut(from = 0, to = nil)
      to ||= size

      return Fragment.new([]) if from >= to

      cut_nodes(from, to)
    end

    # Collects the nodes overlapping [from, to). Used by #cut.
    def cut_nodes(from, to)
      result = []
      pos = 0

      @content.each do |node|
        node_end = pos + node_size_of(node)

        # Only nodes that intersect the range belong to the cut; nodes that
        # end at or before +from+ are skipped.
        result << node if overlaps_range?(pos, node_end, from, to)

        pos = node_end
        break if pos >= to
      end

      Fragment.new(result)
    end

    # True when a node ends at or before +from+, i.e. lies completely in
    # front of a range. Kept for callers that may use it; #cut_nodes no
    # longer relies on it.
    def in_range_before_from?(_pos, node_end, from)
      node_end <= from
    end

    # True when the span [pos, node_end) intersects the range [from, to).
    def overlaps_range?(pos, node_end, from, to)
      pos < to && node_end > from
    end

    # Replace child at index
    # @return [Fragment] a new fragment with the child swapped
    def replace_child(index, replacement)
      new_content = @content.dup
      new_content[index] = replacement
      Fragment.new(new_content)
    end

    # Iterate over all nodes between positions.
    #
    # The callback (a callable passed as +callback+, or a block) is invoked
    # with +(node, position)+.
    #
    # @param from [Integer] start position
    # @param to [Integer] end position
    # @param callback [#call, nil] callable used when no block is given
    # @param node_start [Integer] offset added to reported positions
    # @return [nil]
    def nodes_between(from, to, callback = nil, node_start = 0, &blk)
      cb = callback || blk
      return unless cb && to > from

      pos = 0

      @content.each do |node|
        node_end = pos + node_size_of(node)

        dispatch_node_callback(node, pos, node_end, from, to, cb, node_start) if node_end > from

        # Always advance, including for nodes skipped because they end
        # before +from+; otherwise every later node is measured from the
        # wrong offset.
        pos = node_end
        break if pos >= to
      end

      nil
    end

    # Routes a node to the matching callback strategy.
    #
    # NOTE(review): a non-text node that starts inside the range but extends
    # past +to+ matches neither branch and is not reported -- confirm whether
    # that is intended.
    def dispatch_node_callback(node, pos, node_end, from, to, callback, node_start)
      if node.text?
        text_node_callback(node, pos, from, node_start, callback)
      elsif node_fully_in_range?(pos, node_end, from, to)
        full_node_callback(node, pos, node_end, from, to, callback, node_start)
      elsif node_overlaps_from?(pos, node_end, from)
        partial_node_callback(node, pos, node_end, from, to, callback, node_start)
      end
    end

    # Reports a text node. The position passed is +node_start+ plus the
    # offset of +from+ inside the node, clamped to the node's extent.
    # NOTE(review): the node's own offset (+pos+) is not added -- confirm.
    def text_node_callback(node, pos, from, node_start, callback)
      callback.call(node, node_start + (from - pos).clamp(0, node.node_size - 1))
    end

    def node_fully_in_range?(pos, node_end, from, to)
      pos >= from && node_end <= to
    end

    # Reports a node lying completely inside the range, then descends into it.
    def full_node_callback(node, _pos, _node_end, _from, _to, callback, node_start)
      callback.call(node, node_start)
      recurse_into_node(node, 0, content_size_of(node), callback, node_start)
    end

    # Descends into a node that straddles +from+ without reporting the node
    # itself.
    def partial_node_callback(node, pos, _node_end, from, to, callback, node_start)
      recurse_into_node(node, from - pos, [to - pos, content_size_of(node)].min, callback, node_start)
    end

    def node_overlaps_from?(pos, node_end, from)
      pos < from && node_end > from
    end

    # Delegates to the node's own +nodes_between+ when it has one.
    def recurse_into_node(node, start_pos, end_pos, callback, node_start)
      return unless node.respond_to?(:nodes_between)

      node.nodes_between(start_pos, end_pos, callback, node_start)
    end

    # Iterate over all descendant nodes.
    #
    # Accepts either a callable as the first argument or a block.
    #
    # @param block [#call, nil] callable invoked with +(node, position)+
    # @param node_start [Integer] offset added to reported positions
    def descendants(block = nil, node_start = 0, &blk)
      nodes_between(0, size, block || blk, node_start)
    end

    # Extract text content.
    #
    # NOTE: the +_from+/+_to+ positions and +_block_separator+ are currently
    # ignored; the text of every node in the fragment is joined with
    # +separator+.
    def text_between(_from, _to, separator = "", _block_separator = "\n")
      result = []
      @content.each do |node|
        if node.respond_to?(:text)
          result << node.text
        elsif node.respond_to?(:text_content)
          result << node.text_content
        end
      end
      result.join(separator)
    end

    # Find first position where two fragments differ
    # @return [Integer, nil] nil when no difference is found
    def find_diff_start(other)
      min_length = [@content.length, other.content.length].min

      pos = 0
      min_length.times do |i|
        return pos if @content[i] != other.content[i]

        pos += node_size_of(@content[i])
      end

      return nil if @content.length == other.content.length

      pos
    end

    # Find last position where two fragments differ, scanning from the end.
    #
    # @return [Integer, nil] the position in this fragment, or nil when no
    #   difference is found
    def find_diff_end(other)
      shared = [@content.length, other.content.length].min
      end_pos = size

      1.upto(shared) do |offset|
        my_node = @content[-offset]
        return end_pos unless my_node == other.content[-offset]

        end_pos -= node_size_of(my_node)
      end

      # A shared suffix with different lengths is still a difference,
      # mirroring what #find_diff_start does for a shared prefix.
      return nil if @content.length == other.content.length

      end_pos
    end

    # Check equality: same number of nodes with pairwise equal +to_h+.
    def eq?(other)
      return false unless other.is_a?(Fragment)

      @content.length == other.content.length &&
        @content.zip(other.content).all? { |a, b| a.to_h == b.to_h }
    end

    alias == eq?
    # Hash and Set compare keys with eql?, so it has to agree with #hash.
    alias eql? eq?

    # Hash for use in sets/hashes
    def hash
      @content.map(&:to_h).hash
    end

    # Access by index
    def [](index)
      @content[index]
    end

    # Iterate
    def each(&block)
      @content.each(&block)
    end

    # Number of items
    def length
      @content.length
    end

    alias count length

    # Convert to array
    # @return [Array] a shallow copy of the node list
    def to_a
      @content.dup
    end

    # Create empty fragment (memoized, shared instance)
    def self.empty
      @empty ||= new([])
    end

    # Create from content
    # @param content [Fragment, Array, Object] a Fragment is returned as is,
    #   an Array is flattened, anything else becomes a one-node fragment
    def self.from(content)
      case content
      when Fragment then content
      when Array then new(content.flatten)
      else new([content])
      end
    end

    def to_s
      "<Fragment #{@content.length} nodes>"
    end

    def inspect
      to_s
    end

    private

    # Size of a single node, falling back to text length + 1 for objects
    # that do not implement +node_size+.
    def node_size_of(node)
      node.respond_to?(:node_size) ? node.node_size : node.text_content.length + 1
    end

    # +node.content.size+, or 0 when the node has no content.
    # NOTE(review): for Array content this is the number of children, not a
    # position count -- confirm this is the intended recursion bound.
    def content_size_of(node)
      children = node.content
      children ? children.size : 0
    end
  end
end
@@ -59,6 +59,15 @@ module Prosereflect
59
59
  end
60
60
 
61
61
  # Override content-related methods since horizontal rules don't have content
62
# Serializes via the parent implementation, then prunes the optional
# "style", "width" and "thickness" attrs when they are nil. The "attrs"
# entry is removed altogether once nothing is left in it.
#
# @return [Hash]
def to_h
  serialized = super
  node_attrs = serialized["attrs"]
  return serialized unless node_attrs

  unset = %w[style width thickness].select { |key| node_attrs[key].nil? }
  unset.each { |key| node_attrs.delete(key) }
  serialized.delete("attrs") if node_attrs.empty?
  serialized
end
70
+
62
71
  def add_child(*)
63
72
  raise NotImplementedError, "Horizontal rule nodes cannot have children"
64
73
  end
@@ -94,6 +94,15 @@ module Prosereflect
94
94
  }.compact
95
95
  end
96
96
 
97
# Serializes via the parent implementation, then prunes the optional
# "title", "width" and "height" attrs when they are nil. The "attrs"
# entry is removed altogether once nothing is left in it.
#
# @return [Hash]
def to_h
  serialized = super
  node_attrs = serialized["attrs"]
  return serialized unless node_attrs

  unset = %w[title width height].select { |key| node_attrs[key].nil? }
  unset.each { |key| node_attrs.delete(key) }
  serialized.delete("attrs") if node_attrs.empty?
  serialized
end
105
+
97
106
  # Override content-related methods since images don't have content
98
107
  def add_child(*)
99
108
  raise NotImplementedError, "Image nodes cannot have children"
@@ -478,7 +478,103 @@ module Prosereflect
478
478
  user.id = html_node["data-id"]
479
479
  user
480
480
  end
481
+
482
+ # Parse HTML with full schema validation
483
# Parses +html+ and runs the result through schema validation.
#
# Validation is best-effort: a ValidationError is swallowed and the parsed
# document is returned regardless.
#
# @param html [String] markup handed to +parse+
# @param schema [Object] schema handed to +validate_against_schema+
# @param _rules [Hash] currently unused
# @return [Object, nil] the parsed document (nil only if +parse+ itself
#   raised ValidationError before producing one)
def parse_with_schema(html, schema, _rules = {})
  parsed = nil
  begin
    parsed = parse(html)
    validate_against_schema(parsed, schema)
  rescue ValidationError
    # Fall back to basic parsing if validation fails
  end
  parsed
end
491
+
492
+ # Parse HTML with custom parse rules
493
# Parses +html+ and passes the document, together with the effective rule
# set, to +apply_parse_rules+.
#
# @param html [String] markup handed to +parse+
# @param rules [Hash] rule overrides; :keep_empty, :find_wrapping,
#   :top_node and :top_start get defaults, and every key present in
#   +rules+ wins over those defaults
# @return [Object] whatever +apply_parse_rules+ returns
def parse_with_rules(html, rules:)
  defaults = {
    keep_empty: rules[:keep_empty] || false,
    find_wrapping: rules[:find_wrapping],
    top_node: rules[:top_node],
    top_start: rules[:top_start],
  }
  effective = defaults.merge(rules)

  parsed = parse(html)
  apply_parse_rules(parsed, effective)
end
504
+
505
+ # Parse a single node with context
506
# Converts one HTML node and applies the contextual options to the result.
#
# @param html_node [Object] node handed to +convert_node+
# @param options [Hash] :node (parent node), :saved_styles (defaults to []),
#   :top_node (defaults to false), :clear_null (defaults to true)
# @return [Object, nil] the converted node; nil when conversion yields nil
def parse_node(html_node, options = {})
  parent_node, saved_styles, top_node = options.values_at(:node, :saved_styles, :top_node)
  clear_null = options.fetch(:clear_null, true)

  converted = convert_node(html_node)
  return nil if clear_null && converted.nil?

  apply_node_options(converted, parent_node, saved_styles || [], top_node || false)
end
517
+
518
+ # Check if whitespace should be preserved in node
519
# Whether whitespace inside +node+ has to be kept verbatim.
#
# True for <pre> and <textarea>, and for elements whose inline style sets
# the white-space property to a value starting with "pre" (pre, pre-wrap,
# pre-line).
#
# @param node [#name, #[]] element exposing its tag name and attribute
#   lookup by name
# @return [Boolean]
def preserve_whitespace?(node)
  return true if %w[pre textarea].include?(node.name)

  style = node["style"]
  return false unless style

  # Inspect the value of the white-space declaration itself. A plain
  # substring test for "white-space" and "pre" also matched styles such as
  # "white-space: nowrap; font-family: Supreme".
  style.match?(/(?:\A|;)\s*white-space\s*:\s*pre/i)
end
528
+
529
+ # Determine space collapsing behavior
530
# Determine space collapsing behavior for +node+.
#
# @param node [#name] element to inspect
# @return [Symbol] :preserve when whitespace must be kept, otherwise
#   :collapse (this includes <br>)
def collapsed_spaces(node)
  preserve_whitespace?(node) ? :preserve : :collapse
end
536
+
537
+ # Normalize whitespace in text
538
# Collapses every run of whitespace into a single space and trims both ends.
#
# @param text [String]
# @return [String] a new string; +text+ is not modified
def normalize_whitespace(text)
  single_spaced = text.gsub(/\s+/, " ")
  single_spaced.strip
end
541
+
542
# Basic schema validation: checks each entry of +document.nodes+ against
# +schema+.
#
# @param document [#nodes] parsed document
# @param schema [Object] schema handed to +validate_node_against_schema+
def validate_against_schema(document, schema)
  document.nodes.each { |child| validate_node_against_schema(child, schema) }
end
548
+
549
# Checks that +node+ contains every child type its schema node type lists
# as required content.
#
# Node types unknown to the schema, and node types without required
# content, are accepted without further checks.
#
# @param node [#type, #content] node to validate; +content+ may be nil
# @param schema [#node_type] schema used to look up the node type
# @return [nil]
# @raise [ValidationError] when required child types are missing
def validate_node_against_schema(node, schema)
  node_type = schema.node_type(node.type)
  return unless node_type

  required = node_type.required_content
  return unless required.any?

  # A node without content has no children at all, so everything required
  # is missing; Array() turns nil into [] instead of raising NoMethodError.
  present = Array(node.content).map(&:type)
  missing = required - present
  raise ValidationError, "Missing required content: #{missing.join(', ')}" unless missing.empty?
end
559
+
560
# Hook for post-processing a parsed document according to parse rules.
#
# No rule currently changes the document: the previous :keep_empty check
# returned +document+ on both branches, so the document is handed back
# as is.
#
# @param document [Object] parsed document
# @param _options [Hash] effective parse rules (currently unused)
# @return [Object] +document+, unchanged
def apply_parse_rules(document, _options)
  document
end
565
+
566
# Applies parent-context marks to +node+.
#
# The marks are replaced by a copy of +saved_styles+ only when the node
# accepts marks (responds to +marks=+), +top_node+ is truthy and a parent
# node is given.
#
# @param node [Object] converted node
# @param parent_node [Object, nil] parent context
# @param saved_styles [Array] marks inherited from the context
# @param top_node [Boolean]
# @return [Object] +node+
def apply_node_options(node, parent_node, saved_styles, top_node)
  inherits_marks = node.respond_to?(:marks=) && top_node && parent_node
  node.marks = saved_styles.dup if inherits_marks
  node
end
481
575
  end
576
+
577
# Raised when a parsed document does not satisfy a schema.
class ValidationError < StandardError
end
482
578
  end
483
579
  end
484
580
  end
@@ -206,18 +206,156 @@ module Prosereflect
206
206
  content.map(&:text_content).join
207
207
  end
208
208
 
209
+ # Size of this node in the document tree.
210
+ # For non-text nodes: 1 (opening token) + sum of children's node_size.
211
+ # For text nodes: overridden to text.length + 1.
212
# @return [Integer] 1 plus the +node_size+ of every child; 1 when the node
#   has no content
def node_size
  children = content || []
  children.reduce(1) { |total, child| total + child.node_size }
end
217
+
218
+ # Whether this node represents a text node.
219
+ # Overridden to true in Text class.
220
# @return [Boolean] always false here
def text?
  false
end
223
+
224
+ # Return a copy of this node with content restricted to the given range.
225
+ # Positions are relative to the start of this node's content.
226
# @param from [Integer] start of the range to keep
# @param to [Integer, nil] end of the range to keep; defaults to +node_size+
# @return [Object] +self+ when the range is (0, node_size) or when this is
#   a text node; otherwise a copy holding only the content selected by
#   +cut_content+
def cut(from = 0, to = nil)
  to ||= node_size
  untouched = from.zero? && to == node_size
  # Text nodes override this method, so the generic version hands them back.
  return self if untouched || text?

  copy(cut_content(from, to))
end
237
+
238
+ # Iterate over all nodes between two positions in this node.
239
+ # Accepts a block or a callable as the third positional argument.
240
# @param from [Integer] start position, relative to this node's content
# @param to [Integer] end position, relative to this node's content
# @param callback [#call, nil] invoked with +(child, position, index)+;
#   returning +false+ skips that child's descendants
# @param node_start [Integer] offset added to reported positions
# @yield [child, position, index] used when no +callback+ is given
def nodes_between(from, to, callback = nil, node_start = 0, &block)
  cb = callback || block
  return unless cb && to > from && content

  pos = 0
  content.each_with_index do |child, i|
    break if pos >= to

    child_end = pos + child.node_size
    if child_end > from
      child_start = node_start + pos + 1
      descend = cb.call(child, child_start, i) != false && child.content&.any?
      if descend
        child.nodes_between(
          [0, from - pos - 1].max,
          # Upper bound in position space: the child's content spans
          # node_size - 1 positions (the same measure #descendants uses),
          # whereas content.size would only be the number of children.
          [child.node_size - 1, to - pos - 1].min,
          cb,
          child_start,
        )
      end
    end

    # Advance past every child, including ones that end before +from+;
    # skipping this left all following children at the wrong offset.
    pos = child_end
  end
end
264
+
265
+ # Iterate over all descendant nodes.
266
# Walks the whole content range (0 up to node_size - 1) via +nodes_between+.
#
# @yield forwarded to +nodes_between+
def descendants(&block)
  content_end = node_size - 1
  nodes_between(0, content_end, &block)
end
269
+
270
+ # Check structural equality with another node.
271
# @param other [Object]
# @return [Boolean] true when +other+ is a Node with the same +type+ and an
#   equal +to_h+ representation
def eq?(other)
  other.is_a?(Node) && type == other.type && to_h == other.to_h
end
276
+
277
+ # Create a copy of this node with different content.
278
# Builds a new node of the same class with this node's type, attrs and raw
# marks.
#
# @param new_content [nil, Array, Fragment, Object] nil leaves the copy's
#   content unassigned, an Array is used as given, a Fragment is converted
#   with +to_a+, any other object is wrapped in a one-element Array
# @return [Object] the new node
def copy(new_content = nil)
  duplicate = self.class.new(type: type, attrs: attrs, marks: raw_marks)
  return duplicate if new_content.nil?

  duplicate.content = case new_content
                      when Array then new_content
                      when Fragment then new_content.to_a
                      else [new_content]
                      end
  duplicate
end
292
+
209
293
  # Ensures YAML serialization outputs plain data instead of a Ruby object
210
294
  def to_yaml(*args)
211
295
  to_h.to_yaml(*args)
212
296
  end
213
297
 
298
+ # Resolve a document position to a ResolvedPos
299
# @param pos [Integer] document position
# @return [ResolvedPos] built from the position, the path collected by
#   +build_path_for_pos+ (three entries per level: node, index, offset) and
#   the depth derived from the path length (never below 0)
def resolve(pos)
  path = []
  build_path_for_pos(pos, path)
  deepest = (path.length / 3) - 1
  ResolvedPos.new(pos, path, deepest.negative? ? 0 : deepest)
end
305
+
306
+ # Get the node at a given depth in the path
307
# Looks up the entry stored at index +depth * 2+ of +@path+.
#
# Returns nil when no path has been stored on this object; nothing in the
# visible part of this class assigns +@path+, so without the guard the
# lookup raised NoMethodError on nil.
#
# NOTE(review): #resolve builds paths with three entries per level (node,
# index, offset), which would call for +depth * 3+ here -- confirm which
# layout +@path+ is expected to have before changing the stride.
#
# @param depth [Integer]
# @return [Object, nil]
def node(depth)
  return nil unless @path

  @path[depth * 2]
end
310
+
214
311
  private
215
312
 
216
- def process_node_attributes(attrs, node_type)
313
# Selects the children covering the range (from, to), measured from the
# start of this node's content.
#
# Children lying completely inside the range are kept as they are; children
# that only overlap it are replaced by the result of their own +cut+.
#
# @param from [Integer]
# @param to [Integer]
# @return [Array] empty when the node has no content
def cut_content(from, to)
  return [] unless content

  kept = []
  offset = 0
  content.each do |child|
    finish = offset + child.node_size
    if offset >= from && finish <= to
      kept << child
    elsif offset < to && finish > from
      overshoot = [0, finish - to].max
      kept << child.cut([0, from - offset - 1].max, child.node_size - overshoot)
    end
    offset = finish
    break if offset >= to
  end
  kept
end
330
+
331
# Appends this node's path entry (node, index, start offset) to +path+ and,
# unless +pos+ is zero, continues into the children.
#
# @param pos [Integer] position relative to this node
# @param path [Array] accumulator, mutated in place
# @param index [Integer] index of this node within its parent
# @param start_offset [Integer] offset of this node within its parent
def build_path_for_pos(pos, path, index = 0, start_offset = 0)
  path.push(self, index, start_offset)
  traverse_children_for_resolve(pos, path) unless pos.zero?
end
337
+
338
# Finds the first child whose span ends after +pos+ and lets it extend
# +path+; does nothing when there is no content or no such child.
#
# Offsets start at 1 and grow by each child's +node_size+.
#
# @param pos [Integer] position relative to this node
# @param path [Array] accumulator, mutated in place
def traverse_children_for_resolve(pos, path)
  return unless content

  offset = 1
  content.each_with_index do |child, child_index|
    child_end = offset + child.node_size
    if pos < child_end
      # build_path_for_pos is private, hence the send.
      child.send(:build_path_for_pos, pos - offset, path, child_index, offset)
      return
    end

    offset = child_end
  end
end
355
+
356
+ def process_node_attributes(attrs, _node_type)
217
357
  if attrs["attrs"].is_a?(Hash)
218
358
  attrs["attrs"]
219
- elsif node_type == "bullet_list" && attrs["bullet_style"].nil?
220
- nil
221
359
  else
222
360
  attrs
223
361
  end
@@ -28,6 +28,8 @@ module Prosereflect
28
28
 
29
29
  def start=(value)
30
30
  @start = value
31
+ return if value.nil?
32
+
31
33
  self.attrs ||= {}
32
34
  attrs["start"] = value
33
35
  end