markdown-merge 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. checksums.yaml +7 -0
  2. checksums.yaml.gz.sig +0 -0
  3. data/CHANGELOG.md +251 -0
  4. data/CITATION.cff +20 -0
  5. data/CODE_OF_CONDUCT.md +134 -0
  6. data/CONTRIBUTING.md +227 -0
  7. data/FUNDING.md +74 -0
  8. data/LICENSE.txt +21 -0
  9. data/README.md +1087 -0
  10. data/REEK +0 -0
  11. data/RUBOCOP.md +71 -0
  12. data/SECURITY.md +21 -0
  13. data/lib/markdown/merge/cleanse/block_spacing.rb +253 -0
  14. data/lib/markdown/merge/cleanse/code_fence_spacing.rb +294 -0
  15. data/lib/markdown/merge/cleanse/condensed_link_refs.rb +405 -0
  16. data/lib/markdown/merge/cleanse.rb +42 -0
  17. data/lib/markdown/merge/code_block_merger.rb +300 -0
  18. data/lib/markdown/merge/conflict_resolver.rb +128 -0
  19. data/lib/markdown/merge/debug_logger.rb +26 -0
  20. data/lib/markdown/merge/document_problems.rb +190 -0
  21. data/lib/markdown/merge/file_aligner.rb +196 -0
  22. data/lib/markdown/merge/file_analysis.rb +353 -0
  23. data/lib/markdown/merge/file_analysis_base.rb +629 -0
  24. data/lib/markdown/merge/freeze_node.rb +93 -0
  25. data/lib/markdown/merge/gap_line_node.rb +136 -0
  26. data/lib/markdown/merge/link_definition_formatter.rb +49 -0
  27. data/lib/markdown/merge/link_definition_node.rb +157 -0
  28. data/lib/markdown/merge/link_parser.rb +421 -0
  29. data/lib/markdown/merge/link_reference_rehydrator.rb +320 -0
  30. data/lib/markdown/merge/markdown_structure.rb +123 -0
  31. data/lib/markdown/merge/merge_result.rb +166 -0
  32. data/lib/markdown/merge/node_type_normalizer.rb +126 -0
  33. data/lib/markdown/merge/output_builder.rb +166 -0
  34. data/lib/markdown/merge/partial_template_merger.rb +334 -0
  35. data/lib/markdown/merge/smart_merger.rb +221 -0
  36. data/lib/markdown/merge/smart_merger_base.rb +621 -0
  37. data/lib/markdown/merge/table_match_algorithm.rb +504 -0
  38. data/lib/markdown/merge/table_match_refiner.rb +136 -0
  39. data/lib/markdown/merge/version.rb +12 -0
  40. data/lib/markdown/merge/whitespace_normalizer.rb +251 -0
  41. data/lib/markdown/merge.rb +149 -0
  42. data/lib/markdown-merge.rb +4 -0
  43. data/sig/markdown/merge.rbs +341 -0
  44. data.tar.gz.sig +0 -0
  45. metadata +365 -0
  46. metadata.gz.sig +0 -0
data/lib/markdown/merge/file_aligner.rb
@@ -0,0 +1,196 @@
+# frozen_string_literal: true
+
+module Markdown
+  module Merge
+    # Aligns Markdown block elements between template and destination files.
+    #
+    # Uses structural signatures to match headings, paragraphs, lists, code blocks,
+    # and other block elements. The alignment is then used by SmartMerger to
+    # determine how to combine the files.
+    #
+    # @example Basic usage
+    #   aligner = FileAligner.new(template_analysis, dest_analysis)
+    #   alignment = aligner.align
+    #   alignment.each do |entry|
+    #     case entry[:type]
+    #     when :match
+    #       # Both files have this element
+    #     when :template_only
+    #       # Only in template
+    #     when :dest_only
+    #       # Only in destination
+    #     end
+    #   end
+    #
+    # @see FileAnalysisBase
+    # @see SmartMergerBase
+    class FileAligner
+      # @return [FileAnalysisBase] Template file analysis
+      attr_reader :template_analysis
+
+      # @return [FileAnalysisBase] Destination file analysis
+      attr_reader :dest_analysis
+
+      # @return [#call, nil] Optional match refiner for fuzzy matching
+      attr_reader :match_refiner
+
+      # Initialize a file aligner
+      #
+      # @param template_analysis [FileAnalysisBase] Analysis of the template file
+      # @param dest_analysis [FileAnalysisBase] Analysis of the destination file
+      # @param match_refiner [#call, nil] Optional match refiner for fuzzy matching
+      def initialize(template_analysis, dest_analysis, match_refiner: nil)
+        @template_analysis = template_analysis
+        @dest_analysis = dest_analysis
+        @match_refiner = match_refiner
+      end
+
+      # Perform alignment between template and destination statements
+      #
+      # @return [Array<Hash>] Alignment entries with type, indices, and nodes
+      def align
+        template_statements = @template_analysis.statements
+        dest_statements = @dest_analysis.statements
+
+        # Build signature maps
+        template_by_sig = build_signature_map(template_statements, @template_analysis)
+        dest_by_sig = build_signature_map(dest_statements, @dest_analysis)
+
+        # Track which indices have been matched
+        matched_template = Set.new
+        matched_dest = Set.new
+        alignment = []
+
+        # First pass: find matches by signature
+        template_by_sig.each do |sig, template_indices|
+          next unless dest_by_sig.key?(sig)
+
+          dest_indices = dest_by_sig[sig]
+
+          # Match indices pairwise (first template with first dest, etc.)
+          template_indices.zip(dest_indices).each do |t_idx, d_idx|
+            next unless t_idx && d_idx
+
+            alignment << {
+              type: :match,
+              template_index: t_idx,
+              dest_index: d_idx,
+              signature: sig,
+              template_node: template_statements[t_idx],
+              dest_node: dest_statements[d_idx],
+            }
+
+            matched_template << t_idx
+            matched_dest << d_idx
+          end
+        end
+
+        # Apply match refiner to find additional fuzzy matches
+        if @match_refiner
+          unmatched_t_nodes = template_statements.each_with_index.reject { |_, i| matched_template.include?(i) }.map(&:first)
+          unmatched_d_nodes = dest_statements.each_with_index.reject { |_, i| matched_dest.include?(i) }.map(&:first)
+
+          unless unmatched_t_nodes.empty? || unmatched_d_nodes.empty?
+            refiner_matches = @match_refiner.call(unmatched_t_nodes, unmatched_d_nodes, {
+              template_analysis: @template_analysis,
+              dest_analysis: @dest_analysis,
+            })
+
+            refiner_matches.each do |match|
+              t_idx = template_statements.index(match.template_node)
+              d_idx = dest_statements.index(match.dest_node)
+
+              next unless t_idx && d_idx
+              next if matched_template.include?(t_idx) || matched_dest.include?(d_idx)
+
+              alignment << {
+                type: :match,
+                template_index: t_idx,
+                dest_index: d_idx,
+                signature: [:refined_match, match.score],
+                template_node: match.template_node,
+                dest_node: match.dest_node,
+              }
+
+              matched_template << t_idx
+              matched_dest << d_idx
+            end
+          end
+        end
+
+        # Second pass: add template-only entries
+        template_statements.each_with_index do |stmt, idx|
+          next if matched_template.include?(idx)
+
+          alignment << {
+            type: :template_only,
+            template_index: idx,
+            dest_index: nil,
+            signature: @template_analysis.signature_at(idx),
+            template_node: stmt,
+            dest_node: nil,
+          }
+        end
+
+        # Third pass: add dest-only entries
+        dest_statements.each_with_index do |stmt, idx|
+          next if matched_dest.include?(idx)
+
+          alignment << {
+            type: :dest_only,
+            template_index: nil,
+            dest_index: idx,
+            signature: @dest_analysis.signature_at(idx),
+            template_node: nil,
+            dest_node: stmt,
+          }
+        end
+
+        # Sort by appearance order (destination order for matched/dest-only, then template-only)
+        alignment.sort_by! do |entry|
+          case entry[:type]
+          when :match
+            [0, entry[:dest_index]]
+          when :dest_only
+            [0, entry[:dest_index]]
+          when :template_only
+            [1, entry[:template_index]]
+          else
+            # :nocov: defensive - only :match, :dest_only, :template_only types are created
+            [2, 0] # Unknown types sort last
+            # :nocov:
+          end
+        end
+
+        DebugLogger.debug("Alignment complete", {
+          total: alignment.size,
+          matches: alignment.count { |e| e[:type] == :match },
+          template_only: alignment.count { |e| e[:type] == :template_only },
+          dest_only: alignment.count { |e| e[:type] == :dest_only },
+        })
+
+        alignment
+      end
+
+      private
+
+      # Build a map from signatures to statement indices
+      #
+      # @param statements [Array] List of statements
+      # @param analysis [FileAnalysisBase] Analysis for signature generation
+      # @return [Hash<Array, Array<Integer>>] Map from signature to indices
+      def build_signature_map(statements, analysis)
+        map = Hash.new { |h, k| h[k] = [] }
+
+        statements.each_with_index do |_stmt, idx|
+          sig = analysis.signature_at(idx)
+          # :nocov: defensive - signature_at always returns a value for valid indices
+          map[sig] << idx if sig
+          # :nocov:
+        end
+
+        map
+      end
+    end
+  end
+end
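
FileAligner documents its `match_refiner` only as a duck type: a callable that receives the unmatched template nodes, the unmatched destination nodes, and a context hash, and returns match objects responding to `template_node`, `dest_node`, and `score`. The package ships `table_match_refiner.rb` for this role; the sketch below merely illustrates the documented protocol. The `RefinedMatch` Struct and the same-merge-type heuristic are invented for illustration, and it assumes the statements passed in respond to `merge_type` as shown in the FileAnalysis example below.

```ruby
# Hypothetical refiner: not part of markdown-merge, just the documented duck type.
RefinedMatch = Struct.new(:template_node, :dest_node, :score, keyword_init: true)

type_refiner = lambda do |template_nodes, dest_nodes, _context|
  remaining = dest_nodes.dup

  template_nodes.filter_map do |t_node|
    # Deliberately naive heuristic: pair each leftover template node with the
    # first leftover destination node of the same canonical merge type.
    d_node = remaining.find { |d| d.merge_type == t_node.merge_type }
    next unless d_node

    remaining.delete(d_node)
    RefinedMatch.new(template_node: t_node, dest_node: d_node, score: 0.5)
  end
end

aligner = Markdown::Merge::FileAligner.new(
  template_analysis,
  dest_analysis,
  match_refiner: type_refiner,
)
alignment = aligner.align
```

Any matches such a refiner returns are folded into the alignment with a `[:refined_match, score]` signature, as the `align` method above shows.
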
data/lib/markdown/merge/file_analysis.rb
@@ -0,0 +1,353 @@
+# frozen_string_literal: true
+
+require "digest"
+
+module Markdown
+  module Merge
+    # File analysis for Markdown files using tree_haver backends.
+    #
+    # Extends FileAnalysisBase with backend-agnostic parsing via tree_haver.
+    # Supports both Commonmarker and Markly backends through tree_haver's
+    # unified API.
+    #
+    # Parses Markdown source code and extracts:
+    # - Top-level block elements (headings, paragraphs, lists, code blocks, etc.)
+    # - Freeze blocks marked with HTML comments
+    # - Structural signatures for matching elements between files
+    #
+    # All nodes are wrapped with canonical types via NodeTypeNormalizer,
+    # enabling portable merge rules across backends.
+    #
+    # Freeze blocks are marked with HTML comments:
+    #   <!-- markdown-merge:freeze -->
+    #   ... content to preserve ...
+    #   <!-- markdown-merge:unfreeze -->
+    #
+    # @example Basic usage with auto backend
+    #   analysis = FileAnalysis.new(markdown_source)
+    #   analysis.statements.each do |node|
+    #     puts "#{node.merge_type}: #{node.type}"
+    #   end
+    #
+    # @example With specific backend
+    #   analysis = FileAnalysis.new(markdown_source, backend: :markly)
+    #
+    # @example With custom freeze token
+    #   analysis = FileAnalysis.new(source, freeze_token: "my-merge")
+    #   # Looks for: <!-- my-merge:freeze --> / <!-- my-merge:unfreeze -->
+    #
+    # @see FileAnalysisBase Base class
+    # @see NodeTypeNormalizer Type normalization
+    class FileAnalysis < FileAnalysisBase
+      # Default freeze token for identifying freeze blocks
+      # @return [String]
+      DEFAULT_FREEZE_TOKEN = "markdown-merge"
+
+      # @return [Symbol] The backend being used (:commonmarker, :markly)
+      attr_reader :backend
+
+      # @return [Hash] Parser-specific options
+      attr_reader :parser_options
+
+      # Initialize file analysis with tree_haver backend.
+      #
+      # @param source [String] Markdown source code to analyze
+      # @param backend [Symbol] Backend to use (:commonmarker, :markly, :auto)
+      # @param freeze_token [String] Token for freeze block markers
+      # @param signature_generator [Proc, nil] Custom signature generator
+      # @param parser_options [Hash] Backend-specific parser options
+      #   For commonmarker: { options: {} }
+      #   For markly: { flags: Markly::DEFAULT, extensions: [:table] }
+      def initialize(
+        source,
+        backend: :auto,
+        freeze_token: DEFAULT_FREEZE_TOKEN,
+        signature_generator: nil,
+        **parser_options
+      )
+        @requested_backend = backend
+        @parser_options = parser_options
+
+        # Resolve and initialize the backend
+        @backend = resolve_backend(backend)
+        @parser = create_parser
+
+        super(source, freeze_token: freeze_token, signature_generator: signature_generator)
+      end
+
+      # Parse the source document using tree_haver backend.
+      #
+      # Error handling follows the same pattern as other *-merge gems:
+      # - TreeHaver::Error (which inherits from Exception, not StandardError) is caught
+      # - TreeHaver::NotAvailable is a subclass of TreeHaver::Error, so it's also caught
+      # - When an error occurs, the error is stored in @errors and nil is returned
+      # - SmartMergerBase#parse_and_analyze checks valid? and raises the appropriate parse error
+      #
+      # @param source [String] Markdown source to parse
+      # @return [Object, nil] Root document node from tree_haver, or nil on error
+      def parse_document(source)
+        tree = @parser.parse(source)
+        tree.root_node
+      rescue TreeHaver::Error => e
+        # TreeHaver::Error inherits from Exception, not StandardError.
+        # This also catches TreeHaver::NotAvailable (subclass of Error).
+        @errors << e.message
+        nil
+      end
+
+      # Get the next sibling of a node.
+      #
+      # Handles differences between backends:
+      # - Commonmarker: node.next_sibling
+      # - Markly: node.next
+      #
+      # @param node [Object] Current node
+      # @return [Object, nil] Next sibling or nil
+      def next_sibling(node)
+        # tree_haver normalizes this, but handle both patterns for safety
+        if node.respond_to?(:next_sibling)
+          node.next_sibling
+        elsif node.respond_to?(:next)
+          node.next
+        end
+      end
+
+      # Returns the FreezeNode class to use.
+      #
+      # @return [Class] Markdown::Merge::FreezeNode
+      def freeze_node_class
+        FreezeNode
+      end
+
+      # Check if value is a tree_haver node.
+      #
+      # @param value [Object] Value to check
+      # @return [Boolean] true if this is a parser node
+      def parser_node?(value)
+        # Check for tree_haver node or wrapped node
+        return true if value.respond_to?(:type) && value.respond_to?(:source_position)
+        return true if Ast::Merge::NodeTyping.typed_node?(value)
+
+        false
+      end
+
+      # Override to detect tree_haver nodes for signature generator fallthrough
+      # @param value [Object] The value to check
+      # @return [Boolean] true if this is a fallthrough node
+      def fallthrough_node?(value)
+        Ast::Merge::NodeTyping.typed_node?(value) ||
+          value.is_a?(Ast::Merge::FreezeNodeBase) ||
+          parser_node?(value) ||
+          super
+      end
+
+      # Compute signature for a tree_haver node.
+      #
+      # Uses canonical types from NodeTypeNormalizer for portable signatures.
+      #
+      # @param node [Object] The node (may be wrapped)
+      # @return [Array, nil] Signature array
+      def compute_parser_signature(node)
+        # Get canonical type from wrapper or normalize raw type
+        canonical_type = if Ast::Merge::NodeTyping.typed_node?(node)
+          Ast::Merge::NodeTyping.merge_type_for(node)
+        else
+          NodeTypeNormalizer.canonical_type(node.type, @backend)
+        end
+
+        # Unwrap to access underlying node methods
+        raw_node = Ast::Merge::NodeTyping.unwrap(node)
+
+        case canonical_type
+        when :heading
+          # Content-based: Match headings by level and text content
+          [:heading, raw_node.header_level, extract_text_content(raw_node)]
+        when :paragraph
+          # Content-based: Match paragraphs by content hash (first 32 chars of digest)
+          text = extract_text_content(raw_node)
+          [:paragraph, Digest::SHA256.hexdigest(text)[0, 32]]
+        when :code_block
+          # Content-based: Match code blocks by fence info and content hash
+          content = safe_string_content(raw_node)
+          fence_info = raw_node.respond_to?(:fence_info) ? raw_node.fence_info : nil
+          [:code_block, fence_info, Digest::SHA256.hexdigest(content)[0, 16]]
+        when :list
+          # Structure-based: Match lists by type and item count (content may differ)
+          list_type = raw_node.respond_to?(:list_type) ? raw_node.list_type : nil
+          [:list, list_type, count_children(raw_node)]
+        when :block_quote
+          # Content-based: Match block quotes by content hash
+          text = extract_text_content(raw_node)
+          [:block_quote, Digest::SHA256.hexdigest(text)[0, 16]]
+        when :thematic_break
+          # Structure-based: All thematic breaks are equivalent
+          [:thematic_break]
+        when :html_block
+          # Content-based: Match HTML blocks by content hash
+          content = safe_string_content(raw_node)
+          [:html_block, Digest::SHA256.hexdigest(content)[0, 16]]
+        when :table
+          # Content-based: Match tables by structure and header content
+          header_content = extract_table_header_content(raw_node)
+          [:table, count_children(raw_node), Digest::SHA256.hexdigest(header_content)[0, 16]]
+        when :footnote_definition
+          # Name/label-based: Match footnotes by name or label
+          label = raw_node.respond_to?(:name) ? raw_node.name : safe_string_content(raw_node)
+          [:footnote_definition, label]
+        when :custom_block
+          # Content-based: Match custom blocks by content hash
+          text = extract_text_content(raw_node)
+          [:custom_block, Digest::SHA256.hexdigest(text)[0, 16]]
+        else
+          # Unknown type - use canonical type and position
+          pos = raw_node.source_position
+          [:unknown, canonical_type, pos&.dig(:start_line)]
+        end
+      end
+
+      # Extract all text content from a node and its children.
+      #
+      # Override for tree_haver nodes which don't have a `walk` method.
+      # Uses recursive traversal via `children` instead.
+      #
+      # @param node [Object] The node
+      # @return [String] Concatenated text content
+      def extract_text_content(node)
+        text_parts = []
+        collect_text_recursive(node, text_parts)
+        text_parts.join
+      end
+
+      # Safely get string content from a node.
+      #
+      # Override for tree_haver nodes which use `text` instead of `string_content`.
+      #
+      # @param node [Object] The node
+      # @return [String] String content or empty string
+      def safe_string_content(node)
+        if node.respond_to?(:string_content)
+          node.string_content.to_s
+        elsif node.respond_to?(:text)
+          node.text.to_s
+        else
+          extract_text_content(node)
+        end
+      rescue TypeError, NoMethodError
+        extract_text_content(node)
+      end
+
+      # Collect top-level nodes from document, wrapping with canonical types.
+      #
+      # @return [Array<Object>] Wrapped nodes
+      def collect_top_level_nodes
+        nodes = []
+        child = @document.first_child
+        while child
+          # Wrap each node with its canonical type
+          wrapped = NodeTypeNormalizer.wrap(child, @backend)
+          nodes << wrapped
+          child = next_sibling(child)
+        end
+        nodes
+      end
+
+      private
+
+      # Recursively collect text content from a node and its descendants.
+      #
+      # Uses NodeTypeNormalizer to map backend-specific types to canonical types,
+      # enabling portable type checking across different markdown parsers.
+      #
+      # NOTE: We use `type` here instead of `merge_type` because this method operates
+      # on child nodes (text, code), not top-level statements.
+      # Only top-level statements are wrapped by NodeTypeNormalizer with `merge_type`.
+      # However, we use NodeTypeNormalizer.canonical_type to normalize the raw type.
+      #
+      # @param node [Object] The node to traverse
+      # @param text_parts [Array<String>] Array to accumulate text into
+      # @return [void]
+      def collect_text_recursive(node, text_parts)
+        # Normalize the type using NodeTypeNormalizer for backend portability
+        canonical_type = NodeTypeNormalizer.canonical_type(node.type, @backend)
+
+        # Collect text from text and code nodes
+        if canonical_type == :text || canonical_type == :code
+          content = if node.respond_to?(:string_content)
+            node.string_content.to_s
+          elsif node.respond_to?(:text)
+            node.text.to_s
+          else
+            ""
+          end
+          text_parts << content unless content.empty?
+        end
+
+        # Recurse into children
+        node.children.each do |child|
+          collect_text_recursive(child, text_parts)
+        end
+      end
+
+      # Resolve the backend to use.
+      #
+      # For :auto, attempts commonmarker first, then markly.
+      # tree_haver handles the actual availability checking.
+      #
+      # @param backend [Symbol] Requested backend
+      # @return [Symbol] Resolved backend (:commonmarker or :markly)
+      def resolve_backend(backend)
+        return backend unless backend == :auto
+
+        # Try commonmarker first, then markly
+        if TreeHaver::BackendRegistry.available?(:commonmarker)
+          :commonmarker
+        elsif TreeHaver::BackendRegistry.available?(:markly)
+          :markly
+        else
+          # Let tree_haver raise the appropriate error
+          :commonmarker
+        end
+      end
+
+      # Create a parser for the resolved backend.
+      #
+      # @return [Object] tree_haver parser instance
+      def create_parser
+        case @backend
+        when :commonmarker
+          create_commonmarker_parser
+        when :markly
+          create_markly_parser
+        else
+          raise ArgumentError, "Unknown backend: #{@backend}"
+        end
+      end
+
+      # Create a Commonmarker parser via commonmarker-merge backend.
+      #
+      # @return [Commonmarker::Merge::Backend::Parser]
+      def create_commonmarker_parser
+        parser = Commonmarker::Merge::Backend::Parser.new
+        # Default options enable table extension for GFM compatibility
+        default_options = {extension: {table: true}}
+        options = default_options.merge(@parser_options[:options] || {})
+        parser.language = Commonmarker::Merge::Backend::Language.markdown(options: options)
+        parser
+      end
+
+      # Create a Markly parser via markly-merge backend.
+      #
+      # @return [Markly::Merge::Backend::Parser]
+      def create_markly_parser
+        parser = Markly::Merge::Backend::Parser.new
+        flags = @parser_options[:flags]
+        extensions = @parser_options[:extensions] || [:table]
+        parser.language = Markly::Merge::Backend::Language.markdown(
+          flags: flags,
+          extensions: extensions,
+        )
+        parser
+      end
+    end
+  end
+end
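
To make the freeze-block convention concrete, here is a minimal sketch that assumes only what this file documents: the default "markdown-merge" freeze token, the FileAnalysis constructor shown above, and the statements/merge_type iteration from its own @example. The sample README content and the require path (the gem ships both lib/markdown-merge.rb and lib/markdown/merge.rb) are illustrative.

```ruby
require "markdown-merge"

destination = <<~MARKDOWN
  # My Project

  <!-- markdown-merge:freeze -->
  Hand-written badges and links the template must not overwrite.
  <!-- markdown-merge:unfreeze -->

  ## Usage
MARKDOWN

# :auto picks Commonmarker if available, otherwise Markly.
analysis = Markdown::Merge::FileAnalysis.new(destination, backend: :auto)

analysis.statements.each do |node|
  puts "#{node.merge_type}: #{node.type}"
end
```

The same constructor accepts a custom freeze_token (e.g. "my-merge") when a project wants its own marker comments, as the @example in the class documentation shows.
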