markdown-merge 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. checksums.yaml +7 -0
  2. checksums.yaml.gz.sig +0 -0
  3. data/CHANGELOG.md +251 -0
  4. data/CITATION.cff +20 -0
  5. data/CODE_OF_CONDUCT.md +134 -0
  6. data/CONTRIBUTING.md +227 -0
  7. data/FUNDING.md +74 -0
  8. data/LICENSE.txt +21 -0
  9. data/README.md +1087 -0
  10. data/REEK +0 -0
  11. data/RUBOCOP.md +71 -0
  12. data/SECURITY.md +21 -0
  13. data/lib/markdown/merge/cleanse/block_spacing.rb +253 -0
  14. data/lib/markdown/merge/cleanse/code_fence_spacing.rb +294 -0
  15. data/lib/markdown/merge/cleanse/condensed_link_refs.rb +405 -0
  16. data/lib/markdown/merge/cleanse.rb +42 -0
  17. data/lib/markdown/merge/code_block_merger.rb +300 -0
  18. data/lib/markdown/merge/conflict_resolver.rb +128 -0
  19. data/lib/markdown/merge/debug_logger.rb +26 -0
  20. data/lib/markdown/merge/document_problems.rb +190 -0
  21. data/lib/markdown/merge/file_aligner.rb +196 -0
  22. data/lib/markdown/merge/file_analysis.rb +353 -0
  23. data/lib/markdown/merge/file_analysis_base.rb +629 -0
  24. data/lib/markdown/merge/freeze_node.rb +93 -0
  25. data/lib/markdown/merge/gap_line_node.rb +136 -0
  26. data/lib/markdown/merge/link_definition_formatter.rb +49 -0
  27. data/lib/markdown/merge/link_definition_node.rb +157 -0
  28. data/lib/markdown/merge/link_parser.rb +421 -0
  29. data/lib/markdown/merge/link_reference_rehydrator.rb +320 -0
  30. data/lib/markdown/merge/markdown_structure.rb +123 -0
  31. data/lib/markdown/merge/merge_result.rb +166 -0
  32. data/lib/markdown/merge/node_type_normalizer.rb +126 -0
  33. data/lib/markdown/merge/output_builder.rb +166 -0
  34. data/lib/markdown/merge/partial_template_merger.rb +334 -0
  35. data/lib/markdown/merge/smart_merger.rb +221 -0
  36. data/lib/markdown/merge/smart_merger_base.rb +621 -0
  37. data/lib/markdown/merge/table_match_algorithm.rb +504 -0
  38. data/lib/markdown/merge/table_match_refiner.rb +136 -0
  39. data/lib/markdown/merge/version.rb +12 -0
  40. data/lib/markdown/merge/whitespace_normalizer.rb +251 -0
  41. data/lib/markdown/merge.rb +149 -0
  42. data/lib/markdown-merge.rb +4 -0
  43. data/sig/markdown/merge.rbs +341 -0
  44. data.tar.gz.sig +0 -0
  45. metadata +365 -0
  46. metadata.gz.sig +0 -0
@@ -0,0 +1,421 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "parslet"
4
+
5
+ module Markdown
6
+ module Merge
7
+ # Parslet-based parser for markdown link structures.
8
+ #
9
+ # This parser extracts:
10
+ # - Link reference definitions: `[label]: url` or `[label]: url "title"`
11
+ # - Inline links: `[text](url)` or `[text](url "title")`
12
+ # - Inline images: `![alt](url)` or `![alt](url "title")`
13
+ # - Linked images: `[![alt](img-url)](link-url)` (nested structures)
14
+ #
15
+ # Handles complex cases like:
16
+ # - Emoji in labels (e.g., `[🖼️galtzo-discord]`)
17
+ # - Nested brackets (for linked images like `[![alt][ref]](url)`)
18
+ # - Multi-byte UTF-8 characters
19
+ #
20
+ # @example Parse link definitions
21
+ # parser = LinkParser.new
22
+ # defs = parser.parse_definitions("[example]: https://example.com\n[🎨logo]: https://logo.png")
23
+ # # => [{ label: "example", url: "https://example.com" }, { label: "🎨logo", url: "https://logo.png" }]
24
+ #
25
+ # @example Find inline links with nested structure detection
26
+ # parser = LinkParser.new
27
+ # items = parser.find_all_link_constructs("Click [![Logo](img.png)](link.com) here")
28
+ # # Returns a tree structure with :children for nested items
29
+ #
30
+ class LinkParser
31
# Parslet grammar recognising one link reference definition line:
# `[label]: url`, optionally followed by a title written in double
# quotes, single quotes, or parentheses.
#
# Named captures: `:label`, `:url` (for the `<...>` form the angle
# brackets are part of the captured text) and `:title` when present.
class DefinitionGrammar < Parslet::Parser
  # Horizontal whitespace only (space or tab).
  rule(:space) { match('[ \t]') }
  rule(:spaces) { space.repeat(1) }
  rule(:spaces?) { space.repeat }

  # Label text: balanced inner bracket pairs nest recursively; apart
  # from that, any character other than "]" is consumed.
  rule(:bracket_content) do
    nested_pair = str("[") >> bracket_content.maybe >> str("]")
    other_char = str("]").absent? >> any
    (nested_pair | other_char).repeat
  end
  rule(:label) { str("[") >> bracket_content.as(:label) >> str("]") }

  # Destination: either `<...>` (anything up to the closing ">") or a
  # bare run of characters that are neither whitespace nor ">".
  rule(:url_char) { match('[^\s>]') }
  rule(:bare_url) { url_char.repeat(1) }
  rule(:angled_url_char) { match("[^>]") }
  rule(:angled_url) { str("<") >> angled_url_char.repeat(1) >> str(">") }
  rule(:url) { (angled_url | bare_url).as(:url) }

  # Title bodies run up to the first occurrence of their closing delimiter.
  rule(:title_content_double) { (str('"').absent? >> any).repeat }
  rule(:title_content_single) { (str("'").absent? >> any).repeat }
  rule(:title_content_paren) { (str(")").absent? >> any).repeat }

  rule(:title_double) { str('"') >> title_content_double.as(:title) >> str('"') }
  rule(:title_single) { str("'") >> title_content_single.as(:title) >> str("'") }
  rule(:title_paren) { str("(") >> title_content_paren.as(:title) >> str(")") }
  rule(:title) { title_double | title_single | title_paren }

  # Whole line: optional indent, label, colon, destination, then an
  # optional title separated by at least one space, then trailing spaces.
  rule(:definition) do
    optional_title = (spaces >> title).maybe
    spaces? >> label >> str(":") >> spaces? >> url >> optional_title >> spaces?
  end

  root(:definition)
end
68
+
69
# Parslet grammar for one complete inline link: `[text](url)` or
# `[text](url "title")`.
#
# Named captures: `:text`, `:url`, and `:title` when a quoted title is given.
class InlineLinkGrammar < Parslet::Parser
  rule(:space) { match('[ \t]') }
  rule(:spaces) { space.repeat(1) }

  # Link text: balanced bracket pairs nest recursively; apart from that,
  # any character other than "]" is consumed.
  rule(:bracket_content) do
    nested_pair = str("[") >> bracket_content.maybe >> str("]")
    other_char = str("]").absent? >> any
    (nested_pair | other_char).repeat
  end
  rule(:link_text) { str("[") >> bracket_content.as(:text) >> str("]") }

  # Destination: balanced parenthesis pairs nest recursively; outside of
  # such pairs, parentheses, whitespace and quote characters are excluded.
  rule(:paren_content) do
    nested_pair = str("(") >> paren_content.maybe >> str(")")
    plain_char = match('[^()\s"\']')
    (nested_pair | plain_char).repeat
  end
  rule(:url) { paren_content.as(:url) }

  # Title bodies run up to the first occurrence of their closing quote.
  rule(:title_content_double) { (str('"').absent? >> any).repeat }
  rule(:title_content_single) { (str("'").absent? >> any).repeat }
  rule(:title_double) { str('"') >> title_content_double.as(:title) >> str('"') }
  rule(:title_single) { str("'") >> title_content_single.as(:title) >> str("'") }
  rule(:title) { title_double | title_single }

  # Parenthesised part: destination plus an optional space-separated title.
  rule(:url_part) do
    optional_title = (spaces >> title).maybe
    str("(") >> url >> optional_title >> str(")")
  end

  rule(:inline_link) { link_text >> url_part }

  root(:inline_link)
end
106
+
107
# Parslet grammar for one complete inline image: `![alt](url)` or
# `![alt](url "title")`.
#
# Named captures: `:alt`, `:url`, and `:title` when a quoted title is given.
class InlineImageGrammar < Parslet::Parser
  rule(:space) { match('[ \t]') }
  rule(:spaces) { space.repeat(1) }

  # Alt text: balanced bracket pairs nest recursively; apart from that,
  # any character other than "]" is consumed.
  rule(:bracket_content) do
    nested_pair = str("[") >> bracket_content.maybe >> str("]")
    other_char = str("]").absent? >> any
    (nested_pair | other_char).repeat
  end
  rule(:alt_text) { str("![") >> bracket_content.as(:alt) >> str("]") }

  # Destination: balanced parenthesis pairs nest recursively; outside of
  # such pairs, parentheses, whitespace and quote characters are excluded.
  rule(:paren_content) do
    nested_pair = str("(") >> paren_content.maybe >> str(")")
    plain_char = match('[^()\s"\']')
    (nested_pair | plain_char).repeat
  end
  rule(:url) { paren_content.as(:url) }

  # Title bodies run up to the first occurrence of their closing quote.
  rule(:title_content_double) { (str('"').absent? >> any).repeat }
  rule(:title_content_single) { (str("'").absent? >> any).repeat }
  rule(:title_double) { str('"') >> title_content_double.as(:title) >> str('"') }
  rule(:title_single) { str("'") >> title_content_single.as(:title) >> str("'") }
  rule(:title) { title_double | title_single }

  # Parenthesised part: destination plus an optional space-separated title.
  rule(:url_part) do
    optional_title = (spaces >> title).maybe
    str("(") >> url >> optional_title >> str(")")
  end

  rule(:inline_image) { alt_text >> url_part }

  root(:inline_image)
end
142
+
143
# Build one grammar instance per construct kind (definition, inline link,
# inline image); the parsing methods of this object reuse them.
def initialize
  @definition_grammar, @link_grammar, @image_grammar =
    DefinitionGrammar.new, InlineLinkGrammar.new, InlineImageGrammar.new
end
148
+
149
# Collect every link reference definition found in the given content.
#
# The content is examined line by line (line terminator removed); lines
# that do not parse as a definition are skipped.
#
# @param content [String] Markdown content
# @return [Array<Hash>] Array of { label:, url:, title: (optional) }
def parse_definitions(content)
  content.each_line.filter_map { |raw_line| parse_definition_line(raw_line.chomp) }
end
163
+
164
# Parse a single line as a link reference definition.
#
# Parslet reports a named capture that matched zero characters as an
# empty Array instead of an empty slice, so calling `to_s` on it directly
# would produce the literal text "[]". Captures are therefore normalised,
# which makes an empty title (`""`, `''`, `()`) or an empty label come
# back as "".
#
# @param line [String] A single line
# @return [Hash, nil] { label:, url:, title: } (:title only when the line
#   carries a title), or nil when the line is not a definition
def parse_definition_line(line)
  parsed = @definition_grammar.parse(line)
  capture_text = ->(capture) { capture.is_a?(Array) ? capture.join : capture.to_s }

  url = capture_text.call(parsed[:url])
  # The grammar keeps the angle brackets of a `<...>` destination; drop them.
  url = url[1..-2] if url.start_with?("<") && url.end_with?(">")

  definition = { label: capture_text.call(parsed[:label]), url: url }
  definition[:title] = capture_text.call(parsed[:title]) if parsed[:title]
  definition
rescue Parslet::ParseFailed
  nil
end
184
+
185
# Locate every inline link (`[text](url)` or `[text](url "title")`) in
# the content, together with its position.
#
# @param content [String] Markdown content
# @return [Array<Hash>] Array of { text:, url:, start_pos:, end_pos:, original: },
#   plus :title for links that carry one
def find_inline_links(content) = find_constructs(content, :link)
192
+
193
# Locate every inline image (`![alt](url)` or `![alt](url "title")`) in
# the content, together with its position.
#
# @param content [String] Markdown content
# @return [Array<Hash>] Array of { alt:, url:, start_pos:, end_pos:, original: },
#   plus :title for images that carry one
def find_inline_images(content) = find_constructs(content, :image)
200
+
201
# Map each URL to the preferred label among the definitions pointing at it.
#
# When several labels share a URL, the shortest one wins; labels of equal
# length are ordered by ordinary string comparison and the first is taken.
#
# @param definitions [Array<Hash>] From parse_definitions
# @return [Hash<String, String>] URL => best label
def build_url_to_label_map(definitions)
  definitions
    .group_by { |defn| defn[:url] }
    .transform_values do |same_url|
      same_url.map { |defn| defn[:label] }.min_by { |label| [label.length, label] }
    end
end
216
+
217
# Find all link constructs (inline links and inline images) and arrange
# them so that nesting is explicit.
#
# A construct that contains others (for example a linked image such as
# `[![alt](img-url)](link-url)`) is returned as a single item whose
# :children holds the contained constructs, which allows replacements to
# be applied from the leaves towards the root.
#
# @param content [String] Markdown content
# @return [Array<Hash>] Array of link/image constructs with :children for nested items
def find_all_link_constructs(content)
  inline_images = find_inline_images(content)
  # Constructs lying inside another construct's span become its :children.
  build_link_tree(find_inline_links(content), inline_images)
end
233
+
234
# Combine links and images into one list ordered by start position,
# attaching nested constructs to the construct that contains them.
#
# Every item is tagged with :type (:link or :image). An item that contains
# other items (they start after it and end no later than it) receives
# them under :children; items starting before that parent's end are then
# left out of the top-level list.
#
# @param links [Array<Hash>] Links with :start_pos and :end_pos
# @param images [Array<Hash>] Images with :start_pos and :end_pos
# @return [Array<Hash>] Links/images with :children for nested items
def build_link_tree(links, images)
  tagged = links.map { |link| link.merge(type: :link) }
  tagged.concat(images.map { |image| image.merge(type: :image) })
  ordered = tagged.sort_by { |entry| entry[:start_pos] }

  covered_until = -1
  ordered.each_with_object([]) do |entry, tree|
    # Anything starting before this offset was already attached as a child.
    next if entry[:start_pos] < covered_until

    nested = ordered.select do |candidate|
      candidate[:start_pos] > entry[:start_pos] &&
        candidate[:end_pos] <= entry[:end_pos] &&
        candidate != entry
    end

    if nested.empty?
      tree << entry
    else
      covered_until = entry[:end_pos]
      tree << entry.merge(children: nested)
    end
  end
end
272
+
273
# Linearise a tree of link constructs so that nested items come before
# the item that contains them (depth-first, post-order).
#
# Useful for replacement operations that must handle the innermost
# constructs first. The :children key is removed from every returned
# item.
#
# @param items [Array<Hash>] Items from find_all_link_constructs
# @return [Array<Hash>] Items in leaf-first order (children before parents)
def flatten_leaf_first(items)
  items.flat_map do |node|
    descendants = node[:children] ? flatten_leaf_first(node[:children]) : []
    descendants << node.except(:children)
  end
end
294
+
295
+ private
296
+
297
# Scan the content left to right for constructs of the requested kind.
#
# Each occurrence of the opening marker ("![" for images, "[" otherwise)
# is handed to try_parse_construct_at. After a successful parse the scan
# resumes at the construct's :end_pos; otherwise it resumes one character
# past the marker.
#
# @param content [String] Markdown content
# @param type [Symbol] :image selects the image grammar; anything else
#   selects the link grammar
# @return [Array<Hash>] parsed constructs in order of appearance
def find_constructs(content, type)
  image_scan = type == :image
  grammar = image_scan ? @image_grammar : @link_grammar
  opener = image_scan ? "![" : "["

  found = []
  cursor = 0
  # String#index yields nil once no opener remains at or after the cursor
  # (including a cursor at the end of the string), which ends the scan.
  while (at = content.index(opener, cursor))
    # For links, a "[" directly preceded by "!" opens an image: skip it.
    image_bracket = type == :link && at > 0 && content[at - 1] == "!"
    construct = image_bracket ? nil : try_parse_construct_at(content, at, grammar, type)

    if construct
      found << construct
      cursor = construct[:end_pos]
    else
      cursor = at + 1
    end
  end

  found
end
325
+
326
# Try to parse one inline link or image whose opening marker sits at
# start_idx.
#
# The candidate span is located first: the "]" matching the opening
# bracket, a "(" directly after it, and the ")" closing that parenthesis.
# Only that span is handed to the grammar.
#
# Parslet reports a named capture that matched zero characters as an
# empty Array instead of an empty slice, so calling `to_s` on it directly
# would produce the literal text "[]" (for example for the alt text of
# `![](img.png)` or the destination of `[text]()`). Captures are
# therefore normalised so that an empty capture becomes "".
#
# @param content [String] Markdown content
# @param start_idx [Integer] offset of the opening "[" (link) or "!" (image)
# @param grammar [#parse] grammar matching the construct kind
# @param type [Symbol] :image for images; anything else is treated as a link
# @return [Hash, nil] { alt: or text:, url:, start_pos:, end_pos:, original: }
#   plus :title when present; nil when no construct starts at start_idx
def try_parse_construct_at(content, start_idx, grammar, type)
  remaining = content[start_idx..]
  image = type == :image

  # Images open with "![", so their bracket scan starts one character in.
  bracket_end = find_bracket_end(remaining, image ? 1 : 0)
  return unless bracket_end

  # The destination must open immediately after the closing "]"; indexing
  # past the end of the string gives nil, which also fails this check.
  paren_start = bracket_end + 1
  return unless remaining[paren_start] == "("

  paren_end = find_paren_end(remaining, paren_start)
  return unless paren_end

  substring = remaining[0..paren_end]
  tree = grammar.parse(substring)

  capture_text = ->(capture) { capture.is_a?(Array) ? capture.join : capture.to_s }
  body_key = image ? :alt : :text

  parsed = {
    body_key => capture_text.call(tree[body_key]),
    url: capture_text.call(tree[:url]),
    start_pos: start_idx,
    end_pos: start_idx + substring.length,
    original: substring,
  }
  parsed[:title] = capture_text.call(tree[:title]) if tree[:title]
  parsed
rescue Parslet::ParseFailed
  nil
end
370
+
371
# Find the "]" that brings the bracket nesting level back to zero.
#
# Scanning starts at start_offset; every "[" raises the level and every
# "]" lowers it. No other character affects the scan.
#
# @param text [String] text to scan
# @param start_offset [Integer] index at which scanning begins
# @return [Integer, nil] index of the balancing "]", or nil when the end
#   of the text is reached first
def find_bracket_end(text, start_offset)
  nesting = 0

  (start_offset...text.length).each do |index|
    char = text[index]
    if char == "["
      nesting += 1
    elsif char == "]"
      nesting -= 1
      return index if nesting.zero?
    end
  end

  nil
end
388
+
389
# Find the ")" that brings the parenthesis nesting level back to zero,
# ignoring parentheses that appear inside a quoted section.
#
# A single or double quote opens a quoted section that lasts until the
# same quote character appears again; inside it, parentheses and the
# other quote character are not interpreted.
#
# @param text [String] text to scan
# @param start_offset [Integer] index at which scanning begins
# @return [Integer, nil] index of the balancing ")", or nil when the end
#   of the text is reached first
def find_paren_end(text, start_offset)
  nesting = 0
  open_quote = nil # the quote character of the current quoted section, if any

  (start_offset...text.length).each do |index|
    char = text[index]

    if open_quote
      open_quote = nil if char == open_quote
    elsif char == '"' || char == "'"
      open_quote = char
    elsif char == "("
      nesting += 1
    elsif char == ")"
      nesting -= 1
      return index if nesting.zero?
    end
  end

  nil
end
419
+ end
420
+ end
421
+ end