markdown-merge 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- checksums.yaml.gz.sig +0 -0
- data/CHANGELOG.md +251 -0
- data/CITATION.cff +20 -0
- data/CODE_OF_CONDUCT.md +134 -0
- data/CONTRIBUTING.md +227 -0
- data/FUNDING.md +74 -0
- data/LICENSE.txt +21 -0
- data/README.md +1087 -0
- data/REEK +0 -0
- data/RUBOCOP.md +71 -0
- data/SECURITY.md +21 -0
- data/lib/markdown/merge/cleanse/block_spacing.rb +253 -0
- data/lib/markdown/merge/cleanse/code_fence_spacing.rb +294 -0
- data/lib/markdown/merge/cleanse/condensed_link_refs.rb +405 -0
- data/lib/markdown/merge/cleanse.rb +42 -0
- data/lib/markdown/merge/code_block_merger.rb +300 -0
- data/lib/markdown/merge/conflict_resolver.rb +128 -0
- data/lib/markdown/merge/debug_logger.rb +26 -0
- data/lib/markdown/merge/document_problems.rb +190 -0
- data/lib/markdown/merge/file_aligner.rb +196 -0
- data/lib/markdown/merge/file_analysis.rb +353 -0
- data/lib/markdown/merge/file_analysis_base.rb +629 -0
- data/lib/markdown/merge/freeze_node.rb +93 -0
- data/lib/markdown/merge/gap_line_node.rb +136 -0
- data/lib/markdown/merge/link_definition_formatter.rb +49 -0
- data/lib/markdown/merge/link_definition_node.rb +157 -0
- data/lib/markdown/merge/link_parser.rb +421 -0
- data/lib/markdown/merge/link_reference_rehydrator.rb +320 -0
- data/lib/markdown/merge/markdown_structure.rb +123 -0
- data/lib/markdown/merge/merge_result.rb +166 -0
- data/lib/markdown/merge/node_type_normalizer.rb +126 -0
- data/lib/markdown/merge/output_builder.rb +166 -0
- data/lib/markdown/merge/partial_template_merger.rb +334 -0
- data/lib/markdown/merge/smart_merger.rb +221 -0
- data/lib/markdown/merge/smart_merger_base.rb +621 -0
- data/lib/markdown/merge/table_match_algorithm.rb +504 -0
- data/lib/markdown/merge/table_match_refiner.rb +136 -0
- data/lib/markdown/merge/version.rb +12 -0
- data/lib/markdown/merge/whitespace_normalizer.rb +251 -0
- data/lib/markdown/merge.rb +149 -0
- data/lib/markdown-merge.rb +4 -0
- data/sig/markdown/merge.rbs +341 -0
- data.tar.gz.sig +0 -0
- metadata +365 -0
- metadata.gz.sig +0 -0
|
@@ -0,0 +1,421 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "parslet"
|
|
4
|
+
|
|
5
|
+
module Markdown
|
|
6
|
+
module Merge
|
|
7
|
+
# Parslet-based parser for markdown link structures.
|
|
8
|
+
#
|
|
9
|
+
# This parser extracts:
|
|
10
|
+
# - Link reference definitions: `[label]: url` or `[label]: url "title"`
|
|
11
|
+
# - Inline links: `[text](url)` or `[text](url "title")`
|
|
12
|
+
# - Inline images: `![alt](url)` or `![alt](url "title")`
|
|
13
|
+
# - Linked images: `[![alt](img-url)](link-url)` (nested structures)
|
|
14
|
+
#
|
|
15
|
+
# Handles complex cases like:
|
|
16
|
+
# - Emoji in labels (e.g., `[🖼️galtzo-discord]`)
|
|
17
|
+
# - Nested brackets (for linked images like `[![alt][ref]](url)`)
|
|
18
|
+
# - Multi-byte UTF-8 characters
|
|
19
|
+
#
|
|
20
|
+
# @example Parse link definitions
|
|
21
|
+
# parser = LinkParser.new
|
|
22
|
+
# defs = parser.parse_definitions("[example]: https://example.com\n[🎨logo]: https://logo.png")
|
|
23
|
+
# # => [{ label: "example", url: "https://example.com" }, { label: "🎨logo", url: "https://logo.png" }]
|
|
24
|
+
#
|
|
25
|
+
# @example Find inline links with nested structure detection
|
|
26
|
+
# parser = LinkParser.new
|
|
27
|
+
# items = parser.find_all_link_constructs("Click [![alt](img.png)](link.com) here")
|
|
28
|
+
# # Returns a tree structure with :children for nested items
|
|
29
|
+
#
|
|
30
|
+
class LinkParser
|
|
31
|
+
# Parslet grammar for one link reference definition line:
# `[label]: url` or `[label]: <url>`, optionally followed by a title wrapped
# in double quotes, single quotes, or parentheses.
class DefinitionGrammar < Parslet::Parser
  # Horizontal whitespace only (space or tab).
  rule(:space) { match('[ \t]') }
  rule(:spaces) { space.repeat(1) }
  rule(:spaces?) { space.repeat }

  # Everything up to the closing "]" of the label. A balanced inner
  # "[...]" pair is consumed as a unit by recursing into this same rule.
  rule(:bracket_content) do
    nested_pair = str("[") >> bracket_content.maybe >> str("]")
    plain_char = str("]").absent? >> any
    (nested_pair | plain_char).repeat
  end

  rule(:label) do
    str("[") >> bracket_content.as(:label) >> str("]")
  end

  # Destination: either `<...>` (captured including the angle brackets)
  # or a bare run of characters that are neither whitespace nor ">".
  rule(:url_char) { match('[^\s>]') }
  rule(:bare_url) { url_char.repeat(1) }
  rule(:angled_url_char) { match("[^>]") }
  rule(:angled_url) do
    str("<") >> angled_url_char.repeat(1) >> str(">")
  end
  rule(:url) { (angled_url | bare_url).as(:url) }

  # Title bodies run up to the first occurrence of their closing delimiter.
  rule(:title_content_double) { (str('"').absent? >> any).repeat }
  rule(:title_content_single) { (str("'").absent? >> any).repeat }
  rule(:title_content_paren) { (str(")").absent? >> any).repeat }

  rule(:title_double) do
    str('"') >> title_content_double.as(:title) >> str('"')
  end
  rule(:title_single) do
    str("'") >> title_content_single.as(:title) >> str("'")
  end
  rule(:title_paren) do
    str("(") >> title_content_paren.as(:title) >> str(")")
  end
  rule(:title) { title_double | title_single | title_paren }

  # Whole line: optional indent, label, colon, destination, optional title
  # (which must be separated from the destination by whitespace).
  rule(:definition) do
    optional_title = (spaces >> title).maybe
    spaces? >> label >> str(":") >> spaces? >> url >> optional_title >> spaces?
  end

  root(:definition)
end
|
|
68
|
+
|
|
69
|
+
# Parslet grammar for one inline link: [text](url) or [text](url "title").
# The input must be exactly the link; nothing may precede or follow it.
class InlineLinkGrammar < Parslet::Parser
  # Horizontal whitespace separating the URL from an optional title.
  rule(:space) { match('[ \t]') }
  rule(:spaces) { space.repeat(1) }

  # Link text up to the closing "]". A balanced inner "[...]" pair is
  # consumed as a unit by recursing into this same rule.
  rule(:bracket_content) do
    nested_pair = str("[") >> bracket_content.maybe >> str("]")
    plain_char = str("]").absent? >> any
    (nested_pair | plain_char).repeat
  end

  rule(:link_text) do
    str("[") >> bracket_content.as(:text) >> str("]")
  end

  # URL body: balanced "(...)" groups are allowed; whitespace and quote
  # characters are not, so a following title is never swallowed.
  rule(:paren_content) do
    nested_group = str("(") >> paren_content.maybe >> str(")")
    url_char = match('[^()\s"\']')
    (nested_group | url_char).repeat
  end

  rule(:url) { paren_content.as(:url) }

  # Title bodies run up to the first occurrence of their closing quote.
  rule(:title_content_double) { (str('"').absent? >> any).repeat }
  rule(:title_content_single) { (str("'").absent? >> any).repeat }
  rule(:title_double) do
    str('"') >> title_content_double.as(:title) >> str('"')
  end
  rule(:title_single) do
    str("'") >> title_content_single.as(:title) >> str("'")
  end
  rule(:title) { title_double | title_single }

  # "(" url [whitespace title] ")"
  rule(:url_part) do
    optional_title = (spaces >> title).maybe
    str("(") >> url >> optional_title >> str(")")
  end

  rule(:inline_link) { link_text >> url_part }

  root(:inline_link)
end
|
|
106
|
+
|
|
107
|
+
# Parslet grammar for one inline image: ![alt](url) or ![alt](url "title").
# The input must be exactly the image; nothing may precede or follow it.
class InlineImageGrammar < Parslet::Parser
  # Horizontal whitespace separating the URL from an optional title.
  rule(:space) { match('[ \t]') }
  rule(:spaces) { space.repeat(1) }

  # Alt text up to the closing "]". A balanced inner "[...]" pair is
  # consumed as a unit by recursing into this same rule.
  rule(:bracket_content) do
    nested_pair = str("[") >> bracket_content.maybe >> str("]")
    plain_char = str("]").absent? >> any
    (nested_pair | plain_char).repeat
  end

  # The "![" opener is what distinguishes an image from a link.
  rule(:alt_text) do
    str("![") >> bracket_content.as(:alt) >> str("]")
  end

  # URL body: balanced "(...)" groups are allowed; whitespace and quote
  # characters are not, so a following title is never swallowed.
  rule(:paren_content) do
    nested_group = str("(") >> paren_content.maybe >> str(")")
    url_char = match('[^()\s"\']')
    (nested_group | url_char).repeat
  end

  rule(:url) { paren_content.as(:url) }

  # Title bodies run up to the first occurrence of their closing quote.
  rule(:title_content_double) { (str('"').absent? >> any).repeat }
  rule(:title_content_single) { (str("'").absent? >> any).repeat }
  rule(:title_double) do
    str('"') >> title_content_double.as(:title) >> str('"')
  end
  rule(:title_single) do
    str("'") >> title_content_single.as(:title) >> str("'")
  end
  rule(:title) { title_double | title_single }

  # "(" url [whitespace title] ")"
  rule(:url_part) do
    optional_title = (spaces >> title).maybe
    str("(") >> url >> optional_title >> str(")")
  end

  rule(:inline_image) { alt_text >> url_part }

  root(:inline_image)
end
|
|
142
|
+
|
|
143
|
+
# Instantiate each grammar once so every parse call on this parser reuses
# the same grammar objects.
def initialize
  @image_grammar = InlineImageGrammar.new
  @link_grammar = InlineLinkGrammar.new
  @definition_grammar = DefinitionGrammar.new
end
|
|
148
|
+
|
|
149
|
+
# Collect every link reference definition found in the content.
#
# Each line is examined on its own (with its line ending removed); lines
# that are not definitions are ignored.
#
# @param content [String] Markdown content
# @return [Array<Hash>] Array of { label:, url:, title: (optional) }
def parse_definitions(content)
  content.each_line.filter_map { |raw_line| parse_definition_line(raw_line.chomp) }
end
|
|
163
|
+
|
|
164
|
+
# Parse a single line as a link reference definition.
#
# Parslet produces an empty Array (rather than a slice) for a named capture
# that matched zero characters. Captures are therefore normalised before
# being stringified; otherwise an empty title such as `[a]: url ""` would
# be reported as the literal string "[]".
#
# @param line [String] A single line
# @return [Hash, nil] { label:, url:, title: } (title only when one was
#   written) or nil when the line is not a link reference definition
def parse_definition_line(line)
  tree = @definition_grammar.parse(line)

  # Empty named captures arrive as [] -- map them to "" instead of "[]".
  capture_text = ->(capture) { capture.is_a?(Array) ? "" : capture.to_s }

  url = capture_text.call(tree[:url])
  # The grammar captures `<url>` with its angle brackets; strip them.
  url = url[1..-2] if url.start_with?("<") && url.end_with?(">")

  definition = {
    label: capture_text.call(tree[:label]),
    url: url,
  }
  # A title that is present but empty is still truthy here ([]), so the
  # key is kept and its value becomes "".
  definition[:title] = capture_text.call(tree[:title]) if tree[:title]
  definition
rescue Parslet::ParseFailed
  # Not a definition line.
  nil
end
|
|
184
|
+
|
|
185
|
+
# Locate every inline link (`[text](url)`) in the content.
#
# @param content [String] Markdown content
# @return [Array<Hash>] Array of { text:, url:, title:, start_pos:, end_pos: }
def find_inline_links(content) = find_constructs(content, :link)
|
|
192
|
+
|
|
193
|
+
# Locate every inline image (`![alt](url)`) in the content.
#
# @param content [String] Markdown content
# @return [Array<Hash>] Array of { alt:, url:, title:, start_pos:, end_pos: }
def find_inline_images(content) = find_constructs(content, :image)
|
|
200
|
+
|
|
201
|
+
# Map each URL to the single label that should represent it.
#
# When several definitions share a URL, the shortest label wins; labels of
# equal length are ordered by plain string comparison.
#
# @param definitions [Array<Hash>] From parse_definitions
# @return [Hash<String, String>] URL => best label
def build_url_to_label_map(definitions)
  definitions
    .group_by { |definition| definition[:url] }
    .transform_values do |same_url|
      same_url
        .map { |definition| definition[:label] }
        .min_by { |label| [label.length, label] }
    end
end
|
|
216
|
+
|
|
217
|
+
# Find every inline link and inline image, with nesting made explicit.
#
# The result is a list of top-level constructs. A construct that encloses
# others (for example a linked image, where the image sits inside the
# link text) carries them under :children, so replacements can be applied
# from the innermost item outwards.
#
# @param content [String] Markdown content
# @return [Array<Hash>] Array of link/image constructs with :children for nested items
def find_all_link_constructs(content)
  find_inline_images(content).then do |images|
    build_link_tree(find_inline_links(content), images)
  end
end
|
|
233
|
+
|
|
234
|
+
# Build a tree structure from links and images, detecting nesting.
#
# Every item is tagged with :type (:link or :image) and the items are
# ordered by :start_pos. An item whose span lies inside another item's span
# (starts strictly after it and ends no later) is attached to that item
# under :children and is left out of the top-level result.
#
# Only items actually attached as children are left out. An item that
# merely starts inside a parent's span but extends past its end is kept as
# a top-level entry rather than being silently dropped.
#
# @param links [Array<Hash>] Links with :start_pos and :end_pos
# @param images [Array<Hash>] Images with :start_pos and :end_pos
# @return [Array<Hash>] Links/images with :children for nested items
def build_link_tree(links, images)
  tagged = links.map { |link| link.merge(type: :link) } +
    images.map { |image| image.merge(type: :image) }
  ordered = tagged.sort_by { |item| item[:start_pos] }

  # Items already attached to a parent, tracked by object identity so two
  # items with equal contents are never confused with one another.
  attached = {}.compare_by_identity

  ordered.each_with_object([]) do |item, roots|
    next if attached.key?(item)

    # The strict start comparison already excludes the item itself.
    children = ordered.select do |other|
      other[:start_pos] > item[:start_pos] && other[:end_pos] <= item[:end_pos]
    end
    children.each { |child| attached[child] = true }

    roots << (children.empty? ? item : item.merge(children: children))
  end
end
|
|
272
|
+
|
|
273
|
+
# Linearise a tree of link constructs so that nested items come first.
#
# Performs a depth-first, post-order walk: for every item its :children
# (if any) are emitted before the item itself. Emitted items no longer
# carry the :children key.
#
# @param items [Array<Hash>] Items from find_all_link_constructs
# @return [Array<Hash>] Items in leaf-first order (children before parents)
def flatten_leaf_first(items)
  items.flat_map do |item|
    nested = item[:children]
    descendants = nested ? flatten_leaf_first(nested) : []
    descendants << item.except(:children)
  end
end
|
|
294
|
+
|
|
295
|
+
private
|
|
296
|
+
|
|
297
|
+
# Scan the content left to right and collect every construct of one kind.
#
# The scan jumps from one opening marker to the next ("![" for images,
# "[" otherwise). After a successful parse it resumes at the construct's
# end, so constructs of the same kind nested inside it are not reported
# separately; after a failed attempt it resumes one character further on.
#
# @param content [String] Markdown content
# @param type [Symbol] :image to look for images, anything else for links
# @return [Array<Hash>] Parsed constructs in order of appearance
def find_constructs(content, type)
  looking_for_image = type == :image
  grammar = looking_for_image ? @image_grammar : @link_grammar
  marker = looking_for_image ? "![" : "["

  found = []
  cursor = 0

  until cursor >= content.length
    marker_at = content.index(marker, cursor)
    break if marker_at.nil?

    # A "[" directly preceded by "!" opens an image, not a link.
    opens_image = type == :link && marker_at > 0 && content[marker_at - 1] == "!"
    construct = opens_image ? nil : try_parse_construct_at(content, marker_at, grammar, type)

    if construct
      found << construct
      cursor = construct[:end_pos]
    else
      cursor = marker_at + 1
    end
  end

  found
end
|
|
325
|
+
|
|
326
|
+
# Try to parse a link or image whose opening marker sits at start_idx.
#
# The extent of the construct is found first by balancing brackets and
# parentheses; that exact substring is then handed to the grammar.
#
# Parslet produces an empty Array (rather than a slice) for a named capture
# that matched zero characters. Captures are normalised before being
# stringified so that `![](img.png)`, `[](url)` or `[text]()` report an
# empty alt/text/url instead of the literal string "[]".
#
# @param content [String] Full markdown content
# @param start_idx [Integer] Character index of the opening "[" or "!["
# @param grammar [Parslet::Parser] Grammar matching the requested type
# @param type [Symbol] :image for images, anything else for links
# @return [Hash, nil] { alt: or text:, url:, start_pos:, end_pos:, original:,
#   title: (only when written) } or nil when nothing parses at this position
def try_parse_construct_at(content, start_idx, grammar, type)
  remaining = content[start_idx..]
  image = type == :image

  # For images the bracket scan starts after the leading "!".
  bracket_end = find_bracket_end(remaining, image ? 1 : 0)
  return unless bracket_end

  # The "(" must directly follow the "]". Indexing past the end of the
  # string yields nil, so no separate bounds check is needed.
  return unless remaining[bracket_end + 1] == "("

  paren_end = find_paren_end(remaining, bracket_end + 1)
  return unless paren_end

  substring = remaining[0..paren_end]
  tree = grammar.parse(substring)

  # Empty named captures arrive as [] -- map them to "" instead of "[]".
  capture_text = ->(capture) { capture.is_a?(Array) ? "" : capture.to_s }

  content_key = image ? :alt : :text
  parsed = {
    content_key => capture_text.call(tree[content_key]),
    url: capture_text.call(tree[:url]),
    start_pos: start_idx,
    # end_pos is exclusive: the index just past the closing ")".
    end_pos: start_idx + substring.length,
    original: substring,
  }
  # A title that is present but empty is still truthy here ([]), so the
  # key is kept and its value becomes "".
  parsed[:title] = capture_text.call(tree[:title]) if tree[:title]
  parsed
rescue Parslet::ParseFailed
  # The balanced span is not a well-formed link/image.
  nil
end
|
|
370
|
+
|
|
371
|
+
# Find the "]" that balances the bracket nesting, scanning from start_offset.
#
# Every "[" raises the nesting level and every "]" lowers it; the index of
# the "]" that brings the level back to zero is returned.
#
# @param text [String] Text to scan
# @param start_offset [Integer] Character index at which scanning begins
# @return [Integer, nil] Index of the balancing "]", or nil if none is found
def find_bracket_end(text, start_offset)
  open_count = 0

  (start_offset...text.length).each do |index|
    case text[index]
    when "["
      open_count += 1
    when "]"
      open_count -= 1
      return index if open_count.zero?
    end
  end

  nil
end
|
|
388
|
+
|
|
389
|
+
# Find the ")" that balances the parenthesis nesting, scanning from
# start_offset and ignoring parentheses inside quoted text.
#
# A single or double quote opens a quoted run that lasts until the same
# quote character appears again; nothing inside it affects the nesting.
#
# @param text [String] Text to scan
# @param start_offset [Integer] Character index at which scanning begins
# @return [Integer, nil] Index of the balancing ")", or nil if none is found
def find_paren_end(text, start_offset)
  open_count = 0
  active_quote = nil # the quote character currently open, if any

  (start_offset...text.length).each do |index|
    char = text[index]

    if active_quote
      # Inside quotes only the matching quote character matters.
      active_quote = nil if char == active_quote
      next
    end

    case char
    when '"', "'"
      active_quote = char
    when "("
      open_count += 1
    when ")"
      open_count -= 1
      return index if open_count == 0
    end
  end

  nil
end
|
|
419
|
+
end
|
|
420
|
+
end
|
|
421
|
+
end
|