markdown-merge 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. checksums.yaml +7 -0
  2. checksums.yaml.gz.sig +0 -0
  3. data/CHANGELOG.md +251 -0
  4. data/CITATION.cff +20 -0
  5. data/CODE_OF_CONDUCT.md +134 -0
  6. data/CONTRIBUTING.md +227 -0
  7. data/FUNDING.md +74 -0
  8. data/LICENSE.txt +21 -0
  9. data/README.md +1087 -0
  10. data/REEK +0 -0
  11. data/RUBOCOP.md +71 -0
  12. data/SECURITY.md +21 -0
  13. data/lib/markdown/merge/cleanse/block_spacing.rb +253 -0
  14. data/lib/markdown/merge/cleanse/code_fence_spacing.rb +294 -0
  15. data/lib/markdown/merge/cleanse/condensed_link_refs.rb +405 -0
  16. data/lib/markdown/merge/cleanse.rb +42 -0
  17. data/lib/markdown/merge/code_block_merger.rb +300 -0
  18. data/lib/markdown/merge/conflict_resolver.rb +128 -0
  19. data/lib/markdown/merge/debug_logger.rb +26 -0
  20. data/lib/markdown/merge/document_problems.rb +190 -0
  21. data/lib/markdown/merge/file_aligner.rb +196 -0
  22. data/lib/markdown/merge/file_analysis.rb +353 -0
  23. data/lib/markdown/merge/file_analysis_base.rb +629 -0
  24. data/lib/markdown/merge/freeze_node.rb +93 -0
  25. data/lib/markdown/merge/gap_line_node.rb +136 -0
  26. data/lib/markdown/merge/link_definition_formatter.rb +49 -0
  27. data/lib/markdown/merge/link_definition_node.rb +157 -0
  28. data/lib/markdown/merge/link_parser.rb +421 -0
  29. data/lib/markdown/merge/link_reference_rehydrator.rb +320 -0
  30. data/lib/markdown/merge/markdown_structure.rb +123 -0
  31. data/lib/markdown/merge/merge_result.rb +166 -0
  32. data/lib/markdown/merge/node_type_normalizer.rb +126 -0
  33. data/lib/markdown/merge/output_builder.rb +166 -0
  34. data/lib/markdown/merge/partial_template_merger.rb +334 -0
  35. data/lib/markdown/merge/smart_merger.rb +221 -0
  36. data/lib/markdown/merge/smart_merger_base.rb +621 -0
  37. data/lib/markdown/merge/table_match_algorithm.rb +504 -0
  38. data/lib/markdown/merge/table_match_refiner.rb +136 -0
  39. data/lib/markdown/merge/version.rb +12 -0
  40. data/lib/markdown/merge/whitespace_normalizer.rb +251 -0
  41. data/lib/markdown/merge.rb +149 -0
  42. data/lib/markdown-merge.rb +4 -0
  43. data/sig/markdown/merge.rbs +341 -0
  44. data.tar.gz.sig +0 -0
  45. metadata +365 -0
  46. metadata.gz.sig +0 -0
@@ -0,0 +1,405 @@
+# frozen_string_literal: true
+
+require "parslet"
+
+module Markdown
+  module Merge
+    module Cleanse
+      # Parslet-based parser for fixing condensed Markdown link reference definitions.
+      #
+      # == The Problem
+      #
+      # This class fixes **corrupted Markdown files** where link reference definitions
+      # that were originally on separate lines got smashed together by having their
+      # separating newlines removed.
+      #
+      # A previous bug in ast-merge caused link reference definitions at the bottom
+      # of Markdown files to be merged together into a single line without newlines
+      # or whitespace between them.
+      #
+      # == Corruption Patterns
+      #
+      # Two types of corruption are detected and fixed:
+      #
+      # 1. **Multiple definitions condensed on one line:**
+      #    - Corrupted: `[label1]: url1[label2]: url2`
+      #    - Fixed: Each definition on its own line
+      #
+      # 2. **Content followed by definition without newline:**
+      #    - Corrupted: `Some text or URL[label]: url`
+      #    - Fixed: Newline inserted before `[label]:`
+      #
+      # @example Condensed definitions (Pattern 1)
+      #   # Before (corrupted):
+      #   "[⛳liberapay-img]: https://example.com/img.svg[⛳liberapay]: https://example.com"
+      #
+      #   # After (fixed):
+      #   "[⛳liberapay-img]: https://example.com/img.svg\n[⛳liberapay]: https://example.com"
+      #
+      # @example Content before definition (Pattern 2)
+      #   # Before (corrupted):
+      #   "https://donate.codeberg.org/[🤝contributing]: CONTRIBUTING.md"
+      #
+      #   # After (fixed):
+      #   "https://donate.codeberg.org/\n[🤝contributing]: CONTRIBUTING.md"
+      #
+      # == How It Works
+      #
+      # The parser uses a **PEG grammar** (via Parslet) to:
+      # - Recognize link reference definition patterns: `[label]: url`
+      # - Detect when multiple definitions are on the same line
+      # - Detect when content precedes a definition without newline separation
+      # - Parse and reconstruct definitions with proper newlines
+      #
+      # **Why PEG?** The previous regex-based implementation had potential ReDoS
+      # (Regular Expression Denial of Service) vulnerabilities due to complex
+      # lookahead/lookbehind patterns. PEG parsers are linear-time and immune to
+      # ReDoS attacks.
+      #
+      # The grammar extends the pattern from {LinkParser::DefinitionGrammar} but
+      # handles the case where definitions are concatenated without separators.
+      #
+      # @example Basic usage
+      #   parser = Markdown::Merge::Cleanse::CondensedLinkRefs.new(condensed_text)
+      #   fixed_text = parser.expand
+      #
+      # @example Check if text contains condensed refs
+      #   parser = Markdown::Merge::Cleanse::CondensedLinkRefs.new(text)
+      #   parser.condensed? # => true/false
+      #
+      # @example Process a file
+      #   content = File.read("README.md")
+      #   parser = Markdown::Merge::Cleanse::CondensedLinkRefs.new(content)
+      #   if parser.condensed?
+      #     File.write("README.md", parser.expand)
+      #   end
+      #
+      # @example Get parsed definitions
+      #   parser = Markdown::Merge::Cleanse::CondensedLinkRefs.new(condensed_text)
+      #   parser.definitions.each do |defn|
+      #     puts "#{defn[:label]} => #{defn[:url]}"
+      #   end
+      #
+      # @see LinkParser For parsing properly-formatted link definitions
+      # @api public
+      class CondensedLinkRefs
+        # Grammar for parsing multiple condensed link reference definitions.
+        #
+        # This grammar handles the specific bug pattern where link definitions
+        # are concatenated without newlines or whitespace between them.
+        #
+        # Key insight: A bare URL ends at any character that's not valid in a URL.
+        # The `[` character that starts the next definition is NOT valid in a bare URL,
+        # so we can use it as the delimiter.
+        #
+        # This PEG grammar is linear-time and cannot have polynomial backtracking,
+        # eliminating ReDoS vulnerabilities.
+        #
+        # @api private
+        class CondensedDefinitionsGrammar < Parslet::Parser
+          rule(:space) { match('[ \t]') }
+          rule(:spaces) { space.repeat(1) }
+          rule(:spaces?) { space.repeat }
+          rule(:newline) { match('[\r\n]') }
+          rule(:newlines?) { newline.repeat }
+
+          # Bracket content: handles nested brackets recursively
+          # Same as LinkParser::DefinitionGrammar
+          rule(:bracket_content) {
+            (
+              str("[") >> bracket_content.maybe >> str("]") |
+              str("]").absent? >> any
+            ).repeat
+          }
+
+          rule(:label) { str("[") >> bracket_content.as(:label) >> str("]") }
+
+          # URL characters - everything except whitespace, >, and [
+          # The [ is excluded because it signals the start of the next definition
+          rule(:url_char) { match('[^\s>\[]') }
+          rule(:bare_url) { url_char.repeat(1) }
+
+          # Angled URLs can contain [ since they're delimited by <>
+          rule(:angled_url_char) { match("[^>]") }
+          rule(:angled_url) { str("<") >> angled_url_char.repeat(1) >> str(">") }
+
+          rule(:url) { (angled_url | bare_url).as(:url) }
+
+          # Title handling (same as LinkParser)
+          rule(:title_content_double) { (str('"').absent? >> any).repeat }
+          rule(:title_content_single) { (str("'").absent? >> any).repeat }
+          rule(:title_content_paren) { (str(")").absent? >> any).repeat }
+
+          rule(:title_double) { str('"') >> title_content_double.as(:title) >> str('"') }
+          rule(:title_single) { str("'") >> title_content_single.as(:title) >> str("'") }
+          rule(:title_paren) { str("(") >> title_content_paren.as(:title) >> str(")") }
+          rule(:title) { title_double | title_single | title_paren }
+
+          # A single definition
+          rule(:definition) {
+            spaces? >>
+            label >>
+            str(":") >>
+            spaces? >>
+            url >>
+            (spaces >> title).maybe >>
+            spaces?
+          }
+
+          # Multiple definitions, possibly with or without newlines between them
+          rule(:definitions) {
+            (definition.as(:definition) >> newlines?).repeat(1)
+          }
+
+          root(:definitions)
+        end
+
+        # @return [String] the input text to parse
+        attr_reader :source
+
+        # Create a new parser for the given text.
+        #
+        # @param source [String] the text that may contain condensed link refs
+        def initialize(source)
+          @source = source.to_s
+          @grammar = CondensedDefinitionsGrammar.new
+          @parsed = nil
+          @definitions = nil
+        end
+
+        # Check if the source contains condensed link reference definitions.
+        #
+        # Detects patterns where link definitions are not properly separated:
+        # 1. Multiple link defs on same line: `[l1]: url1[l2]: url2`
+        # 2. Content followed by link def without newline: `text[label]: url`
+        #
+        # Uses the PEG grammar to parse and detect condensed sequences.
+        #
+        # @return [Boolean] true if condensed refs are detected
+        def condensed?
+          source.each_line do |line|
+            # Pattern 1: Line contains 2+ link definitions (condensed together)
+            return true if contains_multiple_definitions?(line)
+
+            # Pattern 2: Line has content before first link definition
+            # (indicates corruption where newline before def was removed)
+            return true if has_content_before_definition?(line)
+          end
+          false
+        end
+
+        # Parse the source into individual link reference definitions that are condensed.
+        #
+        # This finds link refs that are part of corrupted patterns:
+        # 1. Multiple refs on same line without newlines
+        # 2. Content followed by ref without newline
+        #
+        # Uses the PEG grammar to properly parse link definitions.
+        #
+        # @return [Array<Hash>] Array of { label:, url:, title: (optional) }
+        def definitions
+          return @definitions if @definitions
+
+          @definitions = []
+
+          # Find all condensed sequences line by line
+          source.each_line do |line|
+            # Try to parse as definitions
+            parsed = parse_line(line)
+            next unless parsed && !parsed.empty?
+
+            # Check if line has content before first definition
+            first_bracket = line.index("[")
+            has_prefix = first_bracket && first_bracket > 0 && !line[0...first_bracket].strip.empty?
+
+            # Include if: multiple definitions OR single definition with prefix
+            next unless parsed.size > 1 || has_prefix
+
+            # Extract definition info from parse tree
+            parsed.each do |def_tree|
+              @definitions << extract_definition(def_tree)
+            end
+          end
+
+          @definitions
+        end
+
+        # Expand condensed link reference definitions to separate lines.
+        #
+        # Fixes only the condensed patterns (where a URL is immediately followed
+        # by a new link ref definition without a newline). All other content
+        # is preserved exactly as-is.
+        #
+        # Uses the PEG grammar to properly parse and reconstruct definitions.
+        #
+        # @return [String] the source with condensed link refs expanded to separate lines
+        def expand
+          return source unless condensed?
+
+          lines = source.lines.map do |line|
+            expand_line(line)
+          end
+
+          lines.join
+        end
+
+        # Count the number of link reference definitions in the source.
+        #
+        # @return [Integer] number of link ref definitions found
+        def count
+          definitions.size
+        end
+
+        private
+
+        # Check if a line contains multiple link definitions (condensed).
+        #
+        # @param line [String] the line to check
+        # @return [Boolean] true if line has 2+ definitions
+        def contains_multiple_definitions?(line)
+          parsed = parse_line(line)
+          parsed && parsed.size > 1
+        end
+
+        # Check if a line has content before the first link definition.
+        #
+        # This indicates corruption where a newline was removed between
+        # regular content and a link definition.
+        #
+        # Example: `https://example.com[label]: url` (should be on separate lines)
+        #
+        # @param line [String] the line to check
+        # @return [Boolean] true if there's content before first `[label]:`
+        def has_content_before_definition?(line)
+          # Skip if no link definition pattern
+          return false unless line.include?("]:")
+
+          # Find first occurrence of [label]:
+          first_bracket = line.index("[")
+          return false unless first_bracket
+
+          # Check if there's non-whitespace content before it
+          prefix = line[0...first_bracket].strip
+          return false if prefix.empty?
+
+          # Verify what follows is actually a link definition by trying to parse
+          parsed = parse_line(line)
+          !parsed.nil? && !parsed.empty?
+        end
+
+        # Parse a line into link definitions using PEG grammar.
+        #
+        # Handles lines that may have content before the first definition.
+        # For example: "https://example.com[label]: url.txt"
+        #
+        # @param line [String] the line to parse
+        # @return [Array<Hash>, nil] array of definition parse trees, or nil if parse fails
+        def parse_line(line)
+          # Skip lines that don't look like link definitions
+          return unless line.include?("]:")
+
+          # First, try to find where the first link definition starts
+          # Look for pattern: [anything]:
+          first_bracket = line.index("[")
+          return unless first_bracket
+
+          # Try parsing from the first bracket onward
+          candidate = line[first_bracket..]
+
+          begin
+            tree = @grammar.parse(candidate)
+
+            # Extract the definitions array from parse tree
+            # Parslet returns either a single item or array
+            defs = tree.is_a?(Array) ? tree : [tree]
+
+            # Filter out non-definition nodes and return only definitions
+            defs.select { |node| node.is_a?(Hash) && node.key?(:definition) }
+              .map { |node| node[:definition] }
+          rescue Parslet::ParseFailed
+            nil
+          end
+        end
+
+        # Extract definition data from a parse tree node.
+        #
+        # @param def_tree [Hash] the definition parse tree
+        # @return [Hash] definition with :label and :url
+        def extract_definition(def_tree)
+          label_tree = def_tree[:label]
+          url_tree = def_tree[:url]
+
+          # Convert Parslet slices to strings
+          label = label_tree.is_a?(Array) ? label_tree.map(&:to_s).join : label_tree.to_s
+          url = url_tree.to_s
+
+          {
+            label: label,
+            url: clean_url(url),
+          }
+        end
+
+        # Expand a single line if it contains condensed definitions.
+        #
+        # Handles two cases:
+        # 1. Multiple definitions on same line (always needs expansion)
+        # 2. Single definition with content before it (needs newline before def)
+        #
+        # @param line [String] the line to expand
+        # @return [String] expanded line with newlines between definitions
+        def expand_line(line)
+          parsed = parse_line(line)
+          return line unless parsed && !parsed.empty?
+
+          # Find where the first definition starts
+          first_bracket = line.index("[")
+          prefix = (first_bracket && first_bracket > 0) ? line[0...first_bracket].strip : ""
+
+          # Case 1: Multiple definitions - always expand
+          if parsed.size > 1
+            definitions = parsed.map { |def_tree| reconstruct_definition(def_tree) }
+
+            # First definition gets the prefix if present
+            result = if prefix && !prefix.empty?
+              "#{prefix}\n#{definitions.join("\n")}"
+            else
+              definitions.join("\n")
+            end
+
+            result += "\n" if line.end_with?("\n")
+            return result
+          end
+
+          # Case 2: Single definition with prefix content - add newline before it
+          if parsed.size == 1 && prefix && !prefix.empty?
+            defn = reconstruct_definition(parsed[0])
+            result = "#{prefix}\n#{defn}"
+            result += "\n" if line.end_with?("\n")
+            return result
+          end
+
+          # No expansion needed
+          line
+        end
+
+        # Reconstruct a single definition from parse tree.
+        #
+        # @param def_tree [Hash] the definition parse tree
+        # @return [String] reconstructed definition string
+        def reconstruct_definition(def_tree)
+          defn = extract_definition(def_tree)
+          "[#{defn[:label]}]: #{defn[:url]}"
+        end
+
+        # Clean a URL (strip angle brackets if present).
+        #
+        # @param url [String] the URL to clean
+        # @return [String] cleaned URL
+        def clean_url(url)
+          url = url.strip
+          (url.start_with?("<") && url.end_with?(">")) ? url[1..-2] : url
+        end
+      end
+    end
+  end
+end
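
The public API documented in the class comment above (`condensed?`, `definitions`, `count`, `expand`) can be exercised end to end roughly as follows. This is a minimal sketch, not an excerpt from the gem: it assumes the gem is installed, that `require "markdown/merge"` (which the file list shows the gem ships) loads the Cleanse namespace, and the `README.md` path is hypothetical.

    # Minimal usage sketch of CondensedLinkRefs (assumptions noted above).
    require "markdown/merge"

    content = File.read("README.md")
    parser  = Markdown::Merge::Cleanse::CondensedLinkRefs.new(content)

    if parser.condensed?
      # Inspect what was detected before rewriting the file.
      parser.definitions.each { |defn| puts "#{defn[:label]} => #{defn[:url]}" }
      puts "Expanding #{parser.count} condensed link reference definitions"
      File.write("README.md", parser.expand)
    end

Note that `expand` returns the source unchanged when nothing is condensed, so the `condensed?` guard is a convenience for avoiding an unnecessary write rather than a correctness requirement.
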
@@ -0,0 +1,42 @@
+# frozen_string_literal: true
+
+module Markdown
+  module Merge
+    # Namespace for document cleansing/repair utilities.
+    #
+    # The Cleanse module contains parsers and fixers for repairing malformed
+    # Markdown documents, particularly those affected by previous bugs in
+    # ast-merge or other merge tools.
+    #
+    # @example Fix condensed link reference definitions
+    #   content = File.read("README.md")
+    #   parser = Markdown::Merge::Cleanse::CondensedLinkRefs.new(content)
+    #   if parser.condensed?
+    #     File.write("README.md", parser.expand)
+    #   end
+    #
+    # @example Fix code fence spacing issues
+    #   content = File.read("README.md")
+    #   parser = Markdown::Merge::Cleanse::CodeFenceSpacing.new(content)
+    #   if parser.malformed?
+    #     File.write("README.md", parser.fix)
+    #   end
+    #
+    # @example Fix block element spacing issues
+    #   content = File.read("README.md")
+    #   parser = Markdown::Merge::Cleanse::BlockSpacing.new(content)
+    #   if parser.malformed?
+    #     File.write("README.md", parser.fix)
+    #   end
+    #
+    # @see Cleanse::CondensedLinkRefs For fixing condensed link reference definitions
+    # @see Cleanse::CodeFenceSpacing For fixing code fence spacing issues
+    # @see Cleanse::BlockSpacing For fixing missing blank lines between block elements
+    # @api public
+    module Cleanse
+      autoload :BlockSpacing, "markdown/merge/cleanse/block_spacing"
+      autoload :CodeFenceSpacing, "markdown/merge/cleanse/code_fence_spacing"
+      autoload :CondensedLinkRefs, "markdown/merge/cleanse/condensed_link_refs"
+    end
+  end
+end
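
Per the @example blocks in this file, the three autoloaded cleansers share a check-then-rewrite shape: `condensed?`/`expand` for CondensedLinkRefs and `malformed?`/`fix` for CodeFenceSpacing and BlockSpacing. A rough sketch of chaining all three over one file follows; the method names are taken from those documented examples, while the require path, the file path, and the idea of running them in this particular order are assumptions, not something this diff prescribes.

    # Sketch: apply each documented cleanser in turn (assumptions noted above).
    require "markdown/merge"

    path = "README.md"
    content = File.read(path)

    # Repair link reference definitions that were smashed onto one line.
    refs = Markdown::Merge::Cleanse::CondensedLinkRefs.new(content)
    content = refs.expand if refs.condensed?

    # Repair missing blank lines around code fences and block elements.
    fences = Markdown::Merge::Cleanse::CodeFenceSpacing.new(content)
    content = fences.fix if fences.malformed?

    blocks = Markdown::Merge::Cleanse::BlockSpacing.new(content)
    content = blocks.fix if blocks.malformed?

    File.write(path, content)
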