markdown-merge 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- checksums.yaml.gz.sig +0 -0
- data/CHANGELOG.md +251 -0
- data/CITATION.cff +20 -0
- data/CODE_OF_CONDUCT.md +134 -0
- data/CONTRIBUTING.md +227 -0
- data/FUNDING.md +74 -0
- data/LICENSE.txt +21 -0
- data/README.md +1087 -0
- data/REEK +0 -0
- data/RUBOCOP.md +71 -0
- data/SECURITY.md +21 -0
- data/lib/markdown/merge/cleanse/block_spacing.rb +253 -0
- data/lib/markdown/merge/cleanse/code_fence_spacing.rb +294 -0
- data/lib/markdown/merge/cleanse/condensed_link_refs.rb +405 -0
- data/lib/markdown/merge/cleanse.rb +42 -0
- data/lib/markdown/merge/code_block_merger.rb +300 -0
- data/lib/markdown/merge/conflict_resolver.rb +128 -0
- data/lib/markdown/merge/debug_logger.rb +26 -0
- data/lib/markdown/merge/document_problems.rb +190 -0
- data/lib/markdown/merge/file_aligner.rb +196 -0
- data/lib/markdown/merge/file_analysis.rb +353 -0
- data/lib/markdown/merge/file_analysis_base.rb +629 -0
- data/lib/markdown/merge/freeze_node.rb +93 -0
- data/lib/markdown/merge/gap_line_node.rb +136 -0
- data/lib/markdown/merge/link_definition_formatter.rb +49 -0
- data/lib/markdown/merge/link_definition_node.rb +157 -0
- data/lib/markdown/merge/link_parser.rb +421 -0
- data/lib/markdown/merge/link_reference_rehydrator.rb +320 -0
- data/lib/markdown/merge/markdown_structure.rb +123 -0
- data/lib/markdown/merge/merge_result.rb +166 -0
- data/lib/markdown/merge/node_type_normalizer.rb +126 -0
- data/lib/markdown/merge/output_builder.rb +166 -0
- data/lib/markdown/merge/partial_template_merger.rb +334 -0
- data/lib/markdown/merge/smart_merger.rb +221 -0
- data/lib/markdown/merge/smart_merger_base.rb +621 -0
- data/lib/markdown/merge/table_match_algorithm.rb +504 -0
- data/lib/markdown/merge/table_match_refiner.rb +136 -0
- data/lib/markdown/merge/version.rb +12 -0
- data/lib/markdown/merge/whitespace_normalizer.rb +251 -0
- data/lib/markdown/merge.rb +149 -0
- data/lib/markdown-merge.rb +4 -0
- data/sig/markdown/merge.rbs +341 -0
- data.tar.gz.sig +0 -0
- metadata +365 -0
- metadata.gz.sig +0 -0
|
@@ -0,0 +1,504 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Markdown
|
|
4
|
+
module Merge
|
|
5
|
+
# Algorithm for computing match scores between two Markdown tables.
|
|
6
|
+
#
|
|
7
|
+
# This algorithm uses multiple factors to determine how well two tables match:
|
|
8
|
+
# - (A) Percentage of matching header cells (using Levenshtein similarity)
|
|
9
|
+
# - (B) Percentage of matching cells in the first column (using Levenshtein similarity)
|
|
10
|
+
# - (C) Average percentage of matching cells in rows with matching first column
|
|
11
|
+
# - (D) Percentage of matching total cells
|
|
12
|
+
# - (E) Position distance weight (closer tables score higher)
|
|
13
|
+
#
|
|
14
|
+
# Cell comparisons use Levenshtein distance to compute similarity, allowing
|
|
15
|
+
# partial matches (e.g., "Value" vs "Values" would get a high similarity score).
|
|
16
|
+
#
|
|
17
|
+
# The final score is the weighted average of these factors.
|
|
18
|
+
#
|
|
19
|
+
# @example Basic usage
|
|
20
|
+
# algorithm = TableMatchAlgorithm.new
|
|
21
|
+
# score = algorithm.call(table_a, table_b)
|
|
22
|
+
#
|
|
23
|
+
# @example With position information
|
|
24
|
+
# algorithm = TableMatchAlgorithm.new(
|
|
25
|
+
# position_a: 0, # First table in template
|
|
26
|
+
# position_b: 2, # Third table in destination
|
|
27
|
+
# total_tables_a: 3,
|
|
28
|
+
# total_tables_b: 3
|
|
29
|
+
# )
|
|
30
|
+
# score = algorithm.call(table_a, table_b)
|
|
31
|
+
class TableMatchAlgorithm
|
|
32
|
+
# Default weights for each factor in the algorithm
|
|
33
|
+
DEFAULT_WEIGHTS = {
|
|
34
|
+
header_match: 0.25, # (A) Header row matching
|
|
35
|
+
first_column: 0.20, # (B) First column matching
|
|
36
|
+
row_content: 0.25, # (C) Content in matching rows
|
|
37
|
+
total_cells: 0.15, # (D) Overall cell matching
|
|
38
|
+
position: 0.15, # (E) Position distance
|
|
39
|
+
}.freeze
|
|
40
|
+
|
|
41
|
+
# Minimum similarity threshold to consider cells as potentially matching
|
|
42
|
+
# for first column lookup (used in row content matching)
|
|
43
|
+
FIRST_COLUMN_SIMILARITY_THRESHOLD = 0.7
|
|
44
|
+
|
|
45
|
+
# @return [Integer, nil] Position of table A in its document (0-indexed)
|
|
46
|
+
attr_reader :position_a
|
|
47
|
+
|
|
48
|
+
# @return [Integer, nil] Position of table B in its document (0-indexed)
|
|
49
|
+
attr_reader :position_b
|
|
50
|
+
|
|
51
|
+
# @return [Integer] Total number of tables in document A
|
|
52
|
+
attr_reader :total_tables_a
|
|
53
|
+
|
|
54
|
+
# @return [Integer] Total number of tables in document B
|
|
55
|
+
attr_reader :total_tables_b
|
|
56
|
+
|
|
57
|
+
# @return [Hash] Weights for each scoring factor
|
|
58
|
+
attr_reader :weights
|
|
59
|
+
|
|
60
|
+
# @return [Symbol] The markdown backend being used
|
|
61
|
+
attr_reader :backend
|
|
62
|
+
|
|
63
|
+
# Initialize the table match algorithm.
#
# @param position_a [Integer, nil] Position of first table in its document
# @param position_b [Integer, nil] Position of second table in its document
# @param total_tables_a [Integer] Total tables in first document (default: 1)
# @param total_tables_b [Integer] Total tables in second document (default: 1)
# @param weights [Hash] Custom weights merged over DEFAULT_WEIGHTS
# @param backend [Symbol] Markdown backend for type normalization (default: :commonmarker)
def initialize(position_a: nil, position_b: nil, total_tables_a: 1, total_tables_b: 1, weights: {}, backend: :commonmarker)
  @position_a = position_a
  @position_b = position_b
  # Clamp totals to at least 1 so position normalization never divides by zero.
  @total_tables_a = (total_tables_a < 1) ? 1 : total_tables_a
  @total_tables_b = (total_tables_b < 1) ? 1 : total_tables_b
  @weights = DEFAULT_WEIGHTS.merge(weights)
  @backend = backend
end
|
|
79
|
+
|
|
80
|
+
# Compute the match score between two tables.
#
# Extracts both tables' cell grids, scores each factor (A-E), and returns
# the weighted average. Tables with no extractable rows never match.
#
# @param table_a [Object] First table node
# @param table_b [Object] Second table node
# @return [Float] Score between 0.0 and 1.0
def call(table_a, table_b)
  a_rows = extract_rows(table_a)
  b_rows = extract_rows(table_b)
  return 0.0 if a_rows.empty? || b_rows.empty?

  factor_scores = {
    header_match: compute_header_match(a_rows, b_rows),
    first_column: compute_first_column_match(a_rows, b_rows),
    row_content: compute_row_content_match(a_rows, b_rows),
    total_cells: compute_total_cells_match(a_rows, b_rows),
    position: compute_position_score,
  }

  weighted_average(factor_scores)
end
|
|
101
|
+
|
|
102
|
+
private
|
|
103
|
+
|
|
104
|
+
# Compute Levenshtein (edit) distance between two strings.
#
# Wagner-Fischer dynamic programming, keeping only the previous row so the
# working storage is O(min(m, n)).
#
# @param str_a [String] First string
# @param str_b [String] Second string
# @return [Integer] Edit distance between the strings
def levenshtein_distance(str_a, str_b)
  return str_b.length if str_a.empty?
  return str_a.length if str_b.empty?

  # Keep the shorter string as `short` so each DP row is as small as possible.
  short, long = (str_a.length <= str_b.length) ? [str_a, str_b] : [str_b, str_a]

  previous = (0..short.length).to_a

  (1..long.length).each do |j|
    current = [j]
    (1..short.length).each do |i|
      substitution_cost = (short[i - 1] == long[j - 1]) ? 0 : 1
      current << [
        current[i - 1] + 1,            # insertion
        previous[i] + 1,               # deletion
        previous[i - 1] + substitution_cost, # substitution
      ].min
    end
    previous = current
  end

  previous[short.length]
end
|
|
144
|
+
|
|
145
|
+
# Compute similarity between two strings via normalized Levenshtein distance.
#
# Values are normalized (stripped, downcased) before comparison; identical
# normalized values (including two empty strings) score 1.0.
#
# @param str_a [String] First string
# @param str_b [String] Second string
# @return [Float] Similarity score between 0.0 and 1.0
def string_similarity(str_a, str_b)
  left = normalize(str_a)
  right = normalize(str_b)

  # Equality covers the both-empty case as well.
  return 1.0 if left == right
  return 0.0 if left.empty? || right.empty?

  longest = [left.length, right.length].max
  1.0 - (levenshtein_distance(left, right).to_f / longest)
end
|
|
163
|
+
|
|
164
|
+
# Extract rows from a table node as arrays of cell text.
#
# Walks the table's direct children via {#next_sibling}, keeping only
# nodes recognized as rows. Subclasses may override for parser-specific
# iteration.
#
# @param table [Object] Table node
# @return [Array<Array<String>>] Array of rows, each row is array of cell texts
def extract_rows(table)
  [].tap do |rows|
    node = table.first_child
    until node.nil?
      rows << extract_cells(node) if table_row_type?(node)
      node = next_sibling(node)
    end
  end
end
|
|
181
|
+
|
|
182
|
+
# Check whether a node is a table row (header rows count as rows).
#
# The raw node type is mapped to a canonical type via NodeTypeNormalizer so
# the check is portable across markdown backends. We use `type` rather than
# `merge_type` because table children (table_row, table_header) are not
# top-level statements and are never wrapped with `merge_type`.
#
# @param node [Object] Node to check
# @return [Boolean] true if this is a table row or header row
def table_row_type?(node)
  return false unless node.respond_to?(:type)

  case NodeTypeNormalizer.canonical_type(node.type, @backend || :commonmarker)
  when :table_row, :table_header then true
  else false
  end
end
|
|
201
|
+
|
|
202
|
+
# Get the next sibling of a node, across parser APIs.
#
# Commonmarker-style nodes expose `next_sibling`; other parsers expose
# `next`. Returns nil when the node supports neither.
#
# @param node [Object] Current node
# @return [Object, nil] Next sibling or nil
def next_sibling(node)
  return node.next_sibling if node.respond_to?(:next_sibling)
  return node.next if node.respond_to?(:next)

  nil
end
|
|
215
|
+
|
|
216
|
+
# Extract cell texts from a table row.
#
# Children whose canonical type (via NodeTypeNormalizer, for backend
# portability) is :table_cell contribute their concatenated text content.
# We use `type` rather than `merge_type` because row children are not
# top-level statements and are never wrapped with `merge_type`.
#
# @param row [Object] Table row node
# @return [Array<String>] Array of cell text contents
def extract_cells(row)
  cells = []
  node = row.first_child
  until node.nil?
    if node.respond_to?(:type) &&
        NodeTypeNormalizer.canonical_type(node.type, @backend || :commonmarker) == :table_cell
      cells << extract_text_content(node)
    end
    node = next_sibling(node)
  end
  cells
end
|
|
242
|
+
|
|
243
|
+
# Extract all text content from a node and its descendants.
#
# Uses recursive traversal instead of `walk` so tree_haver nodes (which
# lack a `walk` method) are supported.
#
# @param node [Object] Node to extract text from
# @return [String] Concatenated, stripped text content
def extract_text_content(node)
  fragments = []
  collect_text_recursive(node, fragments)
  fragments.join.strip
end
|
|
255
|
+
|
|
256
|
+
# Recursively collect text content from a node and its descendants.
#
# Text is taken from nodes whose canonical type (via NodeTypeNormalizer,
# for backend portability) is :text or :code, preferring `string_content`
# over `text` as the accessor. We use `type` rather than `merge_type`
# because inline children are not top-level statements and are never
# wrapped with `merge_type`.
#
# Fix: guard with respond_to?(:type) before calling node.type — the
# sibling helpers (table_row_type?, extract_cells) already guard this way,
# and backends may yield nodes without a #type method, which previously
# raised NoMethodError here. Sibling advancement now reuses #next_sibling
# instead of duplicating its respond_to? cascade inline.
#
# @param node [Object] The node to traverse
# @param text_parts [Array<String>] Array to accumulate text into
# @return [void]
def collect_text_recursive(node, text_parts)
  if node.respond_to?(:type)
    canonical_type = NodeTypeNormalizer.canonical_type(node.type, @backend || :commonmarker)

    if canonical_type == :text || canonical_type == :code
      content = if node.respond_to?(:string_content)
        node.string_content.to_s
      elsif node.respond_to?(:text)
        node.text.to_s
      else
        ""
      end
      text_parts << content unless content.empty?
    end
  end

  # Recurse into children - support both children array and first_child iteration
  if node.respond_to?(:children)
    node.children.each do |child|
      collect_text_recursive(child, text_parts)
    end
  elsif node.respond_to?(:first_child)
    child = node.first_child
    while child
      collect_text_recursive(child, text_parts)
      child = next_sibling(child)
    end
  end
end
|
|
302
|
+
|
|
303
|
+
# (A) Compute header row match percentage using Levenshtein similarity.
#
# Cells are paired positionally; any position present in only one header
# contributes 0.0, so the result is averaged over the wider header.
#
# @param rows_a [Array<Array<String>>] Rows from table A
# @param rows_b [Array<Array<String>>] Rows from table B
# @return [Float] Average similarity of header cells (0.0-1.0)
def compute_header_match(rows_a, rows_b)
  header_a = rows_a.first || []
  header_b = rows_b.first || []

  return 1.0 if header_a.empty? && header_b.empty?
  return 0.0 if header_a.empty? || header_b.empty?

  cell_count = [header_a.size, header_b.size].max

  # Unpaired positions (nil from zip) count as 0.0, which matches dividing
  # the paired total by the larger header width.
  paired_total = header_a.zip(header_b).sum do |a, b|
    (a.nil? || b.nil?) ? 0.0 : string_similarity(a, b)
  end

  paired_total / cell_count
end
|
|
329
|
+
|
|
330
|
+
# (B) Compute first column match percentage using Levenshtein similarity.
#
# Symmetric best-match average: every cell in A contributes its best score
# against B, every cell in B its best score against A, averaged over the
# combined cell count so extra rows on either side dilute the score.
#
# @param rows_a [Array<Array<String>>] Rows from table A
# @param rows_b [Array<Array<String>>] Rows from table B
# @return [Float] Percentage of matching first column cells (0.0-1.0)
def compute_first_column_match(rows_a, rows_b)
  labels_a = rows_a.map(&:first).compact
  labels_b = rows_b.map(&:first).compact

  return 1.0 if labels_a.empty? && labels_b.empty?
  return 0.0 if labels_a.empty? || labels_b.empty?

  forward = labels_a.sum { |a| labels_b.map { |b| string_similarity(a, b) }.max || 0.0 }
  backward = labels_b.sum { |b| labels_a.map { |a| string_similarity(a, b) }.max || 0.0 }

  (forward + backward) / (labels_a.size + labels_b.size)
end
|
|
359
|
+
|
|
360
|
+
# (C) Compute average match percentage for rows with matching first column.
#
# Each row in A is anchored to the row in B whose first column is most
# similar, provided that similarity clears FIRST_COLUMN_SIMILARITY_THRESHOLD;
# anchored pairs are then scored cell-by-cell via #row_match_score.
#
# @param rows_a [Array<Array<String>>] Rows from table A
# @param rows_b [Array<Array<String>>] Rows from table B
# @return [Float] Average percentage of matching cells in linked rows (0.0-1.0)
def compute_row_content_match(rows_a, rows_b)
  return 0.0 if rows_a.empty? || rows_b.empty?

  scores = rows_a.filter_map do |row_a|
    label = row_a.first
    next if label.nil?

    # Best candidate row in B by first-column similarity, above the threshold.
    best_row = nil
    best_similarity = 0.0

    rows_b.each do |row_b|
      other_label = row_b.first
      next if other_label.nil?

      similarity = string_similarity(label, other_label)
      next unless similarity >= FIRST_COLUMN_SIMILARITY_THRESHOLD && similarity > best_similarity

      best_similarity = similarity
      best_row = row_b
    end

    row_match_score(row_a, best_row) if best_row
  end

  return 0.0 if scores.empty?

  scores.sum / scores.size
end
|
|
401
|
+
|
|
402
|
+
# Compute match score between two rows using Levenshtein similarity.
#
# Cells are paired positionally; positions present in only one row count
# as 0.0, so the average is taken over the wider row.
#
# @param row_a [Array<String>] First row
# @param row_b [Array<String>] Second row
# @return [Float] Average similarity of cells (0.0-1.0)
def row_match_score(row_a, row_b)
  width = [row_a.size, row_b.size].max
  return 1.0 if width == 0

  paired_total = row_a.zip(row_b).sum do |a, b|
    (a.nil? || b.nil?) ? 0.0 : string_similarity(a, b)
  end

  paired_total / width
end
|
|
422
|
+
|
|
423
|
+
# (D) Compute total cells match percentage using Levenshtein similarity.
#
# Greedy one-to-one assignment: each cell of A (in order) claims its most
# similar unclaimed cell of B; a claim only counts when its similarity
# exceeds 0.5. The claimed similarities are averaged over the larger
# table's cell count.
#
# @param rows_a [Array<Array<String>>] Rows from table A
# @param rows_b [Array<Array<String>>] Rows from table B
# @return [Float] Percentage of matching total cells (0.0-1.0)
def compute_total_cells_match(rows_a, rows_b)
  cells_a = rows_a.flatten.compact
  cells_b = rows_b.flatten.compact

  return 1.0 if cells_a.empty? && cells_b.empty?
  return 0.0 if cells_a.empty? || cells_b.empty?

  claimed = Set.new

  matched_total = cells_a.sum do |cell_a|
    best_index = nil
    best_score = 0.0

    cells_b.each_with_index do |cell_b, index|
      next if claimed.include?(index)

      score = string_similarity(cell_a, cell_b)
      if score > best_score
        best_score = score
        best_index = index
      end
    end

    if best_index && best_score > 0.5
      claimed << best_index
      best_score
    else
      0.0
    end
  end

  matched_total / [cells_a.size, cells_b.size].max
end
|
|
463
|
+
|
|
464
|
+
# (E) Compute position-based score.
#
# Positions are normalized by their document's table count, so tables at
# the same relative position score 1.0. Unknown positions score a neutral
# 1.0 (no penalty).
#
# @return [Float] Position similarity score (0.0-1.0)
def compute_position_score
  return 1.0 if position_a.nil? || position_b.nil?

  relative_a = position_a.fdiv(total_tables_a)
  relative_b = position_b.fdiv(total_tables_b)

  1.0 - (relative_a - relative_b).abs
end
|
|
482
|
+
|
|
483
|
+
# Normalize a cell value for comparison.
#
# nil becomes "" via to_s; surrounding whitespace is removed and case is
# folded so comparisons are case-insensitive.
#
# @param raw [String, nil] Cell value
# @return [String] Normalized value (stripped, downcased)
def normalize(raw)
  raw.to_s.strip.downcase
end
|
|
490
|
+
|
|
491
|
+
# Compute the weighted average of the per-factor scores.
#
# Factors without a configured weight contribute nothing (weight 0). The
# result is normalized by the total configured weight, so custom weights
# need not sum to 1.0.
#
# @param scores [Hash<Symbol, Float>] Individual scores by factor
# @return [Float] Weighted average score
def weighted_average(scores)
  denominator = weights.values.sum
  return 0.0 if denominator == 0

  numerator = scores.reduce(0.0) do |acc, (factor, value)|
    acc + (value * weights.fetch(factor, 0))
  end

  numerator / denominator
end
|
|
502
|
+
end
|
|
503
|
+
end
|
|
504
|
+
end
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Markdown
|
|
4
|
+
module Merge
|
|
5
|
+
# Match refiner for Markdown tables that didn't match by exact signature.
|
|
6
|
+
#
|
|
7
|
+
# This refiner uses the TableMatchAlgorithm to pair tables that have:
|
|
8
|
+
# - Similar but not identical headers
|
|
9
|
+
# - Similar structure (row/column counts)
|
|
10
|
+
# - Similar content in key columns
|
|
11
|
+
#
|
|
12
|
+
# Tables are matched using a multi-factor scoring algorithm that considers:
|
|
13
|
+
# - Header cell similarity
|
|
14
|
+
# - First column (row label) similarity
|
|
15
|
+
# - Overall content overlap
|
|
16
|
+
# - Position in document
|
|
17
|
+
#
|
|
18
|
+
# @example Basic usage
|
|
19
|
+
# refiner = TableMatchRefiner.new(threshold: 0.5)
|
|
20
|
+
# matches = refiner.call(template_nodes, dest_nodes)
|
|
21
|
+
#
|
|
22
|
+
# @example With custom algorithm options
|
|
23
|
+
# refiner = TableMatchRefiner.new(
|
|
24
|
+
# threshold: 0.6,
|
|
25
|
+
# algorithm_options: {
|
|
26
|
+
# weights: { header_match: 0.4, position: 0.1 }
|
|
27
|
+
# }
|
|
28
|
+
# )
|
|
29
|
+
#
|
|
30
|
+
# @see Ast::Merge::MatchRefinerBase
|
|
31
|
+
# @see TableMatchAlgorithm
|
|
32
|
+
class TableMatchRefiner < Ast::Merge::MatchRefinerBase
|
|
33
|
+
# @return [Hash] Options passed to TableMatchAlgorithm
|
|
34
|
+
attr_reader :algorithm_options
|
|
35
|
+
|
|
36
|
+
# @return [Symbol] The markdown backend being used
|
|
37
|
+
attr_reader :backend
|
|
38
|
+
|
|
39
|
+
# Initialize a table match refiner.
#
# The node_types filter is fixed to [:table]; everything else is forwarded
# to the base refiner.
#
# @param threshold [Float] Minimum score to accept a match (default: DEFAULT_THRESHOLD)
# @param algorithm_options [Hash] Options for TableMatchAlgorithm
# @param backend [Symbol] Markdown backend for type normalization (default: :commonmarker)
def initialize(threshold: DEFAULT_THRESHOLD, algorithm_options: {}, backend: :commonmarker, **options)
  super(threshold: threshold, node_types: [:table], **options)
  @backend = backend
  @algorithm_options = algorithm_options
end
|
|
49
|
+
|
|
50
|
+
# Find matches between unmatched table nodes.
#
# Filters both node lists down to tables, then runs the base class's
# greedy matcher with a similarity block backed by TableMatchAlgorithm
# (including positional information for better matching).
#
# @param template_nodes [Array] Unmatched nodes from template
# @param dest_nodes [Array] Unmatched nodes from destination
# @param context [Hash] Additional context (may contain :template_analysis, :dest_analysis)
# @return [Array<MatchResult>] Array of table matches
def call(template_nodes, dest_nodes, context = {})
  source_tables = extract_tables(template_nodes)
  target_tables = extract_tables(dest_nodes)
  return [] if source_tables.empty? || target_tables.empty?

  source_count = source_tables.size
  target_count = target_tables.size

  greedy_match(source_tables, target_tables) do |t_node, d_node|
    compute_table_similarity(
      t_node,
      d_node,
      source_tables.index(t_node) || 0,
      target_tables.index(d_node) || 0,
      source_count,
      target_count,
    )
  end
end
|
|
73
|
+
|
|
74
|
+
private
|
|
75
|
+
|
|
76
|
+
# Extract table nodes from a collection.
#
# @param nodes [Array] Nodes to filter
# @return [Array] Only the nodes recognized as tables by #table_node?
def extract_tables(nodes)
  nodes.select(&method(:table_node?))
end
|
|
83
|
+
|
|
84
|
+
# Check whether a node is a table.
#
# Handles, in order: typed wrapper nodes (explicit merge type), nodes
# wrapped by NodeTypeNormalizer (merge_type accessor), raw backend nodes
# (type as symbol or string), and finally a class-name heuristic.
#
# @param node [Object] Node to check
# @return [Boolean]
def table_node?(node)
  # Typed wrapper nodes carry an explicit merge type.
  return Ast::Merge::NodeTyping.merge_type_for(node) == :table if Ast::Merge::NodeTyping.typed_node?(node)

  # Wrapped nodes from NodeTypeNormalizer expose merge_type directly.
  return node.merge_type == :table if node.respond_to?(:merge_type) && node.merge_type

  # Raw nodes: stringify the type so both :table and "table" match.
  return node.type.to_s == "table" if node.respond_to?(:type)

  # Fallback heuristic for unknown backends.
  node.class.name.to_s.include?("Table")
end
|
|
112
|
+
|
|
113
|
+
# Compute similarity score between two tables.
#
# Builds a TableMatchAlgorithm seeded with positional context and any
# user-supplied algorithm_options (which may override backend/positions).
#
# @param t_table [Object] Template table
# @param d_table [Object] Destination table
# @param t_idx [Integer] Template table index
# @param d_idx [Integer] Destination table index
# @param total_t [Integer] Total template tables
# @param total_d [Integer] Total destination tables
# @return [Float] Similarity score (0.0-1.0)
def compute_table_similarity(t_table, d_table, t_idx, d_idx, total_t, total_d)
  TableMatchAlgorithm.new(
    position_a: t_idx,
    position_b: d_idx,
    total_tables_a: total_t,
    total_tables_b: total_d,
    backend: @backend,
    **algorithm_options,
  ).call(t_table, d_table)
end
|
|
134
|
+
end
|
|
135
|
+
end
|
|
136
|
+
end
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Markdown
  module Merge
    # Version constants for the markdown-merge gem.
    module Version
      # The gem's current release, following Semantic Versioning.
      VERSION = "1.0.0"
    end

    # Conventional alias so callers can read Markdown::Merge::VERSION directly.
    VERSION = Version::VERSION
  end
end
|