ast-merge 1.1.0 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +0 -0
- data/CHANGELOG.md +198 -7
- data/README.md +208 -39
- data/exe/ast-merge-recipe +366 -0
- data/lib/ast/merge/conflict_resolver_base.rb +8 -1
- data/lib/ast/merge/content_match_refiner.rb +278 -0
- data/lib/ast/merge/debug_logger.rb +2 -1
- data/lib/ast/merge/detector/base.rb +193 -0
- data/lib/ast/merge/detector/fenced_code_block.rb +227 -0
- data/lib/ast/merge/detector/mergeable.rb +369 -0
- data/lib/ast/merge/detector/toml_frontmatter.rb +82 -0
- data/lib/ast/merge/detector/yaml_frontmatter.rb +82 -0
- data/lib/ast/merge/merge_result_base.rb +4 -1
- data/lib/ast/merge/navigable_statement.rb +630 -0
- data/lib/ast/merge/partial_template_merger.rb +432 -0
- data/lib/ast/merge/recipe/config.rb +198 -0
- data/lib/ast/merge/recipe/preset.rb +171 -0
- data/lib/ast/merge/recipe/runner.rb +254 -0
- data/lib/ast/merge/recipe/script_loader.rb +181 -0
- data/lib/ast/merge/recipe.rb +26 -0
- data/lib/ast/merge/rspec/dependency_tags.rb +252 -0
- data/lib/ast/merge/rspec/shared_examples/reproducible_merge.rb +3 -2
- data/lib/ast/merge/rspec.rb +33 -2
- data/lib/ast/merge/smart_merger_base.rb +86 -3
- data/lib/ast/merge/version.rb +1 -1
- data/lib/ast/merge.rb +10 -6
- data/sig/ast/merge.rbs +389 -2
- data.tar.gz.sig +0 -0
- metadata +60 -16
- metadata.gz.sig +0 -0
- data/lib/ast/merge/fenced_code_block_detector.rb +0 -313
- data/lib/ast/merge/region.rb +0 -124
- data/lib/ast/merge/region_detector_base.rb +0 -114
- data/lib/ast/merge/region_mergeable.rb +0 -364
- data/lib/ast/merge/toml_frontmatter_detector.rb +0 -88
- data/lib/ast/merge/yaml_frontmatter_detector.rb +0 -88
|
@@ -0,0 +1,366 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
# frozen_string_literal: true
|
|
3
|
+
|
|
4
|
+
# AST Merge Recipe Runner
|
|
5
|
+
#
|
|
6
|
+
# Run YAML-based merge recipes against target files.
|
|
7
|
+
# This is a shipped executable that can be used after installing the ast-merge gem.
|
|
8
|
+
#
|
|
9
|
+
# Usage:
|
|
10
|
+
# ast-merge-recipe RECIPE_FILE [options]
|
|
11
|
+
#
|
|
12
|
+
# Examples:
|
|
13
|
+
# ast-merge-recipe .merge-recipes/gem_family_section.yml --dry-run
|
|
14
|
+
# ast-merge-recipe .merge-recipes/gem_family_section.yml --verbose --parser=commonmarker
|
|
15
|
+
|
|
16
|
+
require "bundler/inline"
|
|
17
|
+
require "optparse"
|
|
18
|
+
require "yaml"
|
|
19
|
+
|
|
20
|
+
# Parse options first to get merge_gems before bundler/inline
|
|
21
|
+
# Default CLI state. A lightweight pre-parse fills in the recipe file and the
# dev settings so they are known before bundler/inline resolves gems.
options = {
  dry_run: false,
  verbose: false,
  parser: :markly,
  base_dir: Dir.pwd,
  recipe_file: nil,
  merge_gems: [],
  dev_mode: ENV.fetch("KETTLE_RB_DEV", "false").casecmp?("true"),
  dev_root: nil,
}

# Scan the raw arguments for the recipe file and the dev root.
#
# Full option parsing happens later in AstMergeRecipeCLI; this pass only
# extracts what is needed to decide which gems to load.
#
# - `--dev-root=DIR` and `--dev-root DIR` both record the expanded directory
#   and switch dev mode on, as the option's help text promises.
# - The value following `-p`, `--parser`, `-d`, `--base-dir` or `--dev-root`
#   is skipped so it is not mistaken for the recipe file.
#
# @param argv [Array<String>] Raw command-line arguments
# @param options [Hash] Option hash to update in place
# @return [Hash] The same option hash
def preparse_recipe_args(argv, options)
  value_flags = %w[-p --parser -d --base-dir --dev-root]
  pending_flag = nil

  argv.each do |arg|
    if pending_flag
      # This argument is the value of the preceding flag, not a recipe file.
      if pending_flag == "--dev-root"
        options[:dev_root] = File.expand_path(arg)
        options[:dev_mode] = true
      end
      pending_flag = nil
      next
    end

    case arg
    when /\A--dev-root=(.+)\z/
      options[:dev_root] = File.expand_path(Regexp.last_match(1))
      options[:dev_mode] = true
    when *value_flags
      pending_flag = arg
    when /\A-/
      # Remaining options are handled by OptionParser later.
    else
      options[:recipe_file] ||= arg
    end
  end

  options
end

preparse_recipe_args(ARGV, options)
|
|
44
|
+
|
|
45
|
+
# If recipe file specified, try to load merge_gems from it
|
|
46
|
+
# Extra gems requested by the recipe's `merge_gems` key. Read here, ahead of
# the gemfile block, so they can be declared there. Best effort only: any
# failure is ignored because the recipe is loaded and validated again later.
recipe_merge_gems = []
recipe_path = options[:recipe_file]
if recipe_path && File.exist?(recipe_path)
  begin
    parsed_recipe = YAML.safe_load_file(recipe_path, permitted_classes: [Symbol])
    declared_gems = parsed_recipe.is_a?(Hash) ? parsed_recipe["merge_gems"] : nil
    recipe_merge_gems = Array(declared_gems) if declared_gems
  rescue
    # Ignore errors here, we'll catch them later
  end
end
|
|
57
|
+
|
|
58
|
+
# Determine dev root for local gems
|
|
59
|
+
# Dev root comes from the command line first, then AST_MERGE_DEV_ROOT.
# In dev mode with neither set, probe a few likely directories for the
# ast-merge gemspec and use the first one that has it (nil when none do).
dev_root = options[:dev_root] || ENV["AST_MERGE_DEV_ROOT"]
if dev_root.nil? && options[:dev_mode]
  candidate_roots = ["../..", "../../.."].map { |relative| File.expand_path(relative, __FILE__) }
  candidate_roots << Dir.pwd
  dev_root = candidate_roots.find do |candidate|
    File.exist?(File.join(candidate, "ast-merge.gemspec"))
  end
end
|
|
69
|
+
|
|
70
|
+
# Load dependencies via bundler/inline
|
|
71
|
+
# Load dependencies via bundler/inline.
#
# Dev mode (with a known dev root) uses local checkouts; otherwise released
# gems are installed from the configured source. Gems listed under the
# recipe's `merge_gems` key are appended afterwards.
gemfile do
  source "https://gem.coop"

  if options[:dev_mode] && dev_root
    # Development mode - use local gems
    gem "ast-merge", path: dev_root
    gem "tree_haver", path: File.join(dev_root, "vendor/tree_haver")
    gem "markdown-merge", path: File.join(dev_root, "vendor/markdown-merge")
    gem "markly-merge", path: File.join(dev_root, "vendor/markly-merge")
    gem "commonmarker-merge", path: File.join(dev_root, "vendor/commonmarker-merge")
    gem "prism-merge", path: File.join(dev_root, "vendor/prism-merge")
    gem "psych-merge", path: File.join(dev_root, "vendor/psych-merge")
  else
    # Production mode - use released gems
    gem "ast-merge"
    gem "tree_haver"
    gem "markdown-merge"
    gem "markly-merge"
  end

  # Load additional merge gems specified in recipe.
  # Entries are either a bare gem name or a hash with string or symbol keys:
  # name, version, path, git, branch, require.
  recipe_merge_gems.each do |gem_spec|
    case gem_spec
    when String
      gem(gem_spec)
    when Hash
      lookup = ->(key) { gem_spec[key] || gem_spec[key.to_sym] }

      gem_opts = %w[path git branch].each_with_object({}) do |key, opts|
        value = lookup.call(key)
        opts[key.to_sym] = value if value
      end
      # `require` is honoured whenever the key is present, even when falsy,
      # so a recipe can disable auto-require.
      gem_opts[:require] = lookup.call("require") if gem_spec.key?("require") || gem_spec.key?(:require)

      # Bundler takes version requirements as positional arguments;
      # `version:` is not a valid option key for `gem`.
      version_requirements = Array(lookup.call("version"))

      gem(lookup.call("name"), *version_requirements, **gem_opts)
    end
  end

  # Try to load table_tennis for nice output
  gem "table_tennis", require: false
end
|
|
117
|
+
|
|
118
|
+
# Now load the actual libraries
|
|
119
|
+
require "ast-merge"
|
|
120
|
+
|
|
121
|
+
# Try to load table_tennis
|
|
122
|
+
# True when the optional table_tennis gem could be required; the summary
# falls back to plain text output when it is false.
HAS_TABLE_TENNIS =
  begin
    require "table_tennis"
    true
  rescue LoadError
    false
  end
|
|
128
|
+
|
|
129
|
+
# ANSI color helpers
|
|
130
|
+
# ANSI color helpers.
#
# Each helper wraps the given value in an SGR escape sequence followed by a
# reset (\e[0m). The value is interpolated, so non-strings are accepted.
module Colors
  # @return [String] str in green (SGR 32)
  def self.green(str)
    "\e[32m#{str}\e[0m"
  end

  # @return [String] str in red (SGR 31)
  def self.red(str)
    "\e[31m#{str}\e[0m"
  end

  # @return [String] str in yellow (SGR 33)
  def self.yellow(str)
    "\e[33m#{str}\e[0m"
  end

  # @return [String] str in cyan (SGR 36)
  def self.cyan(str)
    "\e[36m#{str}\e[0m"
  end

  # @return [String] str in bold (SGR 1)
  def self.bold(str)
    "\e[1m#{str}\e[0m"
  end

  # @return [String] str dimmed (SGR 2)
  def self.dim(str)
    "\e[2m#{str}\e[0m"
  end
end
|
|
140
|
+
|
|
141
|
+
# Main runner class
|
|
142
|
+
class AstMergeRecipeCLI
|
|
143
|
+
VERSION = Ast::Merge::VERSION
|
|
144
|
+
|
|
145
|
+
# Set up the default option values used until the command line is parsed.
def initialize
  flag_defaults = { dry_run: false, verbose: false, parser: :markly }
  @options = flag_defaults.merge(base_dir: Dir.pwd, recipe_file: nil)
end
|
|
154
|
+
|
|
155
|
+
# Parse the command line, validate it and execute the recipe.
#
# Any command-line usage error (unknown option, missing or invalid argument,
# ambiguous abbreviation, ...) prints the message plus the usage text and
# exits with status 1. Any other StandardError prints the message, plus the
# first five backtrace lines in verbose mode, and exits with status 1.
#
# @param argv [Array<String>] Command-line arguments
# @return [void]
def run(argv = ARGV)
  parse_options(argv)
  validate_options!
  execute_recipe
rescue OptionParser::ParseError => e
  # ParseError is the common superclass of InvalidOption, MissingArgument,
  # InvalidArgument, AmbiguousOption and NeedlessArgument.
  $stderr.puts Colors.red("ERROR: #{e.message}")
  $stderr.puts
  $stderr.puts @option_parser
  exit(1)
rescue => e
  $stderr.puts Colors.red("ERROR: #{e.message}")
  $stderr.puts e.backtrace.first(5).join("\n") if @options[:verbose]
  exit(1)
end
|
|
169
|
+
|
|
170
|
+
private
|
|
171
|
+
|
|
172
|
+
# Build the option parser, apply it to argv and record the results.
#
# The parser is kept in @option_parser so usage text can be printed on
# errors. The first positional argument becomes the recipe file; any further
# positional arguments are reported on stderr and ignored.
#
# @param argv [Array<String>] Command-line arguments
# @return [void]
def parse_options(argv)
  @option_parser = OptionParser.new do |opts|
    opts.banner = "Usage: #{File.basename($0)} RECIPE_FILE [options]"
    [
      "",
      "Run a YAML-based merge recipe against target files.",
      "",
      "Options:",
    ].each { |line| opts.separator(line) }

    opts.on("-n", "--dry-run", "Show what would change without modifying files") { @options[:dry_run] = true }

    opts.on("-v", "--verbose", "Show detailed output") { @options[:verbose] = true }

    opts.on(
      "-p",
      "--parser=PARSER",
      String,
      "Parser to use (markly, commonmarker, prism, psych)",
      "Default: markly",
    ) { |parser_name| @options[:parser] = parser_name.to_sym }

    opts.on(
      "-d",
      "--base-dir=DIR",
      String,
      "Base directory for path resolution",
      "Default: current directory",
    ) { |directory| @options[:base_dir] = File.expand_path(directory) }

    # Declared so OptionParser accepts it; the value itself is consumed by
    # the pre-parse that runs before bundler/inline.
    opts.on(
      "--dev-root=DIR",
      String,
      "Root directory for development gems (implies dev mode)",
    ) { |_directory| }

    opts.on("-V", "--version", "Show version") do
      puts "ast-merge-recipe #{VERSION}"
      exit(0)
    end

    opts.on("-h", "--help", "Show this help message") do
      puts opts
      puts
      puts(
        "Examples:",
        " #{File.basename($0)} .merge-recipes/gem_family_section.yml --dry-run",
        " #{File.basename($0)} recipe.yml --verbose --parser=commonmarker",
      )
      puts
      puts(
        "Recipe YAML format:",
        " See lib/ast/merge/recipe/README.md for full documentation",
      )
      exit(0)
    end
  end

  # Non-option arguments are returned in order; the first is the recipe file.
  positional = @option_parser.parse(argv)
  @options[:recipe_file] = positional.shift

  return if positional.empty?

  $stderr.puts Colors.yellow("WARNING: Ignoring extra arguments: #{positional.join(", ")}")
end
|
|
245
|
+
|
|
246
|
+
# Ensure a recipe file was given and exists, then store its absolute path.
#
# Prints an error to stderr and exits with status 1 when the recipe file is
# missing from the command line (usage text included) or not found on disk.
#
# @return [void]
def validate_options!
  requested = @options[:recipe_file]

  unless requested
    $stderr.puts(Colors.red("ERROR: No recipe file specified"), "", @option_parser)
    exit(1)
  end

  absolute_path = File.expand_path(requested)
  if File.exist?(absolute_path)
    @options[:recipe_file] = absolute_path
  else
    $stderr.puts Colors.red("ERROR: Recipe file not found: #{absolute_path}")
    exit(1)
  end
end
|
|
262
|
+
|
|
263
|
+
# Load the recipe, run it and report per-file results plus a summary.
#
# Exits with status 1 when the runner's summary reports any errors.
#
# @return [void]
def execute_recipe
  print_header

  # Load recipe
  recipe = Ast::Merge::Recipe::Config.load(@options[:recipe_file])
  print_recipe_info(recipe)

  # Create runner, forwarding the CLI settings it understands
  runner_settings = %i[dry_run base_dir parser verbose].to_h { |key| [key, @options[key]] }
  runner = Ast::Merge::Recipe::Runner.new(recipe, **runner_settings)

  # Run and display results
  puts Colors.cyan("Processing files...")
  puts

  runner.run { |result| print_result(result) }

  print_summary(runner)

  # Exit with error if there were failures
  error_count = runner.summary[:errors]
  exit(1) if error_count > 0
end
|
|
292
|
+
|
|
293
|
+
# Print the banner: a 70-character rule, the title, another rule and a
# blank line.
#
# @return [void]
def print_header
  rule = Colors.bold("=" * 70)
  puts rule, Colors.bold("AST Merge Recipe Runner"), rule
  puts
end
|
|
299
|
+
|
|
300
|
+
# Print the recipe name, its description (when present), the run mode and
# the selected parser.
#
# @param recipe [Object] Loaded recipe responding to #name and #description
# @return [void]
def print_recipe_info(recipe)
  puts Colors.cyan("Recipe: #{recipe.name}")
  description = recipe.description
  puts Colors.dim(" #{description}") if description
  puts

  mode_label = @options[:dry_run] ? "DRY RUN" : "LIVE"
  puts Colors.yellow("Mode: #{mode_label}")
  puts Colors.dim("Parser: #{@options[:parser]}")
  puts
end
|
|
308
|
+
|
|
309
|
+
# Print one line for a processed file, plus details where appropriate.
#
# The message is shown in verbose mode or when the status is :error; stats
# and the error's class, message and first three backtrace lines are shown
# in verbose mode only.
#
# @param result [Object] Per-file result responding to #status,
#   #relative_path, #message, #stats and #error
# @return [void]
def print_result(result)
  verbose = @options[:verbose]

  puts " #{status_symbol(result.status)} #{result.relative_path}"

  wants_message = verbose || result.status == :error
  puts Colors.dim(" #{result.message}") if wants_message && result.message

  puts Colors.dim(" Stats: #{result.stats.inspect}") if verbose && result.stats

  return unless result.error && verbose

  puts Colors.red(" #{result.error.class}: #{result.error.message}")
  puts Colors.dim(" #{result.error.backtrace&.first(3)&.join("\n ")}")
end
|
|
326
|
+
|
|
327
|
+
# Print the run summary.
#
# Uses TableTennis for the runner's summary table when the gem was loaded;
# otherwise prints plain counters. The update counter reflects dry-run mode
# and the error counter only appears when there are errors.
#
# @param runner [Object] Runner responding to #summary and #summary_table
# @return [void]
def print_summary(runner)
  divider = Colors.bold("=" * 70)
  puts
  puts divider, Colors.bold("Summary"), divider
  puts

  summary = runner.summary

  if HAS_TABLE_TENNIS
    puts TableTennis.new(runner.summary_table)
  else
    update_line =
      if @options[:dry_run]
        " Would update: #{summary[:would_update]}"
      else
        " Updated: #{summary[:updated]}"
      end

    report = [
      " Total files: #{summary[:total]}",
      update_line,
      " Unchanged: #{summary[:unchanged]}",
      " Skipped (no anchor):#{summary[:skipped]}",
    ]
    report << " Errors: #{summary[:errors]}" if summary[:errors] > 0
    puts report
  end

  puts
end
|
|
352
|
+
|
|
353
|
+
# Map a result status to its colored one-character marker.
#
# @param status [Symbol] One of :updated, :would_update, :unchanged,
#   :skipped or :error
# @return [String] Colored marker, or "?" for any other status
def status_symbol(status)
  color, glyph = {
    updated: [:green, "✓"],
    would_update: [:yellow, "~"],
    unchanged: [:dim, "○"],
    skipped: [:dim, "-"],
    error: [:red, "✗"],
  }[status]

  return "?" unless color

  Colors.public_send(color, glyph)
end
|
|
363
|
+
end
|
|
364
|
+
|
|
365
|
+
# Run the CLI
|
|
366
|
+
AstMergeRecipeCLI.new.run
|
|
@@ -118,6 +118,9 @@ module Ast
|
|
|
118
118
|
# @return [Boolean] Whether to add template-only nodes (batch strategy)
|
|
119
119
|
attr_reader :add_template_only_nodes
|
|
120
120
|
|
|
121
|
+
# @return [Object, nil] Match refiner for fuzzy matching
|
|
122
|
+
attr_reader :match_refiner
|
|
123
|
+
|
|
121
124
|
# Initialize the conflict resolver
|
|
122
125
|
#
|
|
123
126
|
# @param strategy [Symbol] Resolution strategy (:node, :batch, or :boundary)
|
|
@@ -129,7 +132,9 @@ module Ast
|
|
|
129
132
|
# @param template_analysis [Object] Analysis of the template file
|
|
130
133
|
# @param dest_analysis [Object] Analysis of the destination file
|
|
131
134
|
# @param add_template_only_nodes [Boolean] Whether to add nodes only in template (batch/boundary strategy)
|
|
132
|
-
|
|
135
|
+
# @param match_refiner [#call, nil] Optional match refiner for fuzzy matching
|
|
136
|
+
# @param options [Hash] Additional options for forward compatibility
|
|
137
|
+
def initialize(strategy:, preference:, template_analysis:, dest_analysis:, add_template_only_nodes: false, match_refiner: nil, **options)
|
|
133
138
|
unless %i[node batch boundary].include?(strategy)
|
|
134
139
|
raise ArgumentError, "Invalid strategy: #{strategy}. Must be :node, :batch, or :boundary"
|
|
135
140
|
end
|
|
@@ -141,6 +146,8 @@ module Ast
|
|
|
141
146
|
@template_analysis = template_analysis
|
|
142
147
|
@dest_analysis = dest_analysis
|
|
143
148
|
@add_template_only_nodes = add_template_only_nodes
|
|
149
|
+
@match_refiner = match_refiner
|
|
150
|
+
# **options captured for forward compatibility - subclasses may use additional options
|
|
144
151
|
end
|
|
145
152
|
|
|
146
153
|
# Resolve conflicts using the configured strategy
|
|
@@ -0,0 +1,278 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Ast
|
|
4
|
+
module Merge
|
|
5
|
+
# Match refiner for text content-based fuzzy matching.
|
|
6
|
+
#
|
|
7
|
+
# This refiner uses Levenshtein distance to pair nodes that have similar
|
|
8
|
+
# but not identical text content. It's useful for matching nodes where
|
|
9
|
+
# the content has been slightly modified (typos, rewording, etc.).
|
|
10
|
+
#
|
|
11
|
+
# Unlike signature-based matching which requires exact content hashes,
|
|
12
|
+
# this refiner allows fuzzy matching based on text similarity. This is
|
|
13
|
+
# particularly useful for:
|
|
14
|
+
# - Paragraphs with minor edits
|
|
15
|
+
# - Headings with slight rewording
|
|
16
|
+
# - Comments with updated text
|
|
17
|
+
# - Any text-based node type
|
|
18
|
+
#
|
|
19
|
+
# @example Basic usage
|
|
20
|
+
# refiner = ContentMatchRefiner.new(threshold: 0.7)
|
|
21
|
+
# matches = refiner.call(template_nodes, dest_nodes)
|
|
22
|
+
#
|
|
23
|
+
# @example With specific node types
|
|
24
|
+
# # Only match paragraphs and headings
|
|
25
|
+
# refiner = ContentMatchRefiner.new(
|
|
26
|
+
# threshold: 0.6,
|
|
27
|
+
# node_types: [:paragraph, :heading]
|
|
28
|
+
# )
|
|
29
|
+
#
|
|
30
|
+
# @example With custom content extractor
|
|
31
|
+
# refiner = ContentMatchRefiner.new(
|
|
32
|
+
# threshold: 0.7,
|
|
33
|
+
# content_extractor: ->(node) { node.text_content.downcase.strip }
|
|
34
|
+
# )
|
|
35
|
+
#
|
|
36
|
+
# @example Combined with other refiners
|
|
37
|
+
# merger = SmartMerger.new(
|
|
38
|
+
# template,
|
|
39
|
+
# destination,
|
|
40
|
+
# match_refiner: [
|
|
41
|
+
# ContentMatchRefiner.new(threshold: 0.7, node_types: [:paragraph]),
|
|
42
|
+
# TableMatchRefiner.new(threshold: 0.5)
|
|
43
|
+
# ]
|
|
44
|
+
# )
|
|
45
|
+
#
|
|
46
|
+
# @see MatchRefinerBase Base class
|
|
47
|
+
class ContentMatchRefiner < MatchRefinerBase
|
|
48
|
+
# Default weights for content similarity scoring
|
|
49
|
+
DEFAULT_WEIGHTS = {
|
|
50
|
+
content: 0.7, # Text content similarity (Levenshtein)
|
|
51
|
+
length: 0.15, # Length similarity
|
|
52
|
+
position: 0.15, # Position similarity in document
|
|
53
|
+
}.freeze
|
|
54
|
+
|
|
55
|
+
# @return [Hash] Scoring weights
|
|
56
|
+
attr_reader :weights
|
|
57
|
+
|
|
58
|
+
# @return [Proc, nil] Custom content extraction function
|
|
59
|
+
attr_reader :content_extractor
|
|
60
|
+
|
|
61
|
+
# Initialize a content match refiner.
|
|
62
|
+
#
|
|
63
|
+
# @param threshold [Float] Minimum score to accept a match (default: 0.5)
|
|
64
|
+
# @param node_types [Array<Symbol>] Node types to process (empty = all)
|
|
65
|
+
# @param weights [Hash] Custom scoring weights
|
|
66
|
+
# @param content_extractor [Proc, nil] Custom function to extract text from nodes
|
|
67
|
+
# Should accept a node and return a String
|
|
68
|
+
# @param options [Hash] Additional options for forward compatibility
|
|
69
|
+
# Create a content match refiner.
#
# @param threshold [Float] Minimum score to accept a match
# @param node_types [Array<Symbol>] Node types to process (empty = all)
# @param weights [Hash] Overrides merged on top of DEFAULT_WEIGHTS
# @param content_extractor [Proc, nil] Optional callable that takes a node
#   and returns its text
# @param options [Hash] Extra options passed through to the superclass
def initialize(threshold: DEFAULT_THRESHOLD, node_types: [], weights: {}, content_extractor: nil, **options)
  super(threshold: threshold, node_types: node_types, **options)
  @content_extractor = content_extractor
  @weights = DEFAULT_WEIGHTS.merge(weights)
end
|
|
80
|
+
|
|
81
|
+
# Find matches between unmatched nodes based on content similarity.
|
|
82
|
+
#
|
|
83
|
+
# @param template_nodes [Array] Unmatched nodes from template
|
|
84
|
+
# @param dest_nodes [Array] Unmatched nodes from destination
|
|
85
|
+
# @param context [Hash] Additional context (may contain :template_analysis, :dest_analysis)
|
|
86
|
+
# @return [Array<MatchResult>] Array of content-based matches
|
|
87
|
+
# Find matches between unmatched nodes based on content similarity.
#
# Nodes are first filtered by the configured node types. Each candidate pair
# offered by greedy_match is scored from its text content, length and
# relative position within the filtered lists.
#
# @param template_nodes [Array] Unmatched nodes from template
# @param dest_nodes [Array] Unmatched nodes from destination
# @param context [Hash] Additional context (unused here)
# @return [Array<MatchResult>] Array of content-based matches
def call(template_nodes, dest_nodes, context = {})
  template_filtered = filter_nodes(template_nodes)
  dest_filtered = filter_nodes(dest_nodes)

  return [] if template_filtered.empty? || dest_filtered.empty?

  # Positions are looked up once per scored pair, so build them up front
  # instead of scanning the arrays inside the scoring block. Keys compare by
  # identity so two equal-but-distinct nodes keep their own positions.
  template_positions = {}.compare_by_identity
  template_filtered.each_with_index { |node, idx| template_positions[node] ||= idx }
  dest_positions = {}.compare_by_identity
  dest_filtered.each_with_index { |node, idx| dest_positions[node] ||= idx }

  total_template = template_filtered.size
  total_dest = dest_filtered.size

  greedy_match(template_filtered, dest_filtered) do |t_node, d_node|
    # Fall back to an equality scan if a node is not one of the filtered
    # objects themselves.
    t_idx = template_positions.fetch(t_node) { template_filtered.index(t_node) || 0 }
    d_idx = dest_positions.fetch(d_node) { dest_filtered.index(d_node) || 0 }

    compute_content_similarity(
      t_node,
      d_node,
      t_idx,
      d_idx,
      total_template,
      total_dest,
    )
  end
end
|
|
111
|
+
|
|
112
|
+
protected
|
|
113
|
+
|
|
114
|
+
# Filter nodes by configured node types.
|
|
115
|
+
#
|
|
116
|
+
# @param nodes [Array] Nodes to filter
|
|
117
|
+
# @return [Array] Filtered nodes (matching node_types, or all if empty)
|
|
118
|
+
# Keep only the nodes whose type this refiner handles.
#
# @param nodes [Array] Nodes to filter
# @return [Array] The input itself when no node types are configured,
#   otherwise the nodes whose extracted type passes handles_type?
def filter_nodes(nodes)
  if node_types.empty?
    nodes
  else
    nodes.select do |candidate|
      handles_type?(extract_node_type(candidate))
    end
  end
end
|
|
123
|
+
|
|
124
|
+
# Extract the type from a node.
|
|
125
|
+
#
|
|
126
|
+
# Handles wrapped nodes (merge_type) and raw nodes (type).
|
|
127
|
+
#
|
|
128
|
+
# @param node [Object] The node
|
|
129
|
+
# @return [Symbol, nil] The node type
|
|
130
|
+
# Determine the type of a node.
#
# Checked in order: NodeTyping-typed nodes, nodes with a truthy
# #merge_type, then nodes with #type (converted to a Symbol when it is not
# one already).
#
# @param node [Object] The node
# @return [Symbol, nil] The node type, or nil when none can be determined
def extract_node_type(node)
  return NodeTyping.merge_type_for(node) if NodeTyping.typed_node?(node)

  if node.respond_to?(:merge_type)
    wrapped_type = node.merge_type
    return wrapped_type if wrapped_type
  end

  return unless node.respond_to?(:type)

  raw_type = node.type
  raw_type.is_a?(Symbol) ? raw_type : raw_type.to_s.to_sym
end
|
|
140
|
+
|
|
141
|
+
# Extract text content from a node.
|
|
142
|
+
#
|
|
143
|
+
# Uses the custom content_extractor if provided, otherwise tries
|
|
144
|
+
# common methods for getting text content.
|
|
145
|
+
#
|
|
146
|
+
# @param node [Object] The node
|
|
147
|
+
# @return [String] The text content
|
|
148
|
+
# Extract text content from a node.
#
# A configured content_extractor takes precedence and its result is
# returned as-is. Otherwise the first reader the node responds to among
# #text_content, #string_content, #content and #text is used, falling back
# to #to_s.
#
# @param node [Object] The node
# @return [String] The text content
def extract_content(node)
  return @content_extractor.call(node) if @content_extractor

  reader = %i[text_content string_content content text].find { |name| node.respond_to?(name) }
  return node.public_send(reader).to_s if reader

  node.respond_to?(:to_s) ? node.to_s : ""
end
|
|
166
|
+
|
|
167
|
+
# Compute similarity score between two nodes based on content.
|
|
168
|
+
#
|
|
169
|
+
# @param t_node [Object] Template node
|
|
170
|
+
# @param d_node [Object] Destination node
|
|
171
|
+
# @param t_idx [Integer] Template node index
|
|
172
|
+
# @param d_idx [Integer] Destination node index
|
|
173
|
+
# @param total_t [Integer] Total template nodes
|
|
174
|
+
# @param total_d [Integer] Total destination nodes
|
|
175
|
+
# @return [Float] Similarity score (0.0-1.0)
|
|
176
|
+
# Score how similar two nodes are.
#
# The score is the weighted sum of three components: text similarity,
# length similarity and positional similarity.
#
# @param t_node [Object] Template node
# @param d_node [Object] Destination node
# @param t_idx [Integer] Template node index
# @param d_idx [Integer] Destination node index
# @param total_t [Integer] Total template nodes
# @param total_d [Integer] Total destination nodes
# @return [Float] Weighted similarity score
def compute_content_similarity(t_node, d_node, t_idx, d_idx, total_t, total_d)
  template_text = extract_content(t_node)
  dest_text = extract_content(d_node)

  weighted_content = weights[:content] * string_similarity(template_text, dest_text)
  weighted_length = weights[:length] * length_similarity(template_text, dest_text)
  weighted_position = weights[:position] * position_similarity(t_idx, d_idx, total_t, total_d)

  weighted_content + weighted_length + weighted_position
end
|
|
190
|
+
|
|
191
|
+
# Calculate string similarity using Levenshtein distance.
|
|
192
|
+
#
|
|
193
|
+
# @param str1 [String] First string
|
|
194
|
+
# @param str2 [String] Second string
|
|
195
|
+
# @return [Float] Similarity score (0.0-1.0)
|
|
196
|
+
# Similarity of two strings derived from their Levenshtein distance.
#
# Equal strings score 1.0; if exactly one is empty the score is 0.0;
# otherwise it is 1 minus the edit distance divided by the longer length.
#
# @param str1 [String] First string
# @param str2 [String] Second string
# @return [Float] Similarity score (0.0-1.0)
def string_similarity(str1, str2)
  if str1 == str2
    1.0
  elsif str1.empty? || str2.empty?
    0.0
  else
    longest = [str1.length, str2.length].max
    1.0 - (levenshtein_distance(str1, str2).to_f / longest)
  end
end
|
|
204
|
+
|
|
205
|
+
# Calculate length similarity between two strings.
|
|
206
|
+
#
|
|
207
|
+
# @param str1 [String] First string
|
|
208
|
+
# @param str2 [String] Second string
|
|
209
|
+
# @return [Float] Similarity score (0.0-1.0)
|
|
210
|
+
# Calculate length similarity between two strings.
#
# Strings of equal length (including two empty strings) score 1.0;
# otherwise the score is the shorter length divided by the longer one, so
# an empty string against a non-empty one scores 0.0.
#
# @param str1 [String] First string
# @param str2 [String] Second string
# @return [Float] Similarity score (0.0-1.0)
def length_similarity(str1, str2)
  shorter, longer = [str1.length, str2.length].minmax
  # Equal lengths return first, which also covers the both-empty case and
  # keeps the division below away from 0/0.
  return 1.0 if shorter == longer

  shorter.to_f / longer
end
|
|
218
|
+
|
|
219
|
+
# Calculate position similarity in document.
|
|
220
|
+
#
|
|
221
|
+
# Nodes at similar relative positions score higher.
|
|
222
|
+
#
|
|
223
|
+
# @param idx1 [Integer] First node index
|
|
224
|
+
# @param idx2 [Integer] Second node index
|
|
225
|
+
# @param total1 [Integer] Total nodes in first collection
|
|
226
|
+
# @param total2 [Integer] Total nodes in second collection
|
|
227
|
+
# @return [Float] Similarity score (0.0-1.0)
|
|
228
|
+
# Calculate position similarity in document.
#
# Each index is normalized to its relative position in its own collection
# (index / (total - 1), or 0.5 when the collection has at most one element);
# the score is 1 minus the absolute difference of the two positions.
#
# @param idx1 [Integer] First node index
# @param idx2 [Integer] Second node index
# @param total1 [Integer] Total nodes in first collection
# @param total2 [Integer] Total nodes in second collection
# @return [Float] Similarity score (0.0-1.0)
def position_similarity(idx1, idx2, total1, total2)
  relative_position = lambda do |index, total|
    total > 1 ? index.to_f / (total - 1) : 0.5
  end

  gap = relative_position.call(idx1, total1) - relative_position.call(idx2, total2)
  1.0 - gap.abs
end
|
|
235
|
+
|
|
236
|
+
# Calculate Levenshtein distance between two strings.
|
|
237
|
+
#
|
|
238
|
+
# Uses Wagner-Fischer algorithm with space optimization.
|
|
239
|
+
#
|
|
240
|
+
# @param str1 [String] First string
|
|
241
|
+
# @param str2 [String] Second string
|
|
242
|
+
# @return [Integer] Edit distance
|
|
243
|
+
# Calculate Levenshtein distance between two strings.
#
# Uses the Wagner-Fischer algorithm keeping only two rows. The strings are
# split into character arrays once, so each comparison in the inner loop is
# an array lookup rather than a character-offset lookup on the string.
#
# @param str1 [String] First string
# @param str2 [String] Second string
# @return [Integer] Edit distance (insertions, deletions, substitutions)
def levenshtein_distance(str1, str2)
  return str2.length if str1.empty?
  return str1.length if str2.empty?

  # Use the shorter string as columns for space efficiency
  col_chars = str1.chars
  row_chars = str2.chars
  col_chars, row_chars = row_chars, col_chars if col_chars.length > row_chars.length

  width = col_chars.length

  # Only need two rows at a time
  prev_row = (0..width).to_a
  curr_row = Array.new(width + 1, 0)

  row_chars.each_with_index do |row_char, row_idx|
    curr_row[0] = row_idx + 1

    col_chars.each_with_index do |col_char, col_idx|
      cost = col_char == row_char ? 0 : 1
      curr_row[col_idx + 1] = [
        curr_row[col_idx] + 1,     # insertion
        prev_row[col_idx + 1] + 1, # deletion
        prev_row[col_idx] + cost,  # substitution
      ].min
    end

    # The finished row becomes the previous row; the old one is reused.
    prev_row, curr_row = curr_row, prev_row
  end

  prev_row[width]
end
|
|
276
|
+
end
|
|
277
|
+
end
|
|
278
|
+
end
|
|
@@ -71,8 +71,9 @@ module Ast
|
|
|
71
71
|
# @note Shared examples require +silent_stream+ and +rspec-stubbed_env+ gems.
|
|
72
72
|
module DebugLogger
|
|
73
73
|
# Benchmark is optional - gracefully degrade if not available
|
|
74
|
+
# Use autoload to defer loading until actually needed
|
|
74
75
|
BENCHMARK_AVAILABLE = begin
|
|
75
|
-
|
|
76
|
+
autoload(:Benchmark, "benchmark")
|
|
76
77
|
true
|
|
77
78
|
rescue LoadError
|
|
78
79
|
# :nocov:
|