plain-merge 7.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- checksums.yaml.gz.sig +2 -0
- data/lib/plain/merge/version.rb +11 -0
- data/lib/plain/merge.rb +262 -0
- data/lib/plain-merge.rb +3 -0
- data.tar.gz.sig +0 -0
- metadata +110 -0
- metadata.gz.sig +0 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: fd3d000aba244c2628a3216cb39596c0019700c11ae89568efef00a6aa3aa796
|
|
4
|
+
data.tar.gz: 3e2c8c645cc153e6e48194d8ce9860b0adf8c4ce1d5582ab6b9f14d8a85fe2f7
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: f4ac184c99e041ed6b0f0551022fc9e0163b681db893344a07b1f30503220250ffe9fd19092b5b60fe05a5ed78c627fc736ec0c3d7a9884259977a60f03e7278
|
|
7
|
+
data.tar.gz: eaf6d3f57d5cf225ef29e5aa9e0f5021df15d6b74d6aaa2b2d62bf9fa495dfe91fbd784b8775fbe443c38756749ea2c1282b6f784eaf69f2158cb3908f7a54ca
|
checksums.yaml.gz.sig
ADDED
data/lib/plain/merge.rb
ADDED
|
@@ -0,0 +1,262 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "tree_haver"
|
|
4
|
+
|
|
5
|
+
module Plain
  # Plain-text analysis helpers: normalization into blank-line-separated
  # blocks, similarity scoring, block matching, and a simple merge.
  #
  # All public methods are module functions and can be called as
  # +Plain::Merge.method_name+.
  module Merge
    PACKAGE_NAME = "plain-merge"

    # Minimum weighted score a template/destination pair must reach to be
    # accepted in the "refined" (fuzzy) matching phase.
    DEFAULT_TEXT_REFINEMENT_THRESHOLD = 0.7

    # Default weights applied to the content, length and position
    # components in {refined_text_similarity}.
    DEFAULT_TEXT_REFINEMENT_WEIGHTS = {
      content: 0.7,
      length: 0.15,
      position: 0.15
    }.freeze

    module_function

    # Describes the feature profile of the text family.
    #
    # @return [Hash] a fresh hash with +:family+, +:supported_dialects+ and
    #   +:supported_policies+ keys
    def text_feature_profile
      {
        family: "text",
        supported_dialects: [],
        supported_policies: []
      }
    end

    # Builds a TreeHaver parser request for +source+ with language "text".
    #
    # @param source [String] raw text to be parsed
    # @return [TreeHaver::ParserRequest]
    def text_parse_request(source)
      TreeHaver::ParserRequest.new(source: source, language: "text")
    end

    # Normalizes text into canonical form: line endings become "\n", blocks
    # are separated by exactly one blank line, and every run of whitespace
    # inside a block is collapsed to a single space.
    #
    # @param source [String] raw text
    # @return [String] normalized text ("" when +source+ is blank)
    def normalize_text(source)
      source
        .gsub(/\r\n?/, "\n")
        .strip
        .split(/\n\s*\n+/)
        .map { |block| block.strip.gsub(/\s+/, " ") }
        .reject(&:empty?)
        .join("\n\n")
    end

    # Splits text into normalized blocks and records, for each block, its
    # index and its character span within the normalized source.
    #
    # @param source [String] raw text
    # @return [Hash] +{ kind: "text", normalized_source: String, blocks: Array<Hash> }+
    #   where each block is +{ index:, normalized:, span: { start:, end: } }+
    def analyze_text(source)
      normalized_source = normalize_text(source)
      parts = normalized_source.empty? ? [] : normalized_source.split("\n\n")
      cursor = 0

      blocks = parts.each_with_index.map do |normalized, index|
        start_offset = cursor
        end_offset = start_offset + normalized.length
        # Skip the two-character "\n\n" separator that follows each block.
        cursor = end_offset + 2

        {
          index: index,
          normalized: normalized,
          span: {
            start: start_offset,
            end: end_offset
          }
        }
      end

      {
        kind: "text",
        normalized_source: normalized_source,
        blocks: blocks
      }
    end

    # Scores how similar two texts are by comparing their blocks position
    # by position with token-set Jaccard similarity. Blocks without a
    # counterpart on the other side contribute 0 to the average.
    #
    # @param left_source [String]
    # @param right_source [String]
    # @return [Float] score in 0.0..1.0 (1.0 when both texts have no blocks)
    def similarity_score(left_source, right_source)
      left_blocks = analyze_text(left_source)[:blocks]
      right_blocks = analyze_text(right_source)[:blocks]
      total = [left_blocks.length, right_blocks.length].max
      return 1.0 if total.zero?

      # Only positions present on both sides can score above zero.
      paired = [left_blocks.length, right_blocks.length].min
      sum = 0.0
      paired.times do |index|
        sum += jaccard(left_blocks[index][:normalized], right_blocks[index][:normalized])
      end

      sum / total
    end

    # Compares two texts against a similarity threshold.
    #
    # @param left_source [String]
    # @param right_source [String]
    # @param threshold [Numeric] minimum score counted as a match
    # @return [Hash] +{ score: Float, threshold: Numeric, matched: Boolean }+
    def is_similar(left_source, right_source, threshold)
      score = similarity_score(left_source, right_source)
      {
        score: score,
        threshold: threshold,
        matched: score >= threshold
      }
    end

    # Matches destination blocks to template blocks. Exact matches (equal
    # normalized text) are taken first; remaining destination blocks are
    # then paired with the best-scoring unmatched template block whose
    # refined similarity reaches DEFAULT_TEXT_REFINEMENT_THRESHOLD.
    #
    # @param template_source [String]
    # @param destination_source [String]
    # @return [Hash] +{ matched: Array<Hash>, unmatched_template: Array<Integer>,
    #   unmatched_destination: Array<Integer> }+; each match is
    #   +{ template_index:, destination_index:, phase: "exact"|"refined", score: }+
    def match_text_blocks(template_source, destination_source)
      match_analyzed_blocks(
        analyze_text(template_source)[:blocks],
        analyze_text(destination_source)[:blocks]
      )
    end

    # Merges two texts: the output keeps every destination block in order
    # and appends each template block that matched no destination block.
    #
    # @param template_source [String]
    # @param destination_source [String]
    # @return [Hash] +{ ok: true, diagnostics: [], output: String }+
    def merge_text(template_source, destination_source)
      template_blocks = analyze_text(template_source)[:blocks]
      destination_blocks = analyze_text(destination_source)[:blocks]
      # Match on the already-analyzed blocks so each input is analyzed once.
      matches = match_analyzed_blocks(template_blocks, destination_blocks)

      merged_blocks = destination_blocks.map { |block| block[:normalized] }
      matches[:unmatched_template].each do |index|
        merged_blocks << template_blocks[index][:normalized]
      end

      {
        ok: true,
        diagnostics: [],
        output: merged_blocks.join("\n\n")
      }
    end

    # Weighted similarity between two analyzed blocks, combining edit-distance
    # content similarity, length similarity and relative-position similarity.
    #
    # @param template_block [Hash] block with +:normalized+ and +:index+
    # @param destination_block [Hash] block with +:normalized+ and +:index+
    # @param template_total [Integer] number of template blocks
    # @param destination_total [Integer] number of destination blocks
    # @param weights [Hash] +:content+, +:length+ and +:position+ weights
    # @return [Float] weighted sum of the three components
    def refined_text_similarity(template_block, destination_block, template_total, destination_total, weights = DEFAULT_TEXT_REFINEMENT_WEIGHTS)
      content = string_similarity(template_block[:normalized], destination_block[:normalized])
      length = length_similarity(template_block[:normalized], destination_block[:normalized])
      position = position_similarity(
        template_block[:index],
        destination_block[:index],
        template_total,
        destination_total
      )

      (weights[:content] * content) + (weights[:length] * length) + (weights[:position] * position)
    end

    # Core of {match_text_blocks}, operating on already-analyzed blocks.
    #
    # @param template_blocks [Array<Hash>] blocks from {analyze_text}
    # @param destination_blocks [Array<Hash>] blocks from {analyze_text}
    # @return [Hash] same shape as {match_text_blocks}
    def match_analyzed_blocks(template_blocks, destination_blocks)
      matched_template = {}
      matched_destination = {}
      matched = []

      # Phase 1: exact matches, first unclaimed template block wins.
      destination_blocks.each_with_index do |destination_block, destination_index|
        template_index = template_blocks.each_index.find do |candidate_index|
          !matched_template[candidate_index] &&
            template_blocks[candidate_index][:normalized] == destination_block[:normalized]
        end
        next unless template_index

        matched_template[template_index] = true
        matched_destination[destination_index] = true
        matched << {
          template_index: template_index,
          destination_index: destination_index,
          phase: "exact",
          score: 1.0
        }
      end

      # Phase 2: fuzzy matches for whatever is still unclaimed.
      destination_blocks.each_with_index do |destination_block, destination_index|
        next if matched_destination[destination_index]

        best_template_index = nil
        best_score = 0.0
        template_blocks.each_with_index do |template_block, template_index|
          next if matched_template[template_index]

          score = refined_text_similarity(
            template_block,
            destination_block,
            template_blocks.length,
            destination_blocks.length
          )
          # Strict ">" keeps the earliest template block on ties.
          next unless score >= DEFAULT_TEXT_REFINEMENT_THRESHOLD && score > best_score

          best_score = score
          best_template_index = template_index
        end

        next unless best_template_index

        matched_template[best_template_index] = true
        matched_destination[destination_index] = true
        matched << {
          template_index: best_template_index,
          destination_index: destination_index,
          phase: "refined",
          score: best_score
        }
      end

      {
        matched: matched,
        unmatched_template: template_blocks.each_index.reject { |index| matched_template[index] },
        unmatched_destination: destination_blocks.each_index.reject { |index| matched_destination[index] }
      }
    end
    private_class_method :match_analyzed_blocks

    # Whitespace-separated tokens of +normalized+ as a hash used as a set.
    #
    # @param normalized [String]
    # @return [Hash{String => true}]
    def token_set(normalized)
      normalized.split(/\s+/).reject(&:empty?).to_h { |token| [token, true] }
    end
    private_class_method :token_set

    # Jaccard similarity (|intersection| / |union|) of the token sets of two
    # strings.
    #
    # @param left [String]
    # @param right [String]
    # @return [Float] 1.0 when neither string contains a token
    def jaccard(left, right)
      left_tokens = token_set(left)
      right_tokens = token_set(right)
      return 1.0 if left_tokens.empty? && right_tokens.empty?

      intersection = left_tokens.count { |token, _| right_tokens.key?(token) }
      # Inclusion-exclusion; non-zero because at least one side has a token.
      union = left_tokens.size + right_tokens.size - intersection
      intersection.to_f / union
    end
    private_class_method :jaccard

    # Character-level Levenshtein edit distance, computed with two rolling
    # rows.
    #
    # @param left [String]
    # @param right [String]
    # @return [Integer] number of single-character edits
    def levenshtein_distance(left, right)
      return 0 if left == right
      return right.length if left.empty?
      return left.length if right.empty?

      # Character arrays give constant-time element access even for
      # multibyte strings.
      left_chars = left.chars
      right_chars = right.chars
      previous = (0..left_chars.length).to_a
      current = Array.new(left_chars.length + 1, 0)

      right_chars.each_with_index do |right_char, row|
        current[0] = row + 1

        left_chars.each_with_index do |left_char, column|
          cost = left_char == right_char ? 0 : 1
          current[column + 1] = [
            current[column] + 1,
            previous[column + 1] + 1,
            previous[column] + cost
          ].min
        end

        # Every cell of the stale row is overwritten on the next pass, so the
        # buffers can be swapped instead of copied.
        previous, current = current, previous
      end

      previous[left_chars.length]
    end
    private_class_method :levenshtein_distance

    # Edit-distance-based similarity: 1 - distance / longer length.
    #
    # @param left [String]
    # @param right [String]
    # @return [Float] 1.0 for equal strings, 0.0 when exactly one is empty
    def string_similarity(left, right)
      return 1.0 if left == right
      return 0.0 if left.empty? || right.empty?

      distance = levenshtein_distance(left, right)
      1.0 - (distance.to_f / [left.length, right.length].max)
    end
    private_class_method :string_similarity

    # Ratio of the shorter length to the longer length.
    #
    # @param left [String]
    # @param right [String]
    # @return [Float] 1.0 when both lengths are equal (including both empty)
    def length_similarity(left, right)
      shorter, longer = [left.length, right.length].minmax
      return 1.0 if shorter == longer

      shorter.to_f / longer
    end
    private_class_method :length_similarity

    # Position of +index+ scaled to 0.0..1.0 within +total+ items.
    #
    # @param index [Integer]
    # @param total [Integer]
    # @return [Float] 0.5 when there is at most one item
    def relative_position(index, total)
      total > 1 ? index.to_f / (total - 1) : 0.5
    end
    private_class_method :relative_position

    # Similarity of two relative positions: 1 minus their absolute difference.
    #
    # @param template_index [Integer]
    # @param destination_index [Integer]
    # @param template_total [Integer]
    # @param destination_total [Integer]
    # @return [Float]
    def position_similarity(template_index, destination_index, template_total, destination_total)
      1.0 - (
        relative_position(template_index, template_total) -
        relative_position(destination_index, destination_total)
      ).abs
    end
    private_class_method :position_similarity
  end
end
|
data/lib/plain-merge.rb
ADDED
data.tar.gz.sig
ADDED
|
Binary file
|
metadata
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: plain-merge
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 7.0.0
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- Peter H. Boling
|
|
8
|
+
bindir: bin
|
|
9
|
+
cert_chain:
|
|
10
|
+
- |
|
|
11
|
+
-----BEGIN CERTIFICATE-----
|
|
12
|
+
MIIEgDCCAuigAwIBAgIBATANBgkqhkiG9w0BAQsFADBDMRUwEwYDVQQDDAxwZXRl
|
|
13
|
+
ci5ib2xpbmcxFTATBgoJkiaJk/IsZAEZFgVnbWFpbDETMBEGCgmSJomT8ixkARkW
|
|
14
|
+
A2NvbTAeFw0yNTA1MDQxNTMzMDlaFw00NTA0MjkxNTMzMDlaMEMxFTATBgNVBAMM
|
|
15
|
+
DHBldGVyLmJvbGluZzEVMBMGCgmSJomT8ixkARkWBWdtYWlsMRMwEQYKCZImiZPy
|
|
16
|
+
LGQBGRYDY29tMIIBojANBgkqhkiG9w0BAQEFAAOCAY8AMIIBigKCAYEAruUoo0WA
|
|
17
|
+
uoNuq6puKWYeRYiZekz/nsDeK5x/0IEirzcCEvaHr3Bmz7rjo1I6On3gGKmiZs61
|
|
18
|
+
LRmQ3oxy77ydmkGTXBjruJB+pQEn7UfLSgQ0xa1/X3kdBZt6RmabFlBxnHkoaGY5
|
|
19
|
+
mZuZ5+Z7walmv6sFD9ajhzj+oIgwWfnEHkXYTR8I6VLN7MRRKGMPoZ/yvOmxb2DN
|
|
20
|
+
coEEHWKO9CvgYpW7asIihl/9GMpKiRkcYPm9dGQzZc6uTwom1COfW0+ZOFrDVBuV
|
|
21
|
+
FMQRPswZcY4Wlq0uEBLPU7hxnCL9nKK6Y9IhdDcz1mY6HZ91WImNslOSI0S8hRpj
|
|
22
|
+
yGOWxQIhBT3fqCBlRIqFQBudrnD9jSNpSGsFvbEijd5ns7Z9ZMehXkXDycpGAUj1
|
|
23
|
+
to/5cuTWWw1JqUWrKJYoifnVhtE1o1DZ+LkPtWxHtz5kjDG/zR3MG0Ula0UOavlD
|
|
24
|
+
qbnbcXPBnwXtTFeZ3C+yrWpE4pGnl3yGkZj9SMTlo9qnTMiPmuWKQDatAgMBAAGj
|
|
25
|
+
fzB9MAkGA1UdEwQCMAAwCwYDVR0PBAQDAgSwMB0GA1UdDgQWBBQE8uWvNbPVNRXZ
|
|
26
|
+
HlgPbc2PCzC4bjAhBgNVHREEGjAYgRZwZXRlci5ib2xpbmdAZ21haWwuY29tMCEG
|
|
27
|
+
A1UdEgQaMBiBFnBldGVyLmJvbGluZ0BnbWFpbC5jb20wDQYJKoZIhvcNAQELBQAD
|
|
28
|
+
ggGBAJbnUwfJQFPkBgH9cL7hoBfRtmWiCvdqdjeTmi04u8zVNCUox0A4gT982DE9
|
|
29
|
+
wmuN12LpdajxZONqbXuzZvc+nb0StFwmFYZG6iDwaf4BPywm2e/Vmq0YG45vZXGR
|
|
30
|
+
L8yMDSK1cQXjmA+ZBKOHKWavxP6Vp7lWvjAhz8RFwqF9GuNIdhv9NpnCAWcMZtpm
|
|
31
|
+
GUPyIWw/Cw/2wZp74QzZj6Npx+LdXoLTF1HMSJXZ7/pkxLCsB8m4EFVdb/IrW/0k
|
|
32
|
+
kNSfjtAfBHO8nLGuqQZVH9IBD1i9K6aSs7pT6TW8itXUIlkIUI2tg5YzW6OFfPzq
|
|
33
|
+
QekSkX3lZfY+HTSp/o+YvKkqWLUV7PQ7xh1ZYDtocpaHwgxe/j3bBqHE+CUPH2vA
|
|
34
|
+
0V/FwdTRWcwsjVoOJTrYcff8pBZ8r2MvtAc54xfnnhGFzeRHfcltobgFxkAXdE6p
|
|
35
|
+
DVjBtqT23eugOqQ73umLcYDZkc36vnqGxUBSsXrzY9pzV5gGr2I8YUxMqf6ATrZt
|
|
36
|
+
L9nRqA==
|
|
37
|
+
-----END CERTIFICATE-----
|
|
38
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
|
39
|
+
dependencies:
|
|
40
|
+
- !ruby/object:Gem::Dependency
|
|
41
|
+
name: ast-merge
|
|
42
|
+
requirement: !ruby/object:Gem::Requirement
|
|
43
|
+
requirements:
|
|
44
|
+
- - '='
|
|
45
|
+
- !ruby/object:Gem::Version
|
|
46
|
+
version: 7.0.0
|
|
47
|
+
type: :runtime
|
|
48
|
+
prerelease: false
|
|
49
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
50
|
+
requirements:
|
|
51
|
+
- - '='
|
|
52
|
+
- !ruby/object:Gem::Version
|
|
53
|
+
version: 7.0.0
|
|
54
|
+
- !ruby/object:Gem::Dependency
|
|
55
|
+
name: tree_haver
|
|
56
|
+
requirement: !ruby/object:Gem::Requirement
|
|
57
|
+
requirements:
|
|
58
|
+
- - '='
|
|
59
|
+
- !ruby/object:Gem::Version
|
|
60
|
+
version: 7.0.0
|
|
61
|
+
type: :runtime
|
|
62
|
+
prerelease: false
|
|
63
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
64
|
+
requirements:
|
|
65
|
+
- - '='
|
|
66
|
+
- !ruby/object:Gem::Version
|
|
67
|
+
version: 7.0.0
|
|
68
|
+
description: Portable text normalization, similarity, matching, and merge behavior
|
|
69
|
+
for Structured Merge.
|
|
70
|
+
email:
|
|
71
|
+
- info@structuredmerge.org
|
|
72
|
+
executables: []
|
|
73
|
+
extensions: []
|
|
74
|
+
extra_rdoc_files: []
|
|
75
|
+
files:
|
|
76
|
+
- lib/plain-merge.rb
|
|
77
|
+
- lib/plain/merge.rb
|
|
78
|
+
- lib/plain/merge/version.rb
|
|
79
|
+
homepage: https://github.com/structuredmerge/structuredmerge-ruby
|
|
80
|
+
licenses:
|
|
81
|
+
- AGPL-3.0-only
|
|
82
|
+
- PolyForm-Small-Business-1.0.0
|
|
83
|
+
metadata:
|
|
84
|
+
homepage_uri: https://structuredmerge.org
|
|
85
|
+
source_code_uri: https://github.com/structuredmerge/structuredmerge-ruby/tree/v7.0.0
|
|
86
|
+
changelog_uri: https://github.com/structuredmerge/structuredmerge-ruby/blob/v7.0.0/CHANGELOG.md
|
|
87
|
+
bug_tracker_uri: https://github.com/structuredmerge/structuredmerge-ruby/issues
|
|
88
|
+
documentation_uri: https://www.rubydoc.info/gems/plain-merge/7.0.0
|
|
89
|
+
funding_uri: https://github.com/sponsors/pboling
|
|
90
|
+
wiki_uri: https://github.com/structuredmerge/structuredmerge-ruby/wiki
|
|
91
|
+
discord_uri: https://discord.gg/3qme4XHNKN
|
|
92
|
+
rubygems_mfa_required: 'true'
|
|
93
|
+
rdoc_options: []
|
|
94
|
+
require_paths:
|
|
95
|
+
- lib
|
|
96
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
97
|
+
requirements:
|
|
98
|
+
- - ">="
|
|
99
|
+
- !ruby/object:Gem::Version
|
|
100
|
+
version: 4.0.0
|
|
101
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
102
|
+
requirements:
|
|
103
|
+
- - ">="
|
|
104
|
+
- !ruby/object:Gem::Version
|
|
105
|
+
version: '0'
|
|
106
|
+
requirements: []
|
|
107
|
+
rubygems_version: 4.0.10
|
|
108
|
+
specification_version: 4
|
|
109
|
+
summary: Structured Merge plain text analysis and merge for Ruby
|
|
110
|
+
test_files: []
|
metadata.gz.sig
ADDED
|
Binary file
|