text_alignment 0.2.9 → 0.3.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/align_annotations +225 -39
- data/lib/text_alignment/anchor_finder.rb +146 -0
- data/lib/text_alignment/approximate_fit.rb +50 -52
- data/lib/text_alignment/find_divisions.rb +198 -200
- data/lib/text_alignment/glcs_alignment.rb +297 -297
- data/lib/text_alignment/glcs_alignment_fast.rb +94 -94
- data/lib/text_alignment/glcs_required.rb +50 -50
- data/lib/text_alignment/lcs_alignment.rb +115 -115
- data/lib/text_alignment/lcs_cdiff.rb +46 -48
- data/lib/text_alignment/lcs_comparison.rb +53 -53
- data/lib/text_alignment/lcs_min.rb +144 -138
- data/lib/text_alignment/mappings.rb +68 -69
- data/lib/text_alignment/mixed_alignment.rb +193 -0
- data/lib/text_alignment/text_alignment.rb +232 -174
- data/lib/text_alignment/version.rb +1 -1
- data/text_alignment.gemspec +1 -1
- metadata +5 -13
- data/spec/spec_helper.rb +0 -1
- data/spec/text_alignment/glcs_alignment_spec.rb +0 -302
- data/spec/text_alignment/lcs_alignment_spec.rb +0 -98
- data/spec/text_alignment/lcs_comparision_spec.rb +0 -322
- data/spec/text_alignment/text_alignment_spec.rb +0 -302
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f6f98465bb47a2b241dda72c8532530f5c7fdf4de49a403366bd08c256b7ff0e
|
4
|
+
data.tar.gz: 44a6c920f8f05ab3ee29a0b9fe4de38e2f6fac2386838625b77d99486189ebf0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 17f038d6d7366b8223cdd66b5ef9f3d79c8ecc39f432ac15dbfd0f3311e1197bc9c40c5cbd38a69d5778278405dcd100bc18187870ee563a7e5999246845b049
|
7
|
+
data.tar.gz: f0ded392d47821bc99c640700955686f14cc9550a13b3b8141af2af7f88f79400a3de6632f2bc3223c9e0dc82311d461de84a5ffa16aff443394b3c76540a74c
|
data/bin/align_annotations
CHANGED
@@ -1,51 +1,237 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
require 'text_alignment'
|
3
|
-
|
4
3
|
require 'json'
|
5
4
|
require 'pp'
|
6
5
|
|
6
|
+
# Loads an annotation document from +filename+.
#
# The file type is chosen from the file extension, compared
# case-insensitively so that e.g. "DOC.JSON" is accepted as well:
# * ".json" - the file is parsed as JSON with symbolized keys and the parsed
#   value is returned as is (whatever the top-level JSON value is).
# * ".txt"  - the raw file content is wrapped as <tt>{text: content}</tt>.
#
# Raises RuntimeError ("unknown file type: ...") for any other extension.
def read_annotations(filename)
  case File.extname(filename).downcase
  when '.json'
    JSON.parse(File.read(filename), symbolize_names: true)
  when '.txt'
    { text: File.read(filename) }
  else
    raise "unknown file type: #{filename}"
  end
end
|
16
|
+
|
17
|
+
# Loads only the text of a document from +filename+.
#
# The file type is chosen from the file extension, compared
# case-insensitively:
# * ".json" - the file is parsed as JSON with symbolized keys and the value
#   stored under :text is returned (nil when the key is absent).
# * ".txt"  - the raw file content is returned.
#
# Raises RuntimeError ("unknown file type: ...") for any other extension.
def read_text(filename)
  case File.extname(filename).downcase
  when '.json'
    JSON.parse(File.read(filename), symbolize_names: true)[:text]
  when '.txt'
    File.read(filename)
  else
    raise "unknown file type: #{filename}"
  end
end
|
28
|
+
|
29
|
+
# Merges the annotations of several source documents into +target_annotations+.
#
# source_annotations:: an enumerable of annotation hashes; each is read for
#                      :text, :denotations, :relations, :attributes and
#                      :modifications.
# target_annotations:: a hash holding at least :text. It is modified in place
#                      and also returned.
#
# For every source document that has denotations, the document text is aligned
# with the target text and the denotations are passed through
# +transform_hdenotations+. All carried-over annotations are renumbered with
# counters that run across the documents ("T1", "T2", ... for denotations,
# "R.." for relations, "A.." for attributes, "M.." for modifications), and the
# references between them (:subj / :obj) are rewritten to the new ids.
#
# An annotation whose referenced id has no new id (for example because the
# referenced annotation is not among the transformed denotations) is skipped,
# so that no entry with a nil :subj or :obj is emitted.
#
# Documents without denotations contribute nothing; their relations,
# attributes and modifications are ignored as well.
def align_mdoc(source_annotations, target_annotations)
  # [key in the annotation hash, id prefix, fields that reference other ids]
  dependent_specs = [
    [:relations, 'R', [:subj, :obj]],
    [:attributes, 'A', [:subj]],
    [:modifications, 'M', [:obj]]
  ]
  counters = Hash.new(0)

  source_annotations.each do |annotations|
    source_denotations = annotations[:denotations]
    next if source_denotations.nil? || source_denotations.empty?

    alignment = TextAlignment::TextAlignment.new(annotations[:text], target_annotations[:text])

    # old id => new id, local to this document
    ididx = {}

    denotations = alignment.transform_hdenotations(source_denotations)
    denotations.each do |d|
      reid = "T#{counters[:denotations] += 1}"
      ididx[d[:id]] = reid
      d[:id] = reid
    end
    target_annotations[:denotations] = [] unless target_annotations.key?(:denotations)
    target_annotations[:denotations] += denotations

    dependent_specs.each do |key, prefix, ref_fields|
      items = annotations[key]
      next if items.nil? || items.empty?

      target_annotations[key] = [] unless target_annotations.key?(key)
      items.each do |item|
        refs = ref_fields.map { |field| [field, ididx[item[field]]] }.to_h
        # the referent was not carried over: drop the dangling annotation
        next if refs.value?(nil)

        reid = "#{prefix}#{counters[key] += 1}"
        ididx[item[:id]] = reid
        target_annotations[key] << item.merge(refs).merge(id: reid)
      end
    end
  end

  target_annotations
end
|
90
|
+
|
91
|
+
|
7
92
|
# Exactly two arguments are expected: the first is read as the source
# annotations and the second as the document that supplies the target text.
# Otherwise the usage line is written to stderr and the script stops.
unless ARGV.length == 2
  warn "align_annotations source_annotations(.json|.txt) target_text(.json|.txt)"
  exit
end
|
11
96
|
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
puts
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
puts
|
33
|
-
puts "
|
34
|
-
|
35
|
-
puts
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
puts
|
46
|
-
|
47
|
-
|
48
|
-
|
97
|
+
source_annotations = read_annotations(ARGV[0])
target_text = read_text(ARGV[1])

# Annotations reported as lost by the alignment (single-document case only).
lost_annotations = []

target_annotations = if source_annotations.is_a?(Array)
  # several source documents: merge all of them into one target document
  align_mdoc(source_annotations, {text: target_text})
else
  alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text)
  source_text = source_annotations[:text]

  # Verification of begin positions: a source character is echoed when its
  # position has no transformed begin position, otherwise '.' is printed.
  puts "=====BEGIN"
  (0 ... source_text.length).each do |pos|
    print(alignment.transform_begin_position(pos).nil? ? source_text[pos] : '.')
  end
  puts
  puts "=====END"

  # The same check for end positions. The range is inclusive because an end
  # position may equal the text length; nothing is printed for that position
  # when it is unmapped, since there is no character at that index.
  puts "=====BEGIN"
  (0 .. source_text.length).each do |pos|
    print(alignment.transform_end_position(pos).nil? ? source_text[pos] : '.')
  end
  puts
  puts "=====END"

  # Verification of source denotations: print every denotation whose span is
  # missing, empty, reversed or outside of the source text. A span end equal
  # to the text length is accepted here, i.e. span ends are treated as
  # exclusive - NOTE(review): confirm against the annotation format in use.
  puts "[Invalid source denotations]"
  (source_annotations[:denotations] || []).each do |d|
    span = d[:span] || {}
    span_begin = span[:begin]
    span_end = span[:end]
    valid = span_begin && span_end && span_begin >= 0 && span_begin < span_end && span_end <= source_text.length
    p d unless valid
  end
  puts "====="

  denotations = alignment.transform_hdenotations(source_annotations[:denotations])
  lost_annotations += alignment.lost_annotations if alignment.lost_annotations

  source_annotations.merge({text: target_text, denotations: denotations})
end
|
162
|
+
|
163
|
+
# Count the source annotations per type. A single annotation document is
# treated like a one-element list of documents, so that both input shapes are
# summed in the same way; a missing annotation list counts as zero.
source_docs = source_annotations.class == Array ? source_annotations : [source_annotations]

count_source = lambda do |key|
  source_docs.inject(0) { |total, doc| total + (doc[key].nil? ? 0 : doc[key].length) }
end

num_denotations_source = count_source.call(:denotations)
num_relations_source = count_source.call(:relations)
num_attributes_source = count_source.call(:attributes)
num_modifications_source = count_source.call(:modifications)
|
179
|
+
|
180
|
+
# Summary on stderr: number of denotations in the source and in the target.
# The counts of relations, attributes and modifications are not reported.
num_denotations_target = target_annotations[:denotations].nil? ? 0 : target_annotations[:denotations].length

warn "[source]", "denotations:\t#{num_denotations_source}"
warn "\n[target]", "denotations:\t#{num_denotations_target}"

# lost_annotations starts out as an empty array, so this section is also
# printed (with a count of 0) when nothing was lost.
if lost_annotations
  warn "\n[lost annotations]", "#{lost_annotations.length}"
end
|
196
|
+
|
197
|
+
#puts target_annotations.to_json
|
198
|
+
|
199
|
+
# denotations = anns1[:denotations]
|
200
|
+
|
201
|
+
# puts "[Alignment1]====="
|
202
|
+
# align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::MAPPINGS)
|
203
|
+
|
204
|
+
# align.alignment.each do |a|
|
205
|
+
# p [a[:target][:begin], a[:target][:end], a[:source][:begin], a[:source][:end]]
|
206
|
+
# end
|
207
|
+
|
208
|
+
# puts TextAlignment::sdiff2cdiff(align.sdiff)
|
209
|
+
# puts
|
210
|
+
# puts "[Similarity]\n#{align.similarity}"
|
211
|
+
# puts
|
212
|
+
# puts '[Denotations original]'
|
213
|
+
# pp denotations
|
214
|
+
# puts
|
215
|
+
# puts '[Denotations transformed]'
|
216
|
+
# new_denotations = align.transform_hdenotations(denotations)
|
217
|
+
# pp new_denotations
|
218
|
+
# puts
|
219
|
+
# puts "[Alignment2 (downcased)]====="
|
220
|
+
# align = TextAlignment::TextAlignment.new(str1.downcase, str2.downcase, TextAlignment::MAPPINGS)
|
221
|
+
# puts TextAlignment::sdiff2cdiff(align.sdiff)
|
222
|
+
# puts
|
223
|
+
# puts "[Similarity]\n#{align.similarity}"
|
224
|
+
# puts
|
225
|
+
# puts '[Denotations original]'
|
226
|
+
# pp denotations
|
227
|
+
# puts
|
228
|
+
# puts '[Denotations transformed]'
|
229
|
+
# new_denotations = align.transform_hdenotations(denotations)
|
230
|
+
# pp new_denotations
|
231
|
+
# puts
|
232
|
+
# puts '[Annotations transformed]'
|
233
|
+
# anns2[:denotations] = new_denotations
|
234
|
+
# puts anns2.to_json
|
49
235
|
|
50
236
|
# p align.common_elements
|
51
237
|
# puts "---------------"
|
@@ -0,0 +1,146 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'string-similarity'
|
3
|
+
|
4
|
+
module TextAlignment; end unless defined? TextAlignment

# Defaults; each one is only set when it has not been defined before.
TextAlignment::SIZE_NGRAM = 10 unless defined? TextAlignment::SIZE_NGRAM
TextAlignment::SIZE_WINDOW = 20 unless defined? TextAlignment::SIZE_WINDOW
TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.8 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD

# Finds, one after another, blocks of text ("anchors") that occur literally
# in both of two strings. Both strings are compared in downcased form.
class TextAlignment::AnchorFinder

  # source_str, target_str:: the two strings to find anchors in
  # _size_ngram::  length of the n-gram used to seed an anchor
  #                (TextAlignment::SIZE_NGRAM when nil)
  # _size_window:: number of alphanumeric characters compared on each side of
  #                a candidate n-gram (TextAlignment::SIZE_WINDOW when nil)
  def initialize(source_str, target_str, _size_ngram = nil, _size_window = nil)
    @size_ngram = _size_ngram || TextAlignment::SIZE_NGRAM
    @size_window = _size_window || TextAlignment::SIZE_WINDOW

    # Internally s1 is always the shorter string; @reverse remembers whether
    # the roles of source and target were swapped to achieve that.
    @reverse = (target_str.length < source_str.length)

    @s1, @s2 = if @reverse
      [target_str.downcase, source_str.downcase]
    else
      [source_str.downcase, target_str.downcase]
    end

    # current position in s1, and the candidate position in s2
    @beg_s1 = 0
    @beg_s2 = nil

    # end positions of the previously returned anchor
    @end_s1_prev = nil
    @end_s2_prev = 0
  end

  # Returns the next anchor as
  # {source: {begin:, end:}, target: {begin:, end:}}, or nil when the
  # remaining part of s1 yields no further anchor. Every call continues the
  # scan of s1 from the end of the anchor returned by the previous call.
  def get_next_anchor
    limit = @s1.length - @size_ngram

    # find the position of an anchor ngram in s1 and s2
    while @beg_s1 < limit
      anchor = @s1[@beg_s1, @size_ngram]

      search_position = 0
      while (@beg_s2 = @s2.index(anchor, search_position))
        # accept if both beginning points are sufficiently close to the end
        # points of the last match
        break if @end_s1_prev && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 >= @end_s2_prev) && (@beg_s2 - @end_s2_prev < 5)

        # accept if the text to the left of the candidate is similar enough
        left_window_s1, left_window_s2 = get_left_windows
        break if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD)

        # accept if the text to the right of the candidate is similar enough
        right_window_s1, right_window_s2 = get_right_windows
        break if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD)

        # otherwise try the next occurrence of the n-gram in s2
        search_position = @beg_s2 + 1
      end

      break unless @beg_s2.nil?

      @beg_s1 += 1
    end

    return nil if @beg_s1 >= limit

    # extend the block to the left as long as the characters agree
    b1 = @beg_s1
    b2 = @beg_s2
    while b1 > -1 && b2 > -1 && @s1[b1] == @s2[b2]
      b1 -= 1; b2 -= 1
    end
    b1 += 1; b2 += 1

    # extend the block to the right as long as the characters agree
    e1 = @beg_s1 + @size_ngram
    e2 = @beg_s2 + @size_ngram
    while @s1[e1] && @s1[e1] == @s2[e2]
      e1 += 1; e2 += 1
    end

    @end_s1_prev = e1
    @end_s2_prev = e2
    @beg_s1 = e1

    if @reverse
      {source:{begin:b2 , end:e2}, target:{begin:b1, end:e1}}
    else
      {source:{begin:b1 , end:e1}, target:{begin:b2, end:e2}}
    end
  end

  private

  # Windows of alphanumeric characters directly before the current candidate
  # in s1 and s2 (collected right to left, so both come out reversed).
  # Returns nil when fewer than @size_window characters precede either one.
  def get_left_windows
    return if @beg_s1 < @size_window || @beg_s2 < @size_window

    [collect_window(@s1, @beg_s1 - 1, -1), collect_window(@s2, @beg_s2 - 1, -1)]
  end

  # Windows of alphanumeric characters directly after the current candidate
  # n-gram in s1 and s2. Returns nil when fewer than @size_window characters
  # follow the n-gram in either string.
  def get_right_windows
    return if (@beg_s1 + @size_ngram > (@s1.length - @size_window)) || (@beg_s2 + @size_ngram > (@s2.length - @size_window))

    [collect_window(@s1, @beg_s1 + @size_ngram, 1), collect_window(@s2, @beg_s2 + @size_ngram, 1)]
  end

  # Walks over str from index start in direction step (+1 or -1) and gathers
  # characters matching [0-9a-zA-Z] until @size_window of them are collected
  # or the walk leaves the string.
  def collect_window(str, start, step)
    window = ''.dup
    loc = start
    while window.length < @size_window && loc >= 0 && loc < str.length
      char = str[loc]
      window << char if char =~ /[0-9a-zA-Z]/
      loc += step
    end
    window
  end

  # Cosine similarity of the two strings over character n-grams of the given
  # order, as computed by String::Similarity.cosine; 0 when either is nil.
  def text_similarity(str1, str2, ngram_order = 2)
    return 0 if str1.nil? || str2.nil?
    String::Similarity.cosine(str1, str2, ngram:ngram_order)
  end

end
|
@@ -4,75 +4,73 @@ require 'string-similarity'
|
|
4
4
|
module TextAlignment; end unless defined? TextAlignment

# approximate the location of str1 in str2
# (each default below is only set when it has not been defined before)
TextAlignment::SIGNATURE_NGRAM = 7 unless defined? TextAlignment::SIGNATURE_NGRAM
TextAlignment::MIN_LENGTH_FOR_APPROXIMATION = 50 unless defined? TextAlignment::MIN_LENGTH_FOR_APPROXIMATION
TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.7 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD

class << TextAlignment

  # Finds an approximate region of str2 that contains str1.
  #
  # Returns a pair [fit_begin, fit_end] of character offsets into str2:
  # * [0, str2.length] when str2 is shorter than
  #   TextAlignment::MIN_LENGTH_FOR_APPROXIMATION (no approximation is tried);
  # * the first candidate region whose similarity to str1 exceeds
  #   TextAlignment::TEXT_SIMILARITY_TRESHOLD;
  # * [nil, nil] when no candidate region qualifies.
  #
  # Candidate regions are derived from "signature" n-grams: n-grams of length
  # TextAlignment::SIGNATURE_NGRAM that occur in str1 and exactly once in
  # str2. Each one pins str1 to a position in str2, and the pinned region is
  # widened on both sides by TextAlignment::BUFFER_RATE.
  #
  # Raises ArgumentError when either string is nil.
  def approximate_fit(str1, str2)
    raise ArgumentError, 'nil string' if str1.nil? || str2.nil?
    return 0, str2.length if str2.length < TextAlignment::MIN_LENGTH_FOR_APPROXIMATION

    size = TextAlignment::SIGNATURE_NGRAM
    ngram1 = (0 .. str1.length - size).map { |i| str1[i, size] }
    ngram2 = (0 .. str2.length - size).map { |i| str2[i, size] }
    ngram_shared = ngram1 & ngram2

    # If there is no shared n-gram found, it may mean there is no serious overlap between the two strings
    return nil, nil if ngram_shared.empty?

    # count the n-grams of str2 once instead of rescanning it per candidate
    occurrences = Hash.new(0)
    ngram2.each { |g| occurrences[g] += 1 }
    signature_ngrams = ngram_shared.select { |g| occurrences[g] == 1 }
    return nil, nil if signature_ngrams.empty?

    # regions that were already evaluated
    tried = {}
    signature_ngrams.each do |signature_ngram|
      loc_in_str1 = str1.index(signature_ngram)
      loc_in_str2 = str2.index(signature_ngram)

      # approximate the beginning of the fit
      fit_begin = loc_in_str2 - loc_in_str1 - (loc_in_str1 * TextAlignment::BUFFER_RATE).to_i
      fit_begin = 0 if fit_begin < 0

      # approximate the end of the fit
      offset_end = str1.length - loc_in_str1
      fit_end = loc_in_str2 + offset_end + (offset_end * TextAlignment::BUFFER_RATE).to_i
      fit_end = str2.length if fit_end > str2.length

      region = [fit_begin, fit_end]
      next if tried.key?(region)
      tried[region] = true

      similarity = text_similarity(str1, str2[fit_begin ... fit_end])
      next unless similarity > TextAlignment::TEXT_SIMILARITY_TRESHOLD

      # The first sufficiently similar region ends the search. Returning from
      # inside the loop guarantees that a region which failed the similarity
      # test can never be the result.
      return fit_begin, fit_end if fit_begin < fit_end
      return nil, nil
    end

    return nil, nil
  end

  private

  # Cosine similarity (String::Similarity.cosine) of the two strings over
  # character n-grams of order ngram_order, ignoring spaces, tabs, carriage
  # returns and newlines.
  def text_similarity(str1, str2, ngram_order = 2)
    _str1 = str1.delete(" \t\r\n")
    _str2 = str2.delete(" \t\r\n")
    String::Similarity.cosine(_str1, _str2, ngram:ngram_order)
  end

end
|
66
64
|
|
67
65
|
# Manual check when this file is run directly with two JSON files: prints the
# fit of the first document's "text" within the second document's "text".
if __FILE__ == $0
  require 'json'

  if ARGV.length == 2
    str1 = JSON.parse(File.read(ARGV[0]).strip)["text"]
    str2 = JSON.parse(File.read(ARGV[1]).strip)["text"]

    fit_begin, fit_end = TextAlignment::approximate_fit(str1, str2)
    p [fit_begin, fit_end]

    # approximate_fit yields [nil, nil] when no region qualifies; there is
    # nothing to slice out of str2 in that case
    puts str2[fit_begin...fit_end] if fit_begin && fit_end
  end
end
|