text_alignment 0.2.9 → 0.3.14
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/align_annotations +225 -39
- data/lib/text_alignment/anchor_finder.rb +146 -0
- data/lib/text_alignment/approximate_fit.rb +50 -52
- data/lib/text_alignment/find_divisions.rb +198 -200
- data/lib/text_alignment/glcs_alignment.rb +297 -297
- data/lib/text_alignment/glcs_alignment_fast.rb +94 -94
- data/lib/text_alignment/glcs_required.rb +50 -50
- data/lib/text_alignment/lcs_alignment.rb +115 -115
- data/lib/text_alignment/lcs_cdiff.rb +46 -48
- data/lib/text_alignment/lcs_comparison.rb +53 -53
- data/lib/text_alignment/lcs_min.rb +144 -138
- data/lib/text_alignment/mappings.rb +68 -69
- data/lib/text_alignment/mixed_alignment.rb +193 -0
- data/lib/text_alignment/text_alignment.rb +232 -174
- data/lib/text_alignment/version.rb +1 -1
- data/text_alignment.gemspec +1 -1
- metadata +5 -13
- data/spec/spec_helper.rb +0 -1
- data/spec/text_alignment/glcs_alignment_spec.rb +0 -302
- data/spec/text_alignment/lcs_alignment_spec.rb +0 -98
- data/spec/text_alignment/lcs_comparision_spec.rb +0 -322
- data/spec/text_alignment/text_alignment_spec.rb +0 -302
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f6f98465bb47a2b241dda72c8532530f5c7fdf4de49a403366bd08c256b7ff0e
|
4
|
+
data.tar.gz: 44a6c920f8f05ab3ee29a0b9fe4de38e2f6fac2386838625b77d99486189ebf0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 17f038d6d7366b8223cdd66b5ef9f3d79c8ecc39f432ac15dbfd0f3311e1197bc9c40c5cbd38a69d5778278405dcd100bc18187870ee563a7e5999246845b049
|
7
|
+
data.tar.gz: f0ded392d47821bc99c640700955686f14cc9550a13b3b8141af2af7f88f79400a3de6632f2bc3223c9e0dc82311d461de84a5ffa16aff443394b3c76540a74c
|
data/bin/align_annotations
CHANGED
@@ -1,51 +1,237 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
require 'text_alignment'
|
3
|
-
|
4
3
|
require 'json'
|
5
4
|
require 'pp'
|
6
5
|
|
6
|
+
def read_annotations(filename)
|
7
|
+
case File.extname(filename)
|
8
|
+
when '.json'
|
9
|
+
JSON.parse File.read(filename), :symbolize_names => true
|
10
|
+
when '.txt'
|
11
|
+
{text: File.read(filename)}
|
12
|
+
else
|
13
|
+
raise "unknown file type: #{filename}"
|
14
|
+
end
|
15
|
+
end
|
16
|
+
|
17
|
+
def read_text(filename)
|
18
|
+
case File.extname(filename)
|
19
|
+
when '.json'
|
20
|
+
json = JSON.parse File.read(filename), :symbolize_names => true
|
21
|
+
json[:text]
|
22
|
+
when '.txt'
|
23
|
+
File.read(filename)
|
24
|
+
else
|
25
|
+
raise "unknown file type: #{filename}"
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
29
|
+
def align_mdoc(source_annotations, target_annotations)
|
30
|
+
idnum_denotations = 0
|
31
|
+
idnum_relations = 0
|
32
|
+
idnum_attributes = 0
|
33
|
+
idnum_modifications = 0
|
34
|
+
|
35
|
+
source_annotations.each do |annotations|
|
36
|
+
alignment = TextAlignment::TextAlignment.new(annotations[:text], target_annotations[:text])
|
37
|
+
|
38
|
+
# alignment.block_alignments.each do |a|
|
39
|
+
# p {source:a[:source], target:a[:target]}
|
40
|
+
# puts "--"
|
41
|
+
# p a[:alignment] if a[:alignment].nil? || a[:alignment] == :empty
|
42
|
+
# puts "--"
|
43
|
+
# puts annotations[:text][a[:source][:begin] ... a[:source][:end]]
|
44
|
+
# puts "--"
|
45
|
+
# puts target_text[a[:target][:begin] ... a[:target][:end]]
|
46
|
+
# puts "======"
|
47
|
+
# end
|
48
|
+
|
49
|
+
if annotations.has_key?(:denotations) && !annotations[:denotations].empty?
|
50
|
+
ididx = {}
|
51
|
+
denotations = alignment.transform_hdenotations(annotations[:denotations])
|
52
|
+
denotations.each do |d|
|
53
|
+
reid = 'T' + (idnum_denotations += 1).to_s
|
54
|
+
ididx[d[:id]] = reid
|
55
|
+
d[:id] = reid
|
56
|
+
end
|
57
|
+
target_annotations[:denotations] = [] unless target_annotations.has_key? :denotations
|
58
|
+
target_annotations[:denotations] += denotations
|
59
|
+
|
60
|
+
if annotations.has_key?(:relations) && !annotations[:relations].empty?
|
61
|
+
target_annotations[:relations] = [] unless target_annotations.has_key? :relations
|
62
|
+
annotations[:relations].each do |r|
|
63
|
+
reid = 'R' + (idnum_relations += 1).to_s
|
64
|
+
ididx[r[:id]] = reid
|
65
|
+
target_annotations[:relations] << r.dup.merge({id:reid, subj:ididx[r[:subj]], obj:ididx[r[:obj]]})
|
66
|
+
end
|
67
|
+
end
|
68
|
+
|
69
|
+
if annotations.has_key?(:attributes) && !annotations[:attributes].empty?
|
70
|
+
target_annotations[:attributes] = [] unless target_annotations.has_key? :attributes
|
71
|
+
annotations[:attributes].each do |a|
|
72
|
+
reid = 'A' + (idnum_attributes += 1).to_s
|
73
|
+
ididx[a[:id]] = reid
|
74
|
+
target_annotations[:attributes] << a.dup.merge({id:reid, subj:ididx[a[:subj]]})
|
75
|
+
end
|
76
|
+
end
|
77
|
+
|
78
|
+
if annotations.has_key?(:modifications) && !annotations[:modifications].empty?
|
79
|
+
target_annotations[:modifications] = [] unless target_annotations.has_key? :modifications
|
80
|
+
annotations[:modifications].each do |m|
|
81
|
+
reid = 'M' + (idnum_modifications += 1).to_s
|
82
|
+
ididx[m[:id]] = reid
|
83
|
+
target_annotations[:modifications] << m.dup.merge({id:reid, obj:ididx[m[:obj]]})
|
84
|
+
end
|
85
|
+
end
|
86
|
+
end
|
87
|
+
end
|
88
|
+
target_annotations
|
89
|
+
end
|
90
|
+
|
91
|
+
|
7
92
|
unless ARGV.length == 2
|
8
|
-
warn "align_annotations target_annotations(.json) reference_annotations(.json)"
|
93
|
+
warn "align_annotations target_annotations(.json|.txt) reference_annotations(.json|.txt)"
|
9
94
|
exit
|
10
95
|
end
|
11
96
|
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
puts
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
puts
|
33
|
-
puts "
|
34
|
-
|
35
|
-
puts
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
puts
|
46
|
-
|
47
|
-
|
48
|
-
|
97
|
+
source_annotations = read_annotations(ARGV[0])
|
98
|
+
target_text = read_text(ARGV[1])
|
99
|
+
|
100
|
+
lost_annotations = []
|
101
|
+
target_annotations = if source_annotations.class == Array
|
102
|
+
align_mdoc(source_annotations, {text: target_text})
|
103
|
+
else
|
104
|
+
alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text)
|
105
|
+
|
106
|
+
# verification
|
107
|
+
source_text = source_annotations[:text]
|
108
|
+
puts "=====BEGIN"
|
109
|
+
(0 ... source_text.length).each do |p|
|
110
|
+
t = alignment.transform_begin_position(p)
|
111
|
+
if t.nil?
|
112
|
+
print source_text[p]
|
113
|
+
else
|
114
|
+
print '.'
|
115
|
+
end
|
116
|
+
end
|
117
|
+
puts
|
118
|
+
puts "=====END"
|
119
|
+
|
120
|
+
puts "=====BEGIN"
|
121
|
+
(0 .. source_text.length).each do |p|
|
122
|
+
t = alignment.transform_end_position(p)
|
123
|
+
if t.nil?
|
124
|
+
print source_text[p]
|
125
|
+
else
|
126
|
+
print '.'
|
127
|
+
end
|
128
|
+
end
|
129
|
+
puts
|
130
|
+
puts "=====END"
|
131
|
+
|
132
|
+
# pp alignment
|
133
|
+
|
134
|
+
# alignment.block_alignments.each do |a|
|
135
|
+
# if a[:alignment].nil? || a[:alignment] == :empty
|
136
|
+
# # p [a[:source], a[:target]]
|
137
|
+
# # p a[:alignment]
|
138
|
+
# else
|
139
|
+
# p [a[:source], a[:target]]
|
140
|
+
# p a[:alignment].similarity
|
141
|
+
# puts "--"
|
142
|
+
# puts source_annotations[:text][a[:source][:begin] ... a[:source][:end]]
|
143
|
+
# puts "--"
|
144
|
+
# puts target_text[a[:target][:begin] ... a[:target][:end]]
|
145
|
+
# puts "======"
|
146
|
+
# end
|
147
|
+
# end
|
148
|
+
# exit
|
149
|
+
|
150
|
+
# verification of source denotations
|
151
|
+
puts "[Invalid source denotations]"
|
152
|
+
source_annotations[:denotations] do |d|
|
153
|
+
p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < source_text.length
|
154
|
+
end
|
155
|
+
puts "====="
|
156
|
+
|
157
|
+
denotations = alignment.transform_hdenotations(source_annotations[:denotations])
|
158
|
+
lost_annotations += alignment.lost_annotations if alignment.lost_annotations
|
159
|
+
|
160
|
+
source_annotations.merge({text:target_text, denotations:denotations})
|
161
|
+
end
|
162
|
+
|
163
|
+
num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source = if source_annotations.class == Array
|
164
|
+
num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source = 0, 0, 0, 0
|
165
|
+
source_annotations.each do |annotations|
|
166
|
+
num_denotations_source += annotations[:denotations].nil? ? 0 : annotations[:denotations].length
|
167
|
+
num_relations_source += annotations[:relations].nil? ? 0 : annotations[:relations].length
|
168
|
+
num_attributes_source += annotations[:attributes].nil? ? 0 : annotations[:attributes].length
|
169
|
+
num_modifications_source += annotations[:modifications].nil? ? 0 : annotations[:modifications].length
|
170
|
+
end
|
171
|
+
[num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source]
|
172
|
+
else
|
173
|
+
num_denotations_source = source_annotations[:denotations].nil? ? 0 : source_annotations[:denotations].length
|
174
|
+
num_relations_source = source_annotations[:relations].nil? ? 0 : source_annotations[:relations].length
|
175
|
+
num_attributes_source = source_annotations[:attributes].nil? ? 0 : source_annotations[:attributes].length
|
176
|
+
num_modifications_source = source_annotations[:modifications].nil? ? 0 : source_annotations[:modifications].length
|
177
|
+
[num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source]
|
178
|
+
end
|
179
|
+
|
180
|
+
warn "[source]"
|
181
|
+
warn "denotations:\t#{num_denotations_source}"
|
182
|
+
# warn "relations:\t#{num_relations_source}"
|
183
|
+
# warn "attributes:\t#{num_attributes_source}"
|
184
|
+
# warn "modifications:\t#{num_modifications_source}"
|
185
|
+
|
186
|
+
warn "\n[target]"
|
187
|
+
warn "denotations:\t#{target_annotations[:denotations].nil? ? 0 : target_annotations[:denotations].length}"
|
188
|
+
# warn "relations:\t#{target_annotations[:relations].nil? ? 0 : target_annotations[:relations].length}"
|
189
|
+
# warn "attributes:\t#{target_annotations[:attributes].nil? ? 0 : target_annotations[:attributes].length}"
|
190
|
+
# warn "modifications:\t#{target_annotations[:modifications].nil? ? 0 : target_annotations[:modifications].length}"
|
191
|
+
|
192
|
+
if lost_annotations
|
193
|
+
warn "\n[lost annotations]"
|
194
|
+
warn "#{lost_annotations.length}"
|
195
|
+
end
|
196
|
+
|
197
|
+
#puts target_annotations.to_json
|
198
|
+
|
199
|
+
# denotations = anns1[:denotations]
|
200
|
+
|
201
|
+
# puts "[Alignment1]====="
|
202
|
+
# align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::MAPPINGS)
|
203
|
+
|
204
|
+
# align.alignment.each do |a|
|
205
|
+
# p [a[:target][:begin], a[:target][:end], a[:source][:begin], a[:source][:end]]
|
206
|
+
# end
|
207
|
+
|
208
|
+
# puts TextAlignment::sdiff2cdiff(align.sdiff)
|
209
|
+
# puts
|
210
|
+
# puts "[Similarity]\n#{align.similarity}"
|
211
|
+
# puts
|
212
|
+
# puts '[Denotations original]'
|
213
|
+
# pp denotations
|
214
|
+
# puts
|
215
|
+
# puts '[Denotations transformed]'
|
216
|
+
# new_denotations = align.transform_hdenotations(denotations)
|
217
|
+
# pp new_denotations
|
218
|
+
# puts
|
219
|
+
# puts "[Alignment2 (downcased)]====="
|
220
|
+
# align = TextAlignment::TextAlignment.new(str1.downcase, str2.downcase, TextAlignment::MAPPINGS)
|
221
|
+
# puts TextAlignment::sdiff2cdiff(align.sdiff)
|
222
|
+
# puts
|
223
|
+
# puts "[Similarity]\n#{align.similarity}"
|
224
|
+
# puts
|
225
|
+
# puts '[Denotations original]'
|
226
|
+
# pp denotations
|
227
|
+
# puts
|
228
|
+
# puts '[Denotations transformed]'
|
229
|
+
# new_denotations = align.transform_hdenotations(denotations)
|
230
|
+
# pp new_denotations
|
231
|
+
# puts
|
232
|
+
# puts '[Annotations transformed]'
|
233
|
+
# anns2[:denotations] = new_denotations
|
234
|
+
# puts anns2.to_json
|
49
235
|
|
50
236
|
# p align.common_elements
|
51
237
|
# puts "---------------"
|
@@ -0,0 +1,146 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'string-similarity'
|
3
|
+
|
4
|
+
module TextAlignment; end unless defined? TextAlignment
|
5
|
+
|
6
|
+
TextAlignment::SIZE_NGRAM = 10 unless defined? TextAlignment::SIZE_NGRAM
|
7
|
+
TextAlignment::SIZE_WINDOW = 20 unless defined? TextAlignment::SIZE_WINDOW
|
8
|
+
TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.8 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD
|
9
|
+
|
10
|
+
class TextAlignment::AnchorFinder
|
11
|
+
|
12
|
+
def initialize(source_str, target_str, _size_ngram = nil, _size_window = nil)
|
13
|
+
@size_ngram = _size_ngram || TextAlignment::SIZE_NGRAM
|
14
|
+
@size_window = _size_window || TextAlignment::SIZE_WINDOW
|
15
|
+
|
16
|
+
@reverse = (target_str.length < source_str.length)
|
17
|
+
|
18
|
+
@s1, @s2 = if @reverse
|
19
|
+
[target_str.downcase, source_str.downcase]
|
20
|
+
else
|
21
|
+
[source_str.downcase, target_str.downcase]
|
22
|
+
end
|
23
|
+
|
24
|
+
# current position in s1
|
25
|
+
@beg_s1 = 0
|
26
|
+
@end_s2_prev = 0
|
27
|
+
end
|
28
|
+
|
29
|
+
def get_next_anchor
|
30
|
+
# find the position of an anchor ngram in s1 and s2
|
31
|
+
while @beg_s1 < (@s1.length - @size_ngram)
|
32
|
+
anchor = @s1[@beg_s1, @size_ngram]
|
33
|
+
|
34
|
+
search_position = 0
|
35
|
+
# search_position = @end_s2_prev
|
36
|
+
while @beg_s2 = @s2.index(anchor, search_position)
|
37
|
+
# if both the beginning points are sufficiently close to the end points of the last match
|
38
|
+
break if @end_s1_prev && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 >= @end_s2_prev) && (@beg_s2 - @end_s2_prev < 5)
|
39
|
+
|
40
|
+
left_window_s1, left_window_s2 = get_left_windows
|
41
|
+
break if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD)
|
42
|
+
|
43
|
+
right_window_s1, right_window_s2 = get_right_windows
|
44
|
+
break if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD)
|
45
|
+
|
46
|
+
search_position = @beg_s2 + 1
|
47
|
+
end
|
48
|
+
|
49
|
+
break unless @beg_s2.nil?
|
50
|
+
|
51
|
+
@beg_s1 += 1
|
52
|
+
end
|
53
|
+
|
54
|
+
return nil if @beg_s1 >= (@s1.length - @size_ngram)
|
55
|
+
|
56
|
+
# extend the block
|
57
|
+
b1 = @beg_s1
|
58
|
+
b2 = @beg_s2
|
59
|
+
while b1 > -1 && b2 > -1 && @s1[b1] == @s2[b2]
|
60
|
+
b1 -= 1; b2 -= 1
|
61
|
+
end
|
62
|
+
b1 += 1; b2 += 1
|
63
|
+
|
64
|
+
e1 = @beg_s1 + @size_ngram
|
65
|
+
e2 = @beg_s2 + @size_ngram
|
66
|
+
while @s1[e1] && @s1[e1] == @s2[e2]
|
67
|
+
e1 += 1; e2 += 1
|
68
|
+
end
|
69
|
+
|
70
|
+
@end_s1_prev = e1
|
71
|
+
@end_s2_prev = e2
|
72
|
+
@beg_s1 = e1
|
73
|
+
|
74
|
+
if @reverse
|
75
|
+
{source:{begin:b2 , end:e2}, target:{begin:b1, end:e1}}
|
76
|
+
else
|
77
|
+
{source:{begin:b1 , end:e1}, target:{begin:b2, end:e2}}
|
78
|
+
end
|
79
|
+
end
|
80
|
+
|
81
|
+
private
|
82
|
+
|
83
|
+
def get_left_windows
|
84
|
+
return if @beg_s1 < @size_window || @beg_s2 < @size_window
|
85
|
+
|
86
|
+
window_s1 = ''
|
87
|
+
loc = @beg_s1 - 1
|
88
|
+
count = 0
|
89
|
+
while count < @size_window && loc >= 0
|
90
|
+
if @s1[loc] =~ /[0-9a-zA-Z]/
|
91
|
+
window_s1 += @s1[loc]
|
92
|
+
count += 1
|
93
|
+
end
|
94
|
+
loc -= 1
|
95
|
+
end
|
96
|
+
|
97
|
+
window_s2 = ''
|
98
|
+
loc = @beg_s2 - 1
|
99
|
+
count = 0
|
100
|
+
while count < @size_window && loc >= 0
|
101
|
+
if @s2[loc] =~ /[0-9a-zA-Z]/
|
102
|
+
window_s2 += @s2[loc]
|
103
|
+
count += 1
|
104
|
+
end
|
105
|
+
loc -= 1
|
106
|
+
end
|
107
|
+
|
108
|
+
[window_s1, window_s2]
|
109
|
+
end
|
110
|
+
|
111
|
+
def get_right_windows
|
112
|
+
return if (@beg_s1 + @size_ngram > (@s1.length - @size_window)) || (@beg_s2 + @size_ngram > (@s2.length - @size_window))
|
113
|
+
|
114
|
+
window_s1 = ''
|
115
|
+
loc = @beg_s1 + @size_ngram
|
116
|
+
len_s1 = @s1.length
|
117
|
+
count = 0
|
118
|
+
while count < @size_window && loc < len_s1
|
119
|
+
if @s1[loc] =~ /[0-9a-zA-Z]/
|
120
|
+
window_s1 += @s1[loc]
|
121
|
+
count += 1
|
122
|
+
end
|
123
|
+
loc += 1
|
124
|
+
end
|
125
|
+
|
126
|
+
window_s2 = ''
|
127
|
+
loc = @beg_s2 + @size_ngram
|
128
|
+
len_s2 = @s2.length
|
129
|
+
count = 0
|
130
|
+
while count < @size_window && loc < len_s2
|
131
|
+
if @s2[loc] =~ /[0-9a-zA-Z]/
|
132
|
+
window_s2 += @s2[loc]
|
133
|
+
count += 1
|
134
|
+
end
|
135
|
+
loc += 1
|
136
|
+
end
|
137
|
+
|
138
|
+
[window_s1, window_s2]
|
139
|
+
end
|
140
|
+
|
141
|
+
def text_similarity(str1, str2, ngram_order = 2)
|
142
|
+
return 0 if str1.nil? || str2.nil?
|
143
|
+
String::Similarity.cosine(str1, str2, ngram:ngram_order)
|
144
|
+
end
|
145
|
+
|
146
|
+
end
|
@@ -4,75 +4,73 @@ require 'string-similarity'
|
|
4
4
|
module TextAlignment; end unless defined? TextAlignment
|
5
5
|
|
6
6
|
# approximate the location of str1 in str2
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
TEXT_SIMILARITY_TRESHOLD = 0.7
|
12
|
-
end
|
7
|
+
TextAlignment::SIGNATURE_NGRAM = 7 unless defined? TextAlignment::SIGNATURE_NGRAM
|
8
|
+
TextAlignment::MIN_LENGTH_FOR_APPROXIMATION = 50 unless defined? TextAlignment::MIN_LENGTH_FOR_APPROXIMATION
|
9
|
+
TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
|
10
|
+
TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.7 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD
|
13
11
|
|
14
12
|
class << TextAlignment
|
15
13
|
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
14
|
+
# It finds an approximate region of str2 that contains str1
|
15
|
+
def approximate_fit(str1, str2)
|
16
|
+
raise ArgumentError, 'nil string' if str1.nil? || str2.nil?
|
17
|
+
return 0, str2.length if str2.length < TextAlignment::MIN_LENGTH_FOR_APPROXIMATION
|
20
18
|
|
21
|
-
|
22
|
-
|
23
|
-
|
19
|
+
ngram1 = (0 .. str1.length - TextAlignment::SIGNATURE_NGRAM).collect{|i| str1[i, TextAlignment::SIGNATURE_NGRAM]}
|
20
|
+
ngram2 = (0 .. str2.length - TextAlignment::SIGNATURE_NGRAM).collect{|i| str2[i, TextAlignment::SIGNATURE_NGRAM]}
|
21
|
+
ngram_shared = ngram1 & ngram2
|
24
22
|
|
25
|
-
|
26
|
-
|
23
|
+
# If there is no shared n-gram found, it may mean there is no serious overlap between the two strings
|
24
|
+
return nil, nil if ngram_shared.empty?
|
27
25
|
|
28
|
-
|
29
|
-
|
26
|
+
signature_ngrams = ngram_shared.select{|g| ngram2.count(g) == 1}
|
27
|
+
return nil, nil if signature_ngrams.empty? #raise "no signature ngram"
|
30
28
|
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
29
|
+
cache = {}
|
30
|
+
fit_begin, fit_end = nil, nil
|
31
|
+
signature_ngrams.each do |signature_ngram|
|
32
|
+
loc_signature_ngram_in_str1 = str1.index(signature_ngram)
|
33
|
+
loc_signature_ngram_in_str2 = str2.index(signature_ngram)
|
36
34
|
|
37
|
-
|
38
|
-
|
39
|
-
|
35
|
+
# approximate the beginning of the fit
|
36
|
+
fit_begin = loc_signature_ngram_in_str2 - loc_signature_ngram_in_str1 - (loc_signature_ngram_in_str1 * TextAlignment::BUFFER_RATE).to_i
|
37
|
+
fit_begin = 0 if fit_begin < 0
|
40
38
|
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
39
|
+
# approximate the end of the fit
|
40
|
+
offset_end = str1.length - loc_signature_ngram_in_str1
|
41
|
+
fit_end = loc_signature_ngram_in_str2 + offset_end + (offset_end * TextAlignment::BUFFER_RATE).to_i
|
42
|
+
fit_end = str2.length if fit_end > str2.length
|
45
43
|
|
46
|
-
|
47
|
-
|
48
|
-
|
44
|
+
next if cache.has_key?("#{fit_begin}-#{fit_end}")
|
45
|
+
text_similarity = text_similarity(str1, str2[fit_begin ... fit_end])
|
46
|
+
cache["#{fit_begin}-#{fit_end}"] = text_similarity
|
49
47
|
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
48
|
+
break if text_similarity > TextAlignment::TEXT_SIMILARITY_TRESHOLD
|
49
|
+
fit_begin, fit_end = nil, nil
|
50
|
+
end
|
51
|
+
return fit_begin, fit_end if fit_begin && fit_end && fit_begin < fit_end
|
52
|
+
return nil, nil
|
53
|
+
end
|
56
54
|
|
57
|
-
|
55
|
+
private
|
58
56
|
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
57
|
+
def text_similarity(str1, str2, ngram_order = 3)
|
58
|
+
_str1 = str1.delete(" \t\r\n")
|
59
|
+
_str2 = str2.delete(" \t\r\n")
|
60
|
+
String::Similarity.cosine(_str1, _str2, ngram:2)
|
61
|
+
end
|
64
62
|
|
65
63
|
end
|
66
64
|
|
67
65
|
if __FILE__ == $0
|
68
|
-
|
66
|
+
require 'json'
|
69
67
|
|
70
|
-
|
71
|
-
|
72
|
-
|
68
|
+
if ARGV.length == 2
|
69
|
+
str1 = JSON.parse(File.read(ARGV[0]).strip)["text"]
|
70
|
+
str2 = JSON.parse(File.read(ARGV[1]).strip)["text"]
|
73
71
|
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
72
|
+
loc = TextAlignment::approximate_fit(str1, str2)
|
73
|
+
p loc
|
74
|
+
puts str2[loc[0]...loc[1]]
|
75
|
+
end
|
78
76
|
end
|