text_alignment 0.2.9 → 0.3.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 8c1c45ed630cdd60291606b59e1944f0b854a689cfa0d281ae8b8879bf01e806
4
- data.tar.gz: 9b33688f3a08f9110f556b4357fbe598d42d1147bc06933323cae4df7187341f
3
+ metadata.gz: f6f98465bb47a2b241dda72c8532530f5c7fdf4de49a403366bd08c256b7ff0e
4
+ data.tar.gz: 44a6c920f8f05ab3ee29a0b9fe4de38e2f6fac2386838625b77d99486189ebf0
5
5
  SHA512:
6
- metadata.gz: 856a8fca63f80be4cea7f6beff85dcf475d9237d68bc96728bcfcc030397f414637d2a8e32b139a6fdbccfd8603d327738ff7f3f59d80dc9e61e55a11a04bf20
7
- data.tar.gz: 6055f50827354461f194a50da74259e8d97473be9b085634df270964bb815d96d9f916bbad2ef3961ddd845291a80e13b1680f1c42bafeab156b0d59fa3ba952
6
+ metadata.gz: 17f038d6d7366b8223cdd66b5ef9f3d79c8ecc39f432ac15dbfd0f3311e1197bc9c40c5cbd38a69d5778278405dcd100bc18187870ee563a7e5999246845b049
7
+ data.tar.gz: f0ded392d47821bc99c640700955686f14cc9550a13b3b8141af2af7f88f79400a3de6632f2bc3223c9e0dc82311d461de84a5ffa16aff443394b3c76540a74c
@@ -1,51 +1,237 @@
1
1
  #!/usr/bin/env ruby
2
2
  require 'text_alignment'
3
-
4
3
  require 'json'
5
4
  require 'pp'
6
5
 
6
+ def read_annotations(filename)
7
+ case File.extname(filename)
8
+ when '.json'
9
+ JSON.parse File.read(filename), :symbolize_names => true
10
+ when '.txt'
11
+ {text: File.read(filename)}
12
+ else
13
+ raise "unknown file type: #{filename}"
14
+ end
15
+ end
16
+
17
+ def read_text(filename)
18
+ case File.extname(filename)
19
+ when '.json'
20
+ json = JSON.parse File.read(filename), :symbolize_names => true
21
+ json[:text]
22
+ when '.txt'
23
+ File.read(filename)
24
+ else
25
+ raise "unknown file type: #{filename}"
26
+ end
27
+ end
28
+
29
+ def align_mdoc(source_annotations, target_annotations)
30
+ idnum_denotations = 0
31
+ idnum_relations = 0
32
+ idnum_attributes = 0
33
+ idnum_modifications = 0
34
+
35
+ source_annotations.each do |annotations|
36
+ alignment = TextAlignment::TextAlignment.new(annotations[:text], target_annotations[:text])
37
+
38
+ # alignment.block_alignments.each do |a|
39
+ # p {source:a[:source], target:a[:target]}
40
+ # puts "--"
41
+ # p a[:alignment] if a[:alignment].nil? || a[:alignment] == :empty
42
+ # puts "--"
43
+ # puts annotations[:text][a[:source][:begin] ... a[:source][:end]]
44
+ # puts "--"
45
+ # puts target_text[a[:target][:begin] ... a[:target][:end]]
46
+ # puts "======"
47
+ # end
48
+
49
+ if annotations.has_key?(:denotations) && !annotations[:denotations].empty?
50
+ ididx = {}
51
+ denotations = alignment.transform_hdenotations(annotations[:denotations])
52
+ denotations.each do |d|
53
+ reid = 'T' + (idnum_denotations += 1).to_s
54
+ ididx[d[:id]] = reid
55
+ d[:id] = reid
56
+ end
57
+ target_annotations[:denotations] = [] unless target_annotations.has_key? :denotations
58
+ target_annotations[:denotations] += denotations
59
+
60
+ if annotations.has_key?(:relations) && !annotations[:relations].empty?
61
+ target_annotations[:relations] = [] unless target_annotations.has_key? :relations
62
+ annotations[:relations].each do |r|
63
+ reid = 'R' + (idnum_relations += 1).to_s
64
+ ididx[r[:id]] = reid
65
+ target_annotations[:relations] << r.dup.merge({id:reid, subj:ididx[r[:subj]], obj:ididx[r[:obj]]})
66
+ end
67
+ end
68
+
69
+ if annotations.has_key?(:attributes) && !annotations[:attributes].empty?
70
+ target_annotations[:attributes] = [] unless target_annotations.has_key? :attributes
71
+ annotations[:attributes].each do |a|
72
+ reid = 'A' + (idnum_attributes += 1).to_s
73
+ ididx[a[:id]] = reid
74
+ target_annotations[:attributes] << a.dup.merge({id:reid, subj:ididx[a[:subj]]})
75
+ end
76
+ end
77
+
78
+ if annotations.has_key?(:modifications) && !annotations[:modifications].empty?
79
+ target_annotations[:modifications] = [] unless target_annotations.has_key? :modifications
80
+ annotations[:modifications].each do |m|
81
+ reid = 'M' + (idnum_modifications += 1).to_s
82
+ ididx[m[:id]] = reid
83
+ target_annotations[:modifications] << m.dup.merge({id:reid, obj:ididx[m[:obj]]})
84
+ end
85
+ end
86
+ end
87
+ end
88
+ target_annotations
89
+ end
90
+
91
+
7
92
  unless ARGV.length == 2
8
- warn "align_annotations target_annotations(.json) reference_annotations(.json)"
93
+ warn "align_annotations target_annotations(.json|.txt) reference_annotations(.json|.txt)"
9
94
  exit
10
95
  end
11
96
 
12
- anns1 = JSON.parse File.read(ARGV[0].strip), :symbolize_names => true
13
- anns2 = JSON.parse File.read(ARGV[1].strip), :symbolize_names => true
14
-
15
- str1 = anns1[:text]
16
- str2 = anns2[:text]
17
-
18
- denotations = anns1[:denotations]
19
-
20
- puts "[Alignment1]====="
21
- align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::MAPPINGS)
22
- puts TextAlignment::sdiff2cdiff(align.sdiff)
23
- puts
24
- puts "[Similarity]\n#{align.similarity}"
25
- puts
26
- puts '[Denotations original]'
27
- pp denotations
28
- puts
29
- puts '[Denotations transformed]'
30
- new_denotations = align.transform_hdenotations(denotations)
31
- pp new_denotations
32
- puts
33
- puts "[Alignment2 (downcased)]====="
34
- align = TextAlignment::TextAlignment.new(str1.downcase, str2.downcase, TextAlignment::MAPPINGS)
35
- puts TextAlignment::sdiff2cdiff(align.sdiff)
36
- puts
37
- puts "[Similarity]\n#{align.similarity}"
38
- puts
39
- puts '[Denotations original]'
40
- pp denotations
41
- puts
42
- puts '[Denotations transformed]'
43
- new_denotations = align.transform_hdenotations(denotations)
44
- pp new_denotations
45
- puts
46
- puts '[Annotations transformed]'
47
- anns2[:denotations] = new_denotations
48
- puts anns2.to_json
97
+ source_annotations = read_annotations(ARGV[0])
98
+ target_text = read_text(ARGV[1])
99
+
100
+ lost_annotations = []
101
+ target_annotations = if source_annotations.class == Array
102
+ align_mdoc(source_annotations, {text: target_text})
103
+ else
104
+ alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text)
105
+
106
+ # verification
107
+ source_text = source_annotations[:text]
108
+ puts "=====BEGIN"
109
+ (0 ... source_text.length).each do |p|
110
+ t = alignment.transform_begin_position(p)
111
+ if t.nil?
112
+ print source_text[p]
113
+ else
114
+ print '.'
115
+ end
116
+ end
117
+ puts
118
+ puts "=====END"
119
+
120
+ puts "=====BEGIN"
121
+ (0 .. source_text.length).each do |p|
122
+ t = alignment.transform_end_position(p)
123
+ if t.nil?
124
+ print source_text[p]
125
+ else
126
+ print '.'
127
+ end
128
+ end
129
+ puts
130
+ puts "=====END"
131
+
132
+ # pp alignment
133
+
134
+ # alignment.block_alignments.each do |a|
135
+ # if a[:alignment].nil? || a[:alignment] == :empty
136
+ # # p [a[:source], a[:target]]
137
+ # # p a[:alignment]
138
+ # else
139
+ # p [a[:source], a[:target]]
140
+ # p a[:alignment].similarity
141
+ # puts "--"
142
+ # puts source_annotations[:text][a[:source][:begin] ... a[:source][:end]]
143
+ # puts "--"
144
+ # puts target_text[a[:target][:begin] ... a[:target][:end]]
145
+ # puts "======"
146
+ # end
147
+ # end
148
+ # exit
149
+
150
+ # verification of source denotations
151
+ puts "[Invalid source denotations]"
152
+ source_annotations[:denotations] do |d|
153
+ p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < source_text.length
154
+ end
155
+ puts "====="
156
+
157
+ denotations = alignment.transform_hdenotations(source_annotations[:denotations])
158
+ lost_annotations += alignment.lost_annotations if alignment.lost_annotations
159
+
160
+ source_annotations.merge({text:target_text, denotations:denotations})
161
+ end
162
+
163
+ num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source = if source_annotations.class == Array
164
+ num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source = 0, 0, 0, 0
165
+ source_annotations.each do |annotations|
166
+ num_denotations_source += annotations[:denotations].nil? ? 0 : annotations[:denotations].length
167
+ num_relations_source += annotations[:relations].nil? ? 0 : annotations[:relations].length
168
+ num_attributes_source += annotations[:attributes].nil? ? 0 : annotations[:attributes].length
169
+ num_modifications_source += annotations[:modifications].nil? ? 0 : annotations[:modifications].length
170
+ end
171
+ [num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source]
172
+ else
173
+ num_denotations_source = source_annotations[:denotations].nil? ? 0 : source_annotations[:denotations].length
174
+ num_relations_source = source_annotations[:relations].nil? ? 0 : source_annotations[:relations].length
175
+ num_attributes_source = source_annotations[:attributes].nil? ? 0 : source_annotations[:attributes].length
176
+ num_modifications_source = source_annotations[:modifications].nil? ? 0 : source_annotations[:modifications].length
177
+ [num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source]
178
+ end
179
+
180
+ warn "[source]"
181
+ warn "denotations:\t#{num_denotations_source}"
182
+ # warn "relations:\t#{num_relations_source}"
183
+ # warn "attributes:\t#{num_attributes_source}"
184
+ # warn "modifications:\t#{num_modifications_source}"
185
+
186
+ warn "\n[target]"
187
+ warn "denotations:\t#{target_annotations[:denotations].nil? ? 0 : target_annotations[:denotations].length}"
188
+ # warn "relations:\t#{target_annotations[:relations].nil? ? 0 : target_annotations[:relations].length}"
189
+ # warn "attributes:\t#{target_annotations[:attributes].nil? ? 0 : target_annotations[:attributes].length}"
190
+ # warn "modifications:\t#{target_annotations[:modifications].nil? ? 0 : target_annotations[:modifications].length}"
191
+
192
+ if lost_annotations
193
+ warn "\n[lost annotations]"
194
+ warn "#{lost_annotations.length}"
195
+ end
196
+
197
+ #puts target_annotations.to_json
198
+
199
+ # denotations = anns1[:denotations]
200
+
201
+ # puts "[Alignment1]====="
202
+ # align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::MAPPINGS)
203
+
204
+ # align.alignment.each do |a|
205
+ # p [a[:target][:begin], a[:target][:end], a[:source][:begin], a[:source][:end]]
206
+ # end
207
+
208
+ # puts TextAlignment::sdiff2cdiff(align.sdiff)
209
+ # puts
210
+ # puts "[Similarity]\n#{align.similarity}"
211
+ # puts
212
+ # puts '[Denotations original]'
213
+ # pp denotations
214
+ # puts
215
+ # puts '[Denotations transformed]'
216
+ # new_denotations = align.transform_hdenotations(denotations)
217
+ # pp new_denotations
218
+ # puts
219
+ # puts "[Alignment2 (downcased)]====="
220
+ # align = TextAlignment::TextAlignment.new(str1.downcase, str2.downcase, TextAlignment::MAPPINGS)
221
+ # puts TextAlignment::sdiff2cdiff(align.sdiff)
222
+ # puts
223
+ # puts "[Similarity]\n#{align.similarity}"
224
+ # puts
225
+ # puts '[Denotations original]'
226
+ # pp denotations
227
+ # puts
228
+ # puts '[Denotations transformed]'
229
+ # new_denotations = align.transform_hdenotations(denotations)
230
+ # pp new_denotations
231
+ # puts
232
+ # puts '[Annotations transformed]'
233
+ # anns2[:denotations] = new_denotations
234
+ # puts anns2.to_json
49
235
 
50
236
  # p align.common_elements
51
237
  # puts "---------------"
@@ -0,0 +1,146 @@
1
+ #!/usr/bin/env ruby
2
+ require 'string-similarity'
3
+
4
+ module TextAlignment; end unless defined? TextAlignment
5
+
6
+ TextAlignment::SIZE_NGRAM = 10 unless defined? TextAlignment::SIZE_NGRAM
7
+ TextAlignment::SIZE_WINDOW = 20 unless defined? TextAlignment::SIZE_WINDOW
8
+ TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.8 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD
9
+
10
+ class TextAlignment::AnchorFinder
11
+
12
+ def initialize(source_str, target_str, _size_ngram = nil, _size_window = nil)
13
+ @size_ngram = _size_ngram || TextAlignment::SIZE_NGRAM
14
+ @size_window = _size_window || TextAlignment::SIZE_WINDOW
15
+
16
+ @reverse = (target_str.length < source_str.length)
17
+
18
+ @s1, @s2 = if @reverse
19
+ [target_str.downcase, source_str.downcase]
20
+ else
21
+ [source_str.downcase, target_str.downcase]
22
+ end
23
+
24
+ # current position in s1
25
+ @beg_s1 = 0
26
+ @end_s2_prev = 0
27
+ end
28
+
29
+ def get_next_anchor
30
+ # find the position of an anchor ngram in s1 and s2
31
+ while @beg_s1 < (@s1.length - @size_ngram)
32
+ anchor = @s1[@beg_s1, @size_ngram]
33
+
34
+ search_position = 0
35
+ # search_position = @end_s2_prev
36
+ while @beg_s2 = @s2.index(anchor, search_position)
37
+ # if both the beginning points are sufficiently close to the end points of the last match
38
+ break if @end_s1_prev && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 >= @end_s2_prev) && (@beg_s2 - @end_s2_prev < 5)
39
+
40
+ left_window_s1, left_window_s2 = get_left_windows
41
+ break if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD)
42
+
43
+ right_window_s1, right_window_s2 = get_right_windows
44
+ break if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD)
45
+
46
+ search_position = @beg_s2 + 1
47
+ end
48
+
49
+ break unless @beg_s2.nil?
50
+
51
+ @beg_s1 += 1
52
+ end
53
+
54
+ return nil if @beg_s1 >= (@s1.length - @size_ngram)
55
+
56
+ # extend the block
57
+ b1 = @beg_s1
58
+ b2 = @beg_s2
59
+ while b1 > -1 && b2 > -1 && @s1[b1] == @s2[b2]
60
+ b1 -= 1; b2 -= 1
61
+ end
62
+ b1 += 1; b2 += 1
63
+
64
+ e1 = @beg_s1 + @size_ngram
65
+ e2 = @beg_s2 + @size_ngram
66
+ while @s1[e1] && @s1[e1] == @s2[e2]
67
+ e1 += 1; e2 += 1
68
+ end
69
+
70
+ @end_s1_prev = e1
71
+ @end_s2_prev = e2
72
+ @beg_s1 = e1
73
+
74
+ if @reverse
75
+ {source:{begin:b2 , end:e2}, target:{begin:b1, end:e1}}
76
+ else
77
+ {source:{begin:b1 , end:e1}, target:{begin:b2, end:e2}}
78
+ end
79
+ end
80
+
81
+ private
82
+
83
+ def get_left_windows
84
+ return if @beg_s1 < @size_window || @beg_s2 < @size_window
85
+
86
+ window_s1 = ''
87
+ loc = @beg_s1 - 1
88
+ count = 0
89
+ while count < @size_window && loc >= 0
90
+ if @s1[loc] =~ /[0-9a-zA-Z]/
91
+ window_s1 += @s1[loc]
92
+ count += 1
93
+ end
94
+ loc -= 1
95
+ end
96
+
97
+ window_s2 = ''
98
+ loc = @beg_s2 - 1
99
+ count = 0
100
+ while count < @size_window && loc >= 0
101
+ if @s2[loc] =~ /[0-9a-zA-Z]/
102
+ window_s2 += @s2[loc]
103
+ count += 1
104
+ end
105
+ loc -= 1
106
+ end
107
+
108
+ [window_s1, window_s2]
109
+ end
110
+
111
+ def get_right_windows
112
+ return if (@beg_s1 + @size_ngram > (@s1.length - @size_window)) || (@beg_s2 + @size_ngram > (@s2.length - @size_window))
113
+
114
+ window_s1 = ''
115
+ loc = @beg_s1 + @size_ngram
116
+ len_s1 = @s1.length
117
+ count = 0
118
+ while count < @size_window && loc < len_s1
119
+ if @s1[loc] =~ /[0-9a-zA-Z]/
120
+ window_s1 += @s1[loc]
121
+ count += 1
122
+ end
123
+ loc += 1
124
+ end
125
+
126
+ window_s2 = ''
127
+ loc = @beg_s2 + @size_ngram
128
+ len_s2 = @s2.length
129
+ count = 0
130
+ while count < @size_window && loc < len_s2
131
+ if @s2[loc] =~ /[0-9a-zA-Z]/
132
+ window_s2 += @s2[loc]
133
+ count += 1
134
+ end
135
+ loc += 1
136
+ end
137
+
138
+ [window_s1, window_s2]
139
+ end
140
+
141
+ def text_similarity(str1, str2, ngram_order = 2)
142
+ return 0 if str1.nil? || str2.nil?
143
+ String::Similarity.cosine(str1, str2, ngram:ngram_order)
144
+ end
145
+
146
+ end
@@ -4,75 +4,73 @@ require 'string-similarity'
4
4
  module TextAlignment; end unless defined? TextAlignment
5
5
 
6
6
  # approximate the location of str1 in str2
7
- module TextAlignment
8
- SIGNATURE_NGRAM = 5
9
- MIN_LENGTH_FOR_APPROXIMATION = 50
10
- BUFFER_RATE = 0.1
11
- TEXT_SIMILARITY_TRESHOLD = 0.7
12
- end
7
+ TextAlignment::SIGNATURE_NGRAM = 7 unless defined? TextAlignment::SIGNATURE_NGRAM
8
+ TextAlignment::MIN_LENGTH_FOR_APPROXIMATION = 50 unless defined? TextAlignment::MIN_LENGTH_FOR_APPROXIMATION
9
+ TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
10
+ TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.7 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD
13
11
 
14
12
  class << TextAlignment
15
13
 
16
- # If finds an approximate region of str2 that contains str1
17
- def approximate_fit(str1, str2)
18
- raise ArgumentError, 'nil string' if str1.nil? || str2.nil?
19
- return 0, str2.length if str2.length < TextAlignment::MIN_LENGTH_FOR_APPROXIMATION
14
+ # It finds an approximate region of str2 that contains str1
15
+ def approximate_fit(str1, str2)
16
+ raise ArgumentError, 'nil string' if str1.nil? || str2.nil?
17
+ return 0, str2.length if str2.length < TextAlignment::MIN_LENGTH_FOR_APPROXIMATION
20
18
 
21
- ngram1 = (0 .. str1.length - TextAlignment::SIGNATURE_NGRAM).collect{|i| str1[i, TextAlignment::SIGNATURE_NGRAM]}
22
- ngram2 = (0 .. str2.length - TextAlignment::SIGNATURE_NGRAM).collect{|i| str2[i, TextAlignment::SIGNATURE_NGRAM]}
23
- ngram_shared = ngram1 & ngram2
19
+ ngram1 = (0 .. str1.length - TextAlignment::SIGNATURE_NGRAM).collect{|i| str1[i, TextAlignment::SIGNATURE_NGRAM]}
20
+ ngram2 = (0 .. str2.length - TextAlignment::SIGNATURE_NGRAM).collect{|i| str2[i, TextAlignment::SIGNATURE_NGRAM]}
21
+ ngram_shared = ngram1 & ngram2
24
22
 
25
- # If there is no shared n-gram found, it may mean there is no serious overlap between the two strings
26
- return nil, nil if ngram_shared.empty?
23
+ # If there is no shared n-gram found, it may mean there is no serious overlap between the two strings
24
+ return nil, nil if ngram_shared.empty?
27
25
 
28
- signature_ngrams = ngram_shared.select{|g| ngram2.count(g) == 1}
29
- return nil, nil if signature_ngrams.empty? #raise "no signature ngram"
26
+ signature_ngrams = ngram_shared.select{|g| ngram2.count(g) == 1}
27
+ return nil, nil if signature_ngrams.empty? #raise "no signature ngram"
30
28
 
31
- cache = {}
32
- fit_begin, fit_end = nil, nil
33
- signature_ngrams.each do |signature_ngram|
34
- loc_signature_ngram_in_str1 = str1.index(signature_ngram)
35
- loc_signature_ngram_in_str2 = str2.index(signature_ngram)
29
+ cache = {}
30
+ fit_begin, fit_end = nil, nil
31
+ signature_ngrams.each do |signature_ngram|
32
+ loc_signature_ngram_in_str1 = str1.index(signature_ngram)
33
+ loc_signature_ngram_in_str2 = str2.index(signature_ngram)
36
34
 
37
- # approximate the beginning of the fit
38
- fit_begin = loc_signature_ngram_in_str2 - loc_signature_ngram_in_str1 - (loc_signature_ngram_in_str1 * TextAlignment::BUFFER_RATE).to_i
39
- fit_begin = 0 if fit_begin < 0
35
+ # approximate the beginning of the fit
36
+ fit_begin = loc_signature_ngram_in_str2 - loc_signature_ngram_in_str1 - (loc_signature_ngram_in_str1 * TextAlignment::BUFFER_RATE).to_i
37
+ fit_begin = 0 if fit_begin < 0
40
38
 
41
- # approximate the end of the fit
42
- offset_end = str1.length - loc_signature_ngram_in_str1
43
- fit_end = loc_signature_ngram_in_str2 + offset_end + (offset_end * TextAlignment::BUFFER_RATE).to_i
44
- fit_end = str2.length if fit_end > str2.length
39
+ # approximate the end of the fit
40
+ offset_end = str1.length - loc_signature_ngram_in_str1
41
+ fit_end = loc_signature_ngram_in_str2 + offset_end + (offset_end * TextAlignment::BUFFER_RATE).to_i
42
+ fit_end = str2.length if fit_end > str2.length
45
43
 
46
- next if cache.has_key?("#{fit_begin}-#{fit_end}")
47
- text_similarity = text_similarity(str1, str2[fit_begin ... fit_end])
48
- cache["#{fit_begin}-#{fit_end}"] = text_similarity
44
+ next if cache.has_key?("#{fit_begin}-#{fit_end}")
45
+ text_similarity = text_similarity(str1, str2[fit_begin ... fit_end])
46
+ cache["#{fit_begin}-#{fit_end}"] = text_similarity
49
47
 
50
- break if text_similarity > TextAlignment::TEXT_SIMILARITY_TRESHOLD
51
- fit_begin, fit_end = nil, nil
52
- end
53
- return fit_begin, fit_end if fit_begin && fit_end && fit_begin < fit_end
54
- return nil, nil
55
- end
48
+ break if text_similarity > TextAlignment::TEXT_SIMILARITY_TRESHOLD
49
+ fit_begin, fit_end = nil, nil
50
+ end
51
+ return fit_begin, fit_end if fit_begin && fit_end && fit_begin < fit_end
52
+ return nil, nil
53
+ end
56
54
 
57
- private
55
+ private
58
56
 
59
- def text_similarity(str1, str2, ngram_order = 3)
60
- _str1 = str1.delete(" \t\r\n")
61
- _str2 = str2.delete(" \t\r\n")
62
- String::Similarity.cosine(_str1, _str2, ngram:2)
63
- end
57
+ def text_similarity(str1, str2, ngram_order = 3)
58
+ _str1 = str1.delete(" \t\r\n")
59
+ _str2 = str2.delete(" \t\r\n")
60
+ String::Similarity.cosine(_str1, _str2, ngram:2)
61
+ end
64
62
 
65
63
  end
66
64
 
67
65
  if __FILE__ == $0
68
- require 'json'
66
+ require 'json'
69
67
 
70
- if ARGV.length == 2
71
- str1 = JSON.parse(File.read(ARGV[0]).strip)["text"]
72
- str2 = JSON.parse(File.read(ARGV[1]).strip)["text"]
68
+ if ARGV.length == 2
69
+ str1 = JSON.parse(File.read(ARGV[0]).strip)["text"]
70
+ str2 = JSON.parse(File.read(ARGV[1]).strip)["text"]
73
71
 
74
- loc = TextAlignment::approximate_fit(str1, str2)
75
- p loc
76
- puts str2[loc[0]...loc[1]]
77
- end
72
+ loc = TextAlignment::approximate_fit(str1, str2)
73
+ p loc
74
+ puts str2[loc[0]...loc[1]]
75
+ end
78
76
  end