text_alignment 0.2.9 → 0.3.14

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 8c1c45ed630cdd60291606b59e1944f0b854a689cfa0d281ae8b8879bf01e806
4
- data.tar.gz: 9b33688f3a08f9110f556b4357fbe598d42d1147bc06933323cae4df7187341f
3
+ metadata.gz: f6f98465bb47a2b241dda72c8532530f5c7fdf4de49a403366bd08c256b7ff0e
4
+ data.tar.gz: 44a6c920f8f05ab3ee29a0b9fe4de38e2f6fac2386838625b77d99486189ebf0
5
5
  SHA512:
6
- metadata.gz: 856a8fca63f80be4cea7f6beff85dcf475d9237d68bc96728bcfcc030397f414637d2a8e32b139a6fdbccfd8603d327738ff7f3f59d80dc9e61e55a11a04bf20
7
- data.tar.gz: 6055f50827354461f194a50da74259e8d97473be9b085634df270964bb815d96d9f916bbad2ef3961ddd845291a80e13b1680f1c42bafeab156b0d59fa3ba952
6
+ metadata.gz: 17f038d6d7366b8223cdd66b5ef9f3d79c8ecc39f432ac15dbfd0f3311e1197bc9c40c5cbd38a69d5778278405dcd100bc18187870ee563a7e5999246845b049
7
+ data.tar.gz: f0ded392d47821bc99c640700955686f14cc9550a13b3b8141af2af7f88f79400a3de6632f2bc3223c9e0dc82311d461de84a5ffa16aff443394b3c76540a74c
@@ -1,51 +1,237 @@
1
1
  #!/usr/bin/env ruby
2
2
  require 'text_alignment'
3
-
4
3
  require 'json'
5
4
  require 'pp'
6
5
 
6
# Load an annotation document from +filename+.
#
# A '.json' file is parsed with symbolized keys and returned as parsed (a Hash,
# or an Array when the file holds a JSON array). A '.txt' file is wrapped as
# {text: <file contents>}. The extension is matched case-insensitively, so
# 'doc.JSON' and 'doc.TXT' are accepted as well.
#
# Raises RuntimeError ("unknown file type: ...") for any other extension.
def read_annotations(filename)
  case File.extname(filename).downcase
  when '.json'
    JSON.parse(File.read(filename), symbolize_names: true)
  when '.txt'
    { text: File.read(filename) }
  else
    raise "unknown file type: #{filename}"
  end
end
16
+
17
# Read only the text of a document from +filename+.
#
# For a '.json' file the value under the "text" key of the parsed document is
# returned; for a '.txt' file the whole file content is returned. The
# extension is matched case-insensitively.
#
# Raises RuntimeError ("unknown file type: ...") for any other extension.
def read_text(filename)
  case File.extname(filename).downcase
  when '.json'
    json = JSON.parse(File.read(filename), symbolize_names: true)
    json[:text]
  when '.txt'
    File.read(filename)
  else
    raise "unknown file type: #{filename}"
  end
end
28
+
29
# Merge the annotations of several source documents onto one target document.
#
# source_annotations - Array of annotation Hashes, each with :text and
#                      optionally :denotations, :relations, :attributes and
#                      :modifications.
# target_annotations - Hash with at least :text; it is updated in place.
#
# Each source document that has denotations is aligned against the target
# text and its denotations are transformed through that alignment. All merged
# annotations get fresh ids (T1.., R1.., A1.., M1..) numbered across the whole
# run, and :subj / :obj references are rewritten to the new ids. Relations,
# attributes and modifications are carried over only for documents that have
# denotations. A key that is missing or holds nil is treated as "no
# annotations of that type".
#
# Returns target_annotations.
def align_mdoc(source_annotations, target_annotations)
  idnum_denotations = 0
  idnum_relations = 0
  idnum_attributes = 0
  idnum_modifications = 0

  source_annotations.each do |annotations|
    source_denotations = annotations[:denotations]
    # Nothing is merged for a document without denotations, so skip it before
    # building an alignment for it.
    next if source_denotations.nil? || source_denotations.empty?

    alignment = TextAlignment::TextAlignment.new(annotations[:text], target_annotations[:text])

    # Maps the ids used inside this source document to the newly issued ids.
    ididx = {}

    denotations = alignment.transform_hdenotations(source_denotations)
    denotations.each do |d|
      reid = "T#{idnum_denotations += 1}"
      ididx[d[:id]] = reid
      d[:id] = reid
    end
    target_annotations[:denotations] = (target_annotations[:denotations] || []) + denotations

    (annotations[:relations] || []).each do |r|
      reid = "R#{idnum_relations += 1}"
      ididx[r[:id]] = reid
      target_annotations[:relations] ||= []
      target_annotations[:relations] << r.merge(id: reid, subj: ididx[r[:subj]], obj: ididx[r[:obj]])
    end

    (annotations[:attributes] || []).each do |a|
      reid = "A#{idnum_attributes += 1}"
      ididx[a[:id]] = reid
      target_annotations[:attributes] ||= []
      target_annotations[:attributes] << a.merge(id: reid, subj: ididx[a[:subj]])
    end

    (annotations[:modifications] || []).each do |m|
      reid = "M#{idnum_modifications += 1}"
      ididx[m[:id]] = reid
      target_annotations[:modifications] ||= []
      target_annotations[:modifications] << m.merge(id: reid, obj: ididx[m[:obj]])
    end
  end

  target_annotations
end
90
+
91
+
7
92
  unless ARGV.length == 2
8
- warn "align_annotations target_annotations(.json) reference_annotations(.json)"
93
+ warn "align_annotations target_annotations(.json|.txt) reference_annotations(.json|.txt)"
9
94
  exit
10
95
  end
11
96
 
12
- anns1 = JSON.parse File.read(ARGV[0].strip), :symbolize_names => true
13
- anns2 = JSON.parse File.read(ARGV[1].strip), :symbolize_names => true
14
-
15
- str1 = anns1[:text]
16
- str2 = anns2[:text]
17
-
18
- denotations = anns1[:denotations]
19
-
20
- puts "[Alignment1]====="
21
- align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::MAPPINGS)
22
- puts TextAlignment::sdiff2cdiff(align.sdiff)
23
- puts
24
- puts "[Similarity]\n#{align.similarity}"
25
- puts
26
- puts '[Denotations original]'
27
- pp denotations
28
- puts
29
- puts '[Denotations transformed]'
30
- new_denotations = align.transform_hdenotations(denotations)
31
- pp new_denotations
32
- puts
33
- puts "[Alignment2 (downcased)]====="
34
- align = TextAlignment::TextAlignment.new(str1.downcase, str2.downcase, TextAlignment::MAPPINGS)
35
- puts TextAlignment::sdiff2cdiff(align.sdiff)
36
- puts
37
- puts "[Similarity]\n#{align.similarity}"
38
- puts
39
- puts '[Denotations original]'
40
- pp denotations
41
- puts
42
- puts '[Denotations transformed]'
43
- new_denotations = align.transform_hdenotations(denotations)
44
- pp new_denotations
45
- puts
46
- puts '[Annotations transformed]'
47
- anns2[:denotations] = new_denotations
48
- puts anns2.to_json
97
source_annotations = read_annotations(ARGV[0])
target_text = read_text(ARGV[1])

lost_annotations = []
target_annotations = if source_annotations.is_a?(Array)
  # Multi-document source: merge every document's annotations onto the target text.
  align_mdoc(source_annotations, {text: target_text})
else
  source_text = source_annotations[:text]
  alignment = TextAlignment::TextAlignment.new(source_text, target_text)

  # verification: print each source character whose begin position has no
  # counterpart in the target (transform returns nil), and '.' for the rest.
  puts "=====BEGIN"
  (0 ... source_text.length).each do |pos|
    print alignment.transform_begin_position(pos).nil? ? source_text[pos] : '.'
  end
  puts
  puts "=====END"

  # Same check for end positions. An end offset may equal the text length,
  # hence the inclusive range (source_text[length] is nil and prints nothing).
  puts "=====BEGIN"
  (0 .. source_text.length).each do |pos|
    print alignment.transform_end_position(pos).nil? ? source_text[pos] : '.'
  end
  puts
  puts "=====END"

  # verification of source denotations: a valid span satisfies
  # 0 <= begin < end <= source_text.length (the end offset is exclusive).
  puts "[Invalid source denotations]"
  (source_annotations[:denotations] || []).each do |d|
    span = d[:span] || {}
    span_begin = span[:begin]
    span_end = span[:end]
    p d unless span_begin && span_end && span_begin >= 0 && span_begin < span_end && span_end <= source_text.length
  end
  puts "====="

  denotations = alignment.transform_hdenotations(source_annotations[:denotations])
  lost_annotations += alignment.lost_annotations if alignment.lost_annotations

  source_annotations.merge({text: target_text, denotations: denotations})
end
162
+
163
# Count the source annotations of each type. A multi-document source (Array)
# is summed over all of its documents; a missing or nil entry counts as zero.
source_documents = source_annotations.is_a?(Array) ? source_annotations : [source_annotations]
num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source =
  [:denotations, :relations, :attributes, :modifications].map do |type|
    source_documents.inject(0) { |total, annotations| total + (annotations[type] || []).length }
  end

warn "[source]"
warn "denotations:\t#{num_denotations_source}"
# warn "relations:\t#{num_relations_source}"
# warn "attributes:\t#{num_attributes_source}"
# warn "modifications:\t#{num_modifications_source}"

warn "\n[target]"
warn "denotations:\t#{(target_annotations[:denotations] || []).length}"
# warn "relations:\t#{(target_annotations[:relations] || []).length}"
# warn "attributes:\t#{(target_annotations[:attributes] || []).length}"
# warn "modifications:\t#{(target_annotations[:modifications] || []).length}"

# lost_annotations is always an Array at this point, so the count is reported
# unconditionally (0 when nothing was lost).
warn "\n[lost annotations]"
warn lost_annotations.length.to_s
196
+
197
+ #puts target_annotations.to_json
198
+
199
+ # denotations = anns1[:denotations]
200
+
201
+ # puts "[Alignment1]====="
202
+ # align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::MAPPINGS)
203
+
204
+ # align.alignment.each do |a|
205
+ # p [a[:target][:begin], a[:target][:end], a[:source][:begin], a[:source][:end]]
206
+ # end
207
+
208
+ # puts TextAlignment::sdiff2cdiff(align.sdiff)
209
+ # puts
210
+ # puts "[Similarity]\n#{align.similarity}"
211
+ # puts
212
+ # puts '[Denotations original]'
213
+ # pp denotations
214
+ # puts
215
+ # puts '[Denotations transformed]'
216
+ # new_denotations = align.transform_hdenotations(denotations)
217
+ # pp new_denotations
218
+ # puts
219
+ # puts "[Alignment2 (downcased)]====="
220
+ # align = TextAlignment::TextAlignment.new(str1.downcase, str2.downcase, TextAlignment::MAPPINGS)
221
+ # puts TextAlignment::sdiff2cdiff(align.sdiff)
222
+ # puts
223
+ # puts "[Similarity]\n#{align.similarity}"
224
+ # puts
225
+ # puts '[Denotations original]'
226
+ # pp denotations
227
+ # puts
228
+ # puts '[Denotations transformed]'
229
+ # new_denotations = align.transform_hdenotations(denotations)
230
+ # pp new_denotations
231
+ # puts
232
+ # puts '[Annotations transformed]'
233
+ # anns2[:denotations] = new_denotations
234
+ # puts anns2.to_json
49
235
 
50
236
  # p align.common_elements
51
237
  # puts "---------------"
@@ -0,0 +1,146 @@
1
+ #!/usr/bin/env ruby
2
+ require 'string-similarity'
3
+
4
module TextAlignment; end unless defined? TextAlignment

TextAlignment::SIZE_NGRAM = 10 unless defined? TextAlignment::SIZE_NGRAM
TextAlignment::SIZE_WINDOW = 20 unless defined? TextAlignment::SIZE_WINDOW
TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.8 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD

# Finds "anchors": stretches of text that occur identically (ignoring case) in
# a source string and a target string. Anchors are produced one at a time, in
# the order they occur in the shorter of the two strings, by repeated calls to
# #get_next_anchor.
class TextAlignment::AnchorFinder

  # source_str, target_str - the two texts to anchor (compared downcased).
  # _size_ngram  - length of the n-gram used to seed an anchor
  #                (default TextAlignment::SIZE_NGRAM).
  # _size_window - number of alphanumeric characters compared on either side
  #                of a candidate (default TextAlignment::SIZE_WINDOW).
  def initialize(source_str, target_str, _size_ngram = nil, _size_window = nil)
    @size_ngram = _size_ngram || TextAlignment::SIZE_NGRAM
    @size_window = _size_window || TextAlignment::SIZE_WINDOW

    # The scan always walks the shorter string (s1); when that is the target,
    # the roles are swapped back when an anchor is reported.
    @reverse = (target_str.length < source_str.length)

    @s1, @s2 = if @reverse
      [target_str.downcase, source_str.downcase]
    else
      [source_str.downcase, target_str.downcase]
    end

    # current position in s1
    @beg_s1 = 0
    @beg_s2 = nil

    # end positions of the previously reported anchor (none yet for s1)
    @end_s1_prev = nil
    @end_s2_prev = 0
  end

  # Returns the next anchor as
  #   {source: {begin:, end:}, target: {begin:, end:}}
  # (end offsets exclusive), or nil when no further anchor can be found.
  def get_next_anchor
    # Last position in s1 at which a full n-gram still fits. It is included
    # in the scan so that the final n-gram of s1 can also seed an anchor.
    last_start = @s1.length - @size_ngram
    @beg_s2 = nil

    # find the position of an anchor ngram in s1 and s2
    while @beg_s1 <= last_start
      anchor = @s1[@beg_s1, @size_ngram]

      search_position = 0
      while (@beg_s2 = @s2.index(anchor, search_position))
        # accept if both beginning points are sufficiently close to the end
        # points of the last match
        break if continues_previous_anchor?

        # otherwise accept if the text to the left, or to the right, of the
        # candidate is similar enough in both strings
        left_window_s1, left_window_s2 = get_left_windows
        break if left_window_s1 && similar?(left_window_s1, left_window_s2)

        right_window_s1, right_window_s2 = get_right_windows
        break if right_window_s1 && similar?(right_window_s1, right_window_s2)

        search_position = @beg_s2 + 1
      end

      break unless @beg_s2.nil?

      @beg_s1 += 1
    end

    return nil if @beg_s2.nil?

    # extend the block to the left as long as the characters agree
    b1 = @beg_s1
    b2 = @beg_s2
    while b1 > -1 && b2 > -1 && @s1[b1] == @s2[b2]
      b1 -= 1; b2 -= 1
    end
    b1 += 1; b2 += 1

    # ... and to the right
    e1 = @beg_s1 + @size_ngram
    e2 = @beg_s2 + @size_ngram
    while @s1[e1] && @s1[e1] == @s2[e2]
      e1 += 1; e2 += 1
    end

    @end_s1_prev = e1
    @end_s2_prev = e2
    @beg_s1 = e1

    if @reverse
      {source: {begin: b2, end: e2}, target: {begin: b1, end: e1}}
    else
      {source: {begin: b1, end: e1}, target: {begin: b2, end: e2}}
    end
  end

  private

  # True when the current candidate starts fewer than 5 characters after the
  # previous anchor in both strings (and not before it in s2).
  def continues_previous_anchor?
    @end_s1_prev && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 >= @end_s2_prev) && (@beg_s2 - @end_s2_prev < 5)
  end

  # True when the two windows are more similar than the configured threshold.
  def similar?(window_s1, window_s2)
    text_similarity(window_s1, window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD
  end

  # Returns [window_s1, window_s2]: the alphanumeric characters found to the
  # left of the candidate in each string, collected right-to-left. Returns nil
  # when the candidate is closer than the window size to the start of either
  # string.
  def get_left_windows
    return if @beg_s1 < @size_window || @beg_s2 < @size_window

    [collect_window(@s1, @beg_s1 - 1, -1), collect_window(@s2, @beg_s2 - 1, -1)]
  end

  # Returns [window_s1, window_s2]: the alphanumeric characters that follow
  # the candidate n-gram in each string. Returns nil when fewer than the
  # window size of characters remain after the n-gram in either string.
  def get_right_windows
    return if (@beg_s1 + @size_ngram > (@s1.length - @size_window)) || (@beg_s2 + @size_ngram > (@s2.length - @size_window))

    [collect_window(@s1, @beg_s1 + @size_ngram, 1), collect_window(@s2, @beg_s2 + @size_ngram, 1)]
  end

  # Walks +str+ from index +start+ in direction +step+ (+1 or -1), gathering
  # up to @size_window characters that match [0-9a-zA-Z] and skipping all
  # others; stops early at either end of the string.
  def collect_window(str, start, step)
    window = ''.dup
    loc = start
    while window.length < @size_window && loc >= 0 && loc < str.length
      window << str[loc] if str[loc] =~ /[0-9a-zA-Z]/
      loc += step
    end
    window
  end

  # Cosine similarity of the two strings over character n-grams of the given
  # order; 0 when either string is nil.
  def text_similarity(str1, str2, ngram_order = 2)
    return 0 if str1.nil? || str2.nil?
    String::Similarity.cosine(str1, str2, ngram: ngram_order)
  end

end
@@ -4,75 +4,73 @@ require 'string-similarity'
4
4
module TextAlignment; end unless defined? TextAlignment

# approximate the location of str1 in str2
TextAlignment::SIGNATURE_NGRAM = 7 unless defined? TextAlignment::SIGNATURE_NGRAM
TextAlignment::MIN_LENGTH_FOR_APPROXIMATION = 50 unless defined? TextAlignment::MIN_LENGTH_FOR_APPROXIMATION
TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
# NOTE(review): this constant is guarded with `unless defined?`; if another
# file of the library defines it first with a different value, that value is
# the one used here - confirm the intended threshold.
TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.7 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD

class << TextAlignment

  # Finds an approximate region of str2 that contains str1.
  #
  # Returns [fit_begin, fit_end] (offsets into str2, end exclusive):
  # * [0, str2.length] when str2 is shorter than MIN_LENGTH_FOR_APPROXIMATION;
  # * otherwise the first candidate region whose similarity to str1 exceeds
  #   TEXT_SIMILARITY_TRESHOLD. Candidates are derived from "signature"
  #   n-grams, i.e. n-grams of str1 that occur exactly once in str2, with the
  #   region padded on both sides by BUFFER_RATE;
  # * [nil, nil] when no candidate region is similar enough.
  #
  # Raises ArgumentError when either string is nil.
  def approximate_fit(str1, str2)
    raise ArgumentError, 'nil string' if str1.nil? || str2.nil?
    return 0, str2.length if str2.length < TextAlignment::MIN_LENGTH_FOR_APPROXIMATION

    ngram1 = (0 .. str1.length - TextAlignment::SIGNATURE_NGRAM).collect{|i| str1[i, TextAlignment::SIGNATURE_NGRAM]}
    ngram2 = (0 .. str2.length - TextAlignment::SIGNATURE_NGRAM).collect{|i| str2[i, TextAlignment::SIGNATURE_NGRAM]}

    # Count every n-gram of str2 once up front instead of rescanning str2's
    # n-gram list for each candidate.
    occurrences = Hash.new(0)
    ngram2.each{|g| occurrences[g] += 1}

    # If there is no shared n-gram, or none that is unique in str2, there is
    # nothing to locate str1 by.
    signature_ngrams = ngram1.uniq.select{|g| occurrences[g] == 1}
    return nil, nil if signature_ngrams.empty?

    # Regions already evaluated; a region only stays in here after it failed
    # the similarity test, so meeting it again must never yield a result.
    tried = {}
    signature_ngrams.each do |signature_ngram|
      loc_signature_ngram_in_str1 = str1.index(signature_ngram)
      loc_signature_ngram_in_str2 = str2.index(signature_ngram)

      # approximate the beginning of the fit
      fit_begin = loc_signature_ngram_in_str2 - loc_signature_ngram_in_str1 - (loc_signature_ngram_in_str1 * TextAlignment::BUFFER_RATE).to_i
      fit_begin = 0 if fit_begin < 0

      # approximate the end of the fit
      offset_end = str1.length - loc_signature_ngram_in_str1
      fit_end = loc_signature_ngram_in_str2 + offset_end + (offset_end * TextAlignment::BUFFER_RATE).to_i
      fit_end = str2.length if fit_end > str2.length

      region = [fit_begin, fit_end]
      next if tried.has_key?(region)
      tried[region] = true

      next unless fit_begin < fit_end
      return fit_begin, fit_end if text_similarity(str1, str2[fit_begin ... fit_end]) > TextAlignment::TEXT_SIMILARITY_TRESHOLD
    end

    return nil, nil
  end

  private

  # Cosine similarity of the two strings over character n-grams of the given
  # order, computed after removing spaces, tabs and line breaks.
  def text_similarity(str1, str2, ngram_order = 2)
    _str1 = str1.delete(" \t\r\n")
    _str2 = str2.delete(" \t\r\n")
    String::Similarity.cosine(_str1, _str2, ngram: ngram_order)
  end

end
66
64
 
67
65
# Command-line check: given two JSON files that each have a "text" entry,
# print the approximate region of the second text that contains the first.
if __FILE__ == $0
  require 'json'

  if ARGV.length == 2
    str1 = JSON.parse(File.read(ARGV[0]).strip)["text"]
    str2 = JSON.parse(File.read(ARGV[1]).strip)["text"]

    loc = TextAlignment::approximate_fit(str1, str2)
    p loc

    # approximate_fit returns [nil, nil] when no region is found; slicing
    # with nil bounds would raise or print the whole text depending on the
    # Ruby version, so report that case explicitly.
    if loc[0] && loc[1]
      puts str2[loc[0]...loc[1]]
    else
      warn "no approximate fit found"
    end
  end
end