text_alignment 0.3.9 → 0.3.15

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b4a8ed8b4cff6f5b04b10c91704939936530ad0dc820a126a514a58cce7a4df6
4
- data.tar.gz: '087412fa8b9779073c67fa1d9a0afc05e32d7f1baad4518c375fcf804a45ecd4'
3
+ metadata.gz: 8252929e7b74251db493ad991332d0da8a92f35441e2a442d05b6fb29139d657
4
+ data.tar.gz: c9e41127fb231e4db2843696c1029b35711449c5bf5104158da301d03b817131
5
5
  SHA512:
6
- metadata.gz: e5f56b58d35a614c6b9a72ccb8282b775d19c2fb576d68420153b96703d954e47471cfbcd9b384bd244f19110cc436d28e409d5014f1ade66c23390a928111fc
7
- data.tar.gz: 4dbdd214b0e2aab9b32751305517160016cfd16b60f5380282b2b1ba2e6946e3097789f4fd234b577cf8ab00e06ea8d3497f59843d600eac2b12b8d60c32441c
6
+ metadata.gz: 1fc8da7324d71cf25edbec9765ab512928323079472736ea4e294abb12dfafc87f55d71cb49c371470811775bd489d3c91cce4a787b99faa305f2f326dc80c77
7
+ data.tar.gz: f694c99216b59dd693a6acdfffc727fe74b5c189b4b9583b31fb7e6394319a3176de76237a142dac3770bcd1fbbc467d5d1e97a7225ed993a21246a66de8b2ec
@@ -103,6 +103,35 @@ target_annotations = if source_annotations.class == Array
103
103
  else
104
104
  alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text)
105
105
 
106
+ # verification
107
+ source_text = source_annotations[:text]
108
+ puts "=====BEGIN"
109
+ (0 ... source_text.length).each do |p|
110
+ t = alignment.transform_begin_position(p)
111
+ if t.nil?
112
+ print source_text[p]
113
+ else
114
+ print '.'
115
+ end
116
+ end
117
+ puts
118
+ puts "=====END"
119
+
120
+ puts "=====BEGIN"
121
+ (0 .. source_text.length).each do |p|
122
+ t = alignment.transform_end_position(p)
123
+ if t.nil?
124
+ print source_text[p]
125
+ else
126
+ print '.'
127
+ end
128
+ end
129
+ puts
130
+ puts "=====END"
131
+
132
+ pp alignment
133
+
134
+ exit
106
135
  # alignment.block_alignments.each do |a|
107
136
  # if a[:alignment].nil? || a[:alignment] == :empty
108
137
  # # p [a[:source], a[:target]]
@@ -119,7 +148,22 @@ else
119
148
  # end
120
149
  # exit
121
150
 
151
+ # verification of source denotations
152
+ puts "[Invalid source denotations]"
153
+ source_annotations[:denotations] do |d|
154
+ p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < source_text.length
155
+ end
156
+ puts "====="
157
+ puts
158
+
122
159
  denotations = alignment.transform_hdenotations(source_annotations[:denotations])
160
+ puts "[Invalid transformation]"
161
+ denotations.each do |d|
162
+ p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < target_text.length
163
+ end
164
+ puts "====="
165
+ puts
166
+
123
167
  lost_annotations += alignment.lost_annotations if alignment.lost_annotations
124
168
 
125
169
  source_annotations.merge({text:target_text, denotations:denotations})
@@ -159,7 +203,7 @@ if lost_annotations
159
203
  warn "#{lost_annotations.length}"
160
204
  end
161
205
 
162
- puts target_annotations.to_json
206
+ #puts target_annotations.to_json
163
207
 
164
208
  # denotations = anns1[:denotations]
165
209
 
@@ -3,9 +3,9 @@ require 'string-similarity'
3
3
 
4
4
  module TextAlignment; end unless defined? TextAlignment
5
5
 
6
- TextAlignment::SIZE_NGRAM = 5 unless defined? TextAlignment::SIZE_NGRAM
7
- TextAlignment::SIZE_WINDOW = 10 unless defined? TextAlignment::SIZE_WINDOW
8
- TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.7 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD
6
+ TextAlignment::SIZE_NGRAM = 10 unless defined? TextAlignment::SIZE_NGRAM
7
+ TextAlignment::SIZE_WINDOW = 30 unless defined? TextAlignment::SIZE_WINDOW
8
+ TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.8 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD
9
9
 
10
10
  class TextAlignment::AnchorFinder
11
11
 
@@ -23,39 +23,35 @@ class TextAlignment::AnchorFinder
23
23
 
24
24
  # current position in s1
25
25
  @beg_s1 = 0
26
+ @end_s2_prev = 0
26
27
  end
27
28
 
28
29
  def get_next_anchor
29
30
  # find the position of an anchor ngram in s1 and s2
30
- @beg_s2 = nil
31
31
  while @beg_s1 < (@s1.length - @size_ngram)
32
- while @beg_s1 < (@s1.length - @size_ngram)
33
- anchor = @s1[@beg_s1, @size_ngram]
34
- @beg_s2 = if defined? @end_s2_prev
35
- @s2.index(anchor, @end_s2_prev)
36
- else
37
- @s2.index(anchor)
38
- end
39
- break unless @beg_s2.nil?
40
- @beg_s1 += 1
41
- end
32
+ anchor = @s1[@beg_s1, @size_ngram]
33
+
34
+ search_position = 0
35
+ # search_position = @end_s2_prev
36
+ while @beg_s2 = @s2.index(anchor, search_position)
37
+ # if both the begining points are sufficiantly close to the end points of the last match
38
+ break if @end_s1_prev && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 >= @end_s2_prev) && (@beg_s2 - @end_s2_prev < 5)
42
39
 
43
- # The loop above is terminated with beg_s2 == nil, which means no more anchor
44
- break if @beg_s2.nil?
40
+ left_window_s1, left_window_s2 = get_left_windows
41
+ break if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD)
45
42
 
46
- # if both the begining points are sufficiantly close to the end points of the last match
47
- break if @end_s1_prev && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 - @end_s2_prev < 5)
43
+ right_window_s1, right_window_s2 = get_right_windows
44
+ break if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD)
48
45
 
49
- left_window_s1, left_window_s2 = get_left_windows
50
- break if left_window_s1 && text_similarity(left_window_s1, left_window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD
46
+ search_position = @beg_s2 + 1
47
+ end
51
48
 
52
- right_window_s1, right_window_s2 = get_right_windows
53
- break if right_window_s2 && text_similarity(right_window_s1, left_window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD
49
+ break unless @beg_s2.nil?
54
50
 
55
51
  @beg_s1 += 1
56
52
  end
57
53
 
58
- return nil if @beg_s2.nil?
54
+ return nil if @beg_s1 >= (@s1.length - @size_ngram)
59
55
 
60
56
  # extend the block
61
57
  b1 = @beg_s1
@@ -67,7 +63,7 @@ class TextAlignment::AnchorFinder
67
63
 
68
64
  e1 = @beg_s1 + @size_ngram
69
65
  e2 = @beg_s2 + @size_ngram
70
- while @s1[e1] == @s2[e2]
66
+ while @s1[e1] && @s1[e1] == @s2[e2]
71
67
  e1 += 1; e2 += 1
72
68
  end
73
69
 
@@ -113,7 +109,7 @@ class TextAlignment::AnchorFinder
113
109
  end
114
110
 
115
111
  def get_right_windows
116
- return if (@beg_s1 + @size_ngram < (@s1.length - @size_window)) || (@beg_s2 + @size_ngram < (@s2.length - @size_window))
112
+ return if (@beg_s1 + @size_ngram > (@s1.length - @size_window)) || (@beg_s2 + @size_ngram > (@s2.length - @size_window))
117
113
 
118
114
  window_s1 = ''
119
115
  loc = @beg_s1 + @size_ngram
@@ -143,6 +139,7 @@ class TextAlignment::AnchorFinder
143
139
  end
144
140
 
145
141
  def text_similarity(str1, str2, ngram_order = 2)
142
+ return 0 if str1.nil? || str2.nil?
146
143
  String::Similarity.cosine(str1, str2, ngram:ngram_order)
147
144
  end
148
145
 
@@ -6,6 +6,8 @@ module TextAlignment; end unless defined? TextAlignment
6
6
 
7
7
  TextAlignment::SIGNATURE_NGRAM = 7 unless defined? TextAlignment::SIGNATURE_NGRAM
8
8
  TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
9
+ TextAlignment::BUFFER_MIN = 10 unless defined? TextAlignment::BUFFER_MIN
10
+
9
11
 
10
12
  class TextAlignment::TextAlignment
11
13
  attr_reader :block_alignments
@@ -70,7 +72,7 @@ class TextAlignment::TextAlignment
70
72
  @block_alignments << {source:{begin:0, end:e1}, target:{begin:0, end:e2}, alignment: :empty}
71
73
  else
72
74
  len_min = [_str1.length, _str2.length].min
73
- len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i
75
+ len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
74
76
  b1 = _str1.length < len_buffer ? 0 : e1 - len_buffer
75
77
  b2 = _str2.length < len_buffer ? 0 : e2 - len_buffer
76
78
 
@@ -135,9 +137,11 @@ class TextAlignment::TextAlignment
135
137
  @block_alignments << {source:{begin:b1, end:str1.length}, target:{begin:b2, end:str2.length}, alignment: :empty}
136
138
  else
137
139
  len_min = [_str1.length, _str2.length].min
138
- len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i
140
+ len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
139
141
  e1 = _str1.length < len_buffer ? str1.length : b1 + len_buffer
140
- e2 = _str2.length < len_buffer ? str1.length : b2 + len_buffer
142
+ e2 = _str2.length < len_buffer ? str2.length : b2 + len_buffer
143
+ _str1 = str1[b1 ... e1]
144
+ _str2 = str2[b2 ... e2]
141
145
 
142
146
  alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
143
147
  if alignment.similarity < 0.6
@@ -166,7 +170,8 @@ class TextAlignment::TextAlignment
166
170
  if begin_position == block_alignment[:source][:begin]
167
171
  block_alignment[:target][:begin]
168
172
  else
169
- raise "lost annotation"
173
+ # raise "lost annotation"
174
+ nil
170
175
  end
171
176
  else
172
177
  block_alignment[:alignment].transform_begin_position(begin_position - block_alignment[:source][:begin]) + block_alignment[:target][:begin]
@@ -183,7 +188,8 @@ class TextAlignment::TextAlignment
183
188
  if end_position == block_alignment[:source][:end]
184
189
  block_alignment[:target][:end]
185
190
  else
186
- raise "lost annotation"
191
+ # raise "lost annotation"
192
+ nil
187
193
  end
188
194
  else
189
195
  block_alignment[:alignment].transform_end_position(end_position - block_alignment[:source][:begin]) + block_alignment[:target][:begin]
@@ -199,13 +205,9 @@ class TextAlignment::TextAlignment
199
205
  end
200
206
 
201
207
  def transform_denotations!(denotations)
202
- puts "hereherehere========"
203
208
  return nil if denotations.nil?
204
209
  @lost_annotations = []
205
210
 
206
- pp denotations
207
- puts "-----"
208
-
209
211
  denotations.each do |d|
210
212
  begin
211
213
  d.begin = transform_begin_position(d.begin);
@@ -217,7 +219,7 @@ class TextAlignment::TextAlignment
217
219
  end
218
220
  end
219
221
 
220
- pp denotations
222
+ @lost_annotations
221
223
  end
222
224
 
223
225
  def transform_hdenotations(hdenotations)
@@ -1,3 +1,3 @@
1
1
  class TextAlignment
2
- VERSION = '0.3.9'
2
+ VERSION = '0.3.15'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_alignment
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.9
4
+ version: 0.3.15
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-07-20 00:00:00.000000000 Z
11
+ date: 2020-09-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-dictionary