text_alignment 0.3.11 → 0.3.17

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 01525b6ca5f7e0ae1ebb9dfce2083006e095a66b3ab468ccef0a584bc3005556
4
- data.tar.gz: bc1137052a12b8db97635b183f299518b0158f3ccecf36d42fa45d746b4f3792
3
+ metadata.gz: 5fc0292470086de4cc05275d8d19dcd534b1bfe66c5560d2043451530a657c5e
4
+ data.tar.gz: c6eee42e0b2b2111f063abd4bd37cbd83a6eff7c236eb032832a23fe77bb7ed5
5
5
  SHA512:
6
- metadata.gz: 9596bea2616c3b4d939c8314d026941a6e627f4380183409051df62722a0ee5e3b35302da3066b0d32e8322582c999877b05a09c54749d878a284a062247342e
7
- data.tar.gz: 361e3e7a23697167b41e037e7d272bbd286ac397416defb125821ebbcb17bfa386341c75fecbb94fe7f9976cfbc2a8b5f7f9be7c150d23daf6d8c16410509b5d
6
+ metadata.gz: 22d172f3fdaa4549edd4f9e4541c03b947c6ae8bab61e46f55c0378120b632639c70d7a30361b321065a978be6930844aedbce6c5e7591521c016c429f4eeb14
7
+ data.tar.gz: 3babca5408aa2dd7ce23e40dbb7909cb4b60f492583c48cb9a6fe085f84e7cf1d4f9b07ea3942455ca984c98dc3d91fce425b8dc116c98a5b8d0ff753fc232c6
@@ -103,6 +103,34 @@ target_annotations = if source_annotations.class == Array
103
103
  else
104
104
  alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text)
105
105
 
106
+ pp alignment
107
+
108
+ # verification
109
+ source_text = source_annotations[:text]
110
+ puts "=====BEGIN"
111
+ (0 ... source_text.length).each do |p|
112
+ t = alignment.transform_begin_position(p)
113
+ if t.nil?
114
+ print source_text[p]
115
+ else
116
+ print '.'
117
+ end
118
+ end
119
+ puts
120
+ puts "=====END"
121
+
122
+ puts "=====BEGIN"
123
+ (0 .. source_text.length).each do |p|
124
+ t = alignment.transform_end_position(p)
125
+ if t.nil?
126
+ print source_text[p]
127
+ else
128
+ print '.'
129
+ end
130
+ end
131
+ puts
132
+ puts "=====END"
133
+
106
134
  # alignment.block_alignments.each do |a|
107
135
  # if a[:alignment].nil? || a[:alignment] == :empty
108
136
  # # p [a[:source], a[:target]]
@@ -119,7 +147,22 @@ else
119
147
  # end
120
148
  # exit
121
149
 
150
+ # verification of source denotations
151
+ puts "[Invalid source denotations]"
152
+ source_annotations[:denotations] do |d|
153
+ p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < source_text.length
154
+ end
155
+ puts "====="
156
+ puts
157
+
122
158
  denotations = alignment.transform_hdenotations(source_annotations[:denotations])
159
+ puts "[Invalid transformation]"
160
+ denotations.each do |d|
161
+ p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < target_text.length
162
+ end
163
+ puts "====="
164
+ puts
165
+
123
166
  lost_annotations += alignment.lost_annotations if alignment.lost_annotations
124
167
 
125
168
  source_annotations.merge({text:target_text, denotations:denotations})
@@ -159,7 +202,7 @@ if lost_annotations
159
202
  warn "#{lost_annotations.length}"
160
203
  end
161
204
 
162
- puts target_annotations.to_json
205
+ #puts target_annotations.to_json
163
206
 
164
207
  # denotations = anns1[:denotations]
165
208
 
@@ -3,9 +3,9 @@ require 'string-similarity'
3
3
 
4
4
  module TextAlignment; end unless defined? TextAlignment
5
5
 
6
- TextAlignment::SIZE_NGRAM = 5 unless defined? TextAlignment::SIZE_NGRAM
7
- TextAlignment::SIZE_WINDOW = 10 unless defined? TextAlignment::SIZE_WINDOW
8
- TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.7 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD
6
+ TextAlignment::SIZE_NGRAM = 10 unless defined? TextAlignment::SIZE_NGRAM
7
+ TextAlignment::SIZE_WINDOW = 40 unless defined? TextAlignment::SIZE_WINDOW
8
+ TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.8 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD
9
9
 
10
10
  class TextAlignment::AnchorFinder
11
11
 
@@ -23,6 +23,8 @@ class TextAlignment::AnchorFinder
23
23
 
24
24
  # current position in s1
25
25
  @beg_s1 = 0
26
+ @end_s1_prev = 0
27
+ @end_s2_prev = 0
26
28
  end
27
29
 
28
30
  def get_next_anchor
@@ -31,15 +33,16 @@ class TextAlignment::AnchorFinder
31
33
  anchor = @s1[@beg_s1, @size_ngram]
32
34
 
33
35
  search_position = 0
36
+ # search_position = @end_s2_prev
34
37
  while @beg_s2 = @s2.index(anchor, search_position)
35
38
  # if both the beginning points are sufficiently close to the end points of the last match
36
- break if @end_s1_prev && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 - @end_s2_prev < 5)
39
+ break if @end_s1_prev && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 >= @end_s2_prev) && (@beg_s2 - @end_s2_prev < 5)
37
40
 
38
41
  left_window_s1, left_window_s2 = get_left_windows
39
- break if left_window_s1 && text_similarity(left_window_s1, left_window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD
42
+ break if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD)
40
43
 
41
44
  right_window_s1, right_window_s2 = get_right_windows
42
- break if right_window_s2 && text_similarity(right_window_s1, left_window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD
45
+ break if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD)
43
46
 
44
47
  search_position = @beg_s2 + 1
45
48
  end
@@ -54,7 +57,7 @@ class TextAlignment::AnchorFinder
54
57
  # extend the block
55
58
  b1 = @beg_s1
56
59
  b2 = @beg_s2
57
- while b1 > -1 && b2 > -1 && @s1[b1] == @s2[b2]
60
+ while b1 >= @end_s1_prev && b2 > -1 && @s1[b1] == @s2[b2]
58
61
  b1 -= 1; b2 -= 1
59
62
  end
60
63
  b1 += 1; b2 += 1
@@ -107,7 +110,7 @@ class TextAlignment::AnchorFinder
107
110
  end
108
111
 
109
112
  def get_right_windows
110
- return if (@beg_s1 + @size_ngram < (@s1.length - @size_window)) || (@beg_s2 + @size_ngram < (@s2.length - @size_window))
113
+ return if (@beg_s1 + @size_ngram > (@s1.length - @size_window)) || (@beg_s2 + @size_ngram > (@s2.length - @size_window))
111
114
 
112
115
  window_s1 = ''
113
116
  loc = @beg_s1 + @size_ngram
@@ -137,6 +140,7 @@ class TextAlignment::AnchorFinder
137
140
  end
138
141
 
139
142
  def text_similarity(str1, str2, ngram_order = 2)
143
+ return 0 if str1.nil? || str2.nil?
140
144
  String::Similarity.cosine(str1, str2, ngram:ngram_order)
141
145
  end
142
146
 
@@ -6,6 +6,8 @@ module TextAlignment; end unless defined? TextAlignment
6
6
 
7
7
  TextAlignment::SIGNATURE_NGRAM = 7 unless defined? TextAlignment::SIGNATURE_NGRAM
8
8
  TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
9
+ TextAlignment::BUFFER_MIN = 10 unless defined? TextAlignment::BUFFER_MIN
10
+
9
11
 
10
12
  class TextAlignment::TextAlignment
11
13
  attr_reader :block_alignments
@@ -70,7 +72,7 @@ class TextAlignment::TextAlignment
70
72
  @block_alignments << {source:{begin:0, end:e1}, target:{begin:0, end:e2}, alignment: :empty}
71
73
  else
72
74
  len_min = [_str1.length, _str2.length].min
73
- len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i
75
+ len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
74
76
  b1 = _str1.length < len_buffer ? 0 : e1 - len_buffer
75
77
  b2 = _str2.length < len_buffer ? 0 : e2 - len_buffer
76
78
 
@@ -135,9 +137,11 @@ class TextAlignment::TextAlignment
135
137
  @block_alignments << {source:{begin:b1, end:str1.length}, target:{begin:b2, end:str2.length}, alignment: :empty}
136
138
  else
137
139
  len_min = [_str1.length, _str2.length].min
138
- len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i
140
+ len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
139
141
  e1 = _str1.length < len_buffer ? str1.length : b1 + len_buffer
140
- e2 = _str2.length < len_buffer ? str1.length : b2 + len_buffer
142
+ e2 = _str2.length < len_buffer ? str2.length : b2 + len_buffer
143
+ _str1 = str1[b1 ... e1]
144
+ _str2 = str2[b2 ... e2]
141
145
 
142
146
  alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
143
147
  if alignment.similarity < 0.6
@@ -166,10 +170,12 @@ class TextAlignment::TextAlignment
166
170
  if begin_position == block_alignment[:source][:begin]
167
171
  block_alignment[:target][:begin]
168
172
  else
169
- raise "lost annotation"
173
+ # raise "lost annotation"
174
+ nil
170
175
  end
171
176
  else
172
- block_alignment[:alignment].transform_begin_position(begin_position - block_alignment[:source][:begin]) + block_alignment[:target][:begin]
177
+ r = block_alignment[:alignment].transform_begin_position(begin_position - block_alignment[:source][:begin])
178
+ r.nil? ? nil : r + block_alignment[:target][:begin]
173
179
  end
174
180
  end
175
181
 
@@ -183,10 +189,12 @@ class TextAlignment::TextAlignment
183
189
  if end_position == block_alignment[:source][:end]
184
190
  block_alignment[:target][:end]
185
191
  else
186
- raise "lost annotation"
192
+ # raise "lost annotation"
193
+ nil
187
194
  end
188
195
  else
189
- block_alignment[:alignment].transform_end_position(end_position - block_alignment[:source][:begin]) + block_alignment[:target][:begin]
196
+ r = block_alignment[:alignment].transform_end_position(end_position - block_alignment[:source][:begin])
197
+ r.nil? ? nil : r + block_alignment[:target][:begin]
190
198
  end
191
199
  end
192
200
 
@@ -1,3 +1,3 @@
1
1
  class TextAlignment
2
- VERSION = '0.3.11'
2
+ VERSION = '0.3.17'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_alignment
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.11
4
+ version: 0.3.17
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-09-10 00:00:00.000000000 Z
11
+ date: 2020-09-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-dictionary