text_alignment 0.3.11 → 0.3.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 01525b6ca5f7e0ae1ebb9dfce2083006e095a66b3ab468ccef0a584bc3005556
4
- data.tar.gz: bc1137052a12b8db97635b183f299518b0158f3ccecf36d42fa45d746b4f3792
3
+ metadata.gz: 5fc0292470086de4cc05275d8d19dcd534b1bfe66c5560d2043451530a657c5e
4
+ data.tar.gz: c6eee42e0b2b2111f063abd4bd37cbd83a6eff7c236eb032832a23fe77bb7ed5
5
5
  SHA512:
6
- metadata.gz: 9596bea2616c3b4d939c8314d026941a6e627f4380183409051df62722a0ee5e3b35302da3066b0d32e8322582c999877b05a09c54749d878a284a062247342e
7
- data.tar.gz: 361e3e7a23697167b41e037e7d272bbd286ac397416defb125821ebbcb17bfa386341c75fecbb94fe7f9976cfbc2a8b5f7f9be7c150d23daf6d8c16410509b5d
6
+ metadata.gz: 22d172f3fdaa4549edd4f9e4541c03b947c6ae8bab61e46f55c0378120b632639c70d7a30361b321065a978be6930844aedbce6c5e7591521c016c429f4eeb14
7
+ data.tar.gz: 3babca5408aa2dd7ce23e40dbb7909cb4b60f492583c48cb9a6fe085f84e7cf1d4f9b07ea3942455ca984c98dc3d91fce425b8dc116c98a5b8d0ff753fc232c6
@@ -103,6 +103,34 @@ target_annotations = if source_annotations.class == Array
103
103
  else
104
104
  alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text)
105
105
 
106
+ pp alignment
107
+
108
+ # verification
109
+ source_text = source_annotations[:text]
110
+ puts "=====BEGIN"
111
+ (0 ... source_text.length).each do |p|
112
+ t = alignment.transform_begin_position(p)
113
+ if t.nil?
114
+ print source_text[p]
115
+ else
116
+ print '.'
117
+ end
118
+ end
119
+ puts
120
+ puts "=====END"
121
+
122
+ puts "=====BEGIN"
123
+ (0 .. source_text.length).each do |p|
124
+ t = alignment.transform_end_position(p)
125
+ if t.nil?
126
+ print source_text[p]
127
+ else
128
+ print '.'
129
+ end
130
+ end
131
+ puts
132
+ puts "=====END"
133
+
106
134
  # alignment.block_alignments.each do |a|
107
135
  # if a[:alignment].nil? || a[:alignment] == :empty
108
136
  # # p [a[:source], a[:target]]
@@ -119,7 +147,22 @@ else
119
147
  # end
120
148
  # exit
121
149
 
150
+ # verification of source denotations
151
+ puts "[Invalid source denotations]"
152
+ source_annotations[:denotations] do |d|
153
+ p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < source_text.length
154
+ end
155
+ puts "====="
156
+ puts
157
+
122
158
  denotations = alignment.transform_hdenotations(source_annotations[:denotations])
159
+ puts "[Invalid transformation]"
160
+ denotations.each do |d|
161
+ p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < target_text.length
162
+ end
163
+ puts "====="
164
+ puts
165
+
123
166
  lost_annotations += alignment.lost_annotations if alignment.lost_annotations
124
167
 
125
168
  source_annotations.merge({text:target_text, denotations:denotations})
@@ -159,7 +202,7 @@ if lost_annotations
159
202
  warn "#{lost_annotations.length}"
160
203
  end
161
204
 
162
- puts target_annotations.to_json
205
+ #puts target_annotations.to_json
163
206
 
164
207
  # denotations = anns1[:denotations]
165
208
 
@@ -3,9 +3,9 @@ require 'string-similarity'
3
3
 
4
4
  module TextAlignment; end unless defined? TextAlignment
5
5
 
6
- TextAlignment::SIZE_NGRAM = 5 unless defined? TextAlignment::SIZE_NGRAM
7
- TextAlignment::SIZE_WINDOW = 10 unless defined? TextAlignment::SIZE_WINDOW
8
- TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.7 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD
6
+ TextAlignment::SIZE_NGRAM = 10 unless defined? TextAlignment::SIZE_NGRAM
7
+ TextAlignment::SIZE_WINDOW = 40 unless defined? TextAlignment::SIZE_WINDOW
8
+ TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.8 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD
9
9
 
10
10
  class TextAlignment::AnchorFinder
11
11
 
@@ -23,6 +23,8 @@ class TextAlignment::AnchorFinder
23
23
 
24
24
  # current position in s1
25
25
  @beg_s1 = 0
26
+ @end_s1_prev = 0
27
+ @end_s2_prev = 0
26
28
  end
27
29
 
28
30
  def get_next_anchor
@@ -31,15 +33,16 @@ class TextAlignment::AnchorFinder
31
33
  anchor = @s1[@beg_s1, @size_ngram]
32
34
 
33
35
  search_position = 0
36
+ # search_position = @end_s2_prev
34
37
  while @beg_s2 = @s2.index(anchor, search_position)
35
38
  # if both the begining points are sufficiantly close to the end points of the last match
36
- break if @end_s1_prev && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 - @end_s2_prev < 5)
39
+ break if @end_s1_prev && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 >= @end_s2_prev) && (@beg_s2 - @end_s2_prev < 5)
37
40
 
38
41
  left_window_s1, left_window_s2 = get_left_windows
39
- break if left_window_s1 && text_similarity(left_window_s1, left_window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD
42
+ break if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD)
40
43
 
41
44
  right_window_s1, right_window_s2 = get_right_windows
42
- break if right_window_s2 && text_similarity(right_window_s1, left_window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD
45
+ break if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD)
43
46
 
44
47
  search_position = @beg_s2 + 1
45
48
  end
@@ -54,7 +57,7 @@ class TextAlignment::AnchorFinder
54
57
  # extend the block
55
58
  b1 = @beg_s1
56
59
  b2 = @beg_s2
57
- while b1 > -1 && b2 > -1 && @s1[b1] == @s2[b2]
60
+ while b1 >= @end_s1_prev && b2 > -1 && @s1[b1] == @s2[b2]
58
61
  b1 -= 1; b2 -= 1
59
62
  end
60
63
  b1 += 1; b2 += 1
@@ -107,7 +110,7 @@ class TextAlignment::AnchorFinder
107
110
  end
108
111
 
109
112
  def get_right_windows
110
- return if (@beg_s1 + @size_ngram < (@s1.length - @size_window)) || (@beg_s2 + @size_ngram < (@s2.length - @size_window))
113
+ return if (@beg_s1 + @size_ngram > (@s1.length - @size_window)) || (@beg_s2 + @size_ngram > (@s2.length - @size_window))
111
114
 
112
115
  window_s1 = ''
113
116
  loc = @beg_s1 + @size_ngram
@@ -137,6 +140,7 @@ class TextAlignment::AnchorFinder
137
140
  end
138
141
 
139
142
  def text_similarity(str1, str2, ngram_order = 2)
143
+ return 0 if str1.nil? || str2.nil?
140
144
  String::Similarity.cosine(str1, str2, ngram:ngram_order)
141
145
  end
142
146
 
@@ -6,6 +6,8 @@ module TextAlignment; end unless defined? TextAlignment
6
6
 
7
7
  TextAlignment::SIGNATURE_NGRAM = 7 unless defined? TextAlignment::SIGNATURE_NGRAM
8
8
  TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
9
+ TextAlignment::BUFFER_MIN = 10 unless defined? TextAlignment::BUFFER_MIN
10
+
9
11
 
10
12
  class TextAlignment::TextAlignment
11
13
  attr_reader :block_alignments
@@ -70,7 +72,7 @@ class TextAlignment::TextAlignment
70
72
  @block_alignments << {source:{begin:0, end:e1}, target:{begin:0, end:e2}, alignment: :empty}
71
73
  else
72
74
  len_min = [_str1.length, _str2.length].min
73
- len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i
75
+ len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
74
76
  b1 = _str1.length < len_buffer ? 0 : e1 - len_buffer
75
77
  b2 = _str2.length < len_buffer ? 0 : e2 - len_buffer
76
78
 
@@ -135,9 +137,11 @@ class TextAlignment::TextAlignment
135
137
  @block_alignments << {source:{begin:b1, end:str1.length}, target:{begin:b2, end:str2.length}, alignment: :empty}
136
138
  else
137
139
  len_min = [_str1.length, _str2.length].min
138
- len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i
140
+ len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
139
141
  e1 = _str1.length < len_buffer ? str1.length : b1 + len_buffer
140
- e2 = _str2.length < len_buffer ? str1.length : b2 + len_buffer
142
+ e2 = _str2.length < len_buffer ? str2.length : b2 + len_buffer
143
+ _str1 = str1[b1 ... e1]
144
+ _str2 = str2[b2 ... e2]
141
145
 
142
146
  alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
143
147
  if alignment.similarity < 0.6
@@ -166,10 +170,12 @@ class TextAlignment::TextAlignment
166
170
  if begin_position == block_alignment[:source][:begin]
167
171
  block_alignment[:target][:begin]
168
172
  else
169
- raise "lost annotation"
173
+ # raise "lost annotation"
174
+ nil
170
175
  end
171
176
  else
172
- block_alignment[:alignment].transform_begin_position(begin_position - block_alignment[:source][:begin]) + block_alignment[:target][:begin]
177
+ r = block_alignment[:alignment].transform_begin_position(begin_position - block_alignment[:source][:begin])
178
+ r.nil? ? nil : r + block_alignment[:target][:begin]
173
179
  end
174
180
  end
175
181
 
@@ -183,10 +189,12 @@ class TextAlignment::TextAlignment
183
189
  if end_position == block_alignment[:source][:end]
184
190
  block_alignment[:target][:end]
185
191
  else
186
- raise "lost annotation"
192
+ # raise "lost annotation"
193
+ nil
187
194
  end
188
195
  else
189
- block_alignment[:alignment].transform_end_position(end_position - block_alignment[:source][:begin]) + block_alignment[:target][:begin]
196
+ r = block_alignment[:alignment].transform_end_position(end_position - block_alignment[:source][:begin])
197
+ r.nil? ? nil : r + block_alignment[:target][:begin]
190
198
  end
191
199
  end
192
200
 
@@ -1,3 +1,3 @@
1
1
  class TextAlignment
2
- VERSION = '0.3.11'
2
+ VERSION = '0.3.17'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_alignment
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.11
4
+ version: 0.3.17
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-09-10 00:00:00.000000000 Z
11
+ date: 2020-09-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-dictionary