text_alignment 0.3.10 → 0.3.16

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 76e19ea61fda3543526ef26d582ad4ec9cfb3eec33ad03b77a998529af762eb6
4
- data.tar.gz: e386cd97317e6c32c0d4c9ec347b664c2640fcad17bb7dc906b6b5b9c5784b54
3
+ metadata.gz: 61f98e83ee9c1d220dd228be6bb708b79f36d1c691f04dcb14d4af55f398b6da
4
+ data.tar.gz: f692e98a27a555baab2797ebe37407ad7133916db172a977e95415b9004e471c
5
5
  SHA512:
6
- metadata.gz: dbb72e3c50713edc4fbf3328bdcc5f78723d1fe03cad4f451f2142bd05075da0ca9c3ea9b7bcdaeda009bf94cb8e26f4ede32ab96f819479fd3e2f9a988f41e9
7
- data.tar.gz: 31443cca978d00e03bd5cf4339575ce4b070b06ad690938277b19f12045de2c33e04395c16799cfe4e336ac1b14264dccaa76b1c4fc48f71dbe9558f4efc6221
6
+ metadata.gz: 2dd8f865c245601c362e335df4e26413501fa682a97010b8aebd3ebc01864ae4772f6e716725331f9c6bc8f688818d665ef7a21384906211efc0e630b46f2313
7
+ data.tar.gz: 4c43199f474b94c825d8ec8ca2085b06107ec34df3d5ee988294f7423caef317c893bbba9879637d197fb2e7ae426c9e43c67cf042b7339959b638d5e5f60d01
@@ -103,6 +103,35 @@ target_annotations = if source_annotations.class == Array
103
103
  else
104
104
  alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text)
105
105
 
106
+ # verification
107
+ source_text = source_annotations[:text]
108
+ puts "=====BEGIN"
109
+ (0 ... source_text.length).each do |p|
110
+ t = alignment.transform_begin_position(p)
111
+ if t.nil?
112
+ print source_text[p]
113
+ else
114
+ print '.'
115
+ end
116
+ end
117
+ puts
118
+ puts "=====END"
119
+
120
+ puts "=====BEGIN"
121
+ (0 .. source_text.length).each do |p|
122
+ t = alignment.transform_end_position(p)
123
+ if t.nil?
124
+ print source_text[p]
125
+ else
126
+ print '.'
127
+ end
128
+ end
129
+ puts
130
+ puts "=====END"
131
+
132
+ pp alignment
133
+
134
+ exit
106
135
  # alignment.block_alignments.each do |a|
107
136
  # if a[:alignment].nil? || a[:alignment] == :empty
108
137
  # # p [a[:source], a[:target]]
@@ -119,7 +148,22 @@ else
119
148
  # end
120
149
  # exit
121
150
 
151
+ # verification of source denotations
152
+ puts "[Invalid source denotations]"
153
+ source_annotations[:denotations] do |d|
154
+ p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < source_text.length
155
+ end
156
+ puts "====="
157
+ puts
158
+
122
159
  denotations = alignment.transform_hdenotations(source_annotations[:denotations])
160
+ puts "[Invalid transformation]"
161
+ denotations.each do |d|
162
+ p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < target_text.length
163
+ end
164
+ puts "====="
165
+ puts
166
+
123
167
  lost_annotations += alignment.lost_annotations if alignment.lost_annotations
124
168
 
125
169
  source_annotations.merge({text:target_text, denotations:denotations})
@@ -159,7 +203,7 @@ if lost_annotations
159
203
  warn "#{lost_annotations.length}"
160
204
  end
161
205
 
162
- puts target_annotations.to_json
206
+ #puts target_annotations.to_json
163
207
 
164
208
  # denotations = anns1[:denotations]
165
209
 
@@ -3,9 +3,9 @@ require 'string-similarity'
3
3
 
4
4
  module TextAlignment; end unless defined? TextAlignment
5
5
 
6
- TextAlignment::SIZE_NGRAM = 5 unless defined? TextAlignment::SIZE_NGRAM
7
- TextAlignment::SIZE_WINDOW = 10 unless defined? TextAlignment::SIZE_WINDOW
8
- TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.7 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD
6
+ TextAlignment::SIZE_NGRAM = 10 unless defined? TextAlignment::SIZE_NGRAM
7
+ TextAlignment::SIZE_WINDOW = 40 unless defined? TextAlignment::SIZE_WINDOW
8
+ TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.8 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD
9
9
 
10
10
  class TextAlignment::AnchorFinder
11
11
 
@@ -23,44 +23,41 @@ class TextAlignment::AnchorFinder
23
23
 
24
24
  # current position in s1
25
25
  @beg_s1 = 0
26
+ @end_s1_prev = 0
27
+ @end_s2_prev = 0
26
28
  end
27
29
 
28
30
  def get_next_anchor
29
31
  # find the position of an anchor ngram in s1 and s2
30
- @beg_s2 = nil
31
32
  while @beg_s1 < (@s1.length - @size_ngram)
32
- while @beg_s1 < (@s1.length - @size_ngram)
33
- anchor = @s1[@beg_s1, @size_ngram]
34
- @beg_s2 = if defined? @end_s2_prev
35
- @s2.index(anchor, @end_s2_prev)
36
- else
37
- @s2.index(anchor)
38
- end
39
- break unless @beg_s2.nil?
40
- @beg_s1 += 1
41
- end
33
+ anchor = @s1[@beg_s1, @size_ngram]
34
+
35
+ search_position = 0
36
+ # search_position = @end_s2_prev
37
+ while @beg_s2 = @s2.index(anchor, search_position)
38
+ # if both the begining points are sufficiantly close to the end points of the last match
39
+ break if @end_s1_prev && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 >= @end_s2_prev) && (@beg_s2 - @end_s2_prev < 5)
42
40
 
43
- # The loop above is terminated with beg_s2 == nil, which means no more anchor
44
- break if @beg_s2.nil?
41
+ left_window_s1, left_window_s2 = get_left_windows
42
+ break if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD)
45
43
 
46
- # if both the begining points are sufficiantly close to the end points of the last match
47
- break if @end_s1_prev && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 - @end_s2_prev < 5)
44
+ right_window_s1, right_window_s2 = get_right_windows
45
+ break if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD)
48
46
 
49
- left_window_s1, left_window_s2 = get_left_windows
50
- break if left_window_s1 && text_similarity(left_window_s1, left_window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD
47
+ search_position = @beg_s2 + 1
48
+ end
51
49
 
52
- right_window_s1, right_window_s2 = get_right_windows
53
- break if right_window_s2 && text_similarity(right_window_s1, left_window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD
50
+ break unless @beg_s2.nil?
54
51
 
55
52
  @beg_s1 += 1
56
53
  end
57
54
 
58
- return nil if @beg_s2.nil?
55
+ return nil if @beg_s1 >= (@s1.length - @size_ngram)
59
56
 
60
57
  # extend the block
61
58
  b1 = @beg_s1
62
59
  b2 = @beg_s2
63
- while b1 > -1 && b2 > -1 && @s1[b1] == @s2[b2]
60
+ while b1 >= @end_s1_prev && b2 > -1 && @s1[b1] == @s2[b2]
64
61
  b1 -= 1; b2 -= 1
65
62
  end
66
63
  b1 += 1; b2 += 1
@@ -113,7 +110,7 @@ class TextAlignment::AnchorFinder
113
110
  end
114
111
 
115
112
  def get_right_windows
116
- return if (@beg_s1 + @size_ngram < (@s1.length - @size_window)) || (@beg_s2 + @size_ngram < (@s2.length - @size_window))
113
+ return if (@beg_s1 + @size_ngram > (@s1.length - @size_window)) || (@beg_s2 + @size_ngram > (@s2.length - @size_window))
117
114
 
118
115
  window_s1 = ''
119
116
  loc = @beg_s1 + @size_ngram
@@ -143,6 +140,7 @@ class TextAlignment::AnchorFinder
143
140
  end
144
141
 
145
142
  def text_similarity(str1, str2, ngram_order = 2)
143
+ return 0 if str1.nil? || str2.nil?
146
144
  String::Similarity.cosine(str1, str2, ngram:ngram_order)
147
145
  end
148
146
 
@@ -6,6 +6,8 @@ module TextAlignment; end unless defined? TextAlignment
6
6
 
7
7
  TextAlignment::SIGNATURE_NGRAM = 7 unless defined? TextAlignment::SIGNATURE_NGRAM
8
8
  TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
9
+ TextAlignment::BUFFER_MIN = 10 unless defined? TextAlignment::BUFFER_MIN
10
+
9
11
 
10
12
  class TextAlignment::TextAlignment
11
13
  attr_reader :block_alignments
@@ -38,17 +40,17 @@ class TextAlignment::TextAlignment
38
40
  end
39
41
  end
40
42
 
41
- # mblocks.each do |b|
42
- # p [b[:source], b[:target]]
43
- # puts "---"
44
- # puts str1[b[:source][:begin] ... b[:source][:end]]
45
- # puts "---"
46
- # puts str2[b[:target][:begin] ... b[:target][:end]]
47
- # puts "====="
48
- # puts
49
- # end
50
- # puts "-=-=-=-=-"
51
- # puts
43
+ mblocks.each do |b|
44
+ p [b[:source], b[:target]]
45
+ puts "---"
46
+ puts str1[b[:source][:begin] ... b[:source][:end]]
47
+ puts "---"
48
+ puts str2[b[:target][:begin] ... b[:target][:end]]
49
+ puts "====="
50
+ puts
51
+ end
52
+ puts "-=-=-=-=-"
53
+ puts
52
54
 
53
55
  ## To find block alignments
54
56
  @block_alignments = []
@@ -70,7 +72,7 @@ class TextAlignment::TextAlignment
70
72
  @block_alignments << {source:{begin:0, end:e1}, target:{begin:0, end:e2}, alignment: :empty}
71
73
  else
72
74
  len_min = [_str1.length, _str2.length].min
73
- len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i
75
+ len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
74
76
  b1 = _str1.length < len_buffer ? 0 : e1 - len_buffer
75
77
  b2 = _str2.length < len_buffer ? 0 : e2 - len_buffer
76
78
 
@@ -135,9 +137,11 @@ class TextAlignment::TextAlignment
135
137
  @block_alignments << {source:{begin:b1, end:str1.length}, target:{begin:b2, end:str2.length}, alignment: :empty}
136
138
  else
137
139
  len_min = [_str1.length, _str2.length].min
138
- len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i
140
+ len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
139
141
  e1 = _str1.length < len_buffer ? str1.length : b1 + len_buffer
140
- e2 = _str2.length < len_buffer ? str1.length : b2 + len_buffer
142
+ e2 = _str2.length < len_buffer ? str2.length : b2 + len_buffer
143
+ _str1 = str1[b1 ... e1]
144
+ _str2 = str2[b2 ... e2]
141
145
 
142
146
  alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
143
147
  if alignment.similarity < 0.6
@@ -166,9 +170,12 @@ class TextAlignment::TextAlignment
166
170
  if begin_position == block_alignment[:source][:begin]
167
171
  block_alignment[:target][:begin]
168
172
  else
169
- raise "lost annotation"
173
+ # raise "lost annotation"
174
+ nil
170
175
  end
171
176
  else
177
+ p begin_position
178
+ puts "-----"
172
179
  block_alignment[:alignment].transform_begin_position(begin_position - block_alignment[:source][:begin]) + block_alignment[:target][:begin]
173
180
  end
174
181
  end
@@ -183,7 +190,8 @@ class TextAlignment::TextAlignment
183
190
  if end_position == block_alignment[:source][:end]
184
191
  block_alignment[:target][:end]
185
192
  else
186
- raise "lost annotation"
193
+ # raise "lost annotation"
194
+ nil
187
195
  end
188
196
  else
189
197
  block_alignment[:alignment].transform_end_position(end_position - block_alignment[:source][:begin]) + block_alignment[:target][:begin]
@@ -1,3 +1,3 @@
1
1
  class TextAlignment
2
- VERSION = '0.3.10'
2
+ VERSION = '0.3.16'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_alignment
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.10
4
+ version: 0.3.16
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-08-29 00:00:00.000000000 Z
11
+ date: 2020-09-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-dictionary