text_alignment 0.3.10 → 0.3.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 76e19ea61fda3543526ef26d582ad4ec9cfb3eec33ad03b77a998529af762eb6
4
- data.tar.gz: e386cd97317e6c32c0d4c9ec347b664c2640fcad17bb7dc906b6b5b9c5784b54
3
+ metadata.gz: 61f98e83ee9c1d220dd228be6bb708b79f36d1c691f04dcb14d4af55f398b6da
4
+ data.tar.gz: f692e98a27a555baab2797ebe37407ad7133916db172a977e95415b9004e471c
5
5
  SHA512:
6
- metadata.gz: dbb72e3c50713edc4fbf3328bdcc5f78723d1fe03cad4f451f2142bd05075da0ca9c3ea9b7bcdaeda009bf94cb8e26f4ede32ab96f819479fd3e2f9a988f41e9
7
- data.tar.gz: 31443cca978d00e03bd5cf4339575ce4b070b06ad690938277b19f12045de2c33e04395c16799cfe4e336ac1b14264dccaa76b1c4fc48f71dbe9558f4efc6221
6
+ metadata.gz: 2dd8f865c245601c362e335df4e26413501fa682a97010b8aebd3ebc01864ae4772f6e716725331f9c6bc8f688818d665ef7a21384906211efc0e630b46f2313
7
+ data.tar.gz: 4c43199f474b94c825d8ec8ca2085b06107ec34df3d5ee988294f7423caef317c893bbba9879637d197fb2e7ae426c9e43c67cf042b7339959b638d5e5f60d01
@@ -103,6 +103,35 @@ target_annotations = if source_annotations.class == Array
103
103
  else
104
104
  alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text)
105
105
 
106
+ # verification
107
+ source_text = source_annotations[:text]
108
+ puts "=====BEGIN"
109
+ (0 ... source_text.length).each do |p|
110
+ t = alignment.transform_begin_position(p)
111
+ if t.nil?
112
+ print source_text[p]
113
+ else
114
+ print '.'
115
+ end
116
+ end
117
+ puts
118
+ puts "=====END"
119
+
120
+ puts "=====BEGIN"
121
+ (0 .. source_text.length).each do |p|
122
+ t = alignment.transform_end_position(p)
123
+ if t.nil?
124
+ print source_text[p]
125
+ else
126
+ print '.'
127
+ end
128
+ end
129
+ puts
130
+ puts "=====END"
131
+
132
+ pp alignment
133
+
134
+ exit
106
135
  # alignment.block_alignments.each do |a|
107
136
  # if a[:alignment].nil? || a[:alignment] == :empty
108
137
  # # p [a[:source], a[:target]]
@@ -119,7 +148,22 @@ else
119
148
  # end
120
149
  # exit
121
150
 
151
+ # verification of source denotations
152
+ puts "[Invalid source denotations]"
153
+ source_annotations[:denotations] do |d|
154
+ p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < source_text.length
155
+ end
156
+ puts "====="
157
+ puts
158
+
122
159
  denotations = alignment.transform_hdenotations(source_annotations[:denotations])
160
+ puts "[Invalid transformation]"
161
+ denotations.each do |d|
162
+ p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < target_text.length
163
+ end
164
+ puts "====="
165
+ puts
166
+
123
167
  lost_annotations += alignment.lost_annotations if alignment.lost_annotations
124
168
 
125
169
  source_annotations.merge({text:target_text, denotations:denotations})
@@ -159,7 +203,7 @@ if lost_annotations
159
203
  warn "#{lost_annotations.length}"
160
204
  end
161
205
 
162
- puts target_annotations.to_json
206
+ #puts target_annotations.to_json
163
207
 
164
208
  # denotations = anns1[:denotations]
165
209
 
@@ -3,9 +3,9 @@ require 'string-similarity'
3
3
 
4
4
  module TextAlignment; end unless defined? TextAlignment
5
5
 
6
- TextAlignment::SIZE_NGRAM = 5 unless defined? TextAlignment::SIZE_NGRAM
7
- TextAlignment::SIZE_WINDOW = 10 unless defined? TextAlignment::SIZE_WINDOW
8
- TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.7 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD
6
+ TextAlignment::SIZE_NGRAM = 10 unless defined? TextAlignment::SIZE_NGRAM
7
+ TextAlignment::SIZE_WINDOW = 40 unless defined? TextAlignment::SIZE_WINDOW
8
+ TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.8 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD
9
9
 
10
10
  class TextAlignment::AnchorFinder
11
11
 
@@ -23,44 +23,41 @@ class TextAlignment::AnchorFinder
23
23
 
24
24
  # current position in s1
25
25
  @beg_s1 = 0
26
+ @end_s1_prev = 0
27
+ @end_s2_prev = 0
26
28
  end
27
29
 
28
30
  def get_next_anchor
29
31
  # find the position of an anchor ngram in s1 and s2
30
- @beg_s2 = nil
31
32
  while @beg_s1 < (@s1.length - @size_ngram)
32
- while @beg_s1 < (@s1.length - @size_ngram)
33
- anchor = @s1[@beg_s1, @size_ngram]
34
- @beg_s2 = if defined? @end_s2_prev
35
- @s2.index(anchor, @end_s2_prev)
36
- else
37
- @s2.index(anchor)
38
- end
39
- break unless @beg_s2.nil?
40
- @beg_s1 += 1
41
- end
33
+ anchor = @s1[@beg_s1, @size_ngram]
34
+
35
+ search_position = 0
36
+ # search_position = @end_s2_prev
37
+ while @beg_s2 = @s2.index(anchor, search_position)
38
+ # if both the begining points are sufficiantly close to the end points of the last match
39
+ break if @end_s1_prev && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 >= @end_s2_prev) && (@beg_s2 - @end_s2_prev < 5)
42
40
 
43
- # The loop above is terminated with beg_s2 == nil, which means no more anchor
44
- break if @beg_s2.nil?
41
+ left_window_s1, left_window_s2 = get_left_windows
42
+ break if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD)
45
43
 
46
- # if both the begining points are sufficiantly close to the end points of the last match
47
- break if @end_s1_prev && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 - @end_s2_prev < 5)
44
+ right_window_s1, right_window_s2 = get_right_windows
45
+ break if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD)
48
46
 
49
- left_window_s1, left_window_s2 = get_left_windows
50
- break if left_window_s1 && text_similarity(left_window_s1, left_window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD
47
+ search_position = @beg_s2 + 1
48
+ end
51
49
 
52
- right_window_s1, right_window_s2 = get_right_windows
53
- break if right_window_s2 && text_similarity(right_window_s1, left_window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD
50
+ break unless @beg_s2.nil?
54
51
 
55
52
  @beg_s1 += 1
56
53
  end
57
54
 
58
- return nil if @beg_s2.nil?
55
+ return nil if @beg_s1 >= (@s1.length - @size_ngram)
59
56
 
60
57
  # extend the block
61
58
  b1 = @beg_s1
62
59
  b2 = @beg_s2
63
- while b1 > -1 && b2 > -1 && @s1[b1] == @s2[b2]
60
+ while b1 >= @end_s1_prev && b2 > -1 && @s1[b1] == @s2[b2]
64
61
  b1 -= 1; b2 -= 1
65
62
  end
66
63
  b1 += 1; b2 += 1
@@ -113,7 +110,7 @@ class TextAlignment::AnchorFinder
113
110
  end
114
111
 
115
112
  def get_right_windows
116
- return if (@beg_s1 + @size_ngram < (@s1.length - @size_window)) || (@beg_s2 + @size_ngram < (@s2.length - @size_window))
113
+ return if (@beg_s1 + @size_ngram > (@s1.length - @size_window)) || (@beg_s2 + @size_ngram > (@s2.length - @size_window))
117
114
 
118
115
  window_s1 = ''
119
116
  loc = @beg_s1 + @size_ngram
@@ -143,6 +140,7 @@ class TextAlignment::AnchorFinder
143
140
  end
144
141
 
145
142
  def text_similarity(str1, str2, ngram_order = 2)
143
+ return 0 if str1.nil? || str2.nil?
146
144
  String::Similarity.cosine(str1, str2, ngram:ngram_order)
147
145
  end
148
146
 
@@ -6,6 +6,8 @@ module TextAlignment; end unless defined? TextAlignment
6
6
 
7
7
  TextAlignment::SIGNATURE_NGRAM = 7 unless defined? TextAlignment::SIGNATURE_NGRAM
8
8
  TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
9
+ TextAlignment::BUFFER_MIN = 10 unless defined? TextAlignment::BUFFER_MIN
10
+
9
11
 
10
12
  class TextAlignment::TextAlignment
11
13
  attr_reader :block_alignments
@@ -38,17 +40,17 @@ class TextAlignment::TextAlignment
38
40
  end
39
41
  end
40
42
 
41
- # mblocks.each do |b|
42
- # p [b[:source], b[:target]]
43
- # puts "---"
44
- # puts str1[b[:source][:begin] ... b[:source][:end]]
45
- # puts "---"
46
- # puts str2[b[:target][:begin] ... b[:target][:end]]
47
- # puts "====="
48
- # puts
49
- # end
50
- # puts "-=-=-=-=-"
51
- # puts
43
+ mblocks.each do |b|
44
+ p [b[:source], b[:target]]
45
+ puts "---"
46
+ puts str1[b[:source][:begin] ... b[:source][:end]]
47
+ puts "---"
48
+ puts str2[b[:target][:begin] ... b[:target][:end]]
49
+ puts "====="
50
+ puts
51
+ end
52
+ puts "-=-=-=-=-"
53
+ puts
52
54
 
53
55
  ## To find block alignments
54
56
  @block_alignments = []
@@ -70,7 +72,7 @@ class TextAlignment::TextAlignment
70
72
  @block_alignments << {source:{begin:0, end:e1}, target:{begin:0, end:e2}, alignment: :empty}
71
73
  else
72
74
  len_min = [_str1.length, _str2.length].min
73
- len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i
75
+ len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
74
76
  b1 = _str1.length < len_buffer ? 0 : e1 - len_buffer
75
77
  b2 = _str2.length < len_buffer ? 0 : e2 - len_buffer
76
78
 
@@ -135,9 +137,11 @@ class TextAlignment::TextAlignment
135
137
  @block_alignments << {source:{begin:b1, end:str1.length}, target:{begin:b2, end:str2.length}, alignment: :empty}
136
138
  else
137
139
  len_min = [_str1.length, _str2.length].min
138
- len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i
140
+ len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
139
141
  e1 = _str1.length < len_buffer ? str1.length : b1 + len_buffer
140
- e2 = _str2.length < len_buffer ? str1.length : b2 + len_buffer
142
+ e2 = _str2.length < len_buffer ? str2.length : b2 + len_buffer
143
+ _str1 = str1[b1 ... e1]
144
+ _str2 = str2[b2 ... e2]
141
145
 
142
146
  alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
143
147
  if alignment.similarity < 0.6
@@ -166,9 +170,12 @@ class TextAlignment::TextAlignment
166
170
  if begin_position == block_alignment[:source][:begin]
167
171
  block_alignment[:target][:begin]
168
172
  else
169
- raise "lost annotation"
173
+ # raise "lost annotation"
174
+ nil
170
175
  end
171
176
  else
177
+ p begin_position
178
+ puts "-----"
172
179
  block_alignment[:alignment].transform_begin_position(begin_position - block_alignment[:source][:begin]) + block_alignment[:target][:begin]
173
180
  end
174
181
  end
@@ -183,7 +190,8 @@ class TextAlignment::TextAlignment
183
190
  if end_position == block_alignment[:source][:end]
184
191
  block_alignment[:target][:end]
185
192
  else
186
- raise "lost annotation"
193
+ # raise "lost annotation"
194
+ nil
187
195
  end
188
196
  else
189
197
  block_alignment[:alignment].transform_end_position(end_position - block_alignment[:source][:begin]) + block_alignment[:target][:begin]
@@ -1,3 +1,3 @@
1
1
  class TextAlignment
2
- VERSION = '0.3.10'
2
+ VERSION = '0.3.16'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_alignment
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.10
4
+ version: 0.3.16
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-08-29 00:00:00.000000000 Z
11
+ date: 2020-09-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-dictionary