text_alignment 0.3.13 → 0.3.18

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 19a2dfcf8dfffa752dfc0c3363d2d3e1cb3ef7498f79023cdd16e38aa8c46afd
4
- data.tar.gz: 94d925dfc71d24b05fd6861a4f7f7344428b68785db84eeae8f430563b4e3318
3
+ metadata.gz: 5b6821076dd780721d97403f3ca49568a66100f34f7fffe32acbf28572c36429
4
+ data.tar.gz: 0d29d5b64897b4030de7479eaa8fc3990e13f2135298315a18b42b06ee3aa300
5
5
  SHA512:
6
- metadata.gz: 72e61cf30c98df2c3d5ac19717c813c936b55daad22cb8c6e8b44bdb45321dab98c69d5f90820e9993d86263b04bdad2e96e8010afc8a57eee916126b673c8cc
7
- data.tar.gz: d92c04294d58845f4a88cb8d9e3db42e9a18e0dd02d0398e3a95bf94662f64a33752754dd637163ef9bc4af77dc602c12fe683d49e9ac0ebb61a2469e5e08216
6
+ metadata.gz: efb442cee5dfa76e7428d516b5e8ba768cbf80229580f396d02fcd5aa99f5573f88b2af662d82159669fc2c58f84ad77f6bde72696162f3c0a7fb74a75d4f7e0
7
+ data.tar.gz: d6fe000e32862fea2c511fb7937c85ca65e75dd53fe33214f0a637cd7b081a947f268919dd050e425d4fa2fca4268125dcffcb1a54fb933c6eb0bd3c4971b058
@@ -105,6 +105,32 @@ else
105
105
 
106
106
  pp alignment
107
107
 
108
+ # verification
109
+ source_text = source_annotations[:text]
110
+ puts "=====BEGIN"
111
+ (0 ... source_text.length).each do |p|
112
+ t = alignment.transform_begin_position(p)
113
+ if t.nil?
114
+ print source_text[p]
115
+ else
116
+ print '.'
117
+ end
118
+ end
119
+ puts
120
+ puts "=====END"
121
+
122
+ puts "=====BEGIN"
123
+ (0 .. source_text.length).each do |p|
124
+ t = alignment.transform_end_position(p)
125
+ if t.nil?
126
+ print source_text[p]
127
+ else
128
+ print '.'
129
+ end
130
+ end
131
+ puts
132
+ puts "=====END"
133
+
108
134
  # alignment.block_alignments.each do |a|
109
135
  # if a[:alignment].nil? || a[:alignment] == :empty
110
136
  # # p [a[:source], a[:target]]
@@ -121,7 +147,22 @@ else
121
147
  # end
122
148
  # exit
123
149
 
150
+ # verification of source denotations
151
+ puts "[Invalid source denotations]"
152
+ source_annotations[:denotations] do |d|
153
+ p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < source_text.length
154
+ end
155
+ puts "====="
156
+ puts
157
+
124
158
  denotations = alignment.transform_hdenotations(source_annotations[:denotations])
159
+ puts "[Invalid transformation]"
160
+ denotations.each do |d|
161
+ p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < target_text.length
162
+ end
163
+ puts "====="
164
+ puts
165
+
125
166
  lost_annotations += alignment.lost_annotations if alignment.lost_annotations
126
167
 
127
168
  source_annotations.merge({text:target_text, denotations:denotations})
@@ -4,7 +4,7 @@ require 'string-similarity'
4
4
  module TextAlignment; end unless defined? TextAlignment
5
5
 
6
6
  TextAlignment::SIZE_NGRAM = 10 unless defined? TextAlignment::SIZE_NGRAM
7
- TextAlignment::SIZE_WINDOW = 20 unless defined? TextAlignment::SIZE_WINDOW
7
+ TextAlignment::SIZE_WINDOW = 40 unless defined? TextAlignment::SIZE_WINDOW
8
8
  TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.8 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD
9
9
 
10
10
  class TextAlignment::AnchorFinder
@@ -23,6 +23,7 @@ class TextAlignment::AnchorFinder
23
23
 
24
24
  # current position in s1
25
25
  @beg_s1 = 0
26
+ @end_s1_prev = 0
26
27
  @end_s2_prev = 0
27
28
  end
28
29
 
@@ -31,8 +32,8 @@ class TextAlignment::AnchorFinder
31
32
  while @beg_s1 < (@s1.length - @size_ngram)
32
33
  anchor = @s1[@beg_s1, @size_ngram]
33
34
 
34
- search_position = 0
35
- # search_position = @end_s2_prev
35
+ # search_position = 0
36
+ search_position = @end_s2_prev
36
37
  while @beg_s2 = @s2.index(anchor, search_position)
37
38
  # if both the beginning points are sufficiently close to the end points of the last match
38
39
  break if @end_s1_prev && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 >= @end_s2_prev) && (@beg_s2 - @end_s2_prev < 5)
@@ -56,7 +57,7 @@ class TextAlignment::AnchorFinder
56
57
  # extend the block
57
58
  b1 = @beg_s1
58
59
  b2 = @beg_s2
59
- while b1 > -1 && b2 > -1 && @s1[b1] == @s2[b2]
60
+ while b1 >= @end_s1_prev && b2 > -1 && @s1[b1] == @s2[b2]
60
61
  b1 -= 1; b2 -= 1
61
62
  end
62
63
  b1 += 1; b2 += 1
@@ -6,6 +6,8 @@ module TextAlignment; end unless defined? TextAlignment
6
6
 
7
7
  TextAlignment::SIGNATURE_NGRAM = 7 unless defined? TextAlignment::SIGNATURE_NGRAM
8
8
  TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
9
+ TextAlignment::BUFFER_MIN = 10 unless defined? TextAlignment::BUFFER_MIN
10
+
9
11
 
10
12
  class TextAlignment::TextAlignment
11
13
  attr_reader :block_alignments
@@ -70,7 +72,7 @@ class TextAlignment::TextAlignment
70
72
  @block_alignments << {source:{begin:0, end:e1}, target:{begin:0, end:e2}, alignment: :empty}
71
73
  else
72
74
  len_min = [_str1.length, _str2.length].min
73
- len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i
75
+ len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
74
76
  b1 = _str1.length < len_buffer ? 0 : e1 - len_buffer
75
77
  b2 = _str2.length < len_buffer ? 0 : e2 - len_buffer
76
78
 
@@ -135,9 +137,11 @@ class TextAlignment::TextAlignment
135
137
  @block_alignments << {source:{begin:b1, end:str1.length}, target:{begin:b2, end:str2.length}, alignment: :empty}
136
138
  else
137
139
  len_min = [_str1.length, _str2.length].min
138
- len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i
140
+ len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
139
141
  e1 = _str1.length < len_buffer ? str1.length : b1 + len_buffer
140
- e2 = _str2.length < len_buffer ? str1.length : b2 + len_buffer
142
+ e2 = _str2.length < len_buffer ? str2.length : b2 + len_buffer
143
+ _str1 = str1[b1 ... e1]
144
+ _str2 = str2[b2 ... e2]
141
145
 
142
146
  alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
143
147
  if alignment.similarity < 0.6
@@ -166,10 +170,12 @@ class TextAlignment::TextAlignment
166
170
  if begin_position == block_alignment[:source][:begin]
167
171
  block_alignment[:target][:begin]
168
172
  else
169
- raise "lost annotation"
173
+ # raise "lost annotation"
174
+ nil
170
175
  end
171
176
  else
172
- block_alignment[:alignment].transform_begin_position(begin_position - block_alignment[:source][:begin]) + block_alignment[:target][:begin]
177
+ r = block_alignment[:alignment].transform_begin_position(begin_position - block_alignment[:source][:begin])
178
+ r.nil? ? nil : r + block_alignment[:target][:begin]
173
179
  end
174
180
  end
175
181
 
@@ -183,10 +189,12 @@ class TextAlignment::TextAlignment
183
189
  if end_position == block_alignment[:source][:end]
184
190
  block_alignment[:target][:end]
185
191
  else
186
- raise "lost annotation"
192
+ # raise "lost annotation"
193
+ nil
187
194
  end
188
195
  else
189
- block_alignment[:alignment].transform_end_position(end_position - block_alignment[:source][:begin]) + block_alignment[:target][:begin]
196
+ r = block_alignment[:alignment].transform_end_position(end_position - block_alignment[:source][:begin])
197
+ r.nil? ? nil : r + block_alignment[:target][:begin]
190
198
  end
191
199
  end
192
200
 
@@ -1,3 +1,3 @@
1
1
  class TextAlignment
2
- VERSION = '0.3.13'
2
+ VERSION = '0.3.18'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_alignment
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.13
4
+ version: 0.3.18
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-09-12 00:00:00.000000000 Z
11
+ date: 2020-09-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-dictionary