text_alignment 0.3.20 → 0.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 179976ef6ac286a34343f99eef5cf8ee3d26997c1a8e1c8e9348793773ac044a
4
- data.tar.gz: e15a57b2460a21d2607e3e4775ad32c5a760d9f00067c898b966e21777c241d4
3
+ metadata.gz: 35f83f76cd3e9ca59604327710d2eac4684b8abca9dbd167b2b391d98ea561f9
4
+ data.tar.gz: 77007a1bfcf72d0681ac9d21b35221e7d519a291383f3d0d8fa0574d631b65ea
5
5
  SHA512:
6
- metadata.gz: b3742565325c8ce8b4ce35093b4524b1d9182b51e332f50d9376ce2c22af8918a168b1fcc5764e4313e819c546684a3fc9411d8ee39607d95557924975bd4143
7
- data.tar.gz: 6f68f68799075990fce62f4454c2ef1f1ef4ff260eac44a5332744a9c21605ac14eb35c9d6d24bedb31681f0995222699d8018727f7e70a45c11076df1c82212
6
+ metadata.gz: 2f936569e072e6693f279b5d3a349965b286192d8f694fffbbf41110f66afbd5d80f865a9232c2bef60ec94920e0a814142c8756f0ba3ff29a109cbbe1f7abec
7
+ data.tar.gz: 955b85aa639f733361f354cc68ac3d8b14a17a14784f1e6ec24f240934ef0d160c6b8cf01232e1e4243c2eecffcaf1665999cc3e455bfb0e61c0dd905d2cb8b4
@@ -108,7 +108,7 @@ else
108
108
  # verification
109
109
  source_text = source_annotations[:text]
110
110
  puts "=====BEGIN"
111
- (0 ... source_text.length).each do |p|
111
+ (0 ... source_text.rstrip.length).each do |p|
112
112
  t = alignment.transform_begin_position(p)
113
113
  if t.nil?
114
114
  print source_text[p]
@@ -120,7 +120,7 @@ else
120
120
  puts "=====END"
121
121
 
122
122
  puts "=====BEGIN"
123
- (0 .. source_text.length).each do |p|
123
+ (0 .. source_text.rstrip.length).each do |p|
124
124
  t = alignment.transform_end_position(p)
125
125
  if t.nil?
126
126
  print source_text[p]
@@ -3,15 +3,16 @@ require 'string-similarity'
3
3
 
4
4
  module TextAlignment; end unless defined? TextAlignment
5
5
 
6
- TextAlignment::SIZE_NGRAM = 10 unless defined? TextAlignment::SIZE_NGRAM
6
+ TextAlignment::SIZE_NGRAM = 8 unless defined? TextAlignment::SIZE_NGRAM
7
7
  TextAlignment::SIZE_WINDOW = 40 unless defined? TextAlignment::SIZE_WINDOW
8
8
  TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.8 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD
9
9
 
10
10
  class TextAlignment::AnchorFinder
11
11
 
12
- def initialize(source_str, target_str, _size_ngram = nil, _size_window = nil)
12
+ def initialize(source_str, target_str, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
13
13
  @size_ngram = _size_ngram || TextAlignment::SIZE_NGRAM
14
14
  @size_window = _size_window || TextAlignment::SIZE_WINDOW
15
+ @sim_threshold = _size_window || TextAlignment::TEXT_SIMILARITY_TRESHOLD
15
16
 
16
17
  @reverse = (target_str.length < source_str.length)
17
18
 
@@ -30,19 +31,23 @@ class TextAlignment::AnchorFinder
30
31
  def get_next_anchor
31
32
  # find the position of an anchor ngram in s1 and s2
32
33
  while @beg_s1 < (@s1.length - @size_ngram)
34
+ if [' ', "\n", "\t"].include? @s1[@beg_s1]
35
+ @beg_s1 += 1
36
+ next
37
+ end
33
38
  anchor = @s1[@beg_s1, @size_ngram]
34
39
 
35
40
  # search_position = 0
36
41
  search_position = @end_s2_prev
37
42
  while @beg_s2 = @s2.index(anchor, search_position)
38
43
  # if both the beginning points are sufficiently close to the end points of the last match
39
- break if @end_s1_prev && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 >= @end_s2_prev) && (@beg_s2 - @end_s2_prev < 5)
44
+ break if @beg_s1 > 0 && @beg_s2 > 0 && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 >= @end_s2_prev) && (@beg_s2 - @end_s2_prev < 5)
40
45
 
41
46
  left_window_s1, left_window_s2 = get_left_windows
42
- break if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD)
47
+ break if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > @sim_threshold)
43
48
 
44
49
  right_window_s1, right_window_s2 = get_right_windows
45
- break if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD)
50
+ break if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > @sim_threshold)
46
51
 
47
52
  search_position = @beg_s2 + 1
48
53
  end
@@ -57,9 +62,10 @@ class TextAlignment::AnchorFinder
57
62
  # extend the block
58
63
  b1 = @beg_s1
59
64
  b2 = @beg_s2
60
- while b1 >= @end_s1_prev && b2 > -1 && @s1[b1] == @s2[b2]
65
+ while b1 >= @end_s1_prev && b2 >= @end_s2_prev && @s1[b1] == @s2[b2]
61
66
  b1 -= 1; b2 -= 1
62
67
  end
68
+
63
69
  b1 += 1; b2 += 1
64
70
 
65
71
  e1 = @beg_s1 + @size_ngram
@@ -82,7 +88,8 @@ class TextAlignment::AnchorFinder
82
88
  private
83
89
 
84
90
  def get_left_windows
85
- return if @beg_s1 < @size_window || @beg_s2 < @size_window
91
+ # commented out below on the assumption that the beginning of a document gives significant locational information
92
+ # return if @beg_s1 < @size_window || @beg_s2 < @size_window
86
93
 
87
94
  window_s1 = ''
88
95
  loc = @beg_s1 - 1
@@ -110,7 +117,8 @@ class TextAlignment::AnchorFinder
110
117
  end
111
118
 
112
119
  def get_right_windows
113
- return if (@beg_s1 + @size_ngram > (@s1.length - @size_window)) || (@beg_s2 + @size_ngram > (@s2.length - @size_window))
120
+ # commented out below on the assumption that the end of a document gives significant locational information
121
+ # return if (@beg_s1 + @size_ngram > (@s1.length - @size_window)) || (@beg_s2 + @size_ngram > (@s2.length - @size_window))
114
122
 
115
123
  window_s1 = ''
116
124
  loc = @beg_s1 + @size_ngram
@@ -6,7 +6,7 @@ module TextAlignment; end unless defined? TextAlignment
6
6
 
7
7
  TextAlignment::SIGNATURE_NGRAM = 7 unless defined? TextAlignment::SIGNATURE_NGRAM
8
8
  TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
9
- TextAlignment::BUFFER_MIN = 10 unless defined? TextAlignment::BUFFER_MIN
9
+ TextAlignment::BUFFER_MIN = 20 unless defined? TextAlignment::BUFFER_MIN
10
10
 
11
11
 
12
12
  class TextAlignment::TextAlignment
@@ -121,8 +121,8 @@ class TextAlignment::TextAlignment
121
121
  if mblocks[-1][:source][:end] < str1.length && mblocks[-1][:target][:end] < str2.length
122
122
  b1 = mblocks[-1][:source][:end]
123
123
  b2 = mblocks[-1][:target][:end]
124
- _str1 = str1[b1 ... -1]
125
- _str2 = str2[b2 ... -1]
124
+ _str1 = str1[b1 ... str1.length]
125
+ _str2 = str2[b2 ... str2.length]
126
126
 
127
127
  unless _str1.strip.empty?
128
128
  if _str2.strip.empty?
@@ -1,3 +1,3 @@
1
1
  class TextAlignment
2
- VERSION = '0.3.20'
2
+ VERSION = '0.4'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_alignment
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.20
4
+ version: '0.4'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-09-22 00:00:00.000000000 Z
11
+ date: 2020-09-29 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-dictionary