text_alignment 0.3.20 → 0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 179976ef6ac286a34343f99eef5cf8ee3d26997c1a8e1c8e9348793773ac044a
4
- data.tar.gz: e15a57b2460a21d2607e3e4775ad32c5a760d9f00067c898b966e21777c241d4
3
+ metadata.gz: 35f83f76cd3e9ca59604327710d2eac4684b8abca9dbd167b2b391d98ea561f9
4
+ data.tar.gz: 77007a1bfcf72d0681ac9d21b35221e7d519a291383f3d0d8fa0574d631b65ea
5
5
  SHA512:
6
- metadata.gz: b3742565325c8ce8b4ce35093b4524b1d9182b51e332f50d9376ce2c22af8918a168b1fcc5764e4313e819c546684a3fc9411d8ee39607d95557924975bd4143
7
- data.tar.gz: 6f68f68799075990fce62f4454c2ef1f1ef4ff260eac44a5332744a9c21605ac14eb35c9d6d24bedb31681f0995222699d8018727f7e70a45c11076df1c82212
6
+ metadata.gz: 2f936569e072e6693f279b5d3a349965b286192d8f694fffbbf41110f66afbd5d80f865a9232c2bef60ec94920e0a814142c8756f0ba3ff29a109cbbe1f7abec
7
+ data.tar.gz: 955b85aa639f733361f354cc68ac3d8b14a17a14784f1e6ec24f240934ef0d160c6b8cf01232e1e4243c2eecffcaf1665999cc3e455bfb0e61c0dd905d2cb8b4
@@ -108,7 +108,7 @@ else
108
108
  # verification
109
109
  source_text = source_annotations[:text]
110
110
  puts "=====BEGIN"
111
- (0 ... source_text.length).each do |p|
111
+ (0 ... source_text.rstrip.length).each do |p|
112
112
  t = alignment.transform_begin_position(p)
113
113
  if t.nil?
114
114
  print source_text[p]
@@ -120,7 +120,7 @@ else
120
120
  puts "=====END"
121
121
 
122
122
  puts "=====BEGIN"
123
- (0 .. source_text.length).each do |p|
123
+ (0 .. source_text.rstrip.length).each do |p|
124
124
  t = alignment.transform_end_position(p)
125
125
  if t.nil?
126
126
  print source_text[p]
@@ -3,15 +3,16 @@ require 'string-similarity'
3
3
 
4
4
  module TextAlignment; end unless defined? TextAlignment
5
5
 
6
- TextAlignment::SIZE_NGRAM = 10 unless defined? TextAlignment::SIZE_NGRAM
6
+ TextAlignment::SIZE_NGRAM = 8 unless defined? TextAlignment::SIZE_NGRAM
7
7
  TextAlignment::SIZE_WINDOW = 40 unless defined? TextAlignment::SIZE_WINDOW
8
8
  TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.8 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD
9
9
 
10
10
  class TextAlignment::AnchorFinder
11
11
 
12
- def initialize(source_str, target_str, _size_ngram = nil, _size_window = nil)
12
+ def initialize(source_str, target_str, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
13
13
  @size_ngram = _size_ngram || TextAlignment::SIZE_NGRAM
14
14
  @size_window = _size_window || TextAlignment::SIZE_WINDOW
15
+ @sim_threshold = _size_window || TextAlignment::TEXT_SIMILARITY_TRESHOLD
15
16
 
16
17
  @reverse = (target_str.length < source_str.length)
17
18
 
@@ -30,19 +31,23 @@ class TextAlignment::AnchorFinder
30
31
  def get_next_anchor
31
32
  # find the position of an anchor ngram in s1 and s2
32
33
  while @beg_s1 < (@s1.length - @size_ngram)
34
+ if [' ', "\n", "\t"].include? @s1[@beg_s1]
35
+ @beg_s1 += 1
36
+ next
37
+ end
33
38
  anchor = @s1[@beg_s1, @size_ngram]
34
39
 
35
40
  # search_position = 0
36
41
  search_position = @end_s2_prev
37
42
  while @beg_s2 = @s2.index(anchor, search_position)
38
43
  # if both the beginning points are sufficiently close to the end points of the last match
39
- break if @end_s1_prev && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 >= @end_s2_prev) && (@beg_s2 - @end_s2_prev < 5)
44
+ break if @beg_s1 > 0 && @beg_s2 > 0 && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 >= @end_s2_prev) && (@beg_s2 - @end_s2_prev < 5)
40
45
 
41
46
  left_window_s1, left_window_s2 = get_left_windows
42
- break if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD)
47
+ break if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > @sim_threshold)
43
48
 
44
49
  right_window_s1, right_window_s2 = get_right_windows
45
- break if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD)
50
+ break if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > @sim_threshold)
46
51
 
47
52
  search_position = @beg_s2 + 1
48
53
  end
@@ -57,9 +62,10 @@ class TextAlignment::AnchorFinder
57
62
  # extend the block
58
63
  b1 = @beg_s1
59
64
  b2 = @beg_s2
60
- while b1 >= @end_s1_prev && b2 > -1 && @s1[b1] == @s2[b2]
65
+ while b1 >= @end_s1_prev && b2 >= @end_s2_prev && @s1[b1] == @s2[b2]
61
66
  b1 -= 1; b2 -= 1
62
67
  end
68
+
63
69
  b1 += 1; b2 += 1
64
70
 
65
71
  e1 = @beg_s1 + @size_ngram
@@ -82,7 +88,8 @@ class TextAlignment::AnchorFinder
82
88
  private
83
89
 
84
90
  def get_left_windows
85
- return if @beg_s1 < @size_window || @beg_s2 < @size_window
91
+ # commented out below, on the assumption that the beginning of a document gives significant locational information
92
+ # return if @beg_s1 < @size_window || @beg_s2 < @size_window
86
93
 
87
94
  window_s1 = ''
88
95
  loc = @beg_s1 - 1
@@ -110,7 +117,8 @@ class TextAlignment::AnchorFinder
110
117
  end
111
118
 
112
119
  def get_right_windows
113
- return if (@beg_s1 + @size_ngram > (@s1.length - @size_window)) || (@beg_s2 + @size_ngram > (@s2.length - @size_window))
120
+ # commented out below, on the assumption that the end of a document gives significant locational information
121
+ # return if (@beg_s1 + @size_ngram > (@s1.length - @size_window)) || (@beg_s2 + @size_ngram > (@s2.length - @size_window))
114
122
 
115
123
  window_s1 = ''
116
124
  loc = @beg_s1 + @size_ngram
@@ -6,7 +6,7 @@ module TextAlignment; end unless defined? TextAlignment
6
6
 
7
7
  TextAlignment::SIGNATURE_NGRAM = 7 unless defined? TextAlignment::SIGNATURE_NGRAM
8
8
  TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
9
- TextAlignment::BUFFER_MIN = 10 unless defined? TextAlignment::BUFFER_MIN
9
+ TextAlignment::BUFFER_MIN = 20 unless defined? TextAlignment::BUFFER_MIN
10
10
 
11
11
 
12
12
  class TextAlignment::TextAlignment
@@ -121,8 +121,8 @@ class TextAlignment::TextAlignment
121
121
  if mblocks[-1][:source][:end] < str1.length && mblocks[-1][:target][:end] < str2.length
122
122
  b1 = mblocks[-1][:source][:end]
123
123
  b2 = mblocks[-1][:target][:end]
124
- _str1 = str1[b1 ... -1]
125
- _str2 = str2[b2 ... -1]
124
+ _str1 = str1[b1 ... str1.length]
125
+ _str2 = str2[b2 ... str2.length]
126
126
 
127
127
  unless _str1.strip.empty?
128
128
  if _str2.strip.empty?
@@ -1,3 +1,3 @@
1
1
  class TextAlignment
2
- VERSION = '0.3.20'
2
+ VERSION = '0.4'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_alignment
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.20
4
+ version: '0.4'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-09-22 00:00:00.000000000 Z
11
+ date: 2020-09-29 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-dictionary