text_alignment 0.3.19 → 0.3.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ea57d01970fdb56a95a7929803949a553965692fb3f4748eec72fe026f9a79cf
4
- data.tar.gz: 96397baa91646b3eb05a346ff699930b6dacf7d38075273b64ce7916f32d6275
3
+ metadata.gz: fd59fb1ad977d3286f358d8c08315824e86d99cc6f9d72814adb760f2e680107
4
+ data.tar.gz: baafd1b76f6c6447a5763ff731b77ad07e449fc87ef94abed74a978376f6334e
5
5
  SHA512:
6
- metadata.gz: 1d1e7650c35d9bae35a7f1dc2948dbf97fa8f71a86f1f83c5dda9cb64b7179e96ae219db88951e65c2988f078900d960784c4c74489a074654ed893c408be97f
7
- data.tar.gz: 49afaec2b6332dc4038c1f0d0e930bf20ce61a6d4933ff3758bbbfea3e05cf347277aefc341d9c3b3d1f8a4692cec24b901528c2875875bacecc1caa7cf9159c
6
+ metadata.gz: d417878396803e1169a24fae67a9a4b0d4e84948d0c6bc678626641d6f1b6ac1fe16c94e9f07ee747b98f83e4305fa553220a607475363378f32fac4d43a65c7
7
+ data.tar.gz: efaf3640a67be46dddcf7dda9d819cbcb47828a3966d305887b6024eee6ba517a597e9303790f584264b8995f69671d90bdc0c7883fc9244f3b1746695960bf8
@@ -108,7 +108,7 @@ else
108
108
  # verification
109
109
  source_text = source_annotations[:text]
110
110
  puts "=====BEGIN"
111
- (0 ... source_text.length).each do |p|
111
+ (0 ... source_text.rstrip.length).each do |p|
112
112
  t = alignment.transform_begin_position(p)
113
113
  if t.nil?
114
114
  print source_text[p]
@@ -120,7 +120,7 @@ else
120
120
  puts "=====END"
121
121
 
122
122
  puts "=====BEGIN"
123
- (0 .. source_text.length).each do |p|
123
+ (0 .. source_text.rstrip.length).each do |p|
124
124
  t = alignment.transform_end_position(p)
125
125
  if t.nil?
126
126
  print source_text[p]
@@ -30,13 +30,17 @@ class TextAlignment::AnchorFinder
30
30
  def get_next_anchor
31
31
  # find the position of an anchor ngram in s1 and s2
32
32
  while @beg_s1 < (@s1.length - @size_ngram)
33
+ if [' ', "\n", "\t"].include? @s1[@beg_s1]
34
+ @beg_s1 += 1
35
+ next
36
+ end
33
37
  anchor = @s1[@beg_s1, @size_ngram]
34
38
 
35
39
  # search_position = 0
36
40
  search_position = @end_s2_prev
37
41
  while @beg_s2 = @s2.index(anchor, search_position)
38
42
  # if both the beginning points are sufficiently close to the end points of the last match
39
- break if @end_s1_prev && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 >= @end_s2_prev) && (@beg_s2 - @end_s2_prev < 5)
43
+ break if @beg_s1 > 0 && @beg_s2 > 0 && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 >= @end_s2_prev) && (@beg_s2 - @end_s2_prev < 5)
40
44
 
41
45
  left_window_s1, left_window_s2 = get_left_windows
42
46
  break if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD)
@@ -57,9 +61,10 @@ class TextAlignment::AnchorFinder
57
61
  # extend the block
58
62
  b1 = @beg_s1
59
63
  b2 = @beg_s2
60
- while b1 >= @end_s1_prev && b2 > -1 && @s1[b1] == @s2[b2]
64
+ while b1 >= @end_s1_prev && b2 >= @end_s2_prev && @s1[b1] == @s2[b2]
61
65
  b1 -= 1; b2 -= 1
62
66
  end
67
+
63
68
  b1 += 1; b2 += 1
64
69
 
65
70
  e1 = @beg_s1 + @size_ngram
@@ -82,7 +87,8 @@ class TextAlignment::AnchorFinder
82
87
  private
83
88
 
84
89
  def get_left_windows
85
- return if @beg_s1 < @size_window || @beg_s2 < @size_window
90
+ # commented out below, on the assumption that the beginning of a document gives significant locational information
91
+ # return if @beg_s1 < @size_window || @beg_s2 < @size_window
86
92
 
87
93
  window_s1 = ''
88
94
  loc = @beg_s1 - 1
@@ -110,7 +116,8 @@ class TextAlignment::AnchorFinder
110
116
  end
111
117
 
112
118
  def get_right_windows
113
- return if (@beg_s1 + @size_ngram > (@s1.length - @size_window)) || (@beg_s2 + @size_ngram > (@s2.length - @size_window))
119
+ # commented out below, on the assumption that the end of a document gives significant locational information
120
+ # return if (@beg_s1 + @size_ngram > (@s1.length - @size_window)) || (@beg_s2 + @size_ngram > (@s2.length - @size_window))
114
121
 
115
122
  window_s1 = ''
116
123
  loc = @beg_s1 + @size_ngram
@@ -6,7 +6,7 @@ module TextAlignment; end unless defined? TextAlignment
6
6
 
7
7
  TextAlignment::SIGNATURE_NGRAM = 7 unless defined? TextAlignment::SIGNATURE_NGRAM
8
8
  TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
9
- TextAlignment::BUFFER_MIN = 10 unless defined? TextAlignment::BUFFER_MIN
9
+ TextAlignment::BUFFER_MIN = 20 unless defined? TextAlignment::BUFFER_MIN
10
10
 
11
11
 
12
12
  class TextAlignment::TextAlignment
@@ -40,10 +40,9 @@ class TextAlignment::TextAlignment
40
40
  end
41
41
  end
42
42
 
43
- pp mblocks
44
- puts "-----"
45
- puts
46
-
43
+ # pp mblocks
44
+ # puts "-----"
45
+ # puts
47
46
  # mblocks.each do |b|
48
47
  # p [b[:source], b[:target]]
49
48
  # puts "---"
@@ -82,6 +81,8 @@ class TextAlignment::TextAlignment
82
81
 
83
82
  @block_alignments << {source:{begin:0, end:b1}, target:{begin:0, end:b2}, alignment: :empty} if b1 > 0
84
83
 
84
+ _str1 = str1[b1 ... e1]
85
+ _str2 = str2[b2 ... e2]
85
86
  alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
86
87
  if alignment.similarity < 0.6
87
88
  @block_alignments << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment: :empty}
@@ -117,24 +118,11 @@ class TextAlignment::TextAlignment
117
118
  end
118
119
 
119
120
  # Final step
120
- if mblocks[-1][:source][:end] < str1.length
121
- b1 = mblocks[-1][:source][:end]
122
- b2 = mblocks[-1][:target][:end]
123
-
124
- if mblocks[-1][:target][:end] < str2.length
125
-
126
- else
127
- e1 = str1.length
128
- e2 = str2.length
129
- @block_alignments << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}
130
- end
131
- end
132
-
133
121
  if mblocks[-1][:source][:end] < str1.length && mblocks[-1][:target][:end] < str2.length
134
122
  b1 = mblocks[-1][:source][:end]
135
123
  b2 = mblocks[-1][:target][:end]
136
- _str1 = str1[b1 ... -1]
137
- _str2 = str2[b2 ... -1]
124
+ _str1 = str1[b1 ... str1.length]
125
+ _str2 = str2[b2 ... str2.length]
138
126
 
139
127
  unless _str1.strip.empty?
140
128
  if _str2.strip.empty?
@@ -1,3 +1,3 @@
1
1
  class TextAlignment
2
- VERSION = '0.3.19'
2
+ VERSION = '0.3.24'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_alignment
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.19
4
+ version: 0.3.24
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-09-21 00:00:00.000000000 Z
11
+ date: 2020-09-29 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-dictionary