text_alignment 0.3.18 → 0.3.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 5b6821076dd780721d97403f3ca49568a66100f34f7fffe32acbf28572c36429
4
- data.tar.gz: 0d29d5b64897b4030de7479eaa8fc3990e13f2135298315a18b42b06ee3aa300
3
+ metadata.gz: 7678cbfd749833e03a17dcf0b06699def0215e7c39b981d2ff8ddfb3a79b2ea4
4
+ data.tar.gz: 0251b99cbd984dcd8e23ea38e6190a449a1f83f98119e68251f1f17e5854ac73
5
5
  SHA512:
6
- metadata.gz: efb442cee5dfa76e7428d516b5e8ba768cbf80229580f396d02fcd5aa99f5573f88b2af662d82159669fc2c58f84ad77f6bde72696162f3c0a7fb74a75d4f7e0
7
- data.tar.gz: d6fe000e32862fea2c511fb7937c85ca65e75dd53fe33214f0a637cd7b081a947f268919dd050e425d4fa2fca4268125dcffcb1a54fb933c6eb0bd3c4971b058
6
+ metadata.gz: '08055cb7659458e4a5834a193442c7592f9eecab09e206a8e5f20aab658c26cb24cda2949804db3569c160f03c6e71e9a28bdcdb352f8d207f1eb31aa427d963'
7
+ data.tar.gz: 43f0c0111662212c049a8b4baf2eeb7b2cc15a01d9f02d7cfbc501a9b016fa1344516685b8052c862227c72f8f9866b3232425e06d862899003860ad4700fc70
@@ -108,7 +108,7 @@ else
108
108
  # verification
109
109
  source_text = source_annotations[:text]
110
110
  puts "=====BEGIN"
111
- (0 ... source_text.length).each do |p|
111
+ (0 ... source_text.rstrip.length).each do |p|
112
112
  t = alignment.transform_begin_position(p)
113
113
  if t.nil?
114
114
  print source_text[p]
@@ -120,7 +120,7 @@ else
120
120
  puts "=====END"
121
121
 
122
122
  puts "=====BEGIN"
123
- (0 .. source_text.length).each do |p|
123
+ (0 .. source_text.rstrip.length).each do |p|
124
124
  t = alignment.transform_end_position(p)
125
125
  if t.nil?
126
126
  print source_text[p]
@@ -3,7 +3,7 @@ require 'string-similarity'
3
3
 
4
4
  module TextAlignment; end unless defined? TextAlignment
5
5
 
6
- TextAlignment::SIZE_NGRAM = 10 unless defined? TextAlignment::SIZE_NGRAM
6
+ TextAlignment::SIZE_NGRAM = 8 unless defined? TextAlignment::SIZE_NGRAM
7
7
  TextAlignment::SIZE_WINDOW = 40 unless defined? TextAlignment::SIZE_WINDOW
8
8
  TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.8 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD
9
9
 
@@ -30,13 +30,17 @@ class TextAlignment::AnchorFinder
30
30
  def get_next_anchor
31
31
  # find the position of an anchor ngram in s1 and s2
32
32
  while @beg_s1 < (@s1.length - @size_ngram)
33
+ if [' ', "\n", "\t"].include? @s1[@beg_s1]
34
+ @beg_s1 += 1
35
+ next
36
+ end
33
37
  anchor = @s1[@beg_s1, @size_ngram]
34
38
 
35
39
  # search_position = 0
36
40
  search_position = @end_s2_prev
37
41
  while @beg_s2 = @s2.index(anchor, search_position)
38
42
  # if both the beginning points are sufficiently close to the end points of the last match
39
- break if @end_s1_prev && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 >= @end_s2_prev) && (@beg_s2 - @end_s2_prev < 5)
43
+ break if @beg_s1 > 0 && @beg_s2 > 0 && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 >= @end_s2_prev) && (@beg_s2 - @end_s2_prev < 5)
40
44
 
41
45
  left_window_s1, left_window_s2 = get_left_windows
42
46
  break if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD)
@@ -57,9 +61,10 @@ class TextAlignment::AnchorFinder
57
61
  # extend the block
58
62
  b1 = @beg_s1
59
63
  b2 = @beg_s2
60
- while b1 >= @end_s1_prev && b2 > -1 && @s1[b1] == @s2[b2]
64
+ while b1 >= @end_s1_prev && b2 >= @end_s2_prev && @s1[b1] == @s2[b2]
61
65
  b1 -= 1; b2 -= 1
62
66
  end
67
+
63
68
  b1 += 1; b2 += 1
64
69
 
65
70
  e1 = @beg_s1 + @size_ngram
@@ -82,7 +87,8 @@ class TextAlignment::AnchorFinder
82
87
  private
83
88
 
84
89
  def get_left_windows
85
- return if @beg_s1 < @size_window || @beg_s2 < @size_window
90
+ # commented out below on the assumption that the beginning of a document gives significant locational information
91
+ # return if @beg_s1 < @size_window || @beg_s2 < @size_window
86
92
 
87
93
  window_s1 = ''
88
94
  loc = @beg_s1 - 1
@@ -110,7 +116,8 @@ class TextAlignment::AnchorFinder
110
116
  end
111
117
 
112
118
  def get_right_windows
113
- return if (@beg_s1 + @size_ngram > (@s1.length - @size_window)) || (@beg_s2 + @size_ngram > (@s2.length - @size_window))
119
+ # commented out below on the assumption that the end of a document gives significant locational information
120
+ # return if (@beg_s1 + @size_ngram > (@s1.length - @size_window)) || (@beg_s2 + @size_ngram > (@s2.length - @size_window))
114
121
 
115
122
  window_s1 = ''
116
123
  loc = @beg_s1 + @size_ngram
@@ -40,6 +40,9 @@ class TextAlignment::TextAlignment
40
40
  end
41
41
  end
42
42
 
43
+ # pp mblocks
44
+ # puts "-----"
45
+ # puts
43
46
  # mblocks.each do |b|
44
47
  # p [b[:source], b[:target]]
45
48
  # puts "---"
@@ -78,6 +81,8 @@ class TextAlignment::TextAlignment
78
81
 
79
82
  @block_alignments << {source:{begin:0, end:b1}, target:{begin:0, end:b2}, alignment: :empty} if b1 > 0
80
83
 
84
+ _str1 = str1[b1 ... e1]
85
+ _str2 = str2[b2 ... e2]
81
86
  alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
82
87
  if alignment.similarity < 0.6
83
88
  @block_alignments << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment: :empty}
@@ -113,24 +118,11 @@ class TextAlignment::TextAlignment
113
118
  end
114
119
 
115
120
  # Final step
116
- if mblocks[-1][:source][:end] < str1.length
117
- b1 = mblocks[-1][:source][:end]
118
- b2 = mblocks[-1][:target][:end]
119
-
120
- if mblocks[-1][:target][:end] < str2.length
121
-
122
- else
123
- e1 = str1.length
124
- e2 = str2.length
125
- @block_alignments << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}
126
- end
127
- end
128
-
129
121
  if mblocks[-1][:source][:end] < str1.length && mblocks[-1][:target][:end] < str2.length
130
122
  b1 = mblocks[-1][:source][:end]
131
123
  b2 = mblocks[-1][:target][:end]
132
- _str1 = str1[b1 ... -1]
133
- _str2 = str2[b2 ... -1]
124
+ _str1 = str1[b1 ... str1.length]
125
+ _str2 = str2[b2 ... str2.length]
134
126
 
135
127
  unless _str1.strip.empty?
136
128
  if _str2.strip.empty?
@@ -1,3 +1,3 @@
1
1
  class TextAlignment
2
- VERSION = '0.3.18'
2
+ VERSION = '0.3.23'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_alignment
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.18
4
+ version: 0.3.23
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-09-18 00:00:00.000000000 Z
11
+ date: 2020-09-29 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-dictionary