text_alignment 0.3.18 → 0.3.23

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 5b6821076dd780721d97403f3ca49568a66100f34f7fffe32acbf28572c36429
4
- data.tar.gz: 0d29d5b64897b4030de7479eaa8fc3990e13f2135298315a18b42b06ee3aa300
3
+ metadata.gz: 7678cbfd749833e03a17dcf0b06699def0215e7c39b981d2ff8ddfb3a79b2ea4
4
+ data.tar.gz: 0251b99cbd984dcd8e23ea38e6190a449a1f83f98119e68251f1f17e5854ac73
5
5
  SHA512:
6
- metadata.gz: efb442cee5dfa76e7428d516b5e8ba768cbf80229580f396d02fcd5aa99f5573f88b2af662d82159669fc2c58f84ad77f6bde72696162f3c0a7fb74a75d4f7e0
7
- data.tar.gz: d6fe000e32862fea2c511fb7937c85ca65e75dd53fe33214f0a637cd7b081a947f268919dd050e425d4fa2fca4268125dcffcb1a54fb933c6eb0bd3c4971b058
6
+ metadata.gz: '08055cb7659458e4a5834a193442c7592f9eecab09e206a8e5f20aab658c26cb24cda2949804db3569c160f03c6e71e9a28bdcdb352f8d207f1eb31aa427d963'
7
+ data.tar.gz: 43f0c0111662212c049a8b4baf2eeb7b2cc15a01d9f02d7cfbc501a9b016fa1344516685b8052c862227c72f8f9866b3232425e06d862899003860ad4700fc70
@@ -108,7 +108,7 @@ else
108
108
  # verification
109
109
  source_text = source_annotations[:text]
110
110
  puts "=====BEGIN"
111
- (0 ... source_text.length).each do |p|
111
+ (0 ... source_text.rstrip.length).each do |p|
112
112
  t = alignment.transform_begin_position(p)
113
113
  if t.nil?
114
114
  print source_text[p]
@@ -120,7 +120,7 @@ else
120
120
  puts "=====END"
121
121
 
122
122
  puts "=====BEGIN"
123
- (0 .. source_text.length).each do |p|
123
+ (0 .. source_text.rstrip.length).each do |p|
124
124
  t = alignment.transform_end_position(p)
125
125
  if t.nil?
126
126
  print source_text[p]
@@ -3,7 +3,7 @@ require 'string-similarity'
3
3
 
4
4
  module TextAlignment; end unless defined? TextAlignment
5
5
 
6
- TextAlignment::SIZE_NGRAM = 10 unless defined? TextAlignment::SIZE_NGRAM
6
+ TextAlignment::SIZE_NGRAM = 8 unless defined? TextAlignment::SIZE_NGRAM
7
7
  TextAlignment::SIZE_WINDOW = 40 unless defined? TextAlignment::SIZE_WINDOW
8
8
  TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.8 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD
9
9
 
@@ -30,13 +30,17 @@ class TextAlignment::AnchorFinder
30
30
  def get_next_anchor
31
31
  # find the position of an anchor ngram in s1 and s2
32
32
  while @beg_s1 < (@s1.length - @size_ngram)
33
+ if [' ', "\n", "\t"].include? @s1[@beg_s1]
34
+ @beg_s1 += 1
35
+ next
36
+ end
33
37
  anchor = @s1[@beg_s1, @size_ngram]
34
38
 
35
39
  # search_position = 0
36
40
  search_position = @end_s2_prev
37
41
  while @beg_s2 = @s2.index(anchor, search_position)
38
42
  # if both the beginning points are sufficiently close to the end points of the last match
39
- break if @end_s1_prev && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 >= @end_s2_prev) && (@beg_s2 - @end_s2_prev < 5)
43
+ break if @beg_s1 > 0 && @beg_s2 > 0 && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 >= @end_s2_prev) && (@beg_s2 - @end_s2_prev < 5)
40
44
 
41
45
  left_window_s1, left_window_s2 = get_left_windows
42
46
  break if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD)
@@ -57,9 +61,10 @@ class TextAlignment::AnchorFinder
57
61
  # extend the block
58
62
  b1 = @beg_s1
59
63
  b2 = @beg_s2
60
- while b1 >= @end_s1_prev && b2 > -1 && @s1[b1] == @s2[b2]
64
+ while b1 >= @end_s1_prev && b2 >= @end_s2_prev && @s1[b1] == @s2[b2]
61
65
  b1 -= 1; b2 -= 1
62
66
  end
67
+
63
68
  b1 += 1; b2 += 1
64
69
 
65
70
  e1 = @beg_s1 + @size_ngram
@@ -82,7 +87,8 @@ class TextAlignment::AnchorFinder
82
87
  private
83
88
 
84
89
  def get_left_windows
85
- return if @beg_s1 < @size_window || @beg_s2 < @size_window
90
+ # commented out below, on the assumption that the beginning of a document gives significant locational information
91
+ # return if @beg_s1 < @size_window || @beg_s2 < @size_window
86
92
 
87
93
  window_s1 = ''
88
94
  loc = @beg_s1 - 1
@@ -110,7 +116,8 @@ class TextAlignment::AnchorFinder
110
116
  end
111
117
 
112
118
  def get_right_windows
113
- return if (@beg_s1 + @size_ngram > (@s1.length - @size_window)) || (@beg_s2 + @size_ngram > (@s2.length - @size_window))
119
+ # commented out below, on the assumption that the end of a document gives significant locational information
120
+ # return if (@beg_s1 + @size_ngram > (@s1.length - @size_window)) || (@beg_s2 + @size_ngram > (@s2.length - @size_window))
114
121
 
115
122
  window_s1 = ''
116
123
  loc = @beg_s1 + @size_ngram
@@ -40,6 +40,9 @@ class TextAlignment::TextAlignment
40
40
  end
41
41
  end
42
42
 
43
+ # pp mblocks
44
+ # puts "-----"
45
+ # puts
43
46
  # mblocks.each do |b|
44
47
  # p [b[:source], b[:target]]
45
48
  # puts "---"
@@ -78,6 +81,8 @@ class TextAlignment::TextAlignment
78
81
 
79
82
  @block_alignments << {source:{begin:0, end:b1}, target:{begin:0, end:b2}, alignment: :empty} if b1 > 0
80
83
 
84
+ _str1 = str1[b1 ... e1]
85
+ _str2 = str2[b2 ... e2]
81
86
  alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
82
87
  if alignment.similarity < 0.6
83
88
  @block_alignments << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment: :empty}
@@ -113,24 +118,11 @@ class TextAlignment::TextAlignment
113
118
  end
114
119
 
115
120
  # Final step
116
- if mblocks[-1][:source][:end] < str1.length
117
- b1 = mblocks[-1][:source][:end]
118
- b2 = mblocks[-1][:target][:end]
119
-
120
- if mblocks[-1][:target][:end] < str2.length
121
-
122
- else
123
- e1 = str1.length
124
- e2 = str2.length
125
- @block_alignments << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}
126
- end
127
- end
128
-
129
121
  if mblocks[-1][:source][:end] < str1.length && mblocks[-1][:target][:end] < str2.length
130
122
  b1 = mblocks[-1][:source][:end]
131
123
  b2 = mblocks[-1][:target][:end]
132
- _str1 = str1[b1 ... -1]
133
- _str2 = str2[b2 ... -1]
124
+ _str1 = str1[b1 ... str1.length]
125
+ _str2 = str2[b2 ... str2.length]
134
126
 
135
127
  unless _str1.strip.empty?
136
128
  if _str2.strip.empty?
@@ -1,3 +1,3 @@
1
1
  class TextAlignment
2
- VERSION = '0.3.18'
2
+ VERSION = '0.3.23'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_alignment
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.18
4
+ version: 0.3.23
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-09-18 00:00:00.000000000 Z
11
+ date: 2020-09-29 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-dictionary