text_alignment 0.3.23 → 0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/align_annotations +40 -49
- data/lib/text_alignment/anchor_finder.rb +5 -7
- data/lib/text_alignment/approximate_fit.rb +4 -6
- data/lib/text_alignment/constants.rb +7 -0
- data/lib/text_alignment/glcs_alignment_fast.rb +0 -2
- data/lib/text_alignment/mixed_alignment.rb +1 -54
- data/lib/text_alignment/text_alignment.rb +194 -12
- data/lib/text_alignment/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 9f5f7f27c8628123530d51d0a68060aa6fb850bcef8c7089c8bf990f7257a80b
|
|
4
|
+
data.tar.gz: 45f768df4e7d89c931295985adb31df9e725156e5a85d7f78a5b7cd26d00be4d
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 4004293fa10eb247110764d16a24900a10e714072227dd6e5626d2123deca4bcec00b7255e1affaba4d7dda75d7e42049aabf6b1e6c51f4c005124443b5f9ffc
|
|
7
|
+
data.tar.gz: eb5c8a2c89c8973242bb77457e0f8d9922486d9743e7c39ee12ea4025ece6d888be5308eaddec076e5a21c84149dcb31bc0ed6fcd68881fb6c8c4063e49fb64d
|
data/bin/align_annotations
CHANGED
|
@@ -103,65 +103,56 @@ target_annotations = if source_annotations.class == Array
|
|
|
103
103
|
else
|
|
104
104
|
alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text)
|
|
105
105
|
|
|
106
|
-
pp alignment
|
|
106
|
+
# pp alignment
|
|
107
107
|
|
|
108
108
|
# verification
|
|
109
|
-
source_text = source_annotations[:text]
|
|
110
|
-
puts "=====BEGIN"
|
|
111
|
-
(0 ... source_text.rstrip.length).each do |p|
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
end
|
|
119
|
-
puts
|
|
120
|
-
puts "=====END"
|
|
121
|
-
|
|
122
|
-
puts "=====BEGIN"
|
|
123
|
-
(0 .. source_text.rstrip.length).each do |p|
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
else
|
|
128
|
-
print '.'
|
|
129
|
-
end
|
|
130
|
-
end
|
|
131
|
-
puts
|
|
132
|
-
puts "=====END"
|
|
133
|
-
|
|
134
|
-
# alignment.block_alignments.each do |a|
|
|
135
|
-
# if a[:alignment].nil? || a[:alignment] == :empty
|
|
136
|
-
# # p [a[:source], a[:target]]
|
|
137
|
-
# # p a[:alignment]
|
|
109
|
+
# source_text = source_annotations[:text]
|
|
110
|
+
# puts "=====BEGIN"
|
|
111
|
+
# (0 ... source_text.rstrip.length).each do |p|
|
|
112
|
+
# t = alignment.transform_begin_position(p)
|
|
113
|
+
# if t.nil?
|
|
114
|
+
# print source_text[p]
|
|
115
|
+
# else
|
|
116
|
+
# print '.'
|
|
117
|
+
# end
|
|
118
|
+
# end
|
|
119
|
+
# puts
|
|
120
|
+
# puts "=====END"
|
|
121
|
+
|
|
122
|
+
# puts "=====BEGIN"
|
|
123
|
+
# (0 .. source_text.rstrip.length).each do |p|
|
|
124
|
+
# t = alignment.transform_end_position(p)
|
|
125
|
+
# if t.nil?
|
|
126
|
+
# print source_text[p]
|
|
138
127
|
# else
|
|
139
|
-
#
|
|
140
|
-
# p a[:alignment].similarity
|
|
141
|
-
# puts "--"
|
|
142
|
-
# puts source_annotations[:text][a[:source][:begin] ... a[:source][:end]]
|
|
143
|
-
# puts "--"
|
|
144
|
-
# puts target_text[a[:target][:begin] ... a[:target][:end]]
|
|
145
|
-
# puts "======"
|
|
128
|
+
# print '.'
|
|
146
129
|
# end
|
|
147
130
|
# end
|
|
131
|
+
# puts
|
|
132
|
+
# puts "=====END"
|
|
133
|
+
|
|
134
|
+
source_text = source_annotations[:text]
|
|
135
|
+
|
|
136
|
+
# puts "[block alignment]"
|
|
137
|
+
puts alignment.alignment_table
|
|
138
|
+
# puts "====="
|
|
148
139
|
# exit
|
|
149
140
|
|
|
150
141
|
# verification of source denotations
|
|
151
|
-
puts "[Invalid source denotations]"
|
|
152
|
-
source_annotations[:denotations] do |d|
|
|
153
|
-
|
|
154
|
-
end
|
|
155
|
-
puts "====="
|
|
156
|
-
puts
|
|
142
|
+
# puts "[Invalid source denotations]"
|
|
143
|
+
# source_annotations[:denotations].each do |d|
|
|
144
|
+
# p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < source_text.length
|
|
145
|
+
# end
|
|
146
|
+
# puts "====="
|
|
147
|
+
# puts
|
|
157
148
|
|
|
158
149
|
denotations = alignment.transform_hdenotations(source_annotations[:denotations])
|
|
159
|
-
puts "[Invalid transformation]"
|
|
160
|
-
denotations.each do |d|
|
|
161
|
-
|
|
162
|
-
end
|
|
163
|
-
puts "====="
|
|
164
|
-
puts
|
|
150
|
+
# puts "[Invalid transformation]"
|
|
151
|
+
# denotations.each do |d|
|
|
152
|
+
# p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < target_text.length
|
|
153
|
+
# end
|
|
154
|
+
# puts "====="
|
|
155
|
+
# puts
|
|
165
156
|
|
|
166
157
|
lost_annotations += alignment.lost_annotations if alignment.lost_annotations
|
|
167
158
|
|
|
@@ -1,17 +1,15 @@
|
|
|
1
1
|
#!/usr/bin/env ruby
|
|
2
|
+
require 'text_alignment/constants'
|
|
2
3
|
require 'string-similarity'
|
|
3
4
|
|
|
4
5
|
module TextAlignment; end unless defined? TextAlignment
|
|
5
6
|
|
|
6
|
-
TextAlignment::SIZE_NGRAM = 8 unless defined? TextAlignment::SIZE_NGRAM
|
|
7
|
-
TextAlignment::SIZE_WINDOW = 40 unless defined? TextAlignment::SIZE_WINDOW
|
|
8
|
-
TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.8 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD
|
|
9
|
-
|
|
10
7
|
class TextAlignment::AnchorFinder
|
|
11
8
|
|
|
12
|
-
def initialize(source_str, target_str, _size_ngram = nil, _size_window = nil)
|
|
9
|
+
def initialize(source_str, target_str, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
|
|
13
10
|
@size_ngram = _size_ngram || TextAlignment::SIZE_NGRAM
|
|
14
11
|
@size_window = _size_window || TextAlignment::SIZE_WINDOW
|
|
12
|
+
@sim_threshold = _text_similiarity_threshold || TextAlignment::TEXT_SIMILARITY_THRESHOLD
|
|
15
13
|
|
|
16
14
|
@reverse = (target_str.length < source_str.length)
|
|
17
15
|
|
|
@@ -43,10 +41,10 @@ class TextAlignment::AnchorFinder
|
|
|
43
41
|
break if @beg_s1 > 0 && @beg_s2 > 0 && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 >= @end_s2_prev) && (@beg_s2 - @end_s2_prev < 5)
|
|
44
42
|
|
|
45
43
|
left_window_s1, left_window_s2 = get_left_windows
|
|
46
|
-
break if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) >
|
|
44
|
+
break if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > @sim_threshold)
|
|
47
45
|
|
|
48
46
|
right_window_s1, right_window_s2 = get_right_windows
|
|
49
|
-
break if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) >
|
|
47
|
+
break if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > @sim_threshold)
|
|
50
48
|
|
|
51
49
|
search_position = @beg_s2 + 1
|
|
52
50
|
end
|
|
@@ -1,13 +1,11 @@
|
|
|
1
1
|
#!/usr/bin/env ruby
|
|
2
|
+
require 'text_alignment/constants'
|
|
2
3
|
require 'string-similarity'
|
|
3
4
|
|
|
4
5
|
module TextAlignment; end unless defined? TextAlignment
|
|
5
6
|
|
|
6
7
|
# approximate the location of str1 in str2
|
|
7
|
-
TextAlignment::SIGNATURE_NGRAM = 7 unless defined? TextAlignment::SIGNATURE_NGRAM
|
|
8
8
|
TextAlignment::MIN_LENGTH_FOR_APPROXIMATION = 50 unless defined? TextAlignment::MIN_LENGTH_FOR_APPROXIMATION
|
|
9
|
-
TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
|
|
10
|
-
TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.7 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD
|
|
11
9
|
|
|
12
10
|
class << TextAlignment
|
|
13
11
|
|
|
@@ -16,8 +14,8 @@ class << TextAlignment
|
|
|
16
14
|
raise ArgumentError, 'nil string' if str1.nil? || str2.nil?
|
|
17
15
|
return 0, str2.length if str2.length < TextAlignment::MIN_LENGTH_FOR_APPROXIMATION
|
|
18
16
|
|
|
19
|
-
ngram1 = (0 .. str1.length - TextAlignment::
|
|
20
|
-
ngram2 = (0 .. str2.length - TextAlignment::
|
|
17
|
+
ngram1 = (0 .. str1.length - TextAlignment::SIZE_NGRAM).collect{|i| str1[i, TextAlignment::SIZE_NGRAM]}
|
|
18
|
+
ngram2 = (0 .. str2.length - TextAlignment::SIZE_NGRAM).collect{|i| str2[i, TextAlignment::SIZE_NGRAM]}
|
|
21
19
|
ngram_shared = ngram1 & ngram2
|
|
22
20
|
|
|
23
21
|
# If there is no shared n-gram found, it may mean there is no serious overlap between the two strings
|
|
@@ -45,7 +43,7 @@ class << TextAlignment
|
|
|
45
43
|
text_similarity = text_similarity(str1, str2[fit_begin ... fit_end])
|
|
46
44
|
cache["#{fit_begin}-#{fit_end}"] = text_similarity
|
|
47
45
|
|
|
48
|
-
break if text_similarity > TextAlignment::
|
|
46
|
+
break if text_similarity > TextAlignment::TEXT_SIMILARITY_THRESHOLD
|
|
49
47
|
fit_begin, fit_end = nil, nil
|
|
50
48
|
end
|
|
51
49
|
return fit_begin, fit_end if fit_begin && fit_end && fit_begin < fit_end
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
module TextAlignment; end unless defined? TextAlignment
|
|
2
|
+
|
|
3
|
+
TextAlignment::SIZE_NGRAM = 8 unless defined? TextAlignment::SIZE_NGRAM
|
|
4
|
+
TextAlignment::SIZE_WINDOW = 60 unless defined? TextAlignment::SIZE_WINDOW
|
|
5
|
+
TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
|
|
6
|
+
TextAlignment::BUFFER_MIN = 20 unless defined? TextAlignment::BUFFER_MIN
|
|
7
|
+
TextAlignment::TEXT_SIMILARITY_THRESHOLD = 0.9 unless defined? TextAlignment::TEXT_SIMILARITY_THRESHOLD
|
|
@@ -9,8 +9,6 @@ require 'text_alignment/mappings'
|
|
|
9
9
|
|
|
10
10
|
module TextAlignment; end unless defined? TextAlignment
|
|
11
11
|
|
|
12
|
-
TextAlignment::SIGNATURE_NGRAM = 5 unless defined? TextAlignment::SIGNATURE_NGRAM
|
|
13
|
-
|
|
14
12
|
class TextAlignment::GLCSTextAlignment
|
|
15
13
|
attr_reader :position_map_begin, :position_map_end
|
|
16
14
|
attr_reader :common_elements, :mapped_elements
|
|
@@ -10,8 +10,6 @@ require 'text_alignment/mappings'
|
|
|
10
10
|
|
|
11
11
|
module TextAlignment; end unless defined? TextAlignment
|
|
12
12
|
|
|
13
|
-
TextAlignment::NOMATCH_CHARS = "@^|#$%&_" unless defined? TextAlignment::NOMATCH_CHARS
|
|
14
|
-
|
|
15
13
|
class TextAlignment::MixedAlignment
|
|
16
14
|
attr_reader :sdiff
|
|
17
15
|
attr_reader :position_map_begin, :position_map_end
|
|
@@ -21,58 +19,7 @@ class TextAlignment::MixedAlignment
|
|
|
21
19
|
|
|
22
20
|
def initialize(str1, str2, mappings = [])
|
|
23
21
|
raise ArgumentError, "nil string" if str1.nil? || str2.nil?
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
## preprocessing
|
|
27
|
-
str1 = str1.dup
|
|
28
|
-
str2 = str2.dup
|
|
29
|
-
mappings = mappings.dup
|
|
30
|
-
|
|
31
|
-
## find the first nomatch character
|
|
32
|
-
TextAlignment::NOMATCH_CHARS.each_char do |c|
|
|
33
|
-
if str2.index(c).nil?
|
|
34
|
-
@nomatch_char1 = c
|
|
35
|
-
break
|
|
36
|
-
end
|
|
37
|
-
end
|
|
38
|
-
raise RuntimeError, "Cannot find nomatch character" if @nomatch_char1.nil?
|
|
39
|
-
|
|
40
|
-
## find the first nomatch character
|
|
41
|
-
TextAlignment::NOMATCH_CHARS.each_char do |c|
|
|
42
|
-
if c != @nomatch_char1 && str1.index(c).nil?
|
|
43
|
-
@nomatch_char2 = c
|
|
44
|
-
break
|
|
45
|
-
end
|
|
46
|
-
end
|
|
47
|
-
raise RuntimeError, "Cannot find nomatch character" if @nomatch_char2.nil?
|
|
48
|
-
|
|
49
|
-
# single character mappings
|
|
50
|
-
character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
|
|
51
|
-
characters_from = character_mappings.collect{|m| m[0]}.join
|
|
52
|
-
characters_to = character_mappings.collect{|m| m[1]}.join
|
|
53
|
-
characters_to.gsub!(/-/, '\-')
|
|
54
|
-
|
|
55
|
-
str1.tr!(characters_from, characters_to)
|
|
56
|
-
str2.tr!(characters_from, characters_to)
|
|
57
|
-
|
|
58
|
-
mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
|
|
59
|
-
|
|
60
|
-
# ASCII foldings
|
|
61
|
-
ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
|
|
62
|
-
ascii_foldings.each do |f|
|
|
63
|
-
from = f[1]
|
|
64
|
-
|
|
65
|
-
if str2.index(f[0])
|
|
66
|
-
to = f[0] + (@nomatch_char1 * (f[1].length - 1))
|
|
67
|
-
str1.gsub!(from, to)
|
|
68
|
-
end
|
|
69
|
-
|
|
70
|
-
if str1.index(f[0])
|
|
71
|
-
to = f[0] + (@nomatch_char2 * (f[1].length - 1))
|
|
72
|
-
str2.gsub!(from, to)
|
|
73
|
-
end
|
|
74
|
-
end
|
|
75
|
-
mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
|
|
22
|
+
mappings ||= []
|
|
76
23
|
|
|
77
24
|
_compute_mixed_alignment(str1, str2, mappings)
|
|
78
25
|
end
|
|
@@ -1,23 +1,24 @@
|
|
|
1
1
|
#!/usr/bin/env ruby
|
|
2
|
+
require 'text_alignment/constants'
|
|
2
3
|
require 'text_alignment/anchor_finder'
|
|
3
4
|
require 'text_alignment/mixed_alignment'
|
|
4
5
|
|
|
5
6
|
module TextAlignment; end unless defined? TextAlignment
|
|
6
7
|
|
|
7
|
-
TextAlignment::
|
|
8
|
-
TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
|
|
9
|
-
TextAlignment::BUFFER_MIN = 10 unless defined? TextAlignment::BUFFER_MIN
|
|
10
|
-
|
|
8
|
+
# Candidate padding letters. string_preprocessing picks from this list a letter
# that does not occur in the opposite text and uses it to pad ASCII foldings.
# Frozen so that the shared list cannot be modified by accident.
TextAlignment::PADDING_LETTERS = ['@', '^', '|', '#', '$', '%', '&', '_'].freeze unless defined? TextAlignment::PADDING_LETTERS
|
|
11
9
|
|
|
12
10
|
class TextAlignment::TextAlignment
|
|
13
11
|
attr_reader :block_alignments
|
|
14
12
|
attr_reader :similarity
|
|
15
13
|
attr_reader :lost_annotations
|
|
16
14
|
|
|
17
|
-
def initialize(
|
|
18
|
-
raise ArgumentError, "nil string" if
|
|
15
|
+
def initialize(_str1, _str2, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
|
|
16
|
+
raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
|
|
17
|
+
|
|
18
|
+
@ostr1 = _str1
|
|
19
|
+
@ostr2 = _str2
|
|
19
20
|
|
|
20
|
-
mappings
|
|
21
|
+
str1, str2, mappings = string_preprocessing(_str1, _str2)
|
|
21
22
|
|
|
22
23
|
# try exact match
|
|
23
24
|
block_begin = str2.index(str1)
|
|
@@ -26,7 +27,14 @@ class TextAlignment::TextAlignment
|
|
|
26
27
|
return @block_alignments
|
|
27
28
|
end
|
|
28
29
|
|
|
29
|
-
|
|
30
|
+
# try case-insensitive exact match
|
|
31
|
+
block_begin = str2.downcase.index(str1.downcase)
|
|
32
|
+
unless block_begin.nil?
|
|
33
|
+
@block_alignments = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin}]
|
|
34
|
+
return @block_alignments
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, _size_ngram, _size_window, _text_similiarity_threshold)
|
|
30
38
|
|
|
31
39
|
# To collect matched blocks
|
|
32
40
|
mblocks = []
|
|
@@ -93,7 +101,7 @@ class TextAlignment::TextAlignment
|
|
|
93
101
|
end
|
|
94
102
|
end
|
|
95
103
|
end
|
|
96
|
-
@block_alignments << mblocks[0]
|
|
104
|
+
@block_alignments << mblocks[0].merge(alignment: :block)
|
|
97
105
|
|
|
98
106
|
(1 ... mblocks.length).each do |i|
|
|
99
107
|
b1 = mblocks[i - 1][:source][:end]
|
|
@@ -114,7 +122,7 @@ class TextAlignment::TextAlignment
|
|
|
114
122
|
end
|
|
115
123
|
end
|
|
116
124
|
end
|
|
117
|
-
@block_alignments << mblocks[i]
|
|
125
|
+
@block_alignments << mblocks[i].merge(alignment: :block)
|
|
118
126
|
end
|
|
119
127
|
|
|
120
128
|
# Final step
|
|
@@ -156,7 +164,7 @@ class TextAlignment::TextAlignment
|
|
|
156
164
|
i = @block_alignments.index{|b| b[:source][:end] > begin_position}
|
|
157
165
|
block_alignment = @block_alignments[i]
|
|
158
166
|
|
|
159
|
-
b = if block_alignment[:alignment]
|
|
167
|
+
b = if block_alignment[:alignment] == :block
|
|
160
168
|
begin_position + block_alignment[:delta]
|
|
161
169
|
elsif block_alignment[:alignment] == :empty
|
|
162
170
|
if begin_position == block_alignment[:source][:begin]
|
|
@@ -175,7 +183,7 @@ class TextAlignment::TextAlignment
|
|
|
175
183
|
i = @block_alignments.index{|b| b[:source][:end] >= end_position}
|
|
176
184
|
block_alignment = @block_alignments[i]
|
|
177
185
|
|
|
178
|
-
e = if block_alignment[:alignment]
|
|
186
|
+
e = if block_alignment[:alignment] == :block
|
|
179
187
|
end_position + block_alignment[:delta]
|
|
180
188
|
elsif block_alignment[:alignment] == :empty
|
|
181
189
|
if end_position == block_alignment[:source][:end]
|
|
@@ -232,4 +240,178 @@ class TextAlignment::TextAlignment
|
|
|
232
240
|
r
|
|
233
241
|
end
|
|
234
242
|
|
|
243
|
+
# Renders the block alignments as an HTML table and returns the markup.
#
# For every entry of @block_alignments a header row with the source and
# target offsets is emitted (see #alignment_table_th), followed by one
# content row whose shape depends on the entry's :alignment value:
# * :block        - one cell spanning both columns that holds the source text
# * :empty        - the source and the target texts, each in its own cell
# * anything else - the two texts laid out position by position from the
#                   alignment's sdiff, with '_' filling the gaps
#
# NOTE(review): the texts are inserted verbatim, without HTML escaping.
#
# @return [String] the HTML markup of the table
def alignment_table
  header = [
    "<table class='text_alignment_table'>",
    "<thead>",
    "<tr>",
    "<th class='text_alignment_left' style='width:50%'>Text 1</th>",
    "<th class='text_alignment_right'>Text 2</th>",
    "</tr>",
    "</thead>",
    "<tbody>"
  ].join("\n") + "\n"

  rows = @block_alignments.map do |a|
    cells = case a[:alignment]
            when :block
              "<td colspan='2' class='text_alignment_common'>" +
                @ostr1[a[:source][:begin] ... a[:source][:end]] +
                "</td>\n"
            when :empty
              "<td class='text_alignment_left'>" + @ostr1[a[:source][:begin] ... a[:source][:end]] + "</td>\n" +
                "<td class='text_alignment_right'>" + @ostr2[a[:target][:begin] ... a[:target][:end]] + "</td>\n"
            else
              astr1, astr2 = _alignment_gapped_strings(a)
              "<td class='text_alignment_left'>" + astr1 + "</td>\n" +
                "<td class='text_alignment_right'>" + astr2 + "</td>\n"
            end

    alignment_table_th(a) + "<tr>\n" + cells + "</tr>\n"
  end

  header + rows.join + '</tbody></table>'
end

# Builds the header row of one block alignment for #alignment_table.
#
# @param a [Hash] a block alignment with :source and :target spans
# @return [String] a <tr> holding the "begin - end" offsets of both spans
def alignment_table_th(a)
  "<tr>" \
    "<th class='text_alignment_left'>#{a[:source][:begin]} - #{a[:source][:end]}</th>" \
    "<th class='text_alignment_right'>#{a[:target][:begin]} - #{a[:target][:end]}</th>" \
    "</tr>"
end

# Renders the block alignments as plain text and returns it.
#
# Every entry of @block_alignments contributes one section:
# * :block        - "===== common =====" followed by the source text
# * :empty        - the inspected source and target texts under the
#                   "<<<<< string 1" / ">>>>> string 2" headings
# * anything else - "***** local mismatch" followed by the two gapped
#                   strings in square brackets
#
# Nothing is printed; the whole text is returned so that the caller decides
# where it goes, in the same way as #alignment_table.
#
# @return [String] the textual rendering of the alignment
def alignment_show
  sections = @block_alignments.map do |a|
    case a[:alignment]
    when :block
      "===== common =====\n" +
        @ostr1[a[:source][:begin] ... a[:source][:end]] + "\n\n"
    when :empty
      "<<<<< string 1\n" +
        @ostr1[a[:source][:begin] ... a[:source][:end]].inspect + "\n\n" +
        ">>>>> string 2\n" +
        @ostr2[a[:target][:begin] ... a[:target][:end]].inspect + "\n\n"
    else
      astr1, astr2 = _alignment_gapped_strings(a)
      "***** local mismatch\n" \
        "[#{astr1}]\n" \
        "[#{astr2}]\n\n"
    end
  end

  sections.join
end

# Lays out both sides of a locally aligned block, one sdiff entry at a time.
# An entry contributes the original character of the side it belongs to and
# '_' on the side where it has no counterpart; a '!' (replacement) entry
# contributes the source character followed by '_' on the first side and
# '_' followed by the target character on the second side.
#
# @param a [Hash] a block alignment whose :alignment responds to #sdiff;
#   the sdiff entries are read through #action, #old_position and
#   #new_position, which are taken relative to the block's begin offsets
# @return [Array<String>] the gapped source string and the gapped target string
def _alignment_gapped_strings(a)
  sdiff = a[:alignment].sdiff
  source_base = a[:source][:begin]
  target_base = a[:target][:begin]

  astr1 = sdiff.map do |c|
    case c.action
    when '=', '-'
      @ostr1[c.old_position + source_base]
    when '+'
      '_'
    when '!'
      @ostr1[c.old_position + source_base] + '_'
    end
  end.join

  astr2 = sdiff.map do |c|
    case c.action
    when '=', '+'
      @ostr2[c.new_position + target_base]
    when '-'
      '_'
    when '!'
      '_' + @ostr2[c.new_position + target_base]
    end
  end.join

  [astr1, astr2]
end
private :_alignment_gapped_strings
|
|
361
|
+
|
|
362
|
+
private
|
|
363
|
+
|
|
364
|
+
# Normalizes copies of the two texts with TextAlignment::MAPPINGS before
# they are aligned; the arguments themselves are not modified.
#
# 1. Mappings from one character to one character are applied to both
#    copies by transliteration.
# 2. Mappings from one character to a longer string ("ASCII foldings") are
#    applied in reverse: when one text contains the single character, each
#    occurrence of the longer string in the other text is replaced by that
#    character followed by padding letters. The replacement has the same
#    length as the string it replaces. The padding letter used inside a
#    text is chosen so that it does not occur in the opposite text.
#
# @param _str1 [String] the first text
# @param _str2 [String] the second text
# @return [Array] [str1, str2, mappings] - the preprocessed copies and the
#   mappings that were not consumed by the two steps above
# @raise [RuntimeError] if every padding letter is already in use
def string_preprocessing(_str1, _str2)
  str1 = _str1.dup
  str2 = _str2.dup
  mappings = TextAlignment::MAPPINGS.dup

  ## single character mappings
  character_mappings, mappings = mappings.partition { |m| m[0].length == 1 && m[1].length == 1 }

  # '\', '^' and '-' have a special meaning inside a String#tr set, so they
  # are escaped on both sides to keep the character-by-character
  # correspondence between the two sets intact.
  tr_set = lambda { |chars| chars.join.gsub(/[\\\^\-]/) { |c| "\\" + c } }
  characters_from = tr_set.call(character_mappings.map { |m| m[0] })
  characters_to = tr_set.call(character_mappings.map { |m| m[1] })

  str1.tr!(characters_from, characters_to)
  str2.tr!(characters_from, characters_to)

  ## long to one character mappings
  pletters = TextAlignment::PADDING_LETTERS

  # find the padding letter for str1: it must not occur in str2
  padding_letter1 = pletters.find { |l| !str2.include?(l) }
  raise RuntimeError, "Could not find a padding letter for str1" if padding_letter1.nil?

  # find the padding letter for str2: it must differ from the first one and not occur in str1
  padding_letter2 = pletters.find { |l| l != padding_letter1 && !str1.include?(l) }
  raise RuntimeError, "Could not find a padding letter for str2" if padding_letter2.nil?

  # ASCII foldings
  ascii_foldings, mappings = mappings.partition { |m| m[0].length == 1 && m[1].length > 1 }
  ascii_foldings.each do |f|
    folded, unfolded = f
    padding_length = unfolded.length - 1

    str1.gsub!(unfolded, folded + (padding_letter1 * padding_length)) if str2.include?(folded)

    # Checked after str1 may have been rewritten above, as the statement
    # order has always been: a replacement made in str1 also makes this
    # condition true.
    str2.gsub!(unfolded, folded + (padding_letter2 * padding_length)) if str1.include?(folded)
  end

  [str1, str2, mappings]
end
|
|
416
|
+
|
|
235
417
|
end
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: text_alignment
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: '0.5'
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Jin-Dong Kim
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2020-
|
|
11
|
+
date: 2020-10-04 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: ruby-dictionary
|
|
@@ -77,6 +77,7 @@ files:
|
|
|
77
77
|
- lib/text_alignment.rb
|
|
78
78
|
- lib/text_alignment/anchor_finder.rb
|
|
79
79
|
- lib/text_alignment/approximate_fit.rb
|
|
80
|
+
- lib/text_alignment/constants.rb
|
|
80
81
|
- lib/text_alignment/find_divisions.rb
|
|
81
82
|
- lib/text_alignment/glcs_alignment.rb
|
|
82
83
|
- lib/text_alignment/glcs_alignment_fast.rb
|