text_alignment 0.7.2 → 0.10.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/align_annotations +37 -138
- data/lib/text_alignment/anchor_finder.rb +130 -62
- data/lib/text_alignment/char_mapping.rb +187 -0
- data/lib/text_alignment/constants.rb +1 -1
- data/lib/text_alignment/cultivation_map.rb +19 -0
- data/lib/text_alignment/glcs_alignment_fast.rb +2 -2
- data/lib/text_alignment/mixed_alignment.rb +7 -63
- data/lib/text_alignment/text_alignment.rb +251 -182
- data/lib/text_alignment/version.rb +1 -1
- metadata +4 -3
- data/lib/text_alignment/mappings.rb +0 -74
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 710fa3dfe07e268c62de77c67abb52e1893a36a9c0fbdfbf7400328ab7f8358a
|
4
|
+
data.tar.gz: 31a086666978c4776a60b0ce19fde48beeb8a5bce599b7572f20089cfb5c7500
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d0a752d2203f65e48b4f10ff29eb29a77504e7cef4da1dd3602e2ea5107a99b7edff4c3609c79c7dcba0f9cc458780dc4788ff8123a781a4d77ae89236170bf5
|
7
|
+
data.tar.gz: '094ff5f662ccdc2fc14f81549126362be49fd64610f994c6ba5c4457fbbbececc1931257589ebde49211c2bce744ca033bfe3673c8af5c8a01af7679bfb626cc'
|
data/bin/align_annotations
CHANGED
@@ -26,33 +26,46 @@ def read_text(filename)
|
|
26
26
|
end
|
27
27
|
end
|
28
28
|
|
29
|
-
def
|
29
|
+
def align_denotations(denotations, source_text, target_text, debug = false, cm = nil)
|
30
|
+
alignment = TextAlignment::TextAlignment.new(source_text, target_text, denotations, cm)
|
31
|
+
cm = alignment.cultivation_map
|
32
|
+
new_denotations = alignment.transform_hdenotations(denotations)
|
33
|
+
|
34
|
+
if debug
|
35
|
+
warn "[block alignment]"
|
36
|
+
warn alignment.alignment_show
|
37
|
+
warn "-----"
|
38
|
+
end
|
39
|
+
|
40
|
+
lost_annotations = alignment.lost_annotations
|
41
|
+
unless lost_annotations.empty?
|
42
|
+
warn "\n[lost annotations] #{lost_annotations.length}"
|
43
|
+
lost_annotations.each do |a|
|
44
|
+
warn "#{a}"
|
45
|
+
end
|
46
|
+
warn "====="
|
47
|
+
end
|
48
|
+
warn
|
49
|
+
|
50
|
+
# return target annotations
|
51
|
+
[new_denotations, cm]
|
52
|
+
end
|
53
|
+
|
54
|
+
def align_mannotations(source_annotations, target_text, debug = false)
|
55
|
+
target_annotations = {text:target_text}
|
56
|
+
|
30
57
|
idnum_denotations = 0
|
31
58
|
idnum_relations = 0
|
32
59
|
idnum_attributes = 0
|
33
60
|
idnum_modifications = 0
|
34
61
|
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
puts alignment.alignment_show
|
39
|
-
puts "-----"
|
40
|
-
puts
|
41
|
-
|
42
|
-
# alignment.block_alignments.each do |a|
|
43
|
-
# p {source:a[:source], target:a[:target]}
|
44
|
-
# puts "--"
|
45
|
-
# p a[:alignment] if a[:alignment].nil? || a[:alignment] == :empty
|
46
|
-
# puts "--"
|
47
|
-
# puts annotations[:text][a[:source][:begin] ... a[:source][:end]]
|
48
|
-
# puts "--"
|
49
|
-
# puts target_text[a[:target][:begin] ... a[:target][:end]]
|
50
|
-
# puts "======"
|
51
|
-
# end
|
52
|
-
|
62
|
+
cm = nil
|
63
|
+
source_annotations.each_with_index do |annotations, i|
|
53
64
|
if annotations.has_key?(:denotations) && !annotations[:denotations].empty?
|
54
65
|
ididx = {}
|
55
|
-
|
66
|
+
warn "[#{i}]-=-=-=-=-"
|
67
|
+
denotations, cm = align_denotations(annotations[:denotations], annotations[:text], target_text, debug, cm)
|
68
|
+
|
56
69
|
denotations.each do |d|
|
57
70
|
reid = 'T' + (idnum_denotations += 1).to_s
|
58
71
|
ididx[d[:id]] = reid
|
@@ -101,126 +114,12 @@ end
|
|
101
114
|
source_annotations = read_annotations(ARGV[0])
|
102
115
|
target_text = read_text(ARGV[1])
|
103
116
|
|
104
|
-
lost_annotations = []
|
105
117
|
target_annotations = if source_annotations.class == Array
|
106
|
-
|
118
|
+
align_mannotations(source_annotations, target_text, false)
|
107
119
|
else
|
108
|
-
|
109
|
-
|
110
|
-
# verification
|
111
|
-
# source_text = source_annotations[:text]
|
112
|
-
# puts "=====BEGIN"
|
113
|
-
# (0 ... source_text.rstrip.length).each do |p|
|
114
|
-
# t = alignment.transform_begin_position(p)
|
115
|
-
# if t.nil?
|
116
|
-
# print source_text[p]
|
117
|
-
# else
|
118
|
-
# print '.'
|
119
|
-
# end
|
120
|
-
# end
|
121
|
-
# puts
|
122
|
-
# puts "=====END"
|
123
|
-
|
124
|
-
# puts "=====BEGIN"
|
125
|
-
# (0 .. source_text.rstrip.length).each do |p|
|
126
|
-
# t = alignment.transform_end_position(p)
|
127
|
-
# if t.nil?
|
128
|
-
# print source_text[p]
|
129
|
-
# else
|
130
|
-
# print '.'
|
131
|
-
# end
|
132
|
-
# end
|
133
|
-
# puts
|
134
|
-
# puts "=====END"
|
135
|
-
|
136
|
-
source_text = source_annotations[:text]
|
137
|
-
|
138
|
-
puts "[block alignment]"
|
139
|
-
puts alignment.alignment_show
|
140
|
-
puts "====="
|
141
|
-
# exit
|
142
|
-
|
143
|
-
denotations = alignment.transform_hdenotations(source_annotations[:denotations])
|
144
|
-
lost_annotations += alignment.lost_annotations if alignment.lost_annotations
|
145
|
-
|
120
|
+
denotations, cm = align_denotations(source_annotations[:denotations], source_annotations[:text], target_text, false)
|
121
|
+
# denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], target_text, true)
|
146
122
|
source_annotations.merge({text:target_text, denotations:denotations})
|
147
123
|
end
|
148
124
|
|
149
|
-
|
150
|
-
num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source = 0, 0, 0, 0
|
151
|
-
source_annotations.each do |annotations|
|
152
|
-
num_denotations_source += annotations[:denotations].nil? ? 0 : annotations[:denotations].length
|
153
|
-
num_relations_source += annotations[:relations].nil? ? 0 : annotations[:relations].length
|
154
|
-
num_attributes_source += annotations[:attributes].nil? ? 0 : annotations[:attributes].length
|
155
|
-
num_modifications_source += annotations[:modifications].nil? ? 0 : annotations[:modifications].length
|
156
|
-
end
|
157
|
-
[num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source]
|
158
|
-
else
|
159
|
-
num_denotations_source = source_annotations[:denotations].nil? ? 0 : source_annotations[:denotations].length
|
160
|
-
num_relations_source = source_annotations[:relations].nil? ? 0 : source_annotations[:relations].length
|
161
|
-
num_attributes_source = source_annotations[:attributes].nil? ? 0 : source_annotations[:attributes].length
|
162
|
-
num_modifications_source = source_annotations[:modifications].nil? ? 0 : source_annotations[:modifications].length
|
163
|
-
[num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source]
|
164
|
-
end
|
165
|
-
|
166
|
-
warn "[source]"
|
167
|
-
warn "denotations:\t#{num_denotations_source}"
|
168
|
-
# warn "relations:\t#{num_relations_source}"
|
169
|
-
# warn "attributes:\t#{num_attributes_source}"
|
170
|
-
# warn "modifications:\t#{num_modifications_source}"
|
171
|
-
|
172
|
-
warn "\n[target]"
|
173
|
-
warn "denotations:\t#{target_annotations[:denotations].nil? ? 0 : target_annotations[:denotations].length}"
|
174
|
-
# warn "relations:\t#{target_annotations[:relations].nil? ? 0 : target_annotations[:relations].length}"
|
175
|
-
# warn "attributes:\t#{target_annotations[:attributes].nil? ? 0 : target_annotations[:attributes].length}"
|
176
|
-
# warn "modifications:\t#{target_annotations[:modifications].nil? ? 0 : target_annotations[:modifications].length}"
|
177
|
-
|
178
|
-
if lost_annotations
|
179
|
-
warn "\n[lost annotations]"
|
180
|
-
lost_annotations.each do |a|
|
181
|
-
p a
|
182
|
-
end
|
183
|
-
end
|
184
|
-
|
185
|
-
#puts target_annotations.to_json
|
186
|
-
|
187
|
-
# denotations = anns1[:denotations]
|
188
|
-
|
189
|
-
# puts "[Alignment1]====="
|
190
|
-
# align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::MAPPINGS)
|
191
|
-
|
192
|
-
# align.alignment.each do |a|
|
193
|
-
# p [a[:target][:begin], a[:target][:end], a[:source][:begin], a[:source][:end]]
|
194
|
-
# end
|
195
|
-
|
196
|
-
# puts TextAlignment::sdiff2cdiff(align.sdiff)
|
197
|
-
# puts
|
198
|
-
# puts "[Similarity]\n#{align.similarity}"
|
199
|
-
# puts
|
200
|
-
# puts '[Denotations original]'
|
201
|
-
# pp denotations
|
202
|
-
# puts
|
203
|
-
# puts '[Denotations transformed]'
|
204
|
-
# new_denotations = align.transform_hdenotations(denotations)
|
205
|
-
# pp new_denotations
|
206
|
-
# puts
|
207
|
-
# puts "[Alignment2 (downcased)]====="
|
208
|
-
# align = TextAlignment::TextAlignment.new(str1.downcase, str2.downcase, TextAlignment::MAPPINGS)
|
209
|
-
# puts TextAlignment::sdiff2cdiff(align.sdiff)
|
210
|
-
# puts
|
211
|
-
# puts "[Similarity]\n#{align.similarity}"
|
212
|
-
# puts
|
213
|
-
# puts '[Denotations original]'
|
214
|
-
# pp denotations
|
215
|
-
# puts
|
216
|
-
# puts '[Denotations transformed]'
|
217
|
-
# new_denotations = align.transform_hdenotations(denotations)
|
218
|
-
# pp new_denotations
|
219
|
-
# puts
|
220
|
-
# puts '[Annotations transformed]'
|
221
|
-
# anns2[:denotations] = new_denotations
|
222
|
-
# puts anns2.to_json
|
223
|
-
|
224
|
-
# p align.common_elements
|
225
|
-
# puts "---------------"
|
226
|
-
# p align.mapped_elements
|
125
|
+
puts target_annotations.to_json
|
@@ -6,76 +6,65 @@ module TextAlignment; end unless defined? TextAlignment
|
|
6
6
|
|
7
7
|
class TextAlignment::AnchorFinder
|
8
8
|
|
9
|
-
def initialize(source_str, target_str,
|
10
|
-
@
|
11
|
-
@size_window = _size_window || TextAlignment::SIZE_WINDOW
|
12
|
-
@sim_threshold = _text_similiarity_threshold || TextAlignment::TEXT_SIMILARITY_THRESHOLD
|
13
|
-
|
14
|
-
@reverse = (target_str.length < source_str.length)
|
15
|
-
|
16
|
-
@s1, @s2 = if @reverse
|
9
|
+
def initialize(source_str, target_str, cultivation_map)
|
10
|
+
@s1, @s2 = if reverse?(source_str, target_str)
|
17
11
|
[target_str.downcase, source_str.downcase]
|
18
12
|
else
|
19
13
|
[source_str.downcase, target_str.downcase]
|
20
14
|
end
|
21
15
|
|
22
|
-
|
23
|
-
@beg_s1 = 0
|
24
|
-
@end_s1_prev = 0
|
25
|
-
@end_s2_prev = 0
|
26
|
-
end
|
27
|
-
|
28
|
-
def get_next_anchor
|
29
|
-
# find the position of an anchor ngram in s1 and s2
|
30
|
-
while @beg_s1 < (@s1.length - @size_ngram)
|
31
|
-
if [' ', "\n", "\t"].include? @s1[@beg_s1]
|
32
|
-
@beg_s1 += 1
|
33
|
-
next
|
34
|
-
end
|
35
|
-
anchor = @s1[@beg_s1, @size_ngram]
|
16
|
+
@cultivation_map = cultivation_map
|
36
17
|
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
break if @beg_s1 > 0 && @beg_s2 > 0 && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 >= @end_s2_prev) && (@beg_s2 - @end_s2_prev < 5)
|
18
|
+
@size_ngram = TextAlignment::SIZE_NGRAM
|
19
|
+
@size_window = TextAlignment::SIZE_WINDOW
|
20
|
+
@sim_threshold = TextAlignment::TEXT_SIMILARITY_THRESHOLD
|
21
|
+
@pos_s1_final_possible_begin = @s1.length - @size_ngram - 1
|
42
22
|
|
43
|
-
|
44
|
-
|
23
|
+
# positions of last match
|
24
|
+
@pos_s1_last_match = 0
|
25
|
+
@pos_s2_last_match = 0
|
26
|
+
end
|
45
27
|
|
46
|
-
|
47
|
-
|
28
|
+
def reverse?(source_str = nil, target_str = nil)
|
29
|
+
unless source_str.nil?
|
30
|
+
@reverse_p = target_str.length < source_str.length
|
31
|
+
end
|
32
|
+
@reverse_p
|
33
|
+
end
|
48
34
|
|
49
|
-
|
50
|
-
|
35
|
+
def get_next_anchor
|
36
|
+
# To find the beginning positions of an anchor ngram in s1 and s2, beginning from the last positions matched
|
37
|
+
beg_s2 = for beg_s1 in @pos_s1_last_match .. @pos_s1_final_possible_begin
|
51
38
|
|
52
|
-
|
39
|
+
# To skip whitespace letters
|
40
|
+
next if [' ', "\n", "\t"].include? @s1[beg_s1]
|
53
41
|
|
54
|
-
|
42
|
+
_beg_s2 = get_beg_s2(beg_s1)
|
43
|
+
break _beg_s2 unless _beg_s2.nil?
|
55
44
|
end
|
56
45
|
|
57
|
-
return nil
|
46
|
+
# To return nil when it fails to find an anchor
|
47
|
+
return nil if beg_s2.class == Range
|
58
48
|
|
59
|
-
# extend the block
|
60
|
-
b1 =
|
61
|
-
b2 =
|
62
|
-
while b1 >= @
|
49
|
+
# To extend the block to the left
|
50
|
+
b1 = beg_s1
|
51
|
+
b2 = beg_s2
|
52
|
+
while b1 >= @pos_s1_last_match && b2 >= @pos_s2_last_match && @s1[b1] == @s2[b2]
|
63
53
|
b1 -= 1; b2 -= 1
|
64
54
|
end
|
65
|
-
|
66
55
|
b1 += 1; b2 += 1
|
67
56
|
|
68
|
-
|
69
|
-
|
57
|
+
# To extend the block to the right
|
58
|
+
e1 = beg_s1 + @size_ngram
|
59
|
+
e2 = beg_s2 + @size_ngram
|
70
60
|
while @s1[e1] && @s1[e1] == @s2[e2]
|
71
61
|
e1 += 1; e2 += 1
|
72
62
|
end
|
73
63
|
|
74
|
-
@
|
75
|
-
@
|
76
|
-
@beg_s1 = e1
|
64
|
+
@pos_s1_last_match = e1
|
65
|
+
@pos_s2_last_match = e2
|
77
66
|
|
78
|
-
if
|
67
|
+
if reverse?
|
79
68
|
{source:{begin:b2 , end:e2}, target:{begin:b1, end:e1}}
|
80
69
|
else
|
81
70
|
{source:{begin:b1 , end:e1}, target:{begin:b2, end:e2}}
|
@@ -84,14 +73,92 @@ class TextAlignment::AnchorFinder
|
|
84
73
|
|
85
74
|
private
|
86
75
|
|
87
|
-
def
|
88
|
-
#
|
89
|
-
|
76
|
+
def get_beg_s2(beg_s1)
|
77
|
+
# to get the anchor to search for in s2
|
78
|
+
anchor = @s1[beg_s1, @size_ngram]
|
79
|
+
|
80
|
+
# comment out below with the assumption that texts are in the same order
|
81
|
+
# search_position = 0
|
82
|
+
search_position = @pos_s2_last_match
|
83
|
+
|
84
|
+
beg_s2_candidates = find_beg_s2_candidates(anchor, search_position)
|
85
|
+
return nil if beg_s2_candidates.empty?
|
86
|
+
|
87
|
+
find_valid_beg_s2(beg_s1, beg_s2_candidates)
|
88
|
+
end
|
89
|
+
|
90
|
+
# To find beg_s2 which match to the anchor
|
91
|
+
# return nil if the anchor is too much frequent
|
92
|
+
def find_beg_s2_candidates(anchor, search_position)
|
93
|
+
candidates = []
|
94
|
+
while _beg_s2 = @s2.index(anchor, search_position)
|
95
|
+
search_again_position = @cultivation_map.search_again_position(_beg_s2)
|
96
|
+
unless search_again_position.nil?
|
97
|
+
search_position = search_again_position
|
98
|
+
next
|
99
|
+
end
|
100
|
+
|
101
|
+
candidates << _beg_s2
|
102
|
+
|
103
|
+
# for speed, skip anchor of high frequency
|
104
|
+
if candidates.length > 5
|
105
|
+
candidates.clear
|
106
|
+
break
|
107
|
+
end
|
108
|
+
|
109
|
+
search_position = _beg_s2 + 1
|
110
|
+
end
|
111
|
+
candidates
|
112
|
+
end
|
113
|
+
|
114
|
+
def find_valid_beg_s2(beg_s1, beg_s2_candidates)
|
115
|
+
valid_beg_s2 = nil
|
116
|
+
|
117
|
+
(10 .. 30).step(10).each do |size_window|
|
118
|
+
valid_beg_s2 = nil
|
119
|
+
|
120
|
+
r = beg_s2_candidates.each do |beg_s2|
|
121
|
+
# if both the begining points are sufficiantly close to the end points of the last match
|
122
|
+
# break if beg_s1 > 0 && beg_s2 > 0 && (beg_s1 - @pos_s1_last_match < 5) && (beg_s2 >= @pos_s2_last_match) && (beg_s2 - @pos_s2_last_match < 5)
|
123
|
+
if beg_s1 > 0 && beg_s2 > 0 && (beg_s1 - @pos_s1_last_match < 5) && (beg_s2 - @pos_s2_last_match < 5)
|
124
|
+
break unless valid_beg_s2.nil?
|
125
|
+
valid_beg_s2 = beg_s2
|
126
|
+
next
|
127
|
+
end
|
128
|
+
|
129
|
+
left_window_s1, left_window_s2 = get_left_windows(beg_s1, beg_s2)
|
130
|
+
if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > @sim_threshold)
|
131
|
+
break unless valid_beg_s2.nil?
|
132
|
+
valid_beg_s2 = beg_s2
|
133
|
+
next
|
134
|
+
end
|
135
|
+
|
136
|
+
right_window_s1, right_window_s2 = get_right_windows(beg_s1, beg_s2)
|
137
|
+
if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > @sim_threshold)
|
138
|
+
break unless valid_beg_s2.nil?
|
139
|
+
valid_beg_s2 = beg_s2
|
140
|
+
next
|
141
|
+
end
|
142
|
+
end
|
143
|
+
|
144
|
+
# r == nil means that the inner loop was broken (multiple candidates had passed the tests)
|
145
|
+
# r != nil means that the inner loop was completed (with or w/o a valid beg_s2 found)
|
146
|
+
break unless r.nil?
|
147
|
+
end
|
148
|
+
|
149
|
+
valid_beg_s2
|
150
|
+
end
|
151
|
+
|
152
|
+
def get_left_windows(beg_s1, beg_s2, size_window = nil)
|
153
|
+
size_window ||= @size_window
|
154
|
+
|
155
|
+
# comment out below with the assumption that the beginning of a document gives a significant locational information
|
156
|
+
# return if @beg_s1 < size_window || @beg_s2 < size_window
|
90
157
|
|
91
158
|
window_s1 = ''
|
92
|
-
loc =
|
159
|
+
loc = beg_s1 - 1
|
93
160
|
count = 0
|
94
|
-
while count <
|
161
|
+
while count < size_window && loc >= 0
|
95
162
|
if @s1[loc] =~ /[0-9a-zA-Z]/
|
96
163
|
window_s1 += @s1[loc]
|
97
164
|
count += 1
|
@@ -100,9 +167,9 @@ class TextAlignment::AnchorFinder
|
|
100
167
|
end
|
101
168
|
|
102
169
|
window_s2 = ''
|
103
|
-
loc =
|
170
|
+
loc = beg_s2 - 1
|
104
171
|
count = 0
|
105
|
-
while count <
|
172
|
+
while count < size_window && loc >= 0
|
106
173
|
if @s2[loc] =~ /[0-9a-zA-Z]/
|
107
174
|
window_s2 += @s2[loc]
|
108
175
|
count += 1
|
@@ -113,15 +180,17 @@ class TextAlignment::AnchorFinder
|
|
113
180
|
[window_s1, window_s2]
|
114
181
|
end
|
115
182
|
|
116
|
-
def get_right_windows
|
183
|
+
def get_right_windows(beg_s1, beg_s2, size_window = nil)
|
184
|
+
size_window ||= @size_window
|
185
|
+
|
117
186
|
# commend below with the assumption that the end of a document gives a significant locational
|
118
|
-
# return if (@beg_s1 + @size_ngram > (@s1.length -
|
187
|
+
# return if (@beg_s1 + @size_ngram > (@s1.length - size_window)) || (@beg_s2 + @size_ngram > (@s2.length - size_window))
|
119
188
|
|
120
189
|
window_s1 = ''
|
121
|
-
loc =
|
190
|
+
loc = beg_s1 + @size_ngram
|
122
191
|
len_s1 = @s1.length
|
123
192
|
count = 0
|
124
|
-
while count <
|
193
|
+
while count < size_window && loc < len_s1
|
125
194
|
if @s1[loc] =~ /[0-9a-zA-Z]/
|
126
195
|
window_s1 += @s1[loc]
|
127
196
|
count += 1
|
@@ -130,10 +199,10 @@ class TextAlignment::AnchorFinder
|
|
130
199
|
end
|
131
200
|
|
132
201
|
window_s2 = ''
|
133
|
-
loc =
|
202
|
+
loc = beg_s2 + @size_ngram
|
134
203
|
len_s2 = @s2.length
|
135
204
|
count = 0
|
136
|
-
while count <
|
205
|
+
while count < size_window && loc < len_s2
|
137
206
|
if @s2[loc] =~ /[0-9a-zA-Z]/
|
138
207
|
window_s2 += @s2[loc]
|
139
208
|
count += 1
|
@@ -148,5 +217,4 @@ class TextAlignment::AnchorFinder
|
|
148
217
|
return 0 if str1.nil? || str2.nil?
|
149
218
|
String::Similarity.cosine(str1, str2, ngram:ngram_order)
|
150
219
|
end
|
151
|
-
|
152
|
-
end
|
220
|
+
end
|