text_alignment 0.10.1 → 0.11.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/align_annotations +13 -13
- data/lib/text_alignment/char_mapping.rb +19 -17
- data/lib/text_alignment/text_alignment.rb +45 -26
- data/lib/text_alignment/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4d626b64acdca0630dc344e2f0f4c2152e481ca7c8209ac1aa8c025fb1ae7c0f
|
4
|
+
data.tar.gz: 972222f1f3a575cdb30cd83f886d6d3f36109b79a35cc2c970aa89cbe8fd007d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 288691e9ee650af3b9dfe5b47f1a3d05ff952292958ef733a5a5f47aeeaa73e6e1b7b0591afe3549b6cbabf04817bf406d3ca2ee55faf65b7e3816dfe52be644
|
7
|
+
data.tar.gz: 7f45baecc36f00310f868a596ca60bce709078bdc51a5f9438a916cd42062bb93dacadafb64e90e42276d0633914c978045fed4b597ea7411c8762761be6bd36
|
data/bin/align_annotations
CHANGED
@@ -26,9 +26,8 @@ def read_text(filename)
|
|
26
26
|
end
|
27
27
|
end
|
28
28
|
|
29
|
-
def align_denotations(denotations, source_text,
|
30
|
-
alignment
|
31
|
-
cm = alignment.cultivation_map
|
29
|
+
def align_denotations(denotations, source_text, alignment, debug = false)
|
30
|
+
alignment.align(source_text, denotations)
|
32
31
|
new_denotations = alignment.transform_hdenotations(denotations)
|
33
32
|
|
34
33
|
if debug
|
@@ -48,23 +47,22 @@ def align_denotations(denotations, source_text, target_text, debug = false, cm =
|
|
48
47
|
warn
|
49
48
|
|
50
49
|
# return target annotations
|
51
|
-
|
50
|
+
new_denotations
|
52
51
|
end
|
53
52
|
|
54
|
-
def align_mannotations(source_annotations,
|
55
|
-
target_annotations = {text:
|
53
|
+
def align_mannotations(source_annotations, reference_text, alignment, debug = false)
|
54
|
+
target_annotations = {text:reference_text}
|
56
55
|
|
57
56
|
idnum_denotations = 0
|
58
57
|
idnum_relations = 0
|
59
58
|
idnum_attributes = 0
|
60
59
|
idnum_modifications = 0
|
61
60
|
|
62
|
-
cm = nil
|
63
61
|
source_annotations.each_with_index do |annotations, i|
|
64
62
|
if annotations.has_key?(:denotations) && !annotations[:denotations].empty?
|
65
63
|
ididx = {}
|
66
64
|
warn "[#{i}]-=-=-=-=-"
|
67
|
-
denotations
|
65
|
+
denotations = align_denotations(annotations[:denotations], annotations[:text], alignment, debug)
|
68
66
|
|
69
67
|
denotations.each do |d|
|
70
68
|
reid = 'T' + (idnum_denotations += 1).to_s
|
@@ -112,14 +110,16 @@ unless ARGV.length == 2
|
|
112
110
|
end
|
113
111
|
|
114
112
|
source_annotations = read_annotations(ARGV[0])
|
115
|
-
|
113
|
+
reference_text = read_text(ARGV[1])
|
114
|
+
|
115
|
+
alignment = TextAlignment::TextAlignment.new(reference_text, true)
|
116
116
|
|
117
117
|
target_annotations = if source_annotations.class == Array
|
118
|
-
align_mannotations(source_annotations,
|
118
|
+
align_mannotations(source_annotations, reference_text, alignment, false)
|
119
119
|
else
|
120
|
-
denotations
|
121
|
-
# denotations = align_denotations(source_annotations[:denotations], source_annotations[:text],
|
122
|
-
source_annotations.merge({text:
|
120
|
+
denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment)
|
121
|
+
# denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], reference_text, true)
|
122
|
+
source_annotations.merge({text:reference_text, denotations:denotations})
|
123
123
|
end
|
124
124
|
|
125
125
|
puts target_annotations.to_json
|
@@ -77,11 +77,11 @@ TextAlignment::CHAR_MAPPING = [
|
|
77
77
|
|
78
78
|
|
79
79
|
class TextAlignment::CharMapping
|
80
|
-
attr_reader :
|
80
|
+
attr_reader :mapped_text
|
81
81
|
|
82
|
-
def initialize(
|
82
|
+
def initialize(_text, char_mapping = nil)
|
83
83
|
char_mapping ||= TextAlignment::CHAR_MAPPING
|
84
|
-
@
|
84
|
+
@mapped_text, offset_mapping = enmap_text(_text, char_mapping)
|
85
85
|
@index_enmap = offset_mapping.to_h
|
86
86
|
@index_demap = offset_mapping.map{|m| m.reverse}.to_h
|
87
87
|
end
|
@@ -95,6 +95,8 @@ class TextAlignment::CharMapping
|
|
95
95
|
end
|
96
96
|
|
97
97
|
def enmap_denotations(_denotations)
|
98
|
+
return nil if _denotations.nil?
|
99
|
+
|
98
100
|
denotations = _denotations.map do |d|
|
99
101
|
d.dup.merge(span:{begin:enmap_position(d[:span][:begin]), end:enmap_position(d[:span][:end])})
|
100
102
|
end
|
@@ -102,12 +104,12 @@ class TextAlignment::CharMapping
|
|
102
104
|
|
103
105
|
private
|
104
106
|
|
105
|
-
def
|
106
|
-
|
107
|
+
def enmap_text(_text, char_mapping)
|
108
|
+
text = _text.dup
|
107
109
|
|
108
110
|
# To execute the single letter mapping
|
109
111
|
char_mapping.each do |one, long|
|
110
|
-
|
112
|
+
text.gsub!(one, long) if long.length == 1
|
111
113
|
end
|
112
114
|
|
113
115
|
# To get the (location, length) index for replacements
|
@@ -116,18 +118,18 @@ class TextAlignment::CharMapping
|
|
116
118
|
next if long.length == 1
|
117
119
|
|
118
120
|
init_next = 0
|
119
|
-
while loc =
|
121
|
+
while loc = text.index(long, init_next)
|
120
122
|
loc_len << [loc, long.length]
|
121
123
|
init_next = loc + long.length
|
122
124
|
end
|
123
125
|
|
124
126
|
# a workaround to avoid messing-up due to embedding
|
125
|
-
|
127
|
+
text.gsub!(long, one * long.length)
|
126
128
|
end
|
127
129
|
|
128
130
|
# To get the (location, length) index for consecutive whitespace sequences
|
129
131
|
init_next = 0
|
130
|
-
while loc =
|
132
|
+
while loc = text.index(/\s{2,}/, init_next)
|
131
133
|
len = $~[0].length
|
132
134
|
loc_len << [loc, len]
|
133
135
|
init_next = loc + len
|
@@ -148,20 +150,20 @@ class TextAlignment::CharMapping
|
|
148
150
|
init_next = loc + len
|
149
151
|
end
|
150
152
|
|
151
|
-
offset_mapping += (init_next ..
|
153
|
+
offset_mapping += (init_next .. text.length).map do |i|
|
152
154
|
j += 1
|
153
155
|
[i, j - 1]
|
154
156
|
end
|
155
157
|
|
156
158
|
# To execute the long letter mapping
|
157
159
|
char_mapping.each do |one, long|
|
158
|
-
|
160
|
+
text.gsub!(one * long.length, one) if long.length > 1
|
159
161
|
end
|
160
162
|
|
161
163
|
# To replace multi whitespace sequences to a space
|
162
|
-
|
164
|
+
text.gsub!(/\s{2,}/, ' ')
|
163
165
|
|
164
|
-
[
|
166
|
+
[text, offset_mapping]
|
165
167
|
end
|
166
168
|
end
|
167
169
|
|
@@ -178,10 +180,10 @@ if __FILE__ == $0
|
|
178
180
|
denotations = annotations[:tracks].first[:denotations]
|
179
181
|
end
|
180
182
|
|
181
|
-
|
182
|
-
|
183
|
-
denotations_mapped =
|
184
|
-
new_annotations = {text:
|
183
|
+
text_mapping = TextAlignment::CharMapping.new(annotations[:text])
|
184
|
+
text_mapped = text_mapping.mapped_text
|
185
|
+
denotations_mapped = text_mapping.enmap_denotations(denotations)
|
186
|
+
new_annotations = {text:text_mapped, denotations:denotations_mapped}
|
185
187
|
|
186
188
|
puts new_annotations.to_json
|
187
189
|
end
|
@@ -10,40 +10,59 @@ class TextAlignment::TextAlignment
|
|
10
10
|
attr_reader :block_alignment
|
11
11
|
attr_reader :similarity
|
12
12
|
attr_reader :lost_annotations
|
13
|
-
attr_reader :cultivation_map
|
14
13
|
|
15
|
-
|
16
|
-
|
14
|
+
# Initialize with a reference text, again which texts will be aligned
|
15
|
+
def initialize(reference_text, to_prevent_overlap = false)
|
16
|
+
raise ArgumentError, "nil text" if reference_text.nil?
|
17
17
|
|
18
|
-
@
|
19
|
-
@
|
20
|
-
@
|
18
|
+
@original_rtext = reference_text
|
19
|
+
@rtext_mapping = TextAlignment::CharMapping.new(reference_text)
|
20
|
+
@to_prevent_overlap = to_prevent_overlap
|
21
21
|
|
22
|
-
@
|
23
|
-
@
|
22
|
+
@original_text = nil
|
23
|
+
@block_alignment = nil
|
24
|
+
@cultivation_map = TextAlignment::CultivationMap.new
|
25
|
+
end
|
26
|
+
|
27
|
+
def align(text, denotations = nil)
|
28
|
+
# To maintain the cultivation map
|
29
|
+
update_cultivation_map if @to_prevent_overlap
|
30
|
+
|
31
|
+
# In case the input text is the same as the previous one, reuse the previous text mapping
|
32
|
+
unless @original_text && @original_text == text
|
33
|
+
@original_text = text
|
34
|
+
@text_mapping = TextAlignment::CharMapping.new(text)
|
35
|
+
end
|
24
36
|
|
25
|
-
|
26
|
-
|
37
|
+
text_mapped = @text_mapping.mapped_text
|
38
|
+
denotations_mapped = @text_mapping.enmap_denotations(denotations)
|
27
39
|
|
28
|
-
|
40
|
+
rtext_mapped = @rtext_mapping.mapped_text
|
29
41
|
|
30
|
-
|
42
|
+
## To generate the block_alignment of the input text against the reference text
|
31
43
|
|
32
|
-
|
33
|
-
|
44
|
+
# Initialization
|
45
|
+
@block_alignment = {text: @original_text, reference_text: @original_rtext, denotations: denotations}
|
46
|
+
|
47
|
+
# Generation
|
48
|
+
@block_alignment[:blocks] = if r = whole_block_alignment(text_mapped, rtext_mapped, @cultivation_map)
|
34
49
|
r
|
35
50
|
else
|
36
|
-
find_block_alignment(
|
51
|
+
find_block_alignment(text_mapped, rtext_mapped, denotations_mapped, @cultivation_map)
|
37
52
|
end
|
53
|
+
end
|
38
54
|
|
55
|
+
def update_cultivation_map
|
56
|
+
return if @block_alignment.nil? || @block_alignment[:blocks].nil?
|
57
|
+
|
58
|
+
## To update the cultivation map
|
39
59
|
newly_cultivated_regions = @block_alignment[:blocks].collect do |b|
|
40
60
|
if b[:alignment] == :block || b[:alignment] == :term
|
41
61
|
[b[:target][:begin], b[:target][:end]]
|
42
62
|
else
|
43
63
|
nil
|
44
64
|
end
|
45
|
-
end.compact
|
46
|
-
newly_cultivated_regions_condensed = newly_cultivated_regions.inject([]) do |condensed, region|
|
65
|
+
end.compact.inject([]) do |condensed, region|
|
47
66
|
if condensed.empty? || (condensed.last.last + 1 < region.first)
|
48
67
|
condensed.push region
|
49
68
|
else
|
@@ -52,11 +71,11 @@ class TextAlignment::TextAlignment
|
|
52
71
|
condensed
|
53
72
|
end
|
54
73
|
|
55
|
-
@cultivation_map.cultivate(
|
74
|
+
@cultivation_map.cultivate(newly_cultivated_regions)
|
56
75
|
end
|
57
76
|
|
58
77
|
def transform_begin_position(_begin_position)
|
59
|
-
begin_position = @
|
78
|
+
begin_position = @text_mapping.enmap_position(_begin_position)
|
60
79
|
|
61
80
|
i = @block_alignment[:blocks].index{|b| b[:source][:end] > begin_position}
|
62
81
|
block = @block_alignment[:blocks][i]
|
@@ -74,11 +93,11 @@ class TextAlignment::TextAlignment
|
|
74
93
|
r.nil? ? nil : r + block[:target][:begin]
|
75
94
|
end
|
76
95
|
|
77
|
-
@
|
96
|
+
@rtext_mapping.demap_position(b)
|
78
97
|
end
|
79
98
|
|
80
99
|
def transform_end_position(_end_position)
|
81
|
-
end_position = @
|
100
|
+
end_position = @text_mapping.enmap_position(_end_position)
|
82
101
|
|
83
102
|
i = @block_alignment[:blocks].index{|b| b[:source][:end] >= end_position}
|
84
103
|
block = @block_alignment[:blocks][i]
|
@@ -96,7 +115,7 @@ class TextAlignment::TextAlignment
|
|
96
115
|
r.nil? ? nil : r + block[:target][:begin]
|
97
116
|
end
|
98
117
|
|
99
|
-
@
|
118
|
+
@rtext_mapping.demap_position(e)
|
100
119
|
end
|
101
120
|
|
102
121
|
def transform_a_span(span)
|
@@ -115,7 +134,7 @@ class TextAlignment::TextAlignment
|
|
115
134
|
source = {begin:d.begin, end:d.end}
|
116
135
|
d.begin = transform_begin_position(d.begin);
|
117
136
|
d.end = transform_end_position(d.end);
|
118
|
-
raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @
|
137
|
+
raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @original_rtext.length
|
119
138
|
rescue
|
120
139
|
@lost_annotations << {source: source, target:{begin:d.begin, end:d.end}}
|
121
140
|
d.begin = nil
|
@@ -131,7 +150,7 @@ class TextAlignment::TextAlignment
|
|
131
150
|
|
132
151
|
r = hdenotations.collect do |d|
|
133
152
|
t = transform_a_span(d[:span])
|
134
|
-
raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @
|
153
|
+
raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @original_rtext.length
|
135
154
|
new_d = d.dup.merge({span:t})
|
136
155
|
rescue
|
137
156
|
@lost_annotations << {source: d[:span], target:t}
|
@@ -142,8 +161,8 @@ class TextAlignment::TextAlignment
|
|
142
161
|
end
|
143
162
|
|
144
163
|
def alignment_show
|
145
|
-
stext = @block_alignment[:
|
146
|
-
ttext = @block_alignment[:
|
164
|
+
stext = @block_alignment[:text]
|
165
|
+
ttext = @block_alignment[:reference_text]
|
147
166
|
|
148
167
|
show = ''
|
149
168
|
@block_alignment[:blocks].each do |a|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_alignment
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.11.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-03-
|
11
|
+
date: 2021-03-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-dictionary
|