text_alignment 0.10.1 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/align_annotations +13 -13
- data/lib/text_alignment/char_mapping.rb +19 -17
- data/lib/text_alignment/text_alignment.rb +45 -26
- data/lib/text_alignment/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4d626b64acdca0630dc344e2f0f4c2152e481ca7c8209ac1aa8c025fb1ae7c0f
|
4
|
+
data.tar.gz: 972222f1f3a575cdb30cd83f886d6d3f36109b79a35cc2c970aa89cbe8fd007d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 288691e9ee650af3b9dfe5b47f1a3d05ff952292958ef733a5a5f47aeeaa73e6e1b7b0591afe3549b6cbabf04817bf406d3ca2ee55faf65b7e3816dfe52be644
|
7
|
+
data.tar.gz: 7f45baecc36f00310f868a596ca60bce709078bdc51a5f9438a916cd42062bb93dacadafb64e90e42276d0633914c978045fed4b597ea7411c8762761be6bd36
|
data/bin/align_annotations
CHANGED
@@ -26,9 +26,8 @@ def read_text(filename)
|
|
26
26
|
end
|
27
27
|
end
|
28
28
|
|
29
|
-
def align_denotations(denotations, source_text,
|
30
|
-
alignment
|
31
|
-
cm = alignment.cultivation_map
|
29
|
+
def align_denotations(denotations, source_text, alignment, debug = false)
|
30
|
+
alignment.align(source_text, denotations)
|
32
31
|
new_denotations = alignment.transform_hdenotations(denotations)
|
33
32
|
|
34
33
|
if debug
|
@@ -48,23 +47,22 @@ def align_denotations(denotations, source_text, target_text, debug = false, cm =
|
|
48
47
|
warn
|
49
48
|
|
50
49
|
# return target annotations
|
51
|
-
|
50
|
+
new_denotations
|
52
51
|
end
|
53
52
|
|
54
|
-
def align_mannotations(source_annotations,
|
55
|
-
target_annotations = {text:
|
53
|
+
def align_mannotations(source_annotations, reference_text, alignment, debug = false)
|
54
|
+
target_annotations = {text:reference_text}
|
56
55
|
|
57
56
|
idnum_denotations = 0
|
58
57
|
idnum_relations = 0
|
59
58
|
idnum_attributes = 0
|
60
59
|
idnum_modifications = 0
|
61
60
|
|
62
|
-
cm = nil
|
63
61
|
source_annotations.each_with_index do |annotations, i|
|
64
62
|
if annotations.has_key?(:denotations) && !annotations[:denotations].empty?
|
65
63
|
ididx = {}
|
66
64
|
warn "[#{i}]-=-=-=-=-"
|
67
|
-
denotations
|
65
|
+
denotations = align_denotations(annotations[:denotations], annotations[:text], alignment, debug)
|
68
66
|
|
69
67
|
denotations.each do |d|
|
70
68
|
reid = 'T' + (idnum_denotations += 1).to_s
|
@@ -112,14 +110,16 @@ unless ARGV.length == 2
|
|
112
110
|
end
|
113
111
|
|
114
112
|
source_annotations = read_annotations(ARGV[0])
|
115
|
-
|
113
|
+
reference_text = read_text(ARGV[1])
|
114
|
+
|
115
|
+
alignment = TextAlignment::TextAlignment.new(reference_text, true)
|
116
116
|
|
117
117
|
target_annotations = if source_annotations.class == Array
|
118
|
-
align_mannotations(source_annotations,
|
118
|
+
align_mannotations(source_annotations, reference_text, alignment, false)
|
119
119
|
else
|
120
|
-
denotations
|
121
|
-
# denotations = align_denotations(source_annotations[:denotations], source_annotations[:text],
|
122
|
-
source_annotations.merge({text:
|
120
|
+
denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment)
|
121
|
+
# denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], reference_text, true)
|
122
|
+
source_annotations.merge({text:reference_text, denotations:denotations})
|
123
123
|
end
|
124
124
|
|
125
125
|
puts target_annotations.to_json
|
@@ -77,11 +77,11 @@ TextAlignment::CHAR_MAPPING = [
|
|
77
77
|
|
78
78
|
|
79
79
|
class TextAlignment::CharMapping
|
80
|
-
attr_reader :
|
80
|
+
attr_reader :mapped_text
|
81
81
|
|
82
|
-
def initialize(
|
82
|
+
def initialize(_text, char_mapping = nil)
|
83
83
|
char_mapping ||= TextAlignment::CHAR_MAPPING
|
84
|
-
@
|
84
|
+
@mapped_text, offset_mapping = enmap_text(_text, char_mapping)
|
85
85
|
@index_enmap = offset_mapping.to_h
|
86
86
|
@index_demap = offset_mapping.map{|m| m.reverse}.to_h
|
87
87
|
end
|
@@ -95,6 +95,8 @@ class TextAlignment::CharMapping
|
|
95
95
|
end
|
96
96
|
|
97
97
|
def enmap_denotations(_denotations)
|
98
|
+
return nil if _denotations.nil?
|
99
|
+
|
98
100
|
denotations = _denotations.map do |d|
|
99
101
|
d.dup.merge(span:{begin:enmap_position(d[:span][:begin]), end:enmap_position(d[:span][:end])})
|
100
102
|
end
|
@@ -102,12 +104,12 @@ class TextAlignment::CharMapping
|
|
102
104
|
|
103
105
|
private
|
104
106
|
|
105
|
-
def
|
106
|
-
|
107
|
+
def enmap_text(_text, char_mapping)
|
108
|
+
text = _text.dup
|
107
109
|
|
108
110
|
# To execute the single letter mapping
|
109
111
|
char_mapping.each do |one, long|
|
110
|
-
|
112
|
+
text.gsub!(one, long) if long.length == 1
|
111
113
|
end
|
112
114
|
|
113
115
|
# To get the (location, length) index for replacements
|
@@ -116,18 +118,18 @@ class TextAlignment::CharMapping
|
|
116
118
|
next if long.length == 1
|
117
119
|
|
118
120
|
init_next = 0
|
119
|
-
while loc =
|
121
|
+
while loc = text.index(long, init_next)
|
120
122
|
loc_len << [loc, long.length]
|
121
123
|
init_next = loc + long.length
|
122
124
|
end
|
123
125
|
|
124
126
|
# a workaround to avoid messing-up due to embedding
|
125
|
-
|
127
|
+
text.gsub!(long, one * long.length)
|
126
128
|
end
|
127
129
|
|
128
130
|
# To get the (location, length) index for consecutive whitespace sequences
|
129
131
|
init_next = 0
|
130
|
-
while loc =
|
132
|
+
while loc = text.index(/\s{2,}/, init_next)
|
131
133
|
len = $~[0].length
|
132
134
|
loc_len << [loc, len]
|
133
135
|
init_next = loc + len
|
@@ -148,20 +150,20 @@ class TextAlignment::CharMapping
|
|
148
150
|
init_next = loc + len
|
149
151
|
end
|
150
152
|
|
151
|
-
offset_mapping += (init_next ..
|
153
|
+
offset_mapping += (init_next .. text.length).map do |i|
|
152
154
|
j += 1
|
153
155
|
[i, j - 1]
|
154
156
|
end
|
155
157
|
|
156
158
|
# To execute the long letter mapping
|
157
159
|
char_mapping.each do |one, long|
|
158
|
-
|
160
|
+
text.gsub!(one * long.length, one) if long.length > 1
|
159
161
|
end
|
160
162
|
|
161
163
|
# To replace multi whitespace sequences to a space
|
162
|
-
|
164
|
+
text.gsub!(/\s{2,}/, ' ')
|
163
165
|
|
164
|
-
[
|
166
|
+
[text, offset_mapping]
|
165
167
|
end
|
166
168
|
end
|
167
169
|
|
@@ -178,10 +180,10 @@ if __FILE__ == $0
|
|
178
180
|
denotations = annotations[:tracks].first[:denotations]
|
179
181
|
end
|
180
182
|
|
181
|
-
|
182
|
-
|
183
|
-
denotations_mapped =
|
184
|
-
new_annotations = {text:
|
183
|
+
text_mapping = TextAlignment::CharMapping.new(annotations[:text])
|
184
|
+
text_mapped = text_mapping.mapped_text
|
185
|
+
denotations_mapped = text_mapping.enmap_denotations(denotations)
|
186
|
+
new_annotations = {text:text_mapped, denotations:denotations_mapped}
|
185
187
|
|
186
188
|
puts new_annotations.to_json
|
187
189
|
end
|
@@ -10,40 +10,59 @@ class TextAlignment::TextAlignment
|
|
10
10
|
attr_reader :block_alignment
|
11
11
|
attr_reader :similarity
|
12
12
|
attr_reader :lost_annotations
|
13
|
-
attr_reader :cultivation_map
|
14
13
|
|
15
|
-
|
16
|
-
|
14
|
+
# Initialize with a reference text, again which texts will be aligned
|
15
|
+
def initialize(reference_text, to_prevent_overlap = false)
|
16
|
+
raise ArgumentError, "nil text" if reference_text.nil?
|
17
17
|
|
18
|
-
@
|
19
|
-
@
|
20
|
-
@
|
18
|
+
@original_rtext = reference_text
|
19
|
+
@rtext_mapping = TextAlignment::CharMapping.new(reference_text)
|
20
|
+
@to_prevent_overlap = to_prevent_overlap
|
21
21
|
|
22
|
-
@
|
23
|
-
@
|
22
|
+
@original_text = nil
|
23
|
+
@block_alignment = nil
|
24
|
+
@cultivation_map = TextAlignment::CultivationMap.new
|
25
|
+
end
|
26
|
+
|
27
|
+
def align(text, denotations = nil)
|
28
|
+
# To maintain the cultivation map
|
29
|
+
update_cultivation_map if @to_prevent_overlap
|
30
|
+
|
31
|
+
# In case the input text is the same as the previous one, reuse the previous text mapping
|
32
|
+
unless @original_text && @original_text == text
|
33
|
+
@original_text = text
|
34
|
+
@text_mapping = TextAlignment::CharMapping.new(text)
|
35
|
+
end
|
24
36
|
|
25
|
-
|
26
|
-
|
37
|
+
text_mapped = @text_mapping.mapped_text
|
38
|
+
denotations_mapped = @text_mapping.enmap_denotations(denotations)
|
27
39
|
|
28
|
-
|
40
|
+
rtext_mapped = @rtext_mapping.mapped_text
|
29
41
|
|
30
|
-
|
42
|
+
## To generate the block_alignment of the input text against the reference text
|
31
43
|
|
32
|
-
|
33
|
-
|
44
|
+
# Initialization
|
45
|
+
@block_alignment = {text: @original_text, reference_text: @original_rtext, denotations: denotations}
|
46
|
+
|
47
|
+
# Generation
|
48
|
+
@block_alignment[:blocks] = if r = whole_block_alignment(text_mapped, rtext_mapped, @cultivation_map)
|
34
49
|
r
|
35
50
|
else
|
36
|
-
find_block_alignment(
|
51
|
+
find_block_alignment(text_mapped, rtext_mapped, denotations_mapped, @cultivation_map)
|
37
52
|
end
|
53
|
+
end
|
38
54
|
|
55
|
+
def update_cultivation_map
|
56
|
+
return if @block_alignment.nil? || @block_alignment[:blocks].nil?
|
57
|
+
|
58
|
+
## To update the cultivation map
|
39
59
|
newly_cultivated_regions = @block_alignment[:blocks].collect do |b|
|
40
60
|
if b[:alignment] == :block || b[:alignment] == :term
|
41
61
|
[b[:target][:begin], b[:target][:end]]
|
42
62
|
else
|
43
63
|
nil
|
44
64
|
end
|
45
|
-
end.compact
|
46
|
-
newly_cultivated_regions_condensed = newly_cultivated_regions.inject([]) do |condensed, region|
|
65
|
+
end.compact.inject([]) do |condensed, region|
|
47
66
|
if condensed.empty? || (condensed.last.last + 1 < region.first)
|
48
67
|
condensed.push region
|
49
68
|
else
|
@@ -52,11 +71,11 @@ class TextAlignment::TextAlignment
|
|
52
71
|
condensed
|
53
72
|
end
|
54
73
|
|
55
|
-
@cultivation_map.cultivate(
|
74
|
+
@cultivation_map.cultivate(newly_cultivated_regions)
|
56
75
|
end
|
57
76
|
|
58
77
|
def transform_begin_position(_begin_position)
|
59
|
-
begin_position = @
|
78
|
+
begin_position = @text_mapping.enmap_position(_begin_position)
|
60
79
|
|
61
80
|
i = @block_alignment[:blocks].index{|b| b[:source][:end] > begin_position}
|
62
81
|
block = @block_alignment[:blocks][i]
|
@@ -74,11 +93,11 @@ class TextAlignment::TextAlignment
|
|
74
93
|
r.nil? ? nil : r + block[:target][:begin]
|
75
94
|
end
|
76
95
|
|
77
|
-
@
|
96
|
+
@rtext_mapping.demap_position(b)
|
78
97
|
end
|
79
98
|
|
80
99
|
def transform_end_position(_end_position)
|
81
|
-
end_position = @
|
100
|
+
end_position = @text_mapping.enmap_position(_end_position)
|
82
101
|
|
83
102
|
i = @block_alignment[:blocks].index{|b| b[:source][:end] >= end_position}
|
84
103
|
block = @block_alignment[:blocks][i]
|
@@ -96,7 +115,7 @@ class TextAlignment::TextAlignment
|
|
96
115
|
r.nil? ? nil : r + block[:target][:begin]
|
97
116
|
end
|
98
117
|
|
99
|
-
@
|
118
|
+
@rtext_mapping.demap_position(e)
|
100
119
|
end
|
101
120
|
|
102
121
|
def transform_a_span(span)
|
@@ -115,7 +134,7 @@ class TextAlignment::TextAlignment
|
|
115
134
|
source = {begin:d.begin, end:d.end}
|
116
135
|
d.begin = transform_begin_position(d.begin);
|
117
136
|
d.end = transform_end_position(d.end);
|
118
|
-
raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @
|
137
|
+
raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @original_rtext.length
|
119
138
|
rescue
|
120
139
|
@lost_annotations << {source: source, target:{begin:d.begin, end:d.end}}
|
121
140
|
d.begin = nil
|
@@ -131,7 +150,7 @@ class TextAlignment::TextAlignment
|
|
131
150
|
|
132
151
|
r = hdenotations.collect do |d|
|
133
152
|
t = transform_a_span(d[:span])
|
134
|
-
raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @
|
153
|
+
raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @original_rtext.length
|
135
154
|
new_d = d.dup.merge({span:t})
|
136
155
|
rescue
|
137
156
|
@lost_annotations << {source: d[:span], target:t}
|
@@ -142,8 +161,8 @@ class TextAlignment::TextAlignment
|
|
142
161
|
end
|
143
162
|
|
144
163
|
def alignment_show
|
145
|
-
stext = @block_alignment[:
|
146
|
-
ttext = @block_alignment[:
|
164
|
+
stext = @block_alignment[:text]
|
165
|
+
ttext = @block_alignment[:reference_text]
|
147
166
|
|
148
167
|
show = ''
|
149
168
|
@block_alignment[:blocks].each do |a|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_alignment
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.11.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-03-
|
11
|
+
date: 2021-03-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-dictionary
|