text_alignment 0.10.1 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 710fa3dfe07e268c62de77c67abb52e1893a36a9c0fbdfbf7400328ab7f8358a
4
- data.tar.gz: 31a086666978c4776a60b0ce19fde48beeb8a5bce599b7572f20089cfb5c7500
3
+ metadata.gz: 4d626b64acdca0630dc344e2f0f4c2152e481ca7c8209ac1aa8c025fb1ae7c0f
4
+ data.tar.gz: 972222f1f3a575cdb30cd83f886d6d3f36109b79a35cc2c970aa89cbe8fd007d
5
5
  SHA512:
6
- metadata.gz: d0a752d2203f65e48b4f10ff29eb29a77504e7cef4da1dd3602e2ea5107a99b7edff4c3609c79c7dcba0f9cc458780dc4788ff8123a781a4d77ae89236170bf5
7
- data.tar.gz: '094ff5f662ccdc2fc14f81549126362be49fd64610f994c6ba5c4457fbbbececc1931257589ebde49211c2bce744ca033bfe3673c8af5c8a01af7679bfb626cc'
6
+ metadata.gz: 288691e9ee650af3b9dfe5b47f1a3d05ff952292958ef733a5a5f47aeeaa73e6e1b7b0591afe3549b6cbabf04817bf406d3ca2ee55faf65b7e3816dfe52be644
7
+ data.tar.gz: 7f45baecc36f00310f868a596ca60bce709078bdc51a5f9438a916cd42062bb93dacadafb64e90e42276d0633914c978045fed4b597ea7411c8762761be6bd36
@@ -26,9 +26,8 @@ def read_text(filename)
26
26
  end
27
27
  end
28
28
 
29
- def align_denotations(denotations, source_text, target_text, debug = false, cm = nil)
30
- alignment = TextAlignment::TextAlignment.new(source_text, target_text, denotations, cm)
31
- cm = alignment.cultivation_map
29
+ def align_denotations(denotations, source_text, alignment, debug = false)
30
+ alignment.align(source_text, denotations)
32
31
  new_denotations = alignment.transform_hdenotations(denotations)
33
32
 
34
33
  if debug
@@ -48,23 +47,22 @@ def align_denotations(denotations, source_text, target_text, debug = false, cm =
48
47
  warn
49
48
 
50
49
  # return target annotations
51
- [new_denotations, cm]
50
+ new_denotations
52
51
  end
53
52
 
54
- def align_mannotations(source_annotations, target_text, debug = false)
55
- target_annotations = {text:target_text}
53
+ def align_mannotations(source_annotations, reference_text, alignment, debug = false)
54
+ target_annotations = {text:reference_text}
56
55
 
57
56
  idnum_denotations = 0
58
57
  idnum_relations = 0
59
58
  idnum_attributes = 0
60
59
  idnum_modifications = 0
61
60
 
62
- cm = nil
63
61
  source_annotations.each_with_index do |annotations, i|
64
62
  if annotations.has_key?(:denotations) && !annotations[:denotations].empty?
65
63
  ididx = {}
66
64
  warn "[#{i}]-=-=-=-=-"
67
- denotations, cm = align_denotations(annotations[:denotations], annotations[:text], target_text, debug, cm)
65
+ denotations = align_denotations(annotations[:denotations], annotations[:text], alignment, debug)
68
66
 
69
67
  denotations.each do |d|
70
68
  reid = 'T' + (idnum_denotations += 1).to_s
@@ -112,14 +110,16 @@ unless ARGV.length == 2
112
110
  end
113
111
 
114
112
  source_annotations = read_annotations(ARGV[0])
115
- target_text = read_text(ARGV[1])
113
+ reference_text = read_text(ARGV[1])
114
+
115
+ alignment = TextAlignment::TextAlignment.new(reference_text, true)
116
116
 
117
117
  target_annotations = if source_annotations.class == Array
118
- align_mannotations(source_annotations, target_text, false)
118
+ align_mannotations(source_annotations, reference_text, alignment, false)
119
119
  else
120
- denotations, cm = align_denotations(source_annotations[:denotations], source_annotations[:text], target_text, false)
121
- # denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], target_text, true)
122
- source_annotations.merge({text:target_text, denotations:denotations})
120
+ denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment)
121
+ # denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], reference_text, true)
122
+ source_annotations.merge({text:reference_text, denotations:denotations})
123
123
  end
124
124
 
125
125
  puts target_annotations.to_json
@@ -77,11 +77,11 @@ TextAlignment::CHAR_MAPPING = [
77
77
 
78
78
 
79
79
  class TextAlignment::CharMapping
80
- attr_reader :str
80
+ attr_reader :mapped_text
81
81
 
82
- def initialize(_str, char_mapping = nil)
82
+ def initialize(_text, char_mapping = nil)
83
83
  char_mapping ||= TextAlignment::CHAR_MAPPING
84
- @str, offset_mapping = enmap_str(_str, char_mapping)
84
+ @mapped_text, offset_mapping = enmap_text(_text, char_mapping)
85
85
  @index_enmap = offset_mapping.to_h
86
86
  @index_demap = offset_mapping.map{|m| m.reverse}.to_h
87
87
  end
@@ -95,6 +95,8 @@ class TextAlignment::CharMapping
95
95
  end
96
96
 
97
97
  def enmap_denotations(_denotations)
98
+ return nil if _denotations.nil?
99
+
98
100
  denotations = _denotations.map do |d|
99
101
  d.dup.merge(span:{begin:enmap_position(d[:span][:begin]), end:enmap_position(d[:span][:end])})
100
102
  end
@@ -102,12 +104,12 @@ class TextAlignment::CharMapping
102
104
 
103
105
  private
104
106
 
105
- def enmap_str(_str, char_mapping)
106
- str = _str.dup
107
+ def enmap_text(_text, char_mapping)
108
+ text = _text.dup
107
109
 
108
110
  # To execute the single letter mapping
109
111
  char_mapping.each do |one, long|
110
- str.gsub!(one, long) if long.length == 1
112
+ text.gsub!(one, long) if long.length == 1
111
113
  end
112
114
 
113
115
  # To get the (location, length) index for replacements
@@ -116,18 +118,18 @@ class TextAlignment::CharMapping
116
118
  next if long.length == 1
117
119
 
118
120
  init_next = 0
119
- while loc = str.index(long, init_next)
121
+ while loc = text.index(long, init_next)
120
122
  loc_len << [loc, long.length]
121
123
  init_next = loc + long.length
122
124
  end
123
125
 
124
126
  # a workaround to avoid messing-up due to embedding
125
- str.gsub!(long, one * long.length)
127
+ text.gsub!(long, one * long.length)
126
128
  end
127
129
 
128
130
  # To get the (location, length) index for consecutive whitespace sequences
129
131
  init_next = 0
130
- while loc = str.index(/\s{2,}/, init_next)
132
+ while loc = text.index(/\s{2,}/, init_next)
131
133
  len = $~[0].length
132
134
  loc_len << [loc, len]
133
135
  init_next = loc + len
@@ -148,20 +150,20 @@ class TextAlignment::CharMapping
148
150
  init_next = loc + len
149
151
  end
150
152
 
151
- offset_mapping += (init_next .. str.length).map do |i|
153
+ offset_mapping += (init_next .. text.length).map do |i|
152
154
  j += 1
153
155
  [i, j - 1]
154
156
  end
155
157
 
156
158
  # To execute the long letter mapping
157
159
  char_mapping.each do |one, long|
158
- str.gsub!(one * long.length, one) if long.length > 1
160
+ text.gsub!(one * long.length, one) if long.length > 1
159
161
  end
160
162
 
161
163
  # To replace multi-whitespace sequences with a single space
162
- str.gsub!(/\s{2,}/, ' ')
164
+ text.gsub!(/\s{2,}/, ' ')
163
165
 
164
- [str, offset_mapping]
166
+ [text, offset_mapping]
165
167
  end
166
168
  end
167
169
 
@@ -178,10 +180,10 @@ if __FILE__ == $0
178
180
  denotations = annotations[:tracks].first[:denotations]
179
181
  end
180
182
 
181
- str_mapping = TextAlignment::CharMapping.new(annotations[:text])
182
- str_mapped = str_mapping.str
183
- denotations_mapped = str_mapping.enmap_denotations(denotations)
184
- new_annotations = {text:str_mapped, denotations:denotations_mapped}
183
+ text_mapping = TextAlignment::CharMapping.new(annotations[:text])
184
+ text_mapped = text_mapping.mapped_text
185
+ denotations_mapped = text_mapping.enmap_denotations(denotations)
186
+ new_annotations = {text:text_mapped, denotations:denotations_mapped}
185
187
 
186
188
  puts new_annotations.to_json
187
189
  end
@@ -10,40 +10,59 @@ class TextAlignment::TextAlignment
10
10
  attr_reader :block_alignment
11
11
  attr_reader :similarity
12
12
  attr_reader :lost_annotations
13
- attr_reader :cultivation_map
14
13
 
15
- def initialize(_str1, _str2, _denotations = nil, _cultivation_map = nil)
16
- raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
14
+ # Initialize with a reference text, against which texts will be aligned
15
+ def initialize(reference_text, to_prevent_overlap = false)
16
+ raise ArgumentError, "nil text" if reference_text.nil?
17
17
 
18
- @block_alignment = {source_text: _str1, target_text: _str2, denotations: _denotations}
19
- @original_str1 = _str1
20
- @original_str2 = _str2
18
+ @original_rtext = reference_text
19
+ @rtext_mapping = TextAlignment::CharMapping.new(reference_text)
20
+ @to_prevent_overlap = to_prevent_overlap
21
21
 
22
- @str1_mapping = TextAlignment::CharMapping.new(_str1)
23
- @str2_mapping = TextAlignment::CharMapping.new(_str2)
22
+ @original_text = nil
23
+ @block_alignment = nil
24
+ @cultivation_map = TextAlignment::CultivationMap.new
25
+ end
26
+
27
+ def align(text, denotations = nil)
28
+ # To maintain the cultivation map
29
+ update_cultivation_map if @to_prevent_overlap
30
+
31
+ # In case the input text is the same as the previous one, reuse the previous text mapping
32
+ unless @original_text && @original_text == text
33
+ @original_text = text
34
+ @text_mapping = TextAlignment::CharMapping.new(text)
35
+ end
24
36
 
25
- str1 = @str1_mapping.str
26
- denotations = @str1_mapping.enmap_denotations(_denotations)
37
+ text_mapped = @text_mapping.mapped_text
38
+ denotations_mapped = @text_mapping.enmap_denotations(denotations)
27
39
 
28
- str2 = @str2_mapping.str
40
+ rtext_mapped = @rtext_mapping.mapped_text
29
41
 
30
- @cultivation_map = _cultivation_map || TextAlignment::CultivationMap.new
42
+ ## To generate the block_alignment of the input text against the reference text
31
43
 
32
- @block_alignment[:blocks] = if r = whole_block_alignment(str1, str2, @cultivation_map)
33
- # whole block alignment
44
+ # Initialization
45
+ @block_alignment = {text: @original_text, reference_text: @original_rtext, denotations: denotations}
46
+
47
+ # Generation
48
+ @block_alignment[:blocks] = if r = whole_block_alignment(text_mapped, rtext_mapped, @cultivation_map)
34
49
  r
35
50
  else
36
- find_block_alignment(str1, str2, denotations, @cultivation_map)
51
+ find_block_alignment(text_mapped, rtext_mapped, denotations_mapped, @cultivation_map)
37
52
  end
53
+ end
38
54
 
55
+ def update_cultivation_map
56
+ return if @block_alignment.nil? || @block_alignment[:blocks].nil?
57
+
58
+ ## To update the cultivation map
39
59
  newly_cultivated_regions = @block_alignment[:blocks].collect do |b|
40
60
  if b[:alignment] == :block || b[:alignment] == :term
41
61
  [b[:target][:begin], b[:target][:end]]
42
62
  else
43
63
  nil
44
64
  end
45
- end.compact
46
- newly_cultivated_regions_condensed = newly_cultivated_regions.inject([]) do |condensed, region|
65
+ end.compact.inject([]) do |condensed, region|
47
66
  if condensed.empty? || (condensed.last.last + 1 < region.first)
48
67
  condensed.push region
49
68
  else
@@ -52,11 +71,11 @@ class TextAlignment::TextAlignment
52
71
  condensed
53
72
  end
54
73
 
55
- @cultivation_map.cultivate(newly_cultivated_regions_condensed)
74
+ @cultivation_map.cultivate(newly_cultivated_regions)
56
75
  end
57
76
 
58
77
  def transform_begin_position(_begin_position)
59
- begin_position = @str1_mapping.enmap_position(_begin_position)
78
+ begin_position = @text_mapping.enmap_position(_begin_position)
60
79
 
61
80
  i = @block_alignment[:blocks].index{|b| b[:source][:end] > begin_position}
62
81
  block = @block_alignment[:blocks][i]
@@ -74,11 +93,11 @@ class TextAlignment::TextAlignment
74
93
  r.nil? ? nil : r + block[:target][:begin]
75
94
  end
76
95
 
77
- @str2_mapping.demap_position(b)
96
+ @rtext_mapping.demap_position(b)
78
97
  end
79
98
 
80
99
  def transform_end_position(_end_position)
81
- end_position = @str1_mapping.enmap_position(_end_position)
100
+ end_position = @text_mapping.enmap_position(_end_position)
82
101
 
83
102
  i = @block_alignment[:blocks].index{|b| b[:source][:end] >= end_position}
84
103
  block = @block_alignment[:blocks][i]
@@ -96,7 +115,7 @@ class TextAlignment::TextAlignment
96
115
  r.nil? ? nil : r + block[:target][:begin]
97
116
  end
98
117
 
99
- @str2_mapping.demap_position(e)
118
+ @rtext_mapping.demap_position(e)
100
119
  end
101
120
 
102
121
  def transform_a_span(span)
@@ -115,7 +134,7 @@ class TextAlignment::TextAlignment
115
134
  source = {begin:d.begin, end:d.end}
116
135
  d.begin = transform_begin_position(d.begin);
117
136
  d.end = transform_end_position(d.end);
118
- raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @original_str2.length
137
+ raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @original_rtext.length
119
138
  rescue
120
139
  @lost_annotations << {source: source, target:{begin:d.begin, end:d.end}}
121
140
  d.begin = nil
@@ -131,7 +150,7 @@ class TextAlignment::TextAlignment
131
150
 
132
151
  r = hdenotations.collect do |d|
133
152
  t = transform_a_span(d[:span])
134
- raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @original_str2.length
153
+ raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @original_rtext.length
135
154
  new_d = d.dup.merge({span:t})
136
155
  rescue
137
156
  @lost_annotations << {source: d[:span], target:t}
@@ -142,8 +161,8 @@ class TextAlignment::TextAlignment
142
161
  end
143
162
 
144
163
  def alignment_show
145
- stext = @block_alignment[:source_text]
146
- ttext = @block_alignment[:target_text]
164
+ stext = @block_alignment[:text]
165
+ ttext = @block_alignment[:reference_text]
147
166
 
148
167
  show = ''
149
168
  @block_alignment[:blocks].each do |a|
@@ -1,3 +1,3 @@
1
1
  class TextAlignment
2
- VERSION = '0.10.1'
2
+ VERSION = '0.11.0'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_alignment
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.10.1
4
+ version: 0.11.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-03-03 00:00:00.000000000 Z
11
+ date: 2021-03-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-dictionary