text_alignment 0.10.1 → 0.11.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 710fa3dfe07e268c62de77c67abb52e1893a36a9c0fbdfbf7400328ab7f8358a
4
- data.tar.gz: 31a086666978c4776a60b0ce19fde48beeb8a5bce599b7572f20089cfb5c7500
3
+ metadata.gz: 4d626b64acdca0630dc344e2f0f4c2152e481ca7c8209ac1aa8c025fb1ae7c0f
4
+ data.tar.gz: 972222f1f3a575cdb30cd83f886d6d3f36109b79a35cc2c970aa89cbe8fd007d
5
5
  SHA512:
6
- metadata.gz: d0a752d2203f65e48b4f10ff29eb29a77504e7cef4da1dd3602e2ea5107a99b7edff4c3609c79c7dcba0f9cc458780dc4788ff8123a781a4d77ae89236170bf5
7
- data.tar.gz: '094ff5f662ccdc2fc14f81549126362be49fd64610f994c6ba5c4457fbbbececc1931257589ebde49211c2bce744ca033bfe3673c8af5c8a01af7679bfb626cc'
6
+ metadata.gz: 288691e9ee650af3b9dfe5b47f1a3d05ff952292958ef733a5a5f47aeeaa73e6e1b7b0591afe3549b6cbabf04817bf406d3ca2ee55faf65b7e3816dfe52be644
7
+ data.tar.gz: 7f45baecc36f00310f868a596ca60bce709078bdc51a5f9438a916cd42062bb93dacadafb64e90e42276d0633914c978045fed4b597ea7411c8762761be6bd36
@@ -26,9 +26,8 @@ def read_text(filename)
26
26
  end
27
27
  end
28
28
 
29
- def align_denotations(denotations, source_text, target_text, debug = false, cm = nil)
30
- alignment = TextAlignment::TextAlignment.new(source_text, target_text, denotations, cm)
31
- cm = alignment.cultivation_map
29
+ def align_denotations(denotations, source_text, alignment, debug = false)
30
+ alignment.align(source_text, denotations)
32
31
  new_denotations = alignment.transform_hdenotations(denotations)
33
32
 
34
33
  if debug
@@ -48,23 +47,22 @@ def align_denotations(denotations, source_text, target_text, debug = false, cm =
48
47
  warn
49
48
 
50
49
  # return target annotations
51
- [new_denotations, cm]
50
+ new_denotations
52
51
  end
53
52
 
54
- def align_mannotations(source_annotations, target_text, debug = false)
55
- target_annotations = {text:target_text}
53
+ def align_mannotations(source_annotations, reference_text, alignment, debug = false)
54
+ target_annotations = {text:reference_text}
56
55
 
57
56
  idnum_denotations = 0
58
57
  idnum_relations = 0
59
58
  idnum_attributes = 0
60
59
  idnum_modifications = 0
61
60
 
62
- cm = nil
63
61
  source_annotations.each_with_index do |annotations, i|
64
62
  if annotations.has_key?(:denotations) && !annotations[:denotations].empty?
65
63
  ididx = {}
66
64
  warn "[#{i}]-=-=-=-=-"
67
- denotations, cm = align_denotations(annotations[:denotations], annotations[:text], target_text, debug, cm)
65
+ denotations = align_denotations(annotations[:denotations], annotations[:text], alignment, debug)
68
66
 
69
67
  denotations.each do |d|
70
68
  reid = 'T' + (idnum_denotations += 1).to_s
@@ -112,14 +110,16 @@ unless ARGV.length == 2
112
110
  end
113
111
 
114
112
  source_annotations = read_annotations(ARGV[0])
115
- target_text = read_text(ARGV[1])
113
+ reference_text = read_text(ARGV[1])
114
+
115
+ alignment = TextAlignment::TextAlignment.new(reference_text, true)
116
116
 
117
117
  target_annotations = if source_annotations.class == Array
118
- align_mannotations(source_annotations, target_text, false)
118
+ align_mannotations(source_annotations, reference_text, alignment, false)
119
119
  else
120
- denotations, cm = align_denotations(source_annotations[:denotations], source_annotations[:text], target_text, false)
121
- # denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], target_text, true)
122
- source_annotations.merge({text:target_text, denotations:denotations})
120
+ denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment)
121
+ # denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment, true)
122
+ source_annotations.merge({text:reference_text, denotations:denotations})
123
123
  end
124
124
 
125
125
  puts target_annotations.to_json
@@ -77,11 +77,11 @@ TextAlignment::CHAR_MAPPING = [
77
77
 
78
78
 
79
79
  class TextAlignment::CharMapping
80
- attr_reader :str
80
+ attr_reader :mapped_text
81
81
 
82
- def initialize(_str, char_mapping = nil)
82
+ def initialize(_text, char_mapping = nil)
83
83
  char_mapping ||= TextAlignment::CHAR_MAPPING
84
- @str, offset_mapping = enmap_str(_str, char_mapping)
84
+ @mapped_text, offset_mapping = enmap_text(_text, char_mapping)
85
85
  @index_enmap = offset_mapping.to_h
86
86
  @index_demap = offset_mapping.map{|m| m.reverse}.to_h
87
87
  end
@@ -95,6 +95,8 @@ class TextAlignment::CharMapping
95
95
  end
96
96
 
97
97
  def enmap_denotations(_denotations)
98
+ return nil if _denotations.nil?
99
+
98
100
  denotations = _denotations.map do |d|
99
101
  d.dup.merge(span:{begin:enmap_position(d[:span][:begin]), end:enmap_position(d[:span][:end])})
100
102
  end
@@ -102,12 +104,12 @@ class TextAlignment::CharMapping
102
104
 
103
105
  private
104
106
 
105
- def enmap_str(_str, char_mapping)
106
- str = _str.dup
107
+ def enmap_text(_text, char_mapping)
108
+ text = _text.dup
107
109
 
108
110
  # To execute the single letter mapping
109
111
  char_mapping.each do |one, long|
110
- str.gsub!(one, long) if long.length == 1
112
+ text.gsub!(one, long) if long.length == 1
111
113
  end
112
114
 
113
115
  # To get the (location, length) index for replacements
@@ -116,18 +118,18 @@ class TextAlignment::CharMapping
116
118
  next if long.length == 1
117
119
 
118
120
  init_next = 0
119
- while loc = str.index(long, init_next)
121
+ while loc = text.index(long, init_next)
120
122
  loc_len << [loc, long.length]
121
123
  init_next = loc + long.length
122
124
  end
123
125
 
124
126
  # a workaround to avoid messing-up due to embedding
125
- str.gsub!(long, one * long.length)
127
+ text.gsub!(long, one * long.length)
126
128
  end
127
129
 
128
130
  # To get the (location, length) index for consecutive whitespace sequences
129
131
  init_next = 0
130
- while loc = str.index(/\s{2,}/, init_next)
132
+ while loc = text.index(/\s{2,}/, init_next)
131
133
  len = $~[0].length
132
134
  loc_len << [loc, len]
133
135
  init_next = loc + len
@@ -148,20 +150,20 @@ class TextAlignment::CharMapping
148
150
  init_next = loc + len
149
151
  end
150
152
 
151
- offset_mapping += (init_next .. str.length).map do |i|
153
+ offset_mapping += (init_next .. text.length).map do |i|
152
154
  j += 1
153
155
  [i, j - 1]
154
156
  end
155
157
 
156
158
  # To execute the long letter mapping
157
159
  char_mapping.each do |one, long|
158
- str.gsub!(one * long.length, one) if long.length > 1
160
+ text.gsub!(one * long.length, one) if long.length > 1
159
161
  end
160
162
 
161
163
  # To replace multi-whitespace sequences with a single space
162
- str.gsub!(/\s{2,}/, ' ')
164
+ text.gsub!(/\s{2,}/, ' ')
163
165
 
164
- [str, offset_mapping]
166
+ [text, offset_mapping]
165
167
  end
166
168
  end
167
169
 
@@ -178,10 +180,10 @@ if __FILE__ == $0
178
180
  denotations = annotations[:tracks].first[:denotations]
179
181
  end
180
182
 
181
- str_mapping = TextAlignment::CharMapping.new(annotations[:text])
182
- str_mapped = str_mapping.str
183
- denotations_mapped = str_mapping.enmap_denotations(denotations)
184
- new_annotations = {text:str_mapped, denotations:denotations_mapped}
183
+ text_mapping = TextAlignment::CharMapping.new(annotations[:text])
184
+ text_mapped = text_mapping.mapped_text
185
+ denotations_mapped = text_mapping.enmap_denotations(denotations)
186
+ new_annotations = {text:text_mapped, denotations:denotations_mapped}
185
187
 
186
188
  puts new_annotations.to_json
187
189
  end
@@ -10,40 +10,59 @@ class TextAlignment::TextAlignment
10
10
  attr_reader :block_alignment
11
11
  attr_reader :similarity
12
12
  attr_reader :lost_annotations
13
- attr_reader :cultivation_map
14
13
 
15
- def initialize(_str1, _str2, _denotations = nil, _cultivation_map = nil)
16
- raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
14
+ # Initialize with a reference text, against which texts will be aligned
15
+ def initialize(reference_text, to_prevent_overlap = false)
16
+ raise ArgumentError, "nil text" if reference_text.nil?
17
17
 
18
- @block_alignment = {source_text: _str1, target_text: _str2, denotations: _denotations}
19
- @original_str1 = _str1
20
- @original_str2 = _str2
18
+ @original_rtext = reference_text
19
+ @rtext_mapping = TextAlignment::CharMapping.new(reference_text)
20
+ @to_prevent_overlap = to_prevent_overlap
21
21
 
22
- @str1_mapping = TextAlignment::CharMapping.new(_str1)
23
- @str2_mapping = TextAlignment::CharMapping.new(_str2)
22
+ @original_text = nil
23
+ @block_alignment = nil
24
+ @cultivation_map = TextAlignment::CultivationMap.new
25
+ end
26
+
27
+ def align(text, denotations = nil)
28
+ # To maintain the cultivation map
29
+ update_cultivation_map if @to_prevent_overlap
30
+
31
+ # In case the input text is the same as the previous one, reuse the previous text mapping
32
+ unless @original_text && @original_text == text
33
+ @original_text = text
34
+ @text_mapping = TextAlignment::CharMapping.new(text)
35
+ end
24
36
 
25
- str1 = @str1_mapping.str
26
- denotations = @str1_mapping.enmap_denotations(_denotations)
37
+ text_mapped = @text_mapping.mapped_text
38
+ denotations_mapped = @text_mapping.enmap_denotations(denotations)
27
39
 
28
- str2 = @str2_mapping.str
40
+ rtext_mapped = @rtext_mapping.mapped_text
29
41
 
30
- @cultivation_map = _cultivation_map || TextAlignment::CultivationMap.new
42
+ ## To generate the block_alignment of the input text against the reference text
31
43
 
32
- @block_alignment[:blocks] = if r = whole_block_alignment(str1, str2, @cultivation_map)
33
- # whole block alignment
44
+ # Initialization
45
+ @block_alignment = {text: @original_text, reference_text: @original_rtext, denotations: denotations}
46
+
47
+ # Generation
48
+ @block_alignment[:blocks] = if r = whole_block_alignment(text_mapped, rtext_mapped, @cultivation_map)
34
49
  r
35
50
  else
36
- find_block_alignment(str1, str2, denotations, @cultivation_map)
51
+ find_block_alignment(text_mapped, rtext_mapped, denotations_mapped, @cultivation_map)
37
52
  end
53
+ end
38
54
 
55
+ def update_cultivation_map
56
+ return if @block_alignment.nil? || @block_alignment[:blocks].nil?
57
+
58
+ ## To update the cultivation map
39
59
  newly_cultivated_regions = @block_alignment[:blocks].collect do |b|
40
60
  if b[:alignment] == :block || b[:alignment] == :term
41
61
  [b[:target][:begin], b[:target][:end]]
42
62
  else
43
63
  nil
44
64
  end
45
- end.compact
46
- newly_cultivated_regions_condensed = newly_cultivated_regions.inject([]) do |condensed, region|
65
+ end.compact.inject([]) do |condensed, region|
47
66
  if condensed.empty? || (condensed.last.last + 1 < region.first)
48
67
  condensed.push region
49
68
  else
@@ -52,11 +71,11 @@ class TextAlignment::TextAlignment
52
71
  condensed
53
72
  end
54
73
 
55
- @cultivation_map.cultivate(newly_cultivated_regions_condensed)
74
+ @cultivation_map.cultivate(newly_cultivated_regions)
56
75
  end
57
76
 
58
77
  def transform_begin_position(_begin_position)
59
- begin_position = @str1_mapping.enmap_position(_begin_position)
78
+ begin_position = @text_mapping.enmap_position(_begin_position)
60
79
 
61
80
  i = @block_alignment[:blocks].index{|b| b[:source][:end] > begin_position}
62
81
  block = @block_alignment[:blocks][i]
@@ -74,11 +93,11 @@ class TextAlignment::TextAlignment
74
93
  r.nil? ? nil : r + block[:target][:begin]
75
94
  end
76
95
 
77
- @str2_mapping.demap_position(b)
96
+ @rtext_mapping.demap_position(b)
78
97
  end
79
98
 
80
99
  def transform_end_position(_end_position)
81
- end_position = @str1_mapping.enmap_position(_end_position)
100
+ end_position = @text_mapping.enmap_position(_end_position)
82
101
 
83
102
  i = @block_alignment[:blocks].index{|b| b[:source][:end] >= end_position}
84
103
  block = @block_alignment[:blocks][i]
@@ -96,7 +115,7 @@ class TextAlignment::TextAlignment
96
115
  r.nil? ? nil : r + block[:target][:begin]
97
116
  end
98
117
 
99
- @str2_mapping.demap_position(e)
118
+ @rtext_mapping.demap_position(e)
100
119
  end
101
120
 
102
121
  def transform_a_span(span)
@@ -115,7 +134,7 @@ class TextAlignment::TextAlignment
115
134
  source = {begin:d.begin, end:d.end}
116
135
  d.begin = transform_begin_position(d.begin);
117
136
  d.end = transform_end_position(d.end);
118
- raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @original_str2.length
137
+ raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @original_rtext.length
119
138
  rescue
120
139
  @lost_annotations << {source: source, target:{begin:d.begin, end:d.end}}
121
140
  d.begin = nil
@@ -131,7 +150,7 @@ class TextAlignment::TextAlignment
131
150
 
132
151
  r = hdenotations.collect do |d|
133
152
  t = transform_a_span(d[:span])
134
- raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @original_str2.length
153
+ raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @original_rtext.length
135
154
  new_d = d.dup.merge({span:t})
136
155
  rescue
137
156
  @lost_annotations << {source: d[:span], target:t}
@@ -142,8 +161,8 @@ class TextAlignment::TextAlignment
142
161
  end
143
162
 
144
163
  def alignment_show
145
- stext = @block_alignment[:source_text]
146
- ttext = @block_alignment[:target_text]
164
+ stext = @block_alignment[:text]
165
+ ttext = @block_alignment[:reference_text]
147
166
 
148
167
  show = ''
149
168
  @block_alignment[:blocks].each do |a|
@@ -1,3 +1,3 @@
1
1
  class TextAlignment
2
- VERSION = '0.10.1'
2
+ VERSION = '0.11.0'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_alignment
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.10.1
4
+ version: 0.11.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-03-03 00:00:00.000000000 Z
11
+ date: 2021-03-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-dictionary