text_alignment 0.8.1 → 0.11.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 0c49793b84e9ca5606a8fe9151530f6732337c2b92cfd1af1549a56ea3c2f39e
4
- data.tar.gz: 4a0b8328d4c6de43af50bd8c278f83facef311c74516dc3a9a0c9dd5f91fbfc0
3
+ metadata.gz: ef59c0cd578ed453a67edeb3b29059f4b76c4c541f777fa35a06e76b299e2564
4
+ data.tar.gz: 5594c0f6eb1d52cc331c210fdf936e7cb09e30277f21933027e55b0c8cfa0e24
5
5
  SHA512:
6
- metadata.gz: ad70de67b4a7b38290a59d89bed7cd9fa343ce3db62bb67b779e51b499929b9d7c4697871cf25017fa59a34c0525741d5da679559fc2f33d902fcda692f9f1ac
7
- data.tar.gz: 6242b35cbb8f53effc477b508d428d9060cba2e93895496134963755cff956f3931411a87a7bc0d33b7b91459f1a27eb67c422897693320e1b2ceb06eff6e22b
6
+ metadata.gz: ef5bae492d7e7b840c00943ac24e571392c4c992a085c6a63c8cb6db126ed9137ba94f8cd7af42a91e17aa327cb9a5ce24b909944c157f21beb8a88c8ce7528b
7
+ data.tar.gz: b6d84dcdc35399f91a0f6a5a24d84b11169de9d861a249acca52f0fec86e6f1fbf1bb4dbb47a5d43654fa43735b181aa64034447fb71ca675090818dcf67133a
@@ -26,8 +26,8 @@ def read_text(filename)
26
26
  end
27
27
  end
28
28
 
29
- def align_denotations(denotations, source_text, target_text, debug = false)
30
- alignment = TextAlignment::TextAlignment.new(source_text, target_text, denotations)
29
+ def align_denotations(denotations, source_text, alignment, debug = false)
30
+ alignment.align(source_text, denotations)
31
31
  new_denotations = alignment.transform_hdenotations(denotations)
32
32
 
33
33
  if debug
@@ -50,8 +50,8 @@ def align_denotations(denotations, source_text, target_text, debug = false)
50
50
  new_denotations
51
51
  end
52
52
 
53
- def align_mannotations(source_annotations, target_text, debug = false)
54
- target_annotations = {text:target_text}
53
+ def align_mannotations(source_annotations, reference_text, alignment, debug = false)
54
+ target_annotations = {text:reference_text}
55
55
 
56
56
  idnum_denotations = 0
57
57
  idnum_relations = 0
@@ -62,7 +62,8 @@ def align_mannotations(source_annotations, target_text, debug = false)
62
62
  if annotations.has_key?(:denotations) && !annotations[:denotations].empty?
63
63
  ididx = {}
64
64
  warn "[#{i}]-=-=-=-=-"
65
- denotations = align_denotations(annotations[:denotations], annotations[:text], target_text, debug)
65
+ denotations = align_denotations(annotations[:denotations], annotations[:text], alignment, debug)
66
+
66
67
  denotations.each do |d|
67
68
  reid = 'T' + (idnum_denotations += 1).to_s
68
69
  ididx[d[:id]] = reid
@@ -76,7 +77,9 @@ def align_mannotations(source_annotations, target_text, debug = false)
76
77
  annotations[:relations].each do |r|
77
78
  reid = 'R' + (idnum_relations += 1).to_s
78
79
  ididx[r[:id]] = reid
79
- target_annotations[:relations] << r.dup.merge({id:reid, subj:ididx[r[:subj]], obj:ididx[r[:obj]]})
80
+ sid = ididx[r[:subj]]
81
+ oid = ididx[r[:obj]]
82
+ target_annotations[:relations] << r.dup.merge({id:reid, subj:sid, obj:oid}) unless sid.nil? || oid.nil?
80
83
  end
81
84
  end
82
85
 
@@ -85,7 +88,8 @@ def align_mannotations(source_annotations, target_text, debug = false)
85
88
  annotations[:attributes].each do |a|
86
89
  reid = 'A' + (idnum_attributes += 1).to_s
87
90
  ididx[a[:id]] = reid
88
- target_annotations[:attributes] << a.dup.merge({id:reid, subj:ididx[a[:subj]]})
91
+ sid = ididx[a[:subj]]
92
+ target_annotations[:attributes] << a.dup.merge({id:reid, subj:sid}) unless sid.nil?
89
93
  end
90
94
  end
91
95
 
@@ -94,7 +98,8 @@ def align_mannotations(source_annotations, target_text, debug = false)
94
98
  annotations[:modifications].each do |m|
95
99
  reid = 'M' + (idnum_modifications += 1).to_s
96
100
  ididx[m[:id]] = reid
97
- target_annotations[:modifications] << m.dup.merge({id:reid, obj:ididx[m[:obj]]})
101
+ oid = ididx[m[:obj]]
102
+ target_annotations[:modifications] << m.dup.merge({id:reid, obj:oid}) unless oid.nil?
98
103
  end
99
104
  end
100
105
  end
@@ -109,13 +114,18 @@ unless ARGV.length == 2
109
114
  end
110
115
 
111
116
  source_annotations = read_annotations(ARGV[0])
112
- target_text = read_text(ARGV[1])
117
+ reference_text = read_text(ARGV[1])
118
+
119
+ alignment = TextAlignment::TextAlignment.new(reference_text, true)
113
120
 
114
121
  target_annotations = if source_annotations.class == Array
115
- align_mannotations(source_annotations, target_text, false)
122
+ # align_mannotations(source_annotations, reference_text, alignment, true)
123
+ align_mannotations(source_annotations, reference_text, alignment, false)
116
124
  else
117
- denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], target_text, false)
118
- source_annotations.merge({text:target_text, denotations:denotations})
125
+ # denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment)
126
+ denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment, false)
127
+ source_annotations.merge({text:reference_text, denotations:denotations})
119
128
  end
120
129
 
121
- # puts target_annotations.to_json
130
+ # pp alignment.block_alignment
131
+ puts target_annotations.to_json
@@ -6,92 +6,141 @@ module TextAlignment; end unless defined? TextAlignment
6
6
 
7
7
  class TextAlignment::AnchorFinder
8
8
 
9
- def initialize(source_str, target_str, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
10
- @size_ngram = _size_ngram || TextAlignment::SIZE_NGRAM
11
- @size_window = _size_window || TextAlignment::SIZE_WINDOW
12
- @sim_threshold = _text_similiarity_threshold || TextAlignment::TEXT_SIMILARITY_THRESHOLD
9
+ def initialize(source_str, target_str, cultivation_map)
10
+ @s1 = source_str.downcase
11
+ @s2 = target_str.downcase
13
12
 
14
- @reverse = (target_str.length < source_str.length)
13
+ @cultivation_map = cultivation_map
15
14
 
16
- @s1, @s2 = if @reverse
17
- [target_str.downcase, source_str.downcase]
18
- else
19
- [source_str.downcase, target_str.downcase]
20
- end
15
+ @size_ngram = TextAlignment::SIZE_NGRAM
16
+ @size_window = TextAlignment::SIZE_WINDOW
17
+ @sim_threshold = TextAlignment::TEXT_SIMILARITY_THRESHOLD
18
+ @pos_s1_final_possible_begin = @s1.length - @size_ngram - 1
19
+ @pos_s2_final_possible_end = @s2.length
21
20
 
22
- # current position in s1
23
- @beg_s1 = 0
24
- @end_s1_prev = 0
25
- @end_s2_prev = 0
21
+ # positions of last match
22
+ @pos_s1_last_match = 0
23
+ @pos_s2_last_match = 0
26
24
  end
27
25
 
28
26
  def get_next_anchor
29
- # find the position of an anchor ngram in s1 and s2
30
- while @beg_s1 < (@s1.length - @size_ngram)
31
- if [' ', "\n", "\t"].include? @s1[@beg_s1]
32
- @beg_s1 += 1
33
- next
34
- end
35
- anchor = @s1[@beg_s1, @size_ngram]
36
-
37
- # search_position = 0
38
- search_position = @end_s2_prev
39
- while @beg_s2 = @s2.index(anchor, search_position)
40
- # if both the begining points are sufficiantly close to the end points of the last match
41
- break if @beg_s1 > 0 && @beg_s2 > 0 && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 >= @end_s2_prev) && (@beg_s2 - @end_s2_prev < 5)
27
+ # To find the beginning positions of an anchor ngram in s1 and s2, beginning from the last positions matched
28
+ beg_s2 = for beg_s1 in @pos_s1_last_match .. @pos_s1_final_possible_begin
42
29
 
43
- left_window_s1, left_window_s2 = get_left_windows
44
- break if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > @sim_threshold)
30
+ # To skip whitespace letters
31
+ next if [' ', "\n", "\t"].include? @s1[beg_s1]
45
32
 
46
- right_window_s1, right_window_s2 = get_right_windows
47
- break if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > @sim_threshold)
33
+ _beg_s2 = get_beg_s2(beg_s1)
34
+ break _beg_s2 unless _beg_s2.nil?
35
+ end
48
36
 
49
- search_position = @beg_s2 + 1
50
- end
37
+ # To return nil when it fails to find an anchor
38
+ return nil if beg_s2.class == Range
51
39
 
52
- break unless @beg_s2.nil?
40
+ # To extend the block to the left
41
+ b1 = beg_s1
42
+ b2 = beg_s2
43
+ left_boundary_b2 = [@pos_s2_last_match, (@cultivation_map.last_cultivated_position(b2) || 0)].max
44
+ while b1 > @pos_s1_last_match && b2 > left_boundary_b2 && @s1[b1 - 1] == @s2[b2 - 1]
45
+ b1 -= 1; b2 -= 1
46
+ end
53
47
 
54
- @beg_s1 += 1
48
+ # To extend the block to the right
49
+ e1 = beg_s1 + @size_ngram
50
+ e2 = beg_s2 + @size_ngram
51
+ right_boundary_b2 = @cultivation_map.next_cultivated_position(e2) || @pos_s2_final_possible_end
52
+ while @s1[e1] && e2 < right_boundary_b2 && @s1[e1] == @s2[e2]
53
+ e1 += 1; e2 += 1
55
54
  end
56
55
 
57
- return nil if @beg_s1 >= (@s1.length - @size_ngram)
56
+ @pos_s1_last_match = e1
57
+ @pos_s2_last_match = e2
58
58
 
59
- # extend the block
60
- b1 = @beg_s1
61
- b2 = @beg_s2
62
- while b1 >= @end_s1_prev && b2 >= @end_s2_prev && @s1[b1] == @s2[b2]
63
- b1 -= 1; b2 -= 1
64
- end
59
+ {source:{begin:b1 , end:e1}, target:{begin:b2, end:e2}}
60
+ end
65
61
 
66
- b1 += 1; b2 += 1
62
+ private
67
63
 
68
- e1 = @beg_s1 + @size_ngram
69
- e2 = @beg_s2 + @size_ngram
70
- while @s1[e1] && @s1[e1] == @s2[e2]
71
- e1 += 1; e2 += 1
64
+ def get_beg_s2(beg_s1)
65
+ # to get the anchor to search for in s2
66
+ anchor = @s1[beg_s1, @size_ngram]
67
+
68
+ # comment out below with the assumption that texts are in the same order
69
+ # search_position = 0
70
+ search_position = @pos_s2_last_match
71
+
72
+ beg_s2_candidates = find_beg_s2_candidates(anchor, search_position)
73
+ return nil if beg_s2_candidates.empty?
74
+
75
+ find_valid_beg_s2(beg_s1, beg_s2_candidates)
76
+ end
77
+
78
+ # To find the candidate positions in s2 (beg_s2) that match the anchor
79
+ # returns an empty array if the anchor is too frequent (more than 5 matches)
80
+ def find_beg_s2_candidates(anchor, search_position)
81
+ candidates = []
82
+ while _beg_s2 = @cultivation_map.index(anchor, @s2, search_position)
83
+ candidates << _beg_s2
84
+
85
+ # for speed, skip anchor of high frequency
86
+ if candidates.length > 5
87
+ candidates.clear
88
+ break
89
+ end
90
+
91
+ search_position = _beg_s2 + 1
72
92
  end
93
+ candidates
94
+ end
95
+
96
+ def find_valid_beg_s2(beg_s1, beg_s2_candidates)
97
+ valid_beg_s2 = nil
73
98
 
74
- @end_s1_prev = e1
75
- @end_s2_prev = e2
76
- @beg_s1 = e1
99
+ (10 .. 30).step(10).each do |size_window|
100
+ valid_beg_s2 = nil
77
101
 
78
- if @reverse
79
- {source:{begin:b2 , end:e2}, target:{begin:b1, end:e1}}
80
- else
81
- {source:{begin:b1 , end:e1}, target:{begin:b2, end:e2}}
102
+ r = beg_s2_candidates.each do |beg_s2|
103
+ # if both the beginning points are sufficiently close to the end points of the last match
104
+ # break if beg_s1 > 0 && beg_s2 > 0 && (beg_s1 - @pos_s1_last_match < 5) && (beg_s2 >= @pos_s2_last_match) && (beg_s2 - @pos_s2_last_match < 5)
105
+ if beg_s1 > 0 && beg_s2 > 0 && (beg_s1 - @pos_s1_last_match < 5) && (beg_s2 - @pos_s2_last_match < 5)
106
+ break unless valid_beg_s2.nil?
107
+ valid_beg_s2 = beg_s2
108
+ next
109
+ end
110
+
111
+ left_window_s1, left_window_s2 = get_left_windows(beg_s1, beg_s2)
112
+ if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > @sim_threshold)
113
+ break unless valid_beg_s2.nil?
114
+ valid_beg_s2 = beg_s2
115
+ next
116
+ end
117
+
118
+ right_window_s1, right_window_s2 = get_right_windows(beg_s1, beg_s2)
119
+ if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > @sim_threshold)
120
+ break unless valid_beg_s2.nil?
121
+ valid_beg_s2 = beg_s2
122
+ next
123
+ end
124
+ end
125
+
126
+ # r == nil means that the inner loop was broken (multiple candidates had passed the tests)
127
+ # r != nil means that the inner loop was completed (with or w/o a valid beg_s2 found)
128
+ break unless r.nil?
82
129
  end
130
+
131
+ valid_beg_s2
83
132
  end
84
133
 
85
- private
134
+ def get_left_windows(beg_s1, beg_s2, size_window = nil)
135
+ size_window ||= @size_window
86
136
 
87
- def get_left_windows
88
- # commend below with the assumption that the beginning of a document gives a significant locational information
89
- # return if @beg_s1 < @size_window || @beg_s2 < @size_window
137
+ # comment out below with the assumption that the beginning of a document gives a significant locational information
138
+ # return if @beg_s1 < size_window || @beg_s2 < size_window
90
139
 
91
140
  window_s1 = ''
92
- loc = @beg_s1 - 1
141
+ loc = beg_s1 - 1
93
142
  count = 0
94
- while count < @size_window && loc >= 0
143
+ while count < size_window && loc >= 0
95
144
  if @s1[loc] =~ /[0-9a-zA-Z]/
96
145
  window_s1 += @s1[loc]
97
146
  count += 1
@@ -100,9 +149,9 @@ class TextAlignment::AnchorFinder
100
149
  end
101
150
 
102
151
  window_s2 = ''
103
- loc = @beg_s2 - 1
152
+ loc = beg_s2 - 1
104
153
  count = 0
105
- while count < @size_window && loc >= 0
154
+ while count < size_window && loc >= 0
106
155
  if @s2[loc] =~ /[0-9a-zA-Z]/
107
156
  window_s2 += @s2[loc]
108
157
  count += 1
@@ -113,15 +162,17 @@ class TextAlignment::AnchorFinder
113
162
  [window_s1, window_s2]
114
163
  end
115
164
 
116
- def get_right_windows
165
+ def get_right_windows(beg_s1, beg_s2, size_window = nil)
166
+ size_window ||= @size_window
167
+
117
168
  # comment out below with the assumption that the end of a document gives a significant locational information
118
- # return if (@beg_s1 + @size_ngram > (@s1.length - @size_window)) || (@beg_s2 + @size_ngram > (@s2.length - @size_window))
169
+ # return if (@beg_s1 + @size_ngram > (@s1.length - size_window)) || (@beg_s2 + @size_ngram > (@s2.length - size_window))
119
170
 
120
171
  window_s1 = ''
121
- loc = @beg_s1 + @size_ngram
172
+ loc = beg_s1 + @size_ngram
122
173
  len_s1 = @s1.length
123
174
  count = 0
124
- while count < @size_window && loc < len_s1
175
+ while count < size_window && loc < len_s1
125
176
  if @s1[loc] =~ /[0-9a-zA-Z]/
126
177
  window_s1 += @s1[loc]
127
178
  count += 1
@@ -130,10 +181,10 @@ class TextAlignment::AnchorFinder
130
181
  end
131
182
 
132
183
  window_s2 = ''
133
- loc = @beg_s2 + @size_ngram
184
+ loc = beg_s2 + @size_ngram
134
185
  len_s2 = @s2.length
135
186
  count = 0
136
- while count < @size_window && loc < len_s2
187
+ while count < size_window && loc < len_s2
137
188
  if @s2[loc] =~ /[0-9a-zA-Z]/
138
189
  window_s2 += @s2[loc]
139
190
  count += 1
@@ -148,5 +199,4 @@ class TextAlignment::AnchorFinder
148
199
  return 0 if str1.nil? || str2.nil?
149
200
  String::Similarity.cosine(str1, str2, ngram:ngram_order)
150
201
  end
151
-
152
- end
202
+ end
@@ -1,6 +1,6 @@
1
1
  module TextAlignment; end unless defined? TextAlignment
2
2
 
3
- TextAlignment::MAPPINGS = [
3
+ TextAlignment::CHAR_MAPPING = [
4
4
  ["©", "(c)"], #U+00A9 (Copyright Sign)
5
5
 
6
6
  ["α", "alpha"], #U+03B1 (greek small letter alpha)
@@ -61,9 +61,10 @@ TextAlignment::MAPPINGS = [
61
61
  ["•", "*"], #U+2022 (bullet)
62
62
  [" ", " "], #U+2009 (thin space)
63
63
  [" ", " "], #U+200A (hair space)
64
- [" ", " "], #U+00A0 (no-break space)
64
+ [" ", " "], #U+00A0 (Non-Breaking space)
65
65
  [" ", " "], #U+3000 (ideographic space)
66
- ["", "-"], #U+2211 (Non-Breaking Hyphen)
66
+ ["", "-"], #U+2010 (Hyphen)
67
+ ["‑", "-"], #U+2011 (Non-Breaking Hyphen)
67
68
  ["−", "-"], #U+2212 (minus sign)
68
69
  ["–", "-"], #U+2013 (en dash)
69
70
  ["′", "'"], #U+2032 (prime)
@@ -75,98 +76,114 @@ TextAlignment::MAPPINGS = [
75
76
  ]
76
77
 
77
78
 
78
- TextAlignment::PADDING_LETTERS = ['@', '^', '|', '#', '$', '%', '&', '_'] unless defined? TextAlignment::PADDING_LETTERS
79
+ class TextAlignment::CharMapping
80
+ attr_reader :mapped_text
79
81
 
82
+ def initialize(_text, char_mapping = nil)
83
+ char_mapping ||= TextAlignment::CHAR_MAPPING
84
+ @mapped_text, offset_mapping = enmap_text(_text, char_mapping)
85
+ @index_enmap = offset_mapping.to_h
86
+ @index_demap = offset_mapping.map{|m| m.reverse}.to_h
87
+ end
80
88
 
81
- class << TextAlignment
82
- def single_character_mapping_preprocessing(_str1, _str2, _mappings = nil)
83
- _mappings ||= TextAlignment::MAPPINGS
84
-
85
- character_mappings = _mappings.select{|m| m[0].length == 1 && m[1].length == 1}
86
- if character_mappings.empty?
87
- [_str1, _str2, _mappings]
88
- else
89
- characters_from = character_mappings.collect{|m| m[0]}.join
90
- characters_to = character_mappings.collect{|m| m[1]}.join
91
- characters_to.gsub!(/-/, '\-')
89
+ def enmap_position(position)
90
+ @index_enmap[position]
91
+ end
92
92
 
93
- str1 = _str1.tr(characters_from, characters_to)
94
- str2 = _str2.tr(characters_from, characters_to)
93
+ def demap_position(position)
94
+ @index_demap[position]
95
+ end
95
96
 
96
- mappings = _mappings.select{|m| m[0].length > 1 || m[1].length > 1}
97
+ def enmap_denotations(_denotations)
98
+ return nil if _denotations.nil?
97
99
 
98
- [str1, str2, mappings]
100
+ denotations = _denotations.map do |d|
101
+ d.dup.merge(span:{begin:enmap_position(d[:span][:begin]), end:enmap_position(d[:span][:end])})
99
102
  end
100
103
  end
101
104
 
102
- def long_to_one_mapping_preprocessing(_str1, _str2, _mappings = nil)
103
- _mappings ||= TextAlignment::MAPPINGS
104
-
105
- long_to_one_mappings = _mappings.select{|m| m[0].length == 1 && m[1].length > 1}
106
- if long_to_one_mappings.empty?
107
- [_str1, _str2, _mappings]
108
- else
109
- ## long to one character mappings
110
- pletters = TextAlignment::PADDING_LETTERS
111
-
112
- # find the padding letter for str1
113
- @padding_letter1 = begin
114
- i = pletters.index{|l| _str2.index(l).nil?}
115
- raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
116
- TextAlignment::PADDING_LETTERS[i]
117
- end
105
+ private
118
106
 
119
- # find the padding letter for str2
120
- @padding_letter2 = begin
121
- i = pletters.index{|l| l != @padding_letter1 && _str1.index(l).nil?}
122
- raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
123
- TextAlignment::PADDING_LETTERS[i]
124
- end
107
+ def enmap_text(_text, char_mapping)
108
+ text = _text.dup
125
109
 
126
- str1 = str2 = nil
127
- long_to_one_mappings.each do |f|
128
- from = f[1]
129
-
130
- str1 = if _str2.index(f[0])
131
- to = f[0] + (@padding_letter1 * (f[1].length - 1))
132
- _str1.gsub(from, to)
133
- else
134
- _str1
135
- end
136
-
137
- str2 = if _str1.index(f[0])
138
- to = f[0] + (@padding_letter2 * (f[1].length - 1))
139
- _str2.gsub(from, to)
140
- else
141
- _str2
142
- end
110
+ # To execute the single letter mapping
111
+ char_mapping.each do |one, long|
112
+ text.gsub!(one, long) if long.length == 1
113
+ end
114
+
115
+ # To get the (location, length) index for replacements
116
+ loc_len = []
117
+ char_mapping.each do |one, long|
118
+ next if long.length == 1
119
+
120
+ init_next = 0
121
+ while loc = text.index(long, init_next)
122
+ loc_len << [loc, long.length]
123
+ init_next = loc + long.length
143
124
  end
144
- mappings = _mappings.select{|m| m[0].length > 1 || m[1].length == 1}
145
125
 
146
- [str1, str2, mappings]
126
+ # a workaround to avoid messing-up due to embedding
127
+ text.gsub!(long, one * long.length)
147
128
  end
148
- end
149
129
 
150
- def compute_similarity(_s1, _s2, sdiff)
151
- return 0 if sdiff.nil?
130
+ # To get the (location, length) index for consecutive whitespace sequences
131
+ init_next = 0
132
+ while loc = text.index(/\s{2,}/, init_next)
133
+ len = $~[0].length
134
+ loc_len << [loc, len]
135
+ init_next = loc + len
136
+ end
137
+
138
+ loc_len.sort!{|a, b| a[0] <=> b[0]}
139
+
140
+ # To get the offset_mapping before and after replacement
141
+ offset_mapping = []
142
+ init_next = 0
143
+ j = 0
152
144
 
153
- # compute the lcs only with non-whitespace letters
154
- lcs = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
155
- return 0 if lcs == 0
145
+ loc_len.each do |loc, len|
146
+ offset_mapping += (init_next .. loc).map do |i|
147
+ j += 1
148
+ [i, j - 1]
149
+ end
150
+ init_next = loc + len
151
+ end
156
152
 
157
- s1 = if @padding_letter1
158
- _s1.tr(@padding_letter1, ' ')
159
- else
160
- _s1
153
+ offset_mapping += (init_next .. text.length).map do |i|
154
+ j += 1
155
+ [i, j - 1]
161
156
  end
162
157
 
163
- s2 = if @padding_letter2
164
- _s2.tr(@padding_letter2, ' ')
165
- else
166
- _s2
158
+ # To execute the long letter mapping
159
+ char_mapping.each do |one, long|
160
+ text.gsub!(one * long.length, one) if long.length > 1
167
161
  end
168
162
 
169
- similarity = lcs.to_f / [s1.scan(/\S/).count, s2.scan(/\S/).count].min
163
+ # To replace multi whitespace sequences to a space
164
+ text.gsub!(/\s{2,}/, ' ')
165
+
166
+ [text, offset_mapping]
170
167
  end
168
+ end
169
+
170
+ if __FILE__ == $0
171
+ require 'json'
172
+
173
+ unless ARGV.length == 1
174
+ warn "#{$0} an_annotation_json_file.json"
175
+ exit
176
+ end
177
+ annotations = JSON.parse File.read(ARGV[0]).strip, symbolize_names: true
178
+ denotations = annotations[:denotations]
179
+ if denotations.nil? && annotations[:tracks]
180
+ denotations = annotations[:tracks].first[:denotations]
181
+ end
182
+
183
+ text_mapping = TextAlignment::CharMapping.new(annotations[:text])
184
+ text_mapped = text_mapping.mapped_text
185
+ denotations_mapped = text_mapping.enmap_denotations(denotations)
186
+ new_annotations = {text:text_mapped, denotations:denotations_mapped}
171
187
 
188
+ puts new_annotations.to_json
172
189
  end
@@ -0,0 +1,94 @@
1
module TextAlignment; end unless defined? TextAlignment

# Book-keeping of "cultivated" regions of a string: a sorted list of
# non-overlapping [begin, end] position pairs.
# NOTE(review): presumably these are the parts of the reference text that
# earlier alignments have already claimed - confirm against the caller.
#
# A position p is treated as lying inside a region when
# region[0] <= p < region[1] (see #search_again_position).
class TextAlignment::CultivationMap
  # The cultivated regions as [begin, end] pairs, sorted by begin.
  attr_reader :map

  def initialize
    @map = []
  end

  # Adds regions to the map, keeping it sorted by begin position.
  # A region that starts exactly where the previous one ends is merged into it.
  #
  # The map is only replaced once all regions have been checked, so a failed
  # call leaves it untouched. Regions are copied, so the caller's arrays are
  # never modified by a later merge.
  #
  # regions:: array of [begin, end] pairs
  # Returns the updated map.
  # Raises RuntimeError when a region overlaps another one.
  def cultivate(regions)
    merged = []
    (@map + regions).sort_by { |region| region[0] }.each do |region|
      previous = merged.last
      if previous.nil? || previous[1] < region[0]
        merged << region.dup
      elsif previous[1] > region[0]
        raise "Overlapping regions: #{previous} : #{region}"
      else
        # previous[1] == region[0]: touching regions, extend the previous one
        previous[1] = region[1]
      end
    end
    @map = merged
  end

  # Finds the first region whose end lies beyond end_position (which defaults
  # to position). Returns that region's end when the region begins at or
  # before position, i.e. the place from which a search should be resumed;
  # returns nil otherwise.
  def search_again_position(position, end_position = nil)
    end_position ||= position
    region = @map.bsearch { |r| end_position < r[1] }
    return nil if region.nil? || region[0] > position

    region[1]
  end

  # Returns the end of the last region that ends at or before position,
  # or nil when there is none.
  def last_cultivated_position(position)
    ridx = @map.rindex { |r| r[1] <= position }
    ridx.nil? ? nil : @map[ridx][1]
  end

  # Returns the begin of the first region that starts after position,
  # or nil when there is none.
  def next_cultivated_position(position)
    region = @map.bsearch { |r| position < r[0] }
    region.nil? ? nil : region[0]
  end

  # Returns the regions of the map that have their end or their begin inside
  # the given [begin, end] region.
  # NOTE(review): a map region that strictly encloses the given region
  # (begins before it and ends after it) is not selected here; #index rules
  # that case out via #search_again_position before asking for the state.
  def in_regions(region)
    @map.select do |r|
      (r[1] > region[0] && r[1] <= region[1]) || (r[0] < region[1] && r[0] >= region[0])
    end
  end

  # Classifies the given [begin, end] region against the map.
  # Returns a pair [state, span]:
  #   [:open, region]       - no map region found by #in_regions
  #   [:middle_closed, ...] - both ends are open; the span runs from the end of
  #                           the first closed part to the begin of the last one
  #   [:front_open, ...]    - only the front is open; span up to the first closed part
  #   [:rear_open, ...]     - only the rear is open; span from the last closed part
  #   [:closed, nil]        - neither end is open
  def region_state(region)
    closed_parts = in_regions(region)
    return [:open, region] if closed_parts.empty?

    front = front_open?(region, closed_parts)
    rear = rear_open?(region, closed_parts)

    if front && rear
      [:middle_closed, [closed_parts.first[1], closed_parts.last[0]]]
    elsif front
      [:front_open, [region[0], closed_parts.first[0]]]
    elsif rear
      [:rear_open, [closed_parts.last[1], region[1]]]
    else
      [:closed, nil]
    end
  end

  # Like String#index, but only accepts an occurrence of target in string
  # (searching from position) that does not touch any cultivated region.
  # Returns the begin position of that occurrence, or nil when there is none.
  def index(target, string, position)
    length = target.length
    loop do
      found = string.index(target, position)
      break if found.nil?

      # The occurrence begins inside a cultivated region: resume after it
      position = search_again_position(found)
      next unless position.nil?

      break found if region_state([found, found + length])[0] == :open

      position = found + 1
    end
  end

  private

  # True when the region begins before its first closed part.
  def front_open?(region, closed_parts)
    closed_parts.first[0] > region[0]
  end

  # True when the region ends after its last closed part.
  def rear_open?(region, closed_parts)
    closed_parts.last[1] < region[1]
  end
end
@@ -5,7 +5,7 @@ require 'text_alignment/find_divisions'
5
5
  require 'text_alignment/lcs_comparison'
6
6
  require 'text_alignment/lcs_alignment'
7
7
  require 'text_alignment/glcs_alignment'
8
- require 'text_alignment/mappings'
8
+ require 'text_alignment/char_mapping'
9
9
 
10
10
  module TextAlignment; end unless defined? TextAlignment
11
11
 
@@ -106,7 +106,7 @@ if __FILE__ == $0
106
106
 
107
107
  dictionary = [["β", "beta"]]
108
108
  # align = TextAlignment::TextAlignment.new(str1, str2)
109
- align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::MAPPINGS)
109
+ align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::CHAR_MAPPING)
110
110
  p align.common_elements
111
111
  p align.mapped_elements
112
112
  end
@@ -6,7 +6,7 @@ require 'text_alignment/lcs_comparison'
6
6
  require 'text_alignment/lcs_alignment'
7
7
  require 'text_alignment/lcs_cdiff'
8
8
  require 'text_alignment/glcs_alignment'
9
- require 'text_alignment/mappings'
9
+ require 'text_alignment/char_mapping'
10
10
 
11
11
  module TextAlignment; end unless defined? TextAlignment
12
12
 
@@ -20,7 +20,9 @@ class TextAlignment::MixedAlignment
20
20
  def initialize(_str1, _str2, _mappings = nil)
21
21
  raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
22
22
 
23
- str1, str2, mappings = TextAlignment::long_to_one_mapping_preprocessing(_str1, _str2, _mappings)
23
+ mappings ||= TextAlignment::CHAR_MAPPING
24
+ str1 = _str1.dup
25
+ str2 = _str2.dup
24
26
 
25
27
  _compute_mixed_alignment(str1, str2, mappings)
26
28
  end
@@ -63,7 +65,7 @@ class TextAlignment::MixedAlignment
63
65
  end
64
66
 
65
67
  cmp = TextAlignment::LCSComparison.new(str1, str2, lcs, @sdiff)
66
- @similarity = TextAlignment::compute_similarity(str1, str2, @sdiff)
68
+ @similarity = compute_similarity(str1, str2, @sdiff)
67
69
  @str1_match_initial = cmp.str1_match_initial
68
70
  @str1_match_final = cmp.str1_match_final
69
71
  @str2_match_initial = cmp.str2_match_initial
@@ -139,4 +141,20 @@ class TextAlignment::MixedAlignment
139
141
  @position_map_end = posmap_end.sort.to_h
140
142
  end
141
143
 
144
# Scores an alignment from its sdiff as coverage * fragmentation rate.
#
# s1, s2:: the aligned strings (not used by the computation itself)
# sdiff::  sequence of change objects responding to #action and #old_element
# Returns 0 when sdiff is nil or when no non-whitespace character of the old
# text is matched; otherwise a Float.
def compute_similarity(s1, s2, sdiff)
  return 0 if sdiff.nil?

  # recoverability: share of the non-whitespace characters of the old text
  # that are matched ('=')
  count_nws = sdiff.count { |d| d.old_element =~ /\S/ }
  count_nws_match = sdiff.count { |d| d.action == '=' && d.old_element =~ /\S/ }

  # With no matched non-whitespace character there is no matched fragment
  # either, and the divisions below would yield NaN instead of a score.
  return 0 if count_nws_match.zero?

  coverage = count_nws_match.to_f / count_nws

  # fragmentation rate: whitespace characters of the old text per matched
  # fragment, a fragment being a run of '=' actions broken by matched whitespace
  count_ofrag = sdiff.count { |d| d.old_element =~ /\s/ }
  actions = sdiff.map { |d| (d.action == '=') && (d.old_element =~ /\s/) ? ' ' : d.action }
  count_frag = actions.join.scan(/=+/).count
  rate_frag = count_ofrag.to_f / count_frag

  coverage * rate_frag
end
142
160
  end
@@ -2,6 +2,7 @@
2
2
  require 'text_alignment/constants'
3
3
  require 'text_alignment/anchor_finder'
4
4
  require 'text_alignment/mixed_alignment'
5
+ require 'text_alignment/cultivation_map'
5
6
 
6
7
  module TextAlignment; end unless defined? TextAlignment
7
8
 
@@ -10,253 +11,71 @@ class TextAlignment::TextAlignment
10
11
  attr_reader :similarity
11
12
  attr_reader :lost_annotations
12
13
 
13
- def initialize(_str1, _str2, denotations = nil, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
14
- raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
14
+ # Initialize with a reference text, against which texts will be aligned
15
+ def initialize(reference_text, to_prevent_overlap = false)
16
+ raise ArgumentError, "nil text" if reference_text.nil?
15
17
 
16
- @block_alignment = {source_text:_str1, target_text:_str2}
17
- @original_str1 = _str1
18
- @original_str2 = _str2
18
+ @original_reference_text = reference_text
19
+ @rtext_mapping = TextAlignment::CharMapping.new(reference_text)
20
+ @mapped_reference_text = @rtext_mapping.mapped_text
21
+ @to_prevent_overlap = to_prevent_overlap
19
22
 
20
- str1, str2, @mappings = TextAlignment::single_character_mapping_preprocessing(_str1, _str2)
21
-
22
- if r = whole_block_alignment(str1, str2)
23
- @block_alignment[:blocks] = r
24
- return
25
- end
23
+ @original_text = nil
24
+ @block_alignment = nil
25
+ @cultivation_map = TextAlignment::CultivationMap.new
26
+ end
26
27
 
27
- ## to find block alignments
28
- anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, _size_ngram, _size_window, _text_similiarity_threshold)
28
+ def align(text, denotations = nil)
29
+ # To maintain the cultivation map
30
+ update_cultivation_map if @to_prevent_overlap
29
31
 
30
- blocks = []
31
- while block = anchor_finder.get_next_anchor
32
- last = blocks.last
33
- if last && (block[:source][:begin] == last[:source][:end] + 1) && (block[:target][:begin] == last[:target][:end] + 1)
34
- last[:source][:end] = block[:source][:end]
35
- last[:target][:end] = block[:target][:end]
36
- else
37
- blocks << block.merge(alignment: :block, delta: block[:target][:begin] - block[:source][:begin])
38
- end
32
+ # In case the input text is the same as the previous one, reuse the previous text mapping
33
+ unless @original_text && @original_text == text
34
+ @original_text = text
35
+ @text_mapping = TextAlignment::CharMapping.new(text)
39
36
  end
40
37
 
41
- # pp blocks
42
- # puts "-----"
43
- # puts
44
- # exit
45
- # blocks.each do |b|
46
- # p [b[:source], b[:target]]
47
- # puts "---"
48
- # puts str1[b[:source][:begin] ... b[:source][:end]]
49
- # puts "---"
50
- # puts str2[b[:target][:begin] ... b[:target][:end]]
51
- # puts "====="
52
- # puts
53
- # end
54
- # puts "-=-=-=-=-"
55
- # puts
56
-
57
- ## to fill the gaps
58
- last_block = nil
59
- blocks2 = blocks.inject([]) do |sum, block|
60
- b1 = last_block ? last_block[:source][:end] : 0
61
- e1 = block[:source][:begin]
62
-
63
- sum += if b1 == e1
64
- [block]
65
- else
66
- b2 = last_block ? last_block[:target][:end] : 0
67
- e2 = block[:target][:begin]
68
-
69
- if b2 == e2
70
- [
71
- {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
72
- block
73
- ]
74
- else
75
- if b1 == 0 && b2 == 0
76
- len_buffer = (e1 * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
77
- b2 = e2 - len_buffer if e2 > len_buffer
78
- end
79
-
80
- _str1 = str1[b1 ... e1]
81
- _str2 = str2[b2 ... e2]
82
-
83
- if _str1.strip.empty? || _str2.strip.empty?
84
- [
85
- {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
86
- block
87
- ]
88
- else
89
- local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations) << block
90
- end
91
- end
92
- end
38
+ @mapped_text = @text_mapping.mapped_text
39
+ denotations_mapped = @text_mapping.enmap_denotations(denotations)
93
40
 
94
- last_block = block
95
- sum
96
- end
41
+ ## To generate the block_alignment of the input text against the reference text
42
+ # Initialization
43
+ # @block_alignment = {text: @original_text, reference_text: @original_reference_text, denotations: denotations}
44
+ @block_alignment = {text: @mapped_text, reference_text: @mapped_reference_text, denotations: denotations}
97
45
 
98
- # the last step
99
- blocks2 += if last_block.nil?
100
- local_alignment_blocks(str1, 0, str1.length, str2, 0, str2.length, denotations)
46
+ # Generation
47
+ @block_alignment[:blocks] = if r = whole_block_alignment(@mapped_text, @mapped_reference_text, @cultivation_map)
48
+ r
101
49
  else
102
- b1 = last_block[:source][:end]
103
- if b1 < str1.length
104
- e1 = str1.length
105
-
106
- b2 = last_block[:target][:end]
107
- if b2 < str2.length
108
- len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
109
- e2 = (str2.length - b2) > len_buffer ? b2 + len_buffer : str2.length
110
- local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations)
111
- else
112
- [{source:{begin:last_block[:source][:end], end:str1.length}, alignment: :empty}]
113
- end
114
- else
115
- []
116
- end
50
+ find_block_alignment(@mapped_text, @mapped_reference_text, denotations_mapped, @cultivation_map)
117
51
  end
118
-
119
- @block_alignment[:blocks] = blocks2
120
52
  end
121
53
 
122
- def whole_block_alignment(str1, str2)
123
- ## Block exact match
124
- block_begin = str2.index(str1)
125
- unless block_begin.nil?
126
- return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
127
- end
54
+ def update_cultivation_map
55
+ return if @block_alignment.nil? || @block_alignment[:blocks].nil?
128
56
 
129
- block_begin = str2.downcase.index(str1.downcase)
130
- unless block_begin.nil?
131
- return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
132
- end
133
-
134
- nil
135
- end
136
-
137
- def local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations = nil)
138
- block2 = str2[b2 ... e2]
139
-
140
- ## term-based alignment
141
- tblocks = if denotations
142
- ds_in_scope = denotations.select{|d| d[:span][:begin] >= b1 && d[:span][:end] <= e1}.
143
- sort{|d1, d2| d1[:span][:begin] <=> d2[:span][:begin] || d2[:span][:end] <=> d1[:span][:end] }.
144
- map{|d| d.merge(lex:str1[d[:span][:begin] ... d[:span][:end]])}
145
-
146
- position = 0
147
- tblocks = ds_in_scope.map do |term|
148
- lex = term[:lex]
149
- r = block2.index(lex, position)
150
- if r.nil?
151
- position = nil
152
- break
153
- end
154
- position = r + lex.length
155
- {source:term[:span], target:{begin:r + b2, end:r + b2 + lex.length}, alignment: :term, delta: r - term[:span][:begin]}
156
- end
157
-
158
- # missing term found
159
- tblocks = [] if position.nil?
160
-
161
- # redundant matching found
162
- unless position.nil?
163
- ds_in_scope.each do |term|
164
- lex = term[:lex]
165
- look_forward = block2.index(lex, position)
166
- unless look_forward.nil?
167
- tblocks = []
168
- break
169
- end
170
- end
171
- end
172
-
173
- tblocks
174
- else
175
- []
176
- end
177
-
178
- if tblocks.empty?
179
- if b1 == 0 && e1 == str1.length
180
- if (e1 > 2000) || (e2 > 2000)
181
- [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
182
- else
183
- block1 = str1[b1 ... e1]
184
- block2 = str2[b2 ... e2]
185
-
186
- ## character-based alignment
187
- alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase, @mappings)
188
- if alignment.sdiff.nil?
189
- [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
190
- else
191
- [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: alignment, similarity: alignment.similarity}]
192
- end
193
- end
57
+ ## To update the cultivation map
58
+ newly_cultivated_regions = @block_alignment[:blocks].collect do |b|
59
+ if b[:alignment] == :block || b[:alignment] == :term
60
+ [b[:target][:begin], b[:target][:end]]
194
61
  else
195
- block1 = str1[b1 ... e1]
196
- block2 = str2[b2 ... e2]
197
-
198
- ## character-based alignment
199
- alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase, @mappings)
200
- if alignment.sdiff.nil?
201
- [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
202
- else
203
- [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: alignment, similarity: alignment.similarity}]
204
- end
205
- end
206
- else
207
- last_tblock = nil
208
- lblocks = tblocks.inject([]) do |sum, tblock|
209
- tb1 = last_tblock ? last_tblock[:source][:end] : b1
210
- te1 = tblock[:source][:begin]
211
-
212
- sum += if te1 == tb1
213
- [tblock]
214
- else
215
- tb2 = last_tblock ? last_tblock[:target][:end] : b2
216
- te2 = tblock[:target][:begin]
217
-
218
- if b2 == e2
219
- [
220
- {source:{begin:tb1, end:te1}, alignment: :empty},
221
- tblock
222
- ]
223
- else
224
- [
225
- {source:{begin:tb1, end:te1}, target:{begin:tb2, end:te2}, alignment: :empty},
226
- tblock
227
- ]
228
- end
229
- end
230
-
231
- last_tblock = tblock
232
- sum
62
+ nil
233
63
  end
234
-
235
- if last_tblock[:source][:end] < e1
236
- if last_tblock[:target][:end] < e2
237
- lblocks << {source:{begin:last_tblock[:source][:end], end:e1}, target:{begin:last_tblock[:target][:end], end:e2}, alignment: :empty}
238
- else
239
- lblocks << {source:{begin:last_tblock[:source][:end], end:e1}, alignment: :empty}
240
- end
64
+ end.compact.inject([]) do |condensed, region|
65
+ if condensed.empty? || (condensed.last.last + 1 < region.first)
66
+ condensed.push region
67
+ else
68
+ condensed.last[1] = region.last
241
69
  end
242
-
243
- lblocks
70
+ condensed
244
71
  end
245
- end
246
-
247
72
 
248
- def indices(str, target)
249
- position = 0
250
- len = target.len
251
- Enumerator.new do |yielder|
252
- while idx = str.index(target, position)
253
- yielder << idx
254
- position = idx + len
255
- end
256
- end
73
+ @cultivation_map.cultivate(newly_cultivated_regions)
257
74
  end
258
75
 
259
- def transform_begin_position(begin_position)
76
+ def transform_begin_position(_begin_position)
77
+ begin_position = @text_mapping.enmap_position(_begin_position)
78
+
260
79
  i = @block_alignment[:blocks].index{|b| b[:source][:end] > begin_position}
261
80
  block = @block_alignment[:blocks][i]
262
81
 
@@ -272,9 +91,13 @@ class TextAlignment::TextAlignment
272
91
  r = block[:alignment].transform_begin_position(begin_position - block[:source][:begin])
273
92
  r.nil? ? nil : r + block[:target][:begin]
274
93
  end
94
+
95
+ @rtext_mapping.demap_position(b)
275
96
  end
276
97
 
277
- def transform_end_position(end_position)
98
+ def transform_end_position(_end_position)
99
+ end_position = @text_mapping.enmap_position(_end_position)
100
+
278
101
  i = @block_alignment[:blocks].index{|b| b[:source][:end] >= end_position}
279
102
  block = @block_alignment[:blocks][i]
280
103
 
@@ -290,6 +113,8 @@ class TextAlignment::TextAlignment
290
113
  r = block[:alignment].transform_end_position(end_position - block[:source][:begin])
291
114
  r.nil? ? nil : r + block[:target][:begin]
292
115
  end
116
+
117
+ @rtext_mapping.demap_position(e)
293
118
  end
294
119
 
295
120
  def transform_a_span(span)
@@ -308,7 +133,7 @@ class TextAlignment::TextAlignment
308
133
  source = {begin:d.begin, end:d.end}
309
134
  d.begin = transform_begin_position(d.begin);
310
135
  d.end = transform_end_position(d.end);
311
- raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @original_str2.length
136
+ raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @original_reference_text.length
312
137
  rescue
313
138
  @lost_annotations << {source: source, target:{begin:d.begin, end:d.end}}
314
139
  d.begin = nil
@@ -324,7 +149,7 @@ class TextAlignment::TextAlignment
324
149
 
325
150
  r = hdenotations.collect do |d|
326
151
  t = transform_a_span(d[:span])
327
- raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @original_str2.length
152
+ raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @original_reference_text.length
328
153
  new_d = d.dup.merge({span:t})
329
154
  rescue
330
155
  @lost_annotations << {source: d[:span], target:t}
@@ -335,8 +160,8 @@ class TextAlignment::TextAlignment
335
160
  end
336
161
 
337
162
  def alignment_show
338
- stext = @block_alignment[:source_text]
339
- ttext = @block_alignment[:target_text]
163
+ stext = @mapped_text
164
+ ttext = @mapped_reference_text
340
165
 
341
166
  show = ''
342
167
  @block_alignment[:blocks].each do |a|
@@ -392,9 +217,192 @@ class TextAlignment::TextAlignment
392
217
 
393
218
  "***** local mismatch [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}] (similarity: #{a[:similarity]})\n" +
394
219
  "[#{astr1}]\n" +
395
- "[#{astr2}]\n\n"
220
+ "[#{astr2.gsub("\n", " ")}]\n\n"
396
221
  end
397
222
  end
398
223
  show
399
224
  end
225
+
226
+ private
227
+
228
+ def find_block_alignment(str1, str2, denotations, cultivation_map)
229
+ ## to find block alignments
230
+ anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, cultivation_map)
231
+
232
+ blocks = []
233
+ while block = anchor_finder.get_next_anchor
234
+ last = blocks.last
235
+ if last && (block[:source][:begin] == last[:source][:end] + 1) && (block[:target][:begin] == last[:target][:end] + 1)
236
+ last[:source][:end] = block[:source][:end]
237
+ last[:target][:end] = block[:target][:end]
238
+ else
239
+ blocks << block.merge(alignment: :block, delta: block[:target][:begin] - block[:source][:begin])
240
+ end
241
+ end
242
+
243
+ # pp blocks
244
+ # puts "-----"
245
+ # puts
246
+ # exit
247
+ # blocks.each do |b|
248
+ # p [b[:source], b[:target]]
249
+ # puts "---"
250
+ # puts str1[b[:source][:begin] ... b[:source][:end]]
251
+ # puts "---"
252
+ # puts str2[b[:target][:begin] ... b[:target][:end]]
253
+ # puts "====="
254
+ # puts
255
+ # end
256
+ # puts "-=-=-=-=-"
257
+ # puts
258
+
259
+ ## To fill the gaps
260
+ ## lblock: last block, cblock: current block
261
+ lblock = nil
262
+ blocks2 = (blocks + [nil]).inject([]) do |sum, cblock|
263
+ b1 = lblock.nil? ? 0 : lblock[:source][:end]
264
+ e1 = cblock.nil? ? str1.length : cblock[:source][:begin]
265
+
266
+ if b1 < e1
267
+ b2 = lblock.nil? ? 0 : lblock[:target][:end]
268
+ e2 = cblock.nil? ? str2.length : cblock[:target][:begin]
269
+ _str1 = str1[b1 ... e1]
270
+ _str2 = str2[b2 ... e2]
271
+
272
+ sum += if _str1.strip.empty? || _str2.strip.empty?
273
+ [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
274
+ else
275
+ len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
276
+ region_state, state_region = cultivation_map.region_state([b2, e2])
277
+ case region_state
278
+ when :closed
279
+ []
280
+ when :front_open
281
+ oe2 = state_region[1]
282
+ me2 = (oe2 - b2) > len_buffer ? b2 + len_buffer : oe2
283
+ local_alignment(str1, b1, e1, str2, b2, me2, denotations, cultivation_map)
284
+ when :rear_open
285
+ ob2 = state_region[0]
286
+ mb2 = (e2 - ob2) > len_buffer ? e2 - len_buffer : ob2
287
+ local_alignment(str1, b1, e1, str2, mb2, e2, denotations, cultivation_map)
288
+ when :middle_closed
289
+ oe2 = state_region[0]
290
+ me2 = (oe2 - b2) > len_buffer ? b2 + len_buffer : oe2
291
+ attempt1 = local_alignment(str1, b1, e1, str2, b2, me2, denotations, cultivation_map)
292
+ if attempt1.empty?
293
+ ob2 = state_region[1]
294
+ mb2 = (e2 - ob2) > len_buffer ? e2 - len_buffer : ob2
295
+ local_alignment(str1, b1, e1, str2, mb2, e2, denotations, cultivation_map)
296
+ else
297
+ attempt1
298
+ end
299
+ else # :open
300
+ if (e2 - b2) > len_buffer
301
+ attempt1 = local_alignment(str1, b1, e1, str2, b2, b2 + len_buffer, denotations, cultivation_map)
302
+ if attempt1.empty?
303
+ local_alignment(str1, b1, e1, str2, e2 - len_buffer, e2, denotations, cultivation_map)
304
+ else
305
+ attempt1
306
+ end
307
+ else
308
+ local_alignment(str1, b1, e1, str2, b2, e2, denotations, cultivation_map)
309
+ end
310
+ end
311
+ end
312
+ end
313
+
314
+ lblock = cblock
315
+ cblock.nil? ? sum : sum << cblock
316
+ end
317
+
318
+ end
319
+
320
+ def whole_block_alignment(str1, str2, cultivation_map)
321
+ block_begin = cultivation_map.index(str1, str2, 0)
322
+ return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}] unless block_begin.nil?
323
+
324
+ block_begin = cultivation_map.index(str1.downcase, str2.downcase, 0)
325
+ return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}] unless block_begin.nil?
326
+
327
+ nil
328
+ end
329
+
330
+ def local_alignment(str1, b1, e1, str2, b2, e2, denotations = nil, cultivation_map)
331
+ tblocks = term_based_alignment(str1, b1, e1, str2, b2, e2, denotations, cultivation_map)
332
+ if tblocks.empty?
333
+ lcs_alignment(str1, b1, e1, str2, b2, e2, cultivation_map)
334
+ else
335
+ tblocks
336
+ end
337
+ end
338
+
339
+ def term_based_alignment(str1, b1, e1, str2, b2, e2, denotations = nil, cultivation_map)
340
+ str2_block = str2[0 ... e2]
341
+
342
+ ## term-based alignment
343
+ tblocks = if denotations
344
+ denotations_in_scope = denotations.select{|d| d[:span][:begin] >= b1 && d[:span][:end] <= e1}.
345
+ sort{|d1, d2| d1[:span][:begin] <=> d2[:span][:begin] || d2[:span][:end] <=> d1[:span][:end] }.
346
+ map{|d| d.merge(lex:str1[d[:span][:begin] ... d[:span][:end]])}
347
+
348
+ search_position = b2
349
+ _tblocks = denotations_in_scope.map do |denotation|
350
+ lex = denotation[:lex]
351
+ term_begin = cultivation_map.index(lex, str2_block, search_position)
352
+ break [] if term_begin.nil? # break the loop if a missing term is found
353
+ search_position = term_begin + lex.length
354
+ {source:denotation[:span], target:{begin:term_begin, end:term_begin + lex.length}, alignment: :term, similarity: 0.9, delta: term_begin - denotation[:span][:begin]}
355
+ end
356
+
357
+ # redundant matching found
358
+ unless _tblocks.empty?
359
+ search_position = _tblocks.last[:target][:end]
360
+ denotations_in_scope.each do |term|
361
+ look_forward = cultivation_map.index(term[:lex], str2_block, search_position)
362
+ unless look_forward.nil?
363
+ _tblocks = []
364
+ break
365
+ end
366
+ end
367
+ end
368
+
369
+ _tblocks
370
+ else
371
+ []
372
+ end
373
+
374
+ ltblock = nil
375
+ tblocks2 = (tblocks + [nil]).inject([]) do |sum, ctblock|
376
+ tb1 = ltblock.nil? ? b1 : ltblock[:source][:end]
377
+ te1 = ctblock.nil? ? e1 : ctblock[:source][:begin]
378
+
379
+ if te1 > tb1
380
+ tb2 = ltblock.nil? ? b2 : ltblock[:target][:end]
381
+ te2 = ctblock.nil? ? e2 : ctblock[:target][:begin]
382
+ sum << {source:{begin:tb1, end:te1}, target:{begin:tb2, end:te2}, alignment: :empty}
383
+ end
384
+
385
+ ltblock = ctblock
386
+ ctblock.nil? ? sum : sum << ctblock
387
+ end
388
+
389
+ tblocks2
390
+ end
391
+
392
+ def lcs_alignment(str1, b1, e1, str2, b2, e2, cultivation_map)
393
+ source = {begin:b1, end:e1}
394
+ target = {begin:b2, end:e2}
395
+
396
+ if (e1 - b1) > 2000
397
+ [{source:source, target:target, alignment: :empty}]
398
+ else
399
+ alignment = TextAlignment::MixedAlignment.new(str1[b1 ... e1].downcase, str2[b2 ... e2].downcase)
400
+ if alignment.similarity < 0.5
401
+ [{source:source, target:target, alignment: :empty}]
402
+ else
403
+ [{source:source, target:target, alignment: alignment, similarity: alignment.similarity}]
404
+ end
405
+ end
406
+ end
407
+
400
408
  end
@@ -1,3 +1,3 @@
1
1
  class TextAlignment
2
- VERSION = '0.8.1'
2
+ VERSION = '0.11.1'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_alignment
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.1
4
+ version: 0.11.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-10-26 00:00:00.000000000 Z
11
+ date: 2021-03-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-dictionary
@@ -77,7 +77,9 @@ files:
77
77
  - lib/text_alignment.rb
78
78
  - lib/text_alignment/anchor_finder.rb
79
79
  - lib/text_alignment/approximate_fit.rb
80
+ - lib/text_alignment/char_mapping.rb
80
81
  - lib/text_alignment/constants.rb
82
+ - lib/text_alignment/cultivation_map.rb
81
83
  - lib/text_alignment/find_divisions.rb
82
84
  - lib/text_alignment/glcs_alignment.rb
83
85
  - lib/text_alignment/glcs_alignment_fast.rb
@@ -86,7 +88,6 @@ files:
86
88
  - lib/text_alignment/lcs_cdiff.rb
87
89
  - lib/text_alignment/lcs_comparison.rb
88
90
  - lib/text_alignment/lcs_min.rb
89
- - lib/text_alignment/mappings.rb
90
91
  - lib/text_alignment/mixed_alignment.rb
91
92
  - lib/text_alignment/text_alignment.rb
92
93
  - lib/text_alignment/version.rb