text_alignment 0.7.3 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: '0788bdb6d161499f5a5258757b9f61faee96b60246b422b57b17ba953b4a2c87'
4
- data.tar.gz: 4564fd15e1e1d673932438206989dc706aa67f2467698f16946e2635c562ec90
3
+ metadata.gz: 4d626b64acdca0630dc344e2f0f4c2152e481ca7c8209ac1aa8c025fb1ae7c0f
4
+ data.tar.gz: 972222f1f3a575cdb30cd83f886d6d3f36109b79a35cc2c970aa89cbe8fd007d
5
5
  SHA512:
6
- metadata.gz: c93882cc28e8bfdbdeed325b282b63fd3f3644d1739ed979ef92a7d8a133e26f4a8ffd0bda22da0fc2e0a31c77c6d49ce89f8f0fae802c9ae2041a7db60a2a4e
7
- data.tar.gz: 238ed44ac0c0a178a64743846550639c626d3a300a79f05fedb016e71a890f11025b52d260ce2f46f00d68b1c4b9c13c48a8be8712f737bb2b57b0274b174b8b
6
+ metadata.gz: 288691e9ee650af3b9dfe5b47f1a3d05ff952292958ef733a5a5f47aeeaa73e6e1b7b0591afe3549b6cbabf04817bf406d3ca2ee55faf65b7e3816dfe52be644
7
+ data.tar.gz: 7f45baecc36f00310f868a596ca60bce709078bdc51a5f9438a916cd42062bb93dacadafb64e90e42276d0633914c978045fed4b597ea7411c8762761be6bd36
@@ -26,33 +26,44 @@ def read_text(filename)
26
26
  end
27
27
  end
28
28
 
29
- def align_mdoc(source_annotations, target_annotations)
29
+ def align_denotations(denotations, source_text, alignment, debug = false)
30
+ alignment.align(source_text, denotations)
31
+ new_denotations = alignment.transform_hdenotations(denotations)
32
+
33
+ if debug
34
+ warn "[block alignment]"
35
+ warn alignment.alignment_show
36
+ warn "-----"
37
+ end
38
+
39
+ lost_annotations = alignment.lost_annotations
40
+ unless lost_annotations.empty?
41
+ warn "\n[lost annotations] #{lost_annotations.length}"
42
+ lost_annotations.each do |a|
43
+ warn "#{a}"
44
+ end
45
+ warn "====="
46
+ end
47
+ warn
48
+
49
+ # return target annotations
50
+ new_denotations
51
+ end
52
+
53
+ def align_mannotations(source_annotations, reference_text, alignment, debug = false)
54
+ target_annotations = {text:reference_text}
55
+
30
56
  idnum_denotations = 0
31
57
  idnum_relations = 0
32
58
  idnum_attributes = 0
33
59
  idnum_modifications = 0
34
60
 
35
- source_annotations.each do |annotations|
36
- alignment = TextAlignment::TextAlignment.new(annotations[:text], target_annotations[:text])
37
-
38
- puts alignment.alignment_show
39
- puts "-----"
40
- puts
41
-
42
- # alignment.block_alignments.each do |a|
43
- # p {source:a[:source], target:a[:target]}
44
- # puts "--"
45
- # p a[:alignment] if a[:alignment].nil? || a[:alignment] == :empty
46
- # puts "--"
47
- # puts annotations[:text][a[:source][:begin] ... a[:source][:end]]
48
- # puts "--"
49
- # puts target_text[a[:target][:begin] ... a[:target][:end]]
50
- # puts "======"
51
- # end
52
-
61
+ source_annotations.each_with_index do |annotations, i|
53
62
  if annotations.has_key?(:denotations) && !annotations[:denotations].empty?
54
63
  ididx = {}
55
- denotations = alignment.transform_hdenotations(annotations[:denotations])
64
+ warn "[#{i}]-=-=-=-=-"
65
+ denotations = align_denotations(annotations[:denotations], annotations[:text], alignment, debug)
66
+
56
67
  denotations.each do |d|
57
68
  reid = 'T' + (idnum_denotations += 1).to_s
58
69
  ididx[d[:id]] = reid
@@ -99,128 +110,16 @@ unless ARGV.length == 2
99
110
  end
100
111
 
101
112
  source_annotations = read_annotations(ARGV[0])
102
- target_text = read_text(ARGV[1])
113
+ reference_text = read_text(ARGV[1])
103
114
 
104
- lost_annotations = []
105
- target_annotations = if source_annotations.class == Array
106
- align_mdoc(source_annotations, {text: target_text})
107
- else
108
- alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text, source_annotations[:denotations])
109
-
110
- # verification
111
- # source_text = source_annotations[:text]
112
- # puts "=====BEGIN"
113
- # (0 ... source_text.rstrip.length).each do |p|
114
- # t = alignment.transform_begin_position(p)
115
- # if t.nil?
116
- # print source_text[p]
117
- # else
118
- # print '.'
119
- # end
120
- # end
121
- # puts
122
- # puts "=====END"
123
-
124
- # puts "=====BEGIN"
125
- # (0 .. source_text.rstrip.length).each do |p|
126
- # t = alignment.transform_end_position(p)
127
- # if t.nil?
128
- # print source_text[p]
129
- # else
130
- # print '.'
131
- # end
132
- # end
133
- # puts
134
- # puts "=====END"
135
-
136
- source_text = source_annotations[:text]
137
-
138
- puts "[block alignment]"
139
- puts alignment.alignment_show
140
- puts "====="
141
- # exit
142
-
143
- denotations = alignment.transform_hdenotations(source_annotations[:denotations])
144
- lost_annotations += alignment.lost_annotations if alignment.lost_annotations
145
-
146
- source_annotations.merge({text:target_text, denotations:denotations})
147
- end
115
+ alignment = TextAlignment::TextAlignment.new(reference_text, true)
148
116
 
149
- num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source = if source_annotations.class == Array
150
- num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source = 0, 0, 0, 0
151
- source_annotations.each do |annotations|
152
- num_denotations_source += annotations[:denotations].nil? ? 0 : annotations[:denotations].length
153
- num_relations_source += annotations[:relations].nil? ? 0 : annotations[:relations].length
154
- num_attributes_source += annotations[:attributes].nil? ? 0 : annotations[:attributes].length
155
- num_modifications_source += annotations[:modifications].nil? ? 0 : annotations[:modifications].length
156
- end
157
- [num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source]
117
+ target_annotations = if source_annotations.class == Array
118
+ align_mannotations(source_annotations, reference_text, alignment, false)
158
119
  else
159
- num_denotations_source = source_annotations[:denotations].nil? ? 0 : source_annotations[:denotations].length
160
- num_relations_source = source_annotations[:relations].nil? ? 0 : source_annotations[:relations].length
161
- num_attributes_source = source_annotations[:attributes].nil? ? 0 : source_annotations[:attributes].length
162
- num_modifications_source = source_annotations[:modifications].nil? ? 0 : source_annotations[:modifications].length
163
- [num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source]
164
- end
165
-
166
- warn "[source]"
167
- warn "denotations:\t#{num_denotations_source}"
168
- # warn "relations:\t#{num_relations_source}"
169
- # warn "attributes:\t#{num_attributes_source}"
170
- # warn "modifications:\t#{num_modifications_source}"
171
-
172
- warn "\n[target]"
173
- warn "denotations:\t#{target_annotations[:denotations].nil? ? 0 : target_annotations[:denotations].length}"
174
- # warn "relations:\t#{target_annotations[:relations].nil? ? 0 : target_annotations[:relations].length}"
175
- # warn "attributes:\t#{target_annotations[:attributes].nil? ? 0 : target_annotations[:attributes].length}"
176
- # warn "modifications:\t#{target_annotations[:modifications].nil? ? 0 : target_annotations[:modifications].length}"
177
-
178
- if lost_annotations
179
- warn "\n[lost annotations]"
180
- lost_annotations.each do |a|
181
- p a
182
- end
120
+ denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment)
121
+ # denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], reference_text, true)
122
+ source_annotations.merge({text:reference_text, denotations:denotations})
183
123
  end
184
124
 
185
- #puts target_annotations.to_json
186
-
187
- # denotations = anns1[:denotations]
188
-
189
- # puts "[Alignment1]====="
190
- # align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::MAPPINGS)
191
-
192
- # align.alignment.each do |a|
193
- # p [a[:target][:begin], a[:target][:end], a[:source][:begin], a[:source][:end]]
194
- # end
195
-
196
- # puts TextAlignment::sdiff2cdiff(align.sdiff)
197
- # puts
198
- # puts "[Similarity]\n#{align.similarity}"
199
- # puts
200
- # puts '[Denotations original]'
201
- # pp denotations
202
- # puts
203
- # puts '[Denotations transformed]'
204
- # new_denotations = align.transform_hdenotations(denotations)
205
- # pp new_denotations
206
- # puts
207
- # puts "[Alignment2 (downcased)]====="
208
- # align = TextAlignment::TextAlignment.new(str1.downcase, str2.downcase, TextAlignment::MAPPINGS)
209
- # puts TextAlignment::sdiff2cdiff(align.sdiff)
210
- # puts
211
- # puts "[Similarity]\n#{align.similarity}"
212
- # puts
213
- # puts '[Denotations original]'
214
- # pp denotations
215
- # puts
216
- # puts '[Denotations transformed]'
217
- # new_denotations = align.transform_hdenotations(denotations)
218
- # pp new_denotations
219
- # puts
220
- # puts '[Annotations transformed]'
221
- # anns2[:denotations] = new_denotations
222
- # puts anns2.to_json
223
-
224
- # p align.common_elements
225
- # puts "---------------"
226
- # p align.mapped_elements
125
+ puts target_annotations.to_json
@@ -6,76 +6,65 @@ module TextAlignment; end unless defined? TextAlignment
6
6
 
7
7
  class TextAlignment::AnchorFinder
8
8
 
9
- def initialize(source_str, target_str, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
10
- @size_ngram = _size_ngram || TextAlignment::SIZE_NGRAM
11
- @size_window = _size_window || TextAlignment::SIZE_WINDOW
12
- @sim_threshold = _text_similiarity_threshold || TextAlignment::TEXT_SIMILARITY_THRESHOLD
13
-
14
- @reverse = (target_str.length < source_str.length)
15
-
16
- @s1, @s2 = if @reverse
9
+ def initialize(source_str, target_str, cultivation_map)
10
+ @s1, @s2 = if reverse?(source_str, target_str)
17
11
  [target_str.downcase, source_str.downcase]
18
12
  else
19
13
  [source_str.downcase, target_str.downcase]
20
14
  end
21
15
 
22
- # current position in s1
23
- @beg_s1 = 0
24
- @end_s1_prev = 0
25
- @end_s2_prev = 0
26
- end
27
-
28
- def get_next_anchor
29
- # find the position of an anchor ngram in s1 and s2
30
- while @beg_s1 < (@s1.length - @size_ngram)
31
- if [' ', "\n", "\t"].include? @s1[@beg_s1]
32
- @beg_s1 += 1
33
- next
34
- end
35
- anchor = @s1[@beg_s1, @size_ngram]
16
+ @cultivation_map = cultivation_map
36
17
 
37
- # search_position = 0
38
- search_position = @end_s2_prev
39
- while @beg_s2 = @s2.index(anchor, search_position)
40
- # if both the begining points are sufficiantly close to the end points of the last match
41
- break if @beg_s1 > 0 && @beg_s2 > 0 && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 >= @end_s2_prev) && (@beg_s2 - @end_s2_prev < 5)
18
+ @size_ngram = TextAlignment::SIZE_NGRAM
19
+ @size_window = TextAlignment::SIZE_WINDOW
20
+ @sim_threshold = TextAlignment::TEXT_SIMILARITY_THRESHOLD
21
+ @pos_s1_final_possible_begin = @s1.length - @size_ngram - 1
42
22
 
43
- left_window_s1, left_window_s2 = get_left_windows
44
- break if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > @sim_threshold)
23
+ # positions of last match
24
+ @pos_s1_last_match = 0
25
+ @pos_s2_last_match = 0
26
+ end
45
27
 
46
- right_window_s1, right_window_s2 = get_right_windows
47
- break if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > @sim_threshold)
28
+ def reverse?(source_str = nil, target_str = nil)
29
+ unless source_str.nil?
30
+ @reverse_p = target_str.length < source_str.length
31
+ end
32
+ @reverse_p
33
+ end
48
34
 
49
- search_position = @beg_s2 + 1
50
- end
35
+ def get_next_anchor
36
+ # To find the beginning positions of an anchor ngram in s1 and s2, beginning from the last positions matched
37
+ beg_s2 = for beg_s1 in @pos_s1_last_match .. @pos_s1_final_possible_begin
51
38
 
52
- break unless @beg_s2.nil?
39
+ # To skip whitespace letters
40
+ next if [' ', "\n", "\t"].include? @s1[beg_s1]
53
41
 
54
- @beg_s1 += 1
42
+ _beg_s2 = get_beg_s2(beg_s1)
43
+ break _beg_s2 unless _beg_s2.nil?
55
44
  end
56
45
 
57
- return nil if @beg_s1 >= (@s1.length - @size_ngram)
46
+ # To return nil when it fails to find an anchor
47
+ return nil if beg_s2.class == Range
58
48
 
59
- # extend the block
60
- b1 = @beg_s1
61
- b2 = @beg_s2
62
- while b1 >= @end_s1_prev && b2 >= @end_s2_prev && @s1[b1] == @s2[b2]
49
+ # To extend the block to the left
50
+ b1 = beg_s1
51
+ b2 = beg_s2
52
+ while b1 >= @pos_s1_last_match && b2 >= @pos_s2_last_match && @s1[b1] == @s2[b2]
63
53
  b1 -= 1; b2 -= 1
64
54
  end
65
-
66
55
  b1 += 1; b2 += 1
67
56
 
68
- e1 = @beg_s1 + @size_ngram
69
- e2 = @beg_s2 + @size_ngram
57
+ # To extend the block to the right
58
+ e1 = beg_s1 + @size_ngram
59
+ e2 = beg_s2 + @size_ngram
70
60
  while @s1[e1] && @s1[e1] == @s2[e2]
71
61
  e1 += 1; e2 += 1
72
62
  end
73
63
 
74
- @end_s1_prev = e1
75
- @end_s2_prev = e2
76
- @beg_s1 = e1
64
+ @pos_s1_last_match = e1
65
+ @pos_s2_last_match = e2
77
66
 
78
- if @reverse
67
+ if reverse?
79
68
  {source:{begin:b2 , end:e2}, target:{begin:b1, end:e1}}
80
69
  else
81
70
  {source:{begin:b1 , end:e1}, target:{begin:b2, end:e2}}
@@ -84,14 +73,92 @@ class TextAlignment::AnchorFinder
84
73
 
85
74
  private
86
75
 
87
- def get_left_windows
88
- # commend below with the assumption that the beginning of a document gives a significant locational information
89
- # return if @beg_s1 < @size_window || @beg_s2 < @size_window
76
+ def get_beg_s2(beg_s1)
77
+ # to get the anchor to search for in s2
78
+ anchor = @s1[beg_s1, @size_ngram]
79
+
80
+ # comment out below with the assumption that texts are in the same order
81
+ # search_position = 0
82
+ search_position = @pos_s2_last_match
83
+
84
+ beg_s2_candidates = find_beg_s2_candidates(anchor, search_position)
85
+ return nil if beg_s2_candidates.empty?
86
+
87
+ find_valid_beg_s2(beg_s1, beg_s2_candidates)
88
+ end
89
+
90
+ # To find beg_s2 which match to the anchor
91
+ # return nil if the anchor is too much frequent
92
+ def find_beg_s2_candidates(anchor, search_position)
93
+ candidates = []
94
+ while _beg_s2 = @s2.index(anchor, search_position)
95
+ search_again_position = @cultivation_map.search_again_position(_beg_s2)
96
+ unless search_again_position.nil?
97
+ search_position = search_again_position
98
+ next
99
+ end
100
+
101
+ candidates << _beg_s2
102
+
103
+ # for speed, skip anchor of high frequency
104
+ if candidates.length > 5
105
+ candidates.clear
106
+ break
107
+ end
108
+
109
+ search_position = _beg_s2 + 1
110
+ end
111
+ candidates
112
+ end
113
+
114
+ def find_valid_beg_s2(beg_s1, beg_s2_candidates)
115
+ valid_beg_s2 = nil
116
+
117
+ (10 .. 30).step(10).each do |size_window|
118
+ valid_beg_s2 = nil
119
+
120
+ r = beg_s2_candidates.each do |beg_s2|
121
+ # if both the begining points are sufficiantly close to the end points of the last match
122
+ # break if beg_s1 > 0 && beg_s2 > 0 && (beg_s1 - @pos_s1_last_match < 5) && (beg_s2 >= @pos_s2_last_match) && (beg_s2 - @pos_s2_last_match < 5)
123
+ if beg_s1 > 0 && beg_s2 > 0 && (beg_s1 - @pos_s1_last_match < 5) && (beg_s2 - @pos_s2_last_match < 5)
124
+ break unless valid_beg_s2.nil?
125
+ valid_beg_s2 = beg_s2
126
+ next
127
+ end
128
+
129
+ left_window_s1, left_window_s2 = get_left_windows(beg_s1, beg_s2)
130
+ if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > @sim_threshold)
131
+ break unless valid_beg_s2.nil?
132
+ valid_beg_s2 = beg_s2
133
+ next
134
+ end
135
+
136
+ right_window_s1, right_window_s2 = get_right_windows(beg_s1, beg_s2)
137
+ if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > @sim_threshold)
138
+ break unless valid_beg_s2.nil?
139
+ valid_beg_s2 = beg_s2
140
+ next
141
+ end
142
+ end
143
+
144
+ # r == nil means that the inner loop was broken (multiple candidates had passed the tests)
145
+ # r != nil means that the inner loop was completed (with or w/o a valid beg_s2 found)
146
+ break unless r.nil?
147
+ end
148
+
149
+ valid_beg_s2
150
+ end
151
+
152
+ def get_left_windows(beg_s1, beg_s2, size_window = nil)
153
+ size_window ||= @size_window
154
+
155
+ # comment out below with the assumption that the beginning of a document gives a significant locational information
156
+ # return if @beg_s1 < size_window || @beg_s2 < size_window
90
157
 
91
158
  window_s1 = ''
92
- loc = @beg_s1 - 1
159
+ loc = beg_s1 - 1
93
160
  count = 0
94
- while count < @size_window && loc >= 0
161
+ while count < size_window && loc >= 0
95
162
  if @s1[loc] =~ /[0-9a-zA-Z]/
96
163
  window_s1 += @s1[loc]
97
164
  count += 1
@@ -100,9 +167,9 @@ class TextAlignment::AnchorFinder
100
167
  end
101
168
 
102
169
  window_s2 = ''
103
- loc = @beg_s2 - 1
170
+ loc = beg_s2 - 1
104
171
  count = 0
105
- while count < @size_window && loc >= 0
172
+ while count < size_window && loc >= 0
106
173
  if @s2[loc] =~ /[0-9a-zA-Z]/
107
174
  window_s2 += @s2[loc]
108
175
  count += 1
@@ -113,15 +180,17 @@ class TextAlignment::AnchorFinder
113
180
  [window_s1, window_s2]
114
181
  end
115
182
 
116
- def get_right_windows
183
+ def get_right_windows(beg_s1, beg_s2, size_window = nil)
184
+ size_window ||= @size_window
185
+
117
186
  # commend below with the assumption that the end of a document gives a significant locational
118
- # return if (@beg_s1 + @size_ngram > (@s1.length - @size_window)) || (@beg_s2 + @size_ngram > (@s2.length - @size_window))
187
+ # return if (@beg_s1 + @size_ngram > (@s1.length - size_window)) || (@beg_s2 + @size_ngram > (@s2.length - size_window))
119
188
 
120
189
  window_s1 = ''
121
- loc = @beg_s1 + @size_ngram
190
+ loc = beg_s1 + @size_ngram
122
191
  len_s1 = @s1.length
123
192
  count = 0
124
- while count < @size_window && loc < len_s1
193
+ while count < size_window && loc < len_s1
125
194
  if @s1[loc] =~ /[0-9a-zA-Z]/
126
195
  window_s1 += @s1[loc]
127
196
  count += 1
@@ -130,10 +199,10 @@ class TextAlignment::AnchorFinder
130
199
  end
131
200
 
132
201
  window_s2 = ''
133
- loc = @beg_s2 + @size_ngram
202
+ loc = beg_s2 + @size_ngram
134
203
  len_s2 = @s2.length
135
204
  count = 0
136
- while count < @size_window && loc < len_s2
205
+ while count < size_window && loc < len_s2
137
206
  if @s2[loc] =~ /[0-9a-zA-Z]/
138
207
  window_s2 += @s2[loc]
139
208
  count += 1
@@ -148,5 +217,4 @@ class TextAlignment::AnchorFinder
148
217
  return 0 if str1.nil? || str2.nil?
149
218
  String::Similarity.cosine(str1, str2, ngram:ngram_order)
150
219
  end
151
-
152
- end
220
+ end