text_alignment 0.7.3 → 0.11.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: '0788bdb6d161499f5a5258757b9f61faee96b60246b422b57b17ba953b4a2c87'
4
- data.tar.gz: 4564fd15e1e1d673932438206989dc706aa67f2467698f16946e2635c562ec90
3
+ metadata.gz: 4d626b64acdca0630dc344e2f0f4c2152e481ca7c8209ac1aa8c025fb1ae7c0f
4
+ data.tar.gz: 972222f1f3a575cdb30cd83f886d6d3f36109b79a35cc2c970aa89cbe8fd007d
5
5
  SHA512:
6
- metadata.gz: c93882cc28e8bfdbdeed325b282b63fd3f3644d1739ed979ef92a7d8a133e26f4a8ffd0bda22da0fc2e0a31c77c6d49ce89f8f0fae802c9ae2041a7db60a2a4e
7
- data.tar.gz: 238ed44ac0c0a178a64743846550639c626d3a300a79f05fedb016e71a890f11025b52d260ce2f46f00d68b1c4b9c13c48a8be8712f737bb2b57b0274b174b8b
6
+ metadata.gz: 288691e9ee650af3b9dfe5b47f1a3d05ff952292958ef733a5a5f47aeeaa73e6e1b7b0591afe3549b6cbabf04817bf406d3ca2ee55faf65b7e3816dfe52be644
7
+ data.tar.gz: 7f45baecc36f00310f868a596ca60bce709078bdc51a5f9438a916cd42062bb93dacadafb64e90e42276d0633914c978045fed4b597ea7411c8762761be6bd36
@@ -26,33 +26,44 @@ def read_text(filename)
26
26
  end
27
27
  end
28
28
 
29
- def align_mdoc(source_annotations, target_annotations)
29
+ def align_denotations(denotations, source_text, alignment, debug = false)
30
+ alignment.align(source_text, denotations)
31
+ new_denotations = alignment.transform_hdenotations(denotations)
32
+
33
+ if debug
34
+ warn "[block alignment]"
35
+ warn alignment.alignment_show
36
+ warn "-----"
37
+ end
38
+
39
+ lost_annotations = alignment.lost_annotations
40
+ unless lost_annotations.empty?
41
+ warn "\n[lost annotations] #{lost_annotations.length}"
42
+ lost_annotations.each do |a|
43
+ warn "#{a}"
44
+ end
45
+ warn "====="
46
+ end
47
+ warn
48
+
49
+ # return target annotations
50
+ new_denotations
51
+ end
52
+
53
+ def align_mannotations(source_annotations, reference_text, alignment, debug = false)
54
+ target_annotations = {text:reference_text}
55
+
30
56
  idnum_denotations = 0
31
57
  idnum_relations = 0
32
58
  idnum_attributes = 0
33
59
  idnum_modifications = 0
34
60
 
35
- source_annotations.each do |annotations|
36
- alignment = TextAlignment::TextAlignment.new(annotations[:text], target_annotations[:text])
37
-
38
- puts alignment.alignment_show
39
- puts "-----"
40
- puts
41
-
42
- # alignment.block_alignments.each do |a|
43
- # p {source:a[:source], target:a[:target]}
44
- # puts "--"
45
- # p a[:alignment] if a[:alignment].nil? || a[:alignment] == :empty
46
- # puts "--"
47
- # puts annotations[:text][a[:source][:begin] ... a[:source][:end]]
48
- # puts "--"
49
- # puts target_text[a[:target][:begin] ... a[:target][:end]]
50
- # puts "======"
51
- # end
52
-
61
+ source_annotations.each_with_index do |annotations, i|
53
62
  if annotations.has_key?(:denotations) && !annotations[:denotations].empty?
54
63
  ididx = {}
55
- denotations = alignment.transform_hdenotations(annotations[:denotations])
64
+ warn "[#{i}]-=-=-=-=-"
65
+ denotations = align_denotations(annotations[:denotations], annotations[:text], alignment, debug)
66
+
56
67
  denotations.each do |d|
57
68
  reid = 'T' + (idnum_denotations += 1).to_s
58
69
  ididx[d[:id]] = reid
@@ -99,128 +110,16 @@ unless ARGV.length == 2
99
110
  end
100
111
 
101
112
  source_annotations = read_annotations(ARGV[0])
102
- target_text = read_text(ARGV[1])
113
+ reference_text = read_text(ARGV[1])
103
114
 
104
- lost_annotations = []
105
- target_annotations = if source_annotations.class == Array
106
- align_mdoc(source_annotations, {text: target_text})
107
- else
108
- alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text, source_annotations[:denotations])
109
-
110
- # verification
111
- # source_text = source_annotations[:text]
112
- # puts "=====BEGIN"
113
- # (0 ... source_text.rstrip.length).each do |p|
114
- # t = alignment.transform_begin_position(p)
115
- # if t.nil?
116
- # print source_text[p]
117
- # else
118
- # print '.'
119
- # end
120
- # end
121
- # puts
122
- # puts "=====END"
123
-
124
- # puts "=====BEGIN"
125
- # (0 .. source_text.rstrip.length).each do |p|
126
- # t = alignment.transform_end_position(p)
127
- # if t.nil?
128
- # print source_text[p]
129
- # else
130
- # print '.'
131
- # end
132
- # end
133
- # puts
134
- # puts "=====END"
135
-
136
- source_text = source_annotations[:text]
137
-
138
- puts "[block alignment]"
139
- puts alignment.alignment_show
140
- puts "====="
141
- # exit
142
-
143
- denotations = alignment.transform_hdenotations(source_annotations[:denotations])
144
- lost_annotations += alignment.lost_annotations if alignment.lost_annotations
145
-
146
- source_annotations.merge({text:target_text, denotations:denotations})
147
- end
115
+ alignment = TextAlignment::TextAlignment.new(reference_text, true)
148
116
 
149
- num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source = if source_annotations.class == Array
150
- num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source = 0, 0, 0, 0
151
- source_annotations.each do |annotations|
152
- num_denotations_source += annotations[:denotations].nil? ? 0 : annotations[:denotations].length
153
- num_relations_source += annotations[:relations].nil? ? 0 : annotations[:relations].length
154
- num_attributes_source += annotations[:attributes].nil? ? 0 : annotations[:attributes].length
155
- num_modifications_source += annotations[:modifications].nil? ? 0 : annotations[:modifications].length
156
- end
157
- [num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source]
117
+ target_annotations = if source_annotations.class == Array
118
+ align_mannotations(source_annotations, reference_text, alignment, false)
158
119
  else
159
- num_denotations_source = source_annotations[:denotations].nil? ? 0 : source_annotations[:denotations].length
160
- num_relations_source = source_annotations[:relations].nil? ? 0 : source_annotations[:relations].length
161
- num_attributes_source = source_annotations[:attributes].nil? ? 0 : source_annotations[:attributes].length
162
- num_modifications_source = source_annotations[:modifications].nil? ? 0 : source_annotations[:modifications].length
163
- [num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source]
164
- end
165
-
166
- warn "[source]"
167
- warn "denotations:\t#{num_denotations_source}"
168
- # warn "relations:\t#{num_relations_source}"
169
- # warn "attributes:\t#{num_attributes_source}"
170
- # warn "modifications:\t#{num_modifications_source}"
171
-
172
- warn "\n[target]"
173
- warn "denotations:\t#{target_annotations[:denotations].nil? ? 0 : target_annotations[:denotations].length}"
174
- # warn "relations:\t#{target_annotations[:relations].nil? ? 0 : target_annotations[:relations].length}"
175
- # warn "attributes:\t#{target_annotations[:attributes].nil? ? 0 : target_annotations[:attributes].length}"
176
- # warn "modifications:\t#{target_annotations[:modifications].nil? ? 0 : target_annotations[:modifications].length}"
177
-
178
- if lost_annotations
179
- warn "\n[lost annotations]"
180
- lost_annotations.each do |a|
181
- p a
182
- end
120
+ denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment)
121
+ # denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], reference_text, true)
122
+ source_annotations.merge({text:reference_text, denotations:denotations})
183
123
  end
184
124
 
185
- #puts target_annotations.to_json
186
-
187
- # denotations = anns1[:denotations]
188
-
189
- # puts "[Alignment1]====="
190
- # align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::MAPPINGS)
191
-
192
- # align.alignment.each do |a|
193
- # p [a[:target][:begin], a[:target][:end], a[:source][:begin], a[:source][:end]]
194
- # end
195
-
196
- # puts TextAlignment::sdiff2cdiff(align.sdiff)
197
- # puts
198
- # puts "[Similarity]\n#{align.similarity}"
199
- # puts
200
- # puts '[Denotations original]'
201
- # pp denotations
202
- # puts
203
- # puts '[Denotations transformed]'
204
- # new_denotations = align.transform_hdenotations(denotations)
205
- # pp new_denotations
206
- # puts
207
- # puts "[Alignment2 (downcased)]====="
208
- # align = TextAlignment::TextAlignment.new(str1.downcase, str2.downcase, TextAlignment::MAPPINGS)
209
- # puts TextAlignment::sdiff2cdiff(align.sdiff)
210
- # puts
211
- # puts "[Similarity]\n#{align.similarity}"
212
- # puts
213
- # puts '[Denotations original]'
214
- # pp denotations
215
- # puts
216
- # puts '[Denotations transformed]'
217
- # new_denotations = align.transform_hdenotations(denotations)
218
- # pp new_denotations
219
- # puts
220
- # puts '[Annotations transformed]'
221
- # anns2[:denotations] = new_denotations
222
- # puts anns2.to_json
223
-
224
- # p align.common_elements
225
- # puts "---------------"
226
- # p align.mapped_elements
125
+ puts target_annotations.to_json
@@ -6,76 +6,65 @@ module TextAlignment; end unless defined? TextAlignment
6
6
 
7
7
  class TextAlignment::AnchorFinder
8
8
 
9
- def initialize(source_str, target_str, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
10
- @size_ngram = _size_ngram || TextAlignment::SIZE_NGRAM
11
- @size_window = _size_window || TextAlignment::SIZE_WINDOW
12
- @sim_threshold = _text_similiarity_threshold || TextAlignment::TEXT_SIMILARITY_THRESHOLD
13
-
14
- @reverse = (target_str.length < source_str.length)
15
-
16
- @s1, @s2 = if @reverse
9
+ def initialize(source_str, target_str, cultivation_map)
10
+ @s1, @s2 = if reverse?(source_str, target_str)
17
11
  [target_str.downcase, source_str.downcase]
18
12
  else
19
13
  [source_str.downcase, target_str.downcase]
20
14
  end
21
15
 
22
- # current position in s1
23
- @beg_s1 = 0
24
- @end_s1_prev = 0
25
- @end_s2_prev = 0
26
- end
27
-
28
- def get_next_anchor
29
- # find the position of an anchor ngram in s1 and s2
30
- while @beg_s1 < (@s1.length - @size_ngram)
31
- if [' ', "\n", "\t"].include? @s1[@beg_s1]
32
- @beg_s1 += 1
33
- next
34
- end
35
- anchor = @s1[@beg_s1, @size_ngram]
16
+ @cultivation_map = cultivation_map
36
17
 
37
- # search_position = 0
38
- search_position = @end_s2_prev
39
- while @beg_s2 = @s2.index(anchor, search_position)
40
- # if both the begining points are sufficiantly close to the end points of the last match
41
- break if @beg_s1 > 0 && @beg_s2 > 0 && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 >= @end_s2_prev) && (@beg_s2 - @end_s2_prev < 5)
18
+ @size_ngram = TextAlignment::SIZE_NGRAM
19
+ @size_window = TextAlignment::SIZE_WINDOW
20
+ @sim_threshold = TextAlignment::TEXT_SIMILARITY_THRESHOLD
21
+ @pos_s1_final_possible_begin = @s1.length - @size_ngram - 1
42
22
 
43
- left_window_s1, left_window_s2 = get_left_windows
44
- break if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > @sim_threshold)
23
+ # positions of last match
24
+ @pos_s1_last_match = 0
25
+ @pos_s2_last_match = 0
26
+ end
45
27
 
46
- right_window_s1, right_window_s2 = get_right_windows
47
- break if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > @sim_threshold)
28
+ def reverse?(source_str = nil, target_str = nil)
29
+ unless source_str.nil?
30
+ @reverse_p = target_str.length < source_str.length
31
+ end
32
+ @reverse_p
33
+ end
48
34
 
49
- search_position = @beg_s2 + 1
50
- end
35
+ def get_next_anchor
36
+ # To find the beginning positions of an anchor ngram in s1 and s2, beginning from the last positions matched
37
+ beg_s2 = for beg_s1 in @pos_s1_last_match .. @pos_s1_final_possible_begin
51
38
 
52
- break unless @beg_s2.nil?
39
+ # To skip whitespace letters
40
+ next if [' ', "\n", "\t"].include? @s1[beg_s1]
53
41
 
54
- @beg_s1 += 1
42
+ _beg_s2 = get_beg_s2(beg_s1)
43
+ break _beg_s2 unless _beg_s2.nil?
55
44
  end
56
45
 
57
- return nil if @beg_s1 >= (@s1.length - @size_ngram)
46
+ # To return nil when it fails to find an anchor
47
+ return nil if beg_s2.class == Range
58
48
 
59
- # extend the block
60
- b1 = @beg_s1
61
- b2 = @beg_s2
62
- while b1 >= @end_s1_prev && b2 >= @end_s2_prev && @s1[b1] == @s2[b2]
49
+ # To extend the block to the left
50
+ b1 = beg_s1
51
+ b2 = beg_s2
52
+ while b1 >= @pos_s1_last_match && b2 >= @pos_s2_last_match && @s1[b1] == @s2[b2]
63
53
  b1 -= 1; b2 -= 1
64
54
  end
65
-
66
55
  b1 += 1; b2 += 1
67
56
 
68
- e1 = @beg_s1 + @size_ngram
69
- e2 = @beg_s2 + @size_ngram
57
+ # To extend the block to the right
58
+ e1 = beg_s1 + @size_ngram
59
+ e2 = beg_s2 + @size_ngram
70
60
  while @s1[e1] && @s1[e1] == @s2[e2]
71
61
  e1 += 1; e2 += 1
72
62
  end
73
63
 
74
- @end_s1_prev = e1
75
- @end_s2_prev = e2
76
- @beg_s1 = e1
64
+ @pos_s1_last_match = e1
65
+ @pos_s2_last_match = e2
77
66
 
78
- if @reverse
67
+ if reverse?
79
68
  {source:{begin:b2 , end:e2}, target:{begin:b1, end:e1}}
80
69
  else
81
70
  {source:{begin:b1 , end:e1}, target:{begin:b2, end:e2}}
@@ -84,14 +73,92 @@ class TextAlignment::AnchorFinder
84
73
 
85
74
  private
86
75
 
87
- def get_left_windows
88
- # commend below with the assumption that the beginning of a document gives a significant locational information
89
- # return if @beg_s1 < @size_window || @beg_s2 < @size_window
76
+ def get_beg_s2(beg_s1)
77
+ # to get the anchor to search for in s2
78
+ anchor = @s1[beg_s1, @size_ngram]
79
+
80
+ # comment out below with the assumption that texts are in the same order
81
+ # search_position = 0
82
+ search_position = @pos_s2_last_match
83
+
84
+ beg_s2_candidates = find_beg_s2_candidates(anchor, search_position)
85
+ return nil if beg_s2_candidates.empty?
86
+
87
+ find_valid_beg_s2(beg_s1, beg_s2_candidates)
88
+ end
89
+
90
+ # To find beg_s2 which match to the anchor
91
+ # returns an empty list if the anchor is too frequent
92
+ def find_beg_s2_candidates(anchor, search_position)
93
+ candidates = []
94
+ while _beg_s2 = @s2.index(anchor, search_position)
95
+ search_again_position = @cultivation_map.search_again_position(_beg_s2)
96
+ unless search_again_position.nil?
97
+ search_position = search_again_position
98
+ next
99
+ end
100
+
101
+ candidates << _beg_s2
102
+
103
+ # for speed, skip anchor of high frequency
104
+ if candidates.length > 5
105
+ candidates.clear
106
+ break
107
+ end
108
+
109
+ search_position = _beg_s2 + 1
110
+ end
111
+ candidates
112
+ end
113
+
114
+ def find_valid_beg_s2(beg_s1, beg_s2_candidates)
115
+ valid_beg_s2 = nil
116
+
117
+ (10 .. 30).step(10).each do |size_window|
118
+ valid_beg_s2 = nil
119
+
120
+ r = beg_s2_candidates.each do |beg_s2|
121
+ # if both the beginning points are sufficiently close to the end points of the last match
122
+ # break if beg_s1 > 0 && beg_s2 > 0 && (beg_s1 - @pos_s1_last_match < 5) && (beg_s2 >= @pos_s2_last_match) && (beg_s2 - @pos_s2_last_match < 5)
123
+ if beg_s1 > 0 && beg_s2 > 0 && (beg_s1 - @pos_s1_last_match < 5) && (beg_s2 - @pos_s2_last_match < 5)
124
+ break unless valid_beg_s2.nil?
125
+ valid_beg_s2 = beg_s2
126
+ next
127
+ end
128
+
129
+ left_window_s1, left_window_s2 = get_left_windows(beg_s1, beg_s2)
130
+ if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > @sim_threshold)
131
+ break unless valid_beg_s2.nil?
132
+ valid_beg_s2 = beg_s2
133
+ next
134
+ end
135
+
136
+ right_window_s1, right_window_s2 = get_right_windows(beg_s1, beg_s2)
137
+ if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > @sim_threshold)
138
+ break unless valid_beg_s2.nil?
139
+ valid_beg_s2 = beg_s2
140
+ next
141
+ end
142
+ end
143
+
144
+ # r == nil means that the inner loop was broken (multiple candidates had passed the tests)
145
+ # r != nil means that the inner loop was completed (with or w/o a valid beg_s2 found)
146
+ break unless r.nil?
147
+ end
148
+
149
+ valid_beg_s2
150
+ end
151
+
152
+ def get_left_windows(beg_s1, beg_s2, size_window = nil)
153
+ size_window ||= @size_window
154
+
155
+ # comment out below with the assumption that the beginning of a document gives a significant locational information
156
+ # return if @beg_s1 < size_window || @beg_s2 < size_window
90
157
 
91
158
  window_s1 = ''
92
- loc = @beg_s1 - 1
159
+ loc = beg_s1 - 1
93
160
  count = 0
94
- while count < @size_window && loc >= 0
161
+ while count < size_window && loc >= 0
95
162
  if @s1[loc] =~ /[0-9a-zA-Z]/
96
163
  window_s1 += @s1[loc]
97
164
  count += 1
@@ -100,9 +167,9 @@ class TextAlignment::AnchorFinder
100
167
  end
101
168
 
102
169
  window_s2 = ''
103
- loc = @beg_s2 - 1
170
+ loc = beg_s2 - 1
104
171
  count = 0
105
- while count < @size_window && loc >= 0
172
+ while count < size_window && loc >= 0
106
173
  if @s2[loc] =~ /[0-9a-zA-Z]/
107
174
  window_s2 += @s2[loc]
108
175
  count += 1
@@ -113,15 +180,17 @@ class TextAlignment::AnchorFinder
113
180
  [window_s1, window_s2]
114
181
  end
115
182
 
116
- def get_right_windows
183
+ def get_right_windows(beg_s1, beg_s2, size_window = nil)
184
+ size_window ||= @size_window
185
+
117
186
  # comment out below with the assumption that the end of a document gives significant locational information
118
- # return if (@beg_s1 + @size_ngram > (@s1.length - @size_window)) || (@beg_s2 + @size_ngram > (@s2.length - @size_window))
187
+ # return if (@beg_s1 + @size_ngram > (@s1.length - size_window)) || (@beg_s2 + @size_ngram > (@s2.length - size_window))
119
188
 
120
189
  window_s1 = ''
121
- loc = @beg_s1 + @size_ngram
190
+ loc = beg_s1 + @size_ngram
122
191
  len_s1 = @s1.length
123
192
  count = 0
124
- while count < @size_window && loc < len_s1
193
+ while count < size_window && loc < len_s1
125
194
  if @s1[loc] =~ /[0-9a-zA-Z]/
126
195
  window_s1 += @s1[loc]
127
196
  count += 1
@@ -130,10 +199,10 @@ class TextAlignment::AnchorFinder
130
199
  end
131
200
 
132
201
  window_s2 = ''
133
- loc = @beg_s2 + @size_ngram
202
+ loc = beg_s2 + @size_ngram
134
203
  len_s2 = @s2.length
135
204
  count = 0
136
- while count < @size_window && loc < len_s2
205
+ while count < size_window && loc < len_s2
137
206
  if @s2[loc] =~ /[0-9a-zA-Z]/
138
207
  window_s2 += @s2[loc]
139
208
  count += 1
@@ -148,5 +217,4 @@ class TextAlignment::AnchorFinder
148
217
  return 0 if str1.nil? || str2.nil?
149
218
  String::Similarity.cosine(str1, str2, ngram:ngram_order)
150
219
  end
151
-
152
- end
220
+ end