chat_correct 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (42) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +1 -0
  4. data/.travis.yml +4 -0
  5. data/Gemfile +4 -0
  6. data/LICENSE.txt +22 -0
  7. data/README.md +208 -0
  8. data/Rakefile +4 -0
  9. data/chat_correct.gemspec +28 -0
  10. data/lib/chat_correct/capitalization.rb +13 -0
  11. data/lib/chat_correct/combine_multi_word_verbs.rb +51 -0
  12. data/lib/chat_correct/common_verb_mistake.rb +62 -0
  13. data/lib/chat_correct/contraction.rb +103 -0
  14. data/lib/chat_correct/correct.rb +352 -0
  15. data/lib/chat_correct/corrections_hash.rb +204 -0
  16. data/lib/chat_correct/mistake_analyzer.rb +40 -0
  17. data/lib/chat_correct/pluralization.rb +22 -0
  18. data/lib/chat_correct/possessive.rb +25 -0
  19. data/lib/chat_correct/punctuation.rb +17 -0
  20. data/lib/chat_correct/punctuation_masquerading_as_spelling_error.rb +14 -0
  21. data/lib/chat_correct/spelling.rb +20 -0
  22. data/lib/chat_correct/time.rb +14 -0
  23. data/lib/chat_correct/tokenize.rb +164 -0
  24. data/lib/chat_correct/verb.rb +65 -0
  25. data/lib/chat_correct/version.rb +3 -0
  26. data/lib/chat_correct.rb +16 -0
  27. data/spec/chat_correct/capitalization_spec.rb +17 -0
  28. data/spec/chat_correct/combine_multi_word_verbs_spec.rb +39 -0
  29. data/spec/chat_correct/common_verb_mistake_spec.rb +24 -0
  30. data/spec/chat_correct/contraction_spec.rb +259 -0
  31. data/spec/chat_correct/correct_spec.rb +1650 -0
  32. data/spec/chat_correct/mistake_analyzer_spec.rb +99 -0
  33. data/spec/chat_correct/pluralization_spec.rb +31 -0
  34. data/spec/chat_correct/possessive_spec.rb +31 -0
  35. data/spec/chat_correct/punctuation_masquerading_as_spelling_error_spec.rb +24 -0
  36. data/spec/chat_correct/punctuation_spec.rb +21 -0
  37. data/spec/chat_correct/spelling_spec.rb +59 -0
  38. data/spec/chat_correct/time_spec.rb +21 -0
  39. data/spec/chat_correct/tokenize_spec.rb +142 -0
  40. data/spec/chat_correct/verb_spec.rb +60 -0
  41. data/spec/spec_helper.rb +1 -0
  42. metadata +201 -0
@@ -0,0 +1,352 @@
1
require 'engtagger'

module ChatCorrect
  # Compares an original (learner) sentence against its corrected version and
  # labels every output token with the kind of mistake (or non-mistake) it
  # represents. Tokens of the two sentences are first aligned through a
  # sequence of matching "stages" (lowercase/context match, neighbour-match
  # propagation, Levenshtein proximity, pluralization and verb-form checks,
  # trailing punctuation), then the aligned info hashes are handed to
  # ChatCorrect::CorrectionsHash for classification.
  class Correct
    # Mistake categories tallied by #mistake_report.
    TYPES_OF_MISTAKES = ['missing_word', 'unnecessary_word', 'spelling', 'verb', 'punctuation', 'word_order', 'capitalization', 'duplicate_word', 'word_choice', 'pluralization', 'possessive', 'stylistic_choice']
    attr_reader :original_sentence, :corrected_sentence
    # original_sentence: the sentence as originally written
    # corrected_sentence: the corrected form of the same sentence
    def initialize(original_sentence:, corrected_sentence:)
      @original_sentence = original_sentence
      @corrected_sentence = corrected_sentence
    end

    # Runs the full alignment pipeline and returns a hash keyed by output
    # position; each value is { 'token' => ..., 'type' => ... } where 'type'
    # is a label such as 'spelling_mistake', 'verb_mistake_correction' or
    # 'no_mistake'.
    def correct
      # puts "OS: #{original_sentence}"
      # puts "CS: #{corrected_sentence}"
      # puts "OST: #{original_sentence_tokenized}"
      # puts "CST: #{corrected_sentence_tokenized}"
      # puts "OSTag: #{original_sentence_tagged}"
      # puts "CSTag: #{corrected_sentence_tagged}"
      # puts "OSTD: #{original_sentence_tokenized_downcased}"
      # puts "CSTD: #{corrected_sentence_tokenized_downcased}"
      stage_1
      debug
      stage_2
      debug
      iterate_sentences('stage_3')
      debug
      iterate_sentences('stage_4')
      debug
      iterate_sentences('stage_5')
      debug
      iterate_sentences('stage_6')
      debug
      iterate_sentences('stage_7')
      debug
      stage_8
      debug
      prev_next_match_check
      debug
      stage_9
      debug
      correction_hash = ChatCorrect::CorrectionsHash.new(original_sentence_info_hash: original_sentence_info_hash, corrected_sentence_info_hash: corrected_sentence_info_hash).create
      build_corrections_hash(correction_hash)
    end

    # Returns only the mistakes found by #correct, keyed 0..n-1, each entry
    # carrying 'position', 'error_type', 'mistake' and 'correction'.
    # NOTE(review): #correct is re-invoked here (once for the iteration and
    # again for each `correct[key + 1]` lookup) — consider caching its result.
    def mistakes
      mistakes_hash = {}
      correct.each do |key, value|
        # Skip '..._correction' entries and 'no_mistake' entries.
        next if !value['type'].split('_')[-1].eql?('mistake') || value['type'].split('_')[0].eql?('no')
        interim_hash = {}
        interim_hash['position'] = key
        # Re-assemble two-word error types such as 'missing_word'.
        if value['type'].split('_').length > 2
          interim_hash['error_type'] = value['type'].split('_')[0] + '_' + value['type'].split('_')[1]
        else
          interim_hash['error_type'] = value['type'].split('_')[0]
        end
        interim_hash['mistake'] = value['token']
        # The correction, when present, immediately follows the mistake and
        # shares its leading error-type word.
        # NOTE(review): correct[key + 1] is nil when the mistake is the last
        # entry, which would raise NoMethodError — confirm this cannot occur.
        if correct[key + 1]['type'].split('_')[0].eql?(correct[key]['type'].split('_')[0])
          interim_hash['correction'] = correct[key + 1]['token']
        else
          interim_hash['correction'] = ''
        end
        mistakes_hash[mistakes_hash.length] = interim_hash
      end
      mistakes_hash
    end

    # Returns a hash of { mistake_category => count } for every category in
    # TYPES_OF_MISTAKES (zero counts included).
    def mistake_report
      mistake_report_hash = {}
      TYPES_OF_MISTAKES.each do |mistake|
        counter = 0
        mistakes.each do |key, value|
          counter += 1 if value['error_type'].eql?(mistake)
        end
        mistake_report_hash[mistake] = counter
      end
      mistake_report_hash
    end

    # Total number of mistakes detected in the sentence pair.
    def number_of_mistakes
      mistakes.length
    end

    private

    # Converts the raw corrections hash into the public output shape,
    # restoring the real punctuation characters in each token.
    def build_corrections_hash(correction_hash)
      final_hash = {}
      correction_hash.each do |k, v|
        interim_hash = {}
        interim_hash['token'] = reverse_symbols(v.keys[0])
        interim_hash['type'] = v.values[0]
        final_hash[k] = interim_hash
      end
      final_hash
    end

    # Maps the tokenizer's placeholder symbols back to the punctuation they
    # stand in for (quotes, apostrophes, periods, commas).
    def reverse_symbols(txt)
      txt.gsub('∬', '"')
        .gsub('∯', '"')
        .gsub('ƪ', "'")
        .gsub('∫', "'")
        .gsub('∮', "'")
        .gsub('☍', ". ")
        .gsub('☊', ".")
        .gsub('☌', ",")
    end

    # Token array for the original sentence (multi-word verbs combined into
    # single tokens). Memoized.
    def original_sentence_tokenized
      @original_sentence_tokenized ||= ChatCorrect::CombineMultiWordVerbs.new(text: original_sentence).combine
    end

    # Token array for the corrected sentence. Memoized.
    def corrected_sentence_tokenized
      @corrected_sentence_tokenized ||= ChatCorrect::CombineMultiWordVerbs.new(text: corrected_sentence).combine
    end

    # Part-of-speech-tagged tokens of the original sentence.
    # NOTE(review): EngTagger.new runs on every call even though the tagging
    # result itself is memoized.
    def original_sentence_tagged
      tgr = EngTagger.new
      @original_sentence_tagged ||= tgr.add_tags(original_sentence).split
    end

    # Part-of-speech-tagged tokens of the corrected sentence.
    # NOTE(review): same per-call EngTagger.new as original_sentence_tagged.
    def corrected_sentence_tagged
      tgr = EngTagger.new
      @corrected_sentence_tagged ||= tgr.add_tags(corrected_sentence).split
    end

    # Lowercased token array of the original sentence. Memoized.
    def original_sentence_tokenized_downcased
      @original_sentence_tokenized_downcased ||= original_sentence_tokenized.map { |token| token.downcase }
    end

    # Lowercased token array of the corrected sentence. Memoized.
    def corrected_sentence_tokenized_downcased
      @corrected_sentence_tokenized_downcased ||= corrected_sentence_tokenized.map { |token| token.downcase }
    end

    # Per-token metadata hash for the original sentence. Memoized.
    def original_sentence_info_hash
      @original_sentence_info_hash ||= create_sentence_info_hash(original_sentence_tokenized, original_sentence_tokenized_downcased, original_sentence_tagged)
    end

    # Per-token metadata hash for the corrected sentence. Memoized.
    def corrected_sentence_info_hash
      @corrected_sentence_info_hash ||= create_sentence_info_hash(corrected_sentence_tokenized, corrected_sentence_tokenized_downcased, corrected_sentence_tagged)
    end

    # Builds { index => info } where info records the token, its neighbours,
    # POS tag, punctuation/time/duplicate flags and matching bookkeeping.
    # Corrected-sentence tokens are pre-seeded with a 'c<index>' match_id;
    # original tokens acquire a match_id during the matching stages.
    def create_sentence_info_hash(sentence_tokenized, sentence_tokenized_downcased, sentence_tagged)
      sentence_hash = {}
      sentence_tokenized.each_with_index do |token, index|
        sentence_info = {}
        sentence_info['token'] = token
        # Two tokens of context on each side ('ȸ'/'ȹ' mark the boundaries).
        assign_previous_token(sentence_info, index, 1, sentence_tokenized)
        assign_previous_token(sentence_info, index, 2, sentence_tokenized)
        assign_next_token(sentence_info, index, 1, sentence_tokenized)
        assign_next_token(sentence_info, index, 2, sentence_tokenized)
        sentence_info['num_char'] = token.length
        sentence_info['position'] = index
        sentence_info['multiple_words'] = token.include?(' ') ? true : false
        sentence_info['lowercase'] = token.downcase
        sentence_info['match_id'] = 'c' + index.to_s if sentence_tokenized.eql?(corrected_sentence_tokenized)
        # Extract the tag name from EngTagger's '<tag>word</tag>' output.
        sentence_info['pos_tag'] = sentence_tagged[index].to_s.partition('>').first[1..-1]
        sentence_info['punctuation'] = ChatCorrect::Punctuation.new(text: token).is_punctuation?
        sentence_info['duplicates'] = (sentence_tokenized_downcased.count(token.downcase) > 1 ? true : false)
        sentence_info['uid'] = sentence_tokenized.eql?(corrected_sentence_tokenized) ? 'corrected' + index.to_s : 'original' + index.to_s
        sentence_info['matched'] = false
        sentence_info['is_time'] = ChatCorrect::Time.new(text: token).is_time?
        sentence_hash[index] = sentence_info
      end
      sentence_hash
    end

    # Records a match: copies the corrected token's match_id onto the
    # original token and flags the corrected token as matched.
    def write_match_to_info_hash(ks, kc, vc)
      original_sentence_info_hash[ks]['match_id'] = vc['match_id']
      corrected_sentence_info_hash[kc]['matched'] = true
    end

    # Stores the token `lookup` positions before `index`, or the
    # beginning-of-sentence sentinel 'ȸ' when out of range.
    def assign_previous_token(hash, index, lookup, tokenized_array)
      if index - lookup < 0
        hash["prev_word#{lookup}"] = 'ȸ'
      else
        hash["prev_word#{lookup}"] = tokenized_array[index - lookup]
      end
    end

    # Stores the token `lookup` positions after `index`, or the
    # end-of-sentence sentinel 'ȹ' when out of range.
    def assign_next_token(hash, index, lookup, tokenized_array)
      if (index + lookup) > (tokenized_array.length - 1)
        hash["next_word#{lookup}"] = 'ȹ'
      else
        hash["next_word#{lookup}"] = tokenized_array[index + lookup]
      end
    end

    # Late pass: matches any still-unmatched corrected token to an unmatched
    # original token whose neighbouring match_ids agree on both sides.
    def prev_next_match_check
      corrected_sentence_info_hash.each do |kc, vc|
        if !vc['matched']
          prev_match_vc = set_previous_match(kc, corrected_sentence_info_hash)
          next_match_vc = set_next_match(kc, corrected_sentence_info_hash)
          original_sentence_info_hash.each do |ks, vs|
            prev_match_vs = set_previous_match(ks, original_sentence_info_hash)
            next_match_vs = set_next_match(ks, original_sentence_info_hash)
            next if vs['match_id']
            next unless prev_match_vc.eql?(prev_match_vs) && next_match_vc.eql?(next_match_vs)
            original_sentence_info_hash[ks]['match_id'] = vc['match_id']
            corrected_sentence_info_hash[kc]['matched'] = true
          end
        end
      end
    end

    # match_id of the token before `key`, or 'ȸ' at the sentence start.
    def set_previous_match(key, hash)
      if key.eql?(0)
        'ȸ'
      else
        hash[key - 1]['match_id']
      end
    end

    # match_id of the token after `key`, or 'ȹ' at the sentence end.
    def set_next_match(key, hash)
      if key.eql?(hash.length - 1)
        'ȹ'
      else
        hash[key + 1]['match_id']
      end
    end

    # Development aid; all output is commented out so this is a no-op walk
    # over the original-sentence hash.
    def debug
      # puts "++++++++++++++++++++"
      original_sentence_info_hash.each do |k, v|
        # puts 'Key: ' + k.to_s + '; Word: ' + v['token'].to_s + '; Match ID: ' + v['match_id'].to_s
      end
    end

    # Stage 1: match tokens that are equal ignoring case, or that share both
    # immediate neighbours (non-punctuation, non-time) and punctuation class.
    # Duplicated tokens are skipped; matched_id_array prevents a corrected
    # token from being claimed twice.
    def stage_1
      matched_id_array = []
      corrected_sentence_info_hash.each do |kc, vc|
        original_sentence_info_hash.each do |ko, vo|
          if (vc['lowercase'].eql?(vo['lowercase']) ||
              (vc['prev_word1'].eql?(vo['prev_word1']) &&
              vc['next_word1'].eql?(vo['next_word1']) &&
              !vc['is_time'] &&
              !vo['is_time'] &&
              (!ChatCorrect::Punctuation.new(text: vc['prev_word1']).is_punctuation? &&
              !ChatCorrect::Punctuation.new(text: vc['next_word1']).is_punctuation?) &&
              vc['punctuation'].eql?(vo['punctuation']))) &&
              !matched_id_array.include?(vc['match_id'].to_s) &&
              !vo['duplicates'] &&
              !vc['duplicates']

            original_sentence_info_hash[ko]['match_id'] = vc['match_id']
            corrected_sentence_info_hash[kc]['matched'] = true
            matched_id_array << vc['match_id'].to_s
          end
        end
      end
    end

    # Stage 2: match unmatched tokens whose neighbouring match_ids agree;
    # also tags would-be duplicate originals with a 'd<index>' match_id.
    def stage_2
      corrected_sentence_info_hash.each do |kc, vc|
        if !vc['matched']
          prev_match_vc = set_previous_match(kc, corrected_sentence_info_hash)
          next_match_vc = set_next_match(kc, corrected_sentence_info_hash)
          if kc.eql?(corrected_sentence_info_hash.length - 1)
            next_word_vc = 'ȹ'
          else
            next_word_vc = corrected_sentence_info_hash[kc + 1]['token']
          end
          original_sentence_info_hash.each do |ks, vs|
            prev_match_vs = set_previous_match(ks, original_sentence_info_hash)
            next_match_vs = set_next_match(ks, original_sentence_info_hash)
            if ks.eql?(original_sentence_info_hash.length - 1)
              next_word_vs = 'ȹ'
            else
              next_word_vs = original_sentence_info_hash[ks + 1]['token']
            end
            next if vs['match_id']
            if prev_match_vc.eql?(prev_match_vs) && next_match_vc.eql?(next_match_vs)
              original_sentence_info_hash[ks]['match_id'] = vc['match_id']
              corrected_sentence_info_hash[kc]['matched'] = true
            end
            next unless vs['token'].eql?(next_word_vs) && vs['token'] != next_word_vc
            original_sentence_info_hash[ks]['match_id'] = 'd' + ks.to_s
          end
        end
      end
    end

    # Drives stages 3-7: for every unmatched corrected token, tries the named
    # stage method against every original token that has no match_id yet.
    def iterate_sentences(inner_method)
      corrected_sentence_info_hash.each do |kc, vc|
        next if vc['matched']
        original_sentence_info_hash.each do |ks, vs|
          next if !vs['match_id'].to_s.strip.empty?
          send("#{inner_method}", kc, vc, ks, vs)
        end
      end
    end

    # Stage 3: identical tokens sharing at least one immediate neighbour.
    def stage_3(kc, vc, ks, vs)
      return unless vc['token'].eql?(vs['token']) &&
        (vc['prev_word1'].eql?(vs['prev_word1']) || vc['next_word1'].eql?(vs['next_word1'])) &&
        !vc['matched'] && vs['prev_word1'] != 'ȸ'
      write_match_to_info_hash(ks, kc, vc)
    end

    # Stage 4: longer tokens (> 3 chars) within Levenshtein distance 2 —
    # typical spelling mistakes.
    def stage_4(kc, vc, ks, vs)
      return unless vc['token'].length > 3 && vs['token'].length > 3 &&
        Levenshtein.distance(vc['token'], vs['token']) < 3 && !vc['matched']
      write_match_to_info_hash(ks, kc, vc)
    end

    # Stage 5: tokens that differ only by pluralization.
    def stage_5(kc, vc, ks, vs)
      return unless ChatCorrect::Pluralization.new(token_a: vc['token'], token_b: vs['token']).pluralization_error? &&
        !vc['matched']
      write_match_to_info_hash(ks, kc, vc)
    end

    # Stage 6: tokens related by a verb-form error, sharing a neighbour.
    def stage_6(kc, vc, ks, vs)
      return unless ChatCorrect::Verb.new(word: vs['token'], pos: vc['pos_tag'], text: vc['token']).verb_error? &&
        (vc['prev_word1'].eql?(vs['prev_word1']) || vc['next_word1'].eql?(vs['next_word1'])) &&
        !vc['matched'] && !vs['next_word1'].include?(' ')
      write_match_to_info_hash(ks, kc, vc)
    end

    # Stage 7: short-token Levenshtein match with matching first letter.
    def stage_7(kc, vc, ks, vs)
      # Distance between position of words is currently hardcoded to 5,
      # but this is a SWAG and can be adjusted based on testing.
      # The idea is to stop the algoroithm from matching words like 'to'
      # and 'the' that appear very far apart in the sentence and should not be matched.
      return unless vc['token'].length > 1 &&
        vs['token'].length > 1 &&
        Levenshtein.distance(vc['token'], vs['token']) < 3 &&
        vs['token'].to_s[0].eql?(vc['token'].to_s[0]) &&
        (vs['position'].to_i - vc['position'].to_i).abs < 5 &&
        !vc['matched']
      write_match_to_info_hash(ks, kc, vc)
    end

    # Stage 8: matches multi-word-verb tokens to each other, and pairs up
    # sentence-final punctuation tokens.
    def stage_8
      corrected_sentence_info_hash.each do |kc, vc|
        if !vc['matched']
          next_match_vc = set_next_match(kc, corrected_sentence_info_hash)
          original_sentence_info_hash.each do |ks, vs|
            next_match_vs = set_next_match(ks, original_sentence_info_hash)
            next if vs['match_id']
            write_match_to_info_hash(ks, kc, vc) if vs['multiple_words'] && vc['multiple_words'] && !vc['matched']
            write_match_to_info_hash(ks, kc, vc) if next_match_vc.eql?('ȹ') && next_match_vs.eql?('ȹ') && vs['token'].gsub(/[[:punct:]]/, '').eql?('') && vc['token'].gsub(/[[:punct:]]/, '').eql?('') && !vc['matched']
          end
        end
      end
    end

    # Stage 9: any original token still unmatched gets a special 's<index>'
    # match_id (later classified as unnecessary word or stray punctuation).
    def stage_9
      original_sentence_info_hash.each do |k, v|
        next if v['match_id']
        original_sentence_info_hash[k]['match_id'] = 's' + k.to_s
      end
    end
  end
end
@@ -0,0 +1,204 @@
1
module ChatCorrect
  # Walks the aligned original/corrected sentence info hashes produced by
  # ChatCorrect::Correct and builds one ordered hash of
  # { index => { token => mistake_label } } entries covering every token.
  class CorrectionsHash
    # Placeholder symbols the tokenizer substitutes for quote/apostrophe
    # characters; tokens consisting of these count as punctuation.
    PUNCTUATION_SYMBOLS = ['∯', '∬', '∫', '∮']
    attr_reader :original_sentence_info_hash, :corrected_sentence_info_hash
    # Both arguments are the { index => token-info } hashes built by
    # ChatCorrect::Correct after the matching stages have run.
    def initialize(original_sentence_info_hash:, corrected_sentence_info_hash:)
      @original_sentence_info_hash = original_sentence_info_hash
      @corrected_sentence_info_hash = corrected_sentence_info_hash
      @combined_hash = {}
      @final_matched_array = []
    end

    # Builds and returns @combined_hash in three passes:
    # 1. walk both sentences in step (@j = original cursor, @i = corrected
    #    cursor) dispatching on the original token's match_id prefix;
    # 2. sweep up original tokens with an unconsumed 's' match_id as
    #    punctuation / unnecessary-word mistakes;
    # 3. re-label contraction splits/joins (tokens containing the apostrophe
    #    placeholder 'ƪ') as stylistic choices instead of mistakes.
    def create
      @j = 0
      @i = 0
      while @i < corrected_sentence_info_hash.length do
        @correct_info = {}
        @mistake_info = {}
        if @j >= original_sentence_info_hash.length
          # Original sentence exhausted: every remaining corrected token is
          # something the writer left out.
          if corrected_sentence_info_hash[@i]['token'].gsub(/[[:punct:]]/, '').eql?('')
            @correct_info[corrected_sentence_info_hash[@i]['token']] = 'missing_punctuation_mistake'
            @combined_hash[@combined_hash.length] = @correct_info
          else
            @correct_info[corrected_sentence_info_hash[@i]['token']] = 'missing_word_mistake'
            @combined_hash[@combined_hash.length] = @correct_info
          end
          @i +=1
        else
          # match_id prefixes: 'c<i>' = matched to corrected index i,
          # 's<k>' = unmatched original token, 'd<k>' = duplicate word.
          case
          when original_sentence_info_hash[@j]['match_id'].to_s[0].eql?('c') && original_sentence_info_hash[@j]['match_id'].to_s[1..original_sentence_info_hash[@j]['match_id'].to_s.length].eql?(@i.to_s)
            matching_ids_error_analysis(original_sentence_info_hash[@j], corrected_sentence_info_hash[@i])
          when original_sentence_info_hash[@j]['match_id'].to_s[0].eql?('c') && original_sentence_info_hash[@j]['match_id'].to_s[1..original_sentence_info_hash[@j]['match_id'].to_s.length] != @i.to_s
            unmatched_ids_error_analysis
          when original_sentence_info_hash[@j]['match_id'].to_s[0].eql?('s')
            special_error_analysis
          when original_sentence_info_hash[@j]['match_id'].to_s[0].eql?('d')
            duplicate_error_analysis
          end
        end
      end
      # Pass 2: original tokens whose 's' match_id was never consumed above.
      original_sentence_info_hash.each do |k, v|
        if v['match_id'].to_s[0].eql?('s') && !@final_matched_array.include?(v['match_id'].to_s)
          if v['token'].gsub(/[[:punct:]]/, '').eql?('') || PUNCTUATION_SYMBOLS.include?(v['token'])
            @mistake_info = {}
            @mistake_info[v['token']] = 'punctuation_mistake'
            @combined_hash[@combined_hash.length] = @mistake_info
            @final_matched_array << v['match_id'].to_s
          else
            @mistake_info = {}
            @mistake_info[v['token']] = 'unnecessary_word_mistake'
            @combined_hash[@combined_hash.length] = @mistake_info
            @final_matched_array << v['match_id'].to_s
          end
        end
      end
      # Pass 3: entries whose token contains 'ƪ' (a contraction) that were
      # labelled missing/unnecessary/verb mistakes are re-labelled as
      # stylistic choices when the adjacent entries form the expanded or
      # contracted counterpart (e.g. "do not" <-> "donƪt").
      @combined_hash.each do |k, v|
        v.each do |k1, v1|
          next unless k1.include?('ƪ')
          case
          when v1.include?('missing_word_mistake') && @combined_hash[k - 1].to_s.include?('unnecessary_word_mistake') && @combined_hash[k - 2].to_s.include?('unnecessary_word_mistake')
            next if !ChatCorrect::Contraction.new(token_a: @combined_hash[k - 2].key('unnecessary_word_mistake').to_s, token_b: @combined_hash[k - 1].key('unnecessary_word_mistake').to_s, contraction: k1.to_s).contraction?
            @combined_hash[k][k1] = 'stylistic_choice_correction'
            @combined_hash[k - 1][@combined_hash[k - 1].key('unnecessary_word_mistake')] = 'stylistic_choice'
            @combined_hash[k - 2][@combined_hash[k - 2].key('unnecessary_word_mistake')] = 'stylistic_choice'
          when v1.include?('missing_word_mistake') && @combined_hash[k + 1].to_s.include?('unnecessary_word_mistake') && @combined_hash[k + 2].to_s.include?('unnecessary_word_mistake')
            next if !ChatCorrect::Contraction.new(token_a: @combined_hash[k + 2].key('unnecessary_word_mistake').to_s, token_b: @combined_hash[k + 1].key('unnecessary_word_mistake').to_s, contraction: k1.to_s).contraction?
            @combined_hash[k][k1] = 'stylistic_choice_correction'
            @combined_hash[k + 1][@combined_hash[k + 1].key('unnecessary_word_mistake')] = 'stylistic_choice'
            @combined_hash[k + 2][@combined_hash[k + 2].key('unnecessary_word_mistake')] = 'stylistic_choice'
          when v1.include?('unnecessary_word_mistake') && @combined_hash[k + 1].to_s.include?('missing_word_mistake') && @combined_hash[k + 2].to_s.include?('missing_word_mistake')
            next if !ChatCorrect::Contraction.new(token_a: @combined_hash[k + 1].key('missing_word_mistake').to_s, token_b: @combined_hash[k + 2].key('missing_word_mistake').to_s, contraction: k1.to_s).contraction?
            @combined_hash[k][k1] = 'stylistic_choice'
            @combined_hash[k + 1][@combined_hash[k + 1].key('missing_word_mistake')] = 'stylistic_choice_correction'
            @combined_hash[k + 2][@combined_hash[k + 2].key('missing_word_mistake')] = 'stylistic_choice_correction'
          when v1.include?('unnecessary_word_mistake') && @combined_hash[k - 1].to_s.include?('missing_word_mistake') && @combined_hash[k - 2].to_s.include?('missing_word_mistake')
            next if !ChatCorrect::Contraction.new(token_a: @combined_hash[k - 2].key('missing_word_mistake').to_s, token_b: @combined_hash[k - 1].key('missing_word_mistake').to_s, contraction: k1.to_s).contraction?
            @combined_hash[k][k1] = 'stylistic_choice'
            @combined_hash[k - 1][@combined_hash[k - 1].key('missing_word_mistake')] = 'stylistic_choice_correction'
            @combined_hash[k - 2][@combined_hash[k - 2].key('missing_word_mistake')] = 'stylistic_choice_correction'
          when v1.include?('verb_mistake') && @combined_hash[k + 1].to_s.include?('verb_mistake_correction')
            next if !ChatCorrect::Contraction.new(token_a: @combined_hash[k + 1].key('verb_mistake_correction').to_s.split[0].to_s, token_b: @combined_hash[k + 1].key('verb_mistake_correction').to_s.split[1].to_s, contraction: k1.gsub(/ƪ/o, "'").split[0].to_s.gsub(/'/o, 'ƪ')).contraction?
            @combined_hash[k][k1] = 'stylistic_choice'
            @combined_hash[k + 1][@combined_hash[k + 1].key('verb_mistake_correction')] = 'stylistic_choice_correction'
          when v1.include?('verb_mistake_correction') && @combined_hash[k - 1].to_s.include?('verb_mistake')
            next if !ChatCorrect::Contraction.new(token_a: @combined_hash[k - 1].key('verb_mistake').to_s.split[0].to_s, token_b: @combined_hash[k - 1].key('verb_mistake').to_s.split[1].to_s, contraction: k1.gsub(/ƪ/o, "'").split[0].to_s.gsub(/'/o, 'ƪ')).contraction?
            @combined_hash[k][k1] = 'stylistic_choice_correction'
            @combined_hash[k - 1][@combined_hash[k - 1].key('verb_mistake')] = 'stylistic_choice'
          end
        end
      end
      @combined_hash
    end

    private

    # Appends a mistake/correction pair to @combined_hash. When
    # opposite_mistake is nil the correction label is derived from the
    # mistake label ('<x>_mistake' -> '<x>_correction').
    def update_combined_hash(mistake, original, corrected, opposite_mistake)
      opposite_mistake.nil? ? om = "#{mistake.gsub(/_mistake/, '')}_correction" : om = opposite_mistake
      @mistake_info[original] = "#{mistake}"
      @correct_info[corrected] = om
      @combined_hash[@combined_hash.length] = @mistake_info
      @combined_hash[@combined_hash.length] = @correct_info
    end

    # Appends a single mistake entry for the current ORIGINAL token.
    def update_combined_hash_single_mistake_original(mistake)
      @mistake_info[original_sentence_info_hash[@j]['token']] = mistake
      @combined_hash[@combined_hash.length] = @mistake_info
    end

    # Appends a single entry for the current CORRECTED token.
    def update_combined_hash_single_mistake_corrected(mistake)
      @correct_info[corrected_sentence_info_hash[@i]['token']] = mistake
      @combined_hash[@combined_hash.length] = @correct_info
    end

    # Both cursors point at a matched pair: classify the difference between
    # the two tokens (first matching classifier wins) and advance both.
    def matching_ids_error_analysis(original, corrected)
      case
      when ChatCorrect::MistakeAnalyzer.new(original: original, corrected: corrected).no_mistake?
        @correct_info[corrected['token']] = 'no_mistake'
        @combined_hash[@combined_hash.length] = @correct_info
      when ChatCorrect::MistakeAnalyzer.new(original: original, corrected: corrected).verb_mistake?
        update_combined_hash('verb_mistake', original['token'], corrected['token'], nil)
      when ChatCorrect::MistakeAnalyzer.new(original: original, corrected: corrected).capitalization_mistake?
        update_combined_hash('capitalization_mistake', original['token'], corrected['token'], nil)
      when ChatCorrect::Pluralization.new(token_a: corrected['token'], token_b: original['token']).pluralization_error?
        update_combined_hash('pluralization_mistake', original['token'], corrected['token'], nil)
      when ChatCorrect::MistakeAnalyzer.new(original: original, corrected: corrected).spelling_mistake?
        update_combined_hash('spelling_mistake', original['token'], corrected['token'], nil)
      when ChatCorrect::MistakeAnalyzer.new(original: original, corrected: corrected).punctuation_mistake?
        update_combined_hash('punctuation_mistake', original['token'], corrected['token'], nil)
      when ChatCorrect::MistakeAnalyzer.new(original: original, corrected: corrected).unnecessary_word_missing_punctuation_mistake?
        update_combined_hash('unnecessary_word_mistake', original['token'], corrected['token'], 'missing_punctuation_mistake')
      else
        # Nothing more specific matched: the writer used the wrong word.
        update_combined_hash('word_choice_mistake', original['token'], corrected['token'], nil)
      end
      @j +=1
      @i +=1
    end

    # The original token is matched, but to a different corrected position:
    # either a word-order problem (the corrected token's match_id exists
    # somewhere in the original) or a word the original is missing.
    def unmatched_ids_error_analysis
      word_order_counter = 0
      word_order_key = 0
      # Find where (if anywhere) the current corrected token was matched in
      # the original sentence.
      original_sentence_info_hash.each do |ks1, kv1|
        if kv1['match_id'] == corrected_sentence_info_hash[@i]['match_id']
          word_order_counter = 1
          word_order_key = ks1
        end
      end
      if word_order_counter == 1
        if corrected_sentence_info_hash[@i]['token'].downcase == original_sentence_info_hash[word_order_key]['token'].downcase
          update_combined_hash_single_mistake_corrected('word_order_mistake')
        else
          # Same match but different surface form: decide between verb form,
          # pluralization, or a plain missing/unnecessary word pair.
          if ChatCorrect::Verb.new(word: corrected_sentence_info_hash[@i]['token'], pos: 'vb', text: original_sentence_info_hash[word_order_key]['token']).verb_error?
            update_combined_hash_single_mistake_corrected('verb_mistake_correction')
            @mistake_info[original_sentence_info_hash[word_order_key]['token']] = 'verb_mistake'
            @combined_hash[@combined_hash.length] = @mistake_info
          elsif ChatCorrect::Pluralization.new(token_a: corrected_sentence_info_hash[@i]['token'], token_b: original_sentence_info_hash[word_order_key]['token']).pluralization_error?
            update_combined_hash_single_mistake_corrected('pluralization_mistake_correction')
            @mistake_info[original_sentence_info_hash[word_order_key]['token']] = 'pluralization_mistake'
            @combined_hash[@combined_hash.length] = @mistake_info
          else
            update_combined_hash_single_mistake_corrected('missing_word_mistake')
            @mistake_info[original_sentence_info_hash[word_order_key]['token']] = 'unnecessary_word_mistake'
            @combined_hash[@combined_hash.length] = @mistake_info
          end
        end
        @j +=1
      else
        if corrected_sentence_info_hash[@i]['token'].gsub(/[[:punct:]]/, '').eql?('')
          update_combined_hash_single_mistake_corrected('punctuation_mistake')
        else
          if @j != 0
            # A missing apostrophe may show up as two corrected tokens; test
            # whether the previous corrected token + this one form a
            # possessive of the previous original token.
            concatenated_corrected_string = corrected_sentence_info_hash[@i - 1]['token'].to_s + corrected_sentence_info_hash[@i]['token'].to_s
            if ChatCorrect::Possessive.new(token_a: original_sentence_info_hash[@j - 1]['token'], token_b: concatenated_corrected_string).possessive?
              @mistake_info[original_sentence_info_hash[@j - 1]['token']] = 'possessive_mistake'
              @correct_info[concatenated_corrected_string] = 'possessive_mistake_correction'
              # Overwrites the previous entry so the pair stays adjacent.
              @combined_hash[@combined_hash.length - 1] = @mistake_info
              @combined_hash[@combined_hash.length] = @correct_info
            else
              update_combined_hash_single_mistake_corrected('missing_word_mistake')
            end
          else
            update_combined_hash_single_mistake_corrected('missing_word_mistake')
          end
        end
      end
      @i +=1
    end

    # Original token with an 's' match_id: stray punctuation or a word the
    # corrected sentence dropped. Marks the id consumed for pass 2.
    def special_error_analysis
      if original_sentence_info_hash[@j]['token'].gsub(/[[:punct:]]/, '').eql?('') ||
          PUNCTUATION_SYMBOLS.include?(original_sentence_info_hash[@j]['token'])
        update_combined_hash_single_mistake_original('punctuation_mistake')
        @final_matched_array << original_sentence_info_hash[@j]['match_id'].to_s
      else
        update_combined_hash_single_mistake_original('unnecessary_word_mistake')
        @final_matched_array << original_sentence_info_hash[@j]['match_id'].to_s
      end
      @j +=1
    end

    # Original token with a 'd' match_id: repeated word.
    def duplicate_error_analysis
      update_combined_hash_single_mistake_original('duplicate_word_mistake')
      @j +=1
    end
  end
end
@@ -0,0 +1,40 @@
1
module ChatCorrect
  # Classifies the relationship between a matched pair of tokens — one from
  # the original sentence, one from the corrected sentence. Each argument is
  # a token-info hash carrying 'token', 'pos_tag', 'punctuation' and
  # 'multiple_words' entries.
  class MistakeAnalyzer
    attr_reader :original, :corrected

    def initialize(original:, corrected:)
      @original = original
      @corrected = corrected
    end

    # True when the two tokens are identical, i.e. nothing was corrected.
    def no_mistake?
      corrected['token'] == original['token']
    end

    # True when the pair looks like a verb-form error: a known common verb
    # mistake, a multi-word (phrasal) verb on either side, or a conjugation
    # error detected by ChatCorrect::Verb.
    def verb_mistake?
      return true if ChatCorrect::CommonVerbMistake.new(token_a: corrected['token'], token_b: original['token']).exists?
      return true if original['multiple_words'] || corrected['multiple_words']

      ChatCorrect::Verb.new(word: original['token'], pos: corrected['pos_tag'], text: corrected['token']).verb_error?
    end

    # True when the tokens differ only in capitalization.
    def capitalization_mistake?
      checker = ChatCorrect::Capitalization.new(token_a: corrected['token'], token_b: original['token'])
      checker.capitalization_error?
    end

    # True when both tokens are punctuation, or when an apparent spelling
    # error is really a punctuation problem (and not a possessive).
    def punctuation_mistake?
      return true if corrected['punctuation'] && original['punctuation']

      ChatCorrect::Spelling.new(token_a: corrected['token'], token_b: original['token']).spelling_error? &&
        ChatCorrect::PunctuationMasqueradingAsSpellingError.new(token_a: corrected['token'], token_b: original['token']).exists? &&
        !ChatCorrect::Possessive.new(token_a: original['token'], token_b: corrected['token']).possessive?
    end

    # True when the correction replaced a word with punctuation.
    def unnecessary_word_missing_punctuation_mistake?
      corrected['punctuation'] && !original['punctuation']
    end

    # True for a genuine spelling error — one that is not merely a
    # punctuation difference in disguise.
    def spelling_mistake?
      misspelled = ChatCorrect::Spelling.new(token_a: corrected['token'], token_b: original['token']).spelling_error?
      disguised = ChatCorrect::PunctuationMasqueradingAsSpellingError.new(token_a: corrected['token'], token_b: original['token']).exists?
      misspelled && !disguised
    end
  end
end
@@ -0,0 +1,22 @@
1
require 'linguistics'

module ChatCorrect
  # Detects whether two tokens differ only by pluralization, using the
  # Linguistics gem's English inflector.
  class Pluralization
    attr_reader :token_a, :token_b

    # token_a, token_b: the two word strings to compare (order irrelevant —
    # the check is applied in both directions).
    def initialize(token_a:, token_b:)
      @token_a = token_a
      @token_b = token_b
    end

    # Returns true when the plural of either token equals the other.
    # Returns false when the inflector cannot handle the input (rescues
    # StandardError explicitly rather than a bare rescue, so signals and
    # exits are not swallowed). The second plural is only computed when the
    # first comparison does not already decide the result.
    def pluralization_error?
      Linguistics.use(:en)
      token_a.en.plural.eql?(token_b) || token_b.en.plural.eql?(token_a)
    rescue StandardError
      false
    end
  end
end
@@ -0,0 +1,25 @@
1
module ChatCorrect
  # Decides whether two tokens are related by a possessive apostrophe, using
  # the tokenizer's apostrophe placeholders 'ƪ' and '∮' as the marker.
  class Possessive
    attr_reader :token_a, :token_b

    def initialize(token_a:, token_b:)
      @token_a = token_a
      @token_b = token_b
    end

    # True when either token, split at either apostrophe placeholder, lines
    # up with the other token as a possessive form.
    def possessive?
      ['ƪ', '∮'].any? do |mark|
        check_for_possessive(token_a, token_b, mark) ||
          check_for_possessive(token_b, token_a, mark)
      end
    end

    private

    # Mirrors the original operator precedence exactly:
    # (contains mark AND stem equals word_2) OR (stem equals word_1 minus
    # its last character AND the tail is 's' or shorter than 3 chars).
    # NOTE(review): the second clause is not guarded by include?(mark) and
    # compares word_1 against itself rather than word_2 — looks like a
    # precedence slip in the original; behavior preserved, confirm intent.
    def check_for_possessive(word_1, word_2, mark)
      stem = word_1.partition(mark)[0].downcase
      tail = word_1.partition(mark)[2]
      (word_1.include?(mark) && stem.eql?(word_2.downcase)) ||
        (stem.eql?(word_1.downcase[0...-1]) && (tail.eql?('s') || tail.length < 3))
    end
  end
end
@@ -0,0 +1,17 @@
1
module ChatCorrect
  # Identifies tokens that are pure punctuation, including the placeholder
  # symbols the tokenizer substitutes for quotes and apostrophes.
  class Punctuation
    attr_reader :text

    def initialize(text:)
      @text = text
    end

    # True when stripping every POSIX punctuation character leaves nothing,
    # or when the token is one of the tokenizer's placeholder symbols.
    def is_punctuation?
      stripped = text.gsub(/[[:punct:]]/, '')
      stripped.eql?('') || ['∫', '∬', '∯', '∮', 'ƪ'].include?(text)
    end
  end
end