chat_correct 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +1 -0
  4. data/.travis.yml +4 -0
  5. data/Gemfile +4 -0
  6. data/LICENSE.txt +22 -0
  7. data/README.md +208 -0
  8. data/Rakefile +4 -0
  9. data/chat_correct.gemspec +28 -0
  10. data/lib/chat_correct/capitalization.rb +13 -0
  11. data/lib/chat_correct/combine_multi_word_verbs.rb +51 -0
  12. data/lib/chat_correct/common_verb_mistake.rb +62 -0
  13. data/lib/chat_correct/contraction.rb +103 -0
  14. data/lib/chat_correct/correct.rb +352 -0
  15. data/lib/chat_correct/corrections_hash.rb +204 -0
  16. data/lib/chat_correct/mistake_analyzer.rb +40 -0
  17. data/lib/chat_correct/pluralization.rb +22 -0
  18. data/lib/chat_correct/possessive.rb +25 -0
  19. data/lib/chat_correct/punctuation.rb +17 -0
  20. data/lib/chat_correct/punctuation_masquerading_as_spelling_error.rb +14 -0
  21. data/lib/chat_correct/spelling.rb +20 -0
  22. data/lib/chat_correct/time.rb +14 -0
  23. data/lib/chat_correct/tokenize.rb +164 -0
  24. data/lib/chat_correct/verb.rb +65 -0
  25. data/lib/chat_correct/version.rb +3 -0
  26. data/lib/chat_correct.rb +16 -0
  27. data/spec/chat_correct/capitalization_spec.rb +17 -0
  28. data/spec/chat_correct/combine_multi_word_verbs_spec.rb +39 -0
  29. data/spec/chat_correct/common_verb_mistake_spec.rb +24 -0
  30. data/spec/chat_correct/contraction_spec.rb +259 -0
  31. data/spec/chat_correct/correct_spec.rb +1650 -0
  32. data/spec/chat_correct/mistake_analyzer_spec.rb +99 -0
  33. data/spec/chat_correct/pluralization_spec.rb +31 -0
  34. data/spec/chat_correct/possessive_spec.rb +31 -0
  35. data/spec/chat_correct/punctuation_masquerading_as_spelling_error_spec.rb +24 -0
  36. data/spec/chat_correct/punctuation_spec.rb +21 -0
  37. data/spec/chat_correct/spelling_spec.rb +59 -0
  38. data/spec/chat_correct/time_spec.rb +21 -0
  39. data/spec/chat_correct/tokenize_spec.rb +142 -0
  40. data/spec/chat_correct/verb_spec.rb +60 -0
  41. data/spec/spec_helper.rb +1 -0
  42. metadata +201 -0
@@ -0,0 +1,352 @@
1
require 'engtagger'

module ChatCorrect
  # Compares an original sentence against its corrected counterpart, aligns
  # the tokens of the two sentences through a multi-stage matching pipeline,
  # and classifies every difference as a specific type of mistake.
  class Correct
    # Mistake categories that #mistake_report tallies.
    TYPES_OF_MISTAKES = ['missing_word', 'unnecessary_word', 'spelling', 'verb', 'punctuation', 'word_order', 'capitalization', 'duplicate_word', 'word_choice', 'pluralization', 'possessive', 'stylistic_choice']
    attr_reader :original_sentence, :corrected_sentence

    # original_sentence  - the writer's sentence (String)
    # corrected_sentence - the corrected version of the same sentence (String)
    def initialize(original_sentence:, corrected_sentence:)
      @original_sentence = original_sentence
      @corrected_sentence = corrected_sentence
    end

    # Runs the full alignment pipeline and returns a Hash keyed by position:
    #   { 0 => { 'token' => 'Hello', 'type' => 'no_mistake' }, ... }
    def correct
      stage_1
      debug
      stage_2
      debug
      iterate_sentences('stage_3')
      debug
      iterate_sentences('stage_4')
      debug
      iterate_sentences('stage_5')
      debug
      iterate_sentences('stage_6')
      debug
      iterate_sentences('stage_7')
      debug
      stage_8
      debug
      prev_next_match_check
      debug
      stage_9
      debug
      correction_hash = ChatCorrect::CorrectionsHash.new(
        original_sentence_info_hash: original_sentence_info_hash,
        corrected_sentence_info_hash: corrected_sentence_info_hash
      ).create
      build_corrections_hash(correction_hash)
    end

    # Returns a Hash of the detected mistakes keyed by a sequential index:
    #   { 0 => { 'position' => 3, 'error_type' => 'spelling',
    #            'mistake' => 'helo', 'correction' => 'hello' }, ... }
    def mistakes
      # Run the (expensive) pipeline exactly once; the previous implementation
      # called #correct for every lookup inside the loop, re-running all
      # stages each time.
      corrections = correct
      mistakes_hash = {}
      corrections.each do |key, value|
        type_parts = value['type'].split('_')
        next if !type_parts[-1].eql?('mistake') || type_parts[0].eql?('no')
        interim_hash = {}
        interim_hash['position'] = key
        if type_parts.length > 2
          interim_hash['error_type'] = type_parts[0] + '_' + type_parts[1]
        else
          interim_hash['error_type'] = type_parts[0]
        end
        interim_hash['mistake'] = value['token']
        # A mistake's correction immediately follows it and shares the same
        # leading type component. Guard against a trailing mistake with no
        # following entry (the original indexed blindly and could raise
        # NoMethodError on nil).
        next_entry = corrections[key + 1]
        if next_entry && next_entry['type'].split('_')[0].eql?(type_parts[0])
          interim_hash['correction'] = next_entry['token']
        else
          interim_hash['correction'] = ''
        end
        mistakes_hash[mistakes_hash.length] = interim_hash
      end
      mistakes_hash
    end

    # Tallies mistakes by category; returns a Hash with an entry (possibly
    # zero) for every category in TYPES_OF_MISTAKES.
    def mistake_report
      all_mistakes = mistakes # compute once, not once per category
      mistake_report_hash = {}
      TYPES_OF_MISTAKES.each do |mistake|
        counter = 0
        all_mistakes.each do |_key, value|
          counter += 1 if value['error_type'].eql?(mistake)
        end
        mistake_report_hash[mistake] = counter
      end
      mistake_report_hash
    end

    # Total number of detected mistakes.
    def number_of_mistakes
      mistakes.length
    end

    private

    # Converts the internal corrections hash into the public shape,
    # restoring the masked punctuation symbols in each token.
    def build_corrections_hash(correction_hash)
      final_hash = {}
      correction_hash.each do |k, v|
        interim_hash = {}
        interim_hash['token'] = reverse_symbols(v.keys[0])
        interim_hash['type'] = v.values[0]
        final_hash[k] = interim_hash
      end
      final_hash
    end

    # Reverses the symbol masking performed by the tokenizer so tokens read
    # as normal punctuation again.
    def reverse_symbols(txt)
      txt.gsub('∬', '"')
         .gsub('∯', '"')
         .gsub('ƪ', "'")
         .gsub('∫', "'")
         .gsub('∮', "'")
         .gsub('☍', ". ")
         .gsub('☊', ".")
         .gsub('☌', ",")
    end

    def original_sentence_tokenized
      @original_sentence_tokenized ||= ChatCorrect::CombineMultiWordVerbs.new(text: original_sentence).combine
    end

    def corrected_sentence_tokenized
      @corrected_sentence_tokenized ||= ChatCorrect::CombineMultiWordVerbs.new(text: corrected_sentence).combine
    end

    # POS-tagged tokens of the original sentence. The tagger is only built on
    # the first (memoized) call; the original constructed EngTagger.new on
    # every call even when the tagged result was already cached.
    def original_sentence_tagged
      @original_sentence_tagged ||= EngTagger.new.add_tags(original_sentence).split
    end

    def corrected_sentence_tagged
      @corrected_sentence_tagged ||= EngTagger.new.add_tags(corrected_sentence).split
    end

    def original_sentence_tokenized_downcased
      @original_sentence_tokenized_downcased ||= original_sentence_tokenized.map { |token| token.downcase }
    end

    def corrected_sentence_tokenized_downcased
      @corrected_sentence_tokenized_downcased ||= corrected_sentence_tokenized.map { |token| token.downcase }
    end

    def original_sentence_info_hash
      @original_sentence_info_hash ||= create_sentence_info_hash(original_sentence_tokenized, original_sentence_tokenized_downcased, original_sentence_tagged)
    end

    def corrected_sentence_info_hash
      @corrected_sentence_info_hash ||= create_sentence_info_hash(corrected_sentence_tokenized, corrected_sentence_tokenized_downcased, corrected_sentence_tagged)
    end

    # Builds a per-token metadata hash used by the matching stages: token
    # text, neighbours, POS tag, punctuation/duplicate/time flags, etc.
    def create_sentence_info_hash(sentence_tokenized, sentence_tokenized_downcased, sentence_tagged)
      sentence_hash = {}
      # Loop-invariant: whether we are describing the corrected sentence.
      is_corrected = sentence_tokenized.eql?(corrected_sentence_tokenized)
      sentence_tokenized.each_with_index do |token, index|
        sentence_info = {}
        sentence_info['token'] = token
        assign_previous_token(sentence_info, index, 1, sentence_tokenized)
        assign_previous_token(sentence_info, index, 2, sentence_tokenized)
        assign_next_token(sentence_info, index, 1, sentence_tokenized)
        assign_next_token(sentence_info, index, 2, sentence_tokenized)
        sentence_info['num_char'] = token.length
        sentence_info['position'] = index
        sentence_info['multiple_words'] = token.include?(' ')
        sentence_info['lowercase'] = token.downcase
        # Corrected-sentence tokens carry their own match id ("c3");
        # original-sentence tokens receive one when a stage matches them.
        sentence_info['match_id'] = 'c' + index.to_s if is_corrected
        sentence_info['pos_tag'] = sentence_tagged[index].to_s.partition('>').first[1..-1]
        sentence_info['punctuation'] = ChatCorrect::Punctuation.new(text: token).is_punctuation?
        sentence_info['duplicates'] = sentence_tokenized_downcased.count(token.downcase) > 1
        sentence_info['uid'] = is_corrected ? 'corrected' + index.to_s : 'original' + index.to_s
        sentence_info['matched'] = false
        sentence_info['is_time'] = ChatCorrect::Time.new(text: token).is_time?
        sentence_hash[index] = sentence_info
      end
      sentence_hash
    end

    # Records a match: the original token at +ks+ adopts the corrected
    # token's match id and the corrected token at +kc+ is flagged matched.
    def write_match_to_info_hash(ks, kc, vc)
      original_sentence_info_hash[ks]['match_id'] = vc['match_id']
      corrected_sentence_info_hash[kc]['matched'] = true
    end

    # Stores the token +lookup+ positions before +index+ ('ȸ' sentinel when
    # that would fall before the start of the sentence).
    def assign_previous_token(hash, index, lookup, tokenized_array)
      if index - lookup < 0
        hash["prev_word#{lookup}"] = 'ȸ'
      else
        hash["prev_word#{lookup}"] = tokenized_array[index - lookup]
      end
    end

    # Stores the token +lookup+ positions after +index+ ('ȹ' sentinel when
    # that would fall past the end of the sentence).
    def assign_next_token(hash, index, lookup, tokenized_array)
      if (index + lookup) > (tokenized_array.length - 1)
        hash["next_word#{lookup}"] = 'ȹ'
      else
        hash["next_word#{lookup}"] = tokenized_array[index + lookup]
      end
    end

    # Matches any still-unmatched corrected token to an unmatched original
    # token whose neighbours carry the same match ids.
    def prev_next_match_check
      corrected_sentence_info_hash.each do |kc, vc|
        next if vc['matched']
        prev_match_vc = set_previous_match(kc, corrected_sentence_info_hash)
        next_match_vc = set_next_match(kc, corrected_sentence_info_hash)
        original_sentence_info_hash.each do |ks, vs|
          prev_match_vs = set_previous_match(ks, original_sentence_info_hash)
          next_match_vs = set_next_match(ks, original_sentence_info_hash)
          next if vs['match_id']
          next unless prev_match_vc.eql?(prev_match_vs) && next_match_vc.eql?(next_match_vs)
          original_sentence_info_hash[ks]['match_id'] = vc['match_id']
          corrected_sentence_info_hash[kc]['matched'] = true
        end
      end
    end

    # Match id of the entry before +key+, or the start sentinel 'ȸ'.
    def set_previous_match(key, hash)
      key.eql?(0) ? 'ȸ' : hash[key - 1]['match_id']
    end

    # Match id of the entry after +key+, or the end sentinel 'ȹ'.
    def set_next_match(key, hash)
      key.eql?(hash.length - 1) ? 'ȹ' : hash[key + 1]['match_id']
    end

    # Development hook for dumping match state between stages; the output is
    # commented out so this is a no-op in normal use.
    def debug
      # original_sentence_info_hash.each do |k, v|
      #   puts 'Key: ' + k.to_s + '; Word: ' + v['token'].to_s + '; Match ID: ' + v['match_id'].to_s
      # end
    end

    # Stage 1: match tokens that are identical (ignoring case) or that sit in
    # an identical non-punctuation context, skipping duplicates and times so
    # ambiguous tokens are left for later stages.
    def stage_1
      matched_id_array = []
      corrected_sentence_info_hash.each do |kc, vc|
        original_sentence_info_hash.each do |ko, vo|
          if (vc['lowercase'].eql?(vo['lowercase']) ||
              (vc['prev_word1'].eql?(vo['prev_word1']) &&
                vc['next_word1'].eql?(vo['next_word1']) &&
                !vc['is_time'] &&
                !vo['is_time'] &&
                (!ChatCorrect::Punctuation.new(text: vc['prev_word1']).is_punctuation? &&
                  !ChatCorrect::Punctuation.new(text: vc['next_word1']).is_punctuation?) &&
                vc['punctuation'].eql?(vo['punctuation']))) &&
              !matched_id_array.include?(vc['match_id'].to_s) &&
              !vo['duplicates'] &&
              !vc['duplicates']
            original_sentence_info_hash[ko]['match_id'] = vc['match_id']
            corrected_sentence_info_hash[kc]['matched'] = true
            matched_id_array << vc['match_id'].to_s
          end
        end
      end
    end

    # Stage 2: neighbour-based matching; also assigns duplicate ("d") ids
    # when an original token repeats the following corrected token.
    def stage_2
      corrected_sentence_info_hash.each do |kc, vc|
        next if vc['matched']
        prev_match_vc = set_previous_match(kc, corrected_sentence_info_hash)
        next_match_vc = set_next_match(kc, corrected_sentence_info_hash)
        if kc.eql?(corrected_sentence_info_hash.length - 1)
          next_word_vc = 'ȹ'
        else
          next_word_vc = corrected_sentence_info_hash[kc + 1]['token']
        end
        original_sentence_info_hash.each do |ks, vs|
          prev_match_vs = set_previous_match(ks, original_sentence_info_hash)
          next_match_vs = set_next_match(ks, original_sentence_info_hash)
          if ks.eql?(original_sentence_info_hash.length - 1)
            next_word_vs = 'ȹ'
          else
            next_word_vs = original_sentence_info_hash[ks + 1]['token']
          end
          next if vs['match_id']
          if prev_match_vc.eql?(prev_match_vs) && next_match_vc.eql?(next_match_vs)
            original_sentence_info_hash[ks]['match_id'] = vc['match_id']
            corrected_sentence_info_hash[kc]['matched'] = true
          end
          next unless vs['token'].eql?(next_word_vs) && vs['token'] != next_word_vc
          original_sentence_info_hash[ks]['match_id'] = 'd' + ks.to_s
        end
      end
    end

    # Runs one of the stage_3..stage_7 matchers over every unmatched
    # corrected token / unmatched original token pair.
    def iterate_sentences(inner_method)
      corrected_sentence_info_hash.each do |kc, vc|
        next if vc['matched']
        original_sentence_info_hash.each do |ks, vs|
          next if !vs['match_id'].to_s.strip.empty?
          send(inner_method, kc, vc, ks, vs)
        end
      end
    end

    # Stage 3: identical tokens sharing at least one identical neighbour.
    def stage_3(kc, vc, ks, vs)
      return unless vc['token'].eql?(vs['token']) &&
                    (vc['prev_word1'].eql?(vs['prev_word1']) || vc['next_word1'].eql?(vs['next_word1'])) &&
                    !vc['matched'] && vs['prev_word1'] != 'ȸ'
      write_match_to_info_hash(ks, kc, vc)
    end

    # Stage 4: longer tokens within a Levenshtein distance of 2 (likely
    # spelling errors).
    def stage_4(kc, vc, ks, vs)
      return unless vc['token'].length > 3 && vs['token'].length > 3 &&
                    Levenshtein.distance(vc['token'], vs['token']) < 3 && !vc['matched']
      write_match_to_info_hash(ks, kc, vc)
    end

    # Stage 5: singular/plural pairs.
    def stage_5(kc, vc, ks, vs)
      return unless ChatCorrect::Pluralization.new(token_a: vc['token'], token_b: vs['token']).pluralization_error? &&
                    !vc['matched']
      write_match_to_info_hash(ks, kc, vc)
    end

    # Stage 6: verb-conjugation pairs sharing at least one neighbour.
    def stage_6(kc, vc, ks, vs)
      return unless ChatCorrect::Verb.new(word: vs['token'], pos: vc['pos_tag'], text: vc['token']).verb_error? &&
                    (vc['prev_word1'].eql?(vs['prev_word1']) || vc['next_word1'].eql?(vs['next_word1'])) &&
                    !vc['matched'] && !vs['next_word1'].include?(' ')
      write_match_to_info_hash(ks, kc, vc)
    end

    # Stage 7: near matches (same first letter, Levenshtein distance < 3)
    # whose positions are close together.
    # The maximum positional distance is currently hardcoded to 5, but this
    # is a rough guess and can be adjusted based on testing. The idea is to
    # stop the algorithm from matching words like 'to' and 'the' that appear
    # very far apart in the sentence and should not be matched.
    def stage_7(kc, vc, ks, vs)
      return unless vc['token'].length > 1 &&
                    vs['token'].length > 1 &&
                    Levenshtein.distance(vc['token'], vs['token']) < 3 &&
                    vs['token'].to_s[0].eql?(vc['token'].to_s[0]) &&
                    (vs['position'].to_i - vc['position'].to_i).abs < 5 &&
                    !vc['matched']
      write_match_to_info_hash(ks, kc, vc)
    end

    # Stage 8: matches multi-word tokens to each other, and sentence-final
    # punctuation to sentence-final punctuation.
    def stage_8
      corrected_sentence_info_hash.each do |kc, vc|
        next if vc['matched']
        next_match_vc = set_next_match(kc, corrected_sentence_info_hash)
        original_sentence_info_hash.each do |ks, vs|
          next_match_vs = set_next_match(ks, original_sentence_info_hash)
          next if vs['match_id']
          write_match_to_info_hash(ks, kc, vc) if vs['multiple_words'] && vc['multiple_words'] && !vc['matched']
          write_match_to_info_hash(ks, kc, vc) if next_match_vc.eql?('ȹ') && next_match_vs.eql?('ȹ') && vs['token'].gsub(/[[:punct:]]/, '').eql?('') && vc['token'].gsub(/[[:punct:]]/, '').eql?('') && !vc['matched']
        end
      end
    end

    # Stage 9: any original token still unmatched receives a special "s" id
    # so CorrectionsHash reports it as unnecessary/stray punctuation.
    def stage_9
      original_sentence_info_hash.each do |k, v|
        next if v['match_id']
        original_sentence_info_hash[k]['match_id'] = 's' + k.to_s
      end
    end
  end
end
@@ -0,0 +1,204 @@
1
module ChatCorrect
  # Walks the matched token metadata of the original and corrected sentences
  # and produces an ordered hash of { token => mistake_type } entries.
  class CorrectionsHash
    # Masked punctuation characters produced by the tokenizer.
    PUNCTUATION_SYMBOLS = ['∯', '∬', '∫', '∮']
    attr_reader :original_sentence_info_hash, :corrected_sentence_info_hash

    def initialize(original_sentence_info_hash:, corrected_sentence_info_hash:)
      @original_sentence_info_hash = original_sentence_info_hash
      @corrected_sentence_info_hash = corrected_sentence_info_hash
      @combined_hash = {}
      @final_matched_array = []
    end

    # Builds and returns the combined corrections hash. The cursor @i walks
    # the corrected sentence and @j walks the original sentence.
    def create
      @j = 0
      @i = 0
      while @i < corrected_sentence_info_hash.length
        @correct_info = {}
        @mistake_info = {}
        if @j >= original_sentence_info_hash.length
          append_trailing_correction
        else
          dispatch_by_match_id
        end
      end
      append_leftover_originals
      reclassify_contractions
      @combined_hash
    end

    private

    # The original sentence is exhausted: every remaining corrected token is
    # something the writer left out (a word or punctuation).
    def append_trailing_correction
      token = corrected_sentence_info_hash[@i]['token']
      if token.gsub(/[[:punct:]]/, '').eql?('')
        @correct_info[token] = 'missing_punctuation_mistake'
      else
        @correct_info[token] = 'missing_word_mistake'
      end
      @combined_hash[@combined_hash.length] = @correct_info
      @i += 1
    end

    # Routes the current original token to the analysis matching its id:
    # 'c…' ids point at a corrected-sentence position, 's…' marks stray
    # tokens, 'd…' marks duplicates.
    def dispatch_by_match_id
      match_id = original_sentence_info_hash[@j]['match_id'].to_s
      case
      when match_id[0].eql?('c') && match_id[1..-1].eql?(@i.to_s)
        matching_ids_error_analysis(original_sentence_info_hash[@j], corrected_sentence_info_hash[@i])
      when match_id[0].eql?('c') && match_id[1..-1] != @i.to_s
        unmatched_ids_error_analysis
      when match_id[0].eql?('s')
        special_error_analysis
      when match_id[0].eql?('d')
        duplicate_error_analysis
      end
    end

    # Any original token that still carries an unconsumed stray ('s') id is
    # reported as stray punctuation or an unnecessary word.
    def append_leftover_originals
      original_sentence_info_hash.each do |_k, v|
        next unless v['match_id'].to_s[0].eql?('s') && !@final_matched_array.include?(v['match_id'].to_s)
        @mistake_info = {}
        if v['token'].gsub(/[[:punct:]]/, '').eql?('') || PUNCTUATION_SYMBOLS.include?(v['token'])
          @mistake_info[v['token']] = 'punctuation_mistake'
        else
          @mistake_info[v['token']] = 'unnecessary_word_mistake'
        end
        @combined_hash[@combined_hash.length] = @mistake_info
        @final_matched_array << v['match_id'].to_s
      end
    end

    # Second pass: a contraction (token containing the masked apostrophe 'ƪ')
    # that was split into missing/unnecessary-word entries — or paired with a
    # verb mistake — is really a stylistic choice, so the related entries are
    # relabelled.
    def reclassify_contractions
      @combined_hash.each do |k, v|
        v.each do |k1, v1|
          next unless k1.include?('ƪ')
          case
          when v1.include?('missing_word_mistake') && @combined_hash[k - 1].to_s.include?('unnecessary_word_mistake') && @combined_hash[k - 2].to_s.include?('unnecessary_word_mistake')
            next if !ChatCorrect::Contraction.new(token_a: @combined_hash[k - 2].key('unnecessary_word_mistake').to_s, token_b: @combined_hash[k - 1].key('unnecessary_word_mistake').to_s, contraction: k1.to_s).contraction?
            @combined_hash[k][k1] = 'stylistic_choice_correction'
            @combined_hash[k - 1][@combined_hash[k - 1].key('unnecessary_word_mistake')] = 'stylistic_choice'
            @combined_hash[k - 2][@combined_hash[k - 2].key('unnecessary_word_mistake')] = 'stylistic_choice'
          when v1.include?('missing_word_mistake') && @combined_hash[k + 1].to_s.include?('unnecessary_word_mistake') && @combined_hash[k + 2].to_s.include?('unnecessary_word_mistake')
            next if !ChatCorrect::Contraction.new(token_a: @combined_hash[k + 2].key('unnecessary_word_mistake').to_s, token_b: @combined_hash[k + 1].key('unnecessary_word_mistake').to_s, contraction: k1.to_s).contraction?
            @combined_hash[k][k1] = 'stylistic_choice_correction'
            @combined_hash[k + 1][@combined_hash[k + 1].key('unnecessary_word_mistake')] = 'stylistic_choice'
            @combined_hash[k + 2][@combined_hash[k + 2].key('unnecessary_word_mistake')] = 'stylistic_choice'
          when v1.include?('unnecessary_word_mistake') && @combined_hash[k + 1].to_s.include?('missing_word_mistake') && @combined_hash[k + 2].to_s.include?('missing_word_mistake')
            next if !ChatCorrect::Contraction.new(token_a: @combined_hash[k + 1].key('missing_word_mistake').to_s, token_b: @combined_hash[k + 2].key('missing_word_mistake').to_s, contraction: k1.to_s).contraction?
            @combined_hash[k][k1] = 'stylistic_choice'
            @combined_hash[k + 1][@combined_hash[k + 1].key('missing_word_mistake')] = 'stylistic_choice_correction'
            @combined_hash[k + 2][@combined_hash[k + 2].key('missing_word_mistake')] = 'stylistic_choice_correction'
          when v1.include?('unnecessary_word_mistake') && @combined_hash[k - 1].to_s.include?('missing_word_mistake') && @combined_hash[k - 2].to_s.include?('missing_word_mistake')
            next if !ChatCorrect::Contraction.new(token_a: @combined_hash[k - 2].key('missing_word_mistake').to_s, token_b: @combined_hash[k - 1].key('missing_word_mistake').to_s, contraction: k1.to_s).contraction?
            @combined_hash[k][k1] = 'stylistic_choice'
            @combined_hash[k - 1][@combined_hash[k - 1].key('missing_word_mistake')] = 'stylistic_choice_correction'
            @combined_hash[k - 2][@combined_hash[k - 2].key('missing_word_mistake')] = 'stylistic_choice_correction'
          when v1.include?('verb_mistake') && @combined_hash[k + 1].to_s.include?('verb_mistake_correction')
            next if !ChatCorrect::Contraction.new(token_a: @combined_hash[k + 1].key('verb_mistake_correction').to_s.split[0].to_s, token_b: @combined_hash[k + 1].key('verb_mistake_correction').to_s.split[1].to_s, contraction: k1.gsub(/ƪ/o, "'").split[0].to_s.gsub(/'/o, 'ƪ')).contraction?
            @combined_hash[k][k1] = 'stylistic_choice'
            @combined_hash[k + 1][@combined_hash[k + 1].key('verb_mistake_correction')] = 'stylistic_choice_correction'
          when v1.include?('verb_mistake_correction') && @combined_hash[k - 1].to_s.include?('verb_mistake')
            next if !ChatCorrect::Contraction.new(token_a: @combined_hash[k - 1].key('verb_mistake').to_s.split[0].to_s, token_b: @combined_hash[k - 1].key('verb_mistake').to_s.split[1].to_s, contraction: k1.gsub(/ƪ/o, "'").split[0].to_s.gsub(/'/o, 'ƪ')).contraction?
            @combined_hash[k][k1] = 'stylistic_choice_correction'
            @combined_hash[k - 1][@combined_hash[k - 1].key('verb_mistake')] = 'stylistic_choice'
          end
        end
      end
    end

    # Appends a mistake entry followed by its correction entry. When no
    # explicit opposite mistake is given, the correction label is derived
    # from the mistake label ("x_mistake" -> "x_correction").
    def update_combined_hash(mistake, original, corrected, opposite_mistake)
      om = opposite_mistake.nil? ? "#{mistake.gsub(/_mistake/, '')}_correction" : opposite_mistake
      @mistake_info[original] = mistake
      @correct_info[corrected] = om
      @combined_hash[@combined_hash.length] = @mistake_info
      @combined_hash[@combined_hash.length] = @correct_info
    end

    # Appends a single entry for the current original-sentence token.
    def update_combined_hash_single_mistake_original(mistake)
      @mistake_info[original_sentence_info_hash[@j]['token']] = mistake
      @combined_hash[@combined_hash.length] = @mistake_info
    end

    # Appends a single entry for the current corrected-sentence token.
    def update_combined_hash_single_mistake_corrected(mistake)
      @correct_info[corrected_sentence_info_hash[@i]['token']] = mistake
      @combined_hash[@combined_hash.length] = @correct_info
    end

    # Original token @j and corrected token @i were matched: decide what (if
    # anything) changed between them. Advances both cursors.
    def matching_ids_error_analysis(original, corrected)
      analyzer = ChatCorrect::MistakeAnalyzer.new(original: original, corrected: corrected)
      case
      when analyzer.no_mistake?
        @correct_info[corrected['token']] = 'no_mistake'
        @combined_hash[@combined_hash.length] = @correct_info
      when analyzer.verb_mistake?
        update_combined_hash('verb_mistake', original['token'], corrected['token'], nil)
      when analyzer.capitalization_mistake?
        update_combined_hash('capitalization_mistake', original['token'], corrected['token'], nil)
      when ChatCorrect::Pluralization.new(token_a: corrected['token'], token_b: original['token']).pluralization_error?
        update_combined_hash('pluralization_mistake', original['token'], corrected['token'], nil)
      when analyzer.spelling_mistake?
        update_combined_hash('spelling_mistake', original['token'], corrected['token'], nil)
      when analyzer.punctuation_mistake?
        update_combined_hash('punctuation_mistake', original['token'], corrected['token'], nil)
      when analyzer.unnecessary_word_missing_punctuation_mistake?
        update_combined_hash('unnecessary_word_mistake', original['token'], corrected['token'], 'missing_punctuation_mistake')
      else
        update_combined_hash('word_choice_mistake', original['token'], corrected['token'], nil)
      end
      @j += 1
      @i += 1
    end

    # The corrected token @i is matched to some other original position:
    # check for word-order / verb / pluralization problems before falling
    # back to missing-word or possessive classification. Advances @i (and @j
    # when the token was found elsewhere in the original).
    def unmatched_ids_error_analysis
      word_order_counter = 0
      word_order_key = 0
      original_sentence_info_hash.each do |ks1, kv1|
        if kv1['match_id'] == corrected_sentence_info_hash[@i]['match_id']
          word_order_counter = 1
          word_order_key = ks1
        end
      end
      if word_order_counter == 1
        if corrected_sentence_info_hash[@i]['token'].downcase == original_sentence_info_hash[word_order_key]['token'].downcase
          update_combined_hash_single_mistake_corrected('word_order_mistake')
        else
          if ChatCorrect::Verb.new(word: corrected_sentence_info_hash[@i]['token'], pos: 'vb', text: original_sentence_info_hash[word_order_key]['token']).verb_error?
            update_combined_hash_single_mistake_corrected('verb_mistake_correction')
            @mistake_info[original_sentence_info_hash[word_order_key]['token']] = 'verb_mistake'
            @combined_hash[@combined_hash.length] = @mistake_info
          elsif ChatCorrect::Pluralization.new(token_a: corrected_sentence_info_hash[@i]['token'], token_b: original_sentence_info_hash[word_order_key]['token']).pluralization_error?
            update_combined_hash_single_mistake_corrected('pluralization_mistake_correction')
            @mistake_info[original_sentence_info_hash[word_order_key]['token']] = 'pluralization_mistake'
            @combined_hash[@combined_hash.length] = @mistake_info
          else
            update_combined_hash_single_mistake_corrected('missing_word_mistake')
            @mistake_info[original_sentence_info_hash[word_order_key]['token']] = 'unnecessary_word_mistake'
            @combined_hash[@combined_hash.length] = @mistake_info
          end
        end
        @j += 1
      else
        if corrected_sentence_info_hash[@i]['token'].gsub(/[[:punct:]]/, '').eql?('')
          update_combined_hash_single_mistake_corrected('punctuation_mistake')
        else
          if @j != 0
            concatenated_corrected_string = corrected_sentence_info_hash[@i - 1]['token'].to_s + corrected_sentence_info_hash[@i]['token'].to_s
            if ChatCorrect::Possessive.new(token_a: original_sentence_info_hash[@j - 1]['token'], token_b: concatenated_corrected_string).possessive?
              @mistake_info[original_sentence_info_hash[@j - 1]['token']] = 'possessive_mistake'
              @correct_info[concatenated_corrected_string] = 'possessive_mistake_correction'
              # NOTE(review): this deliberately overwrites the previous
              # combined entry — presumably the stand-alone entry for the
              # token now merged into the possessive; confirm against specs.
              @combined_hash[@combined_hash.length - 1] = @mistake_info
              @combined_hash[@combined_hash.length] = @correct_info
            else
              update_combined_hash_single_mistake_corrected('missing_word_mistake')
            end
          else
            update_combined_hash_single_mistake_corrected('missing_word_mistake')
          end
        end
      end
      @i += 1
    end

    # Stray original token ('s' id): classify as stray punctuation or an
    # unnecessary word and remember its id so it is not reported twice.
    def special_error_analysis
      if original_sentence_info_hash[@j]['token'].gsub(/[[:punct:]]/, '').eql?('') ||
         PUNCTUATION_SYMBOLS.include?(original_sentence_info_hash[@j]['token'])
        update_combined_hash_single_mistake_original('punctuation_mistake')
      else
        update_combined_hash_single_mistake_original('unnecessary_word_mistake')
      end
      @final_matched_array << original_sentence_info_hash[@j]['match_id'].to_s
      @j += 1
    end

    # Duplicate original token ('d' id).
    def duplicate_error_analysis
      update_combined_hash_single_mistake_original('duplicate_word_mistake')
      @j += 1
    end
  end
end
@@ -0,0 +1,40 @@
1
module ChatCorrect
  # Given the metadata hashes for one matched original/corrected token pair,
  # answers which category of mistake (if any) the pair represents.
  class MistakeAnalyzer
    attr_reader :original, :corrected

    def initialize(original:, corrected:)
      @original = original
      @corrected = corrected
    end

    # True when the two tokens are exactly identical.
    def no_mistake?
      original['token'].eql?(corrected['token'])
    end

    # True for a known common verb slip, a multi-word (combined verb) token
    # on either side, or a detected conjugation error.
    def verb_mistake?
      return true if ChatCorrect::CommonVerbMistake.new(token_a: corrected['token'], token_b: original['token']).exists?
      return true if original['multiple_words']
      return true if corrected['multiple_words']
      ChatCorrect::Verb.new(word: original['token'], pos: corrected['pos_tag'], text: corrected['token']).verb_error?
    end

    # True when the tokens differ only in capitalization.
    def capitalization_mistake?
      ChatCorrect::Capitalization.new(token_a: corrected['token'], token_b: original['token']).capitalization_error?
    end

    # True when both tokens are punctuation, or when an apparent spelling
    # difference is really a punctuation slip (and not a possessive form).
    def punctuation_mistake?
      return true if corrected['punctuation'] && original['punctuation']
      ChatCorrect::Spelling.new(token_a: corrected['token'], token_b: original['token']).spelling_error? &&
        ChatCorrect::PunctuationMasqueradingAsSpellingError.new(token_a: corrected['token'], token_b: original['token']).exists? &&
        !ChatCorrect::Possessive.new(token_a: original['token'], token_b: corrected['token']).possessive?
    end

    # True when the correction substitutes punctuation for a non-punctuation
    # original token.
    def unnecessary_word_missing_punctuation_mistake?
      corrected['punctuation'] && !original['punctuation']
    end

    # True for a genuine spelling difference that is not masked punctuation.
    def spelling_mistake?
      ChatCorrect::Spelling.new(token_a: corrected['token'], token_b: original['token']).spelling_error? &&
        !ChatCorrect::PunctuationMasqueradingAsSpellingError.new(token_a: corrected['token'], token_b: original['token']).exists?
    end
  end
end
@@ -0,0 +1,22 @@
1
require 'linguistics'

module ChatCorrect
  # Determines whether two tokens are singular/plural forms of one another.
  class Pluralization
    attr_reader :token_a, :token_b

    def initialize(token_a:, token_b:)
      @token_a = token_a
      @token_b = token_b
    end

    # True when pluralizing one token yields the other.
    # Any failure while inflecting (e.g. punctuation-only or otherwise
    # uninflectable tokens) is treated as "not a pluralization error".
    def pluralization_error?
      Linguistics.use(:en)
      plural_of_a = token_a.en.plural
      plural_of_b = token_b.en.plural
      plural_of_a.eql?(token_b) || plural_of_b.eql?(token_a)
    rescue StandardError
      false
    end
  end
end
@@ -0,0 +1,25 @@
1
module ChatCorrect
  # Detects whether two tokens differ only by a possessive marker, using the
  # tokenizer's masked apostrophe characters ('ƪ' and '∮').
  class Possessive
    attr_reader :token_a, :token_b

    def initialize(token_a:, token_b:)
      @token_a = token_a
      @token_b = token_b
    end

    # True when either token looks like the possessive form of the other,
    # checked for both masked apostrophe characters in both directions.
    def possessive?
      ['ƪ', '∮'].any? do |mark|
        check_for_possessive(token_a, token_b, mark) ||
          check_for_possessive(token_b, token_a, mark)
      end
    end

    private

    # NOTE(review): `&&` binds tighter than `||` here, so the second clause
    # can match even when +word_1+ does not contain +mark+ — grouping is
    # preserved exactly as written; confirm intent against the specs.
    def check_for_possessive(word_1, word_2, mark)
      base = word_1.partition(mark)[0].downcase
      remainder = word_1.partition(mark)[2]
      word_1.include?(mark) &&
        base.eql?(word_2.downcase) ||
        (base.eql?(word_1.downcase[0...-1]) &&
          (remainder.eql?('s') || remainder.length < 3))
    end
  end
end
@@ -0,0 +1,17 @@
1
module ChatCorrect
  # Decides whether a token is punctuation, including the masked quote and
  # apostrophe characters produced by the tokenizer.
  class Punctuation
    attr_reader :text

    def initialize(text:)
      @text = text
    end

    # True when stripping POSIX punctuation leaves nothing, or when the token
    # is one of the tokenizer's masked punctuation symbols (which fall
    # outside the [[:punct:]] character class).
    def is_punctuation?
      return true if text.gsub(/[[:punct:]]/, '').eql?('')
      ['∫', '∬', '∯', '∮', 'ƪ'].include?(text)
    end
  end
end