RubyGems - chat_correct - Versions diffs - 0.0.1 - Mend

chat_correct 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (42) hide show

checksums.yaml +7 -0
data/.gitignore +14 -0
data/.rspec +1 -0
data/.travis.yml +4 -0
data/Gemfile +4 -0
data/LICENSE.txt +22 -0
data/README.md +208 -0
data/Rakefile +4 -0
data/chat_correct.gemspec +28 -0
data/lib/chat_correct/capitalization.rb +13 -0
data/lib/chat_correct/combine_multi_word_verbs.rb +51 -0
data/lib/chat_correct/common_verb_mistake.rb +62 -0
data/lib/chat_correct/contraction.rb +103 -0
data/lib/chat_correct/correct.rb +352 -0
data/lib/chat_correct/corrections_hash.rb +204 -0
data/lib/chat_correct/mistake_analyzer.rb +40 -0
data/lib/chat_correct/pluralization.rb +22 -0
data/lib/chat_correct/possessive.rb +25 -0
data/lib/chat_correct/punctuation.rb +17 -0
data/lib/chat_correct/punctuation_masquerading_as_spelling_error.rb +14 -0
data/lib/chat_correct/spelling.rb +20 -0
data/lib/chat_correct/time.rb +14 -0
data/lib/chat_correct/tokenize.rb +164 -0
data/lib/chat_correct/verb.rb +65 -0
data/lib/chat_correct/version.rb +3 -0
data/lib/chat_correct.rb +16 -0
data/spec/chat_correct/capitalization_spec.rb +17 -0
data/spec/chat_correct/combine_multi_word_verbs_spec.rb +39 -0
data/spec/chat_correct/common_verb_mistake_spec.rb +24 -0
data/spec/chat_correct/contraction_spec.rb +259 -0
data/spec/chat_correct/correct_spec.rb +1650 -0
data/spec/chat_correct/mistake_analyzer_spec.rb +99 -0
data/spec/chat_correct/pluralization_spec.rb +31 -0
data/spec/chat_correct/possessive_spec.rb +31 -0
data/spec/chat_correct/punctuation_masquerading_as_spelling_error_spec.rb +24 -0
data/spec/chat_correct/punctuation_spec.rb +21 -0
data/spec/chat_correct/spelling_spec.rb +59 -0
data/spec/chat_correct/time_spec.rb +21 -0
data/spec/chat_correct/tokenize_spec.rb +142 -0
data/spec/chat_correct/verb_spec.rb +60 -0
data/spec/spec_helper.rb +1 -0
metadata +201 -0

data/lib/chat_correct/correct.rb ADDED Viewed

@@ -0,0 +1,352 @@
+require 'engtagger'
+module ChatCorrect
+  class Correct
+    TYPES_OF_MISTAKES = ['missing_word', 'unnecessary_word', 'spelling', 'verb', 'punctuation', 'word_order', 'capitalization', 'duplicate_word', 'word_choice', 'pluralization', 'possessive', 'stylistic_choice']
+    attr_reader :original_sentence, :corrected_sentence
+    def initialize(original_sentence:, corrected_sentence:)
+      @original_sentence = original_sentence
+      @corrected_sentence = corrected_sentence
+    end
+    def correct
+      # puts "OS: #{original_sentence}"
+      # puts "CS: #{corrected_sentence}"
+      # puts "OST: #{original_sentence_tokenized}"
+      # puts "CST: #{corrected_sentence_tokenized}"
+      # puts "OSTag: #{original_sentence_tagged}"
+      # puts "CSTag: #{corrected_sentence_tagged}"
+      # puts "OSTD: #{original_sentence_tokenized_downcased}"
+      # puts "CSTD: #{corrected_sentence_tokenized_downcased}"
+      stage_1
+      debug
+      stage_2
+      debug
+      iterate_sentences('stage_3')
+      debug
+      iterate_sentences('stage_4')
+      debug
+      iterate_sentences('stage_5')
+      debug
+      iterate_sentences('stage_6')
+      debug
+      iterate_sentences('stage_7')
+      debug
+      stage_8
+      debug
+      prev_next_match_check
+      debug
+      stage_9
+      debug
+      correction_hash = ChatCorrect::CorrectionsHash.new(original_sentence_info_hash: original_sentence_info_hash, corrected_sentence_info_hash: corrected_sentence_info_hash).create
+      build_corrections_hash(correction_hash)
+    end
+    def mistakes
+      mistakes_hash = {}
+      correct.each do |key, value|
+        next if !value['type'].split('_')[-1].eql?('mistake') || value['type'].split('_')[0].eql?('no')
+        interim_hash = {}
+        interim_hash['position'] = key
+        if value['type'].split('_').length > 2
+          interim_hash['error_type'] = value['type'].split('_')[0] + '_' + value['type'].split('_')[1]
+        else
+          interim_hash['error_type'] = value['type'].split('_')[0]
+        end
+        interim_hash['mistake'] = value['token']
+        if correct[key + 1]['type'].split('_')[0].eql?(correct[key]['type'].split('_')[0])
+          interim_hash['correction'] = correct[key + 1]['token']
+        else
+          interim_hash['correction'] = ''
+        end
+        mistakes_hash[mistakes_hash.length] = interim_hash
+      end
+      mistakes_hash
+    end
+    def mistake_report
+      mistake_report_hash = {}
+      TYPES_OF_MISTAKES.each do |mistake|
+        counter = 0
+        mistakes.each do |key, value|
+          counter += 1 if value['error_type'].eql?(mistake)
+        end
+        mistake_report_hash[mistake] = counter
+      end
+      mistake_report_hash
+    end
+    def number_of_mistakes
+      mistakes.length
+    end
+    private
+    def build_corrections_hash(correction_hash)
+      final_hash = {}
+      correction_hash.each do |k, v|
+        interim_hash = {}
+        interim_hash['token'] = reverse_symbols(v.keys[0])
+        interim_hash['type'] = v.values[0]
+        final_hash[k] = interim_hash
+      end
+      final_hash
+    end
+    def reverse_symbols(txt)
+      txt.gsub('∬', '"')
+         .gsub('∯', '"')
+         .gsub('ƪ', "'")
+         .gsub('∫', "'")
+         .gsub('∮', "'")
+         .gsub('☍', ". ")
+         .gsub('☊', ".")
+         .gsub('☌', ",")
+    end
+    def original_sentence_tokenized
+      @original_sentence_tokenized ||= ChatCorrect::CombineMultiWordVerbs.new(text: original_sentence).combine
+    end
+    def corrected_sentence_tokenized
+      @corrected_sentence_tokenized ||= ChatCorrect::CombineMultiWordVerbs.new(text: corrected_sentence).combine
+    end
+    def original_sentence_tagged
+      tgr = EngTagger.new
+      @original_sentence_tagged ||= tgr.add_tags(original_sentence).split
+    end
+    def corrected_sentence_tagged
+      tgr = EngTagger.new
+      @corrected_sentence_tagged ||= tgr.add_tags(corrected_sentence).split
+    end
+    def original_sentence_tokenized_downcased
+      @original_sentence_tokenized_downcased ||= original_sentence_tokenized.map { |token| token.downcase }
+    end
+    def corrected_sentence_tokenized_downcased
+      @corrected_sentence_tokenized_downcased ||= corrected_sentence_tokenized.map { |token| token.downcase }
+    end
+    def original_sentence_info_hash
+      @original_sentence_info_hash ||= create_sentence_info_hash(original_sentence_tokenized, original_sentence_tokenized_downcased, original_sentence_tagged)
+    end
+    def corrected_sentence_info_hash
+      @corrected_sentence_info_hash ||= create_sentence_info_hash(corrected_sentence_tokenized, corrected_sentence_tokenized_downcased, corrected_sentence_tagged)
+    end
+    def create_sentence_info_hash(sentence_tokenized, sentence_tokenized_downcased, sentence_tagged)
+      sentence_hash = {}
+      sentence_tokenized.each_with_index do |token, index|
+        sentence_info = {}
+        sentence_info['token'] = token
+        assign_previous_token(sentence_info, index, 1, sentence_tokenized)
+        assign_previous_token(sentence_info, index, 2, sentence_tokenized)
+        assign_next_token(sentence_info, index, 1, sentence_tokenized)
+        assign_next_token(sentence_info, index, 2, sentence_tokenized)
+        sentence_info['num_char'] = token.length
+        sentence_info['position'] = index
+        sentence_info['multiple_words'] = token.include?(' ') ? true : false
+        sentence_info['lowercase'] = token.downcase
+        sentence_info['match_id'] = 'c' + index.to_s if sentence_tokenized.eql?(corrected_sentence_tokenized)
+        sentence_info['pos_tag'] = sentence_tagged[index].to_s.partition('>').first[1..-1]
+        sentence_info['punctuation'] = ChatCorrect::Punctuation.new(text: token).is_punctuation?
+        sentence_info['duplicates'] = (sentence_tokenized_downcased.count(token.downcase) > 1 ? true : false)
+        sentence_info['uid'] = sentence_tokenized.eql?(corrected_sentence_tokenized) ? 'corrected' + index.to_s : 'original' + index.to_s
+        sentence_info['matched'] = false
+        sentence_info['is_time'] = ChatCorrect::Time.new(text: token).is_time?
+        sentence_hash[index] = sentence_info
+      end
+      sentence_hash
+    end
+    def write_match_to_info_hash(ks, kc, vc)
+      original_sentence_info_hash[ks]['match_id'] = vc['match_id']
+      corrected_sentence_info_hash[kc]['matched'] = true
+    end
+    def assign_previous_token(hash, index, lookup, tokenized_array)
+      if index - lookup < 0
+        hash["prev_word#{lookup}"] = 'ȸ'
+      else
+        hash["prev_word#{lookup}"] = tokenized_array[index - lookup]
+      end
+    end
+    def assign_next_token(hash, index, lookup, tokenized_array)
+      if (index + lookup) > (tokenized_array.length - 1)
+        hash["next_word#{lookup}"] = 'ȹ'
+      else
+        hash["next_word#{lookup}"] = tokenized_array[index + lookup]
+      end
+    end
+    # For every corrected token that is still unmatched, looks for original
+    # tokens without a match_id whose neighbouring match_ids (previous and next)
+    # are the same as the corrected token's neighbours, and matches them.
+    def prev_next_match_check
+      corrected_sentence_info_hash.each do |kc, vc|
+        if !vc['matched']
+          # Neighbour ids of the corrected token are read once, before the scan.
+          prev_match_vc = set_previous_match(kc, corrected_sentence_info_hash)
+          next_match_vc = set_next_match(kc, corrected_sentence_info_hash)
+          original_sentence_info_hash.each do |ks, vs|
+            # Neighbour ids of the original token are re-read on every pass, so
+            # they reflect match_ids written earlier in this same scan.
+            prev_match_vs = set_previous_match(ks, original_sentence_info_hash)
+            next_match_vs = set_next_match(ks, original_sentence_info_hash)
+            next if vs['match_id']
+            next unless prev_match_vc.eql?(prev_match_vs) && next_match_vc.eql?(next_match_vs)
+            # The scan does not stop after a hit; later original tokens that also
+            # satisfy the condition receive the same match_id.
+            original_sentence_info_hash[ks]['match_id'] = vc['match_id']
+            corrected_sentence_info_hash[kc]['matched'] = true
+          end
+        end
+      end
+    end
+    def set_previous_match(key, hash)
+      if key.eql?(0)
+        'ȸ'
+      else
+        hash[key - 1]['match_id']
+      end
+    end
+    def set_next_match(key, hash)
+      if key.eql?(hash.length - 1)
+        'ȹ'
+      else
+        hash[key + 1]['match_id']
+      end
+    end
+    # Tracing hook called after each pipeline step in #correct. Every output
+    # statement is commented out, so at present it only iterates over the
+    # original-sentence info hash without producing anything.
+    def debug
+      # puts "++++++++++++++++++++"
+      original_sentence_info_hash.each do |k, v|
+        # puts 'Key: ' + k.to_s + '; Word: ' + v['token'].to_s + '; Match ID: ' + v['match_id'].to_s
+      end
+    end
+    # First matching pass over every corrected/original token pair.
+    #
+    # A pair is matched when
+    #   * the lower-cased tokens are equal, OR both tokens share the same
+    #     previous and next word, neither is a time expression, the corrected
+    #     token's neighbours are not punctuation, and both tokens agree on
+    #     being punctuation or not; AND
+    #   * the corrected token's match_id has not been handed out yet in this
+    #     pass; AND
+    #   * neither token is flagged as a duplicate within its own sentence.
+    def stage_1
+      # match_ids already assigned during this pass (as strings).
+      matched_id_array = []
+      corrected_sentence_info_hash.each do |kc, vc|
+        original_sentence_info_hash.each do |ko, vo|
+          if (vc['lowercase'].eql?(vo['lowercase']) ||
+             (vc['prev_word1'].eql?(vo['prev_word1']) &&
+              vc['next_word1'].eql?(vo['next_word1']) &&
+              !vc['is_time'] &&
+              !vo['is_time'] &&
+              (!ChatCorrect::Punctuation.new(text: vc['prev_word1']).is_punctuation? &&
+               !ChatCorrect::Punctuation.new(text: vc['next_word1']).is_punctuation?) &&
+              vc['punctuation'].eql?(vo['punctuation']))) &&
+            !matched_id_array.include?(vc['match_id'].to_s) &&
+            !vo['duplicates'] &&
+            !vc['duplicates']
+            original_sentence_info_hash[ko]['match_id'] = vc['match_id']
+            corrected_sentence_info_hash[kc]['matched'] = true
+            matched_id_array << vc['match_id'].to_s
+          end
+        end
+      end
+    end
+    # Second matching pass, run for each corrected token still unmatched.
+    #
+    # An original token without a match_id is matched to the corrected token
+    # when both have the same previous and next neighbouring match_ids. In
+    # addition, an original token that is immediately followed by an identical
+    # token (and differs from the word following the corrected token) gets the
+    # match_id "d<position>", which CorrectionsHash treats as a duplicate word.
+    def stage_2
+      corrected_sentence_info_hash.each do |kc, vc|
+        if !vc['matched']
+          prev_match_vc = set_previous_match(kc, corrected_sentence_info_hash)
+            next_match_vc = set_next_match(kc, corrected_sentence_info_hash)
+            # Token following the corrected token ('ȹ' at the end of the sentence).
+            if kc.eql?(corrected_sentence_info_hash.length - 1)
+              next_word_vc = 'ȹ'
+            else
+              next_word_vc = corrected_sentence_info_hash[kc + 1]['token']
+            end
+          original_sentence_info_hash.each do |ks, vs|
+            prev_match_vs = set_previous_match(ks, original_sentence_info_hash)
+            next_match_vs = set_next_match(ks, original_sentence_info_hash)
+            # Token following the original token ('ȹ' at the end of the sentence).
+            if ks.eql?(original_sentence_info_hash.length - 1)
+              next_word_vs = 'ȹ'
+            else
+              next_word_vs = original_sentence_info_hash[ks + 1]['token']
+            end
+            next if vs['match_id']
+            if prev_match_vc.eql?(prev_match_vs) && next_match_vc.eql?(next_match_vs)
+              original_sentence_info_hash[ks]['match_id'] = vc['match_id']
+              corrected_sentence_info_hash[kc]['matched'] = true
+            end
+            # Runs even when the branch above just matched the token, in which
+            # case the duplicate id replaces the match_id written there.
+            next unless vs['token'].eql?(next_word_vs) && vs['token'] != next_word_vc
+            original_sentence_info_hash[ks]['match_id'] = 'd' + ks.to_s
+          end
+        end
+      end
+    end
+    def iterate_sentences(inner_method)
+      corrected_sentence_info_hash.each do |kc, vc|
+        next if vc['matched']
+        original_sentence_info_hash.each do |ks, vs|
+          next if !vs['match_id'].to_s.strip.empty?
+          send("#{inner_method}", kc, vc, ks, vs)
+        end
+      end
+    end
+    def stage_3(kc, vc, ks, vs)
+      return unless vc['token'].eql?(vs['token']) &&
+      (vc['prev_word1'].eql?(vs['prev_word1']) || vc['next_word1'].eql?(vs['next_word1'])) &&
+      !vc['matched'] && vs['prev_word1'] != 'ȸ'
+        write_match_to_info_hash(ks, kc, vc)
+    end
+    def stage_4(kc, vc, ks, vs)
+      return unless vc['token'].length > 3 && vs['token'].length > 3 &&
+      Levenshtein.distance(vc['token'], vs['token']) < 3 && !vc['matched']
+        write_match_to_info_hash(ks, kc, vc)
+    end
+    def stage_5(kc, vc, ks, vs)
+      return unless ChatCorrect::Pluralization.new(token_a: vc['token'], token_b: vs['token']).pluralization_error? &&
+      !vc['matched']
+        write_match_to_info_hash(ks, kc, vc)
+    end
+    def stage_6(kc, vc, ks, vs)
+      return unless ChatCorrect::Verb.new(word: vs['token'], pos: vc['pos_tag'], text: vc['token']).verb_error? &&
+      (vc['prev_word1'].eql?(vs['prev_word1']) || vc['next_word1'].eql?(vs['next_word1'])) &&
+      !vc['matched'] && !vs['next_word1'].include?(' ')
+        write_match_to_info_hash(ks, kc, vc)
+    end
+    def stage_7(kc, vc, ks, vs)
+      # Distance between position of words is currently hardcoded to 5,
+      # but this is a SWAG and can be adjusted based on testing.
+      # The idea is to stop the algoroithm from matching words like 'to'
+      # and 'the' that appear very far apart in the sentence and should not be matched.
+      return unless vc['token'].length > 1 &&
+      vs['token'].length > 1 &&
+      Levenshtein.distance(vc['token'], vs['token']) < 3 &&
+      vs['token'].to_s[0].eql?(vc['token'].to_s[0]) &&
+      (vs['position'].to_i - vc['position'].to_i).abs < 5 &&
+      !vc['matched']
+        write_match_to_info_hash(ks, kc, vc)
+    end
+    # Late matching pass for each corrected token still unmatched. An original
+    # token without a match_id is matched when either
+    #   * both tokens are multi-word tokens, or
+    #   * both are the last token of their sentence and consist solely of
+    #     punctuation characters.
+    def stage_8
+      corrected_sentence_info_hash.each do |kc, vc|
+        if !vc['matched']
+          next_match_vc = set_next_match(kc, corrected_sentence_info_hash)
+          original_sentence_info_hash.each do |ks, vs|
+            next_match_vs = set_next_match(ks, original_sentence_info_hash)
+            next if vs['match_id']
+            # vc['matched'] is re-checked on each line because the first write
+            # may already have matched the corrected token.
+            write_match_to_info_hash(ks, kc, vc) if vs['multiple_words'] && vc['multiple_words'] && !vc['matched']
+            write_match_to_info_hash(ks, kc, vc) if next_match_vc.eql?('ȹ') && next_match_vs.eql?('ȹ') && vs['token'].gsub(/[[:punct:]]/, '').eql?('') && vc['token'].gsub(/[[:punct:]]/, '').eql?('') && !vc['matched']
+          end
+        end
+      end
+    end
+    def stage_9
+      original_sentence_info_hash.each do |k, v|
+        next if v['match_id']
+        original_sentence_info_hash[k]['match_id'] = 's' + k.to_s
+      end
+    end
+  end
+end

data/lib/chat_correct/corrections_hash.rb ADDED Viewed

@@ -0,0 +1,204 @@
+module ChatCorrect
+  class CorrectionsHash
+    PUNCTUATION_SYMBOLS = ['∯', '∬', '∫', '∮']
+    attr_reader :original_sentence_info_hash, :corrected_sentence_info_hash
+    def initialize(original_sentence_info_hash:, corrected_sentence_info_hash:)
+      @original_sentence_info_hash = original_sentence_info_hash
+      @corrected_sentence_info_hash = corrected_sentence_info_hash
+      @combined_hash = {}
+      @final_matched_array = []
+    end
+    # Builds and returns @combined_hash, a Hash of single-pair Hashes
+    # ({ token => label }) keyed by a running integer index.
+    #
+    # Three phases:
+    #   1. Walk the corrected tokens (@i) and original tokens (@j) side by side,
+    #      dispatching on the prefix of the original token's match_id.
+    #   2. Append original tokens with an "s…" match_id that phase 1 did not
+    #      reach.
+    #   3. Relabel entries involving a token that contains 'ƪ' as stylistic
+    #      choices when ChatCorrect::Contraction confirms a contraction.
+    def create
+      @j = 0
+      @i = 0
+      while @i < corrected_sentence_info_hash.length do
+        # Fresh single-pair hashes for this step; the helpers fill and store them.
+        @correct_info = {}
+        @mistake_info = {}
+        if @j >= original_sentence_info_hash.length
+          # Original sentence exhausted: the remaining corrected tokens are missing.
+          if corrected_sentence_info_hash[@i]['token'].gsub(/[[:punct:]]/, '').eql?('')
+            @correct_info[corrected_sentence_info_hash[@i]['token']] = 'missing_punctuation_mistake'
+            @combined_hash[@combined_hash.length] = @correct_info
+          else
+            @correct_info[corrected_sentence_info_hash[@i]['token']] = 'missing_word_mistake'
+            @combined_hash[@combined_hash.length] = @correct_info
+          end
+          @i +=1
+        else
+          # NOTE(review): when the match_id starts with none of 'c', 's' or 'd'
+          # no branch runs and neither index advances; callers are presumably
+          # expected to have given every original token such an id - confirm.
+          case
+          # "c<n>" with n equal to the current corrected index: aligned pair.
+          when original_sentence_info_hash[@j]['match_id'].to_s[0].eql?('c') && original_sentence_info_hash[@j]['match_id'].to_s[1..original_sentence_info_hash[@j]['match_id'].to_s.length].eql?(@i.to_s)
+            matching_ids_error_analysis(original_sentence_info_hash[@j], corrected_sentence_info_hash[@i])
+          # "c<n>" pointing at a different corrected index.
+          when original_sentence_info_hash[@j]['match_id'].to_s[0].eql?('c') && original_sentence_info_hash[@j]['match_id'].to_s[1..original_sentence_info_hash[@j]['match_id'].to_s.length] != @i.to_s
+            unmatched_ids_error_analysis
+          when original_sentence_info_hash[@j]['match_id'].to_s[0].eql?('s')
+            special_error_analysis
+          when original_sentence_info_hash[@j]['match_id'].to_s[0].eql?('d')
+            duplicate_error_analysis
+          end
+        end
+      end
+      # Phase 2: "s…" originals not yet recorded in @final_matched_array.
+      original_sentence_info_hash.each do |k, v|
+        if v['match_id'].to_s[0].eql?('s') && !@final_matched_array.include?(v['match_id'].to_s)
+          if v['token'].gsub(/[[:punct:]]/, '').eql?('') || PUNCTUATION_SYMBOLS.include?(v['token'])
+            @mistake_info = {}
+            @mistake_info[v['token']] = 'punctuation_mistake'
+            @combined_hash[@combined_hash.length] = @mistake_info
+            @final_matched_array << v['match_id'].to_s
+          else
+            @mistake_info = {}
+            @mistake_info[v['token']] = 'unnecessary_word_mistake'
+            @combined_hash[@combined_hash.length] = @mistake_info
+            @final_matched_array << v['match_id'].to_s
+          end
+        end
+      end
+      # Phase 3: only entries whose token contains 'ƪ' are examined. Neighbours
+      # are looked up by index; a missing neighbour is nil, whose to_s is ''
+      # and therefore never satisfies the include? tests.
+      @combined_hash.each do |k, v|
+        v.each do |k1, v1|
+          next unless k1.include?('ƪ')
+          case
+          # Missing 'ƪ' token preceded by two unnecessary words.
+          when v1.include?('missing_word_mistake') && @combined_hash[k - 1].to_s.include?('unnecessary_word_mistake') && @combined_hash[k - 2].to_s.include?('unnecessary_word_mistake')
+            next if !ChatCorrect::Contraction.new(token_a: @combined_hash[k - 2].key('unnecessary_word_mistake').to_s, token_b: @combined_hash[k - 1].key('unnecessary_word_mistake').to_s, contraction: k1.to_s).contraction?
+            @combined_hash[k][k1] = 'stylistic_choice_correction'
+            @combined_hash[k - 1][@combined_hash[k - 1].key('unnecessary_word_mistake')] = 'stylistic_choice'
+            @combined_hash[k - 2][@combined_hash[k - 2].key('unnecessary_word_mistake')] = 'stylistic_choice'
+          # Missing 'ƪ' token followed by two unnecessary words.
+          when v1.include?('missing_word_mistake') && @combined_hash[k + 1].to_s.include?('unnecessary_word_mistake') && @combined_hash[k + 2].to_s.include?('unnecessary_word_mistake')
+            next if !ChatCorrect::Contraction.new(token_a: @combined_hash[k + 2].key('unnecessary_word_mistake').to_s, token_b: @combined_hash[k + 1].key('unnecessary_word_mistake').to_s, contraction: k1.to_s).contraction?
+            @combined_hash[k][k1] = 'stylistic_choice_correction'
+            @combined_hash[k + 1][@combined_hash[k + 1].key('unnecessary_word_mistake')] = 'stylistic_choice'
+            @combined_hash[k + 2][@combined_hash[k + 2].key('unnecessary_word_mistake')] = 'stylistic_choice'
+          # Unnecessary 'ƪ' token followed by two missing words.
+          when v1.include?('unnecessary_word_mistake') && @combined_hash[k + 1].to_s.include?('missing_word_mistake') && @combined_hash[k + 2].to_s.include?('missing_word_mistake')
+            next if !ChatCorrect::Contraction.new(token_a: @combined_hash[k + 1].key('missing_word_mistake').to_s, token_b: @combined_hash[k + 2].key('missing_word_mistake').to_s,  contraction: k1.to_s).contraction?
+            @combined_hash[k][k1] = 'stylistic_choice'
+            @combined_hash[k + 1][@combined_hash[k + 1].key('missing_word_mistake')] = 'stylistic_choice_correction'
+            @combined_hash[k + 2][@combined_hash[k + 2].key('missing_word_mistake')] = 'stylistic_choice_correction'
+          # Unnecessary 'ƪ' token preceded by two missing words.
+          when v1.include?('unnecessary_word_mistake') && @combined_hash[k - 1].to_s.include?('missing_word_mistake') && @combined_hash[k - 2].to_s.include?('missing_word_mistake')
+            next if !ChatCorrect::Contraction.new(token_a: @combined_hash[k - 2].key('missing_word_mistake').to_s, token_b: @combined_hash[k - 1].key('missing_word_mistake').to_s,  contraction: k1.to_s).contraction?
+            @combined_hash[k][k1] = 'stylistic_choice'
+            @combined_hash[k - 1][@combined_hash[k - 1].key('missing_word_mistake')] = 'stylistic_choice_correction'
+            @combined_hash[k - 2][@combined_hash[k - 2].key('missing_word_mistake')] = 'stylistic_choice_correction'
+          # Verb mistake containing 'ƪ' followed by its correction; the first two
+          # words of the correction are checked against the first word of the mistake.
+          when v1.include?('verb_mistake') && @combined_hash[k + 1].to_s.include?('verb_mistake_correction')
+            next if !ChatCorrect::Contraction.new(token_a: @combined_hash[k + 1].key('verb_mistake_correction').to_s.split[0].to_s, token_b: @combined_hash[k + 1].key('verb_mistake_correction').to_s.split[1].to_s,  contraction: k1.gsub(/ƪ/o, "'").split[0].to_s.gsub(/'/o, 'ƪ')).contraction?
+            @combined_hash[k][k1] = 'stylistic_choice'
+            @combined_hash[k + 1][@combined_hash[k + 1].key('verb_mistake_correction')] = 'stylistic_choice_correction'
+          # Verb correction containing 'ƪ' preceded by the verb mistake.
+          when v1.include?('verb_mistake_correction') && @combined_hash[k - 1].to_s.include?('verb_mistake')
+            next if !ChatCorrect::Contraction.new(token_a: @combined_hash[k - 1].key('verb_mistake').to_s.split[0].to_s, token_b: @combined_hash[k - 1].key('verb_mistake').to_s.split[1].to_s,  contraction: k1.gsub(/ƪ/o, "'").split[0].to_s.gsub(/'/o, 'ƪ')).contraction?
+            @combined_hash[k][k1] = 'stylistic_choice_correction'
+            @combined_hash[k - 1][@combined_hash[k - 1].key('verb_mistake')] = 'stylistic_choice'
+          end
+        end
+      end
+      @combined_hash
+    end
+    private
+    def update_combined_hash(mistake, original, corrected, opposite_mistake)
+      opposite_mistake.nil? ? om = "#{mistake.gsub(/_mistake/, '')}_correction" : om = opposite_mistake
+      @mistake_info[original] = "#{mistake}"
+      @correct_info[corrected] = om
+      @combined_hash[@combined_hash.length] = @mistake_info
+      @combined_hash[@combined_hash.length] = @correct_info
+    end
+    def update_combined_hash_single_mistake_original(mistake)
+      @mistake_info[original_sentence_info_hash[@j]['token']] = mistake
+      @combined_hash[@combined_hash.length] = @mistake_info
+    end
+    def update_combined_hash_single_mistake_corrected(mistake)
+      @correct_info[corrected_sentence_info_hash[@i]['token']] = mistake
+      @combined_hash[@combined_hash.length] = @correct_info
+    end
+    def matching_ids_error_analysis(original, corrected)
+      case
+      when ChatCorrect::MistakeAnalyzer.new(original: original, corrected: corrected).no_mistake?
+        @correct_info[corrected['token']] = 'no_mistake'
+        @combined_hash[@combined_hash.length] = @correct_info
+      when ChatCorrect::MistakeAnalyzer.new(original: original, corrected: corrected).verb_mistake?
+        update_combined_hash('verb_mistake', original['token'], corrected['token'], nil)
+      when ChatCorrect::MistakeAnalyzer.new(original: original, corrected: corrected).capitalization_mistake?
+        update_combined_hash('capitalization_mistake', original['token'], corrected['token'], nil)
+      when ChatCorrect::Pluralization.new(token_a: corrected['token'], token_b: original['token']).pluralization_error?
+        update_combined_hash('pluralization_mistake', original['token'], corrected['token'], nil)
+      when ChatCorrect::MistakeAnalyzer.new(original: original, corrected: corrected).spelling_mistake?
+        update_combined_hash('spelling_mistake', original['token'], corrected['token'], nil)
+      when ChatCorrect::MistakeAnalyzer.new(original: original, corrected: corrected).punctuation_mistake?
+        update_combined_hash('punctuation_mistake', original['token'], corrected['token'], nil)
+      when ChatCorrect::MistakeAnalyzer.new(original: original, corrected: corrected).unnecessary_word_missing_punctuation_mistake?
+        update_combined_hash('unnecessary_word_mistake', original['token'], corrected['token'], 'missing_punctuation_mistake')
+      else
+        update_combined_hash('word_choice_mistake', original['token'], corrected['token'], nil)
+      end
+      @j +=1
+      @i +=1
+    end
+    # Handles the case where the current original token (@j) is matched to a
+    # corrected token other than the current one (@i).
+    #
+    # If some original token carries the current corrected token's match_id,
+    # the pair is reported as a word-order, verb, pluralization or
+    # missing/unnecessary-word mistake and both @j and @i advance. Otherwise
+    # the corrected token is reported on its own (punctuation, possessive or
+    # missing word) and only @i advances.
+    def unmatched_ids_error_analysis
+      # word_order_counter is a 0/1 flag; word_order_key is the key of the last
+      # original token whose match_id equals the current corrected token's.
+      word_order_counter = 0
+      word_order_key = 0
+      original_sentence_info_hash.each do |ks1, kv1|
+        if kv1['match_id'] == corrected_sentence_info_hash[@i]['match_id']
+          word_order_counter = 1
+          word_order_key = ks1
+        end
+      end
+      if word_order_counter == 1
+        if corrected_sentence_info_hash[@i]['token'].downcase == original_sentence_info_hash[word_order_key]['token'].downcase
+          # Same word (ignoring case) found elsewhere in the original sentence.
+          update_combined_hash_single_mistake_corrected('word_order_mistake')
+        else
+          if ChatCorrect::Verb.new(word: corrected_sentence_info_hash[@i]['token'], pos: 'vb', text: original_sentence_info_hash[word_order_key]['token']).verb_error?
+            update_combined_hash_single_mistake_corrected('verb_mistake_correction')
+            @mistake_info[original_sentence_info_hash[word_order_key]['token']] = 'verb_mistake'
+            @combined_hash[@combined_hash.length] = @mistake_info
+          elsif ChatCorrect::Pluralization.new(token_a: corrected_sentence_info_hash[@i]['token'], token_b: original_sentence_info_hash[word_order_key]['token']).pluralization_error?
+            update_combined_hash_single_mistake_corrected('pluralization_mistake_correction')
+            @mistake_info[original_sentence_info_hash[word_order_key]['token']] = 'pluralization_mistake'
+            @combined_hash[@combined_hash.length] = @mistake_info
+          else
+            update_combined_hash_single_mistake_corrected('missing_word_mistake')
+            @mistake_info[original_sentence_info_hash[word_order_key]['token']] = 'unnecessary_word_mistake'
+            @combined_hash[@combined_hash.length] = @mistake_info
+          end
+        end
+        @j +=1
+      else
+        if corrected_sentence_info_hash[@i]['token'].gsub(/[[:punct:]]/, '').eql?('')
+          update_combined_hash_single_mistake_corrected('punctuation_mistake')
+        else
+          if @j != 0
+            # Joins the previous and current corrected tokens and checks whether
+            # that is a possessive form of the previous original token.
+            # NOTE(review): with @i == 0 the lookup at @i - 1 returns nil and the
+            # ['token'] call would raise - confirm this cannot occur when @j != 0.
+            concatenated_corrected_string = corrected_sentence_info_hash[@i - 1]['token'].to_s + corrected_sentence_info_hash[@i]['token'].to_s
+            if ChatCorrect::Possessive.new(token_a: original_sentence_info_hash[@j - 1]['token'], token_b: concatenated_corrected_string).possessive?
+              @mistake_info[original_sentence_info_hash[@j - 1]['token']] = 'possessive_mistake'
+              @correct_info[concatenated_corrected_string] = 'possessive_mistake_correction'
+              # Overwrites the most recently appended entry with the possessive
+              # mistake, then appends the correction after it.
+              @combined_hash[@combined_hash.length - 1] = @mistake_info
+              @combined_hash[@combined_hash.length] = @correct_info
+            else
+              update_combined_hash_single_mistake_corrected('missing_word_mistake')
+            end
+          else
+            update_combined_hash_single_mistake_corrected('missing_word_mistake')
+          end
+        end
+      end
+      @i +=1
+    end
+    def special_error_analysis
+      if original_sentence_info_hash[@j]['token'].gsub(/[[:punct:]]/, '').eql?('') ||
+        PUNCTUATION_SYMBOLS.include?(original_sentence_info_hash[@j]['token'])
+        update_combined_hash_single_mistake_original('punctuation_mistake')
+        @final_matched_array << original_sentence_info_hash[@j]['match_id'].to_s
+      else
+        update_combined_hash_single_mistake_original('unnecessary_word_mistake')
+        @final_matched_array << original_sentence_info_hash[@j]['match_id'].to_s
+      end
+      @j +=1
+    end
+    # Handles an original token with a "d…" match_id: reports it as a duplicate
+    # word and advances @j only, leaving the corrected index where it is.
+    def duplicate_error_analysis
+      update_combined_hash_single_mistake_original('duplicate_word_mistake')
+      @j +=1
+    end
+  end
+end

data/lib/chat_correct/mistake_analyzer.rb ADDED Viewed

@@ -0,0 +1,40 @@
+module ChatCorrect
+  class MistakeAnalyzer
+    attr_reader :original, :corrected
+    def initialize(original:, corrected:)
+      @original = original
+      @corrected = corrected
+    end
+    def no_mistake?
+      original['token'].eql?(corrected['token'])
+    end
+    def verb_mistake?
+      ChatCorrect::CommonVerbMistake.new(token_a: corrected['token'], token_b: original['token']).exists? ||
+      original['multiple_words'] ||
+      corrected['multiple_words'] ||
+      ChatCorrect::Verb.new(word: original['token'], pos: corrected['pos_tag'], text: corrected['token']).verb_error?
+    end
+    def capitalization_mistake?
+      ChatCorrect::Capitalization.new(token_a: corrected['token'], token_b: original['token']).capitalization_error?
+    end
+    def punctuation_mistake?
+      (corrected['punctuation'] && original['punctuation']) ||
+      (ChatCorrect::Spelling.new(token_a: corrected['token'], token_b: original['token']).spelling_error? &&
+      ChatCorrect::PunctuationMasqueradingAsSpellingError.new(token_a: corrected['token'], token_b: original['token']).exists? &&
+      !ChatCorrect::Possessive.new(token_a: original['token'], token_b: corrected['token']).possessive?)
+    end
+    def unnecessary_word_missing_punctuation_mistake?
+      corrected['punctuation'] && !original['punctuation']
+    end
+    def spelling_mistake?
+      ChatCorrect::Spelling.new(token_a: corrected['token'], token_b: original['token']).spelling_error? &&
+      !ChatCorrect::PunctuationMasqueradingAsSpellingError.new(token_a: corrected['token'], token_b: original['token']).exists?
+    end
+  end
+end

data/lib/chat_correct/pluralization.rb ADDED Viewed

@@ -0,0 +1,22 @@
+require 'linguistics'
+module ChatCorrect
+  class Pluralization
+    attr_reader :token_a, :token_b
+    def initialize(token_a:, token_b:)
+      @token_a = token_a
+      @token_b = token_b
+    end
+    def pluralization_error?
+      begin
+        Linguistics.use(:en)
+        token_a_plural = token_a.en.plural
+        token_b_plural = token_b.en.plural
+      rescue
+        return false
+      end
+      token_a_plural.eql?(token_b) || token_b_plural.eql?(token_a)
+    end
+  end
+end

data/lib/chat_correct/possessive.rb ADDED Viewed

@@ -0,0 +1,25 @@
+module ChatCorrect
+  class Possessive
+    attr_reader :token_a, :token_b
+    def initialize(token_a:, token_b:)
+      @token_a = token_a
+      @token_b = token_b
+    end
+    def possessive?
+      check_for_possessive(token_a, token_b, "ƪ") ||
+      check_for_possessive(token_b, token_a, "ƪ") ||
+      check_for_possessive(token_a, token_b, "∮") ||
+      check_for_possessive(token_b, token_a, "∮")
+    end
+    private
+    def check_for_possessive(word_1, word_2, mark)
+      word_1.include?(mark) &&
+        word_1.partition(mark)[0].downcase.eql?(word_2.downcase) ||
+        (word_1.partition(mark)[0].downcase.eql?(word_1.downcase[0...-1]) &&
+        (word_1.partition(mark)[2].eql?('s') || word_1.partition(mark)[2].length < 3))
+    end
+  end
+end

data/lib/chat_correct/punctuation.rb ADDED Viewed

@@ -0,0 +1,17 @@
+module ChatCorrect
+  class Punctuation
+    attr_reader :text
+    def initialize(text:)
+      @text = text
+    end
+    def is_punctuation?
+      text.gsub(/[[:punct:]]/, '').eql?('') ||
+      text.eql?('∫') ||
+      text.eql?('∬') ||
+      text.eql?('∯') ||
+      text.eql?('∮') ||
+      text.eql?('ƪ')
+    end
+  end
+end