automated_metareview 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +17 -0
- data/.idea/automated_metareview.iml +91 -0
- data/.idea/encodings.xml +5 -0
- data/.idea/misc.xml +5 -0
- data/.idea/modules.xml +9 -0
- data/.idea/scopes/scope_settings.xml +5 -0
- data/.idea/vcs.xml +8 -0
- data/Gemfile +42 -0
- data/LICENSE.txt +22 -0
- data/README.md +29 -0
- data/Rakefile +1 -0
- data/automated_metareview.gemspec +27 -0
- data/lib/automated_metareview.rb +18 -0
- data/lib/automated_metareview/constants.rb +208 -0
- data/lib/automated_metareview/degree_of_relevance.rb +562 -0
- data/lib/automated_metareview/edge.rb +13 -0
- data/lib/automated_metareview/graph_generator.rb +695 -0
- data/lib/automated_metareview/negations.rb +51 -0
- data/lib/automated_metareview/negative-words.csv +4783 -0
- data/lib/automated_metareview/patterns-assess.csv +17 -0
- data/lib/automated_metareview/patterns-prob-detect.csv +22 -0
- data/lib/automated_metareview/patterns-suggest.csv +20 -0
- data/lib/automated_metareview/plagiarism_check.rb +155 -0
- data/lib/automated_metareview/positive-words.csv +2006 -0
- data/lib/automated_metareview/predict_class.rb +121 -0
- data/lib/automated_metareview/sentence_state.rb +293 -0
- data/lib/automated_metareview/text_preprocessing.rb +342 -0
- data/lib/automated_metareview/text_quantity.rb +26 -0
- data/lib/automated_metareview/tone.rb +212 -0
- data/lib/automated_metareview/version.rb +3 -0
- data/lib/automated_metareview/vertex.rb +18 -0
- data/lib/automated_metareview/wordnet_based_similarity.rb +480 -0
- metadata +158 -0
@@ -0,0 +1,121 @@
|
|
1
|
+
require 'automated_metareview/wordnet_based_similarity'
|
2
|
+
require 'automated_metareview/constants'
|
3
|
+
|
4
|
+
class PredictClass
  # Identifies the probabilities of a review belonging to each of the
  # content classes (e.g. assessment, problem detection, suggestion).
  # predict_classes returns an array of probabilities (length = num_classes).

  # Predicts the review's class probabilities.
  #
  # pos_tagger          - POS tagger handed to TextPreprocessing#read_patterns
  # core_NLP_tagger     - unused here; kept for interface compatibility
  # review_text         - review sentences (not used by the computation itself)
  # review_graph        - graph built from the review; its edges are matched
  # pattern_files_array - one pattern CSV filename per class, in class order
  # num_classes         - number of content classes
  #
  # Returns an Array of Floats, one probability per class.
  def predict_classes(pos_tagger, core_NLP_tagger, review_text, review_graph, pattern_files_array, num_classes)
    tc = TextPreprocessing.new
    # read_patterns in TextPreprocessing reads patterns in the format 'X = Y'
    single_patterns = Array.new(num_classes) { Array.new }
    num_classes.times do |i|
      single_patterns[i] = tc.read_patterns(pattern_files_array[i], pos_tagger)
    end

    # Predicting the probability of the review belonging to each class.
    wordnet = WordnetBasedSimilarity.new
    edges = review_graph.edges
    class_prob = Array.new # one probability per class
    num_classes.times do |k|
      # Dividing by 6 normalizes the match into the range [0-1]
      # (presumably 6 is the maximum edge-match score -- see original comment).
      class_prob[k] = compare_review_with_patterns(edges, single_patterns[k], wordnet) / 6.to_f
    end
    return class_prob
  end
  #------------------------------------------#------------------------------------------#------------------------------------------

  # Matches every review edge against every pattern edge of one class and
  # returns the average of the best match found per edge.
  #
  # single_edges    - Array of review-graph edges (entries may be nil)
  # single_patterns - Array of pattern edges for one class (entries may be nil)
  # wordnet         - WordnetBasedSimilarity instance used for word matching
  #
  # Side effect: each non-nil edge's average_match is set to its best score.
  # Returns a Float: sum of non-zero best matches / number of matched edges
  # (0.0 when nothing matched).
  def compare_review_with_patterns(single_edges, single_patterns, wordnet)
    final_class_sum = 0.0
    final_edge_num = 0

    # Reset average_match on every edge before matching against a new class.
    single_edges.each do |edge|
      edge.average_match = 0 unless edge.nil?
    end

    # For each edge keep only the best match across all patterns.
    single_edges.each do |edge|
      next if edge.nil?
      max_match = 0
      single_patterns.each do |pattern|
        next if pattern.nil?
        match = compare_edges(edge, pattern, wordnet)
        max_match = match if match > max_match
      end
      edge.average_match = max_match

      # Accumulate the class average over edges that matched at all.
      if edge.average_match != 0.0
        final_class_sum = final_class_sum + edge.average_match
        final_edge_num += 1
      end
    end

    # Avoid division by zero when no edge matched.
    final_edge_num = 1 if final_edge_num == 0
    return final_class_sum / final_edge_num
  end
  #------------------------------------------#------------------------------------------#------------------------------------------

  # Compares two edges both "straight" (in-in / out-out) and "crossed"
  # (in-out / out-in) and returns the better of the two averages.
  # NOTE(review): a new Aspell instance is created on every call -- this is
  # wasteful but preserved because compare_strings receives it per call.
  def compare_edges(e1, e2, wordnet)
    speller = Aspell.new("en_US")
    speller.suggestion_mode = Aspell::NORMAL

    # Straight comparison: in-vertex with in-vertex, out-vertex with out-vertex.
    avg_match_without_syntax = (wordnet.compare_strings(e1.in_vertex, e2.in_vertex, speller) +
        wordnet.compare_strings(e1.out_vertex, e2.out_vertex, speller)) / 2.to_f

    # Crossed comparison: in-vertex with out-vertex and vice versa.
    avg_match_with_syntax = (wordnet.compare_strings(e1.in_vertex, e2.out_vertex, speller) +
        wordnet.compare_strings(e1.out_vertex, e2.in_vertex, speller)) / 2.to_f

    if avg_match_without_syntax > avg_match_with_syntax
      return avg_match_without_syntax
    else
      return avg_match_with_syntax
    end
  end #end of the compare_edges method
end
|
@@ -0,0 +1,293 @@
|
|
1
|
+
require 'automated_metareview/negations'
|
2
|
+
require 'automated_metareview/constants'
|
3
|
+
|
4
|
+
class SentenceState
  # Determines the "state" (positive / negated / suggestive) of each clause
  # of a POS-tagged sentence. Tokens arrive in the form "word/TAG".
  # The state constants (POSITIVE, NEGATED, SUGGESTIVE, NEGATIVE_WORD,
  # NEGATIVE_DESCRIPTOR, NEGATIVE_PHRASE) and the word/phrase lists
  # (NEGATED_WORDS, NEGATIVE_DESCRIPTORS, NEGATIVE_PHRASES,
  # SUGGESTIVE_WORDS, SUGGESTIVE_PHRASES) come from
  # automated_metareview/constants.

  # Clause fragments produced by break_at_coordinating_conjunctions.
  attr_accessor :broken_sentences

  # Splits the tagged sentence at coordinating conjunctions and returns an
  # Array containing one state value per resulting clause.
  def identify_sentence_state(str_with_pos_tags)
    #break the sentence at the co-ordinating conjunction
    num_conjunctions = break_at_coordinating_conjunctions(str_with_pos_tags)

    states_array = Array.new
    if(@broken_sentences == nil)
      # NOTE(review): break_at_coordinating_conjunctions always assigns
      # @broken_sentences = Array.new, so this branch appears unreachable
      # after the call above -- confirm before removing.
      states_array[0] = sentence_state(str_with_pos_tags)
    #identifying states for each of the sentence segments
    else
      for i in (0..num_conjunctions)
        if(!@broken_sentences[i].nil?)
          states_array[i] = sentence_state(@broken_sentences[i])
        end
      end
    end
    return states_array
  end #end of the methods
  #------------------------------------------#------------------------------------------

  # Splits a tagged sentence into segments at coordinating conjunctions
  # (tokens tagged "CC"). Each segment is stored, word-only (tags stripped),
  # in @broken_sentences; the conjunction itself starts the next segment.
  # Returns the number of segments written.
  def break_at_coordinating_conjunctions(str_with_pos_tags)
    st = str_with_pos_tags.split(" ")
    count = st.length
    counter = 0

    @broken_sentences = Array.new
    #if the sentence contains a co-ordinating conjunction
    if(str_with_pos_tags.include?("CC"))
      counter = 0
      temp = ""
      for i in (0..count-1)
        ps = st[i]
        if(!ps.nil? and ps.include?("CC"))
          @broken_sentences[counter] = temp #for "run/NN on/IN..."
          counter+=1
          # NOTE(review): ps[0..ps.index("/")] keeps the trailing "/" and
          # assumes every token contains "/" -- confirm tokens are always tagged.
          temp = ps[0..ps.index("/")]
          #the CC or IN goes as part of the following sentence
        elsif (!ps.nil? and !ps.include?("CC"))
          temp = temp +" "+ ps[0..ps.index("/")]
        end
      end
      if(!temp.empty?) #setting the last sentence segment
        @broken_sentences[counter] = temp
        counter+=1
      end
    else
      # No conjunction: the whole tagged string is the single segment.
      @broken_sentences[counter] = str_with_pos_tags
      counter+=1
    end
    return counter
  end #end of the method
  #------------------------------------------#------------------------------------------

  # Runs a small state machine over the clause tokens and returns the final
  # state: POSITIVE, SUGGESTIVE, or NEGATED (all negative sub-states are
  # collapsed to NEGATED at the end).
  def sentence_state(str_with_pos_tags)
    state = POSITIVE
    #checking single tokens for negated words
    st = str_with_pos_tags.split(" ")
    count = st.length
    tokens = Array.new         # words with tags and punctuation stripped
    tagged_tokens = Array.new  # original "word/TAG" tokens
    i = 0
    interim_noun_verb = false #0 indicates no interim nouns or verbs

    #fetching all the tokens
    for k in (0..st.length-1)
      ps = st[k]
      #setting the tagged string
      tagged_tokens[i] = ps
      if(ps.include?("/"))
        ps = ps[0..ps.index("/")-1]
      end
      #removing punctuations
      if(ps.include?("."))
        tokens[i] = ps[0..ps.index(".")-1]
      elsif(ps.include?(","))
        tokens[i] = ps.gsub(",", "")
      elsif(ps.include?("!"))
        tokens[i] = ps.gsub("!", "")
      elsif(ps.include?(";"))
        tokens[i] = ps.gsub(";", "")
      else
        tokens[i] = ps
        # NOTE(review): i is only advanced in this branch, so a token that
        # contained punctuation is overwritten by the next token -- confirm
        # this is intended.
        i+=1
      end
    end#end of the for loop

    #iterating through the tokens to determine state
    prev_negative_word =""
    for j in (0..i-1)
      # Classify the current token (or 2-gram) into a returned_type.
      #checking for negated words
      if(is_negative_word(tokens[j]) == NEGATED)
        returned_type = NEGATIVE_WORD
      #checking for a negative descriptor (indirect indicators of negation)
      elsif(is_negative_descriptor(tokens[j]) == NEGATED)
        returned_type = NEGATIVE_DESCRIPTOR
      #2-gram phrases of negative phrases
      elsif(j+1 < count && !tokens[j].nil? && !tokens[j+1].nil? &&
          is_negative_phrase(tokens[j]+" "+tokens[j+1]) == NEGATED)
        returned_type = NEGATIVE_PHRASE
        # NOTE(review): assigning to the loop variable of a Ruby for-over-range
        # does NOT skip the next iteration; the second token of the phrase is
        # re-examined on the next pass. Confirm before "fixing".
        j = j+1
      #if suggestion word is found
      elsif(is_suggestive(tokens[j]) == SUGGESTIVE)
        returned_type = SUGGESTIVE
      #2-gram phrases suggestion phrases
      elsif(j+1 < count && !tokens[j].nil? && !tokens[j+1].nil? &&
          is_suggestive_phrase(tokens[j]+" "+tokens[j+1]) == SUGGESTIVE)
        returned_type = SUGGESTIVE
        j = j+1 # NOTE(review): same no-op skip as above
      #else set to positive
      else
        returned_type = POSITIVE
      end

      #----------------------------------------------------------------------
      #comparing 'returnedType' with the existing STATE of the sentence clause
      #if present state is negative and an interim non-negative or non-suggestive word was found, set the flag to true
      if((state == NEGATIVE_WORD or state == NEGATIVE_DESCRIPTOR or state == NEGATIVE_PHRASE) and returned_type == POSITIVE)
        if(interim_noun_verb == false and (tagged_tokens[j].include?("NN") or tagged_tokens[j].include?("PR") or tagged_tokens[j].include?("VB") or tagged_tokens[j].include?("MD")))
          interim_noun_verb = true
        end
      end

      if(state == POSITIVE and returned_type != POSITIVE)
        state = returned_type
      #when state is a negative word
      elsif(state == NEGATIVE_WORD) #previous state
        if(returned_type == NEGATIVE_WORD)
          #these words embellish the negation, so only if the previous word was not one of them you make it positive
          if(prev_negative_word.casecmp("NO") != 0 and prev_negative_word.casecmp("NEVER") != 0 and prev_negative_word.casecmp("NONE") != 0)
            state = POSITIVE #e.g: "not had no work..", "doesn't have no work..", "its not that it doesn't bother me..."
          else
            state = NEGATIVE_WORD #e.g: "no it doesn't help", "no there is no use for ..."
          end
          interim_noun_verb = false #resetting
        elsif(returned_type == NEGATIVE_DESCRIPTOR or returned_type == NEGATIVE_PHRASE)
          state = POSITIVE #e.g.: "not bad", "not taken from", "I don't want nothing", "no code duplication"
          interim_noun_verb = false #resetting
        elsif(returned_type == SUGGESTIVE)
          #e.g. " it is not too useful as people could..."
          if(interim_noun_verb == true) #there are some words in between
            state = NEGATIVE_WORD
          else
            state = SUGGESTIVE #e.g.:"I do not(-) suggest(S) ..."
          end
          interim_noun_verb = false #resetting
        end
      #when state is a negative descriptor
      elsif(state == NEGATIVE_DESCRIPTOR)
        if(returned_type == NEGATIVE_WORD)
          if(interim_noun_verb == true)#there are some words in between
            state = NEGATIVE_WORD #e.g: "hard(-) to understand none(-) of the comments"
          else
            state = POSITIVE #e.g."He hardly not...."
          end
          interim_noun_verb = false #resetting
        elsif(returned_type == NEGATIVE_DESCRIPTOR)
          if(interim_noun_verb == true)#there are some words in between
            state = NEGATIVE_DESCRIPTOR #e.g:"there is barely any code duplication"
          else
            state = POSITIVE #e.g."It is hardly confusing.."
          end
          interim_noun_verb = false #resetting
        elsif(returned_type == NEGATIVE_PHRASE)
          if(interim_noun_verb == true)#there are some words in between
            state = NEGATIVE_PHRASE
          else
            state = POSITIVE #e.g.:"it is hard and appears to be taken from"
          end
          interim_noun_verb = false #resetting
        elsif(returned_type == SUGGESTIVE)
          state = SUGGESTIVE #e.g.:"I hardly(-) suggested(S) ..."
          interim_noun_verb = false #resetting
        end
      #when state is a negative phrase
      elsif(state == NEGATIVE_PHRASE)
        if(returned_type == NEGATIVE_WORD)
          if(interim_noun_verb == true)#there are some words in between
            state = NEGATIVE_WORD #e.g."It is too short the text and doesn't"
          else
            state = POSITIVE #e.g."It is too short not to contain.."
          end
          interim_noun_verb = false #resetting
        elsif(returned_type == NEGATIVE_DESCRIPTOR)
          state = NEGATIVE_DESCRIPTOR #e.g."It is too short barely covering..."
          interim_noun_verb = false #resetting
        elsif(returned_type == NEGATIVE_PHRASE)
          state = NEGATIVE_PHRASE #e.g.:"it is too short, taken from ..."
          interim_noun_verb = false #resetting
        elsif(returned_type == SUGGESTIVE)
          state = SUGGESTIVE #e.g.:"I too short and I suggest ..."
          interim_noun_verb = false #resetting
        end
      #when state is suggestive
      elsif(state == SUGGESTIVE) #e.g.:"I might(S) not(-) suggest(S) ..."
        if(returned_type == NEGATIVE_DESCRIPTOR)
          state = NEGATIVE_DESCRIPTOR
        elsif(returned_type == NEGATIVE_PHRASE)
          state = NEGATIVE_PHRASE
        end
        #e.g.:"I suggest you don't.." -> suggestive
        interim_noun_verb = false #resetting
      end

      #setting the prevNegativeWord
      if(tokens[j].casecmp("NO") == 0 or tokens[j].casecmp("NEVER") == 0 or tokens[j].casecmp("NONE") == 0)
        prev_negative_word = tokens[j]
      end

    end #end of for loop

    # Collapse all negative sub-states into the single NEGATED value.
    if(state == NEGATIVE_DESCRIPTOR or state == NEGATIVE_WORD or state == NEGATIVE_PHRASE)
      state = NEGATED
    end

    return state
  end

  #------------------------------------------#------------------------------------------

  # Returns NEGATED if the word matches an entry in NEGATED_WORDS
  # (case-insensitive), POSITIVE otherwise.
  def is_negative_word(word)
    not_negated = POSITIVE
    for i in (0..NEGATED_WORDS.length - 1)
      if(word.casecmp(NEGATED_WORDS[i]) == 0)
        not_negated = NEGATED #indicates negation found
        break
      end
    end
    return not_negated
  end
  #------------------------------------------#------------------------------------------

  # Returns NEGATED if the word matches an entry in NEGATIVE_DESCRIPTORS
  # (case-insensitive), POSITIVE otherwise.
  def is_negative_descriptor(word)
    not_negated = POSITIVE
    for i in (0..NEGATIVE_DESCRIPTORS.length - 1)
      if(word.casecmp(NEGATIVE_DESCRIPTORS[i]) == 0)
        not_negated = NEGATED #indicates negation found
        break
      end
    end
    return not_negated
  end

  #------------------------------------------#------------------------------------------

  # Returns NEGATED if the 2-gram phrase matches an entry in NEGATIVE_PHRASES
  # (case-insensitive), POSITIVE otherwise.
  def is_negative_phrase(phrase)
    not_negated = POSITIVE
    for i in (0..NEGATIVE_PHRASES.length - 1)
      if(phrase.casecmp(NEGATIVE_PHRASES[i]) == 0)
        not_negated = NEGATED #indicates negation found
        break
      end
    end
    return not_negated
  end

  #------------------------------------------#------------------------------------------
  # Returns SUGGESTIVE if the word matches an entry in SUGGESTIVE_WORDS
  # (case-insensitive), POSITIVE otherwise.
  def is_suggestive(word)
    not_suggestive = POSITIVE
    for i in (0..SUGGESTIVE_WORDS.length - 1)
      if(word.casecmp(SUGGESTIVE_WORDS[i]) == 0)
        not_suggestive = SUGGESTIVE #indicates suggestion found
        break
      end
    end
    return not_suggestive
  end
  #------------------------------------------#------------------------------------------

  # Returns SUGGESTIVE if the 2-gram phrase matches an entry in
  # SUGGESTIVE_PHRASES (case-insensitive), POSITIVE otherwise.
  def is_suggestive_phrase(phrase)
    not_suggestive = POSITIVE
    for i in (0..SUGGESTIVE_PHRASES.length - 1)
      if(phrase.casecmp(SUGGESTIVE_PHRASES[i]) == 0)
        not_suggestive = SUGGESTIVE #indicates suggestion found
        break
      end
    end
    return not_suggestive
  end

end #end of the class
|
@@ -0,0 +1,342 @@
|
|
1
|
+
require 'automated_metareview/constants'
|
2
|
+
require 'automated_metareview/edge'
|
3
|
+
require 'automated_metareview/vertex'
|
4
|
+
|
5
|
+
class TextPreprocessing
|
6
|
+
|
7
|
+
=begin
|
8
|
+
Fetching review data from the tables based on the response_map id
|
9
|
+
=end
|
10
|
+
  # Fetches the review comments for a response map.
  #
  # auto_metareview - object that receives the found response (writes its
  #                   #responses and #response_id attributes)
  # map_id          - id of the ResponseMap whose latest response is wanted
  #
  # Returns an Array of non-blank comment strings from the latest response's
  # scores.
  #
  # NOTE(review): uses Rails 2-style Response.find(:first, :conditions ...);
  # assumes a matching Response row exists (responses.id raises otherwise).
  def fetch_review_data(auto_metareview, map_id)
    reviews = Array.new
    # Latest response for this map (ordered by updated_at DESC).
    responses = Response.find(:first, :conditions => ["map_id = ?", map_id], :order => "updated_at DESC")
    auto_metareview.responses = responses
    auto_metareview.response_id = responses.id
    responses.scores.each{
      | review_score |
      # Keep only comments that are present and not just whitespace.
      if(review_score.comments != nil and !review_score.comments.rstrip.empty?)
        reviews << review_score.comments
      end
    }
    return reviews
  end
|
26
|
+
#------------------------------------------#------------------------------------------#------------------------------------------
|
27
|
+
=begin
|
28
|
+
Fetching submission data from the url submitted by the reviewee
|
29
|
+
=end
|
30
|
+
  # Fetches the submission text reviewed under a response map by following
  # the reviewee's submitted hyperlink and scraping the page.
  #
  # map_id - id of the ResponseMap identifying reviewee and assignment
  #
  # Returns an Array of text fragments: all <p> texts, plus (for Google
  # Docs pages) text embedded in <script> tags.
  #
  # NOTE(review): performs network I/O via open(url); assumes the
  # submitted_hyperlinks string contains at least one "http" occurrence.
  def fetch_submission_data(map_id)
    subm_array = Array.new
    response_map = ResponseMap.find(:first, :conditions => ["id = ?", map_id])
    reviewee_id = response_map.reviewee_id
    reviewed_object = response_map.reviewed_object_id
    url = Participant.find(:first, :conditions => ["id = ?", reviewee_id]).submitted_hyperlinks
    if(url.nil?)#in case of team assignments
      # reviewee_id is a team id here; look up each member's participant row.
      teams_users = TeamsUser.find(:all, :conditions => ["team_id = ?", reviewee_id])
      teams_users.each{
        |team_user|
        url = Participant.find(:first, :conditions => ["user_id = ? and parent_id = ?", team_user.user_id, reviewed_object]).submitted_hyperlinks
        if(!url.nil?)#break out when you find the url
          break
        end
      }
    end
    # Take the LAST url in the hyperlinks string ("rindex" finds the last
    # occurrence -- useful when multiple urls were submitted); drop the
    # trailing serialization characters.
    url = url[url.rindex("http")..url.length-2]
    page = Nokogiri::HTML(open(url))
    #fetching the paragraph texts from the specified url
    if(page.css('p').count != 0)
      page.css('p').each do |subm|
        subm_array << subm.text
      end
    end
    #for google docs where the text is placed inside <script></script> tags
    if(page.css('script').count != 0)
      page.css('script').each do |subm|
        # The "s":" marker indicates the beginning of document text in the
        # script payload; \n"}, marks its end.
        if(!subm.children[0].to_s.index("\"s\":\"").nil? and !subm.children[0].to_s.index("\\n\"},").nil?)
          subm_array << subm.children[0].to_s[subm.children[0].to_s.index("\"s\":\"")+5, subm.children[0].to_s.index("\\n\"},")]
        end
      end
    end
    return subm_array
  end
|
68
|
+
#------------------------------------------#------------------------------------------#------------------------------------------
|
69
|
+
=begin
|
70
|
+
pre-processes the review text and sends it in for graph formation and further analysis
|
71
|
+
=end
|
72
|
+
def segment_text(flag, text_array)
|
73
|
+
if(flag == 0)
|
74
|
+
reviews = Array.new(1){Array.new}
|
75
|
+
else
|
76
|
+
reviews = Array.new(50){Array.new} #50 is the number of different reviews/submissions
|
77
|
+
end
|
78
|
+
|
79
|
+
i = 0
|
80
|
+
j = 0
|
81
|
+
|
82
|
+
for k in (0..text_array.length-1)
|
83
|
+
text = text_array[k]
|
84
|
+
if(flag == 1) #reset i (the sentence counter) to 0 for test reviews
|
85
|
+
reviews[j] = Array.new #initializing the array for sentences in a test review
|
86
|
+
i = 0
|
87
|
+
end
|
88
|
+
|
89
|
+
#******* Pre-processing the review/submission text **********
|
90
|
+
#replacing commas in large numbers, makes parsing sentences with commas confusing!
|
91
|
+
#replacing quotation marks
|
92
|
+
text.gsub!("\"", "")
|
93
|
+
text.gsub!("(", "")
|
94
|
+
text.gsub!(")", "")
|
95
|
+
if(text.include?("http://"))
|
96
|
+
text = remove_urls(text)
|
97
|
+
end
|
98
|
+
#break the text into multiple sentences
|
99
|
+
beginn = 0
|
100
|
+
if(text.include?(".") or text.include?("?") or text.include?("!") or text.include?(",") or text.include?(";") ) #new clause or sentence
|
101
|
+
while(text.include?(".") or text.include?("?") or text.include?("!") or text.include?(",") or text.include?(";")) do #the text contains more than 1 sentence
|
102
|
+
endd = 0
|
103
|
+
#these 'if' conditions have to be independent, cause the value of 'endd' could change for the different types of punctuations
|
104
|
+
if(text.include?("."))
|
105
|
+
endd = text.index(".")
|
106
|
+
end
|
107
|
+
if((text.include?("?") and endd != 0 and endd > text.index("?")) or (text.include?("?") and endd == 0))#if a ? occurs before a .
|
108
|
+
endd = text.index("?")
|
109
|
+
end
|
110
|
+
if((text.include?("!") and endd!= 0 and endd > text.index("!")) or (text.include?("!") and endd ==0))#if an ! occurs before a . or a ?
|
111
|
+
endd = text.index("!")
|
112
|
+
end
|
113
|
+
if((text.include?(",") and endd != 0 and endd > text.index(",")) or (text.include?(",") and endd == 0)) #if a , occurs before any of . or ? or !
|
114
|
+
endd = text.index(",")
|
115
|
+
end
|
116
|
+
if((text.include?(";") and endd != 0 and endd > text.index(";")) or (text.include?(";") and endd == 0)) #if a ; occurs before any of . or ?, ! or ,
|
117
|
+
endd = text.index(";")
|
118
|
+
end
|
119
|
+
|
120
|
+
#check if the string between two commas or punctuations is there to buy time e.g. ", say," ",however," ", for instance, "...
|
121
|
+
if(flag == 0) #training
|
122
|
+
reviews[0][i] = text[beginn..endd].strip
|
123
|
+
else #testing
|
124
|
+
reviews[j][i] = text[beginn..endd].strip
|
125
|
+
end
|
126
|
+
i+=1 #incrementing the sentence counter
|
127
|
+
text = text[(endd+1)..text.length] #from end+1 to the end of the string variable
|
128
|
+
end #end of the while loop
|
129
|
+
else #if there is only 1 sentence in the text
|
130
|
+
if(flag == 0)#training
|
131
|
+
reviews[0][i] = text.strip
|
132
|
+
i+=1 #incrementing the sentence counter
|
133
|
+
else #testing
|
134
|
+
reviews[j][i] = text.strip
|
135
|
+
end
|
136
|
+
end
|
137
|
+
|
138
|
+
if(flag == 1)#incrementing reviews counter only for test reviews
|
139
|
+
j+=1
|
140
|
+
end
|
141
|
+
end #end of the for loop with 'k' reading text rows
|
142
|
+
|
143
|
+
#setting the number of reviews before returning
|
144
|
+
if(flag == 0)#training
|
145
|
+
num_reviews = 1 #for training the number of reviews is 1
|
146
|
+
else #testing
|
147
|
+
num_reviews = j
|
148
|
+
end
|
149
|
+
|
150
|
+
if(flag == 0)
|
151
|
+
return reviews[0]
|
152
|
+
end
|
153
|
+
end
|
154
|
+
#------------------------------------------#------------------------------------------#------------------------------------------
|
155
|
+
=begin
|
156
|
+
* Reads the patterns from the csv file containing them.
|
157
|
+
* maxValue is the maximum value of the patterns found
|
158
|
+
=end
|
159
|
+
|
160
|
+
def read_patterns(filename, pos)
|
161
|
+
num = 1000 #some large number
|
162
|
+
patterns = Array.new
|
163
|
+
state = POSITIVE
|
164
|
+
i = 0 #keeps track of the number of edges
|
165
|
+
|
166
|
+
#setting the state for problem detection and suggestive patterns
|
167
|
+
if(filename.include?("prob"))
|
168
|
+
state = NEGATED
|
169
|
+
elsif(filename.include?("suggest"))
|
170
|
+
state = SUGGESTIVE
|
171
|
+
end
|
172
|
+
|
173
|
+
FasterCSV.foreach(filename) do |text|
|
174
|
+
in_vertex = text[0][0..text[0].index("=")-1].strip
|
175
|
+
out_vertex = text[0][text[0].index("=")+2..text[0].length].strip
|
176
|
+
|
177
|
+
first_string_in_vertex = pos.get_readable(in_vertex.split(" ")[0]) #getting the first token in vertex to determine POS
|
178
|
+
first_string_out_vertex = pos.get_readable(out_vertex.split(" ")[0]) #getting the first token in vertex to determine POS
|
179
|
+
|
180
|
+
patterns[i] = Edge.new("noun", NOUN)
|
181
|
+
#setting the invertex
|
182
|
+
if(first_string_in_vertex.include?("/NN") or first_string_in_vertex.include?("/PRP") or first_string_in_vertex.include?("/IN") or first_string_in_vertex.include?("/EX") or first_string_in_vertex.include?("/WP"))
|
183
|
+
patterns[i].in_vertex = Vertex.new(in_vertex, NOUN, i, state, nil, nil, first_string_in_vertex[first_string_in_vertex.index("/")+1..first_string_in_vertex.length])
|
184
|
+
elsif(first_string_in_vertex.include?("/VB") or first_string_in_vertex.include?("MD"))
|
185
|
+
patterns[i].in_vertex = Vertex.new(in_vertex, VERB, i, state, nil, nil, first_string_in_vertex[first_string_in_vertex.index("/")+1..first_string_in_vertex.length])
|
186
|
+
elsif(first_string_in_vertex.include?("JJ"))
|
187
|
+
patterns[i].in_vertex = Vertex.new(in_vertex, ADJ, i, state, nil, nil, first_string_in_vertex[first_string_in_vertex.index("/")+1..first_string_in_vertex.length])
|
188
|
+
elsif(first_string_in_vertex.include?("/RB"))
|
189
|
+
patterns[i].in_vertex = Vertex.new(in_vertex, ADV, i, state, nil, nil, first_string_in_vertex[first_string_in_vertex.index("/")+1..first_string_in_vertex.length])
|
190
|
+
else #default to noun
|
191
|
+
patterns[i].in_vertex = Vertex.new(in_vertex, NOUN, i, state, nil, nil, first_string_in_vertex[first_string_in_vertex.index("/")+1..first_string_in_vertex.length])
|
192
|
+
end
|
193
|
+
|
194
|
+
#setting outvertex
|
195
|
+
if(first_string_out_vertex.include?("/NN") or first_string_out_vertex.include?("/PRP") or first_string_out_vertex.include?("/IN") or first_string_out_vertex.include?("/EX") or first_string_out_vertex.include?("/WP"))
|
196
|
+
patterns[i].out_vertex = Vertex.new(out_vertex, NOUN, i, state, nil, nil, first_string_out_vertex[first_string_out_vertex.index("/")+1..first_string_out_vertex.length])
|
197
|
+
elsif(first_string_out_vertex.include?("/VB") or first_string_out_vertex.include?("MD"))
|
198
|
+
patterns[i].out_vertex = Vertex.new(out_vertex, VERB, i, state, nil, nil, first_string_out_vertex[first_string_out_vertex.index("/")+1..first_string_out_vertex.length])
|
199
|
+
elsif(first_string_out_vertex.include?("JJ"))
|
200
|
+
patterns[i].out_vertex = Vertex.new(out_vertex, ADJ, i, state, nil, nil, first_string_out_vertex[first_string_out_vertex.index("/")+1..first_string_out_vertex.length-1]);
|
201
|
+
elsif(first_string_out_vertex.include?("/RB"))
|
202
|
+
patterns[i].out_vertex = Vertex.new(out_vertex, ADV, i, state, nil, nil, first_string_out_vertex[first_string_out_vertex.index("/")+1..first_string_out_vertex.length])
|
203
|
+
else #default is noun
|
204
|
+
patterns[i].out_vertex = Vertex.new(out_vertex, NOUN, i, state, nil, nil, first_string_out_vertex[first_string_out_vertex.index("/")+1..first_string_out_vertex.length])
|
205
|
+
end
|
206
|
+
i+=1 #incrementing for each pattern
|
207
|
+
end #end of the FasterCSV.foreach loop
|
208
|
+
num_patterns = i
|
209
|
+
return patterns
|
210
|
+
end
|
211
|
+
|
212
|
+
#------------------------------------------#------------------------------------------#------------------------------------------
|
213
|
+
|
214
|
+
=begin
|
215
|
+
Removes any urls in the text and returns the remaining text as it is
|
216
|
+
=end
|
217
|
+
def remove_urls(text)
|
218
|
+
final_text = String.new
|
219
|
+
if(text.include?("http://"))
|
220
|
+
tokens = text.split(" ")
|
221
|
+
tokens.each{
|
222
|
+
|token|
|
223
|
+
if(!token.include?("http://"))
|
224
|
+
final_text = final_text + " " + token
|
225
|
+
end
|
226
|
+
}
|
227
|
+
else
|
228
|
+
return text
|
229
|
+
end
|
230
|
+
return final_text
|
231
|
+
end
|
232
|
+
#------------------------------------------#------------------------------------------#------------------------------------------
|
233
|
+
|
234
|
+
=begin
|
235
|
+
Check for plagiarism after removing text within quotes for reviews
|
236
|
+
=end
|
237
|
+
# Removes every double-quoted segment (quotes included) from each review in
# review_text, e.g. for plagiarism checks that must ignore cited text.
#
# review_text - Array of review strings (each string may be mutated in place
#               via gsub!).
#
# Returns an Array of the same length with quoted segments removed.
def remove_text_within_quotes(review_text)
  reviews = Array.new
  review_text.each{ |row|
    text = row
    if(text.include?("\""))
      while(text.include?("\"")) do
        # scan returns an array of capture groups: [["inner"], ...].
        # NOTE: the old code used replace_text[0].to_s, which on Ruby >= 1.9
        # produces the inspect string '["inner"]' and broke the index lookup.
        quoted = text.scan(/"([^"]*)"/)
        # An unmatched quote yields no pairs; bail out instead of looping
        # forever / crashing on nil.
        break if quoted.empty?
        inner = quoted[0][0]
        # start one character earlier so the opening quote is included
        start_index = text.index(inner) - 1
        # replace the quoted segment (including both quotes) with an empty string
        text.gsub!(text[start_index..start_index + inner.length + 1], "")
      end #end of the while loop
    end
    reviews << text #text after all fully-quoted segments have been removed
  } #end of the loop over the review strings
  return reviews #array of cleaned review strings (same order as input)
end
|
265
|
+
#------------------------------------------#------------------------------------------#------------------------------------------
|
266
|
+
=begin
|
267
|
+
Looks for spelling mistakes in the text and fixes them using the raspell library available for ruby
|
268
|
+
=end
|
269
|
+
# Fixes spelling mistakes in each review using the supplied spell checker
# (an Aspell instance from the raspell library, or anything responding to
# #check(word) and #suggest(word)).
#
# review_text_array - Array of review strings.
# speller           - spell checker; #check(tok) => boolean,
#                     #suggest(tok) => Array of candidate corrections.
#
# Returns a new Array of corrected, downcased reviews; each rebuilt review
# starts with a leading space (historic behavior preserved).
def check_correct_spellings(review_text_array, speller)
  review_text_array_temp = Array.new
  #iterating through each response
  review_text_array.each{
    |review_text|
    review_tokens = review_text.split(" ")
    review_text_temp = ""
    #iterating through tokens from each response
    review_tokens.each{
      |review_tok|
      #checking the token's spelling for correctness
      if(!speller.check(review_tok))
        #suggest() may be expensive (external lookup) - call it only once
        suggestion = speller.suggest(review_tok).first
        if(!suggestion.nil?)
          review_tok = suggestion
        end
      end
      review_text_temp = review_text_temp + " " + review_tok.downcase
    }
    review_text_array_temp << review_text_temp
  }
  return review_text_array_temp
end
|
291
|
+
|
292
|
+
#------------------------------------------#------------------------------------------#------------------------------------------
|
293
|
+
=begin
|
294
|
+
Removes the first type of punctuation mark (".", ",", "?" etc.) found in "str" and returns the modified string.
|
295
|
+
=end
|
296
|
+
public #The method was throwing a "NoMethodError: private method" error when called from a different class. Hence the "public" keyword.
# Strips the FIRST matching punctuation type found in str (".", ",", "?",
# "!", ";", ":", "(", ")", "[", "]") - the elsif chain means at most one
# kind of punctuation is removed per call. str is mutated in place (gsub!).
#
# Returns str with that punctuation removed (or unchanged if none matched).
def contains_punct(str)
  if(str.include?".")
    str.gsub!(".","")
  elsif(str.include?",")
    str.gsub!(",","")
  elsif(str.include?"?")
    str.gsub!("?","")
  elsif(str.include?"!")
    str.gsub!("!","")
  elsif(str.include?";")
    #BUGFIX: was a bang-less gsub whose result was discarded, so
    #semicolons were never actually removed
    str.gsub!(";","")
  elsif(str.include?":")
    str.gsub!(":","")
  elsif(str.include?"(")
    str.gsub!("(","")
  elsif(str.include?")")
    str.gsub!(")","")
  elsif(str.include?"[")
    str.gsub!("[","")
  elsif(str.include?"]")
    str.gsub!("]","")
  end
  return str
end
|
321
|
+
|
322
|
+
# True when str contains a literal backslash-n escape sequence or a curly
# brace - markers treated as punctuation noise by the preprocessor.
def contains_punct_bool(str)
  str.include?("\\n") || str.include?("}") || str.include?("{")
end
|
329
|
+
|
330
|
+
#------------------------------------------#------------------------------------------#------------------------------------------
|
331
|
+
=begin
|
332
|
+
Checking if "str" is a punctuation mark like ".", ",", "?" etc.
|
333
|
+
=end
|
334
|
+
# True when str is exactly one of the sentence punctuation marks
# ".", ",", "?", "!", ";", ":".
def is_punct(str)
  [".", ",", "?", "!", ";", ":"].include?(str)
end
|
341
|
+
|
342
|
+
end #end of class
|