automated_metareview 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,121 @@
1
+ require 'automated_metareview/wordnet_based_similarity'
2
+ require 'automated_metareview/constants'
3
+
4
+ class PredictClass
5
+ =begin
6
+ Identifies the probabilities of a review belonging to each of the three classes.
7
+ Returns an array of probabilities (length = numClasses)
8
+ =end
9
+ #predicting the review's class
10
# Scores a review against the patterns of every content class.
#
# pos_tagger          - tagger passed through to TextPreprocessing#read_patterns
# core_NLP_tagger     - not referenced by this method
# review_text         - not referenced by this method
# review_graph        - object whose #edges are compared with the patterns
# pattern_files_array - pattern file names; entry k holds the patterns of class k
# num_classes         - number of classes to score
#
# Returns an array with one value per class: the result of
# compare_review_with_patterns divided by 6 (the original author's
# normalisation, intended to keep the value within [0-1]).
def predict_classes(pos_tagger, core_NLP_tagger, review_text, review_graph, pattern_files_array, num_classes)
  # collecting the file names for each class of patterns
  file_names = []
  pattern_files_array.each { |name| file_names.push(name) }

  # read_patterns in TextPreprocessing reads patterns in the format 'X = Y'
  reader = TextPreprocessing.new
  patterns_by_class = Array.new(num_classes) { [] }
  (0...num_classes).each do |klass|
    patterns_by_class[klass] = reader.read_patterns(file_names[klass], pos_tagger)
  end

  # comparing the review's edges with the patterns of each class in turn
  wordnet = WordnetBasedSimilarity.new
  graph_edges = review_graph.edges
  scores = []
  (0...num_classes).each do |klass|
    raw_match = compare_review_with_patterns(graph_edges, patterns_by_class[klass], wordnet)
    scores[klass] = raw_match / 6.to_f
  end
  scores
end
45
+ #------------------------------------------#------------------------------------------#------------------------------------------
46
+
47
# Averages, over the review edges, the best match each edge achieves against
# the given patterns.
#
# single_edges    - review edges (nil entries are skipped); each edge's
#                   average_match attribute is overwritten with its best match
# single_patterns - pattern edges of one class (nil entries are skipped)
# wordnet         - similarity helper handed on to compare_edges
#
# Returns the sum of the non-zero best matches divided by the number of edges
# that had a non-zero match (or by 1 when no edge matched).
def compare_review_with_patterns(single_edges, single_patterns, wordnet)
  # resetting the average_match values of all edges before matching a new class
  single_edges.each { |edge| edge.average_match = 0 unless edge.nil? }

  matched_total = 0.0
  matched_edges = 0
  single_edges.each do |edge|
    next if edge.nil?

    # best score of this edge over all the patterns
    best = 0
    single_patterns.each do |pattern|
      next if pattern.nil?
      score = compare_edges(edge, pattern, wordnet)
      best = score if score > best
    end
    edge.average_match = best

    # only edges that matched something contribute to the class average
    next if edge.average_match == 0.0
    matched_total += edge.average_match
    matched_edges += 1
  end

  # avoid dividing by zero when nothing matched
  matched_edges = 1 if matched_edges == 0
  matched_total / matched_edges
end
87
+ #------------------------------------------#------------------------------------------#------------------------------------------
88
+
89
# Compares two edges vertex by vertex, both in the same orientation
# (in-in / out-out) and crossed (in-out / out-in).
#
# e1, e2  - edges exposing in_vertex and out_vertex
# wordnet - object whose compare_strings(vertex_a, vertex_b, speller) scores a pair
#
# Returns the larger of the two averaged scores. A fresh "en_US" Aspell
# speller in NORMAL suggestion mode is created for every call.
# (The original carried commented-out variants that skipped noun vertices.)
def compare_edges(e1, e2, wordnet)
  speller = Aspell.new("en_US")
  speller.suggestion_mode = Aspell::NORMAL

  # in-in and out-out vertices
  in_to_in = wordnet.compare_strings(e1.in_vertex, e2.in_vertex, speller)
  out_to_out = wordnet.compare_strings(e1.out_vertex, e2.out_vertex, speller)
  aligned = (in_to_in + out_to_out) / 2.to_f

  # matching in-out and out-in vertices
  in_to_out = wordnet.compare_strings(e1.in_vertex, e2.out_vertex, speller)
  out_to_in = wordnet.compare_strings(e1.out_vertex, e2.in_vertex, speller)
  crossed = (in_to_out + out_to_in) / 2.to_f

  aligned > crossed ? aligned : crossed
end
121
+ end
@@ -0,0 +1,293 @@
1
+ require 'automated_metareview/negations'
2
+ require 'automated_metareview/constants'
3
+
4
+ class SentenceState
5
+ attr_accessor :broken_sentences
6
# Splits a POS-tagged sentence at its co-ordinating conjunctions and
# determines the state of every resulting segment.
#
# str_with_pos_tags - sentence whose tokens look like "word/TAG"
#
# Returns an array holding the result of sentence_state for each segment,
# at the segment's index.
def identify_sentence_state(str_with_pos_tags)
  # break the sentence at the co-ordinating conjunction (fills @broken_sentences)
  segment_count = break_at_coordinating_conjunctions(str_with_pos_tags)

  # without any segments the whole sentence is evaluated as one unit
  return [sentence_state(str_with_pos_tags)] if @broken_sentences.nil?

  # identifying states for each of the sentence segments; the range is
  # inclusive, so the nil check skips the index past the last segment
  states = []
  (0..segment_count).each do |idx|
    segment = @broken_sentences[idx]
    states[idx] = sentence_state(segment) unless segment.nil?
  end
  states
end
24
+ #------------------------------------------#------------------------------------------
25
# Breaks a POS-tagged sentence into segments at its co-ordinating
# conjunctions and stores them in @broken_sentences.
#
# str_with_pos_tags - sentence whose tokens look like "word/TAG"
#
# A token counts as a conjunction only when it carries the "/CC" tag.
# (Previously any token containing the letters "CC", e.g. "CCTV/NN", split
# the sentence.) The conjunction opens the following segment.
#
# When the sentence is split, every token is reduced to "word/" (the tag is
# dropped but the slash is kept), exactly as before. When there is no
# conjunction the sentence is stored unchanged as the only segment.
# NOTE(review): dropping the tags means sentence_state cannot see noun/verb
# tags in split segments - confirm whether that is intended.
#
# Returns the number of segments stored in @broken_sentences.
def break_at_coordinating_conjunctions(str_with_pos_tags)
  tagged_tokens = str_with_pos_tags.split(" ")
  @broken_sentences = Array.new

  # no co-ordinating conjunction: the whole sentence is the only segment
  unless tagged_tokens.any? { |token| token.include?("/CC") }
    @broken_sentences << str_with_pos_tags
    return 1
  end

  current = ""
  tagged_tokens.each do |token|
    slash = token.index("/")
    # tokens without a tag are kept whole instead of raising or mis-slicing
    word = slash.nil? ? token : token[0..slash]
    if token.include?("/CC")
      @broken_sentences << current #segment collected before the conjunction
      current = word               #the CC goes as part of the following segment
    else
      current = current + " " + word
    end
  end
  @broken_sentences << current unless current.empty? #setting the last sentence segment

  @broken_sentences.length
end
56
+ #------------------------------------------#------------------------------------------
57
+
58
+ #Determining the state (positive, negated or suggestive) of a sentence segment from its tokens
59
# Determines whether a sentence segment is positive, negated or suggestive.
#
# str_with_pos_tags - segment whose tokens look like "word/TAG"
#
# Every kept token is classified (negative word, negative descriptor,
# negative phrase, suggestive or positive) and folded into a running state.
# Returns POSITIVE, SUGGESTIVE or NEGATED (any of the three negative states
# is reported as NEGATED).
def sentence_state(str_with_pos_tags)
  state = POSITIVE
  tagged_words = str_with_pos_tags.split(" ")
  count = tagged_words.length
  tokens = []
  tagged_tokens = []
  kept = 0                  #number of tokens kept so far
  interim_noun_verb = false #no interim nouns or verbs seen yet

  #fetching all the tokens
  tagged_words.each do |tagged|
    tagged_tokens[kept] = tagged
    word = tagged.include?("/") ? tagged[0..tagged.index("/") - 1] : tagged
    #removing punctuations; a token that contained punctuation does not advance
    #'kept', so it is overwritten by the next token
    if word.include?(".")
      tokens[kept] = word[0..word.index(".") - 1]
    elsif (mark = [",", "!", ";"].find { |m| word.include?(m) })
      tokens[kept] = word.gsub(mark, "")
    else
      tokens[kept] = word
      kept += 1
    end
  end

  #iterating through the tokens to determine state
  prev_negative_word = ""
  (0...kept).each do |start|
    idx = start
    token = tokens[idx]
    pair_available = idx + 1 < count && !token.nil? && !tokens[idx + 1].nil?

    #checking type of the word
    if is_negative_word(token) == NEGATED
      returned_type = NEGATIVE_WORD
    elsif is_negative_descriptor(token) == NEGATED
      #negative descriptor (indirect indicators of negation)
      returned_type = NEGATIVE_DESCRIPTOR
    elsif pair_available && is_negative_phrase(token + " " + tokens[idx + 1]) == NEGATED
      #2-gram negative phrase; the rest of this pass looks at the second word
      returned_type = NEGATIVE_PHRASE
      idx += 1
    elsif is_suggestive(token) == SUGGESTIVE
      returned_type = SUGGESTIVE
    elsif pair_available && is_suggestive_phrase(token + " " + tokens[idx + 1]) == SUGGESTIVE
      #2-gram suggestion phrase; the rest of this pass looks at the second word
      returned_type = SUGGESTIVE
      idx += 1
    else
      returned_type = POSITIVE
    end

    #if present state is negative and an interim noun/pronoun/verb/modal was found, set the flag
    in_negative_state = (state == NEGATIVE_WORD || state == NEGATIVE_DESCRIPTOR || state == NEGATIVE_PHRASE)
    if in_negative_state && returned_type == POSITIVE && !interim_noun_verb
      tag_source = tagged_tokens[idx]
      interim_noun_verb = true if ["NN", "PR", "VB", "MD"].any? { |tag| tag_source.include?(tag) }
    end

    #comparing the returned type with the existing state of the sentence clause
    if state == POSITIVE && returned_type != POSITIVE
      state = returned_type
    elsif state == NEGATIVE_WORD
      case returned_type
      when NEGATIVE_WORD
        #"no", "never" and "none" embellish the negation; any other double negative turns positive
        embellished = ["NO", "NEVER", "NONE"].any? { |w| prev_negative_word.casecmp(w) == 0 }
        #e.g. negative: "no it doesn't help" / positive: "not had no work.."
        state = embellished ? NEGATIVE_WORD : POSITIVE
        interim_noun_verb = false
      when NEGATIVE_DESCRIPTOR, NEGATIVE_PHRASE
        state = POSITIVE #e.g.: "not bad", "not taken from", "no code duplication"
        interim_noun_verb = false
      when SUGGESTIVE
        #words in between keep the negation, e.g. "it is not too useful as people could..."
        #otherwise e.g. "I do not(-) suggest(S) ..."
        state = interim_noun_verb ? NEGATIVE_WORD : SUGGESTIVE
        interim_noun_verb = false
      end
    elsif state == NEGATIVE_DESCRIPTOR
      case returned_type
      when NEGATIVE_WORD
        #e.g. "hard(-) to understand none(-) of the comments" vs. "He hardly not...."
        state = interim_noun_verb ? NEGATIVE_WORD : POSITIVE
        interim_noun_verb = false
      when NEGATIVE_DESCRIPTOR
        #e.g. "there is barely any code duplication" vs. "It is hardly confusing.."
        state = interim_noun_verb ? NEGATIVE_DESCRIPTOR : POSITIVE
        interim_noun_verb = false
      when NEGATIVE_PHRASE
        #e.g. "it is hard and appears to be taken from" (no words in between)
        state = interim_noun_verb ? NEGATIVE_PHRASE : POSITIVE
        interim_noun_verb = false
      when SUGGESTIVE
        state = SUGGESTIVE #e.g.:"I hardly(-) suggested(S) ..."
        interim_noun_verb = false
      end
    elsif state == NEGATIVE_PHRASE
      case returned_type
      when NEGATIVE_WORD
        #e.g. "It is too short the text and doesn't" vs. "It is too short not to contain.."
        state = interim_noun_verb ? NEGATIVE_WORD : POSITIVE
        interim_noun_verb = false
      when NEGATIVE_DESCRIPTOR
        state = NEGATIVE_DESCRIPTOR #e.g."It is too short barely covering..."
        interim_noun_verb = false
      when NEGATIVE_PHRASE
        state = NEGATIVE_PHRASE #e.g.:"it is too short, taken from ..."
        interim_noun_verb = false
      when SUGGESTIVE
        state = SUGGESTIVE #e.g.:"I too short and I suggest ..."
        interim_noun_verb = false
      end
    elsif state == SUGGESTIVE
      #e.g.:"I might(S) not(-) suggest(S) ...", "I suggest you don't.." -> suggestive
      case returned_type
      when NEGATIVE_DESCRIPTOR
        state = NEGATIVE_DESCRIPTOR
      when NEGATIVE_PHRASE
        state = NEGATIVE_PHRASE
      end
      interim_noun_verb = false
    end

    #remembering the last "no"/"never"/"none" seen
    current = tokens[idx]
    if ["NO", "NEVER", "NONE"].any? { |w| current.casecmp(w) == 0 }
      prev_negative_word = current
    end
  end

  #all three negative states are reported as NEGATED
  if state == NEGATIVE_DESCRIPTOR || state == NEGATIVE_WORD || state == NEGATIVE_PHRASE
    state = NEGATED
  end
  state
end
224
+
225
+ #------------------------------------------#------------------------------------------
226
+
227
+ #Checking if the token is a negative token
228
# Checks whether the token is one of the NEGATED_WORDS (case-insensitive).
# Returns NEGATED when it is, POSITIVE otherwise.
def is_negative_word(word)
  listed = NEGATED_WORDS.any? { |candidate| word.casecmp(candidate) == 0 }
  listed ? NEGATED : POSITIVE
end
238
+ #------------------------------------------#------------------------------------------
239
+
240
+ #Checking if the token is a negative token
241
# Checks whether the token is one of the NEGATIVE_DESCRIPTORS (case-insensitive).
# Returns NEGATED when it is, POSITIVE otherwise.
def is_negative_descriptor(word)
  listed = NEGATIVE_DESCRIPTORS.any? { |candidate| word.casecmp(candidate) == 0 }
  listed ? NEGATED : POSITIVE
end
251
+
252
+ #------------------------------------------#------------------------------------------
253
+
254
+ #Checking if the phrase is negative
255
# Checks whether the phrase is one of the NEGATIVE_PHRASES (case-insensitive).
# Returns NEGATED when it is, POSITIVE otherwise.
def is_negative_phrase(phrase)
  listed = NEGATIVE_PHRASES.any? { |candidate| phrase.casecmp(candidate) == 0 }
  listed ? NEGATED : POSITIVE
end
265
+
266
+ #------------------------------------------#------------------------------------------
267
+ #Checking if the token is a suggestive token
268
# Checks whether the token is one of the SUGGESTIVE_WORDS (case-insensitive).
# Returns SUGGESTIVE when it is, POSITIVE otherwise.
def is_suggestive(word)
  listed = SUGGESTIVE_WORDS.any? { |candidate| word.casecmp(candidate) == 0 }
  listed ? SUGGESTIVE : POSITIVE
end
279
+ #------------------------------------------#------------------------------------------
280
+
281
+ #Checking if the PHRASE is suggestive
282
# Checks whether the phrase is one of the SUGGESTIVE_PHRASES (case-insensitive).
# Returns SUGGESTIVE when it is, POSITIVE otherwise.
def is_suggestive_phrase(phrase)
  listed = SUGGESTIVE_PHRASES.any? { |candidate| phrase.casecmp(candidate) == 0 }
  listed ? SUGGESTIVE : POSITIVE
end
292
+
293
+ end #end of the class
@@ -0,0 +1,342 @@
1
+ require 'automated_metareview/constants'
2
+ require 'automated_metareview/edge'
3
+ require 'automated_metareview/vertex'
4
+
5
+ class TextPreprocessing
6
+
7
+ =begin
8
+ Fetching review data from the tables based on the response_map id
9
+ =end
10
# Loads the most recently updated response of a response map and collects
# its review comments.
#
# auto_metareview - receives the response (responses=) and its id (response_id=)
# map_id          - id of the response map to look up
#
# Returns the comments of all scores whose comment is neither nil nor blank.
def fetch_review_data(auto_metareview, map_id)
  latest = Response.find(:first, :conditions => ["map_id = ?", map_id], :order => "updated_at DESC")
  auto_metareview.responses = latest
  auto_metareview.response_id = latest.id

  comments = []
  latest.scores.each do |score|
    text = score.comments
    # skipping missing and whitespace-only comments
    next if text.nil? || text.rstrip.empty?
    comments << text
  end
  comments
end
26
+ #------------------------------------------#------------------------------------------#------------------------------------------
27
+ =begin
28
+ Fetching submission data from the url submitted by the reviewee
29
+ =end
30
# Fetches the text of the submission reviewed under the given response map.
#
# map_id - id of the ResponseMap whose reviewee's submission is fetched
#
# The reviewee's submitted hyperlinks are looked up (falling back to the
# members of the reviewee's team), the last "http..." link is opened and the
# page is parsed with Nokogiri.
#
# Returns an array holding the text of every <p> element, followed by the
# text embedded in <script> elements between the markers "s":" and \n"},
# (google docs). Returns an empty array when no hyperlink is available.
def fetch_submission_data(map_id)
  subm_array = Array.new
  response_map = ResponseMap.find(:first, :conditions => ["id = ?", map_id])
  reviewee_id = response_map.reviewee_id
  reviewed_object = response_map.reviewed_object_id
  url = Participant.find(:first, :conditions => ["id = ?", reviewee_id]).submitted_hyperlinks
  if url.nil? #in case of team assignments
    teams_users = TeamsUser.find(:all, :conditions => ["team_id = ?", reviewee_id])
    teams_users.each do |team_user|
      url = Participant.find(:first, :conditions => ["user_id = ? and parent_id = ?", team_user.user_id, reviewed_object]).submitted_hyperlinks
      break unless url.nil? #break out when you find the url
    end
  end

  # nothing was submitted (or no link in the stored value): nothing to fetch
  return subm_array if url.nil?
  link_start = url.rindex("http") #"rindex" fetches the last occurrence - useful if there are multiple urls
  return subm_array if link_start.nil?
  url = url[link_start..url.length-2] #the last character of the stored value is dropped

  page = Nokogiri::HTML(open(url))

  #fetching the paragraph texts from the specified url
  page.css('p').each do |paragraph|
    subm_array << paragraph.text
  end

  #for google docs where the text is placed inside <script></script> tags
  text_open = "\"s\":\""   #marks the beginning of the text in the script
  text_close = "\\n\"},"   #marks the end of the text in the script
  page.css('script').each do |script|
    raw = script.children[0].to_s
    open_at = raw.index(text_open)
    next if open_at.nil?
    text_from = open_at + text_open.length
    text_to = raw.index(text_close, text_from)
    next if text_to.nil?
    # String#[start, length] takes a length; the end index was previously
    # passed as the length, which extracted text beyond the closing marker
    subm_array << raw[text_from...text_to]
  end
  return subm_array
end
68
+ #------------------------------------------#------------------------------------------#------------------------------------------
69
+ =begin
70
+ pre-processes the review text and sends it in for graph formation and further analysis
71
+ =end
72
# Pre-processes review/submission texts and breaks them into clauses.
#
# flag       - 0 (training): the clauses of all texts are returned as one flat
#              array; any other value (testing): one array of clauses per text
# text_array - array of strings; quotation marks and parentheses are removed
#              from these strings in place, as before
#
# A clause ends at the first ".", "?", "!", "," or ";" and includes that mark;
# every clause is stripped of surrounding whitespace.
#
# Fixes over the previous version:
# * flag != 0 returned nil; the per-text clause arrays are returned now
# * text after the last punctuation mark was dropped; it is kept now
# * a mark at position 0 was confused with "not found"; the earliest mark is
#   always used now
#
# Returns an array of strings (flag == 0) or an array of arrays of strings.
def segment_text(flag, text_array)
  clauses_per_text = []

  text_array.each do |text|
    #******* Pre-processing the review/submission text **********
    #removing quotation marks and parentheses (mutates the caller's string)
    text.gsub!("\"", "")
    text.gsub!("(", "")
    text.gsub!(")", "")
    text = remove_urls(text) if text.include?("http://")

    #break the text into clauses at the earliest punctuation mark each time
    clauses = []
    remaining = text
    while (cut = remaining.index(/[.?!,;]/))
      clauses << remaining[0..cut].strip
      remaining = remaining[(cut + 1)..-1]
    end

    #text without any punctuation is stored as is (even when blank);
    #otherwise trailing text is kept only when it is not blank
    leftover = remaining.strip
    clauses << leftover if clauses.empty? || !leftover.empty?

    clauses_per_text << clauses
  end

  #training: one flat list of clauses; testing: clauses grouped per text
  flag == 0 ? clauses_per_text.flatten : clauses_per_text
end
154
+ #------------------------------------------#------------------------------------------#------------------------------------------
155
+ =begin
156
+ * Reads the patterns from the csv file containing them.
157
+ * maxValue is the maximum value of the patterns found
158
+ =end
159
+
160
# Reads the patterns of one class from a csv file.
#
# filename - csv file whose first column holds patterns in the format 'X = Y';
#            a name containing "prob" marks the patterns NEGATED, one
#            containing "suggest" marks them SUGGESTIVE, otherwise POSITIVE
# pos      - tagger whose get_readable(token) returns the token as "word/TAG"
#
# Returns an array of Edge objects, one per csv row, whose in_vertex is built
# from X and whose out_vertex is built from Y.
def read_patterns(filename, pos)
  #setting the state for problem detection and suggestive patterns
  state = if filename.include?("prob")
            NEGATED
          elsif filename.include?("suggest")
            SUGGESTIVE
          else
            POSITIVE
          end

  #builds a vertex whose type is derived from the POS tag of its first token
  build_vertex = lambda do |label, tagged_first, position|
    type = if ["/NN", "/PRP", "/IN", "/EX", "/WP"].any? { |tag| tagged_first.include?(tag) }
             NOUN
           elsif tagged_first.include?("/VB") or tagged_first.include?("MD")
             VERB
           elsif tagged_first.include?("JJ")
             ADJ
           elsif tagged_first.include?("/RB")
             ADV
           else #default to noun
             NOUN
           end
    pos_tag = tagged_first[tagged_first.index("/")+1..tagged_first.length]
    Vertex.new(label, type, position, state, nil, nil, pos_tag)
  end

  patterns = []
  FasterCSV.foreach(filename) do |row|
    line = row[0]
    in_label = line[0..line.index("=")-1].strip
    out_label = line[line.index("=")+2..line.length].strip

    #the first token of each side determines the POS of the vertex
    in_tagged = pos.get_readable(in_label.split(" ")[0])
    out_tagged = pos.get_readable(out_label.split(" ")[0])

    position = patterns.length #keeps track of the number of edges
    edge = Edge.new("noun", NOUN)
    patterns[position] = edge
    edge.in_vertex = build_vertex.call(in_label, in_tagged, position)
    edge.out_vertex = build_vertex.call(out_label, out_tagged, position)
  end
  patterns
end
211
+
212
+ #------------------------------------------#------------------------------------------#------------------------------------------
213
+
214
+ =begin
215
+ Removes any urls in the text and returns the remaining text as it is
216
+ =end
217
# Drops every whitespace-separated token that contains "http://".
#
# Text without "http://" is returned unchanged (the same object). Otherwise
# the remaining tokens are re-joined, each preceded by a single space, so the
# result starts with a space.
def remove_urls(text)
  return text unless text.include?("http://")

  kept_tokens = text.split(" ").reject { |token| token.include?("http://") }
  kept_tokens.inject(String.new) { |joined, token| joined + " " + token }
end
232
+ #------------------------------------------#------------------------------------------#------------------------------------------
233
+
234
+ =begin
235
+ Check for plagiarism after removing text within quotes for reviews
236
+ =end
237
# Removes every double-quoted segment (including its quotes) from each review,
# so that quoted material is ignored by the plagiarism check.
#
# review_text - array of strings; the strings are modified in place, as before
#
# An unmatched quotation mark is left untouched. The previous implementation
# looped forever on it, and it looked the quoted text up via Array#to_s, which
# no longer yields the bare text on Ruby >= 1.9.
#
# Returns an array holding the (modified) strings in their original order.
def remove_text_within_quotes(review_text)
  reviews = Array.new
  review_text.each do |row|
    # one pass removes all "..." pairs; gsub! returns nil when nothing matched,
    # so its return value is deliberately not used
    row.gsub!(/"[^"]*"/, "")
    reviews << row
  end
  return reviews
end
265
+ #------------------------------------------#------------------------------------------#------------------------------------------
266
+ =begin
267
+ Looks for spelling mistakes in the text and fixes them using the raspell library available for ruby
268
+ =end
269
# Replaces misspelt tokens by the speller's first suggestion and lower-cases
# all tokens.
#
# review_text_array - array of strings (one per response)
# speller           - object responding to check(token) and suggest(token)
#
# Returns a new array of strings; in each one every token is preceded by a
# single space, so a non-empty result starts with a space.
def check_correct_spellings(review_text_array, speller)
  #iterating through each response
  review_text_array.map do |review_text|
    #iterating through tokens from each response
    review_text.split(" ").inject("") do |corrected, word|
      unless speller.check(word)
        #a token without suggestions is kept as it is
        best_guess = speller.suggest(word).first
        word = best_guess unless best_guess.nil?
      end
      corrected + " " + word.downcase
    end
  end
end
291
+
292
+ #------------------------------------------#------------------------------------------#------------------------------------------
293
+ =begin
294
+ Removes a punctuation mark like ".", ",", "?" etc. from "str" and returns the string.
295
+ =end
296
+ public #The method was throwing a "NoMethodError: private method" error when called from a different class. Hence the "public" keyword.
297
# Removes punctuation from "str" in place and returns it.
#
# Only the first kind of mark found, in the order . , ? ! ; : ( ) [ ], is
# removed (all of its occurrences); other marks are left alone, as before.
# Semicolons are now really removed: the previous version called the
# non-mutating gsub for ";" and discarded the result.
def contains_punct(str)
  marks = [".", ",", "?", "!", ";", ":", "(", ")", "[", "]"]
  found = marks.find { |mark| str.include?(mark) }
  str.gsub!(found, "") unless found.nil?
  return str
end
321
+
322
# Tells whether "str" contains a literal backslash-n sequence or a curly brace.
# Returns true or false.
def contains_punct_bool(str)
  ["\\n", "}", "{"].any? { |marker| str.include?(marker) }
end
329
+
330
+ #------------------------------------------#------------------------------------------#------------------------------------------
331
+ =begin
332
+ Checking if "str" is a punctuation mark like ".", ",", "?" etc.
333
+ =end
334
# Tells whether "str" is exactly one of the punctuation marks . , ? ! ; :
# Returns true or false.
def is_punct(str)
  [".", ",", "?", "!", ";", ":"].include?(str)
end
341
+
342
+ end #end of class