automated_metareview 0.0.1

Excerpt from the automated_metareview gem: defines PredictClass (review-class probability prediction), SentenceState (clause-level negation/suggestion state detection), and TextPreprocessing (review/submission text fetching, segmentation, and cleanup).
1
+ require 'automated_metareview/wordnet_based_similarity'
2
+ require 'automated_metareview/constants'
3
+
4
class PredictClass
=begin
  Identifies the probabilities of a review belonging to each of the content
  classes. predict_classes returns an array of probabilities
  (length = num_classes), each normalized into the range [0-1].
=end

  # Raw per-class edge-match scores are divided by this value to map them
  # into the [0-1] range.
  MATCH_NORMALIZER = 6.0

  # Predicts the probability of the review belonging to each content class.
  #
  # pos_tagger          - POS tagger passed through to pattern reading
  # core_NLP_tagger     - unused here; kept for interface compatibility
  # review_text         - raw review text (previously only used for debug output)
  # review_graph        - graph representation of the review; its edges are matched
  # pattern_files_array - one pattern file name per class
  # num_classes         - number of content classes
  #
  # Returns an Array of Float probabilities, one per class.
  def predict_classes(pos_tagger, core_NLP_tagger, review_text, review_graph, pattern_files_array, num_classes)
    tc = TextPreprocessing.new
    # read_patterns in TextPreprocessing reads patterns in the format 'X = Y'
    # (the previous copy of pattern_files_array into a second array was redundant)
    single_patterns = Array.new(num_classes) { |i| tc.read_patterns(pattern_files_array[i], pos_tagger) }

    wordnet = WordnetBasedSimilarity.new
    edges = review_graph.edges
    # probability of the review matching each class, normalized to [0-1]
    Array.new(num_classes) { |k| compare_review_with_patterns(edges, single_patterns[k], wordnet) / MATCH_NORMALIZER }
  end
  #------------------------------------------#------------------------------------------#------------------------------------------

  # Compares every edge of a review against every pattern of one class and
  # returns the average of each edge's best pattern match.
  def compare_review_with_patterns(single_edges, single_patterns, wordnet)
    final_class_sum = 0.0
    final_edge_num = 0

    # reset average_match on all edges before matching a new class's patterns
    single_edges.each do |edge|
      edge.average_match = 0 unless edge.nil?
    end

    single_edges.each do |edge|
      next if edge.nil?
      # best match between this edge and any pattern of the class
      best = 0
      single_patterns.each do |pattern|
        next if pattern.nil?
        score = compare_edges(edge, pattern, wordnet)
        best = score if score > best
      end
      edge.average_match = best

      # accumulate the class average over edges that matched at all
      if edge.average_match != 0.0
        final_class_sum += edge.average_match
        final_edge_num += 1
      end
    end

    final_edge_num = 1 if final_edge_num == 0 # guard against division by zero
    final_class_sum / final_edge_num
  end
  #------------------------------------------#------------------------------------------#------------------------------------------

  # Compares two edges by the wordnet similarity of their vertices and returns
  # the better of the direct (in-in / out-out) and crossed (in-out / out-in)
  # comparison scores.
  def compare_edges(e1, e2, wordnet)
    # PERFORMANCE FIX: the Aspell spell checker used to be constructed on
    # every call; it carries no per-call state here, so build it once and reuse.
    @speller ||= begin
      sp = Aspell.new("en_US")
      sp.suggestion_mode = Aspell::NORMAL
      sp
    end

    # direct comparison: in-in and out-out vertices
    avg_match_without_syntax = (wordnet.compare_strings(e1.in_vertex, e2.in_vertex, @speller) +
        wordnet.compare_strings(e1.out_vertex, e2.out_vertex, @speller)) / 2.0
    # crossed comparison: in-out and out-in vertices
    avg_match_with_syntax = (wordnet.compare_strings(e1.in_vertex, e2.out_vertex, @speller) +
        wordnet.compare_strings(e1.out_vertex, e2.in_vertex, @speller)) / 2.0

    [avg_match_without_syntax, avg_match_with_syntax].max
  end # end of the compare_edges method
end
@@ -0,0 +1,293 @@
1
+ require 'automated_metareview/negations'
2
+ require 'automated_metareview/constants'
3
+
4
class SentenceState
  attr_accessor :broken_sentences

=begin
  Determines the state (POSITIVE, NEGATED or SUGGESTIVE) of each clause of a
  POS-tagged sentence. The sentence is first broken at co-ordinating
  conjunctions (CC tags) and a state is computed per segment.
=end
  def identify_sentence_state(str_with_pos_tags)
    # break the sentence at the co-ordinating conjunctions
    num_conjunctions = break_at_coordinating_conjunctions(str_with_pos_tags)

    states_array = Array.new
    if @broken_sentences.nil?
      # NOTE(review): break_at_coordinating_conjunctions always assigns
      # @broken_sentences, so this branch looks unreachable; kept for safety.
      states_array[0] = sentence_state(str_with_pos_tags)
    else
      # identifying states for each of the sentence segments
      (0..num_conjunctions).each do |i|
        states_array[i] = sentence_state(@broken_sentences[i]) unless @broken_sentences[i].nil?
      end
    end
    states_array
  end # end of the method
  #------------------------------------------#------------------------------------------

  # Splits the tagged sentence at CC tokens into @broken_sentences; the
  # conjunction word itself starts the following segment. Returns the number
  # of segments produced.
  def break_at_coordinating_conjunctions(str_with_pos_tags)
    tagged_tokens = str_with_pos_tags.split(" ")
    counter = 0
    @broken_sentences = Array.new

    if str_with_pos_tags.include?("CC")
      temp = ""
      tagged_tokens.each do |ps|
        next if ps.nil?
        if ps.include?("CC")
          @broken_sentences[counter] = temp # for "run/NN on/IN..."
          counter += 1
          # the CC goes as part of the following segment (word plus "/")
          temp = ps[0..ps.index("/")]
        else
          temp = temp + " " + ps[0..ps.index("/")]
        end
      end
      unless temp.empty? # setting the last sentence segment
        @broken_sentences[counter] = temp
        counter += 1
      end
    else
      @broken_sentences[counter] = str_with_pos_tags
      counter += 1
    end
    counter
  end # end of the method
  #------------------------------------------#------------------------------------------

  # Computes the state of one POS-tagged clause by walking its tokens through
  # a small state machine combining negations, negative descriptors, negative
  # phrases and suggestions.
  def sentence_state(str_with_pos_tags)
    state = POSITIVE
    tagged = str_with_pos_tags.split(" ")
    tokens = Array.new
    tagged_tokens = Array.new
    i = 0
    interim_noun_verb = false # true when a noun/verb separates a negation from a later word

    # fetching all the tokens
    tagged.each do |ps|
      tagged_tokens[i] = ps
      # strip the POS tag: "word/TAG" -> "word"
      ps = ps[0..ps.index("/") - 1] if ps.include?("/")
      # removing punctuations
      if ps.include?(".")
        tokens[i] = ps[0..ps.index(".") - 1]
      elsif ps.include?(",")
        tokens[i] = ps.gsub(",", "")
      elsif ps.include?("!")
        tokens[i] = ps.gsub("!", "")
      elsif ps.include?(";")
        tokens[i] = ps.gsub(";", "")
      else
        tokens[i] = ps
      end
      # BUG FIX: the counter must advance for every token; previously it was
      # only advanced for tokens without punctuation, so punctuated tokens
      # were overwritten by their successors and dropped from the clause.
      i += 1
    end

    # iterating through the tokens to determine the state
    prev_negative_word = ""
    j = 0
    while j < i
      # --- classify the current token (or 2-gram phrase) ---
      if is_negative_word(tokens[j]) == NEGATED
        returned_type = NEGATIVE_WORD
      elsif is_negative_descriptor(tokens[j]) == NEGATED # indirect indicators of negation
        returned_type = NEGATIVE_DESCRIPTOR
      elsif j + 1 < i && !tokens[j].nil? && !tokens[j + 1].nil? &&
            is_negative_phrase(tokens[j] + " " + tokens[j + 1]) == NEGATED
        returned_type = NEGATIVE_PHRASE
        # BUG FIX: consume the phrase's second token. The original `for` loop
        # reassigned j on every iteration, so its `j = j+1` never took effect.
        j += 1
      elsif is_suggestive(tokens[j]) == SUGGESTIVE
        returned_type = SUGGESTIVE
      elsif j + 1 < i && !tokens[j].nil? && !tokens[j + 1].nil? &&
            is_suggestive_phrase(tokens[j] + " " + tokens[j + 1]) == SUGGESTIVE
        returned_type = SUGGESTIVE
        j += 1 # BUG FIX: same phrase skip as above
      else
        returned_type = POSITIVE
      end

      # --- compare returned_type with the existing state of the clause ---
      # if the state is negative and an interim non-negative, non-suggestive
      # noun/verb word is found, remember it
      if (state == NEGATIVE_WORD or state == NEGATIVE_DESCRIPTOR or state == NEGATIVE_PHRASE) and returned_type == POSITIVE
        if interim_noun_verb == false and (tagged_tokens[j].include?("NN") or tagged_tokens[j].include?("PR") or tagged_tokens[j].include?("VB") or tagged_tokens[j].include?("MD"))
          interim_noun_verb = true
        end
      end

      if state == POSITIVE and returned_type != POSITIVE
        state = returned_type
      elsif state == NEGATIVE_WORD # previous state was a negative word
        if returned_type == NEGATIVE_WORD
          # "no"/"never"/"none" embellish a negation rather than cancel it
          if prev_negative_word.casecmp("NO") != 0 and prev_negative_word.casecmp("NEVER") != 0 and prev_negative_word.casecmp("NONE") != 0
            state = POSITIVE # e.g. "not had no work..", "doesn't have no work.."
          else
            state = NEGATIVE_WORD # e.g. "no it doesn't help", "no there is no use for ..."
          end
          interim_noun_verb = false # resetting
        elsif returned_type == NEGATIVE_DESCRIPTOR or returned_type == NEGATIVE_PHRASE
          state = POSITIVE # e.g. "not bad", "not taken from", "no code duplication"
          interim_noun_verb = false # resetting
        elsif returned_type == SUGGESTIVE
          if interim_noun_verb == true # there are some words in between
            state = NEGATIVE_WORD
          else
            state = SUGGESTIVE # e.g. "I do not(-) suggest(S) ..."
          end
          interim_noun_verb = false # resetting
        end
      elsif state == NEGATIVE_DESCRIPTOR
        if returned_type == NEGATIVE_WORD
          # e.g. "hard(-) to understand none(-) of the comments" vs "He hardly not..."
          state = interim_noun_verb ? NEGATIVE_WORD : POSITIVE
          interim_noun_verb = false # resetting
        elsif returned_type == NEGATIVE_DESCRIPTOR
          # e.g. "there is barely any code duplication" vs "It is hardly confusing.."
          state = interim_noun_verb ? NEGATIVE_DESCRIPTOR : POSITIVE
          interim_noun_verb = false # resetting
        elsif returned_type == NEGATIVE_PHRASE
          # e.g. "there is barely any ... taken from" vs "it is hard and appears to be taken from"
          state = interim_noun_verb ? NEGATIVE_PHRASE : POSITIVE
          interim_noun_verb = false # resetting
        elsif returned_type == SUGGESTIVE
          state = SUGGESTIVE # e.g. "I hardly(-) suggested(S) ..."
          interim_noun_verb = false # resetting
        end
      elsif state == NEGATIVE_PHRASE
        if returned_type == NEGATIVE_WORD
          # e.g. "It is too short the text and doesn't" vs "It is too short not to contain.."
          state = interim_noun_verb ? NEGATIVE_WORD : POSITIVE
          interim_noun_verb = false # resetting
        elsif returned_type == NEGATIVE_DESCRIPTOR
          state = NEGATIVE_DESCRIPTOR # e.g. "It is too short barely covering..."
          interim_noun_verb = false # resetting
        elsif returned_type == NEGATIVE_PHRASE
          state = NEGATIVE_PHRASE # e.g. "it is too short, taken from ..."
          interim_noun_verb = false # resetting
        elsif returned_type == SUGGESTIVE
          state = SUGGESTIVE # e.g. "I too short and I suggest ..."
          interim_noun_verb = false # resetting
        end
      elsif state == SUGGESTIVE # e.g. "I might(S) not(-) suggest(S) ..."
        if returned_type == NEGATIVE_DESCRIPTOR
          state = NEGATIVE_DESCRIPTOR
        elsif returned_type == NEGATIVE_PHRASE
          state = NEGATIVE_PHRASE
        end
        # e.g. "I suggest you don't.." stays suggestive
        interim_noun_verb = false # resetting
      end

      # remember embellishing negatives for the double-negation rule above
      if tokens[j].casecmp("NO") == 0 or tokens[j].casecmp("NEVER") == 0 or tokens[j].casecmp("NONE") == 0
        prev_negative_word = tokens[j]
      end
      j += 1
    end # end of the token loop

    # collapse the specific negative states into the generic NEGATED state
    if state == NEGATIVE_DESCRIPTOR or state == NEGATIVE_WORD or state == NEGATIVE_PHRASE
      state = NEGATED
    end
    state
  end
  #------------------------------------------#------------------------------------------

  # Returns NEGATED when the word is one of NEGATED_WORDS, POSITIVE otherwise.
  def is_negative_word(word)
    NEGATED_WORDS.any? { |negated| word.casecmp(negated) == 0 } ? NEGATED : POSITIVE
  end
  #------------------------------------------#------------------------------------------

  # Returns NEGATED when the word is one of NEGATIVE_DESCRIPTORS, POSITIVE otherwise.
  def is_negative_descriptor(word)
    NEGATIVE_DESCRIPTORS.any? { |descriptor| word.casecmp(descriptor) == 0 } ? NEGATED : POSITIVE
  end
  #------------------------------------------#------------------------------------------

  # Returns NEGATED when the 2-gram phrase is one of NEGATIVE_PHRASES, POSITIVE otherwise.
  def is_negative_phrase(phrase)
    NEGATIVE_PHRASES.any? { |negative| phrase.casecmp(negative) == 0 } ? NEGATED : POSITIVE
  end
  #------------------------------------------#------------------------------------------

  # Returns SUGGESTIVE when the word is one of SUGGESTIVE_WORDS, POSITIVE otherwise.
  def is_suggestive(word)
    SUGGESTIVE_WORDS.any? { |suggestive| word.casecmp(suggestive) == 0 } ? SUGGESTIVE : POSITIVE
  end
  #------------------------------------------#------------------------------------------

  # Returns SUGGESTIVE when the 2-gram phrase is one of SUGGESTIVE_PHRASES, POSITIVE otherwise.
  def is_suggestive_phrase(phrase)
    SUGGESTIVE_PHRASES.any? { |suggestive| phrase.casecmp(suggestive) == 0 } ? SUGGESTIVE : POSITIVE
  end
end # end of the class
@@ -0,0 +1,342 @@
1
+ require 'automated_metareview/constants'
2
+ require 'automated_metareview/edge'
3
+ require 'automated_metareview/vertex'
4
+
5
class TextPreprocessing
  # Punctuation marks at which review text is split into sentences/clauses.
  SENTENCE_DELIMITERS = [".", "?", "!", ",", ";"].freeze

=begin
  Fetching review data from the tables based on the response_map id.
  Returns an array of non-blank review comment strings.
=end
  def fetch_review_data(auto_metareview, map_id)
    reviews = Array.new
    # most recent response for this map (Rails 2.x finder syntax)
    responses = Response.find(:first, :conditions => ["map_id = ?", map_id], :order => "updated_at DESC")
    auto_metareview.responses = responses
    auto_metareview.response_id = responses.id
    # collect only scores that carry a non-blank comment
    responses.scores.each do |review_score|
      if !review_score.comments.nil? && !review_score.comments.rstrip.empty?
        reviews << review_score.comments
      end
    end
    reviews
  end
  #------------------------------------------#------------------------------------------#------------------------------------------
=begin
  Fetching submission data from the url submitted by the reviewee.
  Returns an array of text segments scraped from the submitted page.
=end
  def fetch_submission_data(map_id)
    subm_array = Array.new
    response_map = ResponseMap.find(:first, :conditions => ["id = ?", map_id])
    reviewee_id = response_map.reviewee_id
    reviewed_object = response_map.reviewed_object_id
    url = Participant.find(:first, :conditions => ["id = ?", reviewee_id]).submitted_hyperlinks
    if url.nil? # in case of team assignments, look through the team members
      TeamsUser.find(:all, :conditions => ["team_id = ?", reviewee_id]).each do |team_user|
        url = Participant.find(:first, :conditions => ["user_id = ? and parent_id = ?", team_user.user_id, reviewed_object]).submitted_hyperlinks
        break unless url.nil? # stop at the first member with a submission
      end
    end

    # "rindex" fetches the last occurrence of "http" — useful if the
    # serialized hyperlinks field contains multiple urls
    url = url[url.rindex("http")..url.length - 2]
    # NOTE(review): this opens an arbitrary user-submitted URL via open-uri;
    # consider validating scheme/host before fetching.
    page = Nokogiri::HTML(open(url))

    # fetching the paragraph texts from the page
    if page.css('p').count != 0
      page.css('p').each { |subm| subm_array << subm.text }
    end
    # for google docs, where the text is placed inside <script></script> tags
    if page.css('script').count != 0
      page.css('script').each do |subm|
        script_text = subm.children[0].to_s
        # the marker strings delimit the beginning/end of the document text
        if !script_text.index("\"s\":\"").nil? and !script_text.index("\\n\"},").nil?
          subm_array << script_text[script_text.index("\"s\":\"") + 5, script_text.index("\\n\"},")]
        end
      end
    end
    subm_array
  end
  #------------------------------------------#------------------------------------------#------------------------------------------
=begin
  Pre-processes the review text and segments it into sentences/clauses for
  graph formation and further analysis.
  flag == 0 (training): all sentences accumulate into one array, which is returned.
  flag == 1 (testing):  one sentence-array per review; the full `reviews`
                        array is returned. (BUG FIX: the method used to
                        return nil in this mode, making the result unusable.)
=end
  def segment_text(flag, text_array)
    reviews = flag == 0 ? Array.new(1) { Array.new } : Array.new(50) { Array.new } # 50 = max number of reviews/submissions

    i = 0 # sentence counter
    j = 0 # review counter (testing only)

    text_array.each do |text|
      if flag == 1 # reset the sentence counter for every test review
        reviews[j] = Array.new
        i = 0
      end

      # ******* Pre-processing the review/submission text **********
      # strip quotation marks and parentheses — they confuse sentence parsing
      text.gsub!("\"", "")
      text.gsub!("(", "")
      text.gsub!(")", "")
      text = remove_urls(text) if text.include?("http://") or text.include?("https://")

      target = flag == 0 ? reviews[0] : reviews[j]
      if earliest_delimiter_index(text).nil? # only one sentence/clause in the text
        target[i] = text.strip
        i += 1
      else
        # repeatedly split off the leading sentence/clause at the earliest delimiter
        while (endd = earliest_delimiter_index(text))
          target[i] = text[0..endd].strip
          i += 1
          text = text[(endd + 1)..text.length]
        end
        # BUG FIX: keep trailing text that has no terminating punctuation
        # (it used to be silently dropped)
        unless text.nil? or text.strip.empty?
          target[i] = text.strip
          i += 1
        end
      end

      j += 1 if flag == 1 # incrementing the reviews counter only for test reviews
    end

    flag == 0 ? reviews[0] : reviews
  end
  #------------------------------------------#------------------------------------------#------------------------------------------
=begin
  Reads the patterns (format 'X = Y') from the csv file containing them.
  The POS tag of each vertex's first token determines the vertex type.
=end
  def read_patterns(filename, pos)
    patterns = Array.new
    state = POSITIVE
    i = 0 # number of edges read so far

    # problem-detection and suggestive pattern files get a matching state
    if filename.include?("prob")
      state = NEGATED
    elsif filename.include?("suggest")
      state = SUGGESTIVE
    end

    FasterCSV.foreach(filename) do |text|
      in_vertex = text[0][0..text[0].index("=") - 1].strip
      out_vertex = text[0][text[0].index("=") + 2..text[0].length].strip

      # the first token of each vertex determines its POS
      first_string_in_vertex = pos.get_readable(in_vertex.split(" ")[0])
      first_string_out_vertex = pos.get_readable(out_vertex.split(" ")[0])

      patterns[i] = Edge.new("noun", NOUN)
      # in/out vertex construction was previously duplicated; consolidated below
      patterns[i].in_vertex = build_pattern_vertex(in_vertex, first_string_in_vertex, i, state)
      patterns[i].out_vertex = build_pattern_vertex(out_vertex, first_string_out_vertex, i, state)
      i += 1 # incrementing for each pattern
    end
    patterns
  end
  #------------------------------------------#------------------------------------------#------------------------------------------
=begin
  Removes http/https urls in the text and returns the remaining tokens,
  each prefixed by a space (preserving the original output format).
=end
  def remove_urls(text)
    return text unless text.include?("http://") or text.include?("https://")

    final_text = String.new
    text.split(" ").each do |token|
      # keep every token that is not a url
      unless token.include?("http://") or token.include?("https://")
        final_text = final_text + " " + token
      end
    end
    final_text
  end
  #------------------------------------------#------------------------------------------#------------------------------------------
=begin
  Removes all text segments within double quotes (including the quotes) from
  each review, e.g. before checking for plagiarism. Returns the cleaned texts.
=end
  def remove_text_within_quotes(review_text)
    reviews = Array.new
    review_text.each do |row|
      # BUG FIX: the old implementation located each quoted segment via
      # Array#to_s on a `scan` result — 1.8-era behavior that breaks on
      # modern Ruby and could loop forever on an unmatched quote. A single
      # regex substitution removes every balanced quoted segment.
      reviews << row.gsub(/"[^"]*"/, "")
    end
    reviews
  end
  #------------------------------------------#------------------------------------------#------------------------------------------
=begin
  Looks for spelling mistakes in the text using the raspell library and
  returns the lower-cased, corrected texts.
=end
  def check_correct_spellings(review_text_array, speller)
    review_text_array_temp = Array.new
    review_text_array.each do |review_text|
      corrected = ""
      review_text.split(" ").each do |review_tok|
        # replace a misspelled token with the speller's first suggestion, if any
        unless speller.check(review_tok)
          suggestion = speller.suggest(review_tok).first
          review_tok = suggestion unless suggestion.nil?
        end
        corrected = corrected + " " + review_tok.downcase
      end
      review_text_array_temp << corrected
    end
    review_text_array_temp
  end
  #------------------------------------------#------------------------------------------#------------------------------------------
=begin
  Removes (in place) the first punctuation type found in "str" and returns it.
  Note: by design of the original elsif chain, only the first matching
  punctuation class is removed.
=end
  def contains_punct(str)
    if str.include?(".")
      str.gsub!(".", "")
    elsif str.include?(",")
      str.gsub!(",", "")
    elsif str.include?("?")
      str.gsub!("?", "")
    elsif str.include?("!")
      str.gsub!("!", "")
    elsif str.include?(";")
      str.gsub!(";", "") # BUG FIX: was a non-mutating gsub whose result was discarded
    elsif str.include?(":")
      str.gsub!(":", "")
    elsif str.include?("(")
      str.gsub!("(", "")
    elsif str.include?(")")
      str.gsub!(")", "")
    elsif str.include?("[")
      str.gsub!("[", "")
    elsif str.include?("]")
      str.gsub!("]", "")
    end
    str
  end

  # True when the string carries markup/control residue ("\n" literal, "{", "}").
  def contains_punct_bool(str)
    str.include?("\\n") or str.include?("}") or str.include?("{")
  end
  #------------------------------------------#------------------------------------------#------------------------------------------
=begin
  Checking if "str" is exactly a punctuation mark like ".", ",", "?" etc.
=end
  def is_punct(str)
    [".", ",", "?", "!", ";", ":"].include?(str)
  end

  private

  # Index of the earliest sentence delimiter in text, or nil if none present.
  # Equivalent to the original cascade of include?/index comparisons.
  def earliest_delimiter_index(text)
    SENTENCE_DELIMITERS.map { |delim| text.index(delim) }.compact.min
  end

  # Builds a Vertex for one side of a pattern edge, mapping the POS tag of the
  # vertex's first token to a vertex type (defaults to NOUN).
  def build_pattern_vertex(vertex_text, first_string, index, state)
    pos_label = first_string[first_string.index("/") + 1..first_string.length]
    type = if first_string.include?("/NN") or first_string.include?("/PRP") or
              first_string.include?("/IN") or first_string.include?("/EX") or
              first_string.include?("/WP")
             NOUN
           elsif first_string.include?("/VB") or first_string.include?("MD")
             VERB
           elsif first_string.include?("JJ")
             ADJ
           elsif first_string.include?("/RB")
             ADV
           else
             NOUN # default to noun
           end
    Vertex.new(vertex_text, type, index, state, nil, nil, pos_label)
  end
end # end of class