automated_metareview 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +17 -0
- data/.idea/automated_metareview.iml +91 -0
- data/.idea/encodings.xml +5 -0
- data/.idea/misc.xml +5 -0
- data/.idea/modules.xml +9 -0
- data/.idea/scopes/scope_settings.xml +5 -0
- data/.idea/vcs.xml +8 -0
- data/Gemfile +42 -0
- data/LICENSE.txt +22 -0
- data/README.md +29 -0
- data/Rakefile +1 -0
- data/automated_metareview.gemspec +27 -0
- data/lib/automated_metareview.rb +18 -0
- data/lib/automated_metareview/constants.rb +208 -0
- data/lib/automated_metareview/degree_of_relevance.rb +562 -0
- data/lib/automated_metareview/edge.rb +13 -0
- data/lib/automated_metareview/graph_generator.rb +695 -0
- data/lib/automated_metareview/negations.rb +51 -0
- data/lib/automated_metareview/negative-words.csv +4783 -0
- data/lib/automated_metareview/patterns-assess.csv +17 -0
- data/lib/automated_metareview/patterns-prob-detect.csv +22 -0
- data/lib/automated_metareview/patterns-suggest.csv +20 -0
- data/lib/automated_metareview/plagiarism_check.rb +155 -0
- data/lib/automated_metareview/positive-words.csv +2006 -0
- data/lib/automated_metareview/predict_class.rb +121 -0
- data/lib/automated_metareview/sentence_state.rb +293 -0
- data/lib/automated_metareview/text_preprocessing.rb +342 -0
- data/lib/automated_metareview/text_quantity.rb +26 -0
- data/lib/automated_metareview/tone.rb +212 -0
- data/lib/automated_metareview/version.rb +3 -0
- data/lib/automated_metareview/vertex.rb +18 -0
- data/lib/automated_metareview/wordnet_based_similarity.rb +480 -0
- metadata +158 -0
@@ -0,0 +1,121 @@
|
|
1
|
+
require 'automated_metareview/wordnet_based_similarity'
|
2
|
+
require 'automated_metareview/constants'
|
3
|
+
|
4
|
+
class PredictClass
  # Identifies the probabilities of a review belonging to each of the
  # content classes (e.g. assessment, problem detection, suggestion).
  # predict_classes returns an array of probabilities (length = num_classes).

  # Predicts the review's class probabilities.
  #
  # pos_tagger          - POS tagger handed to TextPreprocessing#read_patterns
  # core_NLP_tagger     - unused here; kept for interface compatibility
  # review_text         - review sentences (not used by the computation itself)
  # review_graph        - graph built from the review; its edges are matched
  # pattern_files_array - one pattern CSV filename per class, in class order
  # num_classes         - number of content classes
  #
  # Returns an Array of Floats, one probability per class.
  def predict_classes(pos_tagger, core_NLP_tagger, review_text, review_graph, pattern_files_array, num_classes)
    tc = TextPreprocessing.new
    # read_patterns in TextPreprocessing reads patterns in the format 'X = Y'
    single_patterns = Array.new(num_classes) { Array.new }
    num_classes.times do |i|
      single_patterns[i] = tc.read_patterns(pattern_files_array[i], pos_tagger)
    end

    # Predicting the probability of the review belonging to each class.
    wordnet = WordnetBasedSimilarity.new
    edges = review_graph.edges
    class_prob = Array.new # one probability per class
    num_classes.times do |k|
      # Dividing by 6 normalizes the match into the range [0-1]
      # (presumably 6 is the maximum edge-match score -- see original comment).
      class_prob[k] = compare_review_with_patterns(edges, single_patterns[k], wordnet) / 6.to_f
    end
    return class_prob
  end
  #------------------------------------------#------------------------------------------#------------------------------------------

  # Matches every review edge against every pattern edge of one class and
  # returns the average of the best match found per edge.
  #
  # single_edges    - Array of review-graph edges (entries may be nil)
  # single_patterns - Array of pattern edges for one class (entries may be nil)
  # wordnet         - WordnetBasedSimilarity instance used for word matching
  #
  # Side effect: each non-nil edge's average_match is set to its best score.
  # Returns a Float: sum of non-zero best matches / number of matched edges
  # (0.0 when nothing matched).
  def compare_review_with_patterns(single_edges, single_patterns, wordnet)
    final_class_sum = 0.0
    final_edge_num = 0

    # Reset average_match on every edge before matching against a new class.
    single_edges.each do |edge|
      edge.average_match = 0 unless edge.nil?
    end

    # For each edge keep only the best match across all patterns.
    single_edges.each do |edge|
      next if edge.nil?
      max_match = 0
      single_patterns.each do |pattern|
        next if pattern.nil?
        match = compare_edges(edge, pattern, wordnet)
        max_match = match if match > max_match
      end
      edge.average_match = max_match

      # Accumulate the class average over edges that matched at all.
      if edge.average_match != 0.0
        final_class_sum = final_class_sum + edge.average_match
        final_edge_num += 1
      end
    end

    # Avoid division by zero when no edge matched.
    final_edge_num = 1 if final_edge_num == 0
    return final_class_sum / final_edge_num
  end
  #------------------------------------------#------------------------------------------#------------------------------------------

  # Compares two edges both "straight" (in-in / out-out) and "crossed"
  # (in-out / out-in) and returns the better of the two averages.
  # NOTE(review): a new Aspell instance is created on every call -- this is
  # wasteful but preserved because compare_strings receives it per call.
  def compare_edges(e1, e2, wordnet)
    speller = Aspell.new("en_US")
    speller.suggestion_mode = Aspell::NORMAL

    # Straight comparison: in-vertex with in-vertex, out-vertex with out-vertex.
    avg_match_without_syntax = (wordnet.compare_strings(e1.in_vertex, e2.in_vertex, speller) +
        wordnet.compare_strings(e1.out_vertex, e2.out_vertex, speller)) / 2.to_f

    # Crossed comparison: in-vertex with out-vertex and vice versa.
    avg_match_with_syntax = (wordnet.compare_strings(e1.in_vertex, e2.out_vertex, speller) +
        wordnet.compare_strings(e1.out_vertex, e2.in_vertex, speller)) / 2.to_f

    if avg_match_without_syntax > avg_match_with_syntax
      return avg_match_without_syntax
    else
      return avg_match_with_syntax
    end
  end #end of the compare_edges method
end
|
@@ -0,0 +1,293 @@
|
|
1
|
+
require 'automated_metareview/negations'
|
2
|
+
require 'automated_metareview/constants'
|
3
|
+
|
4
|
+
class SentenceState
  # Determines the "state" (positive / negated / suggestive) of each clause
  # of a POS-tagged sentence. Tokens arrive in the form "word/TAG".
  # The state constants (POSITIVE, NEGATED, SUGGESTIVE, NEGATIVE_WORD,
  # NEGATIVE_DESCRIPTOR, NEGATIVE_PHRASE) and the word/phrase lists
  # (NEGATED_WORDS, NEGATIVE_DESCRIPTORS, NEGATIVE_PHRASES,
  # SUGGESTIVE_WORDS, SUGGESTIVE_PHRASES) come from
  # automated_metareview/constants.

  # Clause fragments produced by break_at_coordinating_conjunctions.
  attr_accessor :broken_sentences

  # Splits the tagged sentence at coordinating conjunctions and returns an
  # Array containing one state value per resulting clause.
  def identify_sentence_state(str_with_pos_tags)
    #break the sentence at the co-ordinating conjunction
    num_conjunctions = break_at_coordinating_conjunctions(str_with_pos_tags)

    states_array = Array.new
    if(@broken_sentences == nil)
      # NOTE(review): break_at_coordinating_conjunctions always assigns
      # @broken_sentences = Array.new, so this branch appears unreachable
      # after the call above -- confirm before removing.
      states_array[0] = sentence_state(str_with_pos_tags)
    #identifying states for each of the sentence segments
    else
      for i in (0..num_conjunctions)
        if(!@broken_sentences[i].nil?)
          states_array[i] = sentence_state(@broken_sentences[i])
        end
      end
    end
    return states_array
  end #end of the methods
  #------------------------------------------#------------------------------------------

  # Splits a tagged sentence into segments at coordinating conjunctions
  # (tokens tagged "CC"). Each segment is stored, word-only (tags stripped),
  # in @broken_sentences; the conjunction itself starts the next segment.
  # Returns the number of segments written.
  def break_at_coordinating_conjunctions(str_with_pos_tags)
    st = str_with_pos_tags.split(" ")
    count = st.length
    counter = 0

    @broken_sentences = Array.new
    #if the sentence contains a co-ordinating conjunction
    if(str_with_pos_tags.include?("CC"))
      counter = 0
      temp = ""
      for i in (0..count-1)
        ps = st[i]
        if(!ps.nil? and ps.include?("CC"))
          @broken_sentences[counter] = temp #for "run/NN on/IN..."
          counter+=1
          # NOTE(review): ps[0..ps.index("/")] keeps the trailing "/" and
          # assumes every token contains "/" -- confirm tokens are always tagged.
          temp = ps[0..ps.index("/")]
          #the CC or IN goes as part of the following sentence
        elsif (!ps.nil? and !ps.include?("CC"))
          temp = temp +" "+ ps[0..ps.index("/")]
        end
      end
      if(!temp.empty?) #setting the last sentence segment
        @broken_sentences[counter] = temp
        counter+=1
      end
    else
      # No conjunction: the whole tagged string is the single segment.
      @broken_sentences[counter] = str_with_pos_tags
      counter+=1
    end
    return counter
  end #end of the method
  #------------------------------------------#------------------------------------------

  # Runs a small state machine over the clause tokens and returns the final
  # state: POSITIVE, SUGGESTIVE, or NEGATED (all negative sub-states are
  # collapsed to NEGATED at the end).
  def sentence_state(str_with_pos_tags)
    state = POSITIVE
    #checking single tokens for negated words
    st = str_with_pos_tags.split(" ")
    count = st.length
    tokens = Array.new         # words with tags and punctuation stripped
    tagged_tokens = Array.new  # original "word/TAG" tokens
    i = 0
    interim_noun_verb = false #0 indicates no interim nouns or verbs

    #fetching all the tokens
    for k in (0..st.length-1)
      ps = st[k]
      #setting the tagged string
      tagged_tokens[i] = ps
      if(ps.include?("/"))
        ps = ps[0..ps.index("/")-1]
      end
      #removing punctuations
      if(ps.include?("."))
        tokens[i] = ps[0..ps.index(".")-1]
      elsif(ps.include?(","))
        tokens[i] = ps.gsub(",", "")
      elsif(ps.include?("!"))
        tokens[i] = ps.gsub("!", "")
      elsif(ps.include?(";"))
        tokens[i] = ps.gsub(";", "")
      else
        tokens[i] = ps
        # NOTE(review): i is only advanced in this branch, so a token that
        # contained punctuation is overwritten by the next token -- confirm
        # this is intended.
        i+=1
      end
    end#end of the for loop

    #iterating through the tokens to determine state
    prev_negative_word =""
    for j in (0..i-1)
      # Classify the current token (or 2-gram) into a returned_type.
      #checking for negated words
      if(is_negative_word(tokens[j]) == NEGATED)
        returned_type = NEGATIVE_WORD
      #checking for a negative descriptor (indirect indicators of negation)
      elsif(is_negative_descriptor(tokens[j]) == NEGATED)
        returned_type = NEGATIVE_DESCRIPTOR
      #2-gram phrases of negative phrases
      elsif(j+1 < count && !tokens[j].nil? && !tokens[j+1].nil? &&
          is_negative_phrase(tokens[j]+" "+tokens[j+1]) == NEGATED)
        returned_type = NEGATIVE_PHRASE
        # NOTE(review): assigning to the loop variable of a Ruby for-over-range
        # does NOT skip the next iteration; the second token of the phrase is
        # re-examined on the next pass. Confirm before "fixing".
        j = j+1
      #if suggestion word is found
      elsif(is_suggestive(tokens[j]) == SUGGESTIVE)
        returned_type = SUGGESTIVE
      #2-gram phrases suggestion phrases
      elsif(j+1 < count && !tokens[j].nil? && !tokens[j+1].nil? &&
          is_suggestive_phrase(tokens[j]+" "+tokens[j+1]) == SUGGESTIVE)
        returned_type = SUGGESTIVE
        j = j+1 # NOTE(review): same no-op skip as above
      #else set to positive
      else
        returned_type = POSITIVE
      end

      #----------------------------------------------------------------------
      #comparing 'returnedType' with the existing STATE of the sentence clause
      #if present state is negative and an interim non-negative or non-suggestive word was found, set the flag to true
      if((state == NEGATIVE_WORD or state == NEGATIVE_DESCRIPTOR or state == NEGATIVE_PHRASE) and returned_type == POSITIVE)
        if(interim_noun_verb == false and (tagged_tokens[j].include?("NN") or tagged_tokens[j].include?("PR") or tagged_tokens[j].include?("VB") or tagged_tokens[j].include?("MD")))
          interim_noun_verb = true
        end
      end

      if(state == POSITIVE and returned_type != POSITIVE)
        state = returned_type
      #when state is a negative word
      elsif(state == NEGATIVE_WORD) #previous state
        if(returned_type == NEGATIVE_WORD)
          #these words embellish the negation, so only if the previous word was not one of them you make it positive
          if(prev_negative_word.casecmp("NO") != 0 and prev_negative_word.casecmp("NEVER") != 0 and prev_negative_word.casecmp("NONE") != 0)
            state = POSITIVE #e.g: "not had no work..", "doesn't have no work..", "its not that it doesn't bother me..."
          else
            state = NEGATIVE_WORD #e.g: "no it doesn't help", "no there is no use for ..."
          end
          interim_noun_verb = false #resetting
        elsif(returned_type == NEGATIVE_DESCRIPTOR or returned_type == NEGATIVE_PHRASE)
          state = POSITIVE #e.g.: "not bad", "not taken from", "I don't want nothing", "no code duplication"
          interim_noun_verb = false #resetting
        elsif(returned_type == SUGGESTIVE)
          #e.g. " it is not too useful as people could..."
          if(interim_noun_verb == true) #there are some words in between
            state = NEGATIVE_WORD
          else
            state = SUGGESTIVE #e.g.:"I do not(-) suggest(S) ..."
          end
          interim_noun_verb = false #resetting
        end
      #when state is a negative descriptor
      elsif(state == NEGATIVE_DESCRIPTOR)
        if(returned_type == NEGATIVE_WORD)
          if(interim_noun_verb == true)#there are some words in between
            state = NEGATIVE_WORD #e.g: "hard(-) to understand none(-) of the comments"
          else
            state = POSITIVE #e.g."He hardly not...."
          end
          interim_noun_verb = false #resetting
        elsif(returned_type == NEGATIVE_DESCRIPTOR)
          if(interim_noun_verb == true)#there are some words in between
            state = NEGATIVE_DESCRIPTOR #e.g:"there is barely any code duplication"
          else
            state = POSITIVE #e.g."It is hardly confusing.."
          end
          interim_noun_verb = false #resetting
        elsif(returned_type == NEGATIVE_PHRASE)
          if(interim_noun_verb == true)#there are some words in between
            state = NEGATIVE_PHRASE
          else
            state = POSITIVE #e.g.:"it is hard and appears to be taken from"
          end
          interim_noun_verb = false #resetting
        elsif(returned_type == SUGGESTIVE)
          state = SUGGESTIVE #e.g.:"I hardly(-) suggested(S) ..."
          interim_noun_verb = false #resetting
        end
      #when state is a negative phrase
      elsif(state == NEGATIVE_PHRASE)
        if(returned_type == NEGATIVE_WORD)
          if(interim_noun_verb == true)#there are some words in between
            state = NEGATIVE_WORD #e.g."It is too short the text and doesn't"
          else
            state = POSITIVE #e.g."It is too short not to contain.."
          end
          interim_noun_verb = false #resetting
        elsif(returned_type == NEGATIVE_DESCRIPTOR)
          state = NEGATIVE_DESCRIPTOR #e.g."It is too short barely covering..."
          interim_noun_verb = false #resetting
        elsif(returned_type == NEGATIVE_PHRASE)
          state = NEGATIVE_PHRASE #e.g.:"it is too short, taken from ..."
          interim_noun_verb = false #resetting
        elsif(returned_type == SUGGESTIVE)
          state = SUGGESTIVE #e.g.:"I too short and I suggest ..."
          interim_noun_verb = false #resetting
        end
      #when state is suggestive
      elsif(state == SUGGESTIVE) #e.g.:"I might(S) not(-) suggest(S) ..."
        if(returned_type == NEGATIVE_DESCRIPTOR)
          state = NEGATIVE_DESCRIPTOR
        elsif(returned_type == NEGATIVE_PHRASE)
          state = NEGATIVE_PHRASE
        end
        #e.g.:"I suggest you don't.." -> suggestive
        interim_noun_verb = false #resetting
      end

      #setting the prevNegativeWord
      if(tokens[j].casecmp("NO") == 0 or tokens[j].casecmp("NEVER") == 0 or tokens[j].casecmp("NONE") == 0)
        prev_negative_word = tokens[j]
      end

    end #end of for loop

    # Collapse all negative sub-states into the single NEGATED value.
    if(state == NEGATIVE_DESCRIPTOR or state == NEGATIVE_WORD or state == NEGATIVE_PHRASE)
      state = NEGATED
    end

    return state
  end

  #------------------------------------------#------------------------------------------

  # Returns NEGATED if the word matches an entry in NEGATED_WORDS
  # (case-insensitive), POSITIVE otherwise.
  def is_negative_word(word)
    not_negated = POSITIVE
    for i in (0..NEGATED_WORDS.length - 1)
      if(word.casecmp(NEGATED_WORDS[i]) == 0)
        not_negated = NEGATED #indicates negation found
        break
      end
    end
    return not_negated
  end
  #------------------------------------------#------------------------------------------

  # Returns NEGATED if the word matches an entry in NEGATIVE_DESCRIPTORS
  # (case-insensitive), POSITIVE otherwise.
  def is_negative_descriptor(word)
    not_negated = POSITIVE
    for i in (0..NEGATIVE_DESCRIPTORS.length - 1)
      if(word.casecmp(NEGATIVE_DESCRIPTORS[i]) == 0)
        not_negated = NEGATED #indicates negation found
        break
      end
    end
    return not_negated
  end

  #------------------------------------------#------------------------------------------

  # Returns NEGATED if the 2-gram phrase matches an entry in NEGATIVE_PHRASES
  # (case-insensitive), POSITIVE otherwise.
  def is_negative_phrase(phrase)
    not_negated = POSITIVE
    for i in (0..NEGATIVE_PHRASES.length - 1)
      if(phrase.casecmp(NEGATIVE_PHRASES[i]) == 0)
        not_negated = NEGATED #indicates negation found
        break
      end
    end
    return not_negated
  end

  #------------------------------------------#------------------------------------------
  # Returns SUGGESTIVE if the word matches an entry in SUGGESTIVE_WORDS
  # (case-insensitive), POSITIVE otherwise.
  def is_suggestive(word)
    not_suggestive = POSITIVE
    for i in (0..SUGGESTIVE_WORDS.length - 1)
      if(word.casecmp(SUGGESTIVE_WORDS[i]) == 0)
        not_suggestive = SUGGESTIVE #indicates suggestion found
        break
      end
    end
    return not_suggestive
  end
  #------------------------------------------#------------------------------------------

  # Returns SUGGESTIVE if the 2-gram phrase matches an entry in
  # SUGGESTIVE_PHRASES (case-insensitive), POSITIVE otherwise.
  def is_suggestive_phrase(phrase)
    not_suggestive = POSITIVE
    for i in (0..SUGGESTIVE_PHRASES.length - 1)
      if(phrase.casecmp(SUGGESTIVE_PHRASES[i]) == 0)
        not_suggestive = SUGGESTIVE #indicates suggestion found
        break
      end
    end
    return not_suggestive
  end

end #end of the class
|
@@ -0,0 +1,342 @@
|
|
1
|
+
require 'automated_metareview/constants'
|
2
|
+
require 'automated_metareview/edge'
|
3
|
+
require 'automated_metareview/vertex'
|
4
|
+
|
5
|
+
class TextPreprocessing
|
6
|
+
|
7
|
+
=begin
|
8
|
+
Fetching review data from the tables based on the response_map id
|
9
|
+
=end
|
10
|
+
  # Fetches the review comments for a response map.
  #
  # auto_metareview - object that receives the found response (writes its
  #                   #responses and #response_id attributes)
  # map_id          - id of the ResponseMap whose latest response is wanted
  #
  # Returns an Array of non-blank comment strings from the latest response's
  # scores.
  #
  # NOTE(review): uses Rails 2-style Response.find(:first, :conditions ...);
  # assumes a matching Response row exists (responses.id raises otherwise).
  def fetch_review_data(auto_metareview, map_id)
    reviews = Array.new
    # Latest response for this map (ordered by updated_at DESC).
    responses = Response.find(:first, :conditions => ["map_id = ?", map_id], :order => "updated_at DESC")
    auto_metareview.responses = responses
    auto_metareview.response_id = responses.id
    responses.scores.each{
      | review_score |
      # Keep only comments that are present and not just whitespace.
      if(review_score.comments != nil and !review_score.comments.rstrip.empty?)
        reviews << review_score.comments
      end
    }
    return reviews
  end
|
26
|
+
#------------------------------------------#------------------------------------------#------------------------------------------
|
27
|
+
=begin
|
28
|
+
Fetching submission data from the url submitted by the reviewee
|
29
|
+
=end
|
30
|
+
  # Fetches the submission text reviewed under a response map by following
  # the reviewee's submitted hyperlink and scraping the page.
  #
  # map_id - id of the ResponseMap identifying reviewee and assignment
  #
  # Returns an Array of text fragments: all <p> texts, plus (for Google
  # Docs pages) text embedded in <script> tags.
  #
  # NOTE(review): performs network I/O via open(url); assumes the
  # submitted_hyperlinks string contains at least one "http" occurrence.
  def fetch_submission_data(map_id)
    subm_array = Array.new
    response_map = ResponseMap.find(:first, :conditions => ["id = ?", map_id])
    reviewee_id = response_map.reviewee_id
    reviewed_object = response_map.reviewed_object_id
    url = Participant.find(:first, :conditions => ["id = ?", reviewee_id]).submitted_hyperlinks
    if(url.nil?)#in case of team assignments
      # reviewee_id is a team id here; look up each member's participant row.
      teams_users = TeamsUser.find(:all, :conditions => ["team_id = ?", reviewee_id])
      teams_users.each{
        |team_user|
        url = Participant.find(:first, :conditions => ["user_id = ? and parent_id = ?", team_user.user_id, reviewed_object]).submitted_hyperlinks
        if(!url.nil?)#break out when you find the url
          break
        end
      }
    end
    # Take the LAST url in the hyperlinks string ("rindex" finds the last
    # occurrence -- useful when multiple urls were submitted); drop the
    # trailing serialization characters.
    url = url[url.rindex("http")..url.length-2]
    page = Nokogiri::HTML(open(url))
    #fetching the paragraph texts from the specified url
    if(page.css('p').count != 0)
      page.css('p').each do |subm|
        subm_array << subm.text
      end
    end
    #for google docs where the text is placed inside <script></script> tags
    if(page.css('script').count != 0)
      page.css('script').each do |subm|
        # The "s":" marker indicates the beginning of document text in the
        # script payload; \n"}, marks its end.
        if(!subm.children[0].to_s.index("\"s\":\"").nil? and !subm.children[0].to_s.index("\\n\"},").nil?)
          subm_array << subm.children[0].to_s[subm.children[0].to_s.index("\"s\":\"")+5, subm.children[0].to_s.index("\\n\"},")]
        end
      end
    end
    return subm_array
  end
|
68
|
+
#------------------------------------------#------------------------------------------#------------------------------------------
|
69
|
+
=begin
|
70
|
+
pre-processes the review text and sends it in for graph formation and further analysis
|
71
|
+
=end
|
72
|
+
def segment_text(flag, text_array)
|
73
|
+
if(flag == 0)
|
74
|
+
reviews = Array.new(1){Array.new}
|
75
|
+
else
|
76
|
+
reviews = Array.new(50){Array.new} #50 is the number of different reviews/submissions
|
77
|
+
end
|
78
|
+
|
79
|
+
i = 0
|
80
|
+
j = 0
|
81
|
+
|
82
|
+
for k in (0..text_array.length-1)
|
83
|
+
text = text_array[k]
|
84
|
+
if(flag == 1) #reset i (the sentence counter) to 0 for test reviews
|
85
|
+
reviews[j] = Array.new #initializing the array for sentences in a test review
|
86
|
+
i = 0
|
87
|
+
end
|
88
|
+
|
89
|
+
#******* Pre-processing the review/submission text **********
|
90
|
+
#replacing commas in large numbers, makes parsing sentences with commas confusing!
|
91
|
+
#replacing quotation marks
|
92
|
+
text.gsub!("\"", "")
|
93
|
+
text.gsub!("(", "")
|
94
|
+
text.gsub!(")", "")
|
95
|
+
if(text.include?("http://"))
|
96
|
+
text = remove_urls(text)
|
97
|
+
end
|
98
|
+
#break the text into multiple sentences
|
99
|
+
beginn = 0
|
100
|
+
if(text.include?(".") or text.include?("?") or text.include?("!") or text.include?(",") or text.include?(";") ) #new clause or sentence
|
101
|
+
while(text.include?(".") or text.include?("?") or text.include?("!") or text.include?(",") or text.include?(";")) do #the text contains more than 1 sentence
|
102
|
+
endd = 0
|
103
|
+
#these 'if' conditions have to be independent, cause the value of 'endd' could change for the different types of punctuations
|
104
|
+
if(text.include?("."))
|
105
|
+
endd = text.index(".")
|
106
|
+
end
|
107
|
+
if((text.include?("?") and endd != 0 and endd > text.index("?")) or (text.include?("?") and endd == 0))#if a ? occurs before a .
|
108
|
+
endd = text.index("?")
|
109
|
+
end
|
110
|
+
if((text.include?("!") and endd!= 0 and endd > text.index("!")) or (text.include?("!") and endd ==0))#if an ! occurs before a . or a ?
|
111
|
+
endd = text.index("!")
|
112
|
+
end
|
113
|
+
if((text.include?(",") and endd != 0 and endd > text.index(",")) or (text.include?(",") and endd == 0)) #if a , occurs before any of . or ? or !
|
114
|
+
endd = text.index(",")
|
115
|
+
end
|
116
|
+
if((text.include?(";") and endd != 0 and endd > text.index(";")) or (text.include?(";") and endd == 0)) #if a ; occurs before any of . or ?, ! or ,
|
117
|
+
endd = text.index(";")
|
118
|
+
end
|
119
|
+
|
120
|
+
#check if the string between two commas or punctuations is there to buy time e.g. ", say," ",however," ", for instance, "...
|
121
|
+
if(flag == 0) #training
|
122
|
+
reviews[0][i] = text[beginn..endd].strip
|
123
|
+
else #testing
|
124
|
+
reviews[j][i] = text[beginn..endd].strip
|
125
|
+
end
|
126
|
+
i+=1 #incrementing the sentence counter
|
127
|
+
text = text[(endd+1)..text.length] #from end+1 to the end of the string variable
|
128
|
+
end #end of the while loop
|
129
|
+
else #if there is only 1 sentence in the text
|
130
|
+
if(flag == 0)#training
|
131
|
+
reviews[0][i] = text.strip
|
132
|
+
i+=1 #incrementing the sentence counter
|
133
|
+
else #testing
|
134
|
+
reviews[j][i] = text.strip
|
135
|
+
end
|
136
|
+
end
|
137
|
+
|
138
|
+
if(flag == 1)#incrementing reviews counter only for test reviews
|
139
|
+
j+=1
|
140
|
+
end
|
141
|
+
end #end of the for loop with 'k' reading text rows
|
142
|
+
|
143
|
+
#setting the number of reviews before returning
|
144
|
+
if(flag == 0)#training
|
145
|
+
num_reviews = 1 #for training the number of reviews is 1
|
146
|
+
else #testing
|
147
|
+
num_reviews = j
|
148
|
+
end
|
149
|
+
|
150
|
+
if(flag == 0)
|
151
|
+
return reviews[0]
|
152
|
+
end
|
153
|
+
end
|
154
|
+
#------------------------------------------#------------------------------------------#------------------------------------------
|
155
|
+
=begin
|
156
|
+
* Reads the patterns from the csv file containing them.
|
157
|
+
* maxValue is the maximum value of the patterns found
|
158
|
+
=end
|
159
|
+
|
160
|
+
def read_patterns(filename, pos)
|
161
|
+
num = 1000 #some large number
|
162
|
+
patterns = Array.new
|
163
|
+
state = POSITIVE
|
164
|
+
i = 0 #keeps track of the number of edges
|
165
|
+
|
166
|
+
#setting the state for problem detection and suggestive patterns
|
167
|
+
if(filename.include?("prob"))
|
168
|
+
state = NEGATED
|
169
|
+
elsif(filename.include?("suggest"))
|
170
|
+
state = SUGGESTIVE
|
171
|
+
end
|
172
|
+
|
173
|
+
FasterCSV.foreach(filename) do |text|
|
174
|
+
in_vertex = text[0][0..text[0].index("=")-1].strip
|
175
|
+
out_vertex = text[0][text[0].index("=")+2..text[0].length].strip
|
176
|
+
|
177
|
+
first_string_in_vertex = pos.get_readable(in_vertex.split(" ")[0]) #getting the first token in vertex to determine POS
|
178
|
+
first_string_out_vertex = pos.get_readable(out_vertex.split(" ")[0]) #getting the first token in vertex to determine POS
|
179
|
+
|
180
|
+
patterns[i] = Edge.new("noun", NOUN)
|
181
|
+
#setting the invertex
|
182
|
+
if(first_string_in_vertex.include?("/NN") or first_string_in_vertex.include?("/PRP") or first_string_in_vertex.include?("/IN") or first_string_in_vertex.include?("/EX") or first_string_in_vertex.include?("/WP"))
|
183
|
+
patterns[i].in_vertex = Vertex.new(in_vertex, NOUN, i, state, nil, nil, first_string_in_vertex[first_string_in_vertex.index("/")+1..first_string_in_vertex.length])
|
184
|
+
elsif(first_string_in_vertex.include?("/VB") or first_string_in_vertex.include?("MD"))
|
185
|
+
patterns[i].in_vertex = Vertex.new(in_vertex, VERB, i, state, nil, nil, first_string_in_vertex[first_string_in_vertex.index("/")+1..first_string_in_vertex.length])
|
186
|
+
elsif(first_string_in_vertex.include?("JJ"))
|
187
|
+
patterns[i].in_vertex = Vertex.new(in_vertex, ADJ, i, state, nil, nil, first_string_in_vertex[first_string_in_vertex.index("/")+1..first_string_in_vertex.length])
|
188
|
+
elsif(first_string_in_vertex.include?("/RB"))
|
189
|
+
patterns[i].in_vertex = Vertex.new(in_vertex, ADV, i, state, nil, nil, first_string_in_vertex[first_string_in_vertex.index("/")+1..first_string_in_vertex.length])
|
190
|
+
else #default to noun
|
191
|
+
patterns[i].in_vertex = Vertex.new(in_vertex, NOUN, i, state, nil, nil, first_string_in_vertex[first_string_in_vertex.index("/")+1..first_string_in_vertex.length])
|
192
|
+
end
|
193
|
+
|
194
|
+
#setting outvertex
|
195
|
+
if(first_string_out_vertex.include?("/NN") or first_string_out_vertex.include?("/PRP") or first_string_out_vertex.include?("/IN") or first_string_out_vertex.include?("/EX") or first_string_out_vertex.include?("/WP"))
|
196
|
+
patterns[i].out_vertex = Vertex.new(out_vertex, NOUN, i, state, nil, nil, first_string_out_vertex[first_string_out_vertex.index("/")+1..first_string_out_vertex.length])
|
197
|
+
elsif(first_string_out_vertex.include?("/VB") or first_string_out_vertex.include?("MD"))
|
198
|
+
patterns[i].out_vertex = Vertex.new(out_vertex, VERB, i, state, nil, nil, first_string_out_vertex[first_string_out_vertex.index("/")+1..first_string_out_vertex.length])
|
199
|
+
elsif(first_string_out_vertex.include?("JJ"))
|
200
|
+
patterns[i].out_vertex = Vertex.new(out_vertex, ADJ, i, state, nil, nil, first_string_out_vertex[first_string_out_vertex.index("/")+1..first_string_out_vertex.length-1]);
|
201
|
+
elsif(first_string_out_vertex.include?("/RB"))
|
202
|
+
patterns[i].out_vertex = Vertex.new(out_vertex, ADV, i, state, nil, nil, first_string_out_vertex[first_string_out_vertex.index("/")+1..first_string_out_vertex.length])
|
203
|
+
else #default is noun
|
204
|
+
patterns[i].out_vertex = Vertex.new(out_vertex, NOUN, i, state, nil, nil, first_string_out_vertex[first_string_out_vertex.index("/")+1..first_string_out_vertex.length])
|
205
|
+
end
|
206
|
+
i+=1 #incrementing for each pattern
|
207
|
+
end #end of the FasterCSV.foreach loop
|
208
|
+
num_patterns = i
|
209
|
+
return patterns
|
210
|
+
end
|
211
|
+
|
212
|
+
#------------------------------------------#------------------------------------------#------------------------------------------
|
213
|
+
|
214
|
+
=begin
|
215
|
+
Removes any urls in the text and returns the remaining text as it is
|
216
|
+
=end
|
217
|
+
def remove_urls(text)
|
218
|
+
final_text = String.new
|
219
|
+
if(text.include?("http://"))
|
220
|
+
tokens = text.split(" ")
|
221
|
+
tokens.each{
|
222
|
+
|token|
|
223
|
+
if(!token.include?("http://"))
|
224
|
+
final_text = final_text + " " + token
|
225
|
+
end
|
226
|
+
}
|
227
|
+
else
|
228
|
+
return text
|
229
|
+
end
|
230
|
+
return final_text
|
231
|
+
end
|
232
|
+
#------------------------------------------#------------------------------------------#------------------------------------------
|
233
|
+
|
234
|
+
=begin
|
235
|
+
Check for plagiarism after removing text within quotes for reviews
|
236
|
+
=end
|
237
|
+
# Removes every double-quoted segment (quotes included) from each review in
# review_text, e.g. for plagiarism checks that must ignore cited text.
#
# review_text - Array of review strings (each string may be mutated in place
#               via gsub!).
#
# Returns an Array of the same length with quoted segments removed.
def remove_text_within_quotes(review_text)
  reviews = Array.new
  review_text.each{ |row|
    text = row
    if(text.include?("\""))
      while(text.include?("\"")) do
        # scan returns an array of capture groups: [["inner"], ...].
        # NOTE: the old code used replace_text[0].to_s, which on Ruby >= 1.9
        # produces the inspect string '["inner"]' and broke the index lookup.
        quoted = text.scan(/"([^"]*)"/)
        # An unmatched quote yields no pairs; bail out instead of looping
        # forever / crashing on nil.
        break if quoted.empty?
        inner = quoted[0][0]
        # start one character earlier so the opening quote is included
        start_index = text.index(inner) - 1
        # replace the quoted segment (including both quotes) with an empty string
        text.gsub!(text[start_index..start_index + inner.length + 1], "")
      end #end of the while loop
    end
    reviews << text #text after all fully-quoted segments have been removed
  } #end of the loop over the review strings
  return reviews #array of cleaned review strings (same order as input)
end
|
265
|
+
#------------------------------------------#------------------------------------------#------------------------------------------
|
266
|
+
=begin
|
267
|
+
Looks for spelling mistakes in the text and fixes them using the raspell library available for ruby
|
268
|
+
=end
|
269
|
+
# Fixes spelling mistakes in each review using the supplied spell checker
# (an Aspell instance from the raspell library, or anything responding to
# #check(word) and #suggest(word)).
#
# review_text_array - Array of review strings.
# speller           - spell checker; #check(tok) => boolean,
#                     #suggest(tok) => Array of candidate corrections.
#
# Returns a new Array of corrected, downcased reviews; each rebuilt review
# starts with a leading space (historic behavior preserved).
def check_correct_spellings(review_text_array, speller)
  review_text_array_temp = Array.new
  #iterating through each response
  review_text_array.each{
    |review_text|
    review_tokens = review_text.split(" ")
    review_text_temp = ""
    #iterating through tokens from each response
    review_tokens.each{
      |review_tok|
      #checking the token's spelling for correctness
      if(!speller.check(review_tok))
        #suggest() may be expensive (external lookup) - call it only once
        suggestion = speller.suggest(review_tok).first
        if(!suggestion.nil?)
          review_tok = suggestion
        end
      end
      review_text_temp = review_text_temp + " " + review_tok.downcase
    }
    review_text_array_temp << review_text_temp
  }
  return review_text_array_temp
end
|
291
|
+
|
292
|
+
#------------------------------------------#------------------------------------------#------------------------------------------
|
293
|
+
=begin
|
294
|
+
Removes the first type of punctuation mark (".", ",", "?" etc.) found in "str" and returns the modified string.
|
295
|
+
=end
|
296
|
+
public #The method was throwing a "NoMethodError: private method" error when called from a different class. Hence the "public" keyword.
# Strips the FIRST matching punctuation type found in str (".", ",", "?",
# "!", ";", ":", "(", ")", "[", "]") - the elsif chain means at most one
# kind of punctuation is removed per call. str is mutated in place (gsub!).
#
# Returns str with that punctuation removed (or unchanged if none matched).
def contains_punct(str)
  if(str.include?".")
    str.gsub!(".","")
  elsif(str.include?",")
    str.gsub!(",","")
  elsif(str.include?"?")
    str.gsub!("?","")
  elsif(str.include?"!")
    str.gsub!("!","")
  elsif(str.include?";")
    #BUGFIX: was a bang-less gsub whose result was discarded, so
    #semicolons were never actually removed
    str.gsub!(";","")
  elsif(str.include?":")
    str.gsub!(":","")
  elsif(str.include?"(")
    str.gsub!("(","")
  elsif(str.include?")")
    str.gsub!(")","")
  elsif(str.include?"[")
    str.gsub!("[","")
  elsif(str.include?"]")
    str.gsub!("]","")
  end
  return str
end
|
321
|
+
|
322
|
+
# True when str contains a literal backslash-n escape sequence or a curly
# brace - markers treated as punctuation noise by the preprocessor.
def contains_punct_bool(str)
  str.include?("\\n") || str.include?("}") || str.include?("{")
end
|
329
|
+
|
330
|
+
#------------------------------------------#------------------------------------------#------------------------------------------
|
331
|
+
=begin
|
332
|
+
Checking if "str" is a punctuation mark like ".", ",", "?" etc.
|
333
|
+
=end
|
334
|
+
# True when str is exactly one of the sentence punctuation marks
# ".", ",", "?", "!", ";", ":".
def is_punct(str)
  [".", ",", "?", "!", ";", ":"].include?(str)
end
|
341
|
+
|
342
|
+
end #end of class
|