automated_metareview 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +17 -0
- data/.idea/automated_metareview.iml +91 -0
- data/.idea/encodings.xml +5 -0
- data/.idea/misc.xml +5 -0
- data/.idea/modules.xml +9 -0
- data/.idea/scopes/scope_settings.xml +5 -0
- data/.idea/vcs.xml +8 -0
- data/Gemfile +42 -0
- data/LICENSE.txt +22 -0
- data/README.md +29 -0
- data/Rakefile +1 -0
- data/automated_metareview.gemspec +27 -0
- data/lib/automated_metareview.rb +18 -0
- data/lib/automated_metareview/constants.rb +208 -0
- data/lib/automated_metareview/degree_of_relevance.rb +562 -0
- data/lib/automated_metareview/edge.rb +13 -0
- data/lib/automated_metareview/graph_generator.rb +695 -0
- data/lib/automated_metareview/negations.rb +51 -0
- data/lib/automated_metareview/negative-words.csv +4783 -0
- data/lib/automated_metareview/patterns-assess.csv +17 -0
- data/lib/automated_metareview/patterns-prob-detect.csv +22 -0
- data/lib/automated_metareview/patterns-suggest.csv +20 -0
- data/lib/automated_metareview/plagiarism_check.rb +155 -0
- data/lib/automated_metareview/positive-words.csv +2006 -0
- data/lib/automated_metareview/predict_class.rb +121 -0
- data/lib/automated_metareview/sentence_state.rb +293 -0
- data/lib/automated_metareview/text_preprocessing.rb +342 -0
- data/lib/automated_metareview/text_quantity.rb +26 -0
- data/lib/automated_metareview/tone.rb +212 -0
- data/lib/automated_metareview/version.rb +3 -0
- data/lib/automated_metareview/vertex.rb +18 -0
- data/lib/automated_metareview/wordnet_based_similarity.rb +480 -0
- metadata +158 -0
@@ -0,0 +1,121 @@
|
|
1
|
+
require 'automated_metareview/wordnet_based_similarity'
|
2
|
+
require 'automated_metareview/constants'
|
3
|
+
|
4
|
+
class PredictClass
  # Estimates how strongly a review matches each content class.
  #
  # One pattern file is read per class (via TextPreprocessing#read_patterns),
  # the review graph's edges are compared with the patterns of that class, and
  # the resulting average match is divided by 6 to normalise it.
  #
  # pos_tagger          - tagger handed through to TextPreprocessing#read_patterns.
  # core_NLP_tagger     - not used by this method; kept for caller compatibility.
  # review_text         - not used by this method; kept for caller compatibility.
  # review_graph        - object whose #edges returns the review's edges.
  # pattern_files_array - pattern file names, indexed by class.
  # num_classes         - number of classes to score.
  #
  # Returns an array of probabilities (length = num_classes).
  def predict_classes(pos_tagger, core_NLP_tagger, review_text, review_graph, pattern_files_array, num_classes)
    preprocessor = TextPreprocessing.new
    # read_patterns in TextPreprocessing reads patterns in the format 'X = Y'
    class_patterns = (0...num_classes).map do |class_index|
      preprocessor.read_patterns(pattern_files_array[class_index], pos_tagger)
    end

    wordnet = WordnetBasedSimilarity.new
    edges = review_graph.edges
    class_patterns.map do |patterns|
      # we divide the match by 6 to ensure the value is in the range of [0-1]
      compare_review_with_patterns(edges, patterns, wordnet) / 6.to_f
    end
  end

  # Scores a review's edges against the patterns of one class.
  #
  # Every non-nil edge has its average_match reset to 0 and is then given the
  # best score that compare_edges produces over all non-nil patterns.
  #
  # Returns the mean of the non-zero edge scores as a Float (0.0 when no edge
  # matched any pattern).
  def compare_review_with_patterns(single_edges, single_patterns, wordnet)
    edges = single_edges.compact
    patterns = single_patterns.compact

    # resetting the average_match values before matching with a new class
    edges.each { |edge| edge.average_match = 0 }

    class_sum = 0.0
    matched_edges = 0
    edges.each do |edge|
      edge.average_match = patterns.reduce(0) do |best, pattern|
        score = compare_edges(edge, pattern, wordnet)
        score > best ? score : best
      end

      # only edges that matched at least one pattern contribute to the average
      next unless edge.average_match != 0.0
      class_sum += edge.average_match
      matched_edges += 1
    end

    matched_edges = 1 if matched_edges == 0 # avoid dividing by zero
    class_sum / matched_edges
  end

  # Compares two edges vertex by vertex using wordnet.compare_strings.
  #
  # Two alignments are scored: in/in + out/out, and the crossed in/out + out/in.
  # Each alignment score is the mean of its two vertex comparisons.
  #
  # Returns the larger of the two alignment scores.
  def compare_edges(e1, e2, wordnet)
    checker = speller

    aligned = (wordnet.compare_strings(e1.in_vertex, e2.in_vertex, checker) +
               wordnet.compare_strings(e1.out_vertex, e2.out_vertex, checker)) / 2.to_f

    # matching in-out and out-in vertices
    crossed = (wordnet.compare_strings(e1.in_vertex, e2.out_vertex, checker) +
               wordnet.compare_strings(e1.out_vertex, e2.in_vertex, checker)) / 2.to_f

    aligned > crossed ? aligned : crossed
  end

  private

  # Returns the Aspell speller passed to wordnet.compare_strings. It is built
  # once per PredictClass instance instead of once per edge/pattern pair.
  def speller
    @speller ||= begin
      checker = Aspell.new("en_US")
      checker.suggestion_mode = Aspell::NORMAL
      checker
    end
  end
end
|
@@ -0,0 +1,293 @@
|
|
1
|
+
require 'automated_metareview/negations'
|
2
|
+
require 'automated_metareview/constants'
|
3
|
+
|
4
|
+
# Determines whether a POS-tagged sentence (tokens of the form "word/TAG") is
# positive, negated or suggestive. Relies on the constants POSITIVE, NEGATED,
# SUGGESTIVE, NEGATIVE_WORD, NEGATIVE_DESCRIPTOR, NEGATIVE_PHRASE and on the
# word lists NEGATED_WORDS, NEGATIVE_DESCRIPTORS, NEGATIVE_PHRASES,
# SUGGESTIVE_WORDS and SUGGESTIVE_PHRASES, which are defined outside this class.
class SentenceState
  # Segments produced by the last call to break_at_coordinating_conjunctions.
  attr_accessor :broken_sentences

  # Negations that keep a following negation negative instead of cancelling it.
  EMPHATIC_NEGATIONS = %w[NO NEVER NONE].freeze
  # Fragments looked for in a tagged token to detect a word standing between
  # a negation and the next negative/suggestive token.
  INTERIM_TAG_MARKERS = %w[NN PR VB MD].freeze

  # Splits the tagged sentence at its coordinating conjunctions and determines
  # the state of every segment.
  #
  # Returns an array with one state per segment, in sentence order.
  def identify_sentence_state(str_with_pos_tags)
    break_at_coordinating_conjunctions(str_with_pos_tags)
    @broken_sentences.map { |segment| sentence_state(segment) }
  end

  #------------------------------------------#------------------------------------------
  # Breaks the tagged sentence at every token tagged "/CC" and stores the
  # segments in @broken_sentences.
  #
  # Without a conjunction the sentence is stored unchanged as the only segment.
  # With one, each token is cut after its "/" (so "run/NN" is stored as "run/");
  # the conjunction itself starts the following segment. Tokens without a "/"
  # are kept whole.
  #
  # Returns the number of segments stored.
  def break_at_coordinating_conjunctions(str_with_pos_tags)
    tagged_tokens = str_with_pos_tags.split(" ")
    unless tagged_tokens.any? { |token| coordinating_conjunction?(token) }
      @broken_sentences = [str_with_pos_tags]
      return 1
    end

    @broken_sentences = []
    segment = ""
    tagged_tokens.each do |token|
      slash = token.index("/")
      word = slash ? token[0..slash] : token
      if coordinating_conjunction?(token)
        @broken_sentences << segment
        segment = word # the CC goes as part of the following segment
      else
        segment = segment + " " + word
      end
    end
    @broken_sentences << segment unless segment.empty? # the last segment
    @broken_sentences.length
  end

  #------------------------------------------#------------------------------------------
  # Determines the state of one sentence segment.
  #
  # Tokens are classified one at a time (or two at a time when a 2-gram phrase
  # matches) and folded into a running state. A matched 2-gram consumes both
  # of its words, and tokens carrying punctuation are kept after cleaning.
  #
  # Returns NEGATED when the final state is any of the negative states,
  # otherwise the final state (POSITIVE or SUGGESTIVE).
  def sentence_state(str_with_pos_tags)
    tagged_tokens, tokens = tokenize(str_with_pos_tags)

    state = POSITIVE
    interim_noun_verb = false # true once a noun/verb-like token follows a negation
    prev_negative_word = ""
    j = 0
    while j < tokens.length
      returned_type, consumed = classify_token(tokens, j)

      # a non-negative, non-suggestive word found while the state is negative
      if negative_state?(state) && returned_type == POSITIVE && !interim_noun_verb &&
         INTERIM_TAG_MARKERS.any? { |marker| tagged_tokens[j].include?(marker) }
        interim_noun_verb = true
      end

      state, interim_noun_verb = next_state(state, returned_type, interim_noun_verb, prev_negative_word)

      # remembering the last emphatic negation; it is consulted for later tokens only
      last_word = tokens[j + consumed - 1]
      prev_negative_word = last_word if emphatic_negation?(last_word)

      j += consumed
    end

    negative_state?(state) ? NEGATED : state
  end

  #------------------------------------------#------------------------------------------
  # Returns NEGATED when word equals (ignoring case) an entry of NEGATED_WORDS,
  # otherwise POSITIVE.
  def is_negative_word(word)
    NEGATED_WORDS.any? { |entry| word.casecmp(entry) == 0 } ? NEGATED : POSITIVE
  end

  #------------------------------------------#------------------------------------------
  # Returns NEGATED when word equals (ignoring case) an entry of
  # NEGATIVE_DESCRIPTORS, otherwise POSITIVE.
  def is_negative_descriptor(word)
    NEGATIVE_DESCRIPTORS.any? { |entry| word.casecmp(entry) == 0 } ? NEGATED : POSITIVE
  end

  #------------------------------------------#------------------------------------------
  # Returns NEGATED when phrase equals (ignoring case) an entry of
  # NEGATIVE_PHRASES, otherwise POSITIVE.
  def is_negative_phrase(phrase)
    NEGATIVE_PHRASES.any? { |entry| phrase.casecmp(entry) == 0 } ? NEGATED : POSITIVE
  end

  #------------------------------------------#------------------------------------------
  # Returns SUGGESTIVE when word equals (ignoring case) an entry of
  # SUGGESTIVE_WORDS, otherwise POSITIVE.
  def is_suggestive(word)
    SUGGESTIVE_WORDS.any? { |entry| word.casecmp(entry) == 0 } ? SUGGESTIVE : POSITIVE
  end

  #------------------------------------------#------------------------------------------
  # Returns SUGGESTIVE when phrase equals (ignoring case) an entry of
  # SUGGESTIVE_PHRASES, otherwise POSITIVE.
  def is_suggestive_phrase(phrase)
    SUGGESTIVE_PHRASES.any? { |entry| phrase.casecmp(entry) == 0 } ? SUGGESTIVE : POSITIVE
  end

  private

  # True when the token's tag (the text after its last "/") is exactly "CC".
  def coordinating_conjunction?(token)
    token.end_with?("/CC")
  end

  # True for the three negative states.
  def negative_state?(state)
    state == NEGATIVE_DESCRIPTOR || state == NEGATIVE_WORD || state == NEGATIVE_PHRASE
  end

  # True when word is "no", "never" or "none" in any letter case.
  def emphatic_negation?(word)
    EMPHATIC_NEGATIONS.any? { |entry| word.casecmp(entry) == 0 }
  end

  # Splits the tagged sentence into two parallel arrays: the tokens as given,
  # and the bare words with tag and punctuation removed. Tokens that are empty
  # after cleaning (e.g. a stand-alone "./.") are left out of both arrays.
  def tokenize(str_with_pos_tags)
    tagged_tokens = []
    words = []
    str_with_pos_tags.split(" ").each do |tagged|
      slash = tagged.index("/")
      word = strip_punctuation(slash ? tagged[0, slash] : tagged)
      next if word.empty?
      tagged_tokens << tagged
      words << word
    end
    [tagged_tokens, words]
  end

  # Applies the first matching rule: cut at the first ".", or remove every ",",
  # or every "!", or every ";".
  def strip_punctuation(word)
    if word.include?(".")
      word[0, word.index(".")]
    elsif word.include?(",")
      word.delete(",")
    elsif word.include?("!")
      word.delete("!")
    elsif word.include?(";")
      word.delete(";")
    else
      word
    end
  end

  # Classifies the token at index j, trying in order: negated word, negative
  # descriptor, negative 2-gram, suggestive word, suggestive 2-gram.
  #
  # Returns [type, number of tokens consumed].
  def classify_token(tokens, j)
    word = tokens[j]
    following = tokens[j + 1]
    bigram = following.nil? ? nil : word + " " + following

    if is_negative_word(word) == NEGATED
      [NEGATIVE_WORD, 1]
    elsif is_negative_descriptor(word) == NEGATED
      [NEGATIVE_DESCRIPTOR, 1]
    elsif !bigram.nil? && is_negative_phrase(bigram) == NEGATED
      [NEGATIVE_PHRASE, 2]
    elsif is_suggestive(word) == SUGGESTIVE
      [SUGGESTIVE, 1]
    elsif !bigram.nil? && is_suggestive_phrase(bigram) == SUGGESTIVE
      [SUGGESTIVE, 2]
    else
      [POSITIVE, 1]
    end
  end

  # Combines the current state with the type of the token just read.
  #
  # Returns [new state, new interim_noun_verb flag]. The flag is reset whenever
  # a negative or suggestive token is absorbed by a non-positive state.
  def next_state(state, returned_type, interim_noun_verb, prev_negative_word)
    case state
    when POSITIVE
      # the first non-positive token sets the state directly
      [returned_type, interim_noun_verb]
    when NEGATIVE_WORD
      case returned_type
      when NEGATIVE_WORD
        # e.g. "not had no work.." cancels out; "no it doesn't help" stays negative
        [emphatic_negation?(prev_negative_word) ? NEGATIVE_WORD : POSITIVE, false]
      when NEGATIVE_DESCRIPTOR, NEGATIVE_PHRASE
        [POSITIVE, false] # e.g. "not bad", "not taken from"
      when SUGGESTIVE
        # e.g. "I do not suggest ..." is suggestive unless words stand in between
        [interim_noun_verb ? NEGATIVE_WORD : SUGGESTIVE, false]
      else
        [state, interim_noun_verb]
      end
    when NEGATIVE_DESCRIPTOR
      case returned_type
      when NEGATIVE_WORD, NEGATIVE_DESCRIPTOR, NEGATIVE_PHRASE
        # adjacent negatives cancel; with words in between the later one wins
        [interim_noun_verb ? returned_type : POSITIVE, false]
      when SUGGESTIVE
        [SUGGESTIVE, false] # e.g. "I hardly suggested ..."
      else
        [state, interim_noun_verb]
      end
    when NEGATIVE_PHRASE
      case returned_type
      when NEGATIVE_WORD
        # e.g. "It is too short not to contain.." cancels out
        [interim_noun_verb ? NEGATIVE_WORD : POSITIVE, false]
      when NEGATIVE_DESCRIPTOR, NEGATIVE_PHRASE, SUGGESTIVE
        [returned_type, false]
      else
        [state, interim_noun_verb]
      end
    when SUGGESTIVE
      # e.g. "I suggest you don't.." stays suggestive
      if returned_type == NEGATIVE_DESCRIPTOR || returned_type == NEGATIVE_PHRASE
        [returned_type, false]
      else
        [state, false]
      end
    else
      [state, interim_noun_verb]
    end
  end
end #end of the class
|
@@ -0,0 +1,342 @@
|
|
1
|
+
require 'automated_metareview/constants'
|
2
|
+
require 'automated_metareview/edge'
|
3
|
+
require 'automated_metareview/vertex'
|
4
|
+
|
5
|
+
class TextPreprocessing
|
6
|
+
|
7
|
+
=begin
|
8
|
+
Fetching review data from the tables based on the response_map id
|
9
|
+
=end
|
10
|
+
def fetch_review_data(auto_metareview, map_id)
  # Looks up the most recently updated response for the given response map,
  # records it (and its id) on auto_metareview, and returns the comments of
  # its scores, leaving out comments that are nil or blank.
  latest = Response.find(:first, :conditions => ["map_id = ?", map_id], :order => "updated_at DESC")
  auto_metareview.responses = latest
  auto_metareview.response_id = latest.id

  with_comments = latest.scores.reject do |review_score|
    review_score.comments.nil? || review_score.comments.rstrip.empty?
  end
  with_comments.map { |review_score| review_score.comments }
end
|
26
|
+
#------------------------------------------#------------------------------------------#------------------------------------------
|
27
|
+
=begin
|
28
|
+
Fetching submission data from the url submitted by the reviewee
|
29
|
+
=end
|
30
|
+
def fetch_submission_data(map_id)
  # Fetches the text of the submission reviewed under the given response map.
  #
  # The reviewee's submitted hyperlink is looked up first on the participant
  # whose id is the map's reviewee_id and, failing that, on the members of the
  # team with that id (team assignments). The page behind the last "http..."
  # link is downloaded and the text of its <p> elements is collected, plus
  # text embedded in <script> elements between the markers "s":" and \n"},
  # (the layout used by Google Docs).
  #
  # Returns an array of text fragments; empty when no hyperlink is found.
  subm_array = []
  response_map = ResponseMap.find(:first, :conditions => ["id = ?", map_id])
  reviewee_id = response_map.reviewee_id
  reviewed_object = response_map.reviewed_object_id

  reviewee = Participant.find(:first, :conditions => ["id = ?", reviewee_id])
  url = reviewee && reviewee.submitted_hyperlinks
  if url.nil? # in case of team assignments
    TeamsUser.find(:all, :conditions => ["team_id = ?", reviewee_id]).each do |team_user|
      member = Participant.find(:first, :conditions => ["user_id = ? and parent_id = ?", team_user.user_id, reviewed_object])
      url = member && member.submitted_hyperlinks
      break unless url.nil? # break out when you find the url
    end
  end

  # "rindex" fetches the last occurrence - useful if there are multiple urls
  link_start = url && url.rindex("http")
  return subm_array if link_start.nil?
  url = url[link_start..url.length - 2] # the final character of the stored value is dropped

  # NOTE(review): opening a URL through Kernel#open depends on open-uri being
  # loaded elsewhere, and the URL comes from user-submitted data.
  page = Nokogiri::HTML(open(url))

  # fetching the paragraph texts from the specified url
  page.css('p').each { |paragraph| subm_array << paragraph.text }

  # for google docs where the text is placed inside <script></script> tags
  page.css('script').each do |script|
    source = script.children[0].to_s
    text_start = source.index("\"s\":\"")
    next if text_start.nil?
    text_start += 5 # skip past the opening marker itself
    text_end = source.index("\\n\"},", text_start)
    next if text_end.nil?
    subm_array << source[text_start...text_end]
  end
  subm_array
end
|
68
|
+
#------------------------------------------#------------------------------------------#------------------------------------------
|
69
|
+
=begin
|
70
|
+
pre-processes the review text and sends it in for graph formation and further analysis
|
71
|
+
=end
|
72
|
+
def segment_text(flag, text_array)
  # Pre-processes review/submission texts and splits them into clauses.
  #
  # Double quotes and parentheses are removed from each text IN PLACE (the
  # caller's strings are modified), URLs are removed through remove_urls, and
  # the text is cut after every ".", "?", "!", "," or ";". Each clause is
  # stripped of surrounding whitespace. Text after the last delimiter is kept
  # as a final clause when it is not blank; a text without any delimiter is
  # kept as a single clause.
  #
  # flag       - 0 returns one flat array with the clauses of all texts;
  #              any other value returns one array of clauses per text.
  # text_array - array of strings.
  flat_mode = (flag == 0)
  flat_sentences = []
  per_text_sentences = []

  text_array.each do |text|
    # replacing quotation marks and brackets
    text.gsub!("\"", "")
    text.gsub!("(", "")
    text.gsub!(")", "")
    text = remove_urls(text) # returns the text unchanged when it holds no url

    # break the text at the earliest delimiter, repeatedly
    sentences = []
    remaining = text
    while (cut = remaining.index(/[.?!,;]/))
      sentences << remaining[0..cut].strip
      remaining = remaining[(cut + 1)..-1]
    end

    if sentences.empty?
      sentences << remaining.strip # there is only 1 sentence in the text
    elsif !remaining.strip.empty?
      sentences << remaining.strip # trailing clause without closing punctuation
    end

    if flat_mode
      flat_sentences.concat(sentences)
    else
      per_text_sentences << sentences
    end
  end

  flat_mode ? flat_sentences : per_text_sentences
end
|
154
|
+
#------------------------------------------#------------------------------------------#------------------------------------------
|
155
|
+
=begin
|
156
|
+
* Reads the patterns from the csv file containing them.
|
157
|
+
* maxValue is the maximum value of the patterns found
|
158
|
+
=end
|
159
|
+
|
160
|
+
def read_patterns(filename, pos)
  # Reads edge patterns of the form 'X = Y' from the first column of a csv file.
  #
  # Every usable row becomes an Edge whose in_vertex is built from X and whose
  # out_vertex is built from Y. Rows that are blank, lack an "=" or have an
  # empty side are skipped.
  #
  # filename - csv file to read; a name containing "prob" gives every vertex
  #            the state NEGATED, one containing "suggest" gives SUGGESTIVE,
  #            anything else POSITIVE.
  # pos      - tagger whose get_readable(word) returns the word as "word/TAG".
  #
  # Returns the array of Edge objects, in file order.
  state = if filename.include?("prob")
            NEGATED
          elsif filename.include?("suggest")
            SUGGESTIVE
          else
            POSITIVE
          end

  # Builds a vertex for one side of a pattern; its type is chosen from the
  # tagged form of the side's first token.
  build_vertex = lambda do |label, index|
    readable = pos.get_readable(label.split(" ")[0])
    type = if ["/NN", "/PRP", "/IN", "/EX", "/WP"].any? { |tag| readable.include?(tag) }
             NOUN
           elsif readable.include?("/VB") || readable.include?("MD")
             VERB
           elsif readable.include?("JJ")
             ADJ
           elsif readable.include?("/RB")
             ADV
           else
             NOUN # default to noun
           end
    pos_tag = readable[(readable.index("/") + 1)..-1]
    Vertex.new(label, type, index, state, nil, nil, pos_tag)
  end

  patterns = []
  FasterCSV.foreach(filename) do |row|
    line = row[0]
    next if line.nil? || !line.include?("=")
    in_label, out_label = line.split("=", 2).map { |side| side.strip }
    next if in_label.empty? || out_label.empty?

    index = patterns.length # position of this pattern among those read so far
    edge = Edge.new("noun", NOUN)
    edge.in_vertex = build_vertex.call(in_label, index)
    edge.out_vertex = build_vertex.call(out_label, index)
    patterns << edge
  end
  patterns
end
|
211
|
+
|
212
|
+
#------------------------------------------#------------------------------------------#------------------------------------------
|
213
|
+
|
214
|
+
=begin
|
215
|
+
Removes any urls in the text and returns the remaining text as it is
|
216
|
+
=end
|
217
|
+
# Removes every whitespace-delimited token that contains a URL scheme
# ("http://" or "https://") from +text+.
#
# text - String to clean.
#
# Returns +text+ itself (same object, spacing untouched) when it holds no URL.
# Otherwise returns a new String in which each surviving token is prefixed by
# a single space, so the result starts with a space and runs of whitespace are
# collapsed. That format is kept for backward compatibility with callers.
def remove_urls(text)
  url_markers = ["http://", "https://"]
  return text unless url_markers.any? { |marker| text.include?(marker) }

  kept_tokens = text.split(" ").reject do |token|
    # A token is dropped as a whole if a URL scheme appears anywhere in it.
    url_markers.any? { |marker| token.include?(marker) }
  end
  kept_tokens.map { |token| " " + token }.join
end
|
232
|
+
#------------------------------------------#------------------------------------------#------------------------------------------
|
233
|
+
|
234
|
+
=begin
|
235
|
+
Check for plagiarism after removing text within quotes for reviews
|
236
|
+
=end
|
237
|
+
# Strips every double-quoted segment (the quotes included) from each review.
#
# review_text - Array of Strings, one entry per review.
#
# Returns a new Array of new Strings; the input strings are not modified.
# A lone, unmatched quote character is left in place. The previous
# scan/index/gsub! loop never terminated on such input, and on Ruby 1.9+ it
# raised because Array#to_s yields the inspect form rather than the captured
# text.
def remove_text_within_quotes(review_text)
  quoted_segment = /"[^"]*"/
  review_text.map { |row| row.gsub(quoted_segment, "") }
end
|
265
|
+
#------------------------------------------#------------------------------------------#------------------------------------------
|
266
|
+
=begin
|
267
|
+
Looks for spelling mistakes in the text and fixes them using the raspell library available for ruby
|
268
|
+
=end
|
269
|
+
# Spell-corrects and lower-cases every token of every review.
#
# review_text_array - Array of review Strings.
# speller           - object responding to +check(word)+ (truthy when the word
#                     is spelled correctly) and +suggest(word)+ (list of
#                     candidate corrections), e.g. an Aspell instance.
#
# Returns a new Array with one String per review. Each token is prefixed with
# a single space, so non-empty results start with a space. A misspelled token
# is replaced by the speller's first suggestion; if there is none, the token
# is kept as written.
def check_correct_spellings(review_text_array, speller)
  review_text_array.map do |review_text|
    corrected_tokens = review_text.split(" ").map do |token|
      unless speller.check(token)
        # Ask for suggestions only once per token; the lookup is expensive.
        best_guess = speller.suggest(token).first
        token = best_guess unless best_guess.nil?
      end
      " " + token.downcase
    end
    corrected_tokens.join
  end
end
|
291
|
+
|
292
|
+
#------------------------------------------#------------------------------------------#------------------------------------------
|
293
|
+
=begin
|
294
|
+
Strips a punctuation mark such as ".", ",", "?" etc. from "str" and returns the string.
|
295
|
+
=end
|
296
|
+
public #The method was throwing a "NoMethodError: private method" error when called from a different class. Hence the "public" keyword.
|
297
|
+
# Strips one kind of punctuation mark from +str+ and returns it.
#
# The marks are tried in a fixed order: . , ? ! ; : ( ) [ ]
# Only the FIRST kind found in +str+ is removed (all of its occurrences);
# any other kinds of punctuation stay in the string.
# NOTE(review): that precedence is inherited from the original if/elsif chain
# and kept for backward compatibility - confirm whether callers would prefer
# all kinds to be stripped.
#
# str - String to clean. It is modified in place.
#
# Returns the same String object.
def contains_punct(str)
  marks = [".", ",", "?", "!", ";", ":", "(", ")", "[", "]"]
  found = marks.find { |mark| str.include?(mark) }
  # The ";" case previously used the non-destructive gsub, so semicolons were
  # never actually removed; every mark is now handled the same way.
  str.gsub!(found, "") if found
  str
end
|
321
|
+
|
322
|
+
# Tells whether +str+ contains a literal backslash-n sequence (the two
# characters "\" and "n", not a newline) or a curly brace.
#
# Returns true or false.
def contains_punct_bool(str)
  ["\\n", "}", "{"].any? { |marker| str.include?(marker) }
end
|
329
|
+
|
330
|
+
#------------------------------------------#------------------------------------------#------------------------------------------
|
331
|
+
=begin
|
332
|
+
Checking if "str" is a punctuation mark like ".", ",", "?" etc.
|
333
|
+
=end
|
334
|
+
# Tells whether +str+ is exactly one of the punctuation marks
# ".", ",", "?", "!", ";" or ":".
#
# Returns true or false.
def is_punct(str)
  [".", ",", "?", "!", ";", ":"].any? { |mark| str == mark }
end
|
341
|
+
|
342
|
+
end #end of class
|