automated_metareview 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,26 @@
1
+ require 'automated_metareview/wordnet_based_similarity'
2
+ require 'automated_metareview/text_preprocessing'
3
+
4
# Measures the quantity of text in a review by counting unique,
# non-frequent tokens across all of its text segments.
class TextQuantity
  # Counts tokens that are neither "frequent" words (as decided by
  # WordnetBasedSimilarity#is_frequent_word) nor repeats of a token already
  # seen earlier in text_array (case-insensitive).
  #
  # text_array - Array of Strings, one per review segment.
  #
  # Returns the Integer number of unique, non-frequent tokens.
  def number_of_unique_tokens(text_array)
    # Downcased tokens already encountered. Exact-token membership fixes the
    # old substring check (pre_string.include?), which over-matched
    # e.g. "art" against a previously seen "part".
    seen = {}
    count = 0
    frequency_checker = WordnetBasedSimilarity.new
    text_array.each do |text|
      text = TextPreprocessing.new.contains_punct(text)
      text.split(" ").each do |token|
        key = token.downcase
        # Frequent words are never counted; repeated tokens count only once.
        count += 1 unless frequency_checker.is_frequent_word(key) || seen.key?(key)
        # Every token is remembered (frequent ones too), matching the old
        # pre_string accumulation.
        seen[key] = true
      end
    end
    count
  end
end
@@ -0,0 +1,212 @@
1
+ require 'automated_metareview/graph_generator'
2
+ require 'automated_metareview/wordnet_based_similarity'
3
+ require 'automated_metareview/constants'
4
+
5
+ class Tone
6
# Computes the overall tone of a review from the tones of its graph edges.
#
# pos_tagger / core_NLP_tagger - taggers kept for interface compatibility
#   (not used directly in this method).
# review_text  - raw review text (unused here; tone is edge-driven).
# review_graph - graph whose edges' vertices carry the tone-bearing tokens.
#
# Returns a 3-element Array [positive, negative, neutrality]: positive and
# negative are averaged edge tone strengths; neutrality is 1 when both are
# 0, else 0. Returns [-1, -1, -1] when the graph has no edges.
def identify_tone(pos_tagger, core_NLP_tagger, review_text, review_graph)
  speller = Aspell.new("en_US")
  speller.suggestion_mode = Aspell::NORMAL

  cumulative_edge_feature = Array.new
  cumulative_review_tone = [-1, -1, -1] #[-1, -1] marks "no tone seen yet"

  #extracting positive and negative words from files into arrays
  positive_file = "app/models/automated_metareview/positive-words.csv"
  negative_file = "app/models/automated_metareview/negative-words.csv"
  positive = Array.new
  negative = Array.new
  FasterCSV.foreach(positive_file) do |text|
    positive << text[0]
  end
  FasterCSV.foreach(negative_file) do |text|
    negative << text[0]
  end
  negative = negative + NEGATIVE_DESCRIPTORS

  review_edges = review_graph.edges
  return cumulative_review_tone if review_edges.nil?

  wbsim = WordnetBasedSimilarity.new
  in_feature = Array.new
  out_feature = Array.new
  review_edges.each do |edge|
    next if edge.nil?
    in_feature = get_feature_vector(edge.in_vertex, positive, negative, speller) unless edge.in_vertex.nil?
    out_feature = get_feature_vector(edge.out_vertex, positive, negative, speller) unless edge.out_vertex.nil?

    #A nil vertex contributes no tone. Folding the nil checks in here fixes
    #the old nil dereference (edge.in_vertex.name) when an edge was missing
    #one of its vertices; frequent-word vertices are excluded as before.
    in_counts  = !edge.in_vertex.nil?  && !wbsim.is_frequent_word(edge.in_vertex.name)
    out_counts = !edge.out_vertex.nil? && !wbsim.is_frequent_word(edge.out_vertex.name)
    if in_counts && out_counts
      cumulative_edge_feature[0] = (in_feature[0].to_f + out_feature[0].to_f) / 2.to_f
      cumulative_edge_feature[1] = (in_feature[1].to_f + out_feature[1].to_f) / 2.to_f
    elsif out_counts
      cumulative_edge_feature[0] = out_feature[0].to_f
      cumulative_edge_feature[1] = out_feature[1].to_f
    elsif in_counts
      cumulative_edge_feature[0] = in_feature[0].to_f
      cumulative_edge_feature[1] = in_feature[1].to_f
    else
      cumulative_edge_feature[0] = 0
      cumulative_edge_feature[1] = 0
    end

    if (cumulative_review_tone[0] == -1 && cumulative_review_tone[1] == -1) ||
       (cumulative_review_tone[0] == 0 && cumulative_review_tone[1] == 0) #not initialized yet
      cumulative_review_tone[0] = cumulative_edge_feature[0].to_f
      cumulative_review_tone[1] = cumulative_edge_feature[1].to_f
    elsif cumulative_edge_feature[0] > 0 || cumulative_edge_feature[1] > 0
      #only edges carrying some tone are averaged in, so toneless edges do
      #not wash out the net tone of the review text
      cumulative_review_tone[0] = (cumulative_review_tone[0].to_f + cumulative_edge_feature[0].to_f) / 2.to_f
      cumulative_review_tone[1] = (cumulative_review_tone[1].to_f + cumulative_edge_feature[1].to_f) / 2.to_f
    end
  end

  #neutrality flag: set when neither positive nor negative tone was found
  cumulative_review_tone[2] = (cumulative_review_tone[0] == 0 && cumulative_review_tone[1] == 0) ? 1 : 0
  cumulative_review_tone
end
90
+ #--------
91
# Builds a [positive, negative] tone feature vector for a vertex.
#
# A direct hit in the positive/negative word lists scores 1. Otherwise the
# vertex's synonyms are searched level by level (up to THRESHOLD levels)
# and a hit at distance d scores 1/d, so remoter synonyms count for less.
#
# vertex   - Vertex whose name is scored.
# positive - Array of positive sentiment words.
# negative - Array of negative sentiment words.
# speller  - Aspell instance used when stemming unknown tokens.
#
# Returns a two-element Array [positive_score, negative_score].
def get_feature_vector(vertex, positive, negative, speller)
  threshold = THRESHOLD #max synonym-search depth
  feature_vector = [0, 0] #[positive, negative]
  #synonym levels are fetched once and reused for both polarities (the old
  #version queried WordNet a second time for the negative check)
  synonym_sets = nil

  #positive polarity: direct membership first, then synonym levels
  if positive.include?(vertex.name.downcase)
    feature_vector[0] = 1
  else
    synonym_sets = get_synonyms(vertex, threshold, speller)
    distance = 1
    synonym_sets.each do |set|
      if positive.length - (positive - set).length > 0
        #1.0 / distance: float division, so a match at level 2 scores 0.5
        #instead of the 0 produced by the old integer division 1/distance
        feature_vector[0] = 1.0 / distance
        break
      end
      distance += 1 #check synonyms at the next level
    end
  end

  #negative polarity: same procedure against the negative word list
  if negative.include?(vertex.name.downcase)
    feature_vector[1] = 1
  else
    synonym_sets ||= get_synonyms(vertex, threshold, speller)
    #skip when no synonyms were found; level [0] holds only the token itself
    if !synonym_sets[1].empty?
      distance = 1
      synonym_sets.each do |set|
        if negative.length - (negative - set).length > 0
          feature_vector[1] = 1.0 / distance
          break
        end
        distance += 1
      end
    end
  end

  feature_vector
end
143
+ #--------
144
+ =begin
145
+ getSynonyms - gets synonyms for vertex - upto 'threshold' levels of synonyms
146
+ level 1 = token
147
+ level 2 = token's synonyms
148
+ ...
149
+ level 'threshold' = synonyms of tokens in threshold - 1 level
150
+ =end
151
+
152
# Collects up to 'threshold' levels of synonyms for a vertex's token.
# revSyn[0] holds the token itself; revSyn[i] holds the synonyms of the
# tokens in revSyn[i-1]. Expansion stops early when a level yields nothing.
#
# Returns the Array of per-level Arrays of synonym words.
def get_synonyms(vertex, threshold, speller)
  wbsim = WordnetBasedSimilarity.new
  if vertex.pos_tag.nil?
    pos = wbsim.determine_POS(vertex)
  else
    pos = vertex.pos_tag
  end

  revSyn = Array.new(threshold + 1) { Array.new } #one array per level
  #level 0 is the token itself; only the first word of a phrase is used
  #NOTE(review): a multi-word vertex name loses everything after its first
  #token here - confirm that is acceptable for phrase vertices
  revSyn[0] << vertex.name.downcase.split(" ")[0]
  i = 0
  while i < threshold do
    list_new = Array.new
    revSyn[i].each do |token|
      lemmas = WordNet::WordNetDB.find(token)
      if lemmas.nil?
        #fall back to the stem; find_stem_word is the method this class's
        #helper actually defines (the old call to findStemWord raised
        #NoMethodError whenever this path was taken)
        lemmas = WordNet::WordNetDB.find(wbsim.find_stem_word(token, speller))
      end
      #both lookups can miss; the old code then crashed on lemmas[0]
      next if lemmas.nil?

      #default to the first lemma; prefer one whose POS matches the vertex
      lemma = lemmas[0]
      lemmas.each do |l|
        lemma = l if l.pos.casecmp(pos) == 0
      end

      #lemmas without synsets can raise (dictionary-file quirk) - see rescue
      if !lemma.nil? && lemma != "" && !lemma.synsets.nil?
        for g in 0..lemma.synsets.length - 1
          review_lemma_synset = lemma.synsets[g]
          begin
            #"&" is the WordNet pointer symbol for the synonym relation
            rev_lemma_syns = review_lemma_synset.get_relation("&")
            for h in 0..rev_lemma_syns.length - 1
              list_new = list_new + rev_lemma_syns[h].words
            end
          rescue
            list_new = nil
          end
        end
      end
    end

    break if list_new.nil? || list_new.empty?
    i += 1 #advance to the next level
    revSyn[i] = list_new
  end
  revSyn
end
212
+ end
@@ -0,0 +1,3 @@
1
# Namespace for the automated_metareview gem.
module AutomatedMetareview
  # Gem version, following Semantic Versioning. Frozen so shared references
  # cannot mutate the version string at runtime.
  VERSION = "0.0.1".freeze
end
@@ -0,0 +1,18 @@
1
# A node in the text graph built from a review or submission.
# All attributes are readable and writable via attr_accessor.
class Vertex
  attr_accessor :name, :type, :frequency, :index, :node_id, :state, :label, :parent, :pos_tag

  # vertex_name - token/phrase this vertex represents
  # vertex_type - kind of vertex
  # index_value - position of the vertex in its graph
  # state       - negation state (vertices are not negated by default)
  # lab / par   - semantic-role label and parent (for semantic role labelling)
  # pos_tag     - part-of-speech tag for the token
  def initialize(vertex_name, vertex_type, index_value, state, lab, par, pos_tag)
    @name      = vertex_name
    @type      = vertex_type
    @frequency = 0            # occurrence count, updated externally
    @index     = index_value
    @node_id   = -1           # -1 marks an id that has not been assigned yet
    @state     = state
    @label     = lab
    @parent    = par
    @pos_tag   = pos_tag
  end
end
@@ -0,0 +1,480 @@
1
+ require 'automated_metareview/vertex'
2
+ require 'automated_metareview/constants'
3
+
4
+ class WordnetBasedSimilarity
5
+ attr_accessor :match, :count
6
+ # @@posTagger = EngTagger.new
7
# Computes a semantic-relatedness score between a review vertex and a
# submission vertex using WordNet relations.
#
# The score accumulates token-by-token: exact/stem matches score highest,
# then synonyms, antonyms, hypernyms, hyponyms, and finally overlaps
# between dictionary definitions. Negation states (vertex.state) flip a
# match to its NEG* counterpart. Frequent words are skipped.
#
# Returns the rounded average match value over all compared token pairs,
# or NOMATCH when no pair could be compared.
def compare_strings(reviewVertex, submVertex, speller)
  review = reviewVertex.name
  submission = submVertex.name
  reviewState = reviewVertex.state
  submState = submVertex.state

  @match = 0
  @count = 0

  reviewPOS = ""
  submPOS = ""

  #whole-vertex exact match. Frequent words are included on purpose:
  #excluding them returned NOMATCH for equal tokens and skewed the total.
  #State comparison uses == (not equal?) so String states compare by value,
  #consistent with check_match below.
  if review.casecmp(submission) == 0
    if reviewState == submState
      @match = @match + EXACT
    else
      @match = @match + NEGEXACT
    end
    return @match
  end

  stokRev = review.split(" ")
  #iterating through review tokens
  for i in (0..stokRev.length - 1)
    next if stokRev[i].nil?
    revToken = stokRev[i].downcase()
    #POS is determined once per vertex, not per token, so e.g. "like" keeps
    #the POS of its vertex ("like"(n) vs "would like"(v))
    if reviewPOS.empty?
      reviewPOS = determine_POS(reviewVertex).strip
    end

    #== comparison: the old equal? compared object identity and never
    #matched, so "n't" was never normalized to "not"
    revToken = "not" if revToken == "n't"

    next if is_frequent_word(revToken)

    #relations: [gloss, synonyms, hypernyms, hyponyms, antonyms]
    revStem = find_stem_word(revToken, speller)
    review_relations = get_relations_for_review_submission_tokens(revToken, revStem, reviewPOS)
    revGloss = review_relations[0]
    revSyn = review_relations[1]
    revHyper = review_relations[2]
    revHypo = review_relations[3]
    revAnt = review_relations[4]

    stokSub = submission.split(" ")
    #iterating through submission tokens
    for j in (0..stokSub.length - 1)
      #index j: the old code tested stokSub[i], which consulted the wrong
      #token (and read past stokSub's bounds when i exceeded its length)
      next if stokSub[j].nil?

      subToken = stokSub[j].downcase()
      if submPOS.empty?
        submPOS = determine_POS(submVertex).strip
      end

      subToken = "not" if subToken == "n't"

      next if is_frequent_word(subToken)

      submStem = find_stem_word(subToken, speller)
      subm_relations = get_relations_for_review_submission_tokens(subToken, submStem, submPOS)
      submGloss = subm_relations[0]
      submSyn = subm_relations[1]
      submHyper = subm_relations[2]
      submHypo = subm_relations[3]
      submAnt = subm_relations[4]

      #checks are ordered from BEST to LEAST degree of semantic relatedness
      #exact token or stem match
      if subToken.casecmp(revToken) == 0 || submStem.casecmp(revStem) == 0
        if reviewState == submState
          @match = @match + EXACT
        else
          @match = @match + NEGEXACT
        end
        @count += 1
        next #skip all remaining checks
      end
      #synonyms (scored as antonym when negation states differ)
      next if check_match(revToken, subToken, revSyn, submSyn, revStem, submStem, reviewState, submState, SYNONYM, ANTONYM)
      #antonyms (scored as synonym when negation states differ)
      next if check_match(revToken, subToken, revAnt, submAnt, revStem, submStem, reviewState, submState, ANTONYM, SYNONYM)
      #hypernyms
      next if check_match(revToken, subToken, revHyper, submHyper, revStem, submStem, reviewState, submState, HYPERNYM, NEGHYPERNYM)
      #hyponyms
      next if check_match(revToken, subToken, revHypo, submHypo, revStem, submStem, reviewState, submState, HYPONYM, NEGHYPONYM)

      #definition overlap: one token's definition mentions the other token
      #or the other token's stem
      if ((!revGloss.nil? && !revGloss[0].nil? && !subToken.nil? && !submStem.nil? && (revGloss[0].include?(subToken) || revGloss[0].include?(submStem))) ||
          (!submGloss.nil? && !submGloss[0].nil? && !revToken.nil? && !revStem.nil? && (submGloss[0].include?(revToken) || submGloss[0].include?(revStem))))
        if reviewState == submState
          @match = @match + OVERLAPDEFIN
        else
          @match = @match + NEGOVERLAPDEFIN
        end
        @count += 1
        next
      end

      #no relation found for this token pair
      @match = @match + NOMATCH
      @count += 1
    end #end of the for loop for submission tokens
  end #end of the for loop for review tokens

  if @count > 0
    return (@match.to_f / @count.to_f).round #average of the matches found
  end
  NOMATCH
end #end of compare_strings method
192
+
193
+ #------------------------------------------------------------------------------
194
+ =begin
195
+ This method fetches the synonyms, hypernyms, hyponyms and other relations for the 'token' and its stem 'stem'.
196
+ This is done for both review and submission tokens/stems.
197
+ It returns a double dimensional array, where each element is an array of synonyms, hypernyms etc.
198
+ =end
199
+
200
# Fetches WordNet relations for a token (falling back to its stem when the
# token itself has no lemma).
#
# token - downcased token string.
# stem  - the token's stem, used when the token lookup misses.
# pos   - single-letter WordNet POS tag to select the matching lemma.
#
# Returns a 5-element Array:
#   [0] definitions, [1] synonyms, [2] hypernyms, [3] hyponyms, [4] antonyms
# Each element is an Array; a relation with no hits carries nil entries.
def get_relations_for_review_submission_tokens(token, stem, pos)
  def_arr = Array.new
  syn_arr = Array.new
  hyper_arr = Array.new
  hypo_arr = Array.new
  anto_arr = Array.new

  lemmas = WordNet::WordNetDB.find(token)
  lemmas = WordNet::WordNetDB.find(stem) if lemmas.nil?

  #select the lemma matching the token's POS ("" when none matches).
  #The nil guard is new: both lookups can miss, and the old code then
  #crashed calling lemmas.each on nil.
  lemma = ""
  unless lemmas.nil?
    lemmas.each do |l|
      if l.pos == pos
        lemma = l
        break
      end
    end
  end

  #error handling for lemmas without synsets that throw errors (likely due
  #to the dictionary file in use) lives in append_relation_words
  if !lemma.nil? && lemma != "" && !lemma.synsets.nil?
    for g in 0..lemma.synsets.length - 1
      lemma_synset = lemma.synsets[g]

      #definitions: all glosses are folded into def_arr[0]
      if !lemma_synset.gloss.nil?
        if def_arr[0].nil?
          def_arr << extract_definition(lemma_synset.gloss)
        else
          def_arr[0] = def_arr[0] + " " + extract_definition(lemma_synset.gloss)
        end
      else
        def_arr << nil
      end

      #WordNet pointer symbols: "&" synonym, "@" hypernym, "~" hyponym,
      #"!" antonym (the four hand-rolled begin/rescue blocks this replaces
      #were identical except for the symbol and target array)
      append_relation_words(lemma_synset, "&", syn_arr)
      append_relation_words(lemma_synset, "@", hyper_arr)
      append_relation_words(lemma_synset, "~", hypo_arr)
      append_relation_words(lemma_synset, "!", anto_arr)
    end
  end

  [def_arr, syn_arr, hyper_arr, hypo_arr, anto_arr]
end

# Appends the words of synset's relation (identified by a WordNet pointer
# symbol) to arr. Appends nil when the relation is missing, empty, or the
# lookup raises (some dictionary entries raise on get_relation).
def append_relation_words(synset, symbol, arr)
  related = synset.get_relation(symbol)
  if !related.nil? && related.length != 0
    for h in 0..related.length - 1
      arr.concat(related[h].words)
    end
  else
    arr << nil #no synset match found for this relation type
  end
rescue
  arr << nil
end
317
+
318
+ #------------------------------------------------------------------------------
319
+ =begin
320
+ This method compares the submission and reviews' synonyms and antonyms with each others' tokens and stem values.
321
+ The instance variables 'match' and 'count' are updated accordingly.
322
+ =end
323
# Scores one relation type between a review token and a submission token.
#
# A match exists when either token (or its stem) appears in the other
# side's relation array. On a match, @match grows by match_type when the
# negation states agree and by non_match_type when they differ, and @count
# is incremented.
#
# Returns true when a match was recorded, false otherwise.
def check_match(rev_token, subm_token, rev_arr, subm_arr, rev_stem, subm_stem, rev_state, subm_state, match_type, non_match_type)
  review_side_hit = !rev_arr.nil? && (rev_arr.include?(subm_token) || rev_arr.include?(subm_stem))
  subm_side_hit = !subm_arr.nil? && (subm_arr.include?(rev_token) || subm_arr.include?(rev_stem))
  return false unless review_side_hit || subm_side_hit

  @match += (rev_state == subm_state ? match_type : non_match_type)
  @count += 1
  true
end
345
+
346
+ #------------------------------------------------------------------------------
347
+
348
+ =begin
349
+ determine_POS - method helps identify the POS tag (for the wordnet lexicon) for a certain word
350
+ =end
351
# Maps a vertex's Penn-Treebank-style POS tag onto the single-letter
# WordNet POS categories: "n" (noun), "a" (adjective), "v" (verb),
# "r" (adverb). Anything unrecognized defaults to noun.
def determine_POS(vert)
  tag = vert.pos_tag
  noun_markers = ["CD", "NN", "PR", "IN", "EX", "WP"]
  verb_markers = ["TO", "VB", "MD"]

  if noun_markers.any? { |marker| tag.include?(marker) }
    "n" #WordNet::Noun
  elsif tag.include?("JJ")
    "a" #WordNet::Adjective
  elsif verb_markers.any? { |marker| tag.include?(marker) }
    "v" #WordNet::Verb
  elsif tag.include?("RB")
    "r" #WordNet::Adverb
  else
    "n" #default: WordNet::Noun
  end
end
367
+
368
+ #------------------------------------------------------------------------------
369
+ =begin
370
+ is_frequent_word - method checks to see if the given word is a frequent word
371
+ =end
372
# Reports whether word is a "frequent" (stop-word-like) token, i.e. a
# member of FREQUENT_WORDS or CLOSED_CLASS_WORDS after stripping brackets
# and double quotes.
#
# NOTE: the gsub! calls strip those characters IN PLACE, mutating the
# caller's string as a side effect (gsub! returns nil when nothing was
# replaced, which leaves the variable untouched). Callers appear to reuse
# the cleaned token afterwards, so the mutation is preserved here.
#
# Returns true or false.
def is_frequent_word(word)
  ["(", ")", "[", "]", "\""].each { |char| word.gsub!(char, "") }

  FREQUENT_WORDS.include?(word) || CLOSED_CLASS_WORDS.include?(word)
end #end of is_frequent_word method
389
+ #------------------------------------------------------------------------------
390
+ =begin
391
+ find_stem_word - stems the word and checks if the word is correctly spelt, else it will return a correctly spelled word as suggested by spellcheck
392
+ It generated the nearest stem, since no context information is involved, the quality of the stems may not be great!
393
+ =end
394
# Stems a word and, when the stem is misspelled, repeatedly replaces it
# with Aspell's first suggestion until it passes the spell check.
# Since no context is involved, the quality of the stems may not be great.
#
# The loop is bounded so that a cycle of mutually-misspelled suggestions
# cannot hang the process (the old version could loop forever in that
# case), and the suggestion list is fetched once per round instead of
# twice.
#
# Returns the corrected stem (best effort; it may still be misspelled when
# suggestions run out or the bound is hit).
def find_stem_word(word, speller)
  correct = word.stem
  attempts = 0
  while !speller.check(correct) && attempts < 10 do
    suggestion = speller.suggest(correct).first
    break if suggestion.nil? #no correction available: keep what we have
    correct = suggestion
    attempts += 1
  end
  correct
end
408
+
409
+ #------------------------------------------------------------------------------
410
+
411
+ =begin
412
+ This method is used to extract definitions for the words (since glossed contain definitions and examples!)
413
+ glosses - string containing the gloss of the synset
414
+ =end
415
# Extracts only the definition parts of a WordNet gloss, dropping the
# quoted example sentences (glosses contain definitions AND examples).
# Gloss segments are ";"-separated; any segment containing a double quote
# is treated as an example and skipped.
#
# glosses - String containing the gloss of the synset.
#
# Returns the kept segments joined into a single String (empty when the
# gloss holds only examples).
def extract_definition(glosses)
  kept = glosses.split(";").reject { |segment| segment.include?('"') }
  kept.inject("") do |definitions, segment|
    definitions.empty? ? segment : definitions + " " + segment
  end
end
432
+ #------------------------------------------------------------------------------
433
+
434
# Counts word overlaps between two sets of definitions.
#
# Tokens (or their stems) matching case-insensitively are counted, except
# frequent words. Double quotes are blanked and each definition is cut at
# its first ";" before comparison.
#
# Changes from the previous version: the input arrays are no longer
# mutated in place; the cut at ";" is exclusive ([0...idx]), so the
# semicolon is dropped and tokens like "word;" no longer fail to match
# "word"; and the needless WordnetBasedSimilarity.new inside its own class
# is gone (is_frequent_word is called on self).
#
# def1/def2 - Arrays of definition Strings (nil entries are skipped).
# speller   - Aspell instance passed through to find_stem_word.
#
# Returns the Integer number of overlapping tokens.
def overlap(def1, def2, speller)
  num_overlap = 0
  #iterating through def1's definitions
  for i in 0..def1.length - 1
    next if def1[i].nil?
    left = def1[i].gsub("\"", " ")
    left = left[0...left.index(";")] if left.include?(";")
    #iterating through def2's definitions
    for j in 0..def2.length - 1
      next if def2[j].nil?
      right = def2[j]
      right = right[0...right.index(";")] if right.include?(";")
      left.split(" ").each do |tok1|
        tok1_stem = find_stem_word(tok1, speller)
        right.split(" ").each do |tok2|
          tok2_stem = find_stem_word(tok2, speller)
          if (tok1.downcase == tok2.downcase || tok1_stem.downcase == tok2_stem.downcase) &&
             !is_frequent_word(tok1) && !is_frequent_word(tok1_stem)
            num_overlap += 1
          end
        end #end of right-token loop
      end #end of left-token loop
    end #end of for loop for def2 - j
  end #end of for loop for def1 - i
  num_overlap
end
479
+ #------------------------------------------------------------------------------
480
+ end #end of WordnetBasedSimilarity class