automated_metareview 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,26 @@
1
+ require 'automated_metareview/wordnet_based_similarity'
2
+ require 'automated_metareview/text_preprocessing'
3
+
4
class TextQuantity
  # Counts the number of unique, meaningful (non-frequent) tokens across all
  # strings in text_array. Tokens are compared case-insensitively.
  #
  # text_array - array of review text strings
  #
  # Returns the Integer count of distinct non-frequent tokens.
  def number_of_unique_tokens(text_array)
    seen = {}  #downcased token => true; tracks tokens already counted
    count = 0  #counts the number of unique tokens
    frequency_checker = WordnetBasedSimilarity.new
    preprocessor = TextPreprocessing.new
    text_array.each{
      |text|
      cleaned = preprocessor.contains_punct(text)
      all_tokens = cleaned.split(" ")
      all_tokens.each{
        |token|
        key = token.downcase
        if(!frequency_checker.is_frequent_word(key)) #do not count frequent words
          # BUGFIX: the original used pre_string.include?(token), a substring
          # check, so "art" was treated as already seen once "part" occurred.
          # An exact-token hash lookup fixes that (and avoids the O(n^2)
          # growing-string scan).
          if(!seen.key?(key))
            count+=1
          end
          seen[key] = true
        end
      }
    }
    return count
  end
end
@@ -0,0 +1,212 @@
1
+ require 'automated_metareview/graph_generator'
2
+ require 'automated_metareview/wordnet_based_similarity'
3
+ require 'automated_metareview/constants'
4
+
5
+ class Tone
6
# Computes a tone vector [positive, negative, neutral] for a review by
# averaging the tone features of each edge in the review's graph.
# pos_tagger/core_NLP_tagger/review_text are accepted for interface
# compatibility; only review_graph is used here.
# Returns [-1, -1, -1] when the graph has no edges.
def identify_tone(pos_tagger, core_NLP_tagger, review_text, review_graph)
  speller = Aspell.new("en_US")
  speller.suggestion_mode = Aspell::NORMAL

  cumulative_edge_feature = Array.new
  #running tone; -1 marks "not yet initialized"
  #(the original also assigned Array.new first - dead code, removed)
  cumulative_review_tone = [-1, -1, -1]

  #extracting positive and negative words from files into arrays
  positive_file = "app/models/automated_metareview/positive-words.csv"
  negative_file = "app/models/automated_metareview/negative-words.csv"
  positive = Array.new
  negative = Array.new
  FasterCSV.foreach(positive_file) do |text|
    positive << text[0]
  end
  FasterCSV.foreach(negative_file) do |text|
    negative << text[0]
  end
  negative = negative + NEGATIVE_DESCRIPTORS

  review_edges = review_graph.edges
  #if the edges are nil
  if(review_edges.nil?)
    return cumulative_review_tone
  end

  wbsim = WordnetBasedSimilarity.new
  review_edges.each{
    |edge|
    if(!edge.nil?)
      # BUGFIX: reset per-edge features so an edge with a nil vertex does not
      # silently reuse the features computed for the previous edge.
      in_feature = [0, 0]
      out_feature = [0, 0]
      if(!edge.in_vertex.nil?)
        in_feature = get_feature_vector(edge.in_vertex, positive, negative, speller)
      end
      if(!edge.out_vertex.nil?)
        out_feature = get_feature_vector(edge.out_vertex, positive, negative, speller)
      end

      # BUGFIX: nil-safe frequent-word checks - the original dereferenced
      # edge.in_vertex.name here even though it could be nil (as the guards
      # above acknowledge). A nil vertex is treated like a frequent word,
      # i.e. it contributes no tone.
      in_frequent = edge.in_vertex.nil? || wbsim.is_frequent_word(edge.in_vertex.name)
      out_frequent = edge.out_vertex.nil? || wbsim.is_frequent_word(edge.out_vertex.name)

      #frequent tokens' tones are excluded from the cumulative edge tone
      if(!in_frequent and !out_frequent)
        cumulative_edge_feature[0] = (in_feature[0].to_f + out_feature[0].to_f)/2.to_f
        cumulative_edge_feature[1] = (in_feature[1].to_f + out_feature[1].to_f)/2.to_f
      elsif(in_frequent and !out_frequent)
        cumulative_edge_feature[0] = out_feature[0].to_f
        cumulative_edge_feature[1] = out_feature[1].to_f
      elsif(!in_frequent and out_frequent)
        cumulative_edge_feature[0] = in_feature[0].to_f
        cumulative_edge_feature[1] = in_feature[1].to_f
      else
        cumulative_edge_feature[0] = 0
        cumulative_edge_feature[1] = 0
      end

      if((cumulative_review_tone[0] == -1 and cumulative_review_tone[1] == -1) or
         (cumulative_review_tone[0] == 0 and cumulative_review_tone[1] == 0)) #has not been initialized as yet
        cumulative_review_tone[0] = cumulative_edge_feature[0].to_f
        cumulative_review_tone[1] = cumulative_edge_feature[1].to_f
      elsif(cumulative_edge_feature[0] > 0 or cumulative_edge_feature[1] > 0)
        #only edges carrying some tone (on either vertex) are folded into the
        #running average, else neutral edges would dilute the net review tone
        cumulative_review_tone[0] = (cumulative_review_tone[0].to_f + cumulative_edge_feature[0].to_f)/2.to_f
        cumulative_review_tone[1] = (cumulative_review_tone[1].to_f + cumulative_edge_feature[1].to_f)/2.to_f
      end
    end
  }

  #neutrality flag: set when neither positive nor negative tone was found
  if(cumulative_review_tone[0] == 0 and cumulative_review_tone[1] == 0)
    cumulative_review_tone[2] = 1
  else
    cumulative_review_tone[2] = 0
  end
  return cumulative_review_tone
end
90
+ #--------
91
# Builds a [positive, negative] tone feature vector for a vertex's token.
# A direct hit in the positive/negative word list scores 1; a hit among the
# level-d synonyms scores 1.0/d, decaying with semantic distance.
#
# vertex   - the graph vertex whose token is scored
# positive - array of positive lexicon words
# negative - array of negative lexicon words
# speller  - Aspell instance used during synonym/stem lookup
def get_feature_vector(vertex, positive, negative, speller)
  threshold = THRESHOLD #max distance at which synonyms can be searched
  feature_vector = [0, 0] #[positive score, negative score]

  #look for the presence of the token in the positive set
  if(positive.include?(vertex.name.downcase))
    feature_vector[0] = 1
  else
    #check synonym levels of the token against the positive set
    distance = 1
    synonym_sets = get_synonyms(vertex, threshold, speller) #level 0 is the token itself
    synonym_sets.each{
      |set|
      if(positive.length - (positive - set).length > 0) #non-empty intersection
        # BUGFIX: the original computed 1/distance with integer division,
        # which is 0 for every distance > 1, erasing all synonym matches.
        feature_vector[0] = 1.0/distance
        break
      end
      distance+=1 #check synonyms in the next level
    }
  end

  #repeat the same procedure with the negative set
  if(negative.include?(vertex.name.downcase))
    feature_vector[1] = 1
  else
    distance = 1
    synonym_sets = get_synonyms(vertex, threshold, speller)
    #skip when no synonyms were identified - level 0 only holds the original
    #token, which the include? above already checked
    #NOTE(review): the positive branch above has no such guard - confirm
    #whether that asymmetry is intended
    if(!synonym_sets[1].empty?)
      synonym_sets.each{
        |set|
        if(negative.length - (negative - set).length > 0)
          feature_vector[1] = 1.0/distance
          break
        end
        distance+=1
      }
    end
  end

  return feature_vector
end
143
+ #--------
144
+ =begin
145
+ getSynonyms - gets synonyms for vertex - upto 'threshold' levels of synonyms
146
+ level 1 = token
147
+ level 2 = token's synonyms
148
+ ...
149
+ level 'threshold' = synonyms of tokens in threshold - 1 level
150
+ =end
151
+
152
# get_synonyms - gets up to 'threshold' levels of synonyms for the vertex.
# revSyn[0] = [token], revSyn[d] = synonyms at distance d. Expansion stops
# early when a level yields no new synonyms.
def get_synonyms(vertex, threshold, speller)
  wbsim = WordnetBasedSimilarity.new
  if(vertex.pos_tag.nil?)
    pos = wbsim.determine_POS(vertex)
  else
    pos = vertex.pos_tag
  end

  revSyn = Array.new(threshold+1){Array.new} #synonyms for the different levels
  #only the first word is used when the vertex holds a phrase
  #NOTE(review): remaining words of a long phrase are dropped - confirm intended
  revSyn[0] << vertex.name.downcase.split(" ")[0]
  i = 0
  while i < threshold do
    list_new = Array.new
    revSyn[i].each{
      |token|
      lemmas = WordNet::WordNetDB.find(token)
      if(lemmas.nil?)
        # BUGFIX: the original called the non-existent findStemWord; the
        # method on WordnetBasedSimilarity is find_stem_word.
        lemmas = WordNet::WordNetDB.find(wbsim.find_stem_word(token, speller))
      end
      # BUGFIX: guard - the stem lookup can fail too; the original then
      # crashed on lemmas[0]
      next if(lemmas.nil?)

      #select the lemma matching the token's POS, defaulting to the first
      lemma = lemmas[0]
      lemmas.each do |l|
        if(l.pos.casecmp(pos) == 0)
          lemma = l
        end
      end

      #lemmas without synsets can raise (likely due to the dictionary file);
      #treat that as "no synonyms" for the synset instead of failing the level
      if(!lemma.nil? and lemma != "" and !lemma.synsets.nil?)
        for g in 0..lemma.synsets.length - 1
          review_lemma_synset = lemma.synsets[g]
          begin
            rev_lemma_syns = review_lemma_synset.get_relation("&")
            for h in 0..rev_lemma_syns.length - 1
              list_new = list_new + rev_lemma_syns[h].words
            end
          rescue
            # BUGFIX: the original set list_new = nil here, which made the
            # next token's concatenation raise NoMethodError; skipping the
            # bad synset preserves the synonyms gathered so far.
          end
        end
      end
    }

    if(list_new.empty?)
      break #no new synonyms at this level - stop expanding
    end
    i+=1 #level is incremented
    revSyn[i] = list_new #setting synonyms
  end
  return revSyn
end
212
+ end
@@ -0,0 +1,3 @@
1
module AutomatedMetareview
  # Gem version string; frozen so the constant cannot be mutated in place.
  VERSION = "0.0.1".freeze
end
@@ -0,0 +1,18 @@
1
class Vertex
  # A node in the review/submission text graph.
  # attr_accessor generates reader and writer methods for every attribute.
  attr_accessor :name, :type, :frequency, :index, :node_id, :state, :label, :parent, :pos_tag

  # vertex_name - token or phrase this vertex represents
  # vertex_type - the kind of vertex
  # index_value - position index of the vertex
  # state       - negation state (vertices are not negated by default)
  # lab / par   - semantic role label and parent (for semantic role labelling)
  # pos_tag     - part-of-speech tag (may be nil)
  def initialize(vertex_name, vertex_type, index_value, state, lab, par, pos_tag)
    @name      = vertex_name
    @type      = vertex_type
    @frequency = 0           #usage count, updated externally
    @index     = index_value
    @node_id   = -1          #-1 marks "id not yet assigned"
    @state     = state

    #semantic role labelling attributes
    @label  = lab
    @parent = par

    @pos_tag = pos_tag
  end
end
@@ -0,0 +1,480 @@
1
+ require 'automated_metareview/vertex'
2
+ require 'automated_metareview/constants'
3
+
4
+ class WordnetBasedSimilarity
5
+ attr_accessor :match, :count
6
+ # @@posTagger = EngTagger.new
7
# Compares a review vertex against a submission vertex token by token and
# returns a numeric relatedness score (built from the EXACT/SYNONYM/... match
# constants), averaged over all token pairs actually compared.
# Returns NOMATCH when no comparable token pairs were found.
def compare_strings(reviewVertex, submVertex, speller)
  review = reviewVertex.name
  submission = submVertex.name
  reviewState = reviewVertex.state
  submState = submVertex.state

  @match = 0
  @count = 0

  reviewPOS = ""
  submPOS = ""

  #whole-phrase exact match short-circuits the token-by-token comparison
  if(review.casecmp(submission) == 0)
    # BUGFIX: the original compared states with equal? (object identity),
    # which is false for equal-but-distinct state objects, so NEGEXACT was
    # added even for matching states; == is already used for the same check
    # later in this method.
    if(reviewState == submState)
      @match = @match + EXACT
    else
      @match = @match + NEGEXACT
    end
    return @match
  end

  stokRev = review.split(" ")
  #iterating through review tokens
  for i in (0..stokRev.length-1)
    if(stokRev[i].nil?)
      next #continue with the next token
    end
    revToken = stokRev[i].downcase
    #POS is determined once per vertex, not per token - e.g. "like" has a
    #different POS in the vertices "like"(n) and "would like"(v)
    if(reviewPOS.empty?)
      reviewPOS = determine_POS(reviewVertex).strip
    end

    # BUGFIX: equal? is identity comparison and can never match a fresh
    # string literal; == compares contents.
    if(revToken == "n't")
      revToken = "not"
    end

    #frequent words carry no signal - skip
    if(is_frequent_word(revToken))
      next
    end

    #fetch stem and wordnet relations for the review token
    revStem = find_stem_word(revToken, speller)
    review_relations = get_relations_for_review_submission_tokens(revToken, revStem, reviewPOS)
    revGloss = review_relations[0]
    revSyn = review_relations[1]
    revHyper = review_relations[2]
    revHypo = review_relations[3]
    revAnt = review_relations[4]

    stokSub = submission.split(" ")
    #iterating through submission tokens
    for j in (0..stokSub.length-1)
      # BUGFIX: the original checked stokSub[i] (the review index) here.
      if(stokSub[j].nil?)
        next
      end

      subToken = stokSub[j].downcase
      if(submPOS.empty?)
        submPOS = determine_POS(submVertex).strip
      end

      if(subToken == "n't")
        subToken = "not"
      end

      if(is_frequent_word(subToken))
        next
      end

      #fetch stem and wordnet relations for the submission token
      submStem = find_stem_word(subToken, speller)
      subm_relations = get_relations_for_review_submission_tokens(subToken, submStem, submPOS)
      submGloss = subm_relations[0]
      submSyn = subm_relations[1]
      submHyper = subm_relations[2]
      submHypo = subm_relations[3]
      submAnt = subm_relations[4]

      #------------------------------------------
      #checks are ordered from BEST to LEAST degree of semantic relatedness
      #*****exact token or stem matches
      if(subToken.casecmp(revToken) == 0 or submStem.casecmp(revStem) == 0)
        if(reviewState == submState)
          @match = @match + EXACT
        else
          @match = @match + NEGEXACT
        end
        @count+=1
        next #skip all remaining checks
      end
      #*****synonyms - check_match returns true when a match was recorded
      if(check_match(revToken, subToken, revSyn, submSyn, revStem, submStem, reviewState, submState, SYNONYM, ANTONYM))
        next
      end
      #*****antonyms
      if(check_match(revToken, subToken, revAnt, submAnt, revStem, submStem, reviewState, submState, ANTONYM, SYNONYM))
        next
      end
      #*****hypernyms
      if(check_match(revToken, subToken, revHyper, submHyper, revStem, submStem, reviewState, submState, HYPERNYM, NEGHYPERNYM))
        next
      end
      #*****hyponyms
      if(check_match(revToken, subToken, revHypo, submHypo, revStem, submStem, reviewState, submState, HYPONYM, NEGHYPONYM))
        next
      end

      #definition overlap: does either token/stem occur in the other's gloss?
      if((!revGloss.nil? and !revGloss[0].nil? and !subToken.nil? and !submStem.nil? and (revGloss[0].include?(subToken) or revGloss[0].include?(submStem))) or
         (!submGloss.nil? and !submGloss[0].nil? and !revToken.nil? and !revStem.nil? and (submGloss[0].include?(revToken) or submGloss[0].include?(revStem))))
        if(reviewState == submState)
          @match = @match + OVERLAPDEFIN
        else
          @match = @match + NEGOVERLAPDEFIN
        end
        @count+=1
        next
      end

      #no relation found for this token pair
      @match = @match + NOMATCH
      @count+=1
    end #end of the for loop for submission tokens
  end #end of the for loop for review tokens

  if(@count > 0)
    return (@match.to_f/@count.to_f).round #an average of the matches found
  end
  return NOMATCH
end #end of compare_strings method
192
+
193
+ #------------------------------------------------------------------------------
194
+ =begin
195
+ This method fetches the synonyms, hypernyms, hyponyms and other relations for the 'token' and its stem 'stem'.
196
+ This is done for both review and submission tokens/stems.
197
+ It returns a double dimensional array, where each element is an array of synonyms, hypernyms etc.
198
+ =end
199
+
200
# Fetches the gloss (definitions), synonyms, hypernyms, hyponyms and antonyms
# for 'token' (falling back to its stem when the token itself is not found in
# wordnet). Used for both review and submission tokens.
# Returns a double-dimensional array:
#   [def_arr, syn_arr, hyper_arr, hypo_arr, anto_arr]
def get_relations_for_review_submission_tokens(token, stem, pos)
  relations = Array.new
  lemmas = WordNet::WordNetDB.find(token)
  if(lemmas.nil?)
    lemmas = WordNet::WordNetDB.find(stem)
  end
  # BUGFIX: both lookups can fail; guard against iterating nil
  lemmas = Array.new if(lemmas.nil?)

  #select the lemma corresponding to the token's POS
  lemma = ""
  lemmas.each do |l|
    if(l.pos == pos)
      lemma = l
      break
    end
  end

  def_arr = Array.new
  syn_arr = Array.new
  hyper_arr = Array.new
  hypo_arr = Array.new
  anto_arr = Array.new

  #if the selected lemma is not nil or empty
  if(!lemma.nil? and lemma != "" and !lemma.synsets.nil?)
    for g in 0..lemma.synsets.length - 1
      lemma_synset = lemma.synsets[g]

      #definitions: accumulated into a single string at def_arr[0]
      if(!lemma_synset.gloss.nil?)
        if(def_arr[0].nil?)
          def_arr << extract_definition(lemma_synset.gloss)
        else
          def_arr[0] = def_arr[0] + " " + extract_definition(lemma_synset.gloss)
        end
      else
        def_arr << nil
      end

      #wordnet relation pointer symbols: "&" synonym, "@" hypernym,
      #"~" hyponym, "!" antonym - the four extractions shared identical
      #begin/rescue boilerplate, factored into append_relation_words below
      append_relation_words(lemma_synset, "&", syn_arr)
      append_relation_words(lemma_synset, "@", hyper_arr)
      append_relation_words(lemma_synset, "~", hypo_arr)
      append_relation_words(lemma_synset, "!", anto_arr)
    end #end of the for loop for g
  end #end of checking if the lemma is nil or empty

  #setting the array elements before returning the array
  relations << def_arr
  relations << syn_arr
  relations << hyper_arr
  relations << hypo_arr
  relations << anto_arr
  return relations
end

# Internal helper: appends every word of synset's 'symbol' relation to
# 'target'; appends nil when the relation is empty, missing, or raises
# (lemmas without synsets can throw - likely due to the dictionary file).
def append_relation_words(synset, symbol, target)
  begin
    related = synset.get_relation(symbol)
    if(!related.nil? and related.length != 0)
      for h in 0..related.length - 1
        target.concat(related[h].words)
      end
    else
      target << nil #no synset match for this relation type
    end
  rescue
    target << nil
  end
end
317
+
318
+ #------------------------------------------------------------------------------
319
+ =begin
320
+ This method compares the submission and reviews' synonyms and antonyms with each others' tokens and stem values.
321
+ The instance variables 'match' and 'count' are updated accordingly.
322
+ =end
323
# Compares the review/submission tokens and stems against each other's
# relation word lists (synonyms, antonyms, hypernyms, ...).
# When a match is found, @match is increased by match_type (same negation
# state) or non_match_type (different states) and @count is incremented.
# Returns true when a match was recorded, false otherwise.
def check_match(rev_token, subm_token, rev_arr, subm_arr, rev_stem, subm_stem, rev_state, subm_state, match_type, non_match_type)
  #does the review's relation list contain the submission token or stem?
  review_side_hit = !rev_arr.nil? && (rev_arr.include?(subm_token) || rev_arr.include?(subm_stem))
  #and vice versa
  subm_side_hit = !subm_arr.nil? && (subm_arr.include?(rev_token) || subm_arr.include?(rev_stem))

  return false unless review_side_hit || subm_side_hit

  if rev_state == subm_state
    @match = @match + match_type
  else
    @match = @match + non_match_type
  end
  @count+=1
  return true
end
345
+
346
+ #------------------------------------------------------------------------------
347
+
348
+ =begin
349
+ determine_POS - method helps identify the POS tag (for the wordnet lexicon) for a certain word
350
+ =end
351
# determine_POS - maps a Penn-Treebank-style POS tag on the vertex to the
# single-letter POS code used by the wordnet lexicon:
#   "n" noun, "a" adjective, "v" verb, "r" adverb.
# Unrecognized tags default to noun.
def determine_POS(vert)
  tag = vert.pos_tag
  if %w[CD NN PR IN EX WP].any? { |marker| tag.include?(marker) }
    pos = "n" #noun (also pronouns, prepositions, cardinals, existentials)
  elsif tag.include?("JJ")
    pos = "a" #adjective
  elsif %w[TO VB MD].any? { |marker| tag.include?(marker) }
    pos = "v" #verb (also to-infinitives and modals)
  elsif tag.include?("RB")
    pos = "r" #adverb
  else
    pos = "n" #default to noun
  end
  return pos
end
367
+
368
+ #------------------------------------------------------------------------------
369
+ =begin
370
+ is_frequent_word - method checks to see if the given word is a frequent word
371
+ =end
372
# is_frequent_word - true when 'word', after stripping bracket and quote
# characters, appears in the FREQUENT_WORDS or CLOSED_CLASS_WORDS lists.
# NOTE: the gsub! calls strip characters from the caller's string IN PLACE
# (gsub! returns nil when nothing matched, which is harmless here); callers
# appear to rely on this cleanup side effect, so it is preserved.
def is_frequent_word(word)
  ["(", ")", "[", "]", "\""].each do |junk_char|
    word.gsub!(junk_char, "")
  end

  FREQUENT_WORDS.include?(word) || CLOSED_CLASS_WORDS.include?(word)
end #end of is_frequent_word method
389
+ #------------------------------------------------------------------------------
390
+ =begin
391
+ find_stem_word - stems the word and checks if the word is correctly spelt, else it will return a correctly spelled word as suggested by spellcheck
392
+ It generated the nearest stem, since no context information is involved, the quality of the stems may not be great!
393
+ =end
394
# find_stem_word - stems 'word' and, when the stem is misspelled, follows
# aspell's first suggestion until a correctly-spelled word is found.
# Returns the stem or its spell-corrected replacement.
# Stemming is context-free, so the quality of the stems may not be great.
def find_stem_word(word, speller)
  stem = word.stem
  correct = stem #initializing correct to the stem word
  attempts = 0
  #checking the stem word's spelling for correctness
  while(!speller.check(correct)) do
    # BUGFIX: cache the suggestion rather than calling speller.suggest twice
    # per iteration (the original called it once for the nil test and again
    # for the assignment).
    suggestion = speller.suggest(correct).first
    if(suggestion.nil?)
      break #break out of the loop if the first correction was nil
    end
    correct = suggestion
    attempts += 1
    # BUGFIX: bound the loop - following suggestions can cycle between
    # misspelled words, which made the original loop forever.
    break if(attempts >= 10)
  end
  return correct
end #end of find_stem_word method
408
+
409
+ #------------------------------------------------------------------------------
410
+
411
+ =begin
412
+ This method is used to extract definitions for the words (since glossed contain definitions and examples!)
413
+ glosses - string containing the gloss of the synset
414
+ =end
415
# extract_definition - strips the quoted example sentences out of a wordnet
# gloss, keeping only the definition parts (glosses contain definitions and
# examples, ';'-separated, with examples wrapped in double quotes).
# glosses - string containing the gloss of the synset
# Returns the definitions joined into a single string ("" when every part
# carried an example).
def extract_definition(glosses)
  #keep only the parts that contain no quoted example text
  definition_parts = glosses.split(";").reject { |part| part.include?('"') }
  definition_parts.join(" ")
end
432
+ #------------------------------------------------------------------------------
433
+
434
# Counts overlapping non-frequent tokens (or token stems) between two arrays
# of definition strings. def1/def2 entries are cleaned IN PLACE: quotes are
# blanked and text is truncated at the first ';' - callers see the mutation.
def overlap(def1, def2, speller)
  numOverlap = 0
  #iterating through def1's definitions
  for i in 0..def1.length-1
    if(!def1[i].nil?)
      if(def1[i].include?("\""))
        def1[i].gsub!("\"", " ")
      end
      if(def1[i].include?(";"))
        def1[i] = def1[i][0..def1[i].index(";")]
      end
      #hoisted: splitting def1[i] does not depend on the inner loops
      s1 = def1[i].split(" ")
      #iterating through def2's definitions
      for j in 0..def2.length - 1
        if(!def2[j].nil?)
          if(def2[j].include?(";"))
            def2[j] = def2[j][0..def2[j].index(";")]
          end
          #hoisted out of the token loop: the original re-split def2[j] for
          #every token of s1
          s2 = def2[j].split(" ")
          s1.each do |tok1|
            tok1stem = find_stem_word(tok1, speller)
            s2.each do |tok2|
              tok2stem = find_stem_word(tok2, speller)
              # BUGFIX/cleanup: call is_frequent_word on self instead of
              # allocating a fresh WordnetBasedSimilarity per overlap() call
              # (overlap is already an instance method of this class).
              if((tok1.downcase == tok2.downcase or tok1stem.downcase == tok2stem.downcase) and
                 !is_frequent_word(tok1) and !is_frequent_word(tok1stem))
                numOverlap+=1
              end
            end #end of s2 loop
          end #end of s1 loop
        end #end of def2[j] nil check
      end #end of for loop for def2 - j
    end #end of def1[i] nil check
  end #end of for loop for def1 - i
  return numOverlap
end
479
+ #------------------------------------------------------------------------------
480
+ end #end of WordnetBasedSimilarity class