automated_metareview 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +17 -0
- data/.idea/automated_metareview.iml +91 -0
- data/.idea/encodings.xml +5 -0
- data/.idea/misc.xml +5 -0
- data/.idea/modules.xml +9 -0
- data/.idea/scopes/scope_settings.xml +5 -0
- data/.idea/vcs.xml +8 -0
- data/Gemfile +42 -0
- data/LICENSE.txt +22 -0
- data/README.md +29 -0
- data/Rakefile +1 -0
- data/automated_metareview.gemspec +27 -0
- data/lib/automated_metareview.rb +18 -0
- data/lib/automated_metareview/constants.rb +208 -0
- data/lib/automated_metareview/degree_of_relevance.rb +562 -0
- data/lib/automated_metareview/edge.rb +13 -0
- data/lib/automated_metareview/graph_generator.rb +695 -0
- data/lib/automated_metareview/negations.rb +51 -0
- data/lib/automated_metareview/negative-words.csv +4783 -0
- data/lib/automated_metareview/patterns-assess.csv +17 -0
- data/lib/automated_metareview/patterns-prob-detect.csv +22 -0
- data/lib/automated_metareview/patterns-suggest.csv +20 -0
- data/lib/automated_metareview/plagiarism_check.rb +155 -0
- data/lib/automated_metareview/positive-words.csv +2006 -0
- data/lib/automated_metareview/predict_class.rb +121 -0
- data/lib/automated_metareview/sentence_state.rb +293 -0
- data/lib/automated_metareview/text_preprocessing.rb +342 -0
- data/lib/automated_metareview/text_quantity.rb +26 -0
- data/lib/automated_metareview/tone.rb +212 -0
- data/lib/automated_metareview/version.rb +3 -0
- data/lib/automated_metareview/vertex.rb +18 -0
- data/lib/automated_metareview/wordnet_based_similarity.rb +480 -0
- metadata +158 -0
@@ -0,0 +1,26 @@
|
|
1
|
+
require 'automated_metareview/wordnet_based_similarity'
|
2
|
+
require 'automated_metareview/text_preprocessing'
|
3
|
+
|
4
|
+
class TextQuantity
  # Counts the number of unique, non-frequent tokens across all strings in
  # text_array.
  #
  # FIX: uniqueness used to be tested with pre_string.include?(token) - a
  # SUBSTRING check - so e.g. "art" was treated as already seen once "cart"
  # had appeared, under-counting unique tokens. Seen tokens are now tracked
  # in a hash keyed by the exact (downcased) token.
  #
  # text_array - array of strings (e.g. the sentences of a review)
  # Returns the Integer count of distinct content tokens.
  def number_of_unique_tokens(text_array)
    seen = {} #downcased token => true; tracks tokens already encountered
    count = 0 #counts the number of unique tokens
    instance = WordnetBasedSimilarity.new #supplies the frequent-word filter
    preprocessor = TextPreprocessing.new #hoisted out of the loop; NOTE(review): assumes construction is side-effect free per text
    text_array.each{
      |text|
      text = preprocessor.contains_punct(text)
      text.split(" ").each{
        |token|
        key = token.downcase
        #do not count this word if it is a frequent (stop) word
        #key.dup because is_frequent_word mutates its argument in place
        if(!instance.is_frequent_word(key.dup))
          if(!seen.has_key?(key)) #count only tokens not seen earlier
            count+=1
          end
        end
        seen[key] = true #remember the token whether or not it was counted
      }
    }
    return count
  end
end
|
@@ -0,0 +1,212 @@
|
|
1
|
+
require 'automated_metareview/graph_generator'
|
2
|
+
require 'automated_metareview/wordnet_based_similarity'
|
3
|
+
require 'automated_metareview/constants'
|
4
|
+
|
5
|
+
class Tone
|
6
|
+
# Determines the overall tone of a review from its graph representation.
# Walks every edge of review_graph, scores each edge's vertices against the
# positive/negative word lists (plus WordNet synonyms, via get_feature_vector)
# and folds the per-edge scores into a running average.
#
# Returns a 3-element array [positive, negative, neutrality]:
#   * [-1, -1, -1] unchanged when the graph has no edges,
#   * otherwise [pos, neg, 1] when both scores are 0, else [pos, neg, 0].
#
# NOTE(review): pos_tagger, core_NLP_tagger and review_text are accepted but
# never referenced in this body - confirm callers before removing them.
def identify_tone(pos_tagger, core_NLP_tagger, review_text, review_graph)
  #spell checker handed down to the synonym lookups
  speller = Aspell.new("en_US")
  speller.suggestion_mode = Aspell::NORMAL

  cumulative_edge_feature = Array.new #per-edge [positive, negative] scores
  cumulative_review_tone = Array.new #NOTE(review): dead assignment - overwritten on the next line
  cumulative_review_tone = [-1, -1, -1] #sum of all edge tones

  #extracting positive and negative words from files into arrays
  #NOTE(review): paths point at app/models/..., not this gem's lib/ layout -
  #verify they resolve in the deployed environment
  positive_file = "app/models/automated_metareview/positive-words.csv"
  negative_file = "app/models/automated_metareview/negative-words.csv"
  positive = Array.new
  negative = Array.new
  FasterCSV.foreach(positive_file) do |text|
    positive << text[0]
  end

  FasterCSV.foreach(negative_file) do |text|
    negative << text[0]
  end

  negative = negative + NEGATIVE_DESCRIPTORS #extra negative markers from constants.rb
  review_edges = review_graph.edges

  #if the edges are nil there is nothing to score; return the sentinel tone
  if(review_edges.nil?)
    return cumulative_review_tone
  end

  wbsim = WordnetBasedSimilarity.new #used only for its frequent-word test
  in_feature = Array.new
  out_feature = Array.new
  review_edges.each{
    |edge|
    if(!edge.nil?)
      #score each non-nil endpoint of the edge
      if(!edge.in_vertex.nil?)
        in_feature = get_feature_vector(edge.in_vertex, positive, negative, speller)
      end
      if(!edge.out_vertex.nil?)
        out_feature = get_feature_vector(edge.out_vertex, positive, negative, speller)
      end

      #making sure that we don't include frequent tokens' tones while calculating cumulative edge tone (both + and -)
      #NOTE(review): is_frequent_word dereferences in_vertex/out_vertex .name
      #without the nil guards used above - a nil vertex here would raise
      if(!wbsim.is_frequent_word(edge.in_vertex.name) and !wbsim.is_frequent_word(edge.out_vertex.name))
        #both vertices are content words - average their scores
        cumulative_edge_feature[0] = (in_feature[0].to_f + out_feature[0].to_f)/2.to_f
        cumulative_edge_feature[1] = (in_feature[1].to_f + out_feature[1].to_f)/2.to_f
      elsif(wbsim.is_frequent_word(edge.in_vertex.name) and !wbsim.is_frequent_word(edge.out_vertex.name))
        #only the out vertex carries tone information
        cumulative_edge_feature[0] = out_feature[0].to_f
        cumulative_edge_feature[1] = out_feature[1].to_f
      elsif(!wbsim.is_frequent_word(edge.in_vertex.name) and wbsim.is_frequent_word(edge.out_vertex.name))
        #only the in vertex carries tone information
        cumulative_edge_feature[0] = in_feature[0].to_f
        cumulative_edge_feature[1] = in_feature[1].to_f
      else
        #both vertices are frequent words - the edge is tone-neutral
        cumulative_edge_feature[0] = 0
        cumulative_edge_feature[1] = 0
      end

      if((cumulative_review_tone[0] == -1 and cumulative_review_tone[1] == -1) or
         (cumulative_review_tone[0] == 0 and cumulative_review_tone[1] == 0)) #has not been initialized as yet
        cumulative_review_tone[0] = cumulative_edge_feature[0].to_f
        cumulative_review_tone[1] = cumulative_edge_feature[1].to_f
      elsif(cumulative_edge_feature[0] > 0 or cumulative_edge_feature[1] > 0)
        #only edges with some tone (either vertices) are taken into consideration during cumulative edge calculation
        #else all edges will be considered, which may adversely affect the net tone of the review text
        cumulative_review_tone[0] = (cumulative_review_tone[0].to_f + cumulative_edge_feature[0].to_f)/2.to_f
        cumulative_review_tone[1] = (cumulative_review_tone[1].to_f + cumulative_edge_feature[1].to_f)/2.to_f
      end
    end
  }
  #zero positive and zero negative tone means the review reads as neutral
  if(cumulative_review_tone[0] == 0 and cumulative_review_tone[1] == 0)
    cumulative_review_tone[2] = 1 #setting neutrality value
  else
    cumulative_review_tone[2] = 0
  end
  return cumulative_review_tone
end
|
90
|
+
#--------
|
91
|
+
# Computes a 2-element tone feature vector [positive, negative] for a vertex.
# A direct hit in the positive/negative word list scores 1; a hit among the
# vertex's synonyms scores 1/distance, where distance is the synonym level
# (1 = direct synonym, 2 = synonym-of-synonym, ...).
#
# FIX: the synonym score used integer division (1/distance), which evaluates
# to 0 for every distance >= 2 and to 1 at distance 1 - deeper synonym
# matches were silently discarded. Now uses float division (1.0/distance).
#
# vertex   - graph vertex whose .name is scored
# positive - array of positive words
# negative - array of negative words
# speller  - Aspell instance used during stemming fallbacks
def get_feature_vector(vertex, positive, negative, speller)
  threshold = THRESHOLD #max distance at which synonyms can be searched
  feature_vector = [0, 0] #[positive, negative] scores

  #look for the presence of the token in the positive set
  if(positive.include?(vertex.name.downcase))
    feature_vector[0] = 1
  else
    #check synonyms of the token, level by level, against the positive set
    distance = 1
    synonym_sets = get_synonyms(vertex, threshold, speller) #upto 'threshold' levels of synonyms in a double dimensional array
    #NOTE(review): unlike the negative branch below, this loop runs even when
    #no synonyms were found (synonym_sets[1] empty); synonym_sets[0] holds the
    #token's first word, so a level-0 hit is possible here - kept as-is since
    #guarding it would change behavior for multi-word vertex names
    synonym_sets.each{
      |set|
      if(positive.length - (positive - set).length > 0) #any overlap between this level and the positive list?
        feature_vector[0] = 1.0/distance #FIX: float division
        break #stop at the closest matching level
      end
      distance+=1 #check synonyms in the next level
    }
  end

  #repeat the check with the negative set
  if(negative.include?(vertex.name.downcase))
    feature_vector[1] = 1
  else
    distance = 1
    synonym_sets = get_synonyms(vertex, threshold, speller)
    #skip the scan when no synonyms beyond the token itself were identified
    #([0] contains the original token, so rechecking it is avoided)
    if(!synonym_sets[1].empty?)
      synonym_sets.each{
        |set|
        if(negative.length - (negative - set).length > 0)
          feature_vector[1] = 1.0/distance #FIX: float division
          break
        end
        distance+=1
      } #end of loop for synonym sets
    end
  end #end of if condition

  return feature_vector
end
|
143
|
+
#--------
|
144
|
+
=begin
get_synonyms - gets synonyms for vertex - upto 'threshold' levels of synonyms
level 0 = the token itself
level 1 = the token's synonyms
...
level 'threshold' = synonyms of the tokens in level threshold - 1
Returns a double dimensional array: revSyn[i] is the array of words at level i.
Levels past the last one found remain empty arrays.
=end
def get_synonyms(vertex, threshold, speller)
  wbsim = WordnetBasedSimilarity.new
  #use the vertex's own POS tag when available, otherwise derive it
  if(vertex.pos_tag.nil?)
    pos = wbsim.determine_POS(vertex)
  else
    pos = vertex.pos_tag
  end

  revSyn = Array.new(threshold+1){Array.new} #contains synonyms for the different levels
  #level 0 holds the token itself; only the first word of the vertex name is
  #used - NOTE(review): multi-word phrases lose their remaining words here
  revSyn[0] << vertex.name.downcase.split(" ")[0]
  i = 0
  while i < threshold do
    list_new = Array.new #synonyms collected for level i+1
    revSyn[i].each{
      |token|
      lemmas = WordNet::WordNetDB.find(token)
      if(lemmas.nil?)
        #FIX: the helper is find_stem_word (snake_case); the previous
        #camelCase findStemWord call raised NoMethodError whenever this
        #stem-lookup fallback path was taken
        lemmas = WordNet::WordNetDB.find(wbsim.find_stem_word(token, speller))
      end
      #select the lemma corresponding to the token's POS; the first lemma is
      #the default, replaced if one with the exact POS is found
      lemma = lemmas[0]
      lemmas.each do |l|
        if(l.pos.casecmp(pos) == 0)
          lemma = l
        end
      end

      #error handling for lemmas without synsets that throw errors
      #(likely due to the dictionary file we are using)
      if(!lemma.nil? and lemma != "" and !lemma.synsets.nil?)
        for g in 0..lemma.synsets.length - 1
          review_lemma_synset = lemma.synsets[g]
          begin #error handling
            rev_lemma_syns = review_lemma_synset.get_relation("&") #"&" = synonym relation
            #for each synset add its words to this level's collection
            for h in 0..rev_lemma_syns.length - 1
              list_new = list_new + rev_lemma_syns[h].words
            end
          rescue
            list_new = nil
          end
        end
      end #end of checking if the lemma is nil or empty
    } #end of iterating through revSyn[i]'s tokens

    #stop when a level produced nothing (or a lookup error nil-ed the list)
    if(list_new.nil? or list_new.empty?)
      break
    end
    i+=1 #level is incremented
    revSyn[i] = list_new #setting synonyms for the next level
  end
  return revSyn
end
|
212
|
+
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
class Vertex
  # Readers and writers for every vertex attribute.
  attr_accessor :name, :type, :frequency, :index, :node_id, :state, :label, :parent, :pos_tag

  # Builds a text-graph vertex.
  #
  # vertex_name - the token/phrase this vertex represents
  # vertex_type - kind of vertex
  # index_value - position index of the vertex
  # state       - negation state (vertices are not negated by default)
  # lab, par    - semantic role labelling data (label and parent)
  # pos_tag     - part-of-speech tag of the vertex
  def initialize(vertex_name, vertex_type, index_value, state, lab, par, pos_tag)
    @name, @type, @index = vertex_name, vertex_type, index_value
    @frequency = 0          # occurrence count starts at zero
    @node_id = -1           # -1 flags that no id has been assigned yet
    @state = state
    @label, @parent = lab, par  # semantic role labelling
    @pos_tag = pos_tag
  end
end
|
@@ -0,0 +1,480 @@
|
|
1
|
+
require 'automated_metareview/vertex'
|
2
|
+
require 'automated_metareview/constants'
|
3
|
+
|
4
|
+
class WordnetBasedSimilarity
|
5
|
+
attr_accessor :match, :count
|
6
|
+
# @@posTagger = EngTagger.new
|
7
|
+
=begin
compare_strings - computes a semantic relatedness score between a review vertex
and a submission vertex. Token pairs are checked best-match-first for: exact
token/stem equality, synonyms, antonyms, hypernyms, hyponyms and definition
overlaps, accumulating weights from constants.rb into @match. Returns the
rounded average match value, or NOMATCH when no token pair was scored.
=end
def compare_strings(reviewVertex, submVertex, speller)
  review = reviewVertex.name
  submission = submVertex.name
  reviewState = reviewVertex.state
  submState = submVertex.state

  @match = 0
  @count = 0

  reviewPOS = ""
  submPOS = ""

  #whole-phrase exact match short-circuits the token-by-token comparison
  if(review.casecmp(submission) == 0)
    #NOTE(review): states are compared with equal? (identity) - this only
    #works if states are shared objects; confirm against sentence_state.rb
    if(reviewState.equal?(submState))
      @match = @match + EXACT
    elsif(!reviewState.equal?(submState))
      @match = @match + NEGEXACT
    end
    return @match
  end

  stokRev = review.split(" ")

  #iterating through review tokens
  for i in (0..stokRev.length-1)
    if(stokRev[i].nil?)
      next #continue with the next token
    end
    revToken = stokRev[i].downcase()
    #do not reset POS for every new token - it changes the POS of the vertex,
    #e.g. "like" has different POS for vertices "like"(n) and "would like"(v)
    if(reviewPOS.empty?)
      reviewPOS = determine_POS(reviewVertex).strip
    end

    #FIX: String#equal? tests object identity and is never true for two
    #distinct "n't" strings, so the contraction was never normalized; use ==
    if(revToken == "n't")
      revToken = "not"
    end

    #frequent (stop) words carry no content - skip them
    if(is_frequent_word(revToken))
      next
    end

    #fetching gloss, synonyms, hypernyms, hyponyms and antonyms for the review token
    revStem = find_stem_word(revToken, speller)
    review_relations = get_relations_for_review_submission_tokens(revToken, revStem, reviewPOS)
    revGloss = review_relations[0]
    revSyn = review_relations[1]
    revHyper = review_relations[2]
    revHypo = review_relations[3]
    revAnt = review_relations[4]

    stokSub = submission.split(" ")
    #iterating through submission tokens
    for j in (0..stokSub.length-1)
      #FIX: was stokSub[i] - indexing the submission tokens with the review
      #loop counter; the j-th submission token is the one being examined
      if(stokSub[j].nil?)
        next
      end

      subToken = stokSub[j].downcase()
      if(submPOS.empty?)
        submPOS = determine_POS(submVertex).strip
      end

      #FIX: == instead of identity test (see revToken above)
      if(subToken == "n't")
        subToken = "not"
      end

      if(is_frequent_word(subToken))
        next
      end

      #fetching the same relations for the submission token
      submStem = find_stem_word(subToken, speller)
      subm_relations = get_relations_for_review_submission_tokens(subToken, submStem, submPOS)
      submGloss = subm_relations[0]
      submSyn = subm_relations[1]
      submHyper = subm_relations[2]
      submHypo = subm_relations[3]
      submAnt = subm_relations[4]

      #------------------------------------------
      #checks are ordered from BEST to LEAST degree of semantic relatedness
      #*****exact token or stem matches
      if(subToken.casecmp(revToken) == 0 or submStem.casecmp(revStem) == 0)
        if(reviewState.equal?(submState))
          @match = @match + EXACT
        elsif(!reviewState.equal?(submState))
          @match = @match + NEGEXACT
        end
        @count+=1
        next #skip all remaining checks
      end
      #*****synonyms - 'true' means a match was found; remaining checks skipped
      if(check_match(revToken, subToken, revSyn, submSyn, revStem, submStem, reviewState, submState, SYNONYM, ANTONYM))
        next
      end
      #*****antonyms
      if(check_match(revToken, subToken, revAnt, submAnt, revStem, submStem, reviewState, submState, ANTONYM, SYNONYM))
        next
      end
      #*****hypernyms
      if(check_match(revToken, subToken, revHyper, submHyper, revStem, submStem, reviewState, submState, HYPERNYM, NEGHYPERNYM))
        next
      end
      #*****hyponyms
      if(check_match(revToken, subToken, revHypo, submHypo, revStem, submStem, reviewState, submState, HYPONYM, NEGHYPONYM))
        next
      end

      #overlap across definitions: does either token's gloss contain the other
      #token or that token's stem?
      if((!revGloss.nil? and !revGloss[0].nil? and !subToken.nil? and !submStem.nil? and (revGloss[0].include?(subToken) or revGloss[0].include?(submStem))) or
         (!submGloss.nil? and !submGloss[0].nil? and !revToken.nil? and !revStem.nil? and (submGloss[0].include?(revToken) or submGloss[0].include?(revStem))))
        if(reviewState == submState)
          @match = @match + OVERLAPDEFIN
        elsif(reviewState != submState)
          @match = @match + NEGOVERLAPDEFIN
        end
        @count+=1
        next
      end

      #no relation found between this token pair
      @match = @match + NOMATCH
      @count+=1
    end #end of the for loop for submission tokens
  end #end of the for loop for review tokens

  if(@count > 0)
    result = (@match.to_f/@count.to_f).round
    return result #an average of the matches found
  end
  return NOMATCH
end #end of compare_strings method
|
192
|
+
|
193
|
+
#------------------------------------------------------------------------------
|
194
|
+
=begin
This method fetches the synonyms, hypernyms, hyponyms and other relations for the 'token' and its stem 'stem'.
This is done for both review and submission tokens/stems.
It returns a double dimensional array, where each element is an array:
  [0] definitions (folded into a single string at index 0),
  [1] synonyms, [2] hypernyms, [3] hyponyms, [4] antonyms.
Relation arrays may contain nil entries for synsets that had no match or
raised during lookup.
=end
def get_relations_for_review_submission_tokens(token, stem, pos)
  relations = Array.new
  #look the token up in WordNet; fall back to the stem when the token is unknown
  lemmas = WordNet::WordNetDB.find(token)
  if(lemmas.nil?)
    lemmas = WordNet::WordNetDB.find(stem)
  end
  #select the lemma corresponding to the token's POS
  #NOTE(review): if both lookups return nil, lemmas.each raises NoMethodError -
  #confirm WordNetDB.find's contract (nil vs empty array)
  lemma = ""
  lemmas.each do |l|
    if(l.pos == pos)
      lemma = l
      break
    end
  end

  def_arr = Array.new
  syn_arr = Array.new
  hyper_arr = Array.new
  hypo_arr = Array.new
  anto_arr = Array.new

  #if the selected lemma is not nil or empty
  if(!lemma.nil? and lemma != "" and !lemma.synsets.nil?)
    #creating arrays of all the values for synonyms, hyponyms etc. for the token
    for g in 0..lemma.synsets.length - 1
      #fetching the g-th synset
      lemma_synset = lemma.synsets[g]

      #definitions: every synset's gloss is appended onto def_arr[0]
      if(!lemma_synset.gloss.nil?)
        if(def_arr[0].nil?)
          def_arr << extract_definition(lemma_synset.gloss)
        else
          def_arr[0] = def_arr[0] + " " + extract_definition(lemma_synset.gloss)
        end
      else
        def_arr << nil
      end

      #looking for all relations synonym, hypernym, hyponym etc. from among this synset
      #synonyms ("&" relation code)
      begin #error handling for lemmas without synsets that throw errors! (likely due to the dictionary file we are using)
        lemmaSyns = lemma_synset.get_relation("&")
        if(!lemmaSyns.nil? and lemmaSyns.length != 0)
          #for each synset get the values and add them to the array
          for h in 0..lemmaSyns.length - 1
            syn_arr = syn_arr + lemmaSyns[h].words
          end
        else
          syn_arr << nil #setting nil when no synset match is found for a particular type of relation
        end
      rescue
        syn_arr << nil
      end

      #hypernyms ("@" relation code)
      begin
        lemmaHypers = lemma_synset.get_relation("@")
        if(!lemmaHypers.nil? and lemmaHypers.length != 0)
          for h in 0..lemmaHypers.length - 1
            hyper_arr = hyper_arr + lemmaHypers[h].words
          end
        else
          hyper_arr << nil
        end
      rescue
        hyper_arr << nil
      end

      #hyponyms ("~" relation code)
      begin
        lemmaHypos = lemma_synset.get_relation("~")
        if(!lemmaHypos.nil? and lemmaHypos.length != 0)
          for h in 0..lemmaHypos.length - 1
            hypo_arr = hypo_arr + lemmaHypos[h].words
          end
        else
          hypo_arr << nil
        end
      rescue
        hypo_arr << nil
      end

      #antonyms ("!" relation code)
      begin
        lemmaAnts = lemma_synset.get_relation("!")
        if(!lemmaAnts.nil? and lemmaAnts.length != 0)
          for h in 0..lemmaAnts.length - 1
            anto_arr = anto_arr + lemmaAnts[h].words
          end
        else
          anto_arr << nil
        end
      rescue
        anto_arr << nil
      end
    end #end of the for loop for g
  end #end of checking if the lemma is nil or empty

  #setting the array elements before returning the array
  relations << def_arr
  relations << syn_arr
  relations << hyper_arr
  relations << hypo_arr
  relations << anto_arr
  return relations
end
|
317
|
+
|
318
|
+
#------------------------------------------------------------------------------
|
319
|
+
=begin
check_match - compares the review and submission relation arrays (synonyms,
antonyms, hypernyms, ...) against each other's token and stem values.
On a hit, @match is incremented by match_type when the negation states agree
and by non_match_type when they differ, and @count is incremented.
Returns true when a match was recorded, false otherwise.
=end
def check_match(rev_token, subm_token, rev_arr, subm_arr, rev_stem, subm_stem, rev_state, subm_state, match_type, non_match_type)
  #does either side's relation array contain the other side's token or stem?
  review_side_hit = !rev_arr.nil? && (rev_arr.include?(subm_token) || rev_arr.include?(subm_stem))
  subm_side_hit = !subm_arr.nil? && (subm_arr.include?(rev_token) || subm_arr.include?(rev_stem))

  return false unless review_side_hit || subm_side_hit

  #matching negation states earn the full match weight; mismatched states the
  #complementary (negated) weight
  @match = @match + (rev_state == subm_state ? match_type : non_match_type)
  @count += 1
  true
end
|
345
|
+
|
346
|
+
#------------------------------------------------------------------------------
|
347
|
+
|
348
|
+
=begin
determine_POS - maps a vertex's Penn-Treebank-style POS tag onto the single
character WordNet lexicon category: "n" (noun), "a" (adjective), "v" (verb)
or "r" (adverb). Unrecognized tags default to "n".
=end
def determine_POS(vert)
  tag = vert.pos_tag
  #order matters: the noun-like markers are checked first, as in the original
  if %w[CD NN PR IN EX WP].any? { |marker| tag.include?(marker) }
    "n" #WordNet::Noun
  elsif tag.include?("JJ")
    "a" #WordNet::Adjective
  elsif %w[TO VB MD].any? { |marker| tag.include?(marker) }
    "v" #WordNet::Verb
  elsif tag.include?("RB")
    "r" #WordNet::Adverb
  else
    "n" #default to noun
  end
end
|
367
|
+
|
368
|
+
#------------------------------------------------------------------------------
|
369
|
+
=begin
is_frequent_word - checks whether the given word is a frequent (stop) word,
i.e. listed in FREQUENT_WORDS or CLOSED_CLASS_WORDS from constants.rb.
CAUTION: strips bracket/quote characters from `word` IN PLACE (the caller's
string is mutated), matching the original behavior.
=end
def is_frequent_word(word)
  #gsub! substitutes in place; it returns nil when nothing matched, which is
  #deliberately ignored here
  ["(", ")", "[", "]", "\""].each { |ch| word.gsub!(ch, "") }

  return true if FREQUENT_WORDS.include?(word)
  return true if CLOSED_CLASS_WORDS.include?(word)

  false
end #end of is_frequent_word method
|
389
|
+
#------------------------------------------------------------------------------
|
390
|
+
=begin
find_stem_word - stems the word and, when the stem is not correctly spelt,
repeatedly replaces it with the spell checker's first suggestion until a
correctly spelt word is found (or no suggestion exists).
Since no context information is involved the quality of the stems may not be
great - this returns the nearest plausible stem.
=end
def find_stem_word(word, speller)
  correct = word.stem #start from the raw stem
  #keep following the spell checker's top suggestion until it accepts the word
  until speller.check(correct)
    suggestion = speller.suggest(correct).first
    break if suggestion.nil? #no suggestion available - stop, else this loops forever
    correct = suggestion
  end
  correct
end
|
408
|
+
|
409
|
+
#------------------------------------------------------------------------------
|
410
|
+
|
411
|
+
=begin
extract_definition - extracts only the definition portions of a WordNet gloss
(glosses contain definitions AND quoted usage examples). Segments are split
on ";" and any segment containing a double quote (an example) is discarded;
the remaining segments are joined with a single space.
glosses - string containing the gloss of the synset
=end
def extract_definition(glosses)
  segments = glosses.split(";")
  definitions = segments.reject { |segment| segment.include?('"') }
  definitions.join(" ")
end
|
432
|
+
#------------------------------------------------------------------------------
|
433
|
+
|
434
|
+
# Counts word (or stem) overlaps between two definition arrays def1 and def2,
# ignoring frequent words. Returns the Integer overlap count.
#
# NOTE(review): mutates def1/def2 in place (gsub!, ";"-truncation) - callers
# see the modified definitions afterwards.
# NOTE(review): the [0..index(";")] slice uses an inclusive range, so the ";"
# itself is kept - confirm whether the delimiter was meant to be dropped.
def overlap(def1, def2, speller)
  instance = WordnetBasedSimilarity.new #NOTE(review): redundant - this method already lives on this class; self.is_frequent_word would do
  numOverlap = 0
  #only overlaps across ALL the definitions are counted

  #iterating through def1's definitions
  for i in 0..def1.length-1
    if(!def1[i].nil?)
      #strip quotes and truncate at the first ";" before tokenizing
      if( def1[i].include?("\""))
        def1[i].gsub!("\"", " ")
      end
      if(def1[i].include?(";"))
        def1[i] = def1[i][0..def1[i].index(";")]
      end
      #iterating through def2's definitions
      for j in 0..def2.length - 1
        if(!def2[j].nil?)
          if(def2[j].include?(";"))
            def2[j] = def2[j][0..def2[j].index(";")]
          end
          #pairwise token comparison between the two definitions
          s1 = def1[i].split(" ")
          s1.each do |tok1|
            tok1stem = find_stem_word(tok1, speller)
            s2 = def2[j].split(" ")
            s2.each do |tok2|
              tok2stem = find_stem_word(tok2, speller)
              #count a hit when tokens or their stems coincide and the token
              #is not a frequent (stop) word
              if((tok1.downcase == tok2.downcase or tok1stem.downcase == tok2stem.downcase) and
                 !instance.is_frequent_word(tok1) and !instance.is_frequent_word(tok1stem))
                numOverlap+=1
              end
            end #end of s2 loop
          end #end of s1 loop
        end #end of def2[j] being null
      end #end of for loop for def2 - j
    end #end of if def1[i] being null
  end #end of for loop for def1 - i
  return numOverlap
end
|
479
|
+
#------------------------------------------------------------------------------
|
480
|
+
end #end of WordnetBasedSimilarity class
|