automated_metareview 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +17 -0
- data/.idea/automated_metareview.iml +91 -0
- data/.idea/encodings.xml +5 -0
- data/.idea/misc.xml +5 -0
- data/.idea/modules.xml +9 -0
- data/.idea/scopes/scope_settings.xml +5 -0
- data/.idea/vcs.xml +8 -0
- data/Gemfile +42 -0
- data/LICENSE.txt +22 -0
- data/README.md +29 -0
- data/Rakefile +1 -0
- data/automated_metareview.gemspec +27 -0
- data/lib/automated_metareview.rb +18 -0
- data/lib/automated_metareview/constants.rb +208 -0
- data/lib/automated_metareview/degree_of_relevance.rb +562 -0
- data/lib/automated_metareview/edge.rb +13 -0
- data/lib/automated_metareview/graph_generator.rb +695 -0
- data/lib/automated_metareview/negations.rb +51 -0
- data/lib/automated_metareview/negative-words.csv +4783 -0
- data/lib/automated_metareview/patterns-assess.csv +17 -0
- data/lib/automated_metareview/patterns-prob-detect.csv +22 -0
- data/lib/automated_metareview/patterns-suggest.csv +20 -0
- data/lib/automated_metareview/plagiarism_check.rb +155 -0
- data/lib/automated_metareview/positive-words.csv +2006 -0
- data/lib/automated_metareview/predict_class.rb +121 -0
- data/lib/automated_metareview/sentence_state.rb +293 -0
- data/lib/automated_metareview/text_preprocessing.rb +342 -0
- data/lib/automated_metareview/text_quantity.rb +26 -0
- data/lib/automated_metareview/tone.rb +212 -0
- data/lib/automated_metareview/version.rb +3 -0
- data/lib/automated_metareview/vertex.rb +18 -0
- data/lib/automated_metareview/wordnet_based_similarity.rb +480 -0
- metadata +158 -0
@@ -0,0 +1,26 @@
|
|
1
|
+
require 'automated_metareview/wordnet_based_similarity'
|
2
|
+
require 'automated_metareview/text_preprocessing'
|
3
|
+
|
4
|
+
class TextQuantity
  # Counts the number of unique, non-frequent tokens across the texts in
  # text_array. Tokens are compared case-insensitively.
  #
  # text_array - array of text segments (strings)
  #
  # Returns the Integer number of unique tokens found.
  def number_of_unique_tokens(text_array)
    # BUGFIX: the original tracked previously seen text in one big string and
    # tested membership with substring include?, so e.g. "art" was treated as
    # already seen once "part" had been added (undercounting unique tokens).
    # A hash keyed by whole (downcased) tokens fixes that, and also avoids the
    # O(n^2) growth of the accumulator string.
    seen = {}
    count = 0 # counts the number of unique tokens
    instance = WordnetBasedSimilarity.new
    tp = TextPreprocessing.new
    text_array.each do |text|
      text = tp.contains_punct(text)
      text.split(" ").each do |token|
        key = token.downcase
        # do not count this word if it is a frequent word
        unless instance.is_frequent_word(key)
          count += 1 unless seen.key?(key)
        end
        seen[key] = true # mark the token as seen
      end
    end
    count
  end
end
|
@@ -0,0 +1,212 @@
|
|
1
|
+
require 'automated_metareview/graph_generator'
|
2
|
+
require 'automated_metareview/wordnet_based_similarity'
|
3
|
+
require 'automated_metareview/constants'
|
4
|
+
|
5
|
+
class Tone
  # Determines the overall tone of a review text from the tones of the
  # vertices on its metareview graph's edges.
  #
  # pos_tagger, core_NLP_tagger - taggers accepted for interface compatibility
  #   (not used directly in this method)
  # review_text  - raw review text (not used directly in this method)
  # review_graph - graph whose edges/vertices are scored for tone
  #
  # Returns a 3-element array [positivity, negativity, neutrality].
  def identify_tone(pos_tagger, core_NLP_tagger, review_text, review_graph)
    speller = Aspell.new("en_US")
    speller.suggestion_mode = Aspell::NORMAL

    cumulative_edge_feature = Array.new
    cumulative_review_tone = [-1, -1, -1] # sum of all edge tones; -1 marks "not yet initialized"

    # extracting positive and negative words from files into arrays
    positive_file = "app/models/automated_metareview/positive-words.csv"
    negative_file = "app/models/automated_metareview/negative-words.csv"
    positive = Array.new
    negative = Array.new
    FasterCSV.foreach(positive_file) do |text|
      positive << text[0]
    end
    FasterCSV.foreach(negative_file) do |text|
      negative << text[0]
    end
    negative = negative + NEGATIVE_DESCRIPTORS

    review_edges = review_graph.edges
    # nothing to analyze when the graph produced no edges
    if review_edges.nil?
      return cumulative_review_tone
    end

    wbsim = WordnetBasedSimilarity.new
    in_feature = Array.new
    out_feature = Array.new
    review_edges.each do |edge|
      next if edge.nil?

      unless edge.in_vertex.nil?
        in_feature = get_feature_vector(edge.in_vertex, positive, negative, speller)
      end
      unless edge.out_vertex.nil?
        out_feature = get_feature_vector(edge.out_vertex, positive, negative, speller)
      end

      # frequent tokens' tones are excluded from the cumulative edge tone
      # NOTE(review): this assumes both vertices are non-nil at this point -
      # a nil in_vertex/out_vertex would raise here (pre-existing behavior);
      # confirm against the graph generator
      if !wbsim.is_frequent_word(edge.in_vertex.name) and !wbsim.is_frequent_word(edge.out_vertex.name)
        cumulative_edge_feature[0] = (in_feature[0].to_f + out_feature[0].to_f) / 2.to_f
        cumulative_edge_feature[1] = (in_feature[1].to_f + out_feature[1].to_f) / 2.to_f
      elsif wbsim.is_frequent_word(edge.in_vertex.name) and !wbsim.is_frequent_word(edge.out_vertex.name)
        cumulative_edge_feature[0] = out_feature[0].to_f
        cumulative_edge_feature[1] = out_feature[1].to_f
      elsif !wbsim.is_frequent_word(edge.in_vertex.name) and wbsim.is_frequent_word(edge.out_vertex.name)
        cumulative_edge_feature[0] = in_feature[0].to_f
        cumulative_edge_feature[1] = in_feature[1].to_f
      else
        cumulative_edge_feature[0] = 0
        cumulative_edge_feature[1] = 0
      end

      if (cumulative_review_tone[0] == -1 and cumulative_review_tone[1] == -1) or
         (cumulative_review_tone[0] == 0 and cumulative_review_tone[1] == 0) # has not been initialized as yet
        cumulative_review_tone[0] = cumulative_edge_feature[0].to_f
        cumulative_review_tone[1] = cumulative_edge_feature[1].to_f
      elsif cumulative_edge_feature[0] > 0 or cumulative_edge_feature[1] > 0
        # only edges with some tone (on either vertex) are averaged into the
        # cumulative value, else toneless edges would wash out the net tone
        # of the review text
        cumulative_review_tone[0] = (cumulative_review_tone[0].to_f + cumulative_edge_feature[0].to_f) / 2.to_f
        cumulative_review_tone[1] = (cumulative_review_tone[1].to_f + cumulative_edge_feature[1].to_f) / 2.to_f
      end
    end

    # a review with neither positive nor negative tone is neutral
    if cumulative_review_tone[0] == 0 and cumulative_review_tone[1] == 0
      cumulative_review_tone[2] = 1
    else
      cumulative_review_tone[2] = 0
    end
    return cumulative_review_tone
  end

  #--------
  # Builds a [positive, negative] feature vector for a vertex: 1 for a direct
  # hit in the positive/negative word lists, 1.0/distance for a synonym match
  # found 'distance' levels away, 0 otherwise.
  def get_feature_vector(vertex, positive, negative, speller)
    threshold = THRESHOLD # max distance at which synonyms can be searched
    feature_vector = [0, 0] # [positive, negative]

    # look for the presence of the token in the positive set
    if positive.include?(vertex.name.downcase)
      feature_vector[0] = 1
    else
      # check the token's synonyms, level by level, against the positive set
      distance = 1
      flag = 0
      synonym_sets = get_synonyms(vertex, threshold, speller) # up to 'threshold' levels of synonyms
      synonym_sets.each do |set|
        if positive.length - (positive - set).length > 0
          # BUGFIX: was 1/distance - integer division truncated every match at
          # distance >= 2 to 0, defeating the distance weighting
          feature_vector[0] = 1.0 / distance
          flag = 1
        end
        break if flag == 1
        distance += 1 # check synonyms in the next level
      end
    end

    # repeat the above with the negative set
    if negative.include?(vertex.name.downcase)
      feature_vector[1] = 1
    else
      distance = 1
      flag = 0
      synonym_sets = get_synonyms(vertex, threshold, speller)
      # if no synonyms were identified, avoid rechecking [0], which holds the
      # original token (pre-existing asymmetry with the positive branch)
      if !synonym_sets[1].empty?
        synonym_sets.each do |set|
          if negative.length - (negative - set).length > 0
            # BUGFIX: same 1/distance integer-division fix as above
            feature_vector[1] = 1.0 / distance
            flag = 1
          end
          break if flag == 1
          distance += 1
        end
      end
    end

    return feature_vector
  end

  #--------
  # get_synonyms - gets up to 'threshold' levels of synonyms for the vertex.
  # Level 0 holds the token itself, level 1 its synonyms, level 2 the
  # synonyms' synonyms, and so on. Returns a two-dimensional array.
  def get_synonyms(vertex, threshold, speller)
    wbsim = WordnetBasedSimilarity.new
    if vertex.pos_tag.nil?
      pos = wbsim.determine_POS(vertex)
    else
      pos = vertex.pos_tag
    end

    revSyn = Array.new(threshold + 1) { Array.new } # synonyms per level
    # level 0 is the (first word of the) token itself
    # NOTE(review): only the first word of a multi-word vertex is expanded
    revSyn[0] << vertex.name.downcase.split(" ")[0]
    i = 0
    while i < threshold do
      list_new = Array.new
      revSyn[i].each do |token|
        lemmas = WordNet::WordNetDB.find(token)
        if lemmas.nil?
          # BUGFIX: the original called wbsim.findStemWord, a method that does
          # not exist (the class defines find_stem_word), so this fallback
          # raised NoMethodError whenever it was reached
          lemmas = WordNet::WordNetDB.find(wbsim.find_stem_word(token, speller))
        end
        # select the lemma matching the token's POS; default to the first one
        lemma = lemmas[0]
        lemmas.each do |l|
          lemma = l if l.pos.casecmp(pos) == 0
        end

        # error handling for lemmas without synsets that throw errors
        # (likely due to the dictionary file we are using)
        if !lemma.nil? and lemma != "" and !lemma.synsets.nil?
          for g in 0..lemma.synsets.length - 1
            review_lemma_synset = lemma.synsets[g]
            begin # error handling
              rev_lemma_syns = review_lemma_synset.get_relation("&") # synonyms
              for h in 0..rev_lemma_syns.length - 1
                list_new = list_new + rev_lemma_syns[h].words
              end
            rescue
              list_new = nil
            end
          end
        end
      end

      break if list_new.nil? or list_new.empty?
      i += 1              # level is incremented
      revSyn[i] = list_new # setting synonyms for the next level
    end
    return revSyn
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
class Vertex
  # Plain data holder for a node in the review/submission text graph.
  # attr_accessor generates both readers and writers for every attribute.
  attr_accessor :name, :type, :frequency, :index, :node_id, :state, :label, :parent, :pos_tag

  def initialize(vertex_name, vertex_type, index_value, state, lab, par, pos_tag)
    @name, @type, @index = vertex_name, vertex_type, index_value
    @frequency = 0   # occurrence count, updated externally after creation
    @node_id   = -1  # -1 signals that no id has been assigned yet
    @state     = state # negation state; vertices are not negated by default
    # semantic role labelling attributes
    @label, @parent = lab, par
    @pos_tag = pos_tag
  end
end
|
@@ -0,0 +1,480 @@
|
|
1
|
+
require 'automated_metareview/vertex'
|
2
|
+
require 'automated_metareview/constants'
|
3
|
+
|
4
|
+
class WordnetBasedSimilarity
  attr_accessor :match, :count

  # Compares a review vertex with a submission vertex token-by-token using
  # WordNet relations, in decreasing order of relatedness: exact match,
  # synonym, antonym, hypernym, hyponym, definition overlap. Updates @match
  # and @count, and returns the rounded average match value, or NOMATCH when
  # no token pair could be scored.
  def compare_strings(reviewVertex, submVertex, speller)
    review = reviewVertex.name
    submission = submVertex.name
    reviewState = reviewVertex.state
    submState = submVertex.state

    @match = 0
    @count = 0

    reviewPOS = ""
    submPOS = ""

    # exact (case-insensitive) match between the complete vertex names
    if review.casecmp(submission) == 0
      if reviewState.equal?(submState)
        @match = @match + EXACT
      else
        @match = @match + NEGEXACT
      end
      return @match
    end

    stokRev = review.split(" ")

    # iterating through review tokens
    for i in (0..stokRev.length - 1)
      next if stokRev[i].nil?
      revToken = stokRev[i].downcase()
      # do not reset the POS for every new token - it would change the POS of
      # the vertex, e.g. "like" differs between "like"(n) and "would like"(v)
      if reviewPOS.empty?
        reviewPOS = determine_POS(reviewVertex).strip
      end

      # BUGFIX: was revToken.equal?("n't") - String#equal? tests object
      # identity and is always false for two distinct string objects, so the
      # "n't" -> "not" replacement never fired; == compares contents
      if revToken == "n't"
        revToken = "not"
      end

      # if the review token is a frequent word, skip it
      next if is_frequent_word(revToken)

      # fetch stem, gloss, synonyms, hypernyms, hyponyms, antonyms for the token
      revStem = find_stem_word(revToken, speller)
      review_relations = get_relations_for_review_submission_tokens(revToken, revStem, reviewPOS)
      revGloss = review_relations[0]
      revSyn   = review_relations[1]
      revHyper = review_relations[2]
      revHypo  = review_relations[3]
      revAnt   = review_relations[4]

      stokSub = submission.split(" ")
      # iterating through submission tokens
      for j in (0..stokSub.length - 1)
        # BUGFIX: the original tested stokSub[i] (the outer loop index) here,
        # which mis-skips tokens or indexes past the end when the review and
        # submission token lists have different lengths; j is correct
        next if stokSub[j].nil?

        subToken = stokSub[j].downcase()
        if submPOS.empty?
          submPOS = determine_POS(submVertex).strip
        end

        # BUGFIX: same equal? -> == fix as for the review token above
        if subToken == "n't"
          subToken = "not"
        end

        # if the submission token is a frequent word, skip it
        next if is_frequent_word(subToken)

        submStem = find_stem_word(subToken, speller)
        subm_relations = get_relations_for_review_submission_tokens(subToken, submStem, submPOS)
        submGloss = subm_relations[0]
        submSyn   = subm_relations[1]
        submHyper = subm_relations[2]
        submHypo  = subm_relations[3]
        submAnt   = subm_relations[4]

        # checks are ordered from BEST to LEAST degree of semantic relatedness
        # exact token or stem match
        if subToken.casecmp(revToken) == 0 or submStem.casecmp(revStem) == 0
          if reviewState.equal?(submState)
            @match = @match + EXACT
          else
            @match = @match + NEGEXACT
          end
          @count += 1
          next # skip all remaining checks
        end

        # synonyms: true indicates a match was found, remaining checks skipped
        next if check_match(revToken, subToken, revSyn, submSyn, revStem, submStem, reviewState, submState, SYNONYM, ANTONYM)
        # antonyms
        next if check_match(revToken, subToken, revAnt, submAnt, revStem, submStem, reviewState, submState, ANTONYM, SYNONYM)
        # hypernyms
        next if check_match(revToken, subToken, revHyper, submHyper, revStem, submStem, reviewState, submState, HYPERNYM, NEGHYPERNYM)
        # hyponyms
        next if check_match(revToken, subToken, revHypo, submHypo, revStem, submStem, reviewState, submState, HYPONYM, NEGHYPONYM)

        # overlap across definitions: either token/stem occurring in the
        # other's gloss counts as a (weak) match
        if (!revGloss.nil? and !revGloss[0].nil? and !subToken.nil? and !submStem.nil? and (revGloss[0].include?(subToken) or revGloss[0].include?(submStem))) or
           (!submGloss.nil? and !submGloss[0].nil? and !revToken.nil? and !revStem.nil? and (submGloss[0].include?(revToken) or submGloss[0].include?(revStem)))
          if reviewState == submState
            @match = @match + OVERLAPDEFIN
          else
            @match = @match + NEGOVERLAPDEFIN
          end
          @count += 1
          next
        end

        # no match found
        @match = @match + NOMATCH
        @count += 1
      end # end of the loop for submission tokens
    end # end of the loop for review tokens

    if @count > 0
      return (@match.to_f / @count.to_f).round # an average of the matches found
    end
    return NOMATCH
  end # end of compare_strings method

  #------------------------------------------------------------------------------
  # Fetches the gloss, synonyms, hypernyms, hyponyms and antonyms for 'token'
  # (falling back to its stem) at part-of-speech 'pos'.
  # Returns a 5-element array: [definitions, synonyms, hypernyms, hyponyms,
  # antonyms], each itself an array.
  def get_relations_for_review_submission_tokens(token, stem, pos)
    relations = Array.new
    lemmas = WordNet::WordNetDB.find(token)
    if lemmas.nil?
      lemmas = WordNet::WordNetDB.find(stem)
    end
    # select the lemma corresponding to the token's POS
    lemma = ""
    lemmas.each do |l|
      if l.pos == pos
        lemma = l
        break
      end
    end

    def_arr = Array.new
    syn_arr = Array.new
    hyper_arr = Array.new
    hypo_arr = Array.new
    anto_arr = Array.new

    # if the selected lemma is not nil or empty
    if !lemma.nil? and lemma != "" and !lemma.synsets.nil?
      for g in 0..lemma.synsets.length - 1
        lemma_synset = lemma.synsets[g]

        # definitions - all glosses are concatenated into def_arr[0]
        if !lemma_synset.gloss.nil?
          if def_arr[0].nil?
            def_arr << extract_definition(lemma_synset.gloss)
          else
            def_arr[0] = def_arr[0] + " " + extract_definition(lemma_synset.gloss)
          end
        else
          def_arr << nil
        end

        # synonyms - some lemmas without synsets throw errors (likely due to
        # the dictionary file in use), hence the begin/rescue guards below
        begin
          lemmaSyns = lemma_synset.get_relation("&")
          if !lemmaSyns.nil? and lemmaSyns.length != 0
            for h in 0..lemmaSyns.length - 1
              syn_arr = syn_arr + lemmaSyns[h].words
            end
          else
            syn_arr << nil # nil when no synset match is found for this relation
          end
        rescue
          syn_arr << nil
        end

        # hypernyms
        begin
          lemmaHypers = lemma_synset.get_relation("@")
          if !lemmaHypers.nil? and lemmaHypers.length != 0
            for h in 0..lemmaHypers.length - 1
              hyper_arr = hyper_arr + lemmaHypers[h].words
            end
          else
            hyper_arr << nil
          end
        rescue
          hyper_arr << nil
        end

        # hyponyms
        begin
          lemmaHypos = lemma_synset.get_relation("~")
          if !lemmaHypos.nil? and lemmaHypos.length != 0
            for h in 0..lemmaHypos.length - 1
              hypo_arr = hypo_arr + lemmaHypos[h].words
            end
          else
            hypo_arr << nil
          end
        rescue
          hypo_arr << nil
        end

        # antonyms
        begin
          lemmaAnts = lemma_synset.get_relation("!")
          if !lemmaAnts.nil? and lemmaAnts.length != 0
            for h in 0..lemmaAnts.length - 1
              anto_arr = anto_arr + lemmaAnts[h].words
            end
          else
            anto_arr << nil
          end
        rescue
          anto_arr << nil
        end
      end # end of the loop over synsets
    end # end of checking if the lemma is nil or empty

    # setting the array elements before returning the array
    relations << def_arr
    relations << syn_arr
    relations << hyper_arr
    relations << hypo_arr
    relations << anto_arr
    return relations
  end

  #------------------------------------------------------------------------------
  # Compares the review/submission tokens and stems against each other's
  # relation arrays (synonyms, antonyms, ...). On a hit, @match is increased
  # by match_type when the negation states agree and by non_match_type when
  # they differ, and @count is incremented.
  # Returns true when a match of some kind was found.
  def check_match(rev_token, subm_token, rev_arr, subm_arr, rev_stem, subm_stem, rev_state, subm_state, match_type, non_match_type)
    flag = 0 # indicates whether a match was found
    if (!rev_arr.nil? and (rev_arr.include?(subm_token) or rev_arr.include?(subm_stem))) or
       (!subm_arr.nil? and (subm_arr.include?(rev_token) or subm_arr.include?(rev_stem)))
      flag = 1
      if rev_state == subm_state
        @match = @match + match_type
      else
        @match = @match + non_match_type
      end
      @count += 1
    end
    return flag == 1
  end

  #------------------------------------------------------------------------------
  # Maps the Penn-Treebank-style POS tag on the vertex to a WordNet lexicon
  # category: "n" (noun), "a" (adjective), "v" (verb) or "r" (adverb).
  # Unrecognized tags default to noun.
  def determine_POS(vert)
    str_pos = vert.pos_tag
    if str_pos.include?("CD") or str_pos.include?("NN") or str_pos.include?("PR") or str_pos.include?("IN") or str_pos.include?("EX") or str_pos.include?("WP")
      pos = "n" # WordNet::Noun
    elsif str_pos.include?("JJ")
      pos = "a" # WordNet::Adjective
    elsif str_pos.include?("TO") or str_pos.include?("VB") or str_pos.include?("MD")
      pos = "v" # WordNet::Verb
    elsif str_pos.include?("RB")
      pos = "r" # WordNet::Adverb
    else
      pos = "n" # default: WordNet::Noun
    end
    return pos
  end

  #------------------------------------------------------------------------------
  # Checks whether the given word is a frequent or closed-class word.
  # NOTE(review): the gsub! calls strip bracket/quote characters from 'word'
  # IN PLACE, mutating the caller's string; preserved as-is because callers
  # (e.g. compare_strings) reuse the token afterwards - confirm before
  # changing to a non-mutating form.
  def is_frequent_word(word)
    word.gsub!("(", "") # gsub! returns nil when nothing matched, which is ignored here
    word.gsub!(")", "")
    word.gsub!("[", "")
    word.gsub!("]", "")
    word.gsub!("\"", "")

    if FREQUENT_WORDS.include?(word)
      return true
    end
    if CLOSED_CLASS_WORDS.include?(word)
      return true
    end
    return false
  end # end of is_frequent_word method

  #------------------------------------------------------------------------------
  # Stems the word and spell-corrects the stem via the supplied speller.
  # No context information is involved, so stem quality may vary.
  # Returns the corrected stem (or the last suggestion tried).
  def find_stem_word(word, speller)
    stem = word.stem
    correct = stem # initializing correct to the stem word
    # checking the stem word's spelling for correctness
    while !speller.check(correct) do
      if !speller.suggest(correct).first.nil?
        correct = speller.suggest(correct).first
      else
        break # break out when the first suggestion is nil, else it loops forever
      end
    end
    return correct
  end

  #------------------------------------------------------------------------------
  # Extracts the definition portions of a WordNet gloss. Glosses contain
  # definitions and quoted examples separated by ';' - segments containing a
  # double quote (examples) are dropped.
  def extract_definition(glosses)
    definitions = ""
    tempList = glosses.split(";")
    for i in 0..tempList.length - 1
      if !tempList[i].include?('"')
        if definitions.empty?
          definitions = tempList[i]
        else
          definitions = definitions + " " + tempList[i]
        end
      end
    end
    return definitions
  end

  #------------------------------------------------------------------------------
  # Counts overlapping non-frequent tokens (or stems) between two arrays of
  # definitions.
  # NOTE(review): elements of def1/def2 are modified in place (quote
  # stripping / truncation at ';') - pre-existing behavior, preserved.
  def overlap(def1, def2, speller)
    instance = WordnetBasedSimilarity.new
    numOverlap = 0
    # iterating through def1's definitions
    for i in 0..def1.length - 1
      next if def1[i].nil?
      def1[i].gsub!("\"", " ") if def1[i].include?("\"")
      def1[i] = def1[i][0..def1[i].index(";")] if def1[i].include?(";")
      # iterating through def2's definitions
      for j in 0..def2.length - 1
        next if def2[j].nil?
        def2[j] = def2[j][0..def2[j].index(";")] if def2[j].include?(";")
        s1 = def1[i].split(" ")
        s1.each do |tok1|
          tok1stem = find_stem_word(tok1, speller)
          s2 = def2[j].split(" ")
          s2.each do |tok2|
            tok2stem = find_stem_word(tok2, speller)
            if (tok1.downcase == tok2.downcase or tok1stem.downcase == tok2stem.downcase) and
               !instance.is_frequent_word(tok1) and !instance.is_frequent_word(tok1stem)
              numOverlap += 1
            end
          end # end of s2 loop
        end # end of s1 loop
      end # end of loop over def2
    end # end of loop over def1
    return numOverlap
  end
  #------------------------------------------------------------------------------
end # end of WordnetBasedSimilarity class
|