automated_metareview 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +17 -0
- data/.idea/automated_metareview.iml +91 -0
- data/.idea/encodings.xml +5 -0
- data/.idea/misc.xml +5 -0
- data/.idea/modules.xml +9 -0
- data/.idea/scopes/scope_settings.xml +5 -0
- data/.idea/vcs.xml +8 -0
- data/Gemfile +42 -0
- data/LICENSE.txt +22 -0
- data/README.md +29 -0
- data/Rakefile +1 -0
- data/automated_metareview.gemspec +27 -0
- data/lib/automated_metareview.rb +18 -0
- data/lib/automated_metareview/constants.rb +208 -0
- data/lib/automated_metareview/degree_of_relevance.rb +562 -0
- data/lib/automated_metareview/edge.rb +13 -0
- data/lib/automated_metareview/graph_generator.rb +695 -0
- data/lib/automated_metareview/negations.rb +51 -0
- data/lib/automated_metareview/negative-words.csv +4783 -0
- data/lib/automated_metareview/patterns-assess.csv +17 -0
- data/lib/automated_metareview/patterns-prob-detect.csv +22 -0
- data/lib/automated_metareview/patterns-suggest.csv +20 -0
- data/lib/automated_metareview/plagiarism_check.rb +155 -0
- data/lib/automated_metareview/positive-words.csv +2006 -0
- data/lib/automated_metareview/predict_class.rb +121 -0
- data/lib/automated_metareview/sentence_state.rb +293 -0
- data/lib/automated_metareview/text_preprocessing.rb +342 -0
- data/lib/automated_metareview/text_quantity.rb +26 -0
- data/lib/automated_metareview/tone.rb +212 -0
- data/lib/automated_metareview/version.rb +3 -0
- data/lib/automated_metareview/vertex.rb +18 -0
- data/lib/automated_metareview/wordnet_based_similarity.rb +480 -0
- metadata +158 -0
@@ -0,0 +1,562 @@
|
|
1
|
+
require 'automated_metareview/wordnet_based_similarity'
|
2
|
+
require 'automated_metareview/graph_generator'
|
3
|
+
|
4
|
+
class DegreeOfRelevance
  # Matrix of vertex similarity scores filled in by compare_vertices:
  # @vertex_match[i][j] holds the score of review vertex i against submission
  # vertex j, and is nil for pairs that were never compared.
  attr_accessor :vertex_match
  # Clone of the graph generator taken right after the review graph was built
  # (kept so the review graph can be reused for content classification).
  attr_accessor :review

  # Identifies the relevance between a review and a submission.
  #
  # reviews, submissions - texts handed to GraphGenerator#generate_graph
  #                        (the original comment describes them as
  #                        double-dimensional arrays)
  # num_reviews          - unused; kept so existing callers keep working
  # pos_tagger, core_NLP_tagger - forwarded untouched to the graph generator
  # speller              - forwarded untouched to the vertex comparison
  #
  # Returns the weighted relevance scaled from the [0-6] semantic range
  # down to [0-1].
  def get_relevance(reviews, submissions, num_reviews, pos_tagger, core_NLP_tagger, speller)
    g = GraphGenerator.new

    # generating the review's graph
    g.generate_graph(reviews, pos_tagger, core_NLP_tagger, true, false)
    review_vertices = g.vertices
    review_edges = g.edges
    num_rev_vert = g.num_vertices
    num_rev_edg = g.num_edges

    # assigning graph as a review graph to use in content classification
    # NOTE(review): clone is a shallow copy and the same generator is reused
    # below for the submission -- confirm generate_graph replaces (rather than
    # mutates in place) its vertex/edge collections.
    @review = g.clone

    # generating the submission's graph
    g.generate_graph(submissions, pos_tagger, core_NLP_tagger, true, false)
    subm_vertices = g.vertices
    subm_edges = g.edges
    num_sub_vert = g.num_vertices
    num_sub_edg = g.num_edges

    # compare_vertices must run first: it fills @vertex_match and assigns the
    # node ids every edge comparison below relies on.
    vert_match = compare_vertices(pos_tagger, review_vertices, subm_vertices, num_rev_vert, num_sub_vert, speller)

    if num_rev_edg > 0 && num_sub_edg > 0
      edge_without_syn = compare_edges_non_syntax_diff(review_edges, subm_edges, num_rev_edg, num_sub_edg)
      edge_with_syn = compare_edges_syntax_diff(review_edges, subm_edges, num_rev_edg, num_sub_edg)
      # compare_edges_diff_types is intentionally not called here: its result
      # was never part of the score (the term was commented out).
      edge_match = (edge_without_syn.to_f + edge_with_syn.to_f) / 2.to_f
      double_edge = compare_SVO_edges(review_edges, subm_edges, num_rev_edg, num_sub_edg)
      double_edge_with_syn = compare_SVO_diff_syntax(review_edges, subm_edges, num_rev_edg, num_sub_edg)
      double_edge_match = (double_edge.to_f + double_edge_with_syn.to_f) / 2.to_f
    else
      edge_match = 0
      double_edge_match = 0
    end

    # differently weighted cases -- tunable, keep alpha > beta > gamma
    alpha = 0.55
    beta = 0.35
    gamma = 0.1
    # relevance is in the range [0-6] (our semantic values)
    relevance = (alpha.to_f * vert_match.to_f) + (beta * edge_match.to_f) + (gamma * double_edge_match.to_f)
    # scaled from [0-6] into the range [0-1]
    relevance.to_f / 6.to_f
  end

  # Compares every review vertex with every submission vertex and records the
  # scores in @vertex_match.
  #
  # As a side effect each non-nil review vertex gets node_id = its index, and
  # each non-nil submission vertex whose node_id is still -1 gets its index.
  # Frequent words (per WordnetBasedSimilarity#is_frequent_word) are skipped,
  # and a pair is only scored when BOTH pos tags contain "NN".
  #
  # pos_tagger is unused; it is kept for interface compatibility.
  # A nil rev or subm is treated as an empty vertex list.
  #
  # Returns the average, over review vertices that were compared at least
  # once, of their best score (0.0 when nothing was compared).
  def compare_vertices(pos_tagger, rev, subm, num_rev_vert, num_sub_vert, speller)
    # for double dimensional arrays, one of the dimensions should be initialized
    @vertex_match = Array.new(num_rev_vert) { [] }
    wnet = WordnetBasedSimilarity.new
    rev_vertices = rev || []
    subm_vertices = subm || []

    average_best_score(num_rev_vert) do |i|
      rev_vertex = rev_vertices[i]
      next if rev_vertex.nil?
      rev_vertex.node_id = i
      # skipping frequent words from vertex comparison
      next if wnet.is_frequent_word(rev_vertex.name)

      # j tracks every element in the set of all vertices, some of which are nil
      best_score(num_sub_vert) do |j|
        subm_vertex = subm_vertices[j]
        next if subm_vertex.nil?
        subm_vertex.node_id = j if subm_vertex.node_id == -1
        next if wnet.is_frequent_word(subm_vertex.name)
        next unless rev_vertex.pos_tag.include?("NN") && subm_vertex.pos_tag.include?("NN")
        @vertex_match[i][j] = wnet.compare_strings(rev_vertex, subm_vertex, speller)
      end
    end
  end

  # SAME TYPE COMPARISON.
  # Compares S-V edges with S-V edges (and V-O with V-O): in-vertex against
  # in-vertex and out-vertex against out-vertex. Only edge pairs where both
  # labels are present are scored; the vertex score is divided by the value
  # returned from compare_labels.
  #
  # Returns the average best score per review edge (0.0 when none matched).
  def compare_edges_non_syntax_diff(rev, subm, num_rev_edg, num_sub_edg)
    wnet = WordnetBasedSimilarity.new
    average_best_score(num_rev_edg) do |i|
      rev_edge = rev[i]
      next unless comparable_edge?(rev_edge)
      # skipping edges whose two vertices are both frequent words
      next if frequent_edge?(wnet, rev_edge)

      best_score(num_sub_edg) do |j|
        subm_edge = subm[j]
        next unless comparable_edge?(subm_edge)
        next if frequent_edge?(wnet, subm_edge)
        next unless rev_edge.in_vertex.type == subm_edge.in_vertex.type &&
                    rev_edge.out_vertex.type == subm_edge.out_vertex.type
        next if rev_edge.label.nil? || subm_edge.label.nil?

        vertex_score = mean_vertex_match([[rev_edge.in_vertex, subm_edge.in_vertex],
                                          [rev_edge.out_vertex, subm_edge.out_vertex]])
        # Vertex and SRL - dividing by the label's match value
        vertex_score / compare_labels(rev_edge, subm_edge)
      end
    end
  end

  # SAME TYPE COMPARISON with swapped syntax.
  # Compares S-V edges with V-S style edges: the review's in-vertex is matched
  # with the submission's out-vertex and vice versa, provided the types line
  # up that way. Labels are not taken into account.
  #
  # Returns the average best score per review edge (0.0 when none matched).
  def compare_edges_syntax_diff(rev, subm, num_rev_edg, num_sub_edg)
    wnet = WordnetBasedSimilarity.new
    average_best_score(num_rev_edg) do |i|
      rev_edge = rev[i]
      next unless comparable_edge?(rev_edge)
      next if frequent_edge?(wnet, rev_edge)

      best_score(num_sub_edg) do |j|
        subm_edge = subm[j]
        next unless comparable_edge?(subm_edge)
        next if frequent_edge?(wnet, subm_edge)
        next unless rev_edge.in_vertex.type == subm_edge.out_vertex.type &&
                    rev_edge.out_vertex.type == subm_edge.in_vertex.type

        mean_vertex_match([[rev_edge.in_vertex, subm_edge.out_vertex],
                           [rev_edge.out_vertex, subm_edge.in_vertex]])
      end
    end
  end

  # DIFFERENT TYPE COMPARISON.
  # When the edge types line up (in/in, out/out) the vertices are compared
  # crosswise (in/out, out/in) and the score is divided by compare_labels;
  # when the types line up crosswise the vertices are compared straight
  # (in/in, out/out) with no label adjustment.
  #
  # Returns the average best score per review edge (0.0 when none matched).
  def compare_edges_diff_types(rev, subm, num_rev_edg, num_sub_edg)
    wnet = WordnetBasedSimilarity.new
    average_best_score(num_rev_edg) do |i|
      rev_edge = rev[i]
      next unless comparable_edge?(rev_edge)
      next if frequent_edge?(wnet, rev_edge)

      best_score(num_sub_edg) do |j|
        subm_edge = subm[j]
        next unless comparable_edge?(subm_edge)
        next if frequent_edge?(wnet, subm_edge)

        if rev_edge.in_vertex.type == subm_edge.in_vertex.type &&
           rev_edge.out_vertex.type == subm_edge.out_vertex.type
          # for S-V with S-V or V-O with V-O
          vertex_score = mean_vertex_match([[rev_edge.in_vertex, subm_edge.out_vertex],
                                            [rev_edge.out_vertex, subm_edge.in_vertex]])
          vertex_score / compare_labels(rev_edge, subm_edge)
        elsif rev_edge.in_vertex.type == subm_edge.out_vertex.type &&
              rev_edge.out_vertex.type == subm_edge.in_vertex.type
          # for S-V with V-O or V-O with S-V
          mean_vertex_match([[rev_edge.in_vertex, subm_edge.in_vertex],
                             [rev_edge.out_vertex, subm_edge.out_vertex]])
        end
      end
    end
  end

  # Compares S-V-O double edges (edge k followed by edge k+1 sharing the
  # middle vertex) of the review with those of the submission, position by
  # position. The vertex score is divided by compare_labels for both edges.
  #
  # Returns the average best score per review double edge (0.0 when none matched).
  def compare_SVO_edges(rev, subm, num_rev_edg, num_sub_edg)
    wnet = WordnetBasedSimilarity.new
    average_best_score(num_rev_edg) do |i|
      next unless comparable_chain?(rev, i)
      rev_first = rev[i]
      rev_second = rev[i + 1]
      # skipping double edges made up of frequent words only
      next if frequent_edge?(wnet, rev_first) && wnet.is_frequent_word(rev_second.out_vertex.name)

      best_score(num_sub_edg) do |j|
        next unless comparable_chain?(subm, j)
        subm_first = subm[j]
        subm_second = subm[j + 1]
        # NOTE(review): unlike the review side, only the first two words of
        # the submission triple are checked here -- confirm this is intended.
        next if frequent_edge?(wnet, subm_first)
        # making sure the types are the same during comparison
        next unless rev_first.in_vertex.type == subm_first.in_vertex.type &&
                    rev_first.out_vertex.type == subm_first.out_vertex.type &&
                    rev_second.out_vertex.type == subm_second.out_vertex.type

        vertex_score = mean_vertex_match([[rev_first.in_vertex, subm_first.in_vertex],
                                          [rev_first.out_vertex, subm_first.out_vertex],
                                          [rev_second.out_vertex, subm_second.out_vertex]])
        # Vertex and SRL
        vertex_score = vertex_score.to_f / compare_labels(rev_first, subm_first).to_f
        vertex_score.to_f / compare_labels(rev_second, subm_second).to_f
      end
    end
  end

  # Compares S-V-O double edges of the review with O-V-S double edges of the
  # submission: review subject vs. submission object, verb vs. verb, review
  # object vs. submission subject. Labels are not taken into account.
  #
  # Returns the average best score per review double edge (0.0 when none matched).
  def compare_SVO_diff_syntax(rev, subm, num_rev_edg, num_sub_edg)
    wnet = WordnetBasedSimilarity.new
    average_best_score(num_rev_edg) do |i|
      next unless comparable_chain?(rev, i)
      rev_first = rev[i]
      rev_second = rev[i + 1]
      next if frequent_edge?(wnet, rev_first) && wnet.is_frequent_word(rev_second.out_vertex.name)

      # NOTE(review): submission triples are not filtered for frequent words
      # here, unlike in compare_SVO_edges -- confirm this is intended.
      best_score(num_sub_edg) do |j|
        next unless comparable_chain?(subm, j)
        subm_first = subm[j]
        subm_second = subm[j + 1]
        # making sure the types are the same during comparison
        next unless rev_first.in_vertex.type == subm_second.out_vertex.type &&
                    rev_first.out_vertex.type == subm_first.out_vertex.type &&
                    rev_second.out_vertex.type == subm_first.in_vertex.type

        # comparing s-v-o (from review) with o-v-s (from submission)
        mean_vertex_match([[rev_first.in_vertex, subm_second.out_vertex],
                           [rev_first.out_vertex, subm_first.out_vertex],
                           [rev_second.out_vertex, subm_first.in_vertex]])
      end
    end
  end

  # Labels are only used to decrease a match value found from the vertices:
  # identical labels alone must not make two unrelated edges look similar
  # (e.g. "boy - said" and "chocolate - melted" both labelled "SBJ").
  #
  # Returns EQUAL when both labels are nil or equal ignoring case, and
  # DISTINCT when they differ or when exactly one of them is nil. Callers
  # divide by the result; per the original comments EQUAL divides by 1 and
  # DISTINCT by 2 (both constants are defined outside this class).
  def compare_labels(edge1, edge2)
    label1 = edge1.label
    label2 = edge2.label
    # both labels missing: nothing to penalise
    return EQUAL if label1.nil? && label2.nil?
    # exactly one label missing
    return DISTINCT if label1.nil? || label2.nil?
    label1.downcase == label2.downcase ? EQUAL : DISTINCT
  end

  private

  # Yields every index in 0...num_items; the block returns that item's best
  # score, or nil when the item was not compared with anything. Returns the
  # average of the non-nil scores, or 0.0 when there were none.
  def average_best_score(num_items)
    total = 0.0
    counted = 0
    (0...num_items).each do |i|
      best = yield(i)
      next if best.nil?
      total += best
      counted += 1
    end
    counted > 0 ? total.to_f / counted.to_f : 0.0
  end

  # Yields every index in 0...num_candidates; the block returns a score or nil
  # when the candidate was skipped. Returns nil if every candidate was
  # skipped, otherwise the largest score, never less than 0.0.
  def best_score(num_candidates)
    best = 0.0
    compared = false
    (0...num_candidates).each do |j|
      score = yield(j)
      next if score.nil?
      compared = true
      best = score if score > best
    end
    compared ? best : nil
  end

  # True when the edge exists and both of its vertices have been given a
  # node id (i.e. node_id is no longer -1).
  def comparable_edge?(edge)
    !edge.nil? && edge.in_vertex.node_id != -1 && edge.out_vertex.node_id != -1
  end

  # True when edges[k] and edges[k + 1] form a usable double edge: both exist,
  # all three vertices have node ids, and the first edge's out-vertex is the
  # second edge's in-vertex.
  def comparable_chain?(edges, k)
    first = edges[k]
    second = edges[k + 1]
    comparable_edge?(first) && !second.nil? &&
      second.out_vertex.node_id != -1 && first.out_vertex == second.in_vertex
  end

  # True when both vertices of the edge are frequent words.
  def frequent_edge?(wnet, edge)
    wnet.is_frequent_word(edge.in_vertex.name) && wnet.is_frequent_word(edge.out_vertex.name)
  end

  # Averages the @vertex_match scores of the given [review_vertex,
  # submission_vertex] pairs, ignoring pairs without a score (one or more of
  # the terms may be a frequent word, for which no @vertex_match exists).
  # Returns 0.0 when none of the pairs has a score.
  def mean_vertex_match(pairs)
    sum = 0.0
    cou = 0
    pairs.each do |rev_vertex, subm_vertex|
      score = @vertex_match[rev_vertex.node_id][subm_vertex.node_id]
      next if score.nil?
      sum += score
      cou += 1
    end
    cou > 0 ? sum.to_f / cou.to_f : 0.0
  end
end
|