automated_metareview 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,562 @@
1
+ require 'automated_metareview/wordnet_based_similarity'
2
+ require 'automated_metareview/graph_generator'
3
+
4
# Scores how relevant a review is to the submission it talks about by comparing
# the word-order graphs of the two texts: single vertices, single edges and
# two-edge (subject-verb-object) chains.
#
# Relies on names that are NOT defined in this file:
# * GraphGenerator (vertices, edges, num_vertices, num_edges, generate_graph)
# * WordnetBasedSimilarity (is_frequent_word, compare_strings)
# * the constants EQUAL and DISTINCT, used as divisors by compare_labels
#   (the original comments say "divide by 1" / "divide by 2" -- confirm
#   against their definition).
class DegreeOfRelevance
  # Weights of the three partial scores (vertex > edge > double edge). Tweak here.
  VERTEX_WEIGHT = 0.55
  EDGE_WEIGHT = 0.35
  DOUBLE_EDGE_WEIGHT = 0.1
  # Upper bound of the semantic match scale; the partial scores are documented
  # as lying in [0-6], so dividing by this scales the result into [0-1].
  MAX_MATCH_SCORE = 6

  # vertex_match[i][j] caches the similarity computed by compare_vertices for
  # review vertex i and submission vertex j (nil when the pair was not compared).
  attr_accessor :vertex_match
  # Clone of the GraphGenerator taken right after the review graph was built.
  attr_accessor :review

  # Identifies the relevance between a review and a submission.
  #
  # reviews, submissions -- texts handed unchanged to GraphGenerator#generate_graph
  # num_reviews          -- unused; kept so existing callers keep working
  # pos_tagger, core_NLP_tagger -- forwarded to GraphGenerator#generate_graph
  # speller              -- forwarded to WordnetBasedSimilarity#compare_strings
  #
  # Returns the weighted relevance divided by MAX_MATCH_SCORE.
  def get_relevance(reviews, submissions, num_reviews, pos_tagger, core_NLP_tagger, speller)
    g = GraphGenerator.new

    # generating the review's graph
    g.generate_graph(reviews, pos_tagger, core_NLP_tagger, true, false)
    review_vertices = g.vertices
    review_edges = g.edges
    num_rev_vert = g.num_vertices
    num_rev_edg = g.num_edges

    # keeping the review graph around for use in content classification
    # NOTE(review): clone is shallow and the same generator is reused below;
    # whether generate_graph replaces or mutates its collections is not visible
    # from this file -- confirm the stored review graph stays intact.
    @review = g.clone

    # generating the submission's graph
    g.generate_graph(submissions, pos_tagger, core_NLP_tagger, true, false)
    subm_vertices = g.vertices
    subm_edges = g.edges
    num_sub_vert = g.num_vertices
    num_sub_edg = g.num_edges

    # must run first: it fills @vertex_match and the vertices' node ids, which
    # every edge comparison below reads
    vert_match = compare_vertices(pos_tagger, review_vertices, subm_vertices, num_rev_vert, num_sub_vert, speller)

    if num_rev_edg > 0 && num_sub_edg > 0
      edge_without_syn = compare_edges_non_syntax_diff(review_edges, subm_edges, num_rev_edg, num_sub_edg)
      edge_with_syn = compare_edges_syntax_diff(review_edges, subm_edges, num_rev_edg, num_sub_edg)
      # compare_edges_diff_types is deliberately not called: its result was
      # never part of edge_match, so computing it only cost time. Add it to
      # the average below to bring it back.
      edge_match = (edge_without_syn.to_f + edge_with_syn.to_f) / 2.to_f
      double_edge = compare_SVO_edges(review_edges, subm_edges, num_rev_edg, num_sub_edg)
      double_edge_with_syn = compare_SVO_diff_syntax(review_edges, subm_edges, num_rev_edg, num_sub_edg)
      double_edge_match = (double_edge.to_f + double_edge_with_syn.to_f) / 2.to_f
    else
      edge_match = 0
      double_edge_match = 0
    end

    relevance = (VERTEX_WEIGHT.to_f * vert_match.to_f) + (EDGE_WEIGHT * edge_match.to_f) +
                (DOUBLE_EDGE_WEIGHT * double_edge_match.to_f)
    relevance.to_f / MAX_MATCH_SCORE.to_f
  end

  # Compares every review vertex with every submission vertex.
  #
  # Side effects: resets @vertex_match, sets node_id = index on every non-nil
  # review vertex, and sets node_id = index on submission vertices whose
  # node_id is still -1.
  #
  # A pair is scored only when BOTH pos tags contain "NN" and neither name is a
  # frequent word. For each review vertex that was scored at least once, its
  # best score (never below 0.0) is accumulated.
  #
  # pos_tagger is unused here; kept for interface compatibility.
  # Returns the average best score, or 0.0 when nothing was scored.
  def compare_vertices(pos_tagger, rev, subm, num_rev_vert, num_sub_vert, speller)
    # for double dimensional arrays, one of the dimensions should be initialized
    @vertex_match = Array.new(num_rev_vert) { [] }
    return 0.0 if rev.nil?

    # a nil submission list is treated as "no submission vertices"
    sub_limit = subm.nil? ? 0 : num_sub_vert
    wnet = WordnetBasedSimilarity.new
    total = 0.0
    matched = 0

    (0...num_rev_vert).each do |i|
      rev_vertex = rev[i]
      next if rev_vertex.nil?
      rev_vertex.node_id = i
      # skipping frequent words from vertex comparison
      next if wnet.is_frequent_word(rev_vertex.name)

      best = 0.0
      compared = false
      (0...sub_limit).each do |j|
        sub_vertex = subm[j]
        next if sub_vertex.nil?
        sub_vertex.node_id = j if sub_vertex.node_id == -1
        next if wnet.is_frequent_word(sub_vertex.name)
        # comparing only if both vertices are nouns
        next unless rev_vertex.pos_tag.include?("NN") && sub_vertex.pos_tag.include?("NN")

        score = wnet.compare_strings(rev_vertex, sub_vertex, speller)
        @vertex_match[i][j] = score
        compared = true
        best = score if score > best
      end

      next unless compared
      total += best
      matched += 1
    end

    matched > 0 ? total / matched : 0.0
  end

  # SAME TYPE COMPARISON, same syntax.
  # Matches a review edge with a submission edge when in-vertex types agree and
  # out-vertex types agree and both edges carry a label. The score is the mean
  # of the in/in and out/out vertex matches, divided by compare_labels.
  # Returns the average of the review edges' best scores (0.0 if none matched).
  def compare_edges_non_syntax_diff(rev, subm, num_rev_edg, num_sub_edg)
    wnet = WordnetBasedSimilarity.new
    rev_usable = lambda { |i| usable_edge?(wnet, rev[i]) }

    average_best_match(num_rev_edg, num_sub_edg, rev_usable) do |i, j|
      r, s = rev[i], subm[j]
      next nil unless usable_edge?(wnet, s)
      next nil unless r.in_vertex.type == s.in_vertex.type && r.out_vertex.type == s.out_vertex.type
      next nil if r.label.nil? || s.label.nil?

      # the labels can only lower the vertex-based score (see compare_labels)
      mean_vertex_similarity([[r.in_vertex, s.in_vertex], [r.out_vertex, s.out_vertex]]) / compare_labels(r, s)
    end
  end

  # SAME TYPE COMPARISON, reversed syntax.
  # Matches a review edge with a submission edge whose direction is flipped
  # (review in-type == submission out-type and vice versa). The score is the
  # mean of the in/out and out/in vertex matches; labels are not considered.
  # Returns the average of the review edges' best scores (0.0 if none matched).
  def compare_edges_syntax_diff(rev, subm, num_rev_edg, num_sub_edg)
    wnet = WordnetBasedSimilarity.new
    rev_usable = lambda { |i| usable_edge?(wnet, rev[i]) }

    average_best_match(num_rev_edg, num_sub_edg, rev_usable) do |i, j|
      r, s = rev[i], subm[j]
      next nil unless usable_edge?(wnet, s)
      next nil unless r.in_vertex.type == s.out_vertex.type && r.out_vertex.type == s.in_vertex.type

      mean_vertex_similarity([[r.in_vertex, s.out_vertex], [r.out_vertex, s.in_vertex]])
    end
  end

  # DIFFERENT TYPE COMPARISON.
  # The vertices are deliberately paired across types:
  # * edges with the same orientation are scored on in/out and out/in vertex
  #   matches, then divided by compare_labels;
  # * edges with flipped orientation are scored on in/in and out/out vertex
  #   matches, without a label penalty.
  # Returns the average of the review edges' best scores (0.0 if none matched).
  def compare_edges_diff_types(rev, subm, num_rev_edg, num_sub_edg)
    wnet = WordnetBasedSimilarity.new
    rev_usable = lambda { |i| usable_edge?(wnet, rev[i]) }

    average_best_match(num_rev_edg, num_sub_edg, rev_usable) do |i, j|
      r, s = rev[i], subm[j]
      next nil unless usable_edge?(wnet, s)

      if r.in_vertex.type == s.in_vertex.type && r.out_vertex.type == s.out_vertex.type
        # for S-V with S-V or V-O with V-O
        mean_vertex_similarity([[r.in_vertex, s.out_vertex], [r.out_vertex, s.in_vertex]]) / compare_labels(r, s)
      elsif r.in_vertex.type == s.out_vertex.type && r.out_vertex.type == s.in_vertex.type
        # for S-V with V-O or V-O with S-V
        mean_vertex_similarity([[r.in_vertex, s.in_vertex], [r.out_vertex, s.out_vertex]])
      end
    end
  end

  # Compares two-edge chains (edge i followed by edge i+1 through a shared
  # vertex) of the review with chains of the submission in the same order.
  # The score is the mean of the three position-wise vertex matches, divided by
  # the label comparison of the first edges and again by that of the second.
  # Returns the average of the review chains' best scores (0.0 if none matched).
  def compare_SVO_edges(rev, subm, num_rev_edg, num_sub_edg)
    wnet = WordnetBasedSimilarity.new
    rev_usable = lambda { |i| usable_chain?(wnet, rev[i], rev[i + 1]) }

    average_best_match(num_rev_edg, num_sub_edg, rev_usable) do |i, j|
      r1, r2 = rev[i], rev[i + 1]
      s1, s2 = subm[j], subm[j + 1]
      next nil unless linked_chain?(s1, s2)
      # NOTE(review): only the first two words of the submission chain are
      # checked here, while review chains are checked on all three -- kept
      # as in the original; confirm whether this asymmetry is intended.
      next nil if frequent_edge?(wnet, s1)
      next nil unless r1.in_vertex.type == s1.in_vertex.type &&
                      r1.out_vertex.type == s1.out_vertex.type &&
                      r2.out_vertex.type == s2.out_vertex.type

      score = mean_vertex_similarity([[r1.in_vertex, s1.in_vertex],
                                      [r1.out_vertex, s1.out_vertex],
                                      [r2.out_vertex, s2.out_vertex]])
      score = score.to_f / compare_labels(r1, s1).to_f
      score.to_f / compare_labels(r2, s2).to_f
    end
  end

  # Compares two-edge chains of the review (s-v-o) with reversed chains of the
  # submission (o-v-s): first review vertex against last submission vertex,
  # middle against middle, last against first. Labels are not considered.
  # Returns the average of the review chains' best scores (0.0 if none matched).
  def compare_SVO_diff_syntax(rev, subm, num_rev_edg, num_sub_edg)
    wnet = WordnetBasedSimilarity.new
    rev_usable = lambda { |i| usable_chain?(wnet, rev[i], rev[i + 1]) }

    average_best_match(num_rev_edg, num_sub_edg, rev_usable) do |i, j|
      r1, r2 = rev[i], rev[i + 1]
      s1, s2 = subm[j], subm[j + 1]
      # NOTE(review): submission chains are not filtered for frequent words
      # here, unlike in compare_SVO_edges -- kept as in the original.
      next nil unless linked_chain?(s1, s2)
      next nil unless r1.in_vertex.type == s2.out_vertex.type &&
                      r1.out_vertex.type == s1.out_vertex.type &&
                      r2.out_vertex.type == s1.in_vertex.type

      mean_vertex_similarity([[r1.in_vertex, s2.out_vertex],
                              [r1.out_vertex, s1.out_vertex],
                              [r2.out_vertex, s1.in_vertex]])
    end
  end

  # Compares the labels of two edges and returns the divisor to apply to a
  # vertex-based match value.
  #
  # Labels are only ever used to decrease a match: "boy - said" and
  # "chocolate - melted" share the label SBJ although no vertex matches, so a
  # label match must not raise the score.
  #
  # * both labels nil, or equal ignoring case -> EQUAL
  # * exactly one label nil, or different     -> DISTINCT
  #
  # The original returned EQUAL when only edge2's label was nil because that
  # branch repeated the "both present" test; this is corrected here.
  def compare_labels(edge1, edge2)
    first = edge1.label
    second = edge2.label
    return EQUAL if first.nil? && second.nil?
    return DISTINCT if first.nil? || second.nil?

    first.downcase == second.downcase ? EQUAL : DISTINCT
  end

  private

  # Runs the shared "best match per review item, averaged" bookkeeping.
  # rev_usable.call(i) decides whether review item i takes part; the block is
  # called with (i, j) and returns a score, or nil when the pair is not
  # comparable. The best score of an item never drops below 0.0.
  def average_best_match(num_rev, num_sub, rev_usable)
    total = 0.0
    matched = 0

    (0...num_rev).each do |i|
      next unless rev_usable.call(i)

      best = 0.0
      compared = false
      (0...num_sub).each do |j|
        score = yield(i, j)
        next if score.nil?
        compared = true
        best = score if score > best
      end

      # not every review item has a comparable counterpart in the submission
      next unless compared
      total += best
      matched += 1
    end

    matched > 0 ? total.to_f / matched.to_f : 0.0
  end

  # Stored similarity for a review/submission vertex pair, or nil when none
  # exists (frequent word, non-noun, or the row was never created).
  def vertex_similarity(rev_vertex, subm_vertex)
    row = @vertex_match && @vertex_match[rev_vertex.node_id]
    row && row[subm_vertex.node_id]
  end

  # Mean of the stored similarities of the given [review, submission] vertex
  # pairs, ignoring pairs without a stored value; 0.0 when none has one.
  def mean_vertex_similarity(pairs)
    scores = pairs.map { |rev_vertex, subm_vertex| vertex_similarity(rev_vertex, subm_vertex) }.compact
    return 0.0 if scores.empty?

    scores.inject(0.0) { |sum, score| sum + score }.to_f / scores.size.to_f
  end

  # True when the edge exists and both of its vertices were given a node id.
  def linked_edge?(edge)
    !edge.nil? && edge.in_vertex.node_id != -1 && edge.out_vertex.node_id != -1
  end

  # True when both ends of the edge are frequent words.
  def frequent_edge?(wnet, edge)
    wnet.is_frequent_word(edge.in_vertex.name) && wnet.is_frequent_word(edge.out_vertex.name)
  end

  # True when the edge is linked and not made up of frequent words only.
  def usable_edge?(wnet, edge)
    linked_edge?(edge) && !frequent_edge?(wnet, edge)
  end

  # True when both edges exist, all three vertices have node ids and the first
  # edge ends where the second one starts.
  def linked_chain?(first, second)
    !first.nil? && !second.nil? &&
      first.in_vertex.node_id != -1 && first.out_vertex.node_id != -1 &&
      second.out_vertex.node_id != -1 && first.out_vertex == second.in_vertex
  end

  # True when the chain is linked and not made up of frequent words only.
  def usable_chain?(wnet, first, second)
    linked_chain?(first, second) &&
      !(frequent_edge?(wnet, first) && wnet.is_frequent_word(second.out_vertex.name))
  end
end