automated_metareview 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,562 @@
1
+ require 'automated_metareview/wordnet_based_similarity'
2
+ require 'automated_metareview/graph_generator'
3
+
4
+ class DegreeOfRelevance
5
+ #creating accessors for the instance variables
6
+ attr_accessor :vertex_match
7
+ attr_accessor :review
8
# Computes how relevant a review is to a submission, as one scaled score.
#
# A word-order graph is generated for the review text and for the submission
# text; their vertices, single edges and double (S-V-O) edges are compared and
# the three match values are combined with fixed weights.
#
# reviews         - review text, passed unchanged to GraphGenerator#generate_graph
# submissions     - submission text, passed unchanged to GraphGenerator#generate_graph
# num_reviews     - unused; kept so existing callers keep working
# pos_tagger      - forwarded to the graph generator and to compare_vertices
# core_NLP_tagger - forwarded to the graph generator
# speller         - forwarded to compare_vertices
#
# Side effect: @review is set to a clone of the generator taken right after the
# review graph has been built.
#
# Returns the weighted relevance divided by 6 (the original author's notes put
# the unscaled match values on a 0-6 scale, so the result is meant to be 0-1).
def get_relevance(reviews, submissions, num_reviews, pos_tagger, core_NLP_tagger, speller)
  g = GraphGenerator.new

  # Review graph.
  g.generate_graph(reviews, pos_tagger, core_NLP_tagger, true, false)
  review_vertices = g.vertices
  review_edges = g.edges
  num_rev_vert = g.num_vertices
  num_rev_edg = g.num_edges

  # Kept for later content classification.
  # NOTE(review): clone is shallow and the same generator is reused below, so
  # this relies on generate_graph replacing (not mutating) its collections -
  # confirm against GraphGenerator.
  @review = g.clone

  # Submission graph.
  g.generate_graph(submissions, pos_tagger, core_NLP_tagger, true, false)
  subm_vertices = g.vertices
  subm_edges = g.edges
  num_sub_vert = g.num_vertices
  num_sub_edg = g.num_edges

  # Must run before any edge comparison: it fills @vertex_match and assigns
  # the node ids that the edge comparisons look up.
  vert_match = compare_vertices(pos_tagger, review_vertices, subm_vertices, num_rev_vert, num_sub_vert, speller)

  if num_rev_edg > 0 && num_sub_edg > 0
    edge_without_syn = compare_edges_non_syntax_diff(review_edges, subm_edges, num_rev_edg, num_sub_edg)
    edge_with_syn = compare_edges_syntax_diff(review_edges, subm_edges, num_rev_edg, num_sub_edg)
    # compare_edges_diff_types is deliberately not called: its result was never
    # part of the score. To include it, add it to the average below.
    edge_match = (edge_without_syn.to_f + edge_with_syn.to_f) / 2.0

    double_edge = compare_SVO_edges(review_edges, subm_edges, num_rev_edg, num_sub_edg)
    double_edge_with_syn = compare_SVO_diff_syntax(review_edges, subm_edges, num_rev_edg, num_sub_edg)
    double_edge_match = (double_edge.to_f + double_edge_with_syn.to_f) / 2.0
  else
    edge_match = 0.0
    double_edge_match = 0.0
  end

  # Weights for vertex, edge and double-edge matches (alpha > beta > gamma).
  alpha = 0.55
  beta = 0.35
  gamma = 0.1
  relevance = (alpha * vert_match.to_f) + (beta * edge_match.to_f) + (gamma * double_edge_match.to_f)

  # Scale from the 0-6 match range down to 0-1.
  relevance / 6.0
end
80
# Compares every review vertex with every submission vertex and returns the
# average, over review vertices, of each one's best match.
#
# Only pairs in which BOTH vertices carry a POS tag containing "NN" and
# neither name is a frequent word are compared.
#
# pos_tagger   - unused; kept so existing callers keep working
# rev          - review vertices (entries may be nil; nil array = no vertices)
# subm         - submission vertices (entries may be nil; nil array = no vertices)
# num_rev_vert - number of leading slots of rev to consider
# num_sub_vert - number of leading slots of subm to consider
# speller      - forwarded to WordnetBasedSimilarity#compare_strings
#
# Side effects:
# * @vertex_match is rebuilt with one row per review vertex, and
#   @vertex_match[i][j] holds the similarity of rev[i] and subm[j] for every
#   compared pair; all other cells stay nil.
# * rev[i].node_id is set to i, and subm[j].node_id is set to j when it was -1.
#   The edge comparisons use these ids to index @vertex_match.
#
# Returns 0.0 when no pair was compared.
def compare_vertices(pos_tagger, rev, subm, num_rev_vert, num_sub_vert, speller)
  @vertex_match = Array.new(num_rev_vert) { [] }
  wnet = WordnetBasedSimilarity.new

  # Single pass over the submission: assign node ids up front (so they are set
  # even when no review vertex reaches the comparison) and remember which
  # vertices are eligible, instead of re-checking them per review vertex.
  subm_nouns = []
  unless subm.nil?
    (0...num_sub_vert).each do |j|
      vertex = subm[j]
      next if vertex.nil?

      vertex.node_id = j if vertex.node_id == -1
      next if wnet.is_frequent_word(vertex.name)

      subm_nouns << j if vertex.pos_tag.include?('NN')
    end
  end

  cum_vertex_match = 0.0
  count = 0
  unless rev.nil?
    (0...num_rev_vert).each do |i|
      vertex = rev[i]
      next if vertex.nil?

      vertex.node_id = i
      next if wnet.is_frequent_word(vertex.name)
      next unless vertex.pos_tag.include?('NN')
      # A review vertex only counts towards the average if it was compared.
      next if subm_nouns.empty?

      best = 0.0
      subm_nouns.each do |j|
        score = wnet.compare_strings(vertex, subm[j], speller)
        @vertex_match[i][j] = score
        best = score if score > best
      end
      cum_vertex_match += best
      count += 1
    end
  end

  count > 0 ? cum_vertex_match / count : 0.0
end
142
+
143
+ #------------------------------------------#------------------------------------------
144
# Same-order edge comparison (e.g. SUBJECT-VERB against SUBJECT-VERB).
#
# A review edge is compared with a submission edge when their in-vertices
# share a type, their out-vertices share a type, and both edges are labelled.
# The pair's score is the mean of the recorded vertex similarities
# (in/in and out/out) divided by compare_labels, so differing labels can only
# lower it. For each review edge the best score is kept; the method returns
# the average of those best scores, or 0.0 when nothing was compared.
#
# Edges that are nil, have a vertex with node_id -1, or consist only of
# frequent words are skipped.
#
# Reads @vertex_match, so compare_vertices has to run first. A vertex pair
# with no recorded similarity (frequent word, non-noun, or table not built)
# is simply left out of the mean.
#
# rev, subm                - review / submission edge arrays
# num_rev_edg, num_sub_edg - number of leading slots of each array to consider
def compare_edges_non_syntax_diff(rev, subm, num_rev_edg, num_sub_edg)
  wnet = WordnetBasedSimilarity.new

  comparable = lambda do |edge|
    !edge.nil? && edge.in_vertex.node_id != -1 && edge.out_vertex.node_id != -1
  end
  only_frequent_words = lambda do |edge|
    wnet.is_frequent_word(edge.in_vertex.name) && wnet.is_frequent_word(edge.out_vertex.name)
  end
  # Similarity recorded by compare_vertices, or nil when there is none.
  vertex_score = lambda do |rev_vertex, subm_vertex|
    row = @vertex_match && @vertex_match[rev_vertex.node_id]
    row && row[subm_vertex.node_id]
  end

  cum_edge_match = 0.0
  count = 0
  (0...num_rev_edg).each do |i|
    rev_edge = rev[i]
    next unless comparable.call(rev_edge)
    next if only_frequent_words.call(rev_edge)

    best = 0.0
    matched = false
    (0...num_sub_edg).each do |j|
      subm_edge = subm[j]
      next unless comparable.call(subm_edge)
      next if only_frequent_words.call(subm_edge)
      next unless rev_edge.in_vertex.type == subm_edge.in_vertex.type &&
                  rev_edge.out_vertex.type == subm_edge.out_vertex.type
      next if rev_edge.label.nil? || subm_edge.label.nil?

      scores = [vertex_score.call(rev_edge.in_vertex, subm_edge.in_vertex),
                vertex_score.call(rev_edge.out_vertex, subm_edge.out_vertex)].compact
      score = scores.empty? ? 0.0 : scores.inject(0.0) { |sum, s| sum + s } / scores.size
      # Labels only ever reduce the vertex-based score.
      score /= compare_labels(rev_edge, subm_edge)

      matched = true
      best = score if score > best
    end

    # Review edges with no comparable submission edge do not dilute the average.
    next unless matched

    cum_edge_match += best
    count += 1
  end

  count > 0 ? cum_edge_match / count : 0.0
end
227
+ #------------------------------------------#------------------------------------------
228
# Reversed-order edge comparison (e.g. SUBJECT-VERB against VERB-SUBJECT).
#
# A review edge is compared with a submission edge when the review's in-vertex
# type equals the submission's out-vertex type and vice versa. The pair's
# score is the mean of the recorded vertex similarities for those crossed
# pairs (review in / submission out, review out / submission in); edge labels
# are not taken into account. For each review edge the best score is kept; the
# method returns the average of those best scores, or 0.0 when nothing was
# compared.
#
# Edges that are nil, have a vertex with node_id -1, or consist only of
# frequent words are skipped.
#
# Reads @vertex_match, so compare_vertices has to run first. A vertex pair
# with no recorded similarity is left out of the mean.
#
# rev, subm                - review / submission edge arrays
# num_rev_edg, num_sub_edg - number of leading slots of each array to consider
def compare_edges_syntax_diff(rev, subm, num_rev_edg, num_sub_edg)
  wnet = WordnetBasedSimilarity.new

  comparable = lambda do |edge|
    !edge.nil? && edge.in_vertex.node_id != -1 && edge.out_vertex.node_id != -1
  end
  only_frequent_words = lambda do |edge|
    wnet.is_frequent_word(edge.in_vertex.name) && wnet.is_frequent_word(edge.out_vertex.name)
  end
  # Similarity recorded by compare_vertices, or nil when there is none.
  vertex_score = lambda do |rev_vertex, subm_vertex|
    row = @vertex_match && @vertex_match[rev_vertex.node_id]
    row && row[subm_vertex.node_id]
  end

  cum_edge_match = 0.0
  count = 0
  (0...num_rev_edg).each do |i|
    rev_edge = rev[i]
    next unless comparable.call(rev_edge)
    next if only_frequent_words.call(rev_edge)

    best = 0.0
    matched = false
    (0...num_sub_edg).each do |j|
      subm_edge = subm[j]
      next unless comparable.call(subm_edge)
      next if only_frequent_words.call(subm_edge)
      next unless rev_edge.in_vertex.type == subm_edge.out_vertex.type &&
                  rev_edge.out_vertex.type == subm_edge.in_vertex.type

      scores = [vertex_score.call(rev_edge.in_vertex, subm_edge.out_vertex),
                vertex_score.call(rev_edge.out_vertex, subm_edge.in_vertex)].compact
      score = scores.empty? ? 0.0 : scores.inject(0.0) { |sum, s| sum + s } / scores.size

      matched = true
      best = score if score > best
    end

    # Review edges with no comparable submission edge do not dilute the average.
    next unless matched

    cum_edge_match += best
    count += 1
  end

  count > 0 ? cum_edge_match / count : 0.0
end
299
+ #------------------------------------------#------------------------------------------
300
# Different-type edge comparison: vertices are matched against the vertex of
# the OTHER type on the opposite edge (subject with verb, verb with object...).
#
# Two cases are handled for each review/submission edge pair:
# * same vertex-type order on both edges: the crossed similarities
#   (review in / submission out, review out / submission in) are averaged and
#   divided by compare_labels;
# * reversed vertex-type order: the straight similarities (in/in, out/out)
#   are averaged; labels are not taken into account.
# Any other pair is ignored. For each review edge the best score is kept; the
# method returns the average of those best scores, or 0.0 when nothing was
# compared.
#
# Edges that are nil, have a vertex with node_id -1, or consist only of
# frequent words are skipped.
#
# Reads @vertex_match, so compare_vertices has to run first. A vertex pair
# with no recorded similarity is left out of the mean.
#
# rev, subm                - review / submission edge arrays
# num_rev_edg, num_sub_edg - number of leading slots of each array to consider
def compare_edges_diff_types(rev, subm, num_rev_edg, num_sub_edg)
  wnet = WordnetBasedSimilarity.new

  comparable = lambda do |edge|
    !edge.nil? && edge.in_vertex.node_id != -1 && edge.out_vertex.node_id != -1
  end
  only_frequent_words = lambda do |edge|
    wnet.is_frequent_word(edge.in_vertex.name) && wnet.is_frequent_word(edge.out_vertex.name)
  end
  # Similarity recorded by compare_vertices, or nil when there is none.
  vertex_score = lambda do |rev_vertex, subm_vertex|
    row = @vertex_match && @vertex_match[rev_vertex.node_id]
    row && row[subm_vertex.node_id]
  end
  # Mean of the recorded similarities; 0.0 when none was recorded.
  mean_of = lambda do |values|
    known = values.compact
    known.empty? ? 0.0 : known.inject(0.0) { |sum, s| sum + s } / known.size
  end

  cum_edge_match = 0.0
  count = 0
  (0...num_rev_edg).each do |i|
    rev_edge = rev[i]
    next unless comparable.call(rev_edge)
    next if only_frequent_words.call(rev_edge)

    best = 0.0
    matched = false
    (0...num_sub_edg).each do |j|
      subm_edge = subm[j]
      next unless comparable.call(subm_edge)
      next if only_frequent_words.call(subm_edge)

      if rev_edge.in_vertex.type == subm_edge.in_vertex.type &&
         rev_edge.out_vertex.type == subm_edge.out_vertex.type
        # S-V with S-V or V-O with V-O: compare across types, then apply labels.
        score = mean_of.call([vertex_score.call(rev_edge.in_vertex, subm_edge.out_vertex),
                              vertex_score.call(rev_edge.out_vertex, subm_edge.in_vertex)])
        score /= compare_labels(rev_edge, subm_edge)
      elsif rev_edge.in_vertex.type == subm_edge.out_vertex.type &&
            rev_edge.out_vertex.type == subm_edge.in_vertex.type
        # S-V with V-S or V-O with O-V: in/in and out/out are the differing types.
        score = mean_of.call([vertex_score.call(rev_edge.in_vertex, subm_edge.in_vertex),
                              vertex_score.call(rev_edge.out_vertex, subm_edge.out_vertex)])
      else
        next
      end

      matched = true
      best = score if score > best
    end

    # Review edges with no comparable submission edge do not dilute the average.
    next unless matched

    cum_edge_match += best
    count += 1
  end

  count > 0 ? cum_edge_match / count : 0.0
end
393
+ #------------------------------------------#------------------------------------------
394
+
395
# Same-order double-edge comparison (SUBJECT-VERB-OBJECT against
# SUBJECT-VERB-OBJECT).
#
# A double edge is a pair of consecutive edges k, k+1 where the out-vertex of
# edge k equals the in-vertex of edge k+1 and all three vertices have a node
# id other than -1. A review double edge is compared with a submission double
# edge when the three vertex types agree position by position. The score is
# the mean of the three recorded vertex similarities, divided by
# compare_labels for the first edges and again for the second edges. For each
# review double edge the best score is kept; the method returns the average of
# those best scores, or 0.0 when nothing was compared.
#
# Both edges of a pair must lie within the first num_rev_edg / num_sub_edg
# slots, so the last slot is never used as the start of a pair.
#
# Reads @vertex_match, so compare_vertices has to run first. A vertex pair
# with no recorded similarity is left out of the mean.
#
# rev, subm                - review / submission edge arrays
# num_rev_edg, num_sub_edg - number of leading slots of each array to consider
def compare_SVO_edges(rev, subm, num_rev_edg, num_sub_edg)
  wnet = WordnetBasedSimilarity.new

  # True when edges[k] and edges[k + 1] form a usable double edge.
  chained = lambda do |first, second|
    !first.nil? && !second.nil? &&
      first.in_vertex.node_id != -1 && first.out_vertex.node_id != -1 &&
      second.out_vertex.node_id != -1 && first.out_vertex == second.in_vertex
  end
  # Similarity recorded by compare_vertices, or nil when there is none.
  vertex_score = lambda do |rev_vertex, subm_vertex|
    row = @vertex_match && @vertex_match[rev_vertex.node_id]
    row && row[subm_vertex.node_id]
  end

  cum_double_edge_match = 0.0
  count = 0
  (0...(num_rev_edg - 1)).each do |i|
    rev_first = rev[i]
    rev_second = rev[i + 1]
    next unless chained.call(rev_first, rev_second)
    # Skip review double edges made up of frequent words only.
    next if wnet.is_frequent_word(rev_first.in_vertex.name) &&
            wnet.is_frequent_word(rev_first.out_vertex.name) &&
            wnet.is_frequent_word(rev_second.out_vertex.name)

    best = 0.0
    matched = false
    (0...(num_sub_edg - 1)).each do |j|
      subm_first = subm[j]
      subm_second = subm[j + 1]
      next unless chained.call(subm_first, subm_second)
      # NOTE(review): unlike the review side, only the first edge's two
      # vertices are checked here (the third vertex is ignored). Kept as in
      # the original; confirm whether all three were meant.
      next if wnet.is_frequent_word(subm_first.in_vertex.name) &&
              wnet.is_frequent_word(subm_first.out_vertex.name)
      next unless rev_first.in_vertex.type == subm_first.in_vertex.type &&
                  rev_first.out_vertex.type == subm_first.out_vertex.type &&
                  rev_second.out_vertex.type == subm_second.out_vertex.type

      scores = [vertex_score.call(rev_first.in_vertex, subm_first.in_vertex),
                vertex_score.call(rev_first.out_vertex, subm_first.out_vertex),
                vertex_score.call(rev_second.out_vertex, subm_second.out_vertex)].compact
      score = scores.empty? ? 0.0 : scores.inject(0.0) { |sum, s| sum + s } / scores.size
      # Labels of both edge pairs can only lower the vertex-based score.
      score = score.to_f / compare_labels(rev_first, subm_first).to_f
      score = score.to_f / compare_labels(rev_second, subm_second).to_f

      matched = true
      best = score if score > best
    end

    # Review double edges with no comparable counterpart do not dilute the average.
    next unless matched

    cum_double_edge_match += best
    count += 1
  end

  count > 0 ? cum_double_edge_match / count : 0.0
end
470
+ #------------------------------------------#------------------------------------------
471
+
472
# Reversed-order double-edge comparison (SUBJECT-VERB-OBJECT from the review
# against OBJECT-VERB-SUBJECT from the submission).
#
# A double edge is a pair of consecutive edges k, k+1 where the out-vertex of
# edge k equals the in-vertex of edge k+1 and all three vertices have a node
# id other than -1. A review double edge is compared with a submission double
# edge when the review's first/middle/last vertex types equal the
# submission's last/middle/first vertex types. The score is the mean of the
# three recorded vertex similarities for those mirrored pairs; edge labels are
# not taken into account. For each review double edge the best score is kept;
# the method returns the average of those best scores, or 0.0 when nothing was
# compared.
#
# Both edges of a pair must lie within the first num_rev_edg / num_sub_edg
# slots, so the last slot is never used as the start of a pair.
#
# Reads @vertex_match, so compare_vertices has to run first. A vertex pair
# with no recorded similarity is left out of the mean.
#
# rev, subm                - review / submission edge arrays
# num_rev_edg, num_sub_edg - number of leading slots of each array to consider
def compare_SVO_diff_syntax(rev, subm, num_rev_edg, num_sub_edg)
  wnet = WordnetBasedSimilarity.new

  # True when the two consecutive edges form a usable double edge.
  chained = lambda do |first, second|
    !first.nil? && !second.nil? &&
      first.in_vertex.node_id != -1 && first.out_vertex.node_id != -1 &&
      second.out_vertex.node_id != -1 && first.out_vertex == second.in_vertex
  end
  # Similarity recorded by compare_vertices, or nil when there is none.
  vertex_score = lambda do |rev_vertex, subm_vertex|
    row = @vertex_match && @vertex_match[rev_vertex.node_id]
    row && row[subm_vertex.node_id]
  end

  cum_double_edge_match = 0.0
  count = 0
  (0...(num_rev_edg - 1)).each do |i|
    rev_first = rev[i]
    rev_second = rev[i + 1]
    next unless chained.call(rev_first, rev_second)
    # Skip review double edges made up of frequent words only.
    next if wnet.is_frequent_word(rev_first.in_vertex.name) &&
            wnet.is_frequent_word(rev_first.out_vertex.name) &&
            wnet.is_frequent_word(rev_second.out_vertex.name)

    best = 0.0
    matched = false
    (0...(num_sub_edg - 1)).each do |j|
      subm_first = subm[j]
      subm_second = subm[j + 1]
      next unless chained.call(subm_first, subm_second)
      # NOTE(review): no frequent-word filter is applied to the submission
      # side here, unlike the other comparisons. Kept as in the original;
      # confirm whether one was intended.
      next unless rev_first.in_vertex.type == subm_second.out_vertex.type &&
                  rev_first.out_vertex.type == subm_first.out_vertex.type &&
                  rev_second.out_vertex.type == subm_first.in_vertex.type

      # s-v-o (review) against o-v-s (submission): first with last, middle
      # with middle, last with first.
      scores = [vertex_score.call(rev_first.in_vertex, subm_second.out_vertex),
                vertex_score.call(rev_first.out_vertex, subm_first.out_vertex),
                vertex_score.call(rev_second.out_vertex, subm_first.in_vertex)].compact
      score = scores.empty? ? 0.0 : scores.inject(0.0) { |sum, s| sum + s } / scores.size

      matched = true
      best = score if score > best
    end

    # Review double edges with no comparable counterpart do not dilute the average.
    next unless matched

    cum_double_edge_match += best
    count += 1
  end

  count > 0 ? cum_double_edge_match / count : 0.0
end
538
+ #------------------------------------------#------------------------------------------
539
# Compares the labels of two edges and returns the divisor to apply to their
# vertex-based match value.
#
# Labels are only ever used to LOWER a match: giving them the same weight as
# the vertices would let two unrelated edges ("boy - said" and
# "chocolate - melted") score highly just because both carry the same label.
#
# Returns EQUAL (leave the match as is) when both labels are present and equal
# ignoring case, or when both are missing; returns DISTINCT (reduce the match)
# when the labels differ or when exactly one of them is missing.
def compare_labels(edge1, edge2)
  label1 = edge1.label
  label2 = edge2.label

  return EQUAL if label1.nil? && label2.nil?
  # Exactly one label missing: the edges cannot be said to agree.
  return DISTINCT if label1.nil? || label2.nil?

  label1.downcase == label2.downcase ? EQUAL : DISTINCT
end
562
+ end