automated_metareview 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +17 -0
- data/.idea/automated_metareview.iml +91 -0
- data/.idea/encodings.xml +5 -0
- data/.idea/misc.xml +5 -0
- data/.idea/modules.xml +9 -0
- data/.idea/scopes/scope_settings.xml +5 -0
- data/.idea/vcs.xml +8 -0
- data/Gemfile +42 -0
- data/LICENSE.txt +22 -0
- data/README.md +29 -0
- data/Rakefile +1 -0
- data/automated_metareview.gemspec +27 -0
- data/lib/automated_metareview.rb +18 -0
- data/lib/automated_metareview/constants.rb +208 -0
- data/lib/automated_metareview/degree_of_relevance.rb +562 -0
- data/lib/automated_metareview/edge.rb +13 -0
- data/lib/automated_metareview/graph_generator.rb +695 -0
- data/lib/automated_metareview/negations.rb +51 -0
- data/lib/automated_metareview/negative-words.csv +4783 -0
- data/lib/automated_metareview/patterns-assess.csv +17 -0
- data/lib/automated_metareview/patterns-prob-detect.csv +22 -0
- data/lib/automated_metareview/patterns-suggest.csv +20 -0
- data/lib/automated_metareview/plagiarism_check.rb +155 -0
- data/lib/automated_metareview/positive-words.csv +2006 -0
- data/lib/automated_metareview/predict_class.rb +121 -0
- data/lib/automated_metareview/sentence_state.rb +293 -0
- data/lib/automated_metareview/text_preprocessing.rb +342 -0
- data/lib/automated_metareview/text_quantity.rb +26 -0
- data/lib/automated_metareview/tone.rb +212 -0
- data/lib/automated_metareview/version.rb +3 -0
- data/lib/automated_metareview/vertex.rb +18 -0
- data/lib/automated_metareview/wordnet_based_similarity.rb +480 -0
- metadata +158 -0
@@ -0,0 +1,562 @@
|
|
1
|
+
require 'automated_metareview/wordnet_based_similarity'
|
2
|
+
require 'automated_metareview/graph_generator'
|
3
|
+
|
4
|
+
class DegreeOfRelevance
|
5
|
+
#creating accessors for the instance variables
|
6
|
+
attr_accessor :vertex_match
|
7
|
+
attr_accessor :review
|
8
|
+
=begin
|
9
|
+
Identifies relevance between a review and a submission
|
10
|
+
=end
|
11
|
+
# Scores how relevant a review is to a submission.
#
# Word-order graphs are generated for both texts, then vertex, single-edge
# and double-edge (S-V-O) matches are combined into one weighted score.
#
# reviews, submissions - texts handed to GraphGenerator#generate_graph
# num_reviews          - accepted for interface compatibility; not read here
# pos_tagger, core_NLP_tagger - taggers forwarded to the graph generator
# speller              - forwarded to compare_vertices
#
# Returns the weighted match value divided by 6 (the source comments describe
# the individual match metrics as lying in [0-6], so the result is intended
# to lie in [0-1]).
def get_relevance(reviews, submissions, num_reviews, pos_tagger, core_NLP_tagger, speller)
  generator = GraphGenerator.new

  # Review graph: capture its parts before the generator is reused below.
  generator.generate_graph(reviews, pos_tagger, core_NLP_tagger, true, false)
  review_vertices = generator.vertices
  review_edges = generator.edges
  num_rev_vert = generator.num_vertices
  num_rev_edg = generator.num_edges

  # Keep a copy of the review graph for use in content classification.
  # NOTE(review): #clone is shallow, so this relies on generate_graph not
  # mutating the previously built collections in place - confirm.
  @review = generator.clone

  # Submission graph, built with the same generator instance.
  generator.generate_graph(submissions, pos_tagger, core_NLP_tagger, true, false)
  subm_vertices = generator.vertices
  subm_edges = generator.edges
  num_sub_vert = generator.num_vertices
  num_sub_edg = generator.num_edges

  # Vertex comparison also populates @vertex_match, which every edge
  # comparison below reads, so it has to run first.
  vert_match = compare_vertices(pos_tagger, review_vertices, subm_vertices, num_rev_vert, num_sub_vert, speller)

  edge_match = 0
  double_edge_match = 0
  if num_rev_edg > 0 && num_sub_edg > 0
    edge_without_syn = compare_edges_non_syntax_diff(review_edges, subm_edges, num_rev_edg, num_sub_edg)
    edge_with_syn = compare_edges_syntax_diff(review_edges, subm_edges, num_rev_edg, num_sub_edg)
    # Still computed, but currently excluded from the edge score.
    compare_edges_diff_types(review_edges, subm_edges, num_rev_edg, num_sub_edg)
    edge_match = (edge_without_syn.to_f + edge_with_syn.to_f) / 2.to_f

    double_edge = compare_SVO_edges(review_edges, subm_edges, num_rev_edg, num_sub_edg)
    double_edge_with_syn = compare_SVO_diff_syntax(review_edges, subm_edges, num_rev_edg, num_sub_edg)
    double_edge_match = (double_edge.to_f + double_edge_with_syn.to_f) / 2.to_f
  end

  # Weights for vertex, edge and double-edge matches (alpha > beta > gamma).
  alpha = 0.55
  beta = 0.35
  gamma = 0.1
  relevance = (alpha.to_f * vert_match.to_f) + (beta * edge_match.to_f) + (gamma * double_edge_match.to_f)

  # Scale from [0-6] down to [0-1].
  relevance.to_f / 6.to_f
end
|
80
|
+
=begin
|
81
|
+
* every vertex is compared with every other vertex
|
82
|
+
* Compares the vertices from across the two graphs to identify matches and quantify various metrics
|
83
|
+
* rev - vertices from the new review; subm - vertices of the submission/past review
|
84
|
+
=end
|
85
|
+
# Compares every review vertex with every submission vertex and stores the
# pairwise scores in @vertex_match (indexed [review index][submission index]).
#
# pos_tagger   - accepted for interface compatibility; not read here
# rev, subm    - vertex arrays (entries may be nil)
# num_rev_vert - number of review slots to scan
# num_sub_vert - number of submission slots to scan
# speller      - forwarded to WordnetBasedSimilarity#compare_strings
#
# Side effects: assigns node_id on each non-nil review vertex, and on each
# visited submission vertex whose node_id is still -1.
#
# Returns the average, over review vertices that were compared at least once,
# of each vertex's best score (0.0 when nothing was compared).
def compare_vertices(pos_tagger, rev, subm, num_rev_vert, num_sub_vert, speller)
  # One row per review vertex; columns are filled lazily below.
  @vertex_match = Array.new(num_rev_vert) { [] }
  similarity = WordnetBasedSimilarity.new
  total = 0.0
  matched = 0

  (0..num_rev_vert - 1).each do |r|
    next if rev.nil? || rev[r].nil?

    review_vertex = rev[r]
    review_vertex.node_id = r
    # Frequent words take no part in vertex comparison.
    next if similarity.is_frequent_word(review_vertex.name)

    best = 0.0
    compared = false
    (0..num_sub_vert - 1).each do |s|
      subm_vertex = subm[s]
      next if subm_vertex.nil?

      subm_vertex.node_id = s if subm_vertex.node_id == -1
      next if similarity.is_frequent_word(subm_vertex.name)
      # Only compare when both vertices carry a noun ("NN*") tag.
      next unless review_vertex.pos_tag.include?("NN") && subm_vertex.pos_tag.include?("NN")

      score = similarity.compare_strings(review_vertex, subm_vertex, speller)
      @vertex_match[r][s] = score
      compared = true
      best = score if score > best
    end

    # Only vertices that had at least one comparison count towards the average.
    next unless compared

    total += best
    matched += 1
  end

  matched > 0 ? total / matched : 0.0
end
|
142
|
+
|
143
|
+
#------------------------------------------#------------------------------------------
|
144
|
+
=begin
|
145
|
+
* SAME TYPE COMPARISON!!
|
146
|
+
* Compares the edges from across the two graphs to identify matches and quantify various metrics
|
147
|
+
* compare SUBJECT-VERB edges with SUBJECT-VERB matches
|
148
|
+
* where SUBJECT-SUBJECT and VERB-VERB or VERB-VERB and OBJECT-OBJECT comparisons are done
|
149
|
+
=end
|
150
|
+
# Same-type, same-order edge comparison (e.g. S-V against S-V).
#
# For each usable review edge, finds the best-scoring submission edge whose
# in/out vertex types match, using the pairwise scores in @vertex_match
# (in with in, out with out). The vertex average is then divided by
# compare_labels(review_edge, subm_edge).
#
# rev, subm                - edge arrays (entries may be nil)
# num_rev_edg, num_sub_edg - number of slots to scan in each array
#
# Returns the average of the best scores over review edges that had at least
# one comparable submission edge (0.0 when there were none).
def compare_edges_non_syntax_diff(rev, subm, num_rev_edg, num_sub_edg)
  similarity = WordnetBasedSimilarity.new
  total = 0.0
  matched = 0

  (0..num_rev_edg - 1).each do |r|
    review_edge = rev[r]
    next if review_edge.nil? || review_edge.in_vertex.node_id == -1 || review_edge.out_vertex.node_id == -1
    # Skip edges whose two vertices are both frequent words.
    next if similarity.is_frequent_word(review_edge.in_vertex.name) &&
            similarity.is_frequent_word(review_edge.out_vertex.name)

    best = 0.0
    compared = false
    (0..num_sub_edg - 1).each do |s|
      subm_edge = subm[s]
      next if subm_edge.nil? || subm_edge.in_vertex.node_id == -1 || subm_edge.out_vertex.node_id == -1
      next if similarity.is_frequent_word(subm_edge.in_vertex.name) &&
              similarity.is_frequent_word(subm_edge.out_vertex.name)
      next unless review_edge.in_vertex.type == subm_edge.in_vertex.type &&
                  review_edge.out_vertex.type == subm_edge.out_vertex.type
      # Both edges need a label for this comparison.
      next if review_edge.label.nil? || subm_edge.label.nil?

      # A pair has no stored score when its vertices were never compared
      # (e.g. one was a frequent word), so only the scores present are averaged.
      scores = [
        @vertex_match[review_edge.in_vertex.node_id][subm_edge.in_vertex.node_id],
        @vertex_match[review_edge.out_vertex.node_id][subm_edge.out_vertex.node_id]
      ].compact
      vertex_score = scores.empty? ? 0.0 : scores.reduce(0.0) { |acc, v| acc + v }.to_f / scores.size.to_f

      # Labels can only lower the vertex score (divide by the label match value).
      edge_score = vertex_score / compare_labels(review_edge, subm_edge)
      compared = true
      best = edge_score if edge_score > best
    end

    next unless compared

    total += best
    matched += 1
  end

  matched > 0 ? total / matched : 0.0
end
|
227
|
+
#------------------------------------------#------------------------------------------
|
228
|
+
=begin
|
229
|
+
* SAME TYPE COMPARISON!!
|
230
|
+
* Compares the edges from across the two graphs to identify matches and quantify various metrics
|
231
|
+
* compare SUBJECT-VERB edges with VERB-OBJECT matches and vice-versa
|
232
|
+
* where SUBJECT-OBJECT and VERB_VERB comparisons are done - same type comparisons!!
|
233
|
+
=end
|
234
|
+
|
235
|
+
# Same-type comparison across reversed edges (e.g. S-V against V-S).
#
# A review edge is compared with a submission edge when the review's in-vertex
# type equals the submission's out-vertex type and vice versa. The score is
# the average of the available @vertex_match entries for (review in,
# submission out) and (review out, submission in). Labels are not used here.
#
# rev, subm                - edge arrays (entries may be nil)
# num_rev_edg, num_sub_edg - number of slots to scan in each array
#
# Returns the average of the best scores over review edges that had at least
# one comparable submission edge (0.0 when there were none).
def compare_edges_syntax_diff(rev, subm, num_rev_edg, num_sub_edg)
  similarity = WordnetBasedSimilarity.new
  total = 0.0
  matched = 0

  (0..num_rev_edg - 1).each do |r|
    review_edge = rev[r]
    next if review_edge.nil? || review_edge.in_vertex.node_id == -1 || review_edge.out_vertex.node_id == -1
    # Skip edges whose two vertices are both frequent words.
    next if similarity.is_frequent_word(review_edge.in_vertex.name) &&
            similarity.is_frequent_word(review_edge.out_vertex.name)

    best = 0.0
    compared = false
    (0..num_sub_edg - 1).each do |s|
      subm_edge = subm[s]
      next if subm_edge.nil? || subm_edge.in_vertex.node_id == -1 || subm_edge.out_vertex.node_id == -1
      next if similarity.is_frequent_word(subm_edge.in_vertex.name) &&
              similarity.is_frequent_word(subm_edge.out_vertex.name)
      next unless review_edge.in_vertex.type == subm_edge.out_vertex.type &&
                  review_edge.out_vertex.type == subm_edge.in_vertex.type

      # Only pairs with a stored vertex score contribute to the average.
      scores = [
        @vertex_match[review_edge.in_vertex.node_id][subm_edge.out_vertex.node_id],
        @vertex_match[review_edge.out_vertex.node_id][subm_edge.in_vertex.node_id]
      ].compact
      edge_score = scores.empty? ? 0.0 : scores.reduce(0.0) { |acc, v| acc + v }.to_f / scores.size.to_f

      compared = true
      best = edge_score if edge_score > best
    end

    next unless compared

    total += best
    matched += 1
  end

  matched > 0 ? total.to_f / matched.to_f : 0.0
end
|
299
|
+
#------------------------------------------#------------------------------------------
|
300
|
+
=begin
|
301
|
+
DIFFERENT TYPE COMPARISON!!
|
302
|
+
* Compares the edges from across the two graphs to identify matches and quantify various metrics
|
303
|
+
* compare SUBJECT-VERB edges with VERB-OBJECT matches and vice-versa
|
304
|
+
* SUBJECT-VERB, VERB-SUBJECT, OBJECT-VERB, VERB-OBJECT comparisons are done!
|
305
|
+
=end
|
306
|
+
# Different-type edge comparison: vertices are deliberately paired across
# roles (subject with verb, verb with object, and so on).
#
# * When the two edges have the same in/out types, the crossed pairs
#   (review in, submission out) and (review out, submission in) are scored,
#   and the result is divided by compare_labels(review_edge, subm_edge).
# * When the two edges have reversed in/out types, the straight pairs
#   (in with in, out with out) are scored; labels are not applied.
#
# rev, subm                - edge arrays (entries may be nil)
# num_rev_edg, num_sub_edg - number of slots to scan in each array
#
# Returns the average of the best scores over review edges that had at least
# one comparable submission edge (0.0 when there were none).
def compare_edges_diff_types(rev, subm, num_rev_edg, num_sub_edg)
  similarity = WordnetBasedSimilarity.new
  total = 0.0
  matched = 0

  (0..num_rev_edg - 1).each do |r|
    review_edge = rev[r]
    next if review_edge.nil? || review_edge.in_vertex.node_id == -1 || review_edge.out_vertex.node_id == -1
    # Skip edges whose two vertices are both frequent words.
    next if similarity.is_frequent_word(review_edge.in_vertex.name) &&
            similarity.is_frequent_word(review_edge.out_vertex.name)

    best = 0.0
    compared = false
    (0..num_sub_edg - 1).each do |s|
      subm_edge = subm[s]
      next if subm_edge.nil? || subm_edge.in_vertex.node_id == -1 || subm_edge.out_vertex.node_id == -1
      next if similarity.is_frequent_word(subm_edge.in_vertex.name) &&
              similarity.is_frequent_word(subm_edge.out_vertex.name)

      if review_edge.in_vertex.type == subm_edge.in_vertex.type &&
         review_edge.out_vertex.type == subm_edge.out_vertex.type
        # S-V with S-V or V-O with V-O: score the crossed vertex pairs.
        pairs = [[review_edge.in_vertex, subm_edge.out_vertex], [review_edge.out_vertex, subm_edge.in_vertex]]
        use_labels = true
      elsif review_edge.in_vertex.type == subm_edge.out_vertex.type &&
            review_edge.out_vertex.type == subm_edge.in_vertex.type
        # S-V with V-O or V-O with S-V: score the straight vertex pairs.
        pairs = [[review_edge.in_vertex, subm_edge.in_vertex], [review_edge.out_vertex, subm_edge.out_vertex]]
        use_labels = false
      else
        next
      end

      # Only pairs with a stored vertex score contribute to the average.
      scores = pairs.map { |rv, sv| @vertex_match[rv.node_id][sv.node_id] }.compact
      edge_score = scores.empty? ? 0.0 : scores.reduce(0.0) { |acc, v| acc + v }.to_f / scores.size.to_f
      edge_score = edge_score / compare_labels(review_edge, subm_edge) if use_labels

      compared = true
      best = edge_score if edge_score > best
    end

    next unless compared

    total += best
    matched += 1
  end

  matched > 0 ? total.to_f / matched.to_f : 0.0
end
|
393
|
+
#------------------------------------------#------------------------------------------
|
394
|
+
|
395
|
+
# Double-edge (S-V-O) comparison in the same order.
#
# Two consecutive edges form a chain when the first edge's out-vertex equals
# the second edge's in-vertex. Each review chain is compared with each
# submission chain of matching vertex types; the score is the average of the
# available @vertex_match entries for the three aligned vertices, divided by
# the label match value of each of the two aligned edge pairs.
#
# rev, subm                - edge arrays (entries may be nil)
# num_rev_edg, num_sub_edg - number of slots to scan in each array
#
# Returns the average of the best scores over review chains that had at least
# one comparable submission chain (0.0 when there were none).
def compare_SVO_edges(rev, subm, num_rev_edg, num_sub_edg)
  similarity = WordnetBasedSimilarity.new
  total = 0.0
  matched = 0

  (0..num_rev_edg - 1).each do |r|
    first_rev = rev[r]
    second_rev = rev[r + 1]
    next if first_rev.nil? || second_rev.nil?
    next if first_rev.in_vertex.node_id == -1 || first_rev.out_vertex.node_id == -1 ||
            second_rev.out_vertex.node_id == -1
    next unless first_rev.out_vertex == second_rev.in_vertex
    # Skip chains in which all three vertices are frequent words.
    next if similarity.is_frequent_word(first_rev.in_vertex.name) &&
            similarity.is_frequent_word(first_rev.out_vertex.name) &&
            similarity.is_frequent_word(second_rev.out_vertex.name)

    best = 0.0
    compared = false
    (0..num_sub_edg - 1).each do |s|
      first_sub = subm[s]
      second_sub = subm[s + 1]
      next if first_sub.nil? || second_sub.nil?
      next if first_sub.in_vertex.node_id == -1 || first_sub.out_vertex.node_id == -1 ||
              second_sub.out_vertex.node_id == -1
      next unless first_sub.out_vertex == second_sub.in_vertex
      # On the submission side only the first edge's two vertices are checked.
      next if similarity.is_frequent_word(first_sub.in_vertex.name) &&
              similarity.is_frequent_word(first_sub.out_vertex.name)
      next unless first_rev.in_vertex.type == first_sub.in_vertex.type &&
                  first_rev.out_vertex.type == first_sub.out_vertex.type &&
                  second_rev.out_vertex.type == second_sub.out_vertex.type

      # Only pairs with a stored vertex score contribute to the average.
      scores = [
        @vertex_match[first_rev.in_vertex.node_id][first_sub.in_vertex.node_id],
        @vertex_match[first_rev.out_vertex.node_id][first_sub.out_vertex.node_id],
        @vertex_match[second_rev.out_vertex.node_id][second_sub.out_vertex.node_id]
      ].compact
      chain_score = scores.empty? ? 0.0 : scores.reduce(0.0) { |acc, v| acc + v }.to_f / scores.size.to_f

      # Apply the label match of each aligned edge pair in turn.
      chain_score = chain_score.to_f / compare_labels(first_rev, first_sub).to_f
      chain_score = chain_score.to_f / compare_labels(second_rev, second_sub).to_f

      best = chain_score if chain_score > best
      compared = true
    end

    next unless compared

    total += best
    matched += 1
  end

  matched > 0 ? total.to_f / matched.to_f : 0.0
end
|
470
|
+
#------------------------------------------#------------------------------------------
|
471
|
+
|
472
|
+
# Double-edge comparison with reversed syntax: an S-V-O chain from the review
# is compared with an O-V-S chain from the submission.
#
# Two consecutive edges form a chain when the first edge's out-vertex equals
# the second edge's in-vertex. Chains are compared when the review's first
# vertex type equals the submission's last, the middle vertex types agree, and
# the review's last vertex type equals the submission's first. The score is
# the average of the available @vertex_match entries for those three pairs.
# Labels are not used, and submission chains get no frequent-word check here.
#
# rev, subm                - edge arrays (entries may be nil)
# num_rev_edg, num_sub_edg - number of slots to scan in each array
#
# Returns the average of the best scores over review chains that had at least
# one comparable submission chain (0.0 when there were none).
def compare_SVO_diff_syntax(rev, subm, num_rev_edg, num_sub_edg)
  similarity = WordnetBasedSimilarity.new
  total = 0.0
  matched = 0

  (0..num_rev_edg - 1).each do |r|
    first_rev = rev[r]
    second_rev = rev[r + 1]
    next if first_rev.nil? || second_rev.nil?
    next if first_rev.in_vertex.node_id == -1 || first_rev.out_vertex.node_id == -1 ||
            second_rev.out_vertex.node_id == -1
    next unless first_rev.out_vertex == second_rev.in_vertex
    # Skip chains in which all three vertices are frequent words.
    next if similarity.is_frequent_word(first_rev.in_vertex.name) &&
            similarity.is_frequent_word(first_rev.out_vertex.name) &&
            similarity.is_frequent_word(second_rev.out_vertex.name)

    best = 0.0
    compared = false
    (0..num_sub_edg - 1).each do |s|
      first_sub = subm[s]
      second_sub = subm[s + 1]
      next if first_sub.nil? || second_sub.nil?
      next if first_sub.in_vertex.node_id == -1 || first_sub.out_vertex.node_id == -1 ||
              second_sub.out_vertex.node_id == -1
      next unless first_sub.out_vertex == second_sub.in_vertex
      next unless first_rev.in_vertex.type == second_sub.out_vertex.type &&
                  first_rev.out_vertex.type == first_sub.out_vertex.type &&
                  second_rev.out_vertex.type == first_sub.in_vertex.type

      # s-v-o (review) against o-v-s (submission); only pairs with a stored
      # vertex score contribute to the average.
      scores = [
        @vertex_match[first_rev.in_vertex.node_id][second_sub.out_vertex.node_id],
        @vertex_match[first_rev.out_vertex.node_id][first_sub.out_vertex.node_id],
        @vertex_match[second_rev.out_vertex.node_id][first_sub.in_vertex.node_id]
      ].compact
      chain_score = scores.empty? ? 0.0 : scores.reduce(0.0) { |acc, v| acc + v }.to_f / scores.size.to_f

      compared = true
      best = chain_score if chain_score > best
    end

    next unless compared

    total += best
    matched += 1
  end

  matched > 0 ? total.to_f / matched.to_f : 0.0
end
|
538
|
+
#------------------------------------------#------------------------------------------
|
539
|
+
=begin
|
540
|
+
SR Labels and vertex matches are given equal importance
|
541
|
+
* Problem is even if the vertices didn't match, the SRL labels would cause them to have a high similarity.
|
542
|
+
* Consider "boy - said" and "chocolate - melted" - these edges have NOMATCH for vertices, but both edges have the same label "SBJ" and would get an EXACT match,
|
543
|
+
* resulting in an average of 3! This can't be right!
|
544
|
+
* We therefore use the labels to only decrease the match value found from vertices, i.e., if the labels were different.
|
545
|
+
* Match value will be left as is, if the labels were the same.
|
546
|
+
=end
|
547
|
+
# Compares the labels of two edges and returns the value by which callers
# divide a vertex-based match score.
#
# edge1, edge2 - objects responding to #label (a String or nil)
#
# Returns EQUAL when both labels are missing or when they are the same
# ignoring case, and DISTINCT when the labels differ or when exactly one of
# them is missing.
#
# Fix: the "only one label is nil" branch used to repeat the both-present
# test, so a labelled edge1 paired with an unlabelled edge2 fell through
# every branch and was reported as EQUAL. Both one-sided cases now return
# DISTINCT.
def compare_labels(edge1, edge2)
  first = edge1.label
  second = edge2.label

  # Neither edge carries a label: nothing to tell them apart.
  return EQUAL if first.nil? && second.nil?
  # Exactly one label is missing (either side): treat as different.
  return DISTINCT if first.nil? || second.nil?

  first.downcase == second.downcase ? EQUAL : DISTINCT
end
|
562
|
+
end
|