automated_metareview 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +17 -0
- data/.idea/automated_metareview.iml +91 -0
- data/.idea/encodings.xml +5 -0
- data/.idea/misc.xml +5 -0
- data/.idea/modules.xml +9 -0
- data/.idea/scopes/scope_settings.xml +5 -0
- data/.idea/vcs.xml +8 -0
- data/Gemfile +42 -0
- data/LICENSE.txt +22 -0
- data/README.md +29 -0
- data/Rakefile +1 -0
- data/automated_metareview.gemspec +27 -0
- data/lib/automated_metareview.rb +18 -0
- data/lib/automated_metareview/constants.rb +208 -0
- data/lib/automated_metareview/degree_of_relevance.rb +562 -0
- data/lib/automated_metareview/edge.rb +13 -0
- data/lib/automated_metareview/graph_generator.rb +695 -0
- data/lib/automated_metareview/negations.rb +51 -0
- data/lib/automated_metareview/negative-words.csv +4783 -0
- data/lib/automated_metareview/patterns-assess.csv +17 -0
- data/lib/automated_metareview/patterns-prob-detect.csv +22 -0
- data/lib/automated_metareview/patterns-suggest.csv +20 -0
- data/lib/automated_metareview/plagiarism_check.rb +155 -0
- data/lib/automated_metareview/positive-words.csv +2006 -0
- data/lib/automated_metareview/predict_class.rb +121 -0
- data/lib/automated_metareview/sentence_state.rb +293 -0
- data/lib/automated_metareview/text_preprocessing.rb +342 -0
- data/lib/automated_metareview/text_quantity.rb +26 -0
- data/lib/automated_metareview/tone.rb +212 -0
- data/lib/automated_metareview/version.rb +3 -0
- data/lib/automated_metareview/vertex.rb +18 -0
- data/lib/automated_metareview/wordnet_based_similarity.rb +480 -0
- metadata +158 -0
@@ -0,0 +1,13 @@
|
|
1
|
+
# An edge of the review/submission graph, connecting two Vertex objects.
#
# Only the name, type and match bookkeeping are initialised here; the
# remaining attributes (edgeID, index, in_vertex, out_vertex, label) stay nil
# until the caller assigns them.
class Edge
  attr_accessor :edgeID, :type, :name, :index, :in_vertex, :out_vertex, :edge_match, :average_match, :frequency, :label

  # edge_name - name of the edge (e.g. "verb", "noun-property")
  # edge_type - 1 - verb, 2 - adjective, 3 - adverb
  def initialize(edge_name, edge_type)
    @name = edge_name
    @type = edge_type
    @average_match = 0.0 # match starts at 0
    @frequency = 0
    # number of matches for each metric value, all starting at 0
    # (a fresh array per edge, so edges never share match counters)
    @edge_match = [0, 0, 0, 0, 0]
  end
end
|
@@ -0,0 +1,695 @@
|
|
1
|
+
require 'automated_metareview/sentence_state'
|
2
|
+
require 'automated_metareview/edge'
|
3
|
+
require 'automated_metareview/vertex'
|
4
|
+
|
5
|
+
class GraphGenerator
|
6
|
+
#include SentenceState
|
7
|
+
#creating accessors for the instance variables
|
8
|
+
attr_accessor :vertices, :num_vertices, :edges, :num_edges, :pipeline, :pos_tagger
|
9
|
+
|
10
|
+
# #global variables
|
11
|
+
# $vertices = Array.new
|
12
|
+
# $edges = Array.new
|
13
|
+
|
14
|
+
=begin
|
15
|
+
* Generates the graph (vertices and edges) for the given review or submission text.
|
16
|
+
* INPUT: an array of sentences for a review or a submission. Every row in 'text' contains one sentence.
|
17
|
+
* type - tells you if it was a review or a submission
|
18
|
+
* type = 1 - submission/past review
|
19
|
+
* type = 2 - new review
|
20
|
+
=end
|
21
|
+
# Builds the graph (@vertices / @edges) for a review or submission.
#
# text          - array of sentences; nil and blank sentences are skipped
# pos_tagger    - part-of-speech tagger; get_readable(sentence) must return
#                 space separated "word/TAG" tokens
# coreNLPTagger - pipeline used by find_parents / find_labels (dependency parsing)
# forRelevance, forPatternIdentify - accepted for interface compatibility;
#                 not used inside this method
#
# Every sentence is walked token by token. Tokens are classified as noun,
# adjective, verb or adverb; consecutive tokens of the same class are merged
# into one vertex, and edges are added between nouns, verbs and their
# properties (adjectives / adverbs).
#
# Returns @num_edges. Both @num_vertices and @num_edges are decremented by one
# before returning, exactly as the original implementation did.
def generate_graph(text, pos_tagger, coreNLPTagger, forRelevance, forPatternIdentify)
  @vertices = []
  @num_vertices = 0
  @edges = []
  @num_edges = 0

  @pos_tagger = pos_tagger    # part of speech tagger
  @pipeline = coreNLPTagger   # dependency parsing

  text.each_with_index do |sentence, i|
    next if sentence.nil? || sentence.split(" ").empty?

    tagged_sentence = @pos_tagger.get_readable(sentence)
    parents = find_parents(sentence)
    labels = find_labels(sentence)

    # the first state applies until a coordinating conjunction is met
    states = SentenceState.new.identify_sentence_state(tagged_sentence)
    state = states[0]
    next_state = 1

    # per-sentence bookkeeping: the text of every vertex seen so far, by type
    words = { NOUN => [], VERB => [], ADJ => [], ADV => [] }
    counts = { NOUN => 0, VERB => 0, ADJ => 0, ADV => 0 }
    prev_type = nil
    slot = 0 # position into labels/parents; only advances for tokens that are not skipped

    tagged_sentence.split(" ").each do |tagged_token|
      # split on the LAST slash so that words which themselves contain "/"
      # keep their full text and get the real POS tag
      slash = tagged_token.rindex("/")
      next if slash.nil? # token carries no tag at all
      plain_token = tagged_token[0...slash].to_s
      pos_tag = tagged_token[(slash + 1)..-1].to_s

      # ignore periods, possessive markers ("'s") and tokens whose tag is
      # missing or a single character
      next if plain_token == "." || tagged_token.include?("/POS") || pos_tag.length < 2

      # a CC belongs to the following sentence segment, so the state of that
      # segment is selected as soon as the CC is seen
      if tagged_token.include?("/CC")
        state = states[next_state]
        next_state += 1
      end

      type = graph_vertex_type(tagged_token)
      unless type.nil?
        vertex, counts[type] = upsert_graph_vertex(words[type], counts[type], type, prev_type == type,
                                                   plain_token, pos_tag, i, state, labels[slot], parents[slot])
        connect_graph_vertex(vertex, type, prev_type, words, counts, i)
        prev_type = type
      end

      slot += 1
    end
  end

  @num_vertices -= 1 # kept from the original: the counter is treated as one ahead of the contents
  @num_edges -= 1    # same reason as for num_vertices
  set_semantic_labels_for_edges
  @num_edges
end

# Maps a "word/TAG" token to NOUN, ADJ, VERB or ADV (nil for anything else).
# The checks and their order are those of the original if/elsif chain.
def graph_vertex_type(tagged_token)
  if tagged_token.include?("NN") || tagged_token.include?("PRP") || tagged_token.include?("IN") ||
     tagged_token.include?("/EX") || tagged_token.include?("WP")
    NOUN
  elsif tagged_token.include?("/JJ")
    ADJ
  elsif tagged_token.include?("/VB") || tagged_token.include?("MD")
    VERB
  elsif tagged_token.include?("RB")
    ADV
  end
end

# Appends the token to the previous vertex of the same type (when
# +continuing+) or looks up / creates a vertex for it.
# +words+ is the per-type list of vertex texts and +count+ its fill level.
# Returns [vertex, new_count].
def upsert_graph_vertex(words, count, type, continuing, token, pos_tag, sentence_index, state, label, parent)
  if continuing && count > 0
    count -= 1 # we are extending the previous vertex of this type
    previous = search_vertices(@vertices, words[count], sentence_index)
    words[count] = words[count].to_s + " " + token
    vertex = search_vertices(@vertices, words[count], sentence_index)
    if vertex.nil? && !previous.nil?
      # the concatenated text is new: rename the previous vertex in place
      previous.name = previous.name.to_s + " " + token
      vertex = previous
      # The original tested `label != "NMOD" or label != "PMOD"`, which is
      # always true; the label is now kept when the new one is NMOD/PMOD.
      vertex.label = label unless label == "NMOD" || label == "PMOD"
    end
  else
    words[count] = token
    vertex = search_vertices(@vertices, token, sentence_index)
  end

  if vertex.nil? # nothing to reuse: create a brand new vertex
    vertex = Vertex.new(words[count], type, sentence_index, state, label, parent, pos_tag)
    @vertices[@num_vertices] = vertex
    @num_vertices += 1
  end

  remove_redundant_vertices(words[count], sentence_index)
  [vertex, count + 1]
end

# Adds the edges that follow from a vertex that was just created or extended.
#
# Nouns and verbs ("heads"): when the previous token was the matching
# property type (adjective for nouns, adverb for verbs) the property is moved
# from the older head to this one; the head is also linked to the latest
# verb / noun. Adjectives and adverbs are linked to the latest noun / verb.
def connect_graph_vertex(vertex, type, prev_type, words, counts, sentence_index)
  if type == NOUN || type == VERB
    modifier = (type == NOUN) ? ADJ : ADV
    other_head = (type == NOUN) ? VERB : NOUN
    property = (type == NOUN) ? "noun-property" : "verb-property"

    if prev_type == modifier
      latest_modifier = words[modifier][counts[modifier] - 1]
      if counts[type] > 1
        # the head before this one (hence -2) loses the property
        older_head = search_vertices(@vertices, words[type][counts[type] - 2], sentence_index)
        unlink_graph_vertices(older_head, search_vertices(@vertices, latest_modifier, sentence_index), sentence_index)
      end
      link_graph_vertices(property, search_vertices(@vertices, latest_modifier, sentence_index), vertex, sentence_index)
    end

    if counts[other_head] > 0
      latest_other = words[other_head][counts[other_head] - 1]
      link_graph_vertices("verb", search_vertices(@vertices, latest_other, sentence_index), vertex, sentence_index)
    end
  else
    head = (type == ADJ) ? NOUN : VERB
    property = (type == ADJ) ? "noun-property" : "verb-property"
    if counts[head] > 0
      latest_head = words[head][counts[head] - 1]
      link_graph_vertices(property, search_vertices(@vertices, latest_head, sentence_index), vertex, sentence_index)
    end
  end
end

# Adds an edge v1 -> v2 unless search_edges already finds one, then prunes
# edges made redundant by it. All edges are created with type VERB, as before.
def link_graph_vertices(edge_name, v1, v2, sentence_index)
  return if v1.nil? || v2.nil?
  return unless search_edges(@edges, v1, v2, sentence_index) == -1

  edge = Edge.new(edge_name, VERB)
  edge.in_vertex = v1
  edge.out_vertex = v2
  edge.index = sentence_index
  # Append instead of writing to @edges[@num_edges]: after an edge has been
  # nulled @num_edges can point at a live edge, which would be overwritten.
  @edges << edge
  @num_edges = @edges.length
  # remove_redundant_edges compacts @edges and resets @num_edges itself
  remove_redundant_edges(v1, v2, sentence_index)
end

# Nulls the edge between v1 and v2 (if search_edges_to_set_null finds one)
# and deducts it from the edge counter.
def unlink_graph_vertices(v1, v2, sentence_index)
  return if v1.nil? || v2.nil?
  position = search_edges_to_set_null(@edges, v1, v2, sentence_index)
  return if position == -1 # -1 is when no such edge exists

  @edges[position] = nil
  @num_edges -= 1 if @num_edges > 0
end
private :graph_vertex_type, :upsert_graph_vertex, :connect_graph_vertex, :link_graph_vertices, :unlink_graph_vertices
|
362
|
+
|
363
|
+
#------------------------------------------#------------------------------------------#------------------------------------------
|
364
|
+
|
365
|
+
# Finds the vertex named +s+ (case-insensitive) that belongs to sentence
# +index+.
#
# list  - array of vertices; nil entries are ignored
# s     - vertex text to look for
# index - sentence number the vertex has to belong to
#
# Returns the first matching vertex, or nil when there is none (also when
# +list+ or +s+ is nil).
def search_vertices(list, s, index)
  return nil if list.nil? || s.nil?

  list.each do |vertex|
    next if vertex.nil?
    # the vertex has to exist in the same sentence (index)
    return vertex if vertex.name.casecmp(s) == 0 && vertex.index == index
  end
  nil
end # end of the search_vertices method
|
378
|
+
|
379
|
+
#------------------------------------------#------------------------------------------#------------------------------------------
|
380
|
+
|
381
|
+
=begin
|
382
|
+
NULLIFY ALL VERTICES CONTAINING "ONLY SUBSTRINGS" (and not exact matches) OF THIS VERTEX IN THE SAME SENTENCE (verts[j].index == index)
|
383
|
+
And reset the @vertices array with non-null elements.
|
384
|
+
=end
|
385
|
+
# Removes every vertex of sentence +index+ whose name is a strict substring
# (case-insensitive, not an exact match) of +s+, together with every edge
# that has such a vertex as in- or out-vertex, then rebuilds @vertices
# without nil entries.
#
# Vertices with a one-character name are never removed, so that an individual
# vertex such as "I" is not replaced by nil.
# Edges are only set to nil here; @edges is not compacted and @num_edges is
# left untouched.
# A nil +s+ matches nothing and only triggers the compaction.
#
# NOTE(review): @num_vertices is deliberately set to the vertex count PLUS
# ONE afterwards. generate_graph subtracts one again at the end, so the
# convention is kept as it is.
def remove_redundant_vertices(s, index)
  unless s.nil?
    needle = s.downcase
    (@num_vertices - 1).downto(0) do |j|
      vertex = @vertices[j]
      next if vertex.nil? || vertex.index != index
      next if s.casecmp(vertex.name) == 0 # exact matches are kept
      next unless vertex.name.length > 1 && needle.include?(vertex.name.downcase)

      # drop all edges that start or end at this vertex
      unless @edges.nil?
        @edges.each_index do |k|
          edge = @edges[k]
          next if edge.nil?
          @edges[k] = nil if edge.in_vertex == vertex || edge.out_vertex == vertex
        end
      end
      # finally drop the vertex itself
      @vertices[j] = nil
    end
  end

  # recreate the vertices array without the nil values
  @vertices = @vertices.compact
  @num_vertices = @vertices.length + 1
end
|
425
|
+
|
426
|
+
#------------------------------------------#------------------------------------------#------------------------------------------
|
427
|
+
|
428
|
+
=begin
|
429
|
+
Checks to see if an edge between vertices "in" and "out" exists.
|
430
|
+
Returns the position (in the list) of the last matching edge, or -1 if no such edge exists.
|
431
|
+
edge[] list, vertex in, vertex out, int index
|
432
|
+
=end
|
433
|
+
# Checks whether an edge between the vertices +in_vertex+ and +out+ exists,
# in either direction.
#
# An endpoint of a stored edge matches a vertex when the names are equal
# ignoring case, or when the stored name contains the vertex name (this
# containment check IS case-sensitive).
#
# list      - array of edges; nil edges and edges with a nil endpoint are skipped
# in_vertex - first vertex
# out       - second vertex
# index     - sentence number of the caller
#
# Side effect: every matching edge that belongs to a different sentence than
# +index+ gets its frequency incremented.
#
# Returns the position of the LAST matching edge, or -1 when there is no
# match or +list+ is nil.
def search_edges(list, in_vertex, out, index)
  edge_pos = -1
  return edge_pos if list.nil?

  covers = lambda do |stored_name, wanted_name|
    stored_name.casecmp(wanted_name) == 0 || stored_name.include?(wanted_name)
  end

  list.each_with_index do |edge, position|
    next if edge.nil? || edge.in_vertex.nil? || edge.out_vertex.nil?

    stored_in = edge.in_vertex.name
    stored_out = edge.out_vertex.name
    same_direction = covers.call(stored_in, in_vertex.name) && covers.call(stored_out, out.name)
    next unless same_direction || (covers.call(stored_in, out.name) && covers.call(stored_out, in_vertex.name))

    edge_pos = position
    # the edge was found again in a different sentence
    edge.frequency += 1 if index != edge.index
  end
  edge_pos
end # end of search_edges
|
458
|
+
#------------------------------------------#------------------------------------------#------------------------------------------
|
459
|
+
|
460
|
+
# Looks for an edge whose end points have exactly the names of +in_vertex+
# and +out+ (ignoring case), in either direction.
#
# Only the first @num_edges positions of +list+ are inspected; nil edges and
# edges with a nil endpoint are skipped.
#
# Side effect: every matching edge that belongs to a different sentence than
# +index+ gets its frequency incremented.
#
# Returns the position of the LAST matching edge, or -1 when none matches
# (also when +list+ or one of the vertices is nil).
def search_edges_to_set_null(list, in_vertex, out, index)
  edge_pos = -1
  return edge_pos if list.nil? || in_vertex.nil? || out.nil?

  wanted_in = in_vertex.name.downcase
  wanted_out = out.name.downcase

  (0...@num_edges).each do |position|
    edge = list[position]
    next if edge.nil? || edge.in_vertex.nil? || edge.out_vertex.nil?

    stored_in = edge.in_vertex.name.downcase
    stored_out = edge.out_vertex.name.downcase
    # exact match with the edge, in one direction or the other
    next unless (stored_in == wanted_in && stored_out == wanted_out) ||
                (stored_in == wanted_out && stored_out == wanted_in)

    edge_pos = position
    # the edge was found again in a different sentence
    edge.frequency += 1 if index != edge.index
  end
  edge_pos
end # end of the method search_edges_to_set_null
|
482
|
+
#------------------------------------------#------------------------------------------#------------------------------------------
|
483
|
+
=begin
|
484
|
+
NULLIFY ALL EDGES CONTAINING "ONLY SUBSTRINGS" (and not exact matches) OF EITHER IN/OUT VERTICES IN THE SAME SENTENCE (verts[j].index == index)
|
485
|
+
And reset the @edges array with non-null elements.
|
486
|
+
=end
|
487
|
+
|
488
|
+
def remove_redundant_edges(in_vertex, out, index)
|
489
|
+
list = @edges
|
490
|
+
j = @num_edges - 1
|
491
|
+
while j >= 0 do
|
492
|
+
if(!list[j].nil? and list[j].index == index)
|
493
|
+
#when invertices are eq and out-verts are substrings or vice versa
|
494
|
+
if(in_vertex.name.casecmp(list[j].in_vertex.name) == 0 and out.name.casecmp(list[j].out_vertex.name) != 0 and out.name.downcase.include?(list[j].out_vertex.name.downcase))
|
495
|
+
# puts("FOUND out_vertex match for edge:: #{list[j].in_vertex.name} - #{list[j].out_vertex.name}")
|
496
|
+
list[j] = nil
|
497
|
+
#@num_edges-=1
|
498
|
+
#when in-vertices are only substrings and out-verts are equal
|
499
|
+
elsif(in_vertex.name.casecmp(list[j].in_vertex.name)!=0 and in_vertex.name.downcase.include?(list[j].in_vertex.name.downcase) and out.name.casecmp(list[j].out_vertex.name)==0)
|
500
|
+
# puts("FOUND in_vertex match for edge: #{list[j].in_vertex.name} - #{list[j].out_vertex.name}")
|
501
|
+
list[j] = nil
|
502
|
+
#@num_edges-=1
|
503
|
+
end
|
504
|
+
end
|
505
|
+
j-=1
|
506
|
+
end #end of the while loop
|
507
|
+
# puts "**** search_edges:: Old number #{@num_edges}"
|
508
|
+
#recreating the edges array without the nil values
|
509
|
+
counter = 0
|
510
|
+
edges_array = Array.new
|
511
|
+
list.each{
|
512
|
+
|edge|
|
513
|
+
if(!edge.nil?)
|
514
|
+
# puts "edge:: #{edge.in_vertex.name} - #{edge.out_vertex.name}"
|
515
|
+
edges_array << edge
|
516
|
+
counter+=1
|
517
|
+
end
|
518
|
+
}
|
519
|
+
@edges = edges_array
|
520
|
+
@num_edges = counter+1
|
521
|
+
# puts "**** search_edges:: New number of edges #{@num_edges}"
|
522
|
+
end
|
523
|
+
|
524
|
+
#------------------------------------------#------------------------------------------#------------------------------------------
|
525
|
+
# Prints the graph to standard output (debugging aid).
#
# edges    - array of edges; nil edges and edges with a nil endpoint are skipped
# vertices - array of vertices; nil entries are skipped
#
# For every vertex the name, frequency, state, label and parent are printed;
# for every edge the names and states of both end points, the edge frequency
# and the edge label.
def print_graph(edges, vertices)
  puts("*** List of vertices::")
  vertices.each do |vertex|
    next if vertex.nil?
    puts("@@@ Vertex:: #{vertex.name}")
    puts("*** Frequency:: #{vertex.frequency} State:: #{vertex.state}")
    puts("*** Label:: #{vertex.label} Parent:: #{vertex.parent}")
  end
  puts("*******")
  puts("*** List of edges::")
  edges.each do |edge|
    next if edge.nil? || edge.in_vertex.nil? || edge.out_vertex.nil?
    puts("@@@ Edge:: #{edge.in_vertex.name} & #{edge.out_vertex.name}")
    puts("*** Frequency:: #{edge.frequency} State:: #{edge.in_vertex.state} & #{edge.out_vertex.state}")
    puts("*** Label:: #{edge.label}")
  end
  puts("--------------")
end # end of print_graph method
|
545
|
+
|
546
|
+
#------------------------------------------#------------------------------------------#------------------------------------------
|
547
|
+
#Identifying parents and labels for the vertices
|
548
|
+
# Identifies the parent (in the dependency parse) of every token of a sentence.
#
# t - the sentence text
#
# The sentence is annotated with @pipeline and the collapsed, CC-processed
# dependencies of its first parsed sentence are queried for each
# whitespace-separated token.
#
# Returns an array indexed like t.split(" "): the parent's word for tokens
# whose parent was found, nil otherwise. Positions of pure punctuation tokens
# are never assigned.
def find_parents(t)
  tp = TextPreprocessing.new
  tokens = t.split(" ")
  parents = []

  t = StanfordCoreNLP::Text.new(t) # the same variable has to be passed into the Text.new method
  @pipeline.annotate(t)
  # identify the parsed form of the (first) sentence
  sentences = t.get(:sentences).toArray
  parsed_sentence = sentences[0].get(:collapsed_c_c_processed_dependencies)

  (0...tokens.length).each do |j|
    next if tp.is_punct(tokens[j])

    # NOTE(review): contains_punct is intentionally called twice, exactly as
    # before; its implementation is not visible here and may depend on it.
    if tp.contains_punct(tokens[j])
      tokens[j] = tp.contains_punct(tokens[j])
    end

    # reset for every token, so that a token without a lookup can no longer
    # inherit the parent found for an earlier token
    parent = nil
    if !tokens[j].nil? && !tp.contains_punct_bool(tokens[j])
      nodes = parsed_sentence.getAllNodesByWordPattern(tokens[j]).toArray
      # no node for this word means there is nothing to ask the parents of
      parent = parsed_sentence.getParents(nodes[0]).toArray unless nodes[0].nil?
    end

    if !parent.nil? && !parent[0].nil?
      # the parent is printed as "name-SUFFIX": cut at the LAST "-" so that
      # hyphenated words keep their full name
      parent_text = parent[0].to_s
      cut = parent_text.rindex("-")
      parents[j] = cut.nil? ? parent_text : parent_text[0...cut]
    else
      parents[j] = nil
    end
  end
  parents
end # end of find_parents method
|
586
|
+
#------------------------------------------#------------------------------------------#------------------------------------------
|
587
|
+
#Identifying dependency labels for the tokens of the text
|
588
|
+
# Finds the dependency-relation label of every whitespace-separated token in +t+.
#
# The text is annotated with the pipeline stored in @pipeline; the typed
# dependencies of the first sentence's :collapsed_c_c_processed_dependencies
# graph are then searched for each token.
#
# @param t [String] the text to analyse
# @return [Array<String, nil>] labels[j] is the short relation name of the first
#   typed dependency whose dependent word equals the j-th token (after removing
#   "." and ","). Slots of tokens without a match are never assigned, so the
#   array may contain nil gaps or be shorter than the token list.
def find_labels(t)
  tokens = t.split(" ")
  t = StanfordCoreNLP::Text.new(t)
  @pipeline.annotate(t)
  # only the first sentence's parsed form is used
  sentences = t.get(:sentences).toArray
  graph = sentences[0].get(:collapsed_c_c_processed_dependencies)
  dependencies = graph.typedDependencies.toArray

  labels = []
  tokens.each_with_index do |token, j|
    # only full stops and commas are stripped before matching
    word = token.delete(".,")
    # first matching dependency wins, as in a linear scan with break
    match = dependencies.find { |dependency| dependency.dep.value == word }
    labels[j] = match.reln.getShortName unless match.nil?
  end
  labels
end # end of find_labels method
|
617
|
+
#------------------------------------------#------------------------------------------#------------------------------------------
|
618
|
+
=begin
|
619
|
+
* Setting semantic labels for edges based on the labels vertices have with their parents
|
620
|
+
=end
|
621
|
+
# Copies each vertex's label onto the edge that joins the vertex to its parent.
#
# For every non-nil vertex in @vertices that has a parent name, the first vertex
# whose name matches that parent (case-insensitively, or by containing it) is
# taken as the parent vertex. Every edge in @edges that joins the two vertices,
# in either direction, then receives the vertex's label when the edge
#   * has no label yet, or
#   * is labelled "NMOD"/"PMOD" and the vertex label is neither of those.
#
# Fixes relative to the earlier implementation:
#   * the parent is looked up afresh for every vertex (a failed lookup used to
#     re-use the parent found for a previous vertex);
#   * names are compared by content (==) rather than object identity (equal?);
#   * an "NMOD"/"PMOD" edge label is no longer replaced by another "NMOD"/"PMOD"
#     (the old `!= "NMOD" or != "PMOD"` test was always true).
def set_semantic_labels_for_edges
  modifier_labels = ["NMOD", "PMOD"]

  @vertices.each do |vertex|
    next if vertex.nil? || vertex.parent.nil? # parent is nil for ROOT

    # search for the parent vertex
    wanted = vertex.parent
    parent = @vertices.find do |candidate|
      !candidate.nil? &&
        (candidate.name.casecmp(wanted) == 0 || candidate.name.downcase.include?(wanted.downcase))
    end
    next if parent.nil?

    # label every edge that exists between the vertex and its parent
    @edges.each do |edge|
      next if edge.nil? || edge.in_vertex.nil? || edge.out_vertex.nil?

      from = edge.in_vertex.name
      to = edge.out_vertex.name
      joins_pair = (from == vertex.name && to == parent.name) ||
                   (from == parent.name && to == vertex.name)
      next unless joins_pair

      upgradable = modifier_labels.include?(edge.label) && !modifier_labels.include?(vertex.label)
      edge.label = vertex.label if edge.label.nil? || upgradable
    end
  end
end #end of set_semantic_labels_for_edges method
|
652
|
+
|
653
|
+
end # end of the class GraphGenerator
|
654
|
+
#------------------------------------------#------------------------------------------#------------------------------------------
|
655
|
+
=begin
|
656
|
+
Identifying frequency of edges and pruning out edges that do not meet the threshold conditions
|
657
|
+
=end
|
658
|
+
# Keeps only the edges whose frequency lies between ALPHA_FREQ and BETA_FREQ
# (both inclusive), ordered from the highest frequency to the lowest; edges with
# the same frequency keep their input order.
#
# The earlier implementation never created its per-frequency buckets and never
# initialised its selection counter, so it always returned an empty array and
# overwrote @num_edges with nil. This version implements the behaviour that the
# surrounding comments describe.
#
# NOTE(review): this method is defined after the "end" of GraphGenerator, so
# @num_edges is set on whichever object receives the call - confirm that is intended.
#
# @param edges [Array] edges responding to #frequency; nil entries are ignored
# @param num [Integer] only the first +num+ entries of +edges+ are considered
# @return [Array] the selected edges (empty when none qualifies)
def identify_frequency_and_prune_edges(edges, num)
  considered = num > 0 ? edges.first(num) : []

  in_range = considered.compact.select do |edge|
    edge.frequency >= ALPHA_FREQ && edge.frequency <= BETA_FREQ
  end

  # highest frequency first; the index keeps the ordering stable within a frequency
  selected_edges = in_range.each_with_index
                           .sort_by { |edge, position| [-edge.frequency, position] }
                           .map { |edge, _position| edge }

  # replacing the number of edges with the number of selected edges
  @num_edges = selected_edges.length unless selected_edges.empty?
  selected_edges
end
|
695
|
+
#------------------------------------------#------------------------------------------#------------------------------------------
|