automated_metareview 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +17 -0
- data/.idea/automated_metareview.iml +91 -0
- data/.idea/encodings.xml +5 -0
- data/.idea/misc.xml +5 -0
- data/.idea/modules.xml +9 -0
- data/.idea/scopes/scope_settings.xml +5 -0
- data/.idea/vcs.xml +8 -0
- data/Gemfile +42 -0
- data/LICENSE.txt +22 -0
- data/README.md +29 -0
- data/Rakefile +1 -0
- data/automated_metareview.gemspec +27 -0
- data/lib/automated_metareview.rb +18 -0
- data/lib/automated_metareview/constants.rb +208 -0
- data/lib/automated_metareview/degree_of_relevance.rb +562 -0
- data/lib/automated_metareview/edge.rb +13 -0
- data/lib/automated_metareview/graph_generator.rb +695 -0
- data/lib/automated_metareview/negations.rb +51 -0
- data/lib/automated_metareview/negative-words.csv +4783 -0
- data/lib/automated_metareview/patterns-assess.csv +17 -0
- data/lib/automated_metareview/patterns-prob-detect.csv +22 -0
- data/lib/automated_metareview/patterns-suggest.csv +20 -0
- data/lib/automated_metareview/plagiarism_check.rb +155 -0
- data/lib/automated_metareview/positive-words.csv +2006 -0
- data/lib/automated_metareview/predict_class.rb +121 -0
- data/lib/automated_metareview/sentence_state.rb +293 -0
- data/lib/automated_metareview/text_preprocessing.rb +342 -0
- data/lib/automated_metareview/text_quantity.rb +26 -0
- data/lib/automated_metareview/tone.rb +212 -0
- data/lib/automated_metareview/version.rb +3 -0
- data/lib/automated_metareview/vertex.rb +18 -0
- data/lib/automated_metareview/wordnet_based_similarity.rb +480 -0
- metadata +158 -0
@@ -0,0 +1,13 @@
|
|
1
|
+
# An edge of the text graph: a named, typed connection between two vertices.
#
# Only +name+, +type+, +average_match+, +frequency+ and +edge_match+ are set by
# the constructor; every other attribute (+edgeID+, +index+, +in_vertex+,
# +out_vertex+, +label+) starts out as nil and is filled in by the caller.
class Edge
  attr_accessor :edgeID, :type, :name, :index, :in_vertex, :out_vertex, :edge_match, :average_match, :frequency, :label

  # edge_name - name given to the edge
  # edge_type - type code of the edge (1 - verb, 2 - adjective, 3 - adverb)
  def initialize(edge_name, edge_type)
    @name = edge_name
    @type = edge_type
    @average_match = 0.0 # no match has been recorded yet
    @frequency = 0
    # number of matches for each metric value, all starting at 0;
    # a fresh array per edge so that edges never share counters
    @edge_match = [0, 0, 0, 0, 0]
  end
end
|
@@ -0,0 +1,695 @@
|
|
1
|
+
require 'automated_metareview/sentence_state'
|
2
|
+
require 'automated_metareview/edge'
|
3
|
+
require 'automated_metareview/vertex'
|
4
|
+
|
5
|
+
class GraphGenerator
|
6
|
+
#include SentenceState
|
7
|
+
#creating accessors for the instance variables
|
8
|
+
attr_accessor :vertices, :num_vertices, :edges, :num_edges, :pipeline, :pos_tagger
|
9
|
+
|
10
|
+
# #global variables
|
11
|
+
# $vertices = Array.new
|
12
|
+
# $edges = Array.new
|
13
|
+
|
14
|
+
=begin
|
15
|
+
* Generates the graph (vertices and edges) for the given review or submission text.
|
16
|
+
* INPUT: an array of sentences for a review or a submission. Every row in 'text' contains one sentence.
|
17
|
+
* type - tells you if it was a review or a submission
|
18
|
+
* type = 1 - submission/past review
|
19
|
+
* type = 2 - new review
|
20
|
+
=end
|
21
|
+
# Builds the graph (@vertices / @edges) for a piece of text and returns @num_edges.
#
# text           - array of sentences; every element holds one sentence (String)
# pos_tagger     - part-of-speech tagger; must respond to get_readable(sentence)
#                  and return space separated "word/TAG" tokens
# coreNLPTagger  - dependency-parsing pipeline, stored in @pipeline and used by
#                  find_parents / find_labels
# forRelevance, forPatternIdentify - accepted for interface compatibility; not
#                  read by this method
#
# Tokens are classified by their tag as noun-like, adjective, verb/modal or
# adverb. Consecutive tokens of the same kind are merged into one vertex, and
# edges are added between nouns, verbs and their adjective/adverb properties.
#
# NOTE: while the graph is being built @num_vertices / @num_edges are kept one
# above the number of stored elements by remove_redundant_vertices /
# remove_redundant_edges; both are decremented once at the end of this method.
def generate_graph(text, pos_tagger, coreNLPTagger, forRelevance, forPatternIdentify)
  # initializing common arrays
  @vertices = []
  @num_vertices = 0
  @edges = []
  @num_edges = 0

  @pos_tagger = pos_tagger # part of speech tagger
  @pipeline = coreNLPTagger # dependency parsing

  # iterate through the sentences in the text
  (0...text.length).each do |i|
    sentence = text[i]
    next if sentence.empty? || sentence.split(" ").empty?

    tagged_sentence = @pos_tagger.get_readable(sentence)

    # words seen so far in this sentence, per kind, with the number of used slots
    nouns = []
    nCount = 0
    verbs = []
    vCount = 0
    adjectives = []
    adjCount = 0
    adverbs = []
    advCount = 0

    parents = find_parents(sentence) # parent of every token
    labels = find_labels(sentence)   # label of every token
    cursor = 0                       # position in labels/parents; advances only for tokens that are not skipped

    # find state: the first state applies until a coordinating conjunction is met
    states_array = SentenceState.new.identify_sentence_state(tagged_sentence)
    state = states_array[0]
    states_counter = 1

    prev_type = nil # kind of the last classified token (tokens of no kind leave it untouched)

    # iterate through the tokens
    tagged_sentence.split(" ").each do |tagged_token|
      slash = tagged_token.index("/")
      next if slash.nil? # token carries no POS tag at all: nothing to classify

      plain_token = tagged_token[0...slash].to_s
      pos_tag = tagged_token[(slash + 1)..-1].to_s

      # ignore periods, possessive markers ("'s") and tokens whose tag is empty or a single character
      if plain_token == "." || tagged_token.include?("/POS") ||
         slash == tagged_token.length - 1 || slash == tagged_token.length - 2
        next
      end

      # SETTING STATE
      # since the CC is part of the following sentence segment, the state for
      # that segment is selected when a CC is seen
      if tagged_token.include?("/CC")
        state = states_array[states_counter]
        states_counter += 1
      end

      #------------------------------------------
      # noun-like token
      if tagged_token.include?("NN") || tagged_token.include?("PRP") || tagged_token.include?("IN") ||
         tagged_token.include?("/EX") || tagged_token.include?("WP")
        noun_vertex, nCount = place_token_vertex(nouns, nCount, plain_token, prev_type == NOUN, NOUN,
                                                 i, state, labels[cursor], parents[cursor], pos_tag)

        # an adjective directly before this noun describes this noun, not the noun before it
        if prev_type == ADJ
          # nCount - 2 is the noun before the current one
          discard_property_edge(nouns[nCount - 2], adjectives[adjCount - 1], i) if nCount > 1
          connect_vertices_once("noun-property", search_vertices(@vertices, adjectives[adjCount - 1], i), noun_vertex, i)
        end

        # the latest verb (if any) gets this noun as the other end of a verb edge
        if vCount > 0
          connect_vertices_once("verb", search_vertices(@vertices, verbs[vCount - 1], i), noun_vertex, i)
        end
        prev_type = NOUN

      #------------------------------------------
      # adjective
      elsif tagged_token.include?("/JJ")
        adjective, adjCount = place_token_vertex(adjectives, adjCount, plain_token, prev_type == ADJ, ADJ,
                                                 i, state, labels[cursor], parents[cursor], pos_tag)

        # by default associate the adjective with the latest noun; if a noun
        # follows immediately, the noun branch above moves the property
        if nCount > 0
          connect_vertices_once("noun-property", search_vertices(@vertices, nouns[nCount - 1], i), adjective, i)
        end
        prev_type = ADJ

      #------------------------------------------
      # verb or modal
      elsif tagged_token.include?("/VB") || tagged_token.include?("MD")
        verb_vertex, vCount = place_token_vertex(verbs, vCount, plain_token, prev_type == VERB, VERB,
                                                 i, state, labels[cursor], parents[cursor], pos_tag)

        # an adverb directly before this verb describes this verb, not the verb before it
        if prev_type == ADV
          # vCount - 2 is the verb before the current one
          discard_property_edge(verbs[vCount - 2], adverbs[advCount - 1], i) if vCount > 1
          connect_vertices_once("verb-property", search_vertices(@vertices, adverbs[advCount - 1], i), verb_vertex, i)
        end

        # making the previous noun one of the vertices of the verb edge
        if nCount > 0
          connect_vertices_once("verb", search_vertices(@vertices, nouns[nCount - 1], i), verb_vertex, i)
        end
        prev_type = VERB

      #------------------------------------------
      # adverb
      elsif tagged_token.include?("RB")
        adverb, advCount = place_token_vertex(adverbs, advCount, plain_token, prev_type == ADV, ADV,
                                              i, state, labels[cursor], parents[cursor], pos_tag)

        # by default associate the adverb with the latest verb; if a verb
        # follows immediately, the verb branch above moves the property
        if vCount > 0
          connect_vertices_once("verb-property", search_vertices(@vertices, verbs[vCount - 1], i), adverb, i)
        end
        prev_type = ADV
      end

      # moving on to the label and parent of the next token
      cursor += 1
    end # end of the loop over the tokens
  end # end of the loop over the sentences

  @num_vertices -= 1 # as a counter it was 1 ahead of the array's contents
  @num_edges -= 1    # same reason as for @num_vertices
  set_semantic_labels_for_edges
  return @num_edges
end # end of the generate_graph method

# Stores +token+ in words[count] and returns [vertex, count + 1].
# When +extend_previous+ is true the token is appended to the previous word of
# the same kind (and to its vertex); otherwise a vertex for the token is looked
# up or created. extend_previous is only true after a word of that kind has
# been stored, so count is at least 1 in that case.
def place_token_vertex(words, count, token, extend_previous, type, sentence_index, state, label, parent, pos_tag)
  if extend_previous
    count -= 1 if count >= 1 # going back to the previous word's slot
    previous = search_vertices(@vertices, words[count], sentence_index)
    words[count] = "#{words[count]} #{token}"
    vertex = search_vertices(@vertices, words[count], sentence_index)
    if vertex.nil? # the concatenated name is new: rename the previous vertex
      previous.name = "#{previous.name} #{token}"
      vertex = previous
      # a modifier label must not replace the label of the concatenated vertex
      # (the original test used "or" and was therefore always true)
      vertex.label = label unless label == "NMOD" || label == "PMOD"
    end
  else
    words[count] = token
    vertex = search_vertices(@vertices, token, sentence_index)
    if vertex.nil? # the string doesn't already exist
      vertex = Vertex.new(token, type, sentence_index, state, label, parent, pos_tag)
      @vertices[@num_vertices] = vertex
      @num_vertices += 1
    end
  end
  remove_redundant_vertices(words[count], sentence_index)
  [vertex, count + 1]
end
private :place_token_vertex

# Adds an edge named +edge_name+ from v1 to v2 unless either vertex is nil or
# search_edges already finds an edge between them; afterwards redundant edges
# are removed.
def connect_vertices_once(edge_name, v1, v2, sentence_index)
  return if v1.nil? || v2.nil?
  return unless search_edges(@edges, v1, v2, sentence_index) == -1

  edge = Edge.new(edge_name, VERB)
  edge.in_vertex = v1
  edge.out_vertex = v2
  edge.index = sentence_index
  @edges[@num_edges] = edge
  @num_edges += 1
  # since an edge was just added, check for redundant edges that can be removed
  remove_redundant_edges(v1, v2, sentence_index)
end
private :connect_vertices_once

# Sets to nil the edge between the vertices named +owner_name+ and
# +property_name+ (if both vertices and such an edge exist) and deducts it from
# the edge count.
def discard_property_edge(owner_name, property_name, sentence_index)
  v1 = search_vertices(@vertices, owner_name, sentence_index)
  v2 = search_vertices(@vertices, property_name, sentence_index)
  return if v1.nil? || v2.nil?

  position = search_edges_to_set_null(@edges, v1, v2, sentence_index)
  return if position == -1 # -1 is when no such edge exists

  @edges[position] = nil
  @num_edges -= 1 if @num_edges > 0
end
private :discard_property_edge
|
362
|
+
|
363
|
+
#------------------------------------------#------------------------------------------#------------------------------------------
|
364
|
+
|
365
|
+
# Returns the first vertex in +list+ whose name equals +s+ (ignoring ASCII
# case) and that belongs to sentence +index+, or nil when there is none.
#
# list  - array of vertices; nil entries are skipped
# s     - name to look for; nil never matches
# index - sentence index the vertex has to belong to
def search_vertices(list, s, index)
  return nil if s.nil? # nothing can match a missing name

  list.find do |vertex|
    !vertex.nil? && vertex.name.casecmp(s) == 0 && vertex.index == index
  end
end # end of the search_vertices method
|
378
|
+
|
379
|
+
#------------------------------------------#------------------------------------------#------------------------------------------
|
380
|
+
|
381
|
+
=begin
|
382
|
+
NULLIFY ALL VERTICES CONTAINING "ONLY SUBSTRINGS" (and not exact matches) OF THIS VERTEX IN THE SAME SENTENCE (verts[j].index == index)
|
383
|
+
And reset the @vertices array with non-null elements.
|
384
|
+
=end
|
385
|
+
# Removes every vertex of sentence +index+ whose name is a proper part of +s+
# (contained in it, ignoring case, but not equal to it), together with all
# edges that use such a vertex, and then drops the nil entries from @vertices.
#
# s     - name of the vertex that was just created or extended
# index - sentence index; vertices of other sentences are left alone
#
# Edges are only set to nil here; @edges itself is not compacted.
# Returns the new value of @num_vertices.
def remove_redundant_vertices(s, index)
  (@num_vertices - 1).downto(0) do |position|
    vertex = @vertices[position]
    next if vertex.nil? || vertex.index != index
    # the 'length' condition prevents single characters such as "I" (an
    # individual vertex) from being removed
    next unless s.casecmp(vertex.name) != 0 &&
                s.downcase.include?(vertex.name.downcase) && vertex.name.length > 1

    # set all edges with this vertex as in- or out-vertex to nil
    unless @edges.nil?
      @edges.each_index do |k|
        edge = @edges[k]
        @edges[k] = nil if !edge.nil? && (edge.in_vertex == vertex || edge.out_vertex == vertex)
      end
    end
    # finally setting the vertex itself to nil
    @vertices[position] = nil
  end

  # recreating the vertices array without the nil values
  @vertices = @vertices.compact
  # kept one above the number of vertices on purpose: generate_graph subtracts
  # one again once the whole text has been processed
  @num_vertices = @vertices.length + 1
end
|
425
|
+
|
426
|
+
#------------------------------------------#------------------------------------------#------------------------------------------
|
427
|
+
|
428
|
+
=begin
|
429
|
+
Checks to see if an edge between vertices "in" and "out" exists.
|
430
|
+
Returns the position (in the list) of the last matching edge if such an edge exists, and -1 if it doesn't.
|
431
|
+
edge[] list, vertex in, vertex out, int index
|
432
|
+
=end
|
433
|
+
# Looks for an edge between the vertices +in_vertex+ and +out+, in either
# direction.
#
# list      - array of edges (may be nil); nil edges and edges with a missing
#             end point are skipped
# in_vertex - one end of the edge looked for
# out       - the other end of the edge looked for
# index     - sentence index of the caller
#
# An end point matches when the stored vertex name equals the given name
# (ignoring case) or contains it (case-sensitive).
# Every matching edge that was recorded for a different sentence gets its
# frequency incremented.
#
# Returns the position of the last matching edge in +list+, or -1 if there is none.
def search_edges(list, in_vertex, out, index)
  edge_pos = -1
  return edge_pos if list.nil?

  covers = lambda do |stored, wanted|
    stored.name.casecmp(wanted.name) == 0 || stored.name.include?(wanted.name)
  end

  list.each_with_index do |edge, position|
    next if edge.nil? || edge.in_vertex.nil? || edge.out_vertex.nil?
    next unless (covers.call(edge.in_vertex, in_vertex) && covers.call(edge.out_vertex, out)) ||
                (covers.call(edge.in_vertex, out) && covers.call(edge.out_vertex, in_vertex))

    edge_pos = position # later matches override earlier ones
    # the edge was met again in a different sentence
    edge.frequency += 1 if index != edge.index
  end
  return edge_pos
end # end of search_edges
|
458
|
+
#------------------------------------------#------------------------------------------#------------------------------------------
|
459
|
+
|
460
|
+
# Looks for an edge whose end points have exactly the names of +in_vertex+ and
# +out+ (ignoring case), in either direction, so that the caller can set it to nil.
#
# list      - array of edges; only the first @num_edges positions are
#             inspected, nil edges and edges with a missing end point are skipped
# in_vertex - one end of the edge looked for
# out       - the other end of the edge looked for
# index     - sentence index of the caller
#
# Every matching edge that was recorded for a different sentence gets its
# frequency incremented.
#
# Returns the position of the last matching edge, or -1 if there is none.
def search_edges_to_set_null(list, in_vertex, out, index)
  edge_pos = -1
  same_name = lambda { |first, second| first.name.downcase == second.name.downcase }

  (0...@num_edges).each do |position|
    edge = list[position]
    next if edge.nil? || edge.in_vertex.nil? || edge.out_vertex.nil?
    # checking for an exact match with the edge, in either direction
    next unless (same_name.call(edge.in_vertex, in_vertex) && same_name.call(edge.out_vertex, out)) ||
                (same_name.call(edge.in_vertex, out) && same_name.call(edge.out_vertex, in_vertex))

    edge_pos = position # later matches override earlier ones
    # the edge was met again in a different sentence
    edge.frequency += 1 if index != edge.index
  end
  return edge_pos
end # end of the method search_edges_to_set_null
|
482
|
+
#------------------------------------------#------------------------------------------#------------------------------------------
|
483
|
+
=begin
|
484
|
+
NULLIFY ALL EDGES CONTAINING "ONLY SUBSTRINGS" (and not exact matches) OF EITHER IN/OUT VERTICES IN THE SAME SENTENCE (verts[j].index == index)
|
485
|
+
And reset the @edges array with non-null elements.
|
486
|
+
=end
|
487
|
+
|
488
|
+
# Removes the edges of sentence +index+ that are made redundant by the edge
# in_vertex -> out, and then drops all nil entries from @edges.
#
# An edge is redundant when
# * its in-vertex has the same name as +in_vertex+ (ignoring case) and its
#   out-vertex name is a proper part of +out+'s name, or
# * its in-vertex name is a proper part of +in_vertex+'s name and its
#   out-vertex has the same name as +out+ (ignoring case).
# "Proper part" means: contained in the name (ignoring case) but not equal to it.
#
# Returns the new value of @num_edges.
def remove_redundant_edges(in_vertex, out, index)
  same = lambda { |first, second| first.casecmp(second) == 0 }

  (@num_edges - 1).downto(0) do |position|
    edge = @edges[position]
    next if edge.nil? || edge.index != index

    redundant =
      if same.call(in_vertex.name, edge.in_vertex.name)
        # in-vertices are equal: the out-vertex has to be only a substring
        out.name.casecmp(edge.out_vertex.name) != 0 &&
          out.name.downcase.include?(edge.out_vertex.name.downcase)
      else
        # in-vertex is only a substring: the out-vertices have to be equal
        in_vertex.name.downcase.include?(edge.in_vertex.name.downcase) &&
          same.call(out.name, edge.out_vertex.name)
      end
    @edges[position] = nil if redundant
  end

  # recreating the edges array without the nil values
  @edges = @edges.compact
  # kept one above the number of edges on purpose: generate_graph subtracts
  # one again once the whole text has been processed
  @num_edges = @edges.length + 1
end
|
523
|
+
|
524
|
+
#------------------------------------------#------------------------------------------#------------------------------------------
|
525
|
+
# Prints the graph to standard output: first every vertex (name, frequency,
# state, label and parent), then every edge (names and states of its two
# vertices, frequency and label).
#
# edges    - array of edges; nil edges and edges with a missing end point are skipped
# vertices - array of vertices; nil entries are skipped
def print_graph(edges, vertices)
  puts("*** List of vertices::")
  vertices.each do |vertex|
    next if vertex.nil?

    puts("@@@ Vertex:: #{vertex.name}")
    puts("*** Frequency:: #{vertex.frequency} State:: #{vertex.state}")
    puts("*** Label:: #{vertex.label} Parent:: #{vertex.parent}")
  end
  puts("*******")
  puts("*** List of edges::")
  edges.each do |edge|
    next if edge.nil? || edge.in_vertex.nil? || edge.out_vertex.nil?

    puts("@@@ Edge:: #{edge.in_vertex.name} & #{edge.out_vertex.name}")
    puts("*** Frequency:: #{edge.frequency} State:: #{edge.in_vertex.state} & #{edge.out_vertex.state}")
    puts("*** Label:: #{edge.label}")
  end
  puts("--------------")
end # end of print_graph method
|
545
|
+
|
546
|
+
#------------------------------------------#------------------------------------------#------------------------------------------
|
547
|
+
#Identifying parents and labels for the vertices
|
548
|
+
# Identifies the parent of every token of the sentence +t+ in the dependency
# parse produced by @pipeline.
#
# t - the sentence (String)
#
# Returns an array indexed like t.split(" "): the name of the token's parent,
# or nil when no parent was found. Positions of tokens for which
# TextPreprocessing#is_punct is true are never assigned.
#
# NOTE(review): the token is handed to getAllNodesByWordPattern as it is; if
# that method interprets it as a regular expression, tokens containing regex
# metacharacters may need escaping - confirm against the CoreNLP API.
def find_parents(t)
  tp = TextPreprocessing.new
  tokens = t.split(" ")
  parents = []
  t = StanfordCoreNLP::Text.new(t) # the same variable has to be passed into the Text.new method
  @pipeline.annotate(t)
  # only the parsed form of the first sentence is used
  sentence = t.get(:sentences).toArray
  parsed_sentence = sentence[0].get(:collapsed_c_c_processed_dependencies)

  # iterating through the set of tokens and identifying each token's parent
  tokens.each_index do |j|
    next if tp.is_punct(tokens[j])

    tokens[j] = tp.contains_punct(tokens[j]) if tp.contains_punct(tokens[j])

    # start without a parent for every token, so that a token whose lookup is
    # skipped does not inherit the parent found for the previous token
    parent = nil
    if !tokens[j].nil? && !tp.contains_punct_bool(tokens[j])
      nodes = parsed_sentence.getAllNodesByWordPattern(tokens[j]).toArray
      # no node for this token in the parse: leave the parent unset
      parent = parsed_sentence.getParents(nodes[0]).toArray unless nodes[0].nil?
    end

    if !parent.nil? && !parent[0].nil?
      # extracting the name of the parent: the text before the first "-"
      head = parent[0].to_s
      parents[j] = head[0..head.index("-") - 1]
    else
      parents[j] = nil
    end
  end
  return parents
end # end of find_parents method
|
586
|
+
#------------------------------------------#------------------------------------------#------------------------------------------
|
587
|
+
#Identifying the dependency-relation label of each token in the text
|
588
|
+
# Looks up the dependency-relation label of every whitespace-separated token
# of +t+, using the parse of the first sentence produced by @pipeline.
#
# t - the text (String) to analyse.
#
# Returns an Array indexed like t.split(" "). An entry holds the short name of
# the first typed dependency whose dependent word equals the token (after
# periods and commas are removed). Tokens without a match get no entry, so the
# array may contain nil gaps or be shorter than the token list.
def find_labels(t)
  words = t.split(" ")

  annotated = StanfordCoreNLP::Text.new(t)
  @pipeline.annotate(annotated)
  # Only the first sentence of the annotated text is inspected.
  first_sentence = annotated.get(:sentences).toArray[0]
  parse = first_sentence.get(:collapsed_c_c_processed_dependencies)
  dependencies = parse.typedDependencies.toArray

  labels = []
  words.each_with_index do |word, position|
    # Strip periods and commas so the token can match the parser's word form.
    bare = word.delete(".,")
    hit = dependencies.find { |dependency| dependency.dep.value() == bare }
    labels[position] = hit.reln.getShortName() unless hit.nil?
  end
  labels
end # end of find_labels method
|
617
|
+
#------------------------------------------#------------------------------------------#------------------------------------------
|
618
|
+
=begin
|
619
|
+
* Setting semantic labels for edges based on the labels vertices have with their parents
|
620
|
+
=end
|
621
|
+
# Copies each vertex's label onto the edge that joins the vertex to its parent.
#
# For every vertex in @vertices that names a parent, the parent vertex is the
# first vertex whose name equals the parent name ignoring case, or contains it
# as a case-insensitive substring. Every edge in @edges that connects the
# vertex and that parent (in either direction) then receives the vertex's
# label when the edge has no label yet, or when the edge carries only a
# generic "NMOD"/"PMOD" label and the vertex label is not itself one of those.
#
# Reads @vertices and @edges; mutates the +label+ of matching edges.
# Returns nil; the method is used for its side effect only.
def set_semantic_labels_for_edges
  generic_labels = ["NMOD", "PMOD"]

  @vertices.each do |vertex|
    # parent is nil for the ROOT vertex
    next if vertex.nil? || vertex.parent.nil?

    # Looked up afresh for every vertex, so a vertex whose parent cannot be
    # found never reuses the parent located for an earlier vertex.
    wanted = vertex.parent.downcase
    parent = @vertices.find do |candidate|
      !candidate.nil? &&
        (candidate.name.casecmp(vertex.parent) == 0 || candidate.name.downcase.include?(wanted))
    end
    next if parent.nil?

    @edges.each do |edge|
      next if edge.nil? || edge.in_vertex.nil? || edge.out_vertex.nil?

      in_name = edge.in_vertex.name
      out_name = edge.out_vertex.name
      # Compare names by value (==); equal? would only match when both names
      # are the very same String object.
      joins_pair = (in_name == vertex.name && out_name == parent.name) ||
                   (in_name == parent.name && out_name == vertex.name)
      next unless joins_pair

      # A generic label may only be replaced by a non-generic one.
      replaceable = generic_labels.include?(edge.label) && !generic_labels.include?(vertex.label)
      edge.label = vertex.label if edge.label.nil? || replaceable
    end
  end

  nil
end #end of set_semantic_labels_for_edges method
|
652
|
+
|
653
|
+
end # end of the class GraphGenerator
|
654
|
+
#------------------------------------------#------------------------------------------#------------------------------------------
|
655
|
+
=begin
|
656
|
+
Identifying frequency of edges and pruning out edges that do not meet the threshold conditions
|
657
|
+
=end
|
658
|
+
# Keeps only the edges whose frequency lies between ALPHA_FREQ and BETA_FREQ
# (both inclusive) and orders them from the highest frequency to the lowest.
#
# edges - Array of edge objects responding to +frequency+; nil entries are
#         ignored.
# num   - how many leading entries of +edges+ to examine.
#
# Returns a new Array with the selected edges. Edges sharing a frequency keep
# their relative order from +edges+. When at least one edge is selected,
# @num_edges is set to the number of selected edges; otherwise it is left
# untouched.
#
# NOTE(review): this definition sits after the end of class GraphGenerator, so
# @num_edges is written on whatever object is self at the call site -- confirm
# that is the intended owner.
def identify_frequency_and_prune_edges(edges, num)
  # A non-positive num examines nothing.
  examined = edges.first([num, 0].max).compact

  in_range = examined.select do |edge|
    edge.frequency >= ALPHA_FREQ && edge.frequency <= BETA_FREQ
  end

  # Highest frequency first; the original position breaks ties so the result
  # is stable.
  ranked = in_range.each_with_index.sort_by { |edge, position| [-edge.frequency, position] }
  selected_edges = ranked.map(&:first)

  # replacing the edge count with the number of selected edges
  @num_edges = selected_edges.length unless selected_edges.empty?
  selected_edges
end
|
695
|
+
#------------------------------------------#------------------------------------------#------------------------------------------
|