nltkor-1.2.14-cp311-cp311-macosx_13_0_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nltkor/Kor_char.py +193 -0
- nltkor/__init__.py +16 -0
- nltkor/alignment/__init__.py +1315 -0
- nltkor/cider/__init__.py +2 -0
- nltkor/cider/cider.py +55 -0
- nltkor/cider/cider_scorer.py +207 -0
- nltkor/distance/__init__.py +441 -0
- nltkor/distance/wasserstein.py +126 -0
- nltkor/etc.py +22 -0
- nltkor/lazyimport.py +144 -0
- nltkor/make_requirement.py +11 -0
- nltkor/metrics/__init__.py +63 -0
- nltkor/metrics/bartscore.py +301 -0
- nltkor/metrics/bertscore.py +331 -0
- nltkor/metrics/bleu_tensor.py +20 -0
- nltkor/metrics/classical.py +847 -0
- nltkor/metrics/entment.py +24 -0
- nltkor/metrics/eval.py +517 -0
- nltkor/metrics/mauve.py +273 -0
- nltkor/metrics/mauve_utils.py +131 -0
- nltkor/misc/__init__.py +11 -0
- nltkor/misc/string2string_basic_functions.py +59 -0
- nltkor/misc/string2string_default_tokenizer.py +83 -0
- nltkor/misc/string2string_hash_functions.py +159 -0
- nltkor/misc/string2string_word_embeddings.py +503 -0
- nltkor/search/__init__.py +10 -0
- nltkor/search/classical.py +569 -0
- nltkor/search/faiss_search.py +787 -0
- nltkor/search/kobert_tokenizer.py +181 -0
- nltkor/sejong/__init__.py +3 -0
- nltkor/sejong/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/sejong/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/sejong/__pycache__/sejong_download.cpython-38.pyc +0 -0
- nltkor/sejong/__pycache__/sejong_download.cpython-39.pyc +0 -0
- nltkor/sejong/__pycache__/ssem.cpython-38.pyc +0 -0
- nltkor/sejong/__pycache__/ssem.cpython-39.pyc +0 -0
- nltkor/sejong/ch.py +12 -0
- nltkor/sejong/dict_semClassNum.txt +491 -0
- nltkor/sejong/layer.txt +630 -0
- nltkor/sejong/sejong_download.py +87 -0
- nltkor/sejong/ssem.py +684 -0
- nltkor/similarity/__init__.py +3 -0
- nltkor/similarity/bartscore____.py +337 -0
- nltkor/similarity/bertscore____.py +339 -0
- nltkor/similarity/classical.py +245 -0
- nltkor/similarity/cosine_similarity.py +175 -0
- nltkor/tag/__init__.py +71 -0
- nltkor/tag/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/__pycache__/espresso_tag.cpython-38.pyc +0 -0
- nltkor/tag/__pycache__/espresso_tag.cpython-39.pyc +0 -0
- nltkor/tag/espresso_tag.py +220 -0
- nltkor/tag/libs/__init__.py +10 -0
- nltkor/tag/libs/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/attributes.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/attributes.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/config.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/config.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/metadata.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/metadata.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/taggers.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/taggers.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/utils.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/utils.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/word_dictionary.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/word_dictionary.cpython-39.pyc +0 -0
- nltkor/tag/libs/arguments.py +280 -0
- nltkor/tag/libs/attributes.py +231 -0
- nltkor/tag/libs/config.py +159 -0
- nltkor/tag/libs/metadata.py +129 -0
- nltkor/tag/libs/ner/__init__.py +2 -0
- nltkor/tag/libs/ner/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/ner/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/ner/macmorphoreader.py +7 -0
- nltkor/tag/libs/ner/ner_reader.py +92 -0
- nltkor/tag/libs/network.c +72325 -0
- nltkor/tag/libs/network.cpython-311-darwin.so +0 -0
- nltkor/tag/libs/network.pyx +878 -0
- nltkor/tag/libs/networkconv.pyx +1028 -0
- nltkor/tag/libs/networkdependencyconv.pyx +451 -0
- nltkor/tag/libs/parse/__init__.py +1 -0
- nltkor/tag/libs/parse/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/parse/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/parse/parse_reader.py +283 -0
- nltkor/tag/libs/pos/__init__.py +2 -0
- nltkor/tag/libs/pos/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/pos/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/pos/macmorphoreader.py +7 -0
- nltkor/tag/libs/pos/pos_reader.py +97 -0
- nltkor/tag/libs/reader.py +485 -0
- nltkor/tag/libs/srl/__init__.py +3 -0
- nltkor/tag/libs/srl/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/train_srl.cpython-38.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/train_srl.cpython-39.pyc +0 -0
- nltkor/tag/libs/srl/__srl_reader_.py +535 -0
- nltkor/tag/libs/srl/srl_reader.py +436 -0
- nltkor/tag/libs/srl/train_srl.py +87 -0
- nltkor/tag/libs/taggers.py +926 -0
- nltkor/tag/libs/utils.py +384 -0
- nltkor/tag/libs/word_dictionary.py +239 -0
- nltkor/tag/libs/wsd/__init__.py +2 -0
- nltkor/tag/libs/wsd/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/wsd/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/wsd/macmorphoreader.py +7 -0
- nltkor/tag/libs/wsd/wsd_reader.py +93 -0
- nltkor/tokenize/__init__.py +62 -0
- nltkor/tokenize/ko_tokenize.py +115 -0
- nltkor/trans.py +121 -0
- nltkor-1.2.14.dist-info/LICENSE.txt +1093 -0
- nltkor-1.2.14.dist-info/METADATA +41 -0
- nltkor-1.2.14.dist-info/RECORD +127 -0
- nltkor-1.2.14.dist-info/WHEEL +5 -0
- nltkor-1.2.14.dist-info/top_level.txt +1 -0
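The manifest above can be reproduced locally. A wheel is an ordinary zip archive, so a short Python sketch is enough to enumerate its contents (illustrative only, not part of the package; it assumes the wheel has already been downloaded under the filename shown at the top of this diff):

    # List every file inside the downloaded wheel, mirroring the manifest above.
    from zipfile import ZipFile

    with ZipFile("nltkor-1.2.14-cp311-cp311-macosx_13_0_x86_64.whl") as whl:
        for info in whl.infolist():
            print(f"{info.filename}  ({info.file_size} bytes)")

The same listing is also recorded in nltkor-1.2.14.dist-info/RECORD inside the archive.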
nltkor/tag/libs/networkdependencyconv.pyx
@@ -0,0 +1,451 @@
# -*- coding: utf-8 -*-

"""
A convolutional neural network for NLP tagging tasks such as dependency
parsing, where each token has another (or root) as a head.
"""

import numpy as np
cimport numpy as np

cdef class ConvolutionalDependencyNetwork(ConvolutionalNetwork):

    # the weights of all possible dependency among tokens
    cdef readonly np.ndarray dependency_weights

    # validation data
    cdef validation_heads

    def save(self):
        """
        Saves the neural network to a file.
        It will save the weights, biases, sizes, padding and
        distance tables, and other feature tables.
        """
        data = self._generate_save_dict()

        np.savez(self.network_filename, **data)

    @classmethod
    def load_from_file(cls, filename):
        """
        Loads the neural network from a file.
        It will load weights, biases, sizes, padding and
        distance tables, and other feature tables.
        """
        data = np.load(filename, allow_pickle=True)
        nn = cls._load_from_file(data, filename)

        return nn

    def train(self, list sentences, list heads, int epochs,
              int epochs_between_reports=0, float desired_accuracy=0,
              list labels=None):
        """
        Trains the convolutional network. Refer to the basic Network
        train method for detailed explanation.
        """
        # the ConvolutionalNetwork class was written primarily for SRL
        # every token acts as a predicate, and we don't need to tell it explicitely
        predicates = [np.arange(len(sentence)) for sentence in sentences]

        # the last argument in ConvolutionalNetwork.train is actually the argument
        # groups list. We use "labels" here just to signal that there is non-None
        # argument, which is correctly handled by the DependencyNetwork._tag(...) method.

        if self.validation_sentences is None:
            self.set_validation_data(sentences, heads, labels)

        super(ConvolutionalDependencyNetwork, self).train(sentences, predicates,
                                                          heads, epochs,
                                                          epochs_between_reports,
                                                          desired_accuracy,
                                                          labels)

    def set_validation_data(self, list sentences, list heads, list labels=None):
        """
        Sets the data to be used in validation during training. If this function
        is not called before training, the training data itself is used to
        measure the model's performance.

        :param labels: only used when learning labels
        """
        self.validation_sentences = sentences
        self.validation_tags = labels
        self.validation_heads = heads

    def _tag_sentence(self, sentence, predicates=None, heads=None, labels=None):
        """
        This function is just an interface to the _tag_sentence signature
        defined in ConvolutionalNetwork.
        """
        if labels is None:
            self._tag_sentence_unlabeled_dependency(sentence, heads)
        else:
            self._tag_sentence_labeled_dependency(sentence, heads, labels)

    def _tag_sentence_unlabeled_dependency(self, np.ndarray sentence, np.ndarray heads=None):
        """
        Run the network for the unlabeled dependency task.
        A graph with all weights for possible dependencies is built
        and the final answer is obtained applying the Chu-Liu-Edmond's
        algorithm.
        """
        training = heads is not None
        self._pre_tagging_setup(sentence, training)

        num_tokens = len(sentence)
        # dependency_weights [i, j] has the score for token i having j as a head.
        # the main diagonal has the values for dependencies from the root and is
        # later copied to the last column for easier processing
        self.dependency_weights = np.empty((num_tokens, num_tokens + 1))

        cdef np.ndarray[FLOAT_t, ndim=1] token_scores

        # in the SRL parlance, each token is treated as a predicate, because all
        # sentence tokens are scored with respect to it (in order to determine the
        # dependency weights)
        for token in range(num_tokens):

            # _sentence_convolution returns a 2-dim array. in dep parsing,
            # we only have one dimension, so reshape it
            token_scores = self._sentence_convolution(sentence, token,
                                                      training=training).reshape(num_tokens)
            self.dependency_weights[token, :-1] = token_scores

            if training:
                head = heads[token]

                if self._calculate_gradients(head, token_scores):
                    self._backpropagate()
                    self._calculate_input_deltas(sentence, token)
                    self._adjust_weights(token)
                    self._adjust_features(sentence, token)

        # copy dependency weights from the root to each token to the last column and
        # effectively ignore the main diagonal (dependency to the token itself)
        self.dependency_weights[np.arange(num_tokens),
                                -1] = self.dependency_weights.diagonal()
        np.fill_diagonal(self.dependency_weights, -np.Infinity)
        answer = self._find_maximum_spanning_tree()

        return answer

    def _tag_sentence_labeled_dependency(self, np.ndarray sentence, np.ndarray heads,
                                         np.ndarray labels=None):
        """
        Run the network for labeling pre determined dependency edges between tokens.
        This is similar to the classification step in SRL.
        """
        cdef np.ndarray[FLOAT_t, ndim=1] answer
        cdef np.ndarray[FLOAT_t, ndim=2] scores
        training = labels is not None
        self._pre_tagging_setup(sentence, training)

        answer = np.zeros(len(sentence))

        # as in unlabeled dependency, each token is treated as a predicate from the
        # SRL point of view. The only target is its head
        for token in range(len(sentence)):
            head = heads[token]

            # weird format just to take advantage of the SRL classification code
            # it means that the target starts at position *head* and ends at *head*
            head = [[head, head]]

            # it will return a 2-dim array, but we only have one target
            # argmax() works as expected
            scores = self._sentence_convolution(sentence, token, head, training)
            answer[token] = scores.argmax()

            if training:
                label = labels[token]
                if self._calculate_gradients_classify([label], scores):
                    self._backpropagate()
                    self._calculate_input_deltas(sentence, token, head)
                    self._adjust_weights(token, head)
                    self._adjust_features(sentence, token)


        return answer

    def _calculate_gradients(self, gold_head, scores):
        """
        Calculate the gradients to be applied in the backpropagation. Gradients
        are calculated after the network has output the scores for assigning
        each token as head of a given token.

        We aim at maximizing the log probability of the right head:
        log(p(head)) = score(head) - logadd(scores for all heads)

        :param gold_head: the index of the token that should have the highest
            score
        :param scores: the scores output by the network
        :returns: if True, normal gradient calculation was performed.
            If False, the error was too low and weight correction should be
            skipped.
        """
        # first, set the gradient at each token to
        # -exp(score(token)) / sum_j exp(score(token_j))
        # i.e., the negative of its probability
        cdef np.ndarray[FLOAT_t, ndim=1] exp_scores = np.exp(scores)
        exp_sum = np.sum(exp_scores)
        self.net_gradients = -exp_scores / exp_sum
        error = 1 + self.net_gradients[gold_head]
        self.error += error

        # check if the error is too small - if so, not worth to continue
        if error <= 0.01:
            self.skips += 1
            return False

        # and add 1 to the right head
        self.net_gradients[gold_head] += 1

        # the ConvolutionalNetwork class deals with multi dimensional gradients
        # (because of more than one output neuron), so let's reshape
        new_shape = (self.net_gradients.shape[0], 1)
        self.net_gradients = self.net_gradients.reshape(new_shape)

        return True

    def _validate(self):
        """
        Evaluate the network performance by token hit and whole sentence hit.
        """
        hits = 0
        num_tokens = 0
        sentence_hits = 0

        for i in range(len(self.validation_sentences)):
            sent = self.validation_sentences[i]
            heads = self.validation_heads[i]
            sentence_hit = True

            if self.validation_tags is None:
                # unlabeled dependency
                answer = self._tag_sentence_unlabeled_dependency(sent)
                gold_tags = heads
            else:
                # labeled dependency
                gold_tags = self.validation_tags[i]
                answer = self._tag_sentence_labeled_dependency(sent, heads)

            for j in range(len(gold_tags)):
                net_tag = answer[j]
                gold_tag = gold_tags[j]

                if net_tag == gold_tag or (gold_tag == j and net_tag == len(sent)):
                    hits += 1
                else:
                    sentence_hit = False

            if sentence_hit:
                sentence_hits += 1
            num_tokens += len(sent)

        self.accuracy = float(hits) / num_tokens
        self.sentence_accuracy = float(sentence_hits) / len(self.validation_sentences)


    def _average_error(self):
        """
        Average the network error over tokens.
        """
        self.error = self.error / self.num_tokens

    def _print_epoch_report(self, int num):
        """
        Reports the status of the network in the given training
        epoch, including error, token and sentence accuracy.
        """
        logger = logging.getLogger("Logger")
        logger.info("%d epochs Error: %f Token accuracy: %f " \
                    "Sentence accuracy: %f " \
                    "%d corrections skipped " \
                    "Learning rate: %f" % (num,
                                           self.error,
                                           self.accuracy,
                                           self.sentence_accuracy,
                                           self.skips,
                                           self.learning_rate))

    def tag_sentence(self, np.ndarray sentence, np.ndarray heads=None):
        """
        If heads is not given, compute the dependency edges in the sentence.
        If it is given, compute the label of each dependency.

        :returns: a numpy 1-dim array with the head for each token or label
            of each edge. In the first case, a dependency from the root is
            represented as a value equal to the sentence length.
        """
        if heads is None:
            return self._tag_sentence_unlabeled_dependency(sentence)
        else:
            return self._tag_sentence_labeled_dependency(sentence, heads)

    def _find_cycles(self, np.ndarray graph):
        """
        Check if the given graph has cycles and returns the first one
        to be found.

        :param graph: an array where graph[i] has the number of a
            vertex with an incoming connection to i
        """
        # this set stores all vertices with a valid path to the root
        reachable_vertices = set()

        # vertices known to be unreachable from the root, i.e., in a cycle
        vertices_in_cycles = set()

        # vertices currently being evaluated, not known if they're reachable
        visited = set()

        cycles = []

        # the directions of the edges don't matter if we only want to find cycles
        for vertex in range(len(graph)):
            if vertex in reachable_vertices or vertex in vertices_in_cycles:
                continue

            cycle = self._find_cycle_recursive(graph, vertex, visited,
                                               reachable_vertices, vertices_in_cycles)
            if cycle is not None:
                cycles.append(cycle)

        return cycles

    def _find_cycle_recursive(self, np.ndarray graph, int vertex, set visited,
                              set reachable, set unreachable):
        """
        Auxiliary recursive function for searching the graph for cycles.
        It returns the first cycle it can find starting from the given
        vertex, or None.
        """
        next_vertex = graph[vertex]
        root = len(graph)
        visited.add(vertex)

        if next_vertex == root or next_vertex in reachable:
            # vertex linked to root
            reachable.update(visited)
            visited.clear()
            cycle = None

        elif next_vertex in visited:
            # cycle detected! return all vertices in it
            visited.clear()
            cycle = set([vertex])
            while next_vertex != vertex:
                cycle.add(next_vertex)
                next_vertex = graph[next_vertex]

            unreachable.update(cycle)

        elif next_vertex in unreachable:
            # vertex linked to an existing cycle, but not part of it
            # (if it were, it should have been filtered out in _find_cycles)
            visited.clear()
            cycle = None

        else:
            # continue checking
            cycle = self._find_cycle_recursive(graph, next_vertex, visited,
                                               reachable, unreachable)

        return cycle

    def _contract_cycle(self, heads, cycle):
        """
        Contract the given cycle in the dependency graph.

        :param heads: list of the heads of each token, such that
            heads[i] contains the head for the i-th token
        :param cycle: a set containing the numbers of the vertices
            in the cycle
        """
        # each cell i, j has the score for token i having j as its head.

        num_vertices = self.dependency_weights.shape[1]
        outside = np.array([x for x in range(num_vertices) if x not in cycle])
        cycle = np.array(list(cycle))

        cdef np.ndarray[FLOAT_t, ndim=2] outgoing_weights

        # adjustments will be made on incoming and outgoing edges
        # pick the part of the weight matrix that contain them

        # if len(outside) == 1, it means all vertices except for the root are in a cycle
        if len(outside) > 1:
            # weird index array we need in order to properly use fancy indexing
            # -1 because we can't take the root now
            outside_inds = np.array([[i] for i in outside[:-1]])
            outgoing_weights = self.dependency_weights[outside_inds, cycle]

            # the cycle should have only one outgoing edge for each vertex outside it
            # so, search the maximum outgoing edge to each outside vertex
            max_outgoing_inds = outgoing_weights.argmax(1)
            max_outgoing_weights = outgoing_weights.max(1)

            # set every outgoing weight to -inf and then restore the highest ones
            outgoing_weights[:] = -np.Infinity
            outgoing_weights[np.arange(len(outside_inds)),
                             max_outgoing_inds] = max_outgoing_weights
            self.dependency_weights[outside_inds, cycle] = outgoing_weights

        # now, adjust incoming edges. the incoming edge from each vertex v
        # (outside the cycle) to v' (inside) is reweighted as:
        # s(v, v') = s(v, v') - s(head(v'), v') + s(c)
        # and then we pick the highest edge for each outside vertex
        # s(c) = sum_v s(head(v), v)
        cycle_inds = np.array([[i] for i in cycle])
        cdef np.ndarray[FLOAT_t, ndim=2] incoming_weights
        incoming_weights = self.dependency_weights[cycle_inds, outside]

        cycle_score = 0
        for i, vertex in enumerate(cycle):
            head_to_v = self.dependency_weights[vertex, heads[vertex]]
            cycle_score += head_to_v
            incoming_weights[i] -= head_to_v

        max_incoming_inds = incoming_weights.argmax(0)
        max_incoming_weights = incoming_weights.max(0)
        # we leave the + s(c) to the end
        max_incoming_weights += cycle_score

        # the vertex with the maximum weighted incoming edge now changes
        # its head, thus breaking the cycle
        new_head_ind = max_incoming_weights.argmax()
        vertex_leaving_cycle_ind = max_incoming_inds[new_head_ind]

        new_head = outside[new_head_ind]
        vertex_leaving_cycle = cycle[vertex_leaving_cycle_ind]
        old_head = heads[vertex_leaving_cycle]
        heads[vertex_leaving_cycle] = new_head
        self.dependency_weights[vertex_leaving_cycle, old_head] = -np.Infinity

        # analagous to the outgoing weights
        incoming_weights[:] = -np.Infinity
        incoming_weights[max_incoming_inds,
                         np.arange(len(outside))] = max_incoming_weights
        self.dependency_weights[cycle_inds, outside] = incoming_weights

    def _find_maximum_spanning_tree(self):
        """
        Run the Chu-Liu / Edmond's algorithm in order to find the highest
        scoring dependency tree from the dependency graph weights.

        :returns: a 1-dim array with the head of each token in the sentence
        """
        # pick the highest scoring dependency for each word
        heads = self.dependency_weights.argmax(1)

        # check if there are cycles. if there isn't any, we're done
        cycles = self._find_cycles(heads)

        for cycle in cycles:
            # resolve each cycle c
            self._contract_cycle(heads, cycle)

        return heads
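The _calculate_gradients docstring in the hunk above states the objective: maximize log p(head) = score(head) - logadd(scores for all heads). The stored gradient is then one-hot(gold) minus softmax(scores), and the accumulated error is 1 - p(gold), which is also the skip criterion. A minimal NumPy sketch of that computation (illustrative only, not part of the wheel; the function name and the max-subtraction for numerical stability are my own additions):

    import numpy as np

    def head_gradients(scores, gold_head, skip_threshold=0.01):
        """Return (gradients, error, skipped) for one token's head scores."""
        exp_scores = np.exp(scores - scores.max())      # stabilized softmax
        probs = exp_scores / exp_scores.sum()
        error = 1.0 - probs[gold_head]                  # the "error" the network accumulates
        if error <= skip_threshold:
            return None, error, True                    # correction skipped, as in the .pyx code
        gradients = -probs
        gradients[gold_head] += 1.0                     # one-hot(gold) - softmax(scores)
        return gradients.reshape(-1, 1), error, False   # column shape, like net_gradients

    scores = np.array([1.5, 0.2, 3.0, -0.7])            # hypothetical scores for 4 candidate heads
    grads, err, skipped = head_gradients(scores, gold_head=2)
    print(err, skipped)

Likewise, _find_maximum_spanning_tree picks the argmax head per token and then repairs cycles in Chu-Liu/Edmonds fashion, with the convention that graph[i] holds the head of token i and the root is encoded as len(graph). A toy, self-contained cycle check under that convention (assumed names; the package's own implementation uses the recursive _find_cycle_recursive instead):

    def find_cycle(heads):
        """Return the set of vertices on the first cycle found, or None."""
        n = len(heads)
        reachable = set()                   # vertices known to reach the root
        for start in range(n):
            path, seen = [], set()
            v = start
            while v != n and v not in reachable and v not in seen:
                seen.add(v)
                path.append(v)
                v = heads[v]                # follow the chosen head
            if v in seen:                   # walked back onto the current path: a cycle
                return set(path[path.index(v):])
            reachable.update(path)
        return None

    heads = [2, 0, 1, 4]                    # 0 -> 2 -> 1 -> 0 is a cycle; token 3 attaches to the root (4)
    print(find_cycle(heads))                # {0, 1, 2}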
nltkor/tag/libs/parse/__init__.py
@@ -0,0 +1 @@
from .parse_reader import DependencyReader
Binary file
Binary file
Binary file
Binary file