nltkor-1.2.14-cp311-cp311-macosx_13_0_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127)
  1. nltkor/Kor_char.py +193 -0
  2. nltkor/__init__.py +16 -0
  3. nltkor/alignment/__init__.py +1315 -0
  4. nltkor/cider/__init__.py +2 -0
  5. nltkor/cider/cider.py +55 -0
  6. nltkor/cider/cider_scorer.py +207 -0
  7. nltkor/distance/__init__.py +441 -0
  8. nltkor/distance/wasserstein.py +126 -0
  9. nltkor/etc.py +22 -0
  10. nltkor/lazyimport.py +144 -0
  11. nltkor/make_requirement.py +11 -0
  12. nltkor/metrics/__init__.py +63 -0
  13. nltkor/metrics/bartscore.py +301 -0
  14. nltkor/metrics/bertscore.py +331 -0
  15. nltkor/metrics/bleu_tensor.py +20 -0
  16. nltkor/metrics/classical.py +847 -0
  17. nltkor/metrics/entment.py +24 -0
  18. nltkor/metrics/eval.py +517 -0
  19. nltkor/metrics/mauve.py +273 -0
  20. nltkor/metrics/mauve_utils.py +131 -0
  21. nltkor/misc/__init__.py +11 -0
  22. nltkor/misc/string2string_basic_functions.py +59 -0
  23. nltkor/misc/string2string_default_tokenizer.py +83 -0
  24. nltkor/misc/string2string_hash_functions.py +159 -0
  25. nltkor/misc/string2string_word_embeddings.py +503 -0
  26. nltkor/search/__init__.py +10 -0
  27. nltkor/search/classical.py +569 -0
  28. nltkor/search/faiss_search.py +787 -0
  29. nltkor/search/kobert_tokenizer.py +181 -0
  30. nltkor/sejong/__init__.py +3 -0
  31. nltkor/sejong/__pycache__/__init__.cpython-38.pyc +0 -0
  32. nltkor/sejong/__pycache__/__init__.cpython-39.pyc +0 -0
  33. nltkor/sejong/__pycache__/sejong_download.cpython-38.pyc +0 -0
  34. nltkor/sejong/__pycache__/sejong_download.cpython-39.pyc +0 -0
  35. nltkor/sejong/__pycache__/ssem.cpython-38.pyc +0 -0
  36. nltkor/sejong/__pycache__/ssem.cpython-39.pyc +0 -0
  37. nltkor/sejong/ch.py +12 -0
  38. nltkor/sejong/dict_semClassNum.txt +491 -0
  39. nltkor/sejong/layer.txt +630 -0
  40. nltkor/sejong/sejong_download.py +87 -0
  41. nltkor/sejong/ssem.py +684 -0
  42. nltkor/similarity/__init__.py +3 -0
  43. nltkor/similarity/bartscore____.py +337 -0
  44. nltkor/similarity/bertscore____.py +339 -0
  45. nltkor/similarity/classical.py +245 -0
  46. nltkor/similarity/cosine_similarity.py +175 -0
  47. nltkor/tag/__init__.py +71 -0
  48. nltkor/tag/__pycache__/__init__.cpython-38.pyc +0 -0
  49. nltkor/tag/__pycache__/__init__.cpython-39.pyc +0 -0
  50. nltkor/tag/__pycache__/espresso_tag.cpython-38.pyc +0 -0
  51. nltkor/tag/__pycache__/espresso_tag.cpython-39.pyc +0 -0
  52. nltkor/tag/espresso_tag.py +220 -0
  53. nltkor/tag/libs/__init__.py +10 -0
  54. nltkor/tag/libs/__pycache__/__init__.cpython-38.pyc +0 -0
  55. nltkor/tag/libs/__pycache__/__init__.cpython-39.pyc +0 -0
  56. nltkor/tag/libs/__pycache__/attributes.cpython-38.pyc +0 -0
  57. nltkor/tag/libs/__pycache__/attributes.cpython-39.pyc +0 -0
  58. nltkor/tag/libs/__pycache__/config.cpython-38.pyc +0 -0
  59. nltkor/tag/libs/__pycache__/config.cpython-39.pyc +0 -0
  60. nltkor/tag/libs/__pycache__/metadata.cpython-38.pyc +0 -0
  61. nltkor/tag/libs/__pycache__/metadata.cpython-39.pyc +0 -0
  62. nltkor/tag/libs/__pycache__/reader.cpython-38.pyc +0 -0
  63. nltkor/tag/libs/__pycache__/reader.cpython-39.pyc +0 -0
  64. nltkor/tag/libs/__pycache__/taggers.cpython-38.pyc +0 -0
  65. nltkor/tag/libs/__pycache__/taggers.cpython-39.pyc +0 -0
  66. nltkor/tag/libs/__pycache__/utils.cpython-38.pyc +0 -0
  67. nltkor/tag/libs/__pycache__/utils.cpython-39.pyc +0 -0
  68. nltkor/tag/libs/__pycache__/word_dictionary.cpython-38.pyc +0 -0
  69. nltkor/tag/libs/__pycache__/word_dictionary.cpython-39.pyc +0 -0
  70. nltkor/tag/libs/arguments.py +280 -0
  71. nltkor/tag/libs/attributes.py +231 -0
  72. nltkor/tag/libs/config.py +159 -0
  73. nltkor/tag/libs/metadata.py +129 -0
  74. nltkor/tag/libs/ner/__init__.py +2 -0
  75. nltkor/tag/libs/ner/__pycache__/__init__.cpython-38.pyc +0 -0
  76. nltkor/tag/libs/ner/__pycache__/__init__.cpython-39.pyc +0 -0
  77. nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-38.pyc +0 -0
  78. nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-39.pyc +0 -0
  79. nltkor/tag/libs/ner/macmorphoreader.py +7 -0
  80. nltkor/tag/libs/ner/ner_reader.py +92 -0
  81. nltkor/tag/libs/network.c +72325 -0
  82. nltkor/tag/libs/network.cpython-311-darwin.so +0 -0
  83. nltkor/tag/libs/network.pyx +878 -0
  84. nltkor/tag/libs/networkconv.pyx +1028 -0
  85. nltkor/tag/libs/networkdependencyconv.pyx +451 -0
  86. nltkor/tag/libs/parse/__init__.py +1 -0
  87. nltkor/tag/libs/parse/__pycache__/__init__.cpython-38.pyc +0 -0
  88. nltkor/tag/libs/parse/__pycache__/__init__.cpython-39.pyc +0 -0
  89. nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-38.pyc +0 -0
  90. nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-39.pyc +0 -0
  91. nltkor/tag/libs/parse/parse_reader.py +283 -0
  92. nltkor/tag/libs/pos/__init__.py +2 -0
  93. nltkor/tag/libs/pos/__pycache__/__init__.cpython-38.pyc +0 -0
  94. nltkor/tag/libs/pos/__pycache__/__init__.cpython-39.pyc +0 -0
  95. nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-38.pyc +0 -0
  96. nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-39.pyc +0 -0
  97. nltkor/tag/libs/pos/macmorphoreader.py +7 -0
  98. nltkor/tag/libs/pos/pos_reader.py +97 -0
  99. nltkor/tag/libs/reader.py +485 -0
  100. nltkor/tag/libs/srl/__init__.py +3 -0
  101. nltkor/tag/libs/srl/__pycache__/__init__.cpython-38.pyc +0 -0
  102. nltkor/tag/libs/srl/__pycache__/__init__.cpython-39.pyc +0 -0
  103. nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-38.pyc +0 -0
  104. nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-39.pyc +0 -0
  105. nltkor/tag/libs/srl/__pycache__/train_srl.cpython-38.pyc +0 -0
  106. nltkor/tag/libs/srl/__pycache__/train_srl.cpython-39.pyc +0 -0
  107. nltkor/tag/libs/srl/__srl_reader_.py +535 -0
  108. nltkor/tag/libs/srl/srl_reader.py +436 -0
  109. nltkor/tag/libs/srl/train_srl.py +87 -0
  110. nltkor/tag/libs/taggers.py +926 -0
  111. nltkor/tag/libs/utils.py +384 -0
  112. nltkor/tag/libs/word_dictionary.py +239 -0
  113. nltkor/tag/libs/wsd/__init__.py +2 -0
  114. nltkor/tag/libs/wsd/__pycache__/__init__.cpython-38.pyc +0 -0
  115. nltkor/tag/libs/wsd/__pycache__/__init__.cpython-39.pyc +0 -0
  116. nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-38.pyc +0 -0
  117. nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-39.pyc +0 -0
  118. nltkor/tag/libs/wsd/macmorphoreader.py +7 -0
  119. nltkor/tag/libs/wsd/wsd_reader.py +93 -0
  120. nltkor/tokenize/__init__.py +62 -0
  121. nltkor/tokenize/ko_tokenize.py +115 -0
  122. nltkor/trans.py +121 -0
  123. nltkor-1.2.14.dist-info/LICENSE.txt +1093 -0
  124. nltkor-1.2.14.dist-info/METADATA +41 -0
  125. nltkor-1.2.14.dist-info/RECORD +127 -0
  126. nltkor-1.2.14.dist-info/WHEEL +5 -0
  127. nltkor-1.2.14.dist-info/top_level.txt +1 -0
nltkor/tag/libs/networkdependencyconv.pyx
@@ -0,0 +1,451 @@
+# -*- coding: utf-8 -*-
+
+"""
+A convolutional neural network for NLP tagging tasks such as dependency
+parsing, where each token has another (or the root) as a head.
+"""
+
+import numpy as np
+cimport numpy as np
+
+cdef class ConvolutionalDependencyNetwork(ConvolutionalNetwork):
+
+    # the weights of all possible dependencies among tokens
+    cdef readonly np.ndarray dependency_weights
+
+    # validation data
+    cdef validation_heads
+
+    def save(self):
+        """
+        Saves the neural network to a file.
+        It will save the weights, biases, sizes, padding and
+        distance tables, and other feature tables.
+        """
+        data = self._generate_save_dict()
+
+        np.savez(self.network_filename, **data)
+
+    @classmethod
+    def load_from_file(cls, filename):
+        """
+        Loads the neural network from a file.
+        It will load weights, biases, sizes, padding and
+        distance tables, and other feature tables.
+        """
+        data = np.load(filename, allow_pickle=True)
+        nn = cls._load_from_file(data, filename)
+
+        return nn
+
+    def train(self, list sentences, list heads, int epochs,
+              int epochs_between_reports=0, float desired_accuracy=0,
+              list labels=None):
+        """
+        Trains the convolutional network. Refer to the basic Network
+        train method for a detailed explanation.
+        """
+        # the ConvolutionalNetwork class was written primarily for SRL:
+        # every token acts as a predicate, and we don't need to tell it explicitly
+        predicates = [np.arange(len(sentence)) for sentence in sentences]
+
+        # the last argument in ConvolutionalNetwork.train is actually the argument
+        # groups list. We use "labels" here just to signal that there is a non-None
+        # argument, which is correctly handled by the DependencyNetwork._tag(...) method.
+
+        if self.validation_sentences is None:
+            self.set_validation_data(sentences, heads, labels)
+
+        super(ConvolutionalDependencyNetwork, self).train(sentences, predicates,
+                                                          heads, epochs,
+                                                          epochs_between_reports,
+                                                          desired_accuracy,
+                                                          labels)
+
+    def set_validation_data(self, list sentences, list heads, list labels=None):
+        """
+        Sets the data to be used in validation during training. If this function
+        is not called before training, the training data itself is used to
+        measure the model's performance.
+
+        :param labels: only used when learning labels
+        """
+        self.validation_sentences = sentences
+        self.validation_tags = labels
+        self.validation_heads = heads
+
+    def _tag_sentence(self, sentence, predicates=None, heads=None, labels=None):
+        """
+        This function is just an interface to the _tag_sentence signature
+        defined in ConvolutionalNetwork.
+        """
+        if labels is None:
+            self._tag_sentence_unlabeled_dependency(sentence, heads)
+        else:
+            self._tag_sentence_labeled_dependency(sentence, heads, labels)
+
+    def _tag_sentence_unlabeled_dependency(self, np.ndarray sentence, np.ndarray heads=None):
+        """
+        Run the network for the unlabeled dependency task.
+        A graph with the weights of all possible dependencies is built,
+        and the final answer is obtained by applying the Chu-Liu/Edmonds
+        algorithm.
+        """
+        training = heads is not None
+        self._pre_tagging_setup(sentence, training)
+
+        num_tokens = len(sentence)
+        # dependency_weights[i, j] has the score for token i having j as a head.
+        # the main diagonal has the values for dependencies from the root and is
+        # later copied to the last column for easier processing
+        self.dependency_weights = np.empty((num_tokens, num_tokens + 1))
+
+        cdef np.ndarray[FLOAT_t, ndim=1] token_scores
+
+        # in the SRL parlance, each token is treated as a predicate, because all
+        # sentence tokens are scored with respect to it (in order to determine the
+        # dependency weights)
+        for token in range(num_tokens):
+
+            # _sentence_convolution returns a 2-dim array. in dependency parsing,
+            # we only have one dimension, so reshape it
+            token_scores = self._sentence_convolution(sentence, token,
+                                                      training=training).reshape(num_tokens)
+            self.dependency_weights[token, :-1] = token_scores
+
+            if training:
+                head = heads[token]
+
+                if self._calculate_gradients(head, token_scores):
+                    self._backpropagate()
+                    self._calculate_input_deltas(sentence, token)
+                    self._adjust_weights(token)
+                    self._adjust_features(sentence, token)
+
+        # copy the dependency weights from the root to each token to the last column and
+        # effectively ignore the main diagonal (dependency of a token on itself)
+        self.dependency_weights[np.arange(num_tokens),
+                                -1] = self.dependency_weights.diagonal()
+        np.fill_diagonal(self.dependency_weights, -np.Infinity)
+        answer = self._find_maximum_spanning_tree()
+
+        return answer
+
+    def _tag_sentence_labeled_dependency(self, np.ndarray sentence, np.ndarray heads,
+                                         np.ndarray labels=None):
+        """
+        Run the network for labeling predetermined dependency edges between tokens.
+        This is similar to the classification step in SRL.
+        """
+        cdef np.ndarray[FLOAT_t, ndim=1] answer
+        cdef np.ndarray[FLOAT_t, ndim=2] scores
+        training = labels is not None
+        self._pre_tagging_setup(sentence, training)
+
+        answer = np.zeros(len(sentence))
+
+        # as in unlabeled dependency, each token is treated as a predicate from the
+        # SRL point of view. The only target is its head
+        for token in range(len(sentence)):
+            head = heads[token]
+
+            # weird format just to take advantage of the SRL classification code:
+            # it means that the target starts at position *head* and ends at *head*
+            head = [[head, head]]
+
+            # it will return a 2-dim array, but we only have one target, so
+            # argmax() works as expected
+            scores = self._sentence_convolution(sentence, token, head, training)
+            answer[token] = scores.argmax()
+
+            if training:
+                label = labels[token]
+                if self._calculate_gradients_classify([label], scores):
+                    self._backpropagate()
+                    self._calculate_input_deltas(sentence, token, head)
+                    self._adjust_weights(token, head)
+                    self._adjust_features(sentence, token)
+
+        return answer
+
+    def _calculate_gradients(self, gold_head, scores):
+        """
+        Calculate the gradients to be applied in the backpropagation. Gradients
+        are calculated after the network has output the scores for assigning
+        each token as head of a given token.
+
+        We aim at maximizing the log probability of the right head:
+        log(p(head)) = score(head) - logadd(scores for all heads)
+
+        :param gold_head: the index of the token that should have the highest
+            score
+        :param scores: the scores output by the network
+        :returns: if True, normal gradient calculation was performed.
+            If False, the error was too low and weight correction should be
+            skipped.
+        """
+        # first, set the gradient at each token to
+        # -exp(score(token)) / sum_j exp(score(token_j))
+        # i.e., the negative of its probability
+        cdef np.ndarray[FLOAT_t, ndim=1] exp_scores = np.exp(scores)
+        exp_sum = np.sum(exp_scores)
+        self.net_gradients = -exp_scores / exp_sum
+        error = 1 + self.net_gradients[gold_head]
+        self.error += error
+
+        # check if the error is too small - if so, it is not worth continuing
+        if error <= 0.01:
+            self.skips += 1
+            return False
+
+        # and add 1 to the right head
+        self.net_gradients[gold_head] += 1
+
+        # the ConvolutionalNetwork class deals with multi-dimensional gradients
+        # (because of more than one output neuron), so let's reshape
+        new_shape = (self.net_gradients.shape[0], 1)
+        self.net_gradients = self.net_gradients.reshape(new_shape)
+
+        return True
+
+    def _validate(self):
+        """
+        Evaluate the network performance by token hit and whole sentence hit.
+        """
+        hits = 0
+        num_tokens = 0
+        sentence_hits = 0
+
+        for i in range(len(self.validation_sentences)):
+            sent = self.validation_sentences[i]
+            heads = self.validation_heads[i]
+            sentence_hit = True
+
+            if self.validation_tags is None:
+                # unlabeled dependency
+                answer = self._tag_sentence_unlabeled_dependency(sent)
+                gold_tags = heads
+            else:
+                # labeled dependency
+                gold_tags = self.validation_tags[i]
+                answer = self._tag_sentence_labeled_dependency(sent, heads)
+
+            for j in range(len(gold_tags)):
+                net_tag = answer[j]
+                gold_tag = gold_tags[j]
+
+                if net_tag == gold_tag or (gold_tag == j and net_tag == len(sent)):
+                    hits += 1
+                else:
+                    sentence_hit = False
+
+            if sentence_hit:
+                sentence_hits += 1
+            num_tokens += len(sent)
+
+        self.accuracy = float(hits) / num_tokens
+        self.sentence_accuracy = float(sentence_hits) / len(self.validation_sentences)
+
+    def _average_error(self):
+        """
+        Average the network error over tokens.
+        """
+        self.error = self.error / self.num_tokens
+
+    def _print_epoch_report(self, int num):
+        """
+        Reports the status of the network in the given training
+        epoch, including error, token and sentence accuracy.
+        """
+        logger = logging.getLogger("Logger")
+        logger.info("%d epochs   Error: %f   Token accuracy: %f   " \
+                    "Sentence accuracy: %f   " \
+                    "%d corrections skipped   " \
+                    "Learning rate: %f" % (num,
+                                           self.error,
+                                           self.accuracy,
+                                           self.sentence_accuracy,
+                                           self.skips,
+                                           self.learning_rate))
+
+    def tag_sentence(self, np.ndarray sentence, np.ndarray heads=None):
+        """
+        If heads is not given, compute the dependency edges in the sentence.
+        If it is given, compute the label of each dependency.
+
+        :returns: a numpy 1-dim array with the head of each token or the label
+            of each edge. In the first case, a dependency from the root is
+            represented as a value equal to the sentence length.
+        """
+        if heads is None:
+            return self._tag_sentence_unlabeled_dependency(sentence)
+        else:
+            return self._tag_sentence_labeled_dependency(sentence, heads)
+
+    def _find_cycles(self, np.ndarray graph):
+        """
+        Check if the given graph has cycles and return the ones found.
+
+        :param graph: an array where graph[i] has the number of a
+            vertex with an incoming connection to i
+        """
+        # this set stores all vertices with a valid path to the root
+        reachable_vertices = set()
+
+        # vertices known to be unreachable from the root, i.e., in a cycle
+        vertices_in_cycles = set()
+
+        # vertices currently being evaluated, not known if they're reachable
+        visited = set()
+
+        cycles = []
+
+        # the directions of the edges don't matter if we only want to find cycles
+        for vertex in range(len(graph)):
+            if vertex in reachable_vertices or vertex in vertices_in_cycles:
+                continue
+
+            cycle = self._find_cycle_recursive(graph, vertex, visited,
+                                               reachable_vertices, vertices_in_cycles)
+            if cycle is not None:
+                cycles.append(cycle)
+
+        return cycles
+
+    def _find_cycle_recursive(self, np.ndarray graph, int vertex, set visited,
+                              set reachable, set unreachable):
+        """
+        Auxiliary recursive function for searching the graph for cycles.
+        It returns the first cycle it can find starting from the given
+        vertex, or None.
+        """
+        next_vertex = graph[vertex]
+        root = len(graph)
+        visited.add(vertex)
+
+        if next_vertex == root or next_vertex in reachable:
+            # vertex linked to the root
+            reachable.update(visited)
+            visited.clear()
+            cycle = None
+
+        elif next_vertex in visited:
+            # cycle detected! return all vertices in it
+            visited.clear()
+            cycle = set([vertex])
+            while next_vertex != vertex:
+                cycle.add(next_vertex)
+                next_vertex = graph[next_vertex]
+
+            unreachable.update(cycle)
+
+        elif next_vertex in unreachable:
+            # vertex linked to an existing cycle, but not part of it
+            # (if it were, it would have been filtered out in _find_cycles)
+            visited.clear()
+            cycle = None
+
+        else:
+            # continue checking
+            cycle = self._find_cycle_recursive(graph, next_vertex, visited,
+                                               reachable, unreachable)
+
+        return cycle
+
+    def _contract_cycle(self, heads, cycle):
+        """
+        Contract the given cycle in the dependency graph.
+
+        :param heads: list of the heads of each token, such that
+            heads[i] contains the head for the i-th token
+        :param cycle: a set containing the numbers of the vertices
+            in the cycle
+        """
+        # each cell i, j has the score for token i having j as its head.
+
+        num_vertices = self.dependency_weights.shape[1]
+        outside = np.array([x for x in range(num_vertices) if x not in cycle])
+        cycle = np.array(list(cycle))
+
+        cdef np.ndarray[FLOAT_t, ndim=2] outgoing_weights
+
+        # adjustments will be made on incoming and outgoing edges;
+        # pick the part of the weight matrix that contains them
+
+        # if len(outside) == 1, it means all vertices except for the root are in a cycle
+        if len(outside) > 1:
+            # weird index array we need in order to properly use fancy indexing
+            # (-1 because we can't take the root now)
+            outside_inds = np.array([[i] for i in outside[:-1]])
+            outgoing_weights = self.dependency_weights[outside_inds, cycle]
+
+            # the cycle should have only one outgoing edge for each vertex outside it,
+            # so search for the maximum outgoing edge to each outside vertex
+            max_outgoing_inds = outgoing_weights.argmax(1)
+            max_outgoing_weights = outgoing_weights.max(1)
+
+            # set every outgoing weight to -inf and then restore the highest ones
+            outgoing_weights[:] = -np.Infinity
+            outgoing_weights[np.arange(len(outside_inds)),
+                             max_outgoing_inds] = max_outgoing_weights
+            self.dependency_weights[outside_inds, cycle] = outgoing_weights
+
+        # now, adjust incoming edges. the incoming edge from each vertex v
+        # (outside the cycle) to v' (inside) is reweighted as:
+        # s(v, v') = s(v, v') - s(head(v'), v') + s(c)
+        # and then we pick the highest edge for each outside vertex
+        # s(c) = sum_v s(head(v), v)
+        cycle_inds = np.array([[i] for i in cycle])
+        cdef np.ndarray[FLOAT_t, ndim=2] incoming_weights
+        incoming_weights = self.dependency_weights[cycle_inds, outside]
+
+        cycle_score = 0
+        for i, vertex in enumerate(cycle):
+            head_to_v = self.dependency_weights[vertex, heads[vertex]]
+            cycle_score += head_to_v
+            incoming_weights[i] -= head_to_v
+
+        max_incoming_inds = incoming_weights.argmax(0)
+        max_incoming_weights = incoming_weights.max(0)
+        # we leave the + s(c) to the end
+        max_incoming_weights += cycle_score
+
+        # the vertex with the maximum weighted incoming edge now changes
+        # its head, thus breaking the cycle
+        new_head_ind = max_incoming_weights.argmax()
+        vertex_leaving_cycle_ind = max_incoming_inds[new_head_ind]
+
+        new_head = outside[new_head_ind]
+        vertex_leaving_cycle = cycle[vertex_leaving_cycle_ind]
+        old_head = heads[vertex_leaving_cycle]
+        heads[vertex_leaving_cycle] = new_head
+        self.dependency_weights[vertex_leaving_cycle, old_head] = -np.Infinity
+
+        # analogous to the outgoing weights
+        incoming_weights[:] = -np.Infinity
+        incoming_weights[max_incoming_inds,
+                         np.arange(len(outside))] = max_incoming_weights
+        self.dependency_weights[cycle_inds, outside] = incoming_weights
+
+    def _find_maximum_spanning_tree(self):
+        """
+        Run the Chu-Liu/Edmonds algorithm in order to find the highest
+        scoring dependency tree from the dependency graph weights.
+
+        :returns: a 1-dim array with the head of each token in the sentence
+        """
+        # pick the highest scoring dependency for each word
+        heads = self.dependency_weights.argmax(1)
+
+        # check if there are cycles. if there isn't any, we're done
+        cycles = self._find_cycles(heads)
+
+        for cycle in cycles:
+            # resolve each cycle c
+            self._contract_cycle(heads, cycle)
+
+        return heads
+
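Note that this hunk is not self-contained: names such as ConvolutionalNetwork, FLOAT_t and logging are not defined here and presumably come from network.pyx, into which this file appears to be included when network.cpython-311-darwin.so is built.

The decoding strategy documented in _find_maximum_spanning_tree (greedily pick each token's highest-scoring head, then detect cycles and contract them) can be illustrated outside Cython. The following is a minimal plain-Python sketch under that reading; the toy weights matrix and the find_cycles helper are illustrative stand-ins for dependency_weights and _find_cycles, not code from the package. The convention that a head value equal to the sentence length denotes the root follows the tag_sentence docstring above.

import numpy as np

def find_cycles(heads):
    """Return the cycles in a head array where heads[i] == len(heads) means the root."""
    root = len(heads)
    acyclic = set()        # vertices known not to lie on any cycle
    in_cycle = set()       # vertices already assigned to a detected cycle
    cycles = []
    for start in range(len(heads)):
        if start in acyclic or start in in_cycle:
            continue
        path, position = [], {}
        v = start
        # follow the unique head pointer until we reach the root, a known vertex,
        # or a vertex already seen on this walk (which closes a cycle)
        while v != root and v not in acyclic and v not in in_cycle and v not in position:
            position[v] = len(path)
            path.append(v)
            v = int(heads[v])
        if v != root and v in position:
            cycle = set(path[position[v]:])
            cycles.append(cycle)
            in_cycle.update(cycle)
            acyclic.update(path[:position[v]])
        else:
            acyclic.update(path)
    return cycles

# toy score matrix shaped like dependency_weights: one row per token,
# columns are candidate heads, and the extra last column is the root
weights = np.array([[-np.inf, 9.0, 1.0],    # token 0 prefers token 1 as head
                    [8.0, -np.inf, 2.0]])   # token 1 prefers token 0 -> 2-cycle
heads = weights.argmax(1)                   # greedy choice, may contain cycles
print(heads, find_cycles(heads))            # -> [1 0] [{0, 1}]

In this toy case the two tokens pick each other as heads; the detected cycle {0, 1} is what _contract_cycle would then break by redirecting one of the two tokens to its best head outside the cycle (here, the root).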
nltkor/tag/libs/parse/__init__.py
@@ -0,0 +1 @@
+from .parse_reader import DependencyReader
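As a side note on the training objective in the networkdependencyconv.pyx hunk above: _calculate_gradients maximizes log p(head) = score(head) - logadd(scores), whose gradient with respect to the scores is one-hot(gold_head) minus the softmax of the scores; this is exactly what the -exp(score)/sum step followed by the +1 at the gold head computes. A short NumPy check with made-up scores (the values below are illustrative only):

import numpy as np

scores = np.array([1.5, 0.2, -0.7, 2.1])        # hypothetical head scores for one token
gold_head = 3

probs = np.exp(scores) / np.exp(scores).sum()   # softmax over candidate heads
gradients = -probs                              # -p(j) for every candidate head
error = 1 + gradients[gold_head]                # 1 - p(gold): the per-token error tracked above
gradients[gold_head] += 1                       # now gradients == one-hot(gold) - softmax

print(round(error, 3), gradients.round(3))      # -> 0.432 [-0.312 -0.085 -0.035  0.432]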