nltkor-1.2.14-cp311-cp311-macosx_13_0_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127)
  1. nltkor/Kor_char.py +193 -0
  2. nltkor/__init__.py +16 -0
  3. nltkor/alignment/__init__.py +1315 -0
  4. nltkor/cider/__init__.py +2 -0
  5. nltkor/cider/cider.py +55 -0
  6. nltkor/cider/cider_scorer.py +207 -0
  7. nltkor/distance/__init__.py +441 -0
  8. nltkor/distance/wasserstein.py +126 -0
  9. nltkor/etc.py +22 -0
  10. nltkor/lazyimport.py +144 -0
  11. nltkor/make_requirement.py +11 -0
  12. nltkor/metrics/__init__.py +63 -0
  13. nltkor/metrics/bartscore.py +301 -0
  14. nltkor/metrics/bertscore.py +331 -0
  15. nltkor/metrics/bleu_tensor.py +20 -0
  16. nltkor/metrics/classical.py +847 -0
  17. nltkor/metrics/entment.py +24 -0
  18. nltkor/metrics/eval.py +517 -0
  19. nltkor/metrics/mauve.py +273 -0
  20. nltkor/metrics/mauve_utils.py +131 -0
  21. nltkor/misc/__init__.py +11 -0
  22. nltkor/misc/string2string_basic_functions.py +59 -0
  23. nltkor/misc/string2string_default_tokenizer.py +83 -0
  24. nltkor/misc/string2string_hash_functions.py +159 -0
  25. nltkor/misc/string2string_word_embeddings.py +503 -0
  26. nltkor/search/__init__.py +10 -0
  27. nltkor/search/classical.py +569 -0
  28. nltkor/search/faiss_search.py +787 -0
  29. nltkor/search/kobert_tokenizer.py +181 -0
  30. nltkor/sejong/__init__.py +3 -0
  31. nltkor/sejong/__pycache__/__init__.cpython-38.pyc +0 -0
  32. nltkor/sejong/__pycache__/__init__.cpython-39.pyc +0 -0
  33. nltkor/sejong/__pycache__/sejong_download.cpython-38.pyc +0 -0
  34. nltkor/sejong/__pycache__/sejong_download.cpython-39.pyc +0 -0
  35. nltkor/sejong/__pycache__/ssem.cpython-38.pyc +0 -0
  36. nltkor/sejong/__pycache__/ssem.cpython-39.pyc +0 -0
  37. nltkor/sejong/ch.py +12 -0
  38. nltkor/sejong/dict_semClassNum.txt +491 -0
  39. nltkor/sejong/layer.txt +630 -0
  40. nltkor/sejong/sejong_download.py +87 -0
  41. nltkor/sejong/ssem.py +684 -0
  42. nltkor/similarity/__init__.py +3 -0
  43. nltkor/similarity/bartscore____.py +337 -0
  44. nltkor/similarity/bertscore____.py +339 -0
  45. nltkor/similarity/classical.py +245 -0
  46. nltkor/similarity/cosine_similarity.py +175 -0
  47. nltkor/tag/__init__.py +71 -0
  48. nltkor/tag/__pycache__/__init__.cpython-38.pyc +0 -0
  49. nltkor/tag/__pycache__/__init__.cpython-39.pyc +0 -0
  50. nltkor/tag/__pycache__/espresso_tag.cpython-38.pyc +0 -0
  51. nltkor/tag/__pycache__/espresso_tag.cpython-39.pyc +0 -0
  52. nltkor/tag/espresso_tag.py +220 -0
  53. nltkor/tag/libs/__init__.py +10 -0
  54. nltkor/tag/libs/__pycache__/__init__.cpython-38.pyc +0 -0
  55. nltkor/tag/libs/__pycache__/__init__.cpython-39.pyc +0 -0
  56. nltkor/tag/libs/__pycache__/attributes.cpython-38.pyc +0 -0
  57. nltkor/tag/libs/__pycache__/attributes.cpython-39.pyc +0 -0
  58. nltkor/tag/libs/__pycache__/config.cpython-38.pyc +0 -0
  59. nltkor/tag/libs/__pycache__/config.cpython-39.pyc +0 -0
  60. nltkor/tag/libs/__pycache__/metadata.cpython-38.pyc +0 -0
  61. nltkor/tag/libs/__pycache__/metadata.cpython-39.pyc +0 -0
  62. nltkor/tag/libs/__pycache__/reader.cpython-38.pyc +0 -0
  63. nltkor/tag/libs/__pycache__/reader.cpython-39.pyc +0 -0
  64. nltkor/tag/libs/__pycache__/taggers.cpython-38.pyc +0 -0
  65. nltkor/tag/libs/__pycache__/taggers.cpython-39.pyc +0 -0
  66. nltkor/tag/libs/__pycache__/utils.cpython-38.pyc +0 -0
  67. nltkor/tag/libs/__pycache__/utils.cpython-39.pyc +0 -0
  68. nltkor/tag/libs/__pycache__/word_dictionary.cpython-38.pyc +0 -0
  69. nltkor/tag/libs/__pycache__/word_dictionary.cpython-39.pyc +0 -0
  70. nltkor/tag/libs/arguments.py +280 -0
  71. nltkor/tag/libs/attributes.py +231 -0
  72. nltkor/tag/libs/config.py +159 -0
  73. nltkor/tag/libs/metadata.py +129 -0
  74. nltkor/tag/libs/ner/__init__.py +2 -0
  75. nltkor/tag/libs/ner/__pycache__/__init__.cpython-38.pyc +0 -0
  76. nltkor/tag/libs/ner/__pycache__/__init__.cpython-39.pyc +0 -0
  77. nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-38.pyc +0 -0
  78. nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-39.pyc +0 -0
  79. nltkor/tag/libs/ner/macmorphoreader.py +7 -0
  80. nltkor/tag/libs/ner/ner_reader.py +92 -0
  81. nltkor/tag/libs/network.c +72325 -0
  82. nltkor/tag/libs/network.cpython-311-darwin.so +0 -0
  83. nltkor/tag/libs/network.pyx +878 -0
  84. nltkor/tag/libs/networkconv.pyx +1028 -0
  85. nltkor/tag/libs/networkdependencyconv.pyx +451 -0
  86. nltkor/tag/libs/parse/__init__.py +1 -0
  87. nltkor/tag/libs/parse/__pycache__/__init__.cpython-38.pyc +0 -0
  88. nltkor/tag/libs/parse/__pycache__/__init__.cpython-39.pyc +0 -0
  89. nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-38.pyc +0 -0
  90. nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-39.pyc +0 -0
  91. nltkor/tag/libs/parse/parse_reader.py +283 -0
  92. nltkor/tag/libs/pos/__init__.py +2 -0
  93. nltkor/tag/libs/pos/__pycache__/__init__.cpython-38.pyc +0 -0
  94. nltkor/tag/libs/pos/__pycache__/__init__.cpython-39.pyc +0 -0
  95. nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-38.pyc +0 -0
  96. nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-39.pyc +0 -0
  97. nltkor/tag/libs/pos/macmorphoreader.py +7 -0
  98. nltkor/tag/libs/pos/pos_reader.py +97 -0
  99. nltkor/tag/libs/reader.py +485 -0
  100. nltkor/tag/libs/srl/__init__.py +3 -0
  101. nltkor/tag/libs/srl/__pycache__/__init__.cpython-38.pyc +0 -0
  102. nltkor/tag/libs/srl/__pycache__/__init__.cpython-39.pyc +0 -0
  103. nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-38.pyc +0 -0
  104. nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-39.pyc +0 -0
  105. nltkor/tag/libs/srl/__pycache__/train_srl.cpython-38.pyc +0 -0
  106. nltkor/tag/libs/srl/__pycache__/train_srl.cpython-39.pyc +0 -0
  107. nltkor/tag/libs/srl/__srl_reader_.py +535 -0
  108. nltkor/tag/libs/srl/srl_reader.py +436 -0
  109. nltkor/tag/libs/srl/train_srl.py +87 -0
  110. nltkor/tag/libs/taggers.py +926 -0
  111. nltkor/tag/libs/utils.py +384 -0
  112. nltkor/tag/libs/word_dictionary.py +239 -0
  113. nltkor/tag/libs/wsd/__init__.py +2 -0
  114. nltkor/tag/libs/wsd/__pycache__/__init__.cpython-38.pyc +0 -0
  115. nltkor/tag/libs/wsd/__pycache__/__init__.cpython-39.pyc +0 -0
  116. nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-38.pyc +0 -0
  117. nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-39.pyc +0 -0
  118. nltkor/tag/libs/wsd/macmorphoreader.py +7 -0
  119. nltkor/tag/libs/wsd/wsd_reader.py +93 -0
  120. nltkor/tokenize/__init__.py +62 -0
  121. nltkor/tokenize/ko_tokenize.py +115 -0
  122. nltkor/trans.py +121 -0
  123. nltkor-1.2.14.dist-info/LICENSE.txt +1093 -0
  124. nltkor-1.2.14.dist-info/METADATA +41 -0
  125. nltkor-1.2.14.dist-info/RECORD +127 -0
  126. nltkor-1.2.14.dist-info/WHEEL +5 -0
  127. nltkor-1.2.14.dist-info/top_level.txt +1 -0
@@ -0,0 +1,878 @@
+ # -*- coding: utf-8 -*-
+ #cython: embedsignature=True
+ #cython: profile=True
+ #cython: language_level=3
+
+ """
+ A neural network for NLP tagging tasks.
+ It employs feature tables to store feature vectors for each token.
+ """
+
+ import numpy as np
+ cimport numpy as np
+ cimport cython
+ from cpython cimport bool
+
+ #from itertools import izip
+ import logging
+
+ #ctypedef float FLOAT_t
+ ctypedef np.float_t FLOAT_t
+ #ctypedef int INT_t
+ ctypedef np.int_t INT_t
+ ctypedef np.double_t DOUBLE_t
+
+ # ----------------------------------------------------------------------
+ # Math functions
+
+ cdef logsumexp(np.ndarray a, axis=None):
+     """Compute the log of the sum of exponentials of input elements.
+     like: scipy.misc.logsumexp
+
+     Parameters
+     ----------
+     a : array_like
+         Input array.
+     axis : int, optional
+         Axis over which the sum is taken. By default `axis` is None,
+         and all elements are summed.
+
+     Returns
+     -------
+     res : ndarray
+         The result, ``np.log(np.sum(np.exp(a)))`` calculated in a numerically
+         more stable way.
+     """
+     if axis is None:
+         a = a.ravel()
+     else:
+         a = np.rollaxis(a, axis)
+     a_max = a.max(axis=0)
+     return np.log(np.sum(np.exp(a - a_max), axis=0)) + a_max
+
+ cdef hardtanh(np.ndarray weights, inplace=False):
+     """
+     Hard hyperbolic tangent.
+     If inplace is True, modifies the input weights, which will be faster.
+     """
+     if inplace:
+         out = weights
+     else:
+         out = np.copy(weights)
+     inds_greater = weights > 1
+     inds_lesser = weights < -1
+     out[inds_greater] = 1
+     out[inds_lesser] = -1
+
+     return out
+
+ cdef hardtanhd(np.ndarray[FLOAT_t, ndim=2] weights):
+     """derivative of hardtanh"""
+     cdef np.ndarray out = np.zeros_like(weights)
+     inds = np.logical_and(-1.0 <= weights, weights <= 1.0)
+     out[inds] = 1.0
+
+     return out
+
+ # ----------------------------------------------------------------------
+
+ cdef class Network:
+
+     # sizes and learning rates
+     cdef readonly int word_window_size, input_size, hidden_size, output_size
+     cdef public float learning_rate, learning_rate_features
+     cdef public float decay_factor
+     cdef public bool use_learning_rate_decay
+     cdef readonly int features_per_token
+
+     # lookup for fast access to all the token embeddings in a sentence
+     cdef np.ndarray sentence_lookup
+
+     # padding stuff
+     cdef np.ndarray padding_left, padding_right
+     cdef public np.ndarray pre_padding, pos_padding
+
+     # weights, biases, calculated values
+     cdef readonly np.ndarray hidden_weights, output_weights
+     cdef readonly np.ndarray hidden_bias, output_bias
+     cdef readonly np.ndarray input_values, hidden_values, layer2_values
+
+     # feature tables
+     cdef public list feature_tables
+
+     # transitions
+     cdef public float learning_rate_trans
+     cdef public np.ndarray transitions
+
+     # the score for a given path
+     cdef readonly float answer_score
+
+     # gradients
+     cdef readonly np.ndarray net_gradients, trans_gradients
+     cdef readonly np.ndarray input_sent_values, hidden_sent_values, layer2_sent_values
+
+     # data for statistics during training.
+     cdef float error, accuracy, float_errors, sentence_accuracy
+     cdef int num_tokens, skips
+
+     # file where the network is saved
+     cdef public str network_filename
+     cdef public str network_text_filename
+
+     # validation
+     cdef list validation_sentences
+     cdef list validation_tags
+
+     @classmethod
+     def create_new(cls, feature_tables, int word_window, int hidden_size,
+                    int output_size):
+         """
+         Creates a new neural network.
+         """
+         # sum the number of features in all tables
+         cdef int input_size = sum(table.shape[1] for table in feature_tables)
+         input_size *= word_window
+
+         # creates the weight matrices
+
+         # set the seed for replicability
+         #np.random.seed(42)
+
+         # SENNA: centered uniform distribution with variance = 1/sqrt(fanin)
+         # variance = 1/12 interval ^ 2
+         # interval = 3.46 / fanin ^ 1/4
+         #high = 1.732 / np.power(input_size, 0.25)  # SENNA: 0.416
+         high = 2.38 / np.sqrt(input_size)  # [Bottou-88]
+         #high = 0.1  # Fonseca
+         hidden_weights = np.random.uniform(-high, high, (hidden_size, input_size))
+         hidden_bias = np.random.uniform(-high, high, (hidden_size))
+         #high = 1.732 / np.power(hidden_size, 0.25)  # SENNA
+         high = 2.38 / np.sqrt(hidden_size)  # [Bottou-88]
+         #high = 0.1  # Fonseca
+         output_weights = np.random.uniform(-high, high, (output_size, hidden_size))
+         output_bias = np.random.uniform(-high, high, (output_size))
+
+         high = 1.0
+         # +1 is due to the initial transition
+         transitions = np.random.uniform(-high, high, (output_size + 1, output_size))
+
+         net = Network(word_window, input_size, hidden_size, output_size,
+                       hidden_weights, hidden_bias, output_weights, output_bias,
+                       transitions)
+         net.feature_tables = feature_tables
+
+         return net
+
+     def __init__(self, word_window, input_size, hidden_size, output_size,
+                  hidden_weights, hidden_bias, output_weights, output_bias,
+                  transitions=None):
+         """
+         This function isn't expected to be directly called.
+         Instead, use the classmethods load_from_file or
+         create_new.
+
+         :param transitions: transition weights. If None uses
+             Window Level Likelihood instead of Sentence Level Likelihood.
+         """
+         self.learning_rate = 0
+         self.learning_rate_features = 0
+
+         self.word_window_size = word_window
+         self.input_size = input_size
+         self.hidden_size = hidden_size
+         self.output_size = output_size
+         self.features_per_token = input_size // word_window
+
+         # A_i_j score for jumping from tag i to j
+         # A_0_i = transitions[-1]
+         self.transitions = transitions
+
+         self.hidden_weights = hidden_weights
+         self.hidden_bias = hidden_bias
+         self.output_weights = output_weights
+         self.output_bias = output_bias
+
+         self.validation_sentences = None
+         self.validation_tags = None
+
+         self.use_learning_rate_decay = False
+
+     def description(self):
+         """
+         Returns a textual description of the network.
+         """
+         table_dims = [str(t.shape[1]) for t in self.feature_tables]
+         table_dims = ', '.join(table_dims)
+
+         desc = """
+         Word window size: %d
+         Feature table sizes: %s
+         Input layer size: %d
+         Hidden layer size: %d
+         Output size: %d
+         """ % (self.word_window_size, table_dims, self.input_size, self.hidden_size, self.output_size)
+
+         return desc
+
+     def _create_sentence_lookup(self, np.ndarray sentence):
+         """
+         Create a lookup matrix with the embedding values for all tokens in a sentence.
+         """
+         #print (sentence)
+         cdef np.ndarray padded_sentence = np.concatenate((self.pre_padding,
+                                                           sentence,
+                                                           self.pos_padding))
+
+         # make sure it works on 32 bit python installations
+         padded_sentence = padded_sentence.astype(int)
+         #padded_sentence = padded_sentence.astype(np.int64)
+
+         self.sentence_lookup = np.empty((len(padded_sentence), self.features_per_token))
+         ind_from = 0
+
+         for i, table in enumerate(self.feature_tables):
+             num_dims = table.shape[1]
+             ind_to = ind_from + num_dims
+
+             token_indices = padded_sentence[:, i]
+             embeddings = table.take(token_indices, axis=0)
+             self.sentence_lookup[:, ind_from:ind_to] = embeddings
+
+             ind_from = ind_to
+
+     property padding_left:
+         """
+         The padding element filling the "void" before the beginning
+         of the sentence.
+         """
+         def __get__(self):
+             return self.padding_left
+
+         def __set__(self, np.ndarray padding_left):
+             self.padding_left = padding_left
+             self.pre_padding = np.array((self.word_window_size // 2) * [padding_left])
+
+     property padding_right:
+         """
+         The padding element filling the "void" after the end
+         of the sentence.
+         """
+         def __get__(self):
+             return self.padding_right
+
+         def __set__(self, np.ndarray padding_right):
+             self.padding_right = padding_right
+             self.pos_padding = np.array((self.word_window_size // 2) * [padding_right])
+
+     def tag_sentence(self, np.ndarray sentence):
+         """
+         Runs the network for each element in the sentence and returns
+         the sequence of tags.
+
+         :param sentence: a 2-dim numpy array, where each item encodes a token.
+         """
+         scores = self._tag_sentence(sentence)
+         # computes full score, combining ftheta and A (if SLL)
+         return self._viterbi(scores)
+
+     def _tag_sentence(self, np.ndarray sentence, tags=None):
+         """
+         Runs the network for each element in the sentence and returns
+         the sequence of tags.
+
+         :param sentence: a 2-dim numpy array, where each item encodes a token.
+         :param tags: the correct tags (needed when training)
+         :return: a (len(sentence), output_size) array with the scores for all tokens
+         """
+         cdef np.ndarray answer
+         cdef np.ndarray input_data
+         # scores[t, i] = ftheta_i,t = score for i-th tag, t-th word
+         cdef np.ndarray scores = np.empty((len(sentence), self.output_size))
+
+         training = tags is not None
+         if training:
+             self.input_sent_values = np.empty((len(sentence), self.input_size))
+             # layer2_values at each token in the correct path
+             self.layer2_sent_values = np.empty((len(sentence), self.hidden_size))
+             # hidden_values at each token in the correct path
+             self.hidden_sent_values = np.empty((len(sentence), self.hidden_size))
+
+         self._create_sentence_lookup(sentence)
+
+         # run through all windows in the sentence
+         for i in range(len(sentence)):
+             input_data = self.sentence_lookup[i:i + self.word_window_size].flatten()
+
+             # (hidden_size, input_size) . input_size = hidden_size
+             self.layer2_values = self.hidden_weights.dot(input_data) + self.hidden_bias
+             self.hidden_values = hardtanh(self.layer2_values, inplace=not training)
+             output = self.output_weights.dot(self.hidden_values) + self.output_bias
+             scores[i] = output
+
+             if training:
+                 self.input_sent_values[i] = input_data
+                 self.layer2_sent_values[i] = self.layer2_values
+                 self.hidden_sent_values[i] = self.hidden_values
+
+         if training:
+             if self._calculate_gradients_sll(tags, scores):
+                 self._backpropagate(sentence)
+
+         return scores
+
+     def _calculate_delta(self, scores):
+         """
+         Calculates a matrix with the scores for all possible paths at all given
+         points (tokens).
+         In the returned matrix, delta[i][j] means the sum of all scores
+         ending in token i with tag j (delta_i(j) in eq. 14 in the paper)
+         """
+         # logadd for first token. the transition score of the starting tag must be used.
+         # it turns out that logadd = log(exp(score)) = score
+         # (use long double because taking exp's leads to very very big numbers)
+         # scores[t][k] = ftheta_k,t
+         delta = scores
+
+         # transitions[-1] represents initial transition, A_0,i in paper (misspelled as A_i,0)
+         # delta_0(k) = ftheta_k,0 + A_0,i
+         delta[0] += self.transitions[-1]
+
+         # logadd for the remaining tokens
+         # delta_t(k) = ftheta_k,t + logadd_i(delta_t-1(i) + A_i,k)
+         #            = ftheta_k,t + log(Sum_i(exp(delta_t-1(i) + A_i,k)))
+         transitions = self.transitions[:-1].T  # A_k,i
+
+         for token in range(1, len(delta)):
+             # sum by rows
+             logadd = logsumexp(delta[token - 1] + transitions, 1)
+             delta[token] += logadd
+
+         return delta
+
+     @cython.boundscheck(False)
+     def _calculate_gradients_sll(self, tags, scores):
+         """
+         Calculates the output and transition deltas for each token, using Sentence Level Likelihood.
+         The aim is to minimize the cost:
+         C(theta,A) = logadd(scores for all possible paths) - score(correct path)
+
+         :returns: if True, normal gradient calculation was performed.
+             If False, the error was too low and weight correction should be
+             skipped.
+         """
+         cdef np.ndarray[DOUBLE_t, ndim=2] delta  # (len(sentence), output_size)
+         cdef np.ndarray[DOUBLE_t, ndim=2] delta_softmax  # (output_size, output_size)
+
+         # ftheta_i,t = network output for i-th tag, at t-th word
+         # s = Sum_i(A_tags[i-1],tags[i] + ftheta_i,i), i < len(sentence)   (12)
+         correct_path_score = 0
+         last_tag = self.output_size
+         for tag, net_scores in zip(tags[:-2], scores[:-2]):
+             #print(tags, last_tag, tag, self.transitions)
+             trans = 0 if self.transitions is None else self.transitions[last_tag, tag]
+             #print ('trans:',trans)
+             #print ('net:',tag, net_scores[tag])
+             #print
+             correct_path_score += trans + net_scores[tag]
+             last_tag = tag
+
+         # delta[t] = delta_t in equation (14)
+         delta = self._calculate_delta(scores)
+         # logadd_i(delta_T(i)) = log(Sum_i(exp(delta_T(i))))
+         # Sentence-level Log-Likelihood (SLL)
+         # C(ftheta,A) = logadd_j(s(x, j, theta, A)) - score(correct path)
+         #print ': ', np.log(np.sum(np.exp(delta[-1])))
+         #print '> ', correct_path_score
+         error = np.log(np.sum(np.exp(delta[-1]))) - correct_path_score
+         #print error
+         #error = logsumexp(delta[-1]) - correct_path_score
+         self.error += error
+
+         # if the error is too low, don't bother training (saves time and avoids
+         # overfitting). An error of 0.01 means a log-prob of -0.01 for the right
+         # tag, i.e., more than 99% probability
+         # error 0.69 -> 50% probability for right tag (minimal threshold)
+         # error 0.22 -> 80%
+         # error 0.1  -> 90%
+         if error <= 0.01:
+             self.skips += 1
+             return False
+
+         # initialize gradients
+         # dC / dftheta
+         self.net_gradients = np.zeros((len(tags), self.output_size))
+         # dC / dA
+         #self.trans_gradients = np.zeros_like(self.transitions, float)
+         self.trans_gradients = np.zeros_like(self.transitions, np.float64)
+
+         # things get nasty from here
+         # refer to the papers to understand what exactly is going on
+
+         # compute the gradients for the last token
+         # dC_logadd / ddelta_T(i) = e(delta_T(i))/Sum_k(e(delta_T(k)))
+         # Compute it using the log:
+         # log(e(delta_T(i))/Sum_k(e(delta_T(k)))) =
+         # log(e(delta_T(i))) - log(Sum_k(e(delta_T(k)))) =
+         # delta_T(i) - logsumexp(delta_T(k))
+         # dC_logadd / ddelta_T(i) = e(delta_T(i) - logsumexp(delta_T(k)))
+         sumlogadd = logsumexp(delta[-1])
+         # negative gradients
+         self.net_gradients[-1] = -np.exp(delta[-1] - sumlogadd)
+
+         transitions_t = 0 if self.transitions is None else self.transitions[:-1].T
+
+         # delta[i][j]: sum of scores of all paths that assign tag j to the i-th token
+
+         # now compute the gradients for the other tokens, from last to first
+         for t in range(len(scores) - 2, -1, -1):
+
+             # sum the scores for all paths ending with each tag i at token t
+             # with the transitions from tag i to the next tag j
+             # Obtained by transposing twice
+             # [delta_t-1(i)+A_j,i]T
+             path_scores = (delta[t] + transitions_t).T
+
+             # normalize over all possible tag paths using a softmax,
+             # computed using log.
+             # the log of the sums of exps, summed by column
+             log_sum_scores = logsumexp(path_scores, 0)
+
+             # softmax is the division of an exponential by the sum of all exponentials
+             # (yields a probability)
+             # e(delta_t-1(i)+A_i,j) / Sum_k e(delta_t-1(k)+A_k,j)
+             delta_softmax = np.exp(path_scores - log_sum_scores)
+
+             # multiply each value in the softmax by the gradient at the next tag
+             # dC_logadd / ddelta_t(i) * delta_softmax
+             # Attardi: negative since net_gradients[t + 1] already negative
+             grad_times_softmax = self.net_gradients[t + 1] * delta_softmax
+             # dC / dA_i,j
+             self.trans_gradients[:-1, :] += grad_times_softmax
+
+             # sum all transition gradients by row to find the network gradients
+             # Sum_j(dC_logadd / ddelta_t(j) * delta_softmax)
+             # Attardi: negative since grad_times_softmax already negative
+             self.net_gradients[t] = np.sum(grad_times_softmax, 1)
+
+         # find the gradients for the starting transition
+         # there is only one possibility to come from, which is the sentence start
+         self.trans_gradients[-1] = self.net_gradients[0]
+
+         # now, add +1 to the correct path
+         last_tag = self.output_size
+         for token, tag in enumerate(tags):
+             self.net_gradients[token][tag] += 1  # negative gradient
+             if self.transitions is not None:
+                 self.trans_gradients[last_tag][tag] += 1  # negative gradient
+             last_tag = tag
+
+         return True
+
+     @cython.boundscheck(False)
+     def _calculate_gradients_wll(self, tags, scores):
+         """
+         Calculates the output for each token, using Word Level Likelihood.
+         The aim is to minimize the word-level log-likelihood:
+         C(ftheta) = logadd_j(ftheta_j) - ftheta_y,
+         where y is the sequence of correct tags
+
+         :returns: if True, normal gradient calculation was performed.
+             If False, the error was too low and weight correction should be
+             skipped.
+         """
+         # compute the negative gradient with respect to ftheta
+         # dC / dftheta_i = e(ftheta_i)/Sum_k(e(ftheta_k))
+         exponentials = np.exp(scores)
+         # FIXME: use logsumexp
+         # ((len(sentence), self.output_size))
+         self.net_gradients = -(exponentials.T / exponentials.sum(1)).T
+
+         # correct path and its gradient
+         correct_path_score = 0
+         token = 0
+         for tag, net_scores in zip(tags, scores):
+             self.net_gradients[token][tag] += 1  # negative gradient
+             token += 1
+             correct_path_score += net_scores[tag]
+
+         # C(ftheta) = logadd_j(ftheta_j) - score(correct path)
+         #error = np.log(np.sum(np.exp(scores))) - correct_path_score
+         error = logsumexp(scores) - correct_path_score
+         # approximate
+         #error = np.max(scores) - correct_path_score
+         self.error += error
+
+         return True
+
+     @cython.boundscheck(False)
+     def _viterbi(self, np.ndarray[FLOAT_t, ndim=2] scores):
+         """
+         Performs a Viterbi search over the scores for each tag using
+         the transitions matrix. If a matrix wasn't supplied,
+         it will return the tags with the highest scores individually.
+         """
+         # pretty straightforward
+         if self.transitions is None or len(scores) == 1:
+             return scores.argmax(1)
+
+         path_scores = np.empty_like(scores)
+         #path_backtrack = np.empty_like(scores, np.int64)
+         path_backtrack = np.empty_like(scores, int)
+
+         # now the actual Viterbi algorithm
+         # first, get the scores for each tag at token 0
+         # the last row of the transitions table has the scores for the first tag
+         path_scores[0] = scores[0] + self.transitions[-1]
+
+         output_range = np.arange(self.output_size)  # outside loop. Attardi
+         transitions = self.transitions[:-1]  # idem
+
+         cdef int i
+         for i in range(1, len(scores)):
+
+             # each line contains the score until each tag t plus the transition to each other tag t'
+             prev_score_and_trans = (path_scores[i - 1] + transitions.T).T
+
+             # find the previous tag that yielded the max score
+             path_backtrack[i] = prev_score_and_trans.argmax(0)
+             path_scores[i] = prev_score_and_trans[path_backtrack[i],
+                                                   output_range] + scores[i]
+
+         # now find the maximum score for the last token and follow the backtrack
+         #answer = np.empty(len(scores), dtype=np.int64)
+         answer = np.empty(len(scores), dtype=int)
+         answer[-1] = path_scores[-1].argmax()
+         self.answer_score = path_scores[-1][answer[-1]]
+         previous_tag = path_backtrack[-1][answer[-1]]
+
+         for i in range(len(scores) - 2, 0, -1):
+             answer[i] = previous_tag
+             previous_tag = path_backtrack[i][previous_tag]
+
+         answer[0] = previous_tag
+         return answer
+
+     def set_validation_data(self, list validation_sentences=None,
+                             list validation_tags=None):
+         """
+         Sets the data to be used during validation. If this function is not
+         called before training, the training data is used to measure performance.
+
+         :param validation_sentences: sentences to be used in validation.
+         :param validation_tags: tags for the validation sentences.
+         """
+         self.validation_sentences = validation_sentences
+         self.validation_tags = validation_tags
+
+     def set_learning_rate_decay(self, float decay_factor=1.0):
+         """
+         Sets the network to use learning rate decay.
+
+         The learning rate at each iteration t is determined as:
+         initial_rate / (1 + t * decay_factor)
+
+         with t starting from 0
+         """
+         self.use_learning_rate_decay = True
+         self.decay_factor = decay_factor
+
+     def decrease_learning_rates(self, epoch):
+         """
+         Apply the learning rate decay, if the network was configured to use it.
+         """
+         if not self.use_learning_rate_decay or epoch == 0:
+             return
+
+         # multiplying the last rate by this adjustment is equivalent to
+         # initial_rate / (1 + t * decay_factor)
+         # and we don't need to store the initial rates
+         factor = (1.0 + (epoch - 1) * self.decay_factor) / (1 + epoch * self.decay_factor)
+         self.learning_rate *= factor
+         self.learning_rate_features *= factor
+         if self.transitions is not None:
+             self.learning_rate_trans *= factor
+
+     def train(self, list sentences, list tags,
+               int epochs, int epochs_between_reports=0,
+               float desired_accuracy=0):
+         """
+         Trains the network to tag sentences.
+
+         :param sentences: a list of 2-dim numpy arrays, where each item
+             encodes a sentence. Each item in a sentence has the
+             indices to its features.
+         :param tags: a list of 1-dim numpy arrays, where each item has
+             the tags of the sentences.
+         :param epochs: number of training epochs
+         :param epochs_between_reports: number of epochs to wait between
+             reports about the training performance. 0 means no reports.
+         :param desired_accuracy: training stops if the desired accuracy
+             is reached. Ignored if 0.
+         """
+         logger = logging.getLogger("Logger")
+         logger.info("Training for up to %d epochs" % epochs)
+         top_accuracy = 0
+         last_accuracy = 0
+         last_error = np.Infinity
+         self.num_tokens = sum(len(sent) for sent in sentences)
+
+         if self.validation_sentences is None:
+             self.validation_sentences = sentences
+             self.validation_tags = tags
+
+         for i in range(epochs):
+             self.decrease_learning_rates(i)
+             self._train_epoch(sentences, tags)
+             self._validate()
+
+             # normalize error
+             self.error = self.error / self.num_tokens if self.num_tokens else np.Infinity
+
+             # Attardi: save model
+             if self.accuracy > top_accuracy:
+                 top_accuracy = self.accuracy
+                 self.save()
+                 logger.debug("Saved model")
+             elif self.use_learning_rate_decay:
+                 # this iteration didn't bring improvements; load the last saved model
+                 # before continuing training with a lower rate
+                 self._load_parameters()
+
+             if (epochs_between_reports > 0 and i % epochs_between_reports == 0) \
+                     or self.accuracy >= desired_accuracy > 0 \
+                     or (self.accuracy < last_accuracy and self.error > last_error):
+
+                 self._print_epoch_report(i + 1)
+
+             if self.accuracy >= desired_accuracy > 0 \
+                     or (self.error > last_error and self.accuracy < last_accuracy):
+                 break
+
+             last_accuracy = self.accuracy
+             last_error = self.error
+
+         self.num_tokens = 0
+
+     def _print_epoch_report(self, int num):
+         """
+         Reports the status of the network in the given training
+         epoch, including error and accuracy.
+         """
+         logger = logging.getLogger("Logger")
+         logger.info("%d epochs   Error: %f   Accuracy: %f   " \
+                     "%d corrections skipped   " \
+                     "learning rate: %f" % (num,
+                                            self.error,
+                                            self.accuracy,
+                                            self.skips,
+                                            self.learning_rate))
+
+     def _train_epoch(self, list sentences, list tags):
+         """
+         Trains for one epoch with all examples.
+         """
+         self.error = 0
+         self.skips = 0
+         self.float_errors = 0
+
+         # shuffle data
+         # get the random number generator state in order to shuffle
+         # sentences and their tags in the same order
+         random_state = np.random.get_state()
+         np.random.shuffle(sentences)
+         np.random.set_state(random_state)
+         np.random.shuffle(tags)
+
+         for sent, sent_tags in zip(sentences, tags):
+             try:
+                 self._tag_sentence(sent, sent_tags)
+             except FloatingPointError:
+                 # just ignore the sentence in case of an overflow
+                 self.float_errors += 1
+
+     def _validate(self):
+         """Perform validation on validation data and estimate accuracy"""
+         hits = 0
+
+         for sent, gold_tags in zip(self.validation_sentences, self.validation_tags):
+             answer = self.tag_sentence(sent)
+             hits += np.count_nonzero(answer == gold_tags)
+
+         # self.num_tokens stores number of tokens in training sentences
+         num_tokens = sum(len(sent) for sent in self.validation_sentences)
+         self.accuracy = float(hits) / num_tokens
+
+     def _backpropagate(self, sentence):
+         """
+         Backpropagate the gradients of the cost.
+         """
+         # f_1 = input_sent_values
+         # f_2 = M_1 f_1 + b_2 = layer2_values
+         # f_3 = hardTanh(f_2) = hidden_values
+         # f_4 = M_2 f_3 + b_4
+
+         # For l = 4..1 do:
+         # dC / dtheta_l = df_l / dtheta_l dC / df_l    (19)
+         # dC / df_l-1 = df_l / df_l-1 dC / df_l        (20)
+
+         """
+         Compute the gradients of the cost for each layer
+         """
+         # layer 4: output layer
+         # dC / dW_4 = dC / df_4 f_3.T                  (22)
+         # (len, output_size).T (len, hidden_size) = (output_size, hidden_size)
+         cdef np.ndarray[FLOAT_t, ndim=2] output_deltas
+         output_deltas = self.net_gradients.T.dot(self.hidden_sent_values)
+
+         # dC / db_4 = dC / df_4                        (22)
+         # (output_size) += ((len(sentence), output_size))
+         # sum by column, i.e. all changes through the sentence
+         output_bias_deltas = self.net_gradients.sum(0)
+
+         # dC / df_3 = M_2.T dC / df_4                  (23)
+         # (len, output_size) (output_size, hidden_size) = (len, hidden_size)
+         dCdf_3 = self.net_gradients.dot(self.output_weights)
+
+         # layer 3: HardTanh layer
+         # no weights to adjust
+
+         # dC / df_2 = hardtanhd(f_2) * dC / df_3
+         # (len, hidden_size) (len, hidden_size)
+         # FIXME: this goes quickly to 0.
+         dCdf_2 = hardtanhd(self.layer2_sent_values) * dCdf_3
+
+         # df_2 / df_1 = M_1
+
+         # layer 2: linear layer
+         # dC / dW_2 = dC / df_2 f_1.T                  (22)
+         cdef np.ndarray[FLOAT_t, ndim=2] hidden_deltas
+         # (len, hidden_size).T (len, input_size) = (hidden_size, input_size)
+         hidden_deltas = dCdf_2.T.dot(self.input_sent_values)
+
+         # dC / db_2 = dC / df_2                        (22)
+         # sum by column contribution by each token
+         hidden_bias_deltas = dCdf_2.sum(0)
+
+         # dC / df_1 = M_1.T dC / df_2
+         cdef np.ndarray[FLOAT_t, ndim=2] input_gradients
+         # (len, hidden_size) (hidden_size, input_size) = (len, input_size)
+         input_gradients = dCdf_2.dot(self.hidden_weights)
+
+         """
+         Adjust the weights.
+         """
+         self.output_weights += output_deltas * self.learning_rate
+         self.output_bias += output_bias_deltas * self.learning_rate
+         self.hidden_weights += hidden_deltas * self.learning_rate
+         self.hidden_bias += hidden_bias_deltas * self.learning_rate
+
+         """
+         Adjust the features indexed by the input window.
+         """
+         # the deltas that will be applied to the feature tables
+         # they are in the same sequence as the network receives them, i.e.
+         # [token1-table1][token1-table2][token2-table1][token2-table2] (...)
+         # input_size = num features * window (e.g. 60 * 5). Attardi
+         cdef np.ndarray[FLOAT_t, ndim=2] input_deltas
+         # (len, input_size)
+         input_deltas = input_gradients * self.learning_rate_features
+
+         padded_sentence = np.concatenate((self.pre_padding,
+                                           sentence,
+                                           self.pos_padding))
+
+         cdef np.ndarray[INT_t, ndim=1] features
+         cdef np.ndarray[FLOAT_t, ndim=2] table
+         cdef int start, end, t
+         cdef int i, j
+
+         for i, w_deltas in enumerate(input_deltas):
+             # for each window (w_deltas: 300, features: 5)
+             # this tracks where the deltas for the next table begin
+             start = 0
+             for features in padded_sentence[i:i+self.word_window_size]:
+                 # select the columns for each feature_tables (t: 3)
+                 for t, table in enumerate(self.feature_tables):
+                     end = start + table.shape[1]
+                     table[features[t]] += w_deltas[start:end]
+                     start = end
+
+         # Adjusts the transition scores table with the calculated gradients.
+         if self.transitions is not None:
+             self.transitions += self.trans_gradients * self.learning_rate_trans
+
+     def _load_parameters(self):
+         """
+         Loads weights, feature tables and transition tables previously saved.
+         """
+         data = np.load(self.network_filename)
+         self.hidden_weights = data['hidden_weights']
+         self.hidden_bias = data['hidden_bias']
+         self.output_weights = data['output_weights']
+         self.output_bias = data['output_bias']
+         self.feature_tables = list(data['feature_tables'])
+
+         # check if transitions isn't None (numpy saves everything as an array)
+         if data['transitions'].shape != ():
+             self.transitions = data['transitions']
+         else:
+             self.transitions = None
+
+     def save(self):
+         """
+         Saves the neural network to a file.
+         It will save the weights, biases, sizes, padding,
+         and feature tables.
+         """
+         np.savez(self.network_filename, hidden_weights=self.hidden_weights,
+                  output_weights=self.output_weights,
+                  hidden_bias=self.hidden_bias, output_bias=self.output_bias,
+                  word_window_size=self.word_window_size,
+                  input_size=self.input_size, hidden_size=self.hidden_size,
+                  output_size=self.output_size, padding_left=self.padding_left,
+                  padding_right=self.padding_right, transitions=self.transitions,
+                  feature_tables=self.feature_tables)
+
+
+     @classmethod
+     def load_from_file(cls, filename):
+         """
+         Loads the neural network from a file.
+         It will load weights, biases, sizes, padding,
+         and feature tables.
+         """
+         data = np.load(filename)
+         #print filename
+         # cython classes don't have the __dict__ attribute
+         # so we can't do an elegant self.__dict__.update(data)
+         hidden_weights = data['hidden_weights']
+         hidden_bias = data['hidden_bias']
+         output_weights = data['output_weights']
+         output_bias = data['output_bias']
+
+         word_window_size = data['word_window_size']
+         input_size = data['input_size']
+         hidden_size = data['hidden_size']
+         output_size = data['output_size']
+         if 'transitions' in data:
+             transitions = data['transitions']
+         else:
+             transitions = None
+
+         nn = Network(word_window_size, input_size, hidden_size, output_size,
+                      hidden_weights, hidden_bias, output_weights, output_bias,
+                      transitions)
+
+         nn.padding_left = data['padding_left']
+         nn.padding_right = data['padding_right']
+         nn.pre_padding = np.array((nn.word_window_size // 2) * [nn.padding_left])
+         nn.pos_padding = np.array((nn.word_window_size // 2) * [nn.padding_right])
+         nn.feature_tables = list(data['feature_tables'])
+         nn.network_filename = filename
+
+         return nn
+
+ # include the files for other networks
+ # this comes here after the Network class has already been defined
+ include "networkconv.pyx"
+ include "networkdependencyconv.pyx"