nltkor-1.2.14-cp311-cp311-macosx_13_0_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127)
  1. nltkor/Kor_char.py +193 -0
  2. nltkor/__init__.py +16 -0
  3. nltkor/alignment/__init__.py +1315 -0
  4. nltkor/cider/__init__.py +2 -0
  5. nltkor/cider/cider.py +55 -0
  6. nltkor/cider/cider_scorer.py +207 -0
  7. nltkor/distance/__init__.py +441 -0
  8. nltkor/distance/wasserstein.py +126 -0
  9. nltkor/etc.py +22 -0
  10. nltkor/lazyimport.py +144 -0
  11. nltkor/make_requirement.py +11 -0
  12. nltkor/metrics/__init__.py +63 -0
  13. nltkor/metrics/bartscore.py +301 -0
  14. nltkor/metrics/bertscore.py +331 -0
  15. nltkor/metrics/bleu_tensor.py +20 -0
  16. nltkor/metrics/classical.py +847 -0
  17. nltkor/metrics/entment.py +24 -0
  18. nltkor/metrics/eval.py +517 -0
  19. nltkor/metrics/mauve.py +273 -0
  20. nltkor/metrics/mauve_utils.py +131 -0
  21. nltkor/misc/__init__.py +11 -0
  22. nltkor/misc/string2string_basic_functions.py +59 -0
  23. nltkor/misc/string2string_default_tokenizer.py +83 -0
  24. nltkor/misc/string2string_hash_functions.py +159 -0
  25. nltkor/misc/string2string_word_embeddings.py +503 -0
  26. nltkor/search/__init__.py +10 -0
  27. nltkor/search/classical.py +569 -0
  28. nltkor/search/faiss_search.py +787 -0
  29. nltkor/search/kobert_tokenizer.py +181 -0
  30. nltkor/sejong/__init__.py +3 -0
  31. nltkor/sejong/__pycache__/__init__.cpython-38.pyc +0 -0
  32. nltkor/sejong/__pycache__/__init__.cpython-39.pyc +0 -0
  33. nltkor/sejong/__pycache__/sejong_download.cpython-38.pyc +0 -0
  34. nltkor/sejong/__pycache__/sejong_download.cpython-39.pyc +0 -0
  35. nltkor/sejong/__pycache__/ssem.cpython-38.pyc +0 -0
  36. nltkor/sejong/__pycache__/ssem.cpython-39.pyc +0 -0
  37. nltkor/sejong/ch.py +12 -0
  38. nltkor/sejong/dict_semClassNum.txt +491 -0
  39. nltkor/sejong/layer.txt +630 -0
  40. nltkor/sejong/sejong_download.py +87 -0
  41. nltkor/sejong/ssem.py +684 -0
  42. nltkor/similarity/__init__.py +3 -0
  43. nltkor/similarity/bartscore____.py +337 -0
  44. nltkor/similarity/bertscore____.py +339 -0
  45. nltkor/similarity/classical.py +245 -0
  46. nltkor/similarity/cosine_similarity.py +175 -0
  47. nltkor/tag/__init__.py +71 -0
  48. nltkor/tag/__pycache__/__init__.cpython-38.pyc +0 -0
  49. nltkor/tag/__pycache__/__init__.cpython-39.pyc +0 -0
  50. nltkor/tag/__pycache__/espresso_tag.cpython-38.pyc +0 -0
  51. nltkor/tag/__pycache__/espresso_tag.cpython-39.pyc +0 -0
  52. nltkor/tag/espresso_tag.py +220 -0
  53. nltkor/tag/libs/__init__.py +10 -0
  54. nltkor/tag/libs/__pycache__/__init__.cpython-38.pyc +0 -0
  55. nltkor/tag/libs/__pycache__/__init__.cpython-39.pyc +0 -0
  56. nltkor/tag/libs/__pycache__/attributes.cpython-38.pyc +0 -0
  57. nltkor/tag/libs/__pycache__/attributes.cpython-39.pyc +0 -0
  58. nltkor/tag/libs/__pycache__/config.cpython-38.pyc +0 -0
  59. nltkor/tag/libs/__pycache__/config.cpython-39.pyc +0 -0
  60. nltkor/tag/libs/__pycache__/metadata.cpython-38.pyc +0 -0
  61. nltkor/tag/libs/__pycache__/metadata.cpython-39.pyc +0 -0
  62. nltkor/tag/libs/__pycache__/reader.cpython-38.pyc +0 -0
  63. nltkor/tag/libs/__pycache__/reader.cpython-39.pyc +0 -0
  64. nltkor/tag/libs/__pycache__/taggers.cpython-38.pyc +0 -0
  65. nltkor/tag/libs/__pycache__/taggers.cpython-39.pyc +0 -0
  66. nltkor/tag/libs/__pycache__/utils.cpython-38.pyc +0 -0
  67. nltkor/tag/libs/__pycache__/utils.cpython-39.pyc +0 -0
  68. nltkor/tag/libs/__pycache__/word_dictionary.cpython-38.pyc +0 -0
  69. nltkor/tag/libs/__pycache__/word_dictionary.cpython-39.pyc +0 -0
  70. nltkor/tag/libs/arguments.py +280 -0
  71. nltkor/tag/libs/attributes.py +231 -0
  72. nltkor/tag/libs/config.py +159 -0
  73. nltkor/tag/libs/metadata.py +129 -0
  74. nltkor/tag/libs/ner/__init__.py +2 -0
  75. nltkor/tag/libs/ner/__pycache__/__init__.cpython-38.pyc +0 -0
  76. nltkor/tag/libs/ner/__pycache__/__init__.cpython-39.pyc +0 -0
  77. nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-38.pyc +0 -0
  78. nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-39.pyc +0 -0
  79. nltkor/tag/libs/ner/macmorphoreader.py +7 -0
  80. nltkor/tag/libs/ner/ner_reader.py +92 -0
  81. nltkor/tag/libs/network.c +72325 -0
  82. nltkor/tag/libs/network.cpython-311-darwin.so +0 -0
  83. nltkor/tag/libs/network.pyx +878 -0
  84. nltkor/tag/libs/networkconv.pyx +1028 -0
  85. nltkor/tag/libs/networkdependencyconv.pyx +451 -0
  86. nltkor/tag/libs/parse/__init__.py +1 -0
  87. nltkor/tag/libs/parse/__pycache__/__init__.cpython-38.pyc +0 -0
  88. nltkor/tag/libs/parse/__pycache__/__init__.cpython-39.pyc +0 -0
  89. nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-38.pyc +0 -0
  90. nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-39.pyc +0 -0
  91. nltkor/tag/libs/parse/parse_reader.py +283 -0
  92. nltkor/tag/libs/pos/__init__.py +2 -0
  93. nltkor/tag/libs/pos/__pycache__/__init__.cpython-38.pyc +0 -0
  94. nltkor/tag/libs/pos/__pycache__/__init__.cpython-39.pyc +0 -0
  95. nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-38.pyc +0 -0
  96. nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-39.pyc +0 -0
  97. nltkor/tag/libs/pos/macmorphoreader.py +7 -0
  98. nltkor/tag/libs/pos/pos_reader.py +97 -0
  99. nltkor/tag/libs/reader.py +485 -0
  100. nltkor/tag/libs/srl/__init__.py +3 -0
  101. nltkor/tag/libs/srl/__pycache__/__init__.cpython-38.pyc +0 -0
  102. nltkor/tag/libs/srl/__pycache__/__init__.cpython-39.pyc +0 -0
  103. nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-38.pyc +0 -0
  104. nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-39.pyc +0 -0
  105. nltkor/tag/libs/srl/__pycache__/train_srl.cpython-38.pyc +0 -0
  106. nltkor/tag/libs/srl/__pycache__/train_srl.cpython-39.pyc +0 -0
  107. nltkor/tag/libs/srl/__srl_reader_.py +535 -0
  108. nltkor/tag/libs/srl/srl_reader.py +436 -0
  109. nltkor/tag/libs/srl/train_srl.py +87 -0
  110. nltkor/tag/libs/taggers.py +926 -0
  111. nltkor/tag/libs/utils.py +384 -0
  112. nltkor/tag/libs/word_dictionary.py +239 -0
  113. nltkor/tag/libs/wsd/__init__.py +2 -0
  114. nltkor/tag/libs/wsd/__pycache__/__init__.cpython-38.pyc +0 -0
  115. nltkor/tag/libs/wsd/__pycache__/__init__.cpython-39.pyc +0 -0
  116. nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-38.pyc +0 -0
  117. nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-39.pyc +0 -0
  118. nltkor/tag/libs/wsd/macmorphoreader.py +7 -0
  119. nltkor/tag/libs/wsd/wsd_reader.py +93 -0
  120. nltkor/tokenize/__init__.py +62 -0
  121. nltkor/tokenize/ko_tokenize.py +115 -0
  122. nltkor/trans.py +121 -0
  123. nltkor-1.2.14.dist-info/LICENSE.txt +1093 -0
  124. nltkor-1.2.14.dist-info/METADATA +41 -0
  125. nltkor-1.2.14.dist-info/RECORD +127 -0
  126. nltkor-1.2.14.dist-info/WHEEL +5 -0
  127. nltkor-1.2.14.dist-info/top_level.txt +1 -0
nltkor/tag/libs/srl/srl_reader.py
@@ -0,0 +1,436 @@
+ # -*- coding: utf-8 -*-
+
+ """
+ Class for dealing with SRL data.
+ """
+
+ from collections import defaultdict
+ import _pickle as cPickle
+ import logging
+ import re
+ import os
+ import numpy as np
+ #from itertools import izip
+
+ from .. import attributes
+ from .. import utils
+ from ..word_dictionary import WordDictionary
+ from .. import reader
+
+ class ConllPos(object):
+     """
+     Dummy class for storing the position of each field in a
+     CoNLL data file.
+     """
+     id = 0
+     word = 1
+     lemma = 2
+     hmorph = 3  # head morph
+     hpos = 4    # head pos
+     tmorph = 5  # tail morph
+     tpos = 6    # tail pos
+     parse = 7
+     rel = 8
+     semantic_role = 9
+     SEP = '\t'
+
+ class SRLReader(reader.TaggerReader):
+
+     def __init__(self, md=None, filename=None):
+         """
+         The reader will read sentences from a given file. This file must
+         be in the correct format (one token per line, with columns
+         indicating which tokens are predicates and their argument
+         structure).
+
+         :param filename: a file with CoNLL-like format data. If it is None,
+             the reader will be created with no data.
+         """
+
+         self.taskname = 'srl'
+         self.pos_dict = {}
+
+         if filename is not None:
+             self._read_conll(filename)
+             #self._clean_text()
+
+         super(SRLReader, self).__init__(md)
+
+
+     @property
+     def task(self):
+         """
+         Abstract Base Class (ABC) attribute.
+         """
+         return self.taskname
+
+
+     def _read_conll(self, filename):
+         '''
+         Reads a file in CoNLL format and extracts semantic role tags
+         for each token.
+         '''
+         lines = []
+         with open(filename, 'rt') as f:
+             for line in f:
+                 line = line.strip()
+                 lines.append(line)
+
+         self.sentences = []
+         self.predicates = []
+         tokens = []
+         sent_predicates = []
+         sent_tags = []
+         token_number = 0
+
+         for line in lines:
+             line = line.strip()
+
+             if line == '':
+                 # blank line between sentences
+                 if len(tokens) > 0:
+                     sentence = (tokens, sent_tags)
+                     self.sentences.append(sentence)
+                     self.predicates.append(np.array(sent_predicates))
+                     tokens = []
+                     sent_predicates = []
+                     sent_tags = []
+                     token_number = 0
+
+                 continue
+
+             fields = line.split(ConllPos.SEP)
+             idx = fields[ConllPos.id]
+             word = fields[ConllPos.word]
+             lemma = fields[ConllPos.lemma]
+             hmorph = fields[ConllPos.hmorph]
+             hpos = fields[ConllPos.hpos].lower()
+             tmorph = fields[ConllPos.tmorph]
+             tpos = fields[ConllPos.tpos].lower()
+             parse = fields[ConllPos.parse]
+             rel = fields[ConllPos.rel]
+             is_predicate = (rel[:1] == 'V')
+             tag = fields[ConllPos.semantic_role]
+
+             tag = self._read_role(tag)
+             sent_tags.append((int(parse) - 1, tag))  # consumed later by codify_sentences
+
+             token = attributes.Token(word=word, morph_h=hmorph, morph_t=tmorph, pos_t=tpos, chunk=rel)
+             #token = attributes.Token(word, morph_h=hmorph, pos_h=hpos, morph_t=tmorph, pos_t=tpos, chunk=rel)
+             tokens.append(token)
+             if is_predicate:
+                 sent_predicates.append(token_number)
+
+             token_number += 1
+
+         if len(tokens) > 0:
+             # last sentence
+             sentence = (tokens, sent_tags)
+             self.sentences.append(sentence)
+             self.predicates.append(np.array(sent_predicates))
+
+     @classmethod
+     def _read_role(cls, role):
+         '''
+         Reads the semantic role from a CoNLL-style file.
+
+         :param role: the role string as read from the CoNLL file
+         '''
+         return role
+
+     def extend(self, data):
+         """
+         Adds more data to the reader.
+
+         :param data: a list of tuples in the format (tokens, tags, predicates),
+             one for each sentence.
+         """
+         self.sentences.extend([(sent, tags) for sent, tags, _ in data])
+         self.predicates.extend([np.array(preds) for _, _, preds in data])
+
+     def load_or_create_tag_dict(self):
+         """
+         In the case of SRL argument classification or one-step SRL, try to
+         load the tag dictionary. If the file with the tags is not present,
+         a new one is created from the available sentences.
+
+         In the case of argument detection or predicate detection,
+         this function does nothing.
+         """
+         if os.path.isfile(self.md.paths['srl_tags']):
+             self.load_tag_dict()
+             return
+
+         self._create_tag_dict()
+         logger = logging.getLogger('Logger')
+         logger.info('Created SRL tag dictionary')
+
+     def _create_tag_dict(self):
+         """
+         Examine the available sentences and create a tag dictionary.
+         """
+         logger = logging.getLogger("Logger")
+         tags = {tag
+                 for _, tags in self.sentences
+                 for rel, tag in tags}
+
+         # create the dictionary now, even when IOB is used, in order to
+         # save it in a deterministic order
+         self.tag_dict = {tag: code for code, tag in enumerate(tags)}
+         reader.save_tag_dict(self.md.paths['srl_tags'], self.tag_dict)
+         logger.debug("Saved SRL tag dictionary.")
+
+
+     def load_tag_dict(self, filename=None, iob=False):
+         """
+         Loads the tag dictionary from the default file. The dictionary file
+         should have one tag per line.
+
+         :param filename: path to the tag file; defaults to the 'srl_tags'
+             path from the metadata.
+         :param iob: unused here; kept for interface compatibility.
+         """
+         if filename is None:
+             filename = self.md.paths['srl_tags']
+
+         self.tag_dict = {}
+         code = 0
+         with open(filename, 'rt') as f:
+             for tag in f:
+                 tag = tag.strip()
+                 if tag == '':
+                     continue
+
+                 self.tag_dict[tag] = code
+                 code += 1
+
+
+     def _generate_iobes_dictionary(self):
+         """
+         Generate the reader's tag dictionary mapping the IOBES tags to
+         numeric codes.
+         """
+         self.tag_dict = {tag: code for code, tag in enumerate('IOBES')}
+
+     def _generate_predicate_id_dictionary(self):
+         """
+         Generate a tag dictionary for identifying predicates.
+         It has two tags: 'V' for predicates and '-' for others.
+         """
+         self.tag_dict = {'-': 0, 'V': 1}
+         #self.tag_dict = {'O': 0, 'V': 1}
+
+     def generate_dictionary(self, dict_size=None, minimum_occurrences=2):
+         """
+         Generates a token dictionary based on the given sentences.
+
+         :param dict_size: Max number of tokens to be included in the dictionary.
+         :param minimum_occurrences: Minimum number of times that a token must
+             appear in the text in order to be included in the dictionary.
+         """
+         logger = logging.getLogger("Logger")
+         all_tokens = [token.word
+                       for tokens, _ in self.sentences
+                       for token in tokens]
+         self.word_dict = WordDictionary(all_tokens, dict_size, minimum_occurrences)
+         logger.info("Created dictionary with %d tokens" % self.word_dict.num_tokens)
+
+     def _clean_text(self):
+         """
+         Cleans the sentence text, replacing numbers with a keyword,
+         collapsing different kinds of quotation marks into a single one, etc.
+         """
+         for sent, _ in self.sentences:
+             for i, token in enumerate(sent):
+                 new_word = utils.clean_text(token.word, correct=False)
+                 new_lemma = utils.clean_text(token.lemma, correct=False)
+                 token.word = new_word
+                 token.lemma = new_lemma
+                 sent[i] = token
+
+     def create_converter(self):
+         """
+         This function overrides the TextReader's one in order to deal with
+         Token objects instead of raw strings.
+         """
+         self.converter = attributes.TokenConverter()
+
+         if self.md.use_lemma:
+             # look up word lemmas
+             word_lookup = lambda t: self.word_dict.get(t.lemma)
+         else:
+             # look up the word itself
+             word_lookup = lambda t: self.word_dict.get(t.word)
+
+         self.converter.add_extractor(word_lookup)
+
+         #if self.md.use_caps:
+         #    caps_lookup = lambda t: attributes.get_capitalization(t.word)
+         #    self.converter.add_extractor(caps_lookup)
+
+         if self.md.use_pos:
+             with open(self.md.paths['pos_tag_dict']) as f:
+                 #pos_dict = cPickle.load(f)
+                 buf = f.readlines()
+                 for i, line in enumerate(buf):
+                     line = line.strip()
+                     self.pos_dict[line] = i
+
+             pos_def_dict = defaultdict(lambda: self.pos_dict['NN'])
+             pos_def_dict.update(self.pos_dict)
+             pos_lookup = lambda t: pos_def_dict[t.pos_t]
+             self.converter.add_extractor(pos_lookup)
+
+         #if self.md.use_chunk:
+         #    with open(self.md.paths['chunk_tag_dict']) as f:
+         #        chunk_dict = cPickle.load(f)
+
+         #    chunk_def_dict = defaultdict(lambda: chunk_dict['O'])
+         #    chunk_def_dict.update(chunk_dict)
+         #    chunk_lookup = lambda t: chunk_def_dict[t.chunk]
+         #    self.converter.add_extractor(chunk_lookup)
+
+     def get_num_pos_tags(self):
+         return len(self.pos_dict)
+
+     def generate_tag_dict(self):
+         """
+         Generates a tag dictionary that converts the tag itself
+         to an index to be used in the neural network.
+         """
+         self.tagset = set(tag
+                           for _, props in self.sentences
+                           for prop in props
+                           for _, tag in prop)
+
+         self.tag_dict = dict(zip(self.tagset,
+                                  range(len(self.tagset))))
+
+     def _remove_tag_names(self):
+         """Removes the actual tag names, leaving only IOB or IOBES block delimiters."""
+         for _, propositions in self.sentences:
+             for tags in propositions:
+                 for i, (_, tag) in enumerate(tags):
+                     tags[i] = tag[0]
+
+     def _codify_sentences(self):
+         """Internal helper function."""
+         new_sentences = []
+         self.tags = []
+
+         for (sent, props) in self.sentences:
+             new_sent = []
+             sentence_tags = []
+
+             for token in sent:
+                 new_token = self.converter.convert(token)
+                 new_sent.append(new_token)
+
+             for prop in props:
+                 sentence_tags.append(prop)
+
+             new_sentences.append(np.array(new_sent))
+             self.tags.append(sentence_tags)
+         #print(new_sentences, flush=True)
+         #print(self.tags, flush=True)
+
+         self.sentences = new_sentences
+         self.codified = True
+
+     def codify_sentences(self):
+         """
+         Converts each token in each sequence into indices to their feature
+         vectors in feature matrices. The previous sentences as text are not
+         accessible anymore. Tags are also encoded. This function takes care
+         of the case of classifying pre-delimited arguments.
+         """
+         if self.converter is None:
+             self.create_converter()
+
+         self._codify_sentences()
+         self.arg_limits = []
+
+         for i, propositions in enumerate(self.tags):
+             new_sent_tags = []
+             sent_args = []
+
+             for j, (rel, prop_tags) in enumerate(propositions):
+                 new_prop_tags = []
+                 prop_args = []
+
+                 #if prop_tags != '-' and j == rel:
+                 if prop_tags != '-':
+                     prop_args.append(np.array([j, j + 1]))
+                     new_prop_tags.append(self.tag_dict[prop_tags])
+
+                 sent_args.append(np.array(prop_args))
+                 new_sent_tags.append(np.array(new_prop_tags))
+
+             self.arg_limits.append(sent_args)
+             self.tags[i] = new_sent_tags
+
+
+     def convert_tags(self, scheme, update_tag_dict=True, only_boundaries=False):
+         """
+         Replaces each word label with an IOB or IOBES version, prepending a
+         prefix to it.
+
+         :param scheme: IOB or IOBES (Inside, Outside, Begin, End, Single).
+         :param update_tag_dict: whether or not to update the tag dictionary
+             after converting the tags.
+         :param only_boundaries: if True, only leaves the IOBES tags and
+             removes the actual tag names. Also avoids updating the tag dict.
+         """
+         scheme = scheme.lower()
+         if scheme not in ('iob', 'iobes'):
+             raise ValueError("Unknown tagging scheme: %s" % scheme)
+
+         for _, props in self.sentences:
+             for prop in props:
+
+                 last_tag = None
+                 for i, tag in enumerate(prop):
+
+                     if tag == 'O':
+                         # the O tag is independent of the IOBES boundaries
+                         last_tag = tag
+                         continue
+
+                     try:
+                         next_tag = prop[i + 1]
+                     except IndexError:
+                         # last word already
+                         next_tag = None
+
+                     if tag != last_tag:
+                         # a new block starts here
+                         last_tag = tag
+                         if scheme == 'iob' or next_tag == tag:
+                             prop[i] = 'B-%s' % tag
+                         else:
+                             prop[i] = 'S-%s' % tag
+                     else:
+                         # the block continues
+                         if scheme == 'iob' or next_tag == tag:
+                             prop[i] = 'I-%s' % tag
+                         else:
+                             prop[i] = 'E-%s' % tag
+
+         if only_boundaries:
+             self._remove_tag_names()
+         elif update_tag_dict:
+             self.generate_tag_dict()
+         else:
+             # treat any tag not appearing in the tag dictionary as O
+             actual_tagset = {tag for _, props in self.sentences for prop in props for tag in prop}
+             for tag in actual_tagset:
+                 if tag not in self.tag_dict:
+                     self.tag_dict[tag] = self.tag_dict[self.rare_tag]
+
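For orientation, the IOB/IOBES conversion implemented by convert_tags above turns each run of identical role labels into prefixed boundary tags. Below is a minimal standalone sketch of the same scheme (plain Python, independent of the reader classes; the labels A0/V/A1 are illustrative only):

    def to_iobes(labels):
        """Encode a flat label sequence in IOBES (mirrors convert_tags)."""
        out = []
        for i, tag in enumerate(labels):
            if tag == 'O':
                out.append('O')
                continue
            starts = i == 0 or labels[i - 1] != tag
            ends = i + 1 == len(labels) or labels[i + 1] != tag
            if starts and ends:
                out.append('S-' + tag)   # single-token span
            elif starts:
                out.append('B-' + tag)   # block begins
            elif ends:
                out.append('E-' + tag)   # block ends
            else:
                out.append('I-' + tag)   # inside a block
        return out

    print(to_iobes(['A0', 'A0', 'O', 'V', 'A1']))
    # -> ['B-A0', 'E-A0', 'O', 'S-V', 'S-A1']

Runs of identical labels become B-/I-/E- blocks, single-token spans become S-, and 'O' passes through unchanged, matching the branches of convert_tags (with scheme='iob', S- and E- collapse to B- and I-).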
nltkor/tag/libs/srl/train_srl.py
@@ -0,0 +1,87 @@
+ # -*- coding: utf-8 -*-
+
+ """
+ Auxiliary functions for SRL training.
+ """
+
+ import re
+ import numpy as np
+
+
+ def init_transitions_simplified(tag_dict):
+     """
+     This function initializes a tag transition table containing only
+     the IOBES boundaries.
+     """
+     tags = sorted(tag_dict, key=tag_dict.get)
+     transitions = []
+
+     for tag in tags:
+         if tag in 'OES':
+             trans = lambda x: 0 if x in 'BOS' else -1000
+         elif tag in 'IB':
+             trans = lambda x: 0 if x in 'IE' else -1000
+         else:
+             raise ValueError('Unexpected tag: %s' % tag)
+
+         transitions.append([trans(next_tag) for next_tag in tags])
+
+     # initial transition
+     trans = lambda x: 0 if x in 'BOS' else -1000
+     transitions.append([trans(next_tag) for next_tag in tags])
+
+     return np.array(transitions, float)
+
+
+ def init_transitions(tag_dict, scheme):
37
+ """
38
+ This function initializes the tag transition table setting
39
+ very low values for impossible transitions.
40
+
41
+ :param tag_dict: The tag dictionary mapping tag names to the
42
+ network output number.
43
+ :param scheme: either iob or iobes.
44
+ """
45
+ scheme = scheme.lower()
46
+ assert scheme in ('iob', 'iobes'), 'Unknown tagging scheme: %s' % scheme
47
+ transitions = []
48
+
49
+ # since dict's are unordered, let's take the tags in the correct order
50
+ tags = sorted(tag_dict, key=tag_dict.get)
51
+
52
+ # transitions between tags
53
+ for tag in tags:
54
+
55
+ if tag == 'O':
56
+ # next tag can be O, V or any B
57
+ trans = lambda x: 0 if re.match('B|S|V', x) \
58
+ else -1 if x == 'O' else -1000
59
+
60
+ elif tag[0] in 'IB':
61
+ block = tag[2:]
62
+ if scheme == 'iobes':
63
+ # next tag can be I or E (same block)
64
+ trans = lambda x: 0 if re.match('(I|E)-%s' % block, x) else -1000
65
+ else:
66
+ # next tag can be O, I (same block) or B (new block)
67
+ trans = lambda x: 0 if re.match('I-%s' % block, x) or re.match('B-(?!%s)' % block, x) \
68
+ else -1 if x == 'O' else -1000
69
+
70
+ elif tag[0] in 'ES':
71
+ # next tag can be O, S (new block) or B (new block)
72
+ block = tag[2:]
73
+ trans = lambda x: 0 if re.match('(S|B)-(?!%s)' % block, x) \
74
+ else -1 if x == 'O' else -1000
75
+
76
+ else:
77
+ raise ValueError('Unknown tag: %s' % tag)
78
+
79
+ transitions.append([trans(next_tag) for next_tag in tags])
80
+
81
+ # starting tag
82
+ # it can be O or any B/S
83
+ trans = lambda x: 0 if x[0] in 'OBS' else -1000
84
+ transitions.append([trans(next_tag) for next_tag in tags])
85
+
86
+ return np.array(transitions, np.float)
87
+
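For intuition about the tables built above: each function returns an (N+1) x N matrix over the N tags, where row i scores transitions out of tag i and the extra final row scores the choice of the first tag (0 = allowed, -1000 = effectively forbidden; init_transitions additionally uses -1 to discourage a transition into 'O'). A small usage sketch of init_transitions_simplified, assuming the single-character tag dictionary produced by _generate_iobes_dictionary in srl_reader.py and that the import path matches the wheel layout listed above:

    from nltkor.tag.libs.srl.train_srl import init_transitions_simplified

    # the same {'I': 0, 'O': 1, 'B': 2, 'E': 3, 'S': 4} mapping
    # that _generate_iobes_dictionary builds
    tag_dict = {tag: code for code, tag in enumerate('IOBES')}
    table = init_transitions_simplified(tag_dict)

    print(table.shape)  # (6, 5): one row per tag, plus the initial-transition row
    print(table[1])     # row for 'O': only B, O, or S may follow
    # [-1000.     0.     0. -1000.     0.]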