nltkor 1.2.14__cp311-cp311-macosx_13_0_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127)
  1. nltkor/Kor_char.py +193 -0
  2. nltkor/__init__.py +16 -0
  3. nltkor/alignment/__init__.py +1315 -0
  4. nltkor/cider/__init__.py +2 -0
  5. nltkor/cider/cider.py +55 -0
  6. nltkor/cider/cider_scorer.py +207 -0
  7. nltkor/distance/__init__.py +441 -0
  8. nltkor/distance/wasserstein.py +126 -0
  9. nltkor/etc.py +22 -0
  10. nltkor/lazyimport.py +144 -0
  11. nltkor/make_requirement.py +11 -0
  12. nltkor/metrics/__init__.py +63 -0
  13. nltkor/metrics/bartscore.py +301 -0
  14. nltkor/metrics/bertscore.py +331 -0
  15. nltkor/metrics/bleu_tensor.py +20 -0
  16. nltkor/metrics/classical.py +847 -0
  17. nltkor/metrics/entment.py +24 -0
  18. nltkor/metrics/eval.py +517 -0
  19. nltkor/metrics/mauve.py +273 -0
  20. nltkor/metrics/mauve_utils.py +131 -0
  21. nltkor/misc/__init__.py +11 -0
  22. nltkor/misc/string2string_basic_functions.py +59 -0
  23. nltkor/misc/string2string_default_tokenizer.py +83 -0
  24. nltkor/misc/string2string_hash_functions.py +159 -0
  25. nltkor/misc/string2string_word_embeddings.py +503 -0
  26. nltkor/search/__init__.py +10 -0
  27. nltkor/search/classical.py +569 -0
  28. nltkor/search/faiss_search.py +787 -0
  29. nltkor/search/kobert_tokenizer.py +181 -0
  30. nltkor/sejong/__init__.py +3 -0
  31. nltkor/sejong/__pycache__/__init__.cpython-38.pyc +0 -0
  32. nltkor/sejong/__pycache__/__init__.cpython-39.pyc +0 -0
  33. nltkor/sejong/__pycache__/sejong_download.cpython-38.pyc +0 -0
  34. nltkor/sejong/__pycache__/sejong_download.cpython-39.pyc +0 -0
  35. nltkor/sejong/__pycache__/ssem.cpython-38.pyc +0 -0
  36. nltkor/sejong/__pycache__/ssem.cpython-39.pyc +0 -0
  37. nltkor/sejong/ch.py +12 -0
  38. nltkor/sejong/dict_semClassNum.txt +491 -0
  39. nltkor/sejong/layer.txt +630 -0
  40. nltkor/sejong/sejong_download.py +87 -0
  41. nltkor/sejong/ssem.py +684 -0
  42. nltkor/similarity/__init__.py +3 -0
  43. nltkor/similarity/bartscore____.py +337 -0
  44. nltkor/similarity/bertscore____.py +339 -0
  45. nltkor/similarity/classical.py +245 -0
  46. nltkor/similarity/cosine_similarity.py +175 -0
  47. nltkor/tag/__init__.py +71 -0
  48. nltkor/tag/__pycache__/__init__.cpython-38.pyc +0 -0
  49. nltkor/tag/__pycache__/__init__.cpython-39.pyc +0 -0
  50. nltkor/tag/__pycache__/espresso_tag.cpython-38.pyc +0 -0
  51. nltkor/tag/__pycache__/espresso_tag.cpython-39.pyc +0 -0
  52. nltkor/tag/espresso_tag.py +220 -0
  53. nltkor/tag/libs/__init__.py +10 -0
  54. nltkor/tag/libs/__pycache__/__init__.cpython-38.pyc +0 -0
  55. nltkor/tag/libs/__pycache__/__init__.cpython-39.pyc +0 -0
  56. nltkor/tag/libs/__pycache__/attributes.cpython-38.pyc +0 -0
  57. nltkor/tag/libs/__pycache__/attributes.cpython-39.pyc +0 -0
  58. nltkor/tag/libs/__pycache__/config.cpython-38.pyc +0 -0
  59. nltkor/tag/libs/__pycache__/config.cpython-39.pyc +0 -0
  60. nltkor/tag/libs/__pycache__/metadata.cpython-38.pyc +0 -0
  61. nltkor/tag/libs/__pycache__/metadata.cpython-39.pyc +0 -0
  62. nltkor/tag/libs/__pycache__/reader.cpython-38.pyc +0 -0
  63. nltkor/tag/libs/__pycache__/reader.cpython-39.pyc +0 -0
  64. nltkor/tag/libs/__pycache__/taggers.cpython-38.pyc +0 -0
  65. nltkor/tag/libs/__pycache__/taggers.cpython-39.pyc +0 -0
  66. nltkor/tag/libs/__pycache__/utils.cpython-38.pyc +0 -0
  67. nltkor/tag/libs/__pycache__/utils.cpython-39.pyc +0 -0
  68. nltkor/tag/libs/__pycache__/word_dictionary.cpython-38.pyc +0 -0
  69. nltkor/tag/libs/__pycache__/word_dictionary.cpython-39.pyc +0 -0
  70. nltkor/tag/libs/arguments.py +280 -0
  71. nltkor/tag/libs/attributes.py +231 -0
  72. nltkor/tag/libs/config.py +159 -0
  73. nltkor/tag/libs/metadata.py +129 -0
  74. nltkor/tag/libs/ner/__init__.py +2 -0
  75. nltkor/tag/libs/ner/__pycache__/__init__.cpython-38.pyc +0 -0
  76. nltkor/tag/libs/ner/__pycache__/__init__.cpython-39.pyc +0 -0
  77. nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-38.pyc +0 -0
  78. nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-39.pyc +0 -0
  79. nltkor/tag/libs/ner/macmorphoreader.py +7 -0
  80. nltkor/tag/libs/ner/ner_reader.py +92 -0
  81. nltkor/tag/libs/network.c +72325 -0
  82. nltkor/tag/libs/network.cpython-311-darwin.so +0 -0
  83. nltkor/tag/libs/network.pyx +878 -0
  84. nltkor/tag/libs/networkconv.pyx +1028 -0
  85. nltkor/tag/libs/networkdependencyconv.pyx +451 -0
  86. nltkor/tag/libs/parse/__init__.py +1 -0
  87. nltkor/tag/libs/parse/__pycache__/__init__.cpython-38.pyc +0 -0
  88. nltkor/tag/libs/parse/__pycache__/__init__.cpython-39.pyc +0 -0
  89. nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-38.pyc +0 -0
  90. nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-39.pyc +0 -0
  91. nltkor/tag/libs/parse/parse_reader.py +283 -0
  92. nltkor/tag/libs/pos/__init__.py +2 -0
  93. nltkor/tag/libs/pos/__pycache__/__init__.cpython-38.pyc +0 -0
  94. nltkor/tag/libs/pos/__pycache__/__init__.cpython-39.pyc +0 -0
  95. nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-38.pyc +0 -0
  96. nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-39.pyc +0 -0
  97. nltkor/tag/libs/pos/macmorphoreader.py +7 -0
  98. nltkor/tag/libs/pos/pos_reader.py +97 -0
  99. nltkor/tag/libs/reader.py +485 -0
  100. nltkor/tag/libs/srl/__init__.py +3 -0
  101. nltkor/tag/libs/srl/__pycache__/__init__.cpython-38.pyc +0 -0
  102. nltkor/tag/libs/srl/__pycache__/__init__.cpython-39.pyc +0 -0
  103. nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-38.pyc +0 -0
  104. nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-39.pyc +0 -0
  105. nltkor/tag/libs/srl/__pycache__/train_srl.cpython-38.pyc +0 -0
  106. nltkor/tag/libs/srl/__pycache__/train_srl.cpython-39.pyc +0 -0
  107. nltkor/tag/libs/srl/__srl_reader_.py +535 -0
  108. nltkor/tag/libs/srl/srl_reader.py +436 -0
  109. nltkor/tag/libs/srl/train_srl.py +87 -0
  110. nltkor/tag/libs/taggers.py +926 -0
  111. nltkor/tag/libs/utils.py +384 -0
  112. nltkor/tag/libs/word_dictionary.py +239 -0
  113. nltkor/tag/libs/wsd/__init__.py +2 -0
  114. nltkor/tag/libs/wsd/__pycache__/__init__.cpython-38.pyc +0 -0
  115. nltkor/tag/libs/wsd/__pycache__/__init__.cpython-39.pyc +0 -0
  116. nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-38.pyc +0 -0
  117. nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-39.pyc +0 -0
  118. nltkor/tag/libs/wsd/macmorphoreader.py +7 -0
  119. nltkor/tag/libs/wsd/wsd_reader.py +93 -0
  120. nltkor/tokenize/__init__.py +62 -0
  121. nltkor/tokenize/ko_tokenize.py +115 -0
  122. nltkor/trans.py +121 -0
  123. nltkor-1.2.14.dist-info/LICENSE.txt +1093 -0
  124. nltkor-1.2.14.dist-info/METADATA +41 -0
  125. nltkor-1.2.14.dist-info/RECORD +127 -0
  126. nltkor-1.2.14.dist-info/WHEEL +5 -0
  127. nltkor-1.2.14.dist-info/top_level.txt +1 -0
nltkor/tag/libs/srl/__srl_reader_.py
@@ -0,0 +1,535 @@
+ # -*- coding: utf-8 -*-
+
+ """
+ Class for dealing with SRL data.
+ """
+
+ from collections import defaultdict
+ #import cPickle
+ import _pickle
+ import logging
+ import re
+ import os
+ import numpy as np
+ #from itertools import izip
+
+ from .. import attributes
+ from .. import utils
+ from ..word_dictionary import WordDictionary
+ from .. import reader
+
+ class ConllPos(object):
+     """
+     Dummy class for storing the position of each field in a
+     CoNLL data file.
+     """
+     id = 0
+     word = 1
+     lemma = 2
+     pos = 3
+     morph = 4
+     parse = 7
+     pred = 8
+     semantic_role = 9
+
+ class SRLReader(reader.TaggerReader):
+
+     def __init__(self, md=None, filename=None, only_boundaries=False,
+                  only_classify=False, only_predicates=False):
+         """
+         The reader will read sentences from a given file. This file must
+         be in the correct format (one token per line, columns indicating
+         which tokens are predicates and their argument structure).
+
+         :param filename: a file with CoNLL-like format data. If it is None,
+             the reader will be created with no data.
+         :param only_boundaries: train to identify only argument boundaries
+         :param only_classify: train to classify pre-determined arguments
+         :param only_predicates: train to identify only predicates
+         """
+
+         if only_boundaries:
+             self.taskname = 'srl_boundary'
+             self._generate_iobes_dictionary()
+         elif only_classify:
+             self.taskname = 'srl_classify'
+         elif only_predicates:
+             self.taskname = 'srl_predicates'
+             self._generate_predicate_id_dictionary()
+         else:
+             self.taskname = 'srl'
+
+         self.rare_tag = 'O'
+         if filename is not None:
+             self._read_conll(filename)
+             self._clean_text()
+
+         super(SRLReader, self).__init__(md)
+
+
+     @property
+     def task(self):
+         """
+         Abstract Base Class (ABC) attribute.
+         """
+         return self.taskname
+
+
+     def _read_conll(self, filename):
+         '''
+         Reads a file in CoNLL format and extracts semantic role tags
+         for each token.
+         '''
+         lines = []
+         with open(filename, 'rb') as f:
+             for line in f:
+                 line = line.decode('utf-8').strip()
+                 lines.append(line)
+
+         self.sentences = []
+         self.predicates = []
+         tokens = []
+         sent_predicates = []
+         sent_tags = []
+         token_number = 0
+
+         for line in lines:
+             line = line.strip()
+
+             if line == '':
+                 # blank line between sentences
+                 if len(tokens) > 0:
+                     sentence = (tokens, sent_tags)
+                     self.sentences.append(sentence)
+                     self.predicates.append(np.array(sent_predicates))
+                     tokens = []
+                     sent_predicates = []
+                     sent_tags = []
+                     token_number = 0
+
+                 continue
+
+             fields = line.split()
+             word = fields[ConllPos.word]
+             lemma = fields[ConllPos.lemma]
+             pos = fields[ConllPos.pos].lower()
+             is_predicate = fields[ConllPos.pred] != '-'
+             tags = fields[ConllPos.semantic_role:]
+
+             # if this is the first token in the sentence, find out how many
+             # predicates there are and initialize a list for each of them.
+             if sent_tags == []:
+                 expected_roles = []
+                 for tag in tags:
+                     tag, expected_role = self._read_role(tag, 'O', True)
+                     sent_tags.append([tag])
+                     expected_roles.append(expected_role)
+             else:
+                 for i, tag in enumerate(tags):
+                     expected_role = expected_roles[i]
+                     tag, expected_role = self._read_role(tag, expected_role, True)
+                     sent_tags[i].append(tag)
+                     expected_roles[i] = expected_role
+
+             token = attributes.Token(word, lemma, pos)
+             tokens.append(token)
+             if is_predicate:
+                 sent_predicates.append(token_number)
+
+             token_number += 1
+
+         if len(tokens) > 0:
+             # last sentence
+             sentence = (tokens, sent_tags)
+             self.sentences.append(sentence)
+             self.predicates.append(np.array(sent_predicates))
+
+     @classmethod
+     def _read_role(cls, role, expected_role, remove_continuation):
+         """
+         Reads the next semantic role from a CoNLL-style file.
+
+         :param role: what is read from the conll file (something like
+             *, (A0* or *)
+         :param expected_role: the expected role if a * is found
+         :param remove_continuation: removes the C- from non-continuous
+             arguments. C-A0 becomes A0.
+         :return: a tuple (role, expected next role)
+         """
+         if role == '*':
+             # signals continuation of the last block
+             role = expected_role
+         elif role == '*)':
+             # finishes block
+             role = expected_role
+             expected_role = 'O'
+         else:
+             # verifies if it is a single argument
+             match = re.search(r'\(([-\w]+)\*\)', role)
+             if match:
+                 role = match.group(1)
+                 expected_role = 'O'
+             else:
+                 # verifies if it opens an argument
+                 match = re.search(r'\(([-\w]+)\*', role)
+                 if match:
+                     role = match.group(1)
+                     expected_role = role
+                 else:
+                     raise ValueError('Unexpected role data: %s' % role)
+
+         if role.startswith('C-') and remove_continuation:
+             # removes C-
+             role = role[2:]
+
+         return (role, expected_role)
+
+     def extend(self, data):
+         """
+         Adds more data to the reader.
+         :param data: a list of tuples in the format (tokens, tags, predicates),
+             one for each sentence.
+         """
+         self.sentences.extend([(sent, tags) for sent, tags, _ in data])
+         self.predicates.extend([np.array(preds) for _, _, preds in data])
+
+     def load_or_create_tag_dict(self):
+         """
+         In the case of SRL argument classification or one step SRL, try to
+         load the tag dictionary. If the file with the tags is not present,
+         a new one is created from the available sentences.
+
+         In the case of argument detection or predicate detection,
+         this function does nothing.
+         """
+         if self.task == 'srl_predicates' or self.task == 'srl_boundary':
+             return
+
+         # only SRL as one step uses IOB tags
+         iob = self.task == 'srl'
+         if os.path.isfile(self.md.paths['srl_tags']):
+             self.load_tag_dict(iob=iob)
+             return
+
+         self._create_tag_dict(iob)
+         logger = logging.getLogger('Logger')
+         logger.info('Created SRL tag dictionary')
+
+     def _create_tag_dict(self, iob=False):
+         """
+         Examine the available sentences and create a tag dictionary.
+
+         :param iob: If True, this function will generate an entry for B-[tag]
+             and one for I-[tag], except for the tag 'O'.
+         """
+         logger = logging.getLogger("Logger")
+         tags = {tag
+                 for _, tag_groups in self.sentences
+                 for tags in tag_groups
+                 for tag in tags}
+
+         # create a dictionary now even if using IOB, in order to save it in
+         # a deterministic order
+         self.tag_dict = {tag: code for code, tag in enumerate(tags)}
+         reader.save_tag_dict(self.md.paths['srl_tags'], self.tag_dict)
+         logger.debug("Saved SRL tag dictionary.")
+         if not iob:
+             return
+
+         # insert I- and B- preserving the ordering
+         new_dict = {}
+         code = 0
+         for tag in sorted(self.tag_dict, key=self.tag_dict.get):
+             if tag == 'O':
+                 new_dict[tag] = code
+             else:
+                 new_dict['B-%s' % tag] = code
+                 code += 1
+                 new_dict['I-%s' % tag] = code
+
+             code += 1
+
+         self.tag_dict = new_dict
+
+     def load_tag_dict(self, filename=None, iob=False):
+         """
+         Loads the tag dictionary from the default file. The dictionary file should
+         have one tag per line.
+
+         :param iob: If True, this function will generate an entry for B-[tag]
+             and one for I-[tag], except for the tag 'O'.
+         """
+         if self.task == 'srl_predicates' or self.task == 'srl_boundary':
+             return
+
+         if filename is None:
+             filename = self.md.paths['srl_tags']
+
+         if not iob:
+             super(SRLReader, self).load_tag_dict(filename)
+             return
+
+         self.tag_dict = {}
+         code = 0
+         with open(filename, 'rb') as f:
+             for tag in f:
+                 tag = tag.decode('utf-8').strip()
+                 if tag == '':
+                     continue
+
+                 if tag == 'O':
+                     self.tag_dict[tag] = code
+                 else:
+                     self.tag_dict['B-%s' % tag] = code
+                     code += 1
+                     self.tag_dict['I-%s' % tag] = code
+
+                 code += 1
+
+         if 'O' not in self.tag_dict:
+             self.tag_dict['O'] = code
+
+     def _generate_iobes_dictionary(self):
+         """
+         Generate the reader's tag dictionary mapping the IOBES tags to numeric codes.
+         """
+         self.tag_dict = {tag: code for code, tag in enumerate('IOBES')}
+
+     def _generate_predicate_id_dictionary(self):
+         """
+         Generate a tag dictionary for identifying predicates.
+         It has two tags: V for predicates and O for others.
+         """
+         self.tag_dict = {'O': 0, 'V': 1}
+
+     def generate_dictionary(self, dict_size=None, minimum_occurrences=2):
+         """
+         Generates a token dictionary based on the given sentences.
+
+         :param dict_size: Max number of tokens to be included in the dictionary.
+         :param minimum_occurrences: Minimum number of times that a token must
+             appear in the text in order to be included in the dictionary.
+         """
+         logger = logging.getLogger("Logger")
+         all_tokens = [token.word
+                       for tokens, _ in self.sentences
+                       for token in tokens]
+         self.word_dict = WordDictionary(all_tokens, dict_size, minimum_occurrences)
+         logger.info("Created dictionary with %d tokens" % self.word_dict.num_tokens)
+
+     def _clean_text(self):
+         """
+         Cleans the sentences' text, replacing numbers with a keyword, normalizing
+         different kinds of quotation marks to a single one, etc.
+         """
+         for sent, _ in self.sentences:
+             for i, token in enumerate(sent):
+                 new_word = utils.clean_text(token.word, correct=False)
+                 new_lemma = utils.clean_text(token.lemma, correct=False)
+                 token.word = new_word
+                 token.lemma = new_lemma
+                 sent[i] = token
+
+     def create_converter(self):
+         """
+         This function overrides the TextReader's one in order to deal with Token
+         objects instead of raw strings.
+         """
+         self.converter = attributes.TokenConverter()
+
+         if self.md.use_lemma:
+             # look up word lemmas
+             word_lookup = lambda t: self.word_dict.get(t.lemma)
+         else:
+             # look up the word itself
+             word_lookup = lambda t: self.word_dict.get(t.word)
+
+         self.converter.add_extractor(word_lookup)
+
+         if self.md.use_caps:
+             caps_lookup = lambda t: attributes.get_capitalization(t.word)
+             self.converter.add_extractor(caps_lookup)
+
+         if self.md.use_pos:
+             with open(self.md.paths['pos_tag_dict'], 'rb') as f:
+                 pos_dict = _pickle.load(f)
+
+             pos_def_dict = defaultdict(lambda: pos_dict['other'])
+             pos_def_dict.update(pos_dict)
+             pos_lookup = lambda t: pos_def_dict[t.pos]
+             self.converter.add_extractor(pos_lookup)
+
+         if self.md.use_chunk:
+             with open(self.md.paths['chunk_tag_dict'], 'rb') as f:
+                 chunk_dict = _pickle.load(f)
+
+             chunk_def_dict = defaultdict(lambda: chunk_dict['O'])
+             chunk_def_dict.update(chunk_dict)
+             chunk_lookup = lambda t: chunk_def_dict[t.chunk]
+             self.converter.add_extractor(chunk_lookup)
+
+     def generate_tag_dict(self):
+         """
+         Generates a tag dictionary that converts the tag itself
+         to an index to be used in the neural network.
+         """
+         self.tagset = set(tag
+                           for _, props in self.sentences
+                           for prop in props
+                           for tag in prop)
+
+         self.tag_dict = dict(zip(self.tagset,
+                                  range(len(self.tagset))))
+
+     def _remove_tag_names(self):
+         """Removes the actual tag names, leaving only IOB or IOBES block delimiters."""
+         for _, propositions in self.sentences:
+             for tags in propositions:
+                 for i, tag in enumerate(tags):
+                     tags[i] = tag[0]
+
+     def _codify_sentences(self):
+         """Internal helper function."""
+         new_sentences = []
+         self.tags = []
+
+         for (sent, props), preds in zip(self.sentences, self.predicates):
+             new_sent = []
+             sentence_tags = []
+
+             for token in sent:
+                 new_token = self.converter.convert(token)
+                 new_sent.append(new_token)
+
+             if self.task == 'srl_predicates':
+                 sentence_tags = np.zeros(len(sent), int)
+                 if len(preds) > 0:
+                     sentence_tags[preds] = 1
+             else:
+                 for prop in props:
+                     # for classifying arguments, leave the names; they will be changed later
+                     if self.task == 'srl_classify':
+                         prop_tags = prop
+                     else:
+                         prop_tags = np.array([self.tag_dict[tag] for tag in prop])
+                     sentence_tags.append(prop_tags)
+
+             new_sentences.append(np.array(new_sent))
+             self.tags.append(sentence_tags)
+
+         self.sentences = new_sentences
+         self.codified = True
+
+     def codify_sentences(self):
+         """
+         Converts each token in each sequence into indices to their feature vectors
+         in feature matrices. The previous sentences as text are not accessible anymore.
+         Tags are also encoded. This function takes care of the case of classifying
+         pre-delimited arguments.
+         """
+         if self.converter is None:
+             self.create_converter()
+
+         self._codify_sentences()
+         self.arg_limits = []
+
+         if self.task == 'srl_classify':
+             # generate the tags for each argument
+             start = 0
+             end = 0
+
+             for i, propositions in enumerate(self.tags):
+                 new_sent_tags = []
+                 sent_args = []
+
+                 for prop_tags in propositions:
+
+                     new_prop_tags = []
+                     prop_args = []
+                     last_tag = 'O'
+
+                     for j, tag in enumerate(prop_tags):
+                         if tag != last_tag:
+                             # if we were inside an argument, it ended;
+                             # we may have started a new one
+                             if last_tag != 'O':
+                                 end = j - 1
+                                 prop_args.append(np.array([start, end]))
+
+                             if tag != 'O':
+                                 start = j
+                                 new_prop_tags.append(self.tag_dict[tag])
+
+                         last_tag = tag
+                     else:
+                         # after the last iteration, check the last tag
+                         if last_tag != 'O':
+                             end = j
+                             prop_args.append(np.array([start, end]))
+
+                     sent_args.append(np.array(prop_args))
+                     new_sent_tags.append(np.array(new_prop_tags))
+
+                 self.arg_limits.append(sent_args)
+                 self.tags[i] = new_sent_tags
+
+
+     def convert_tags(self, scheme, update_tag_dict=True, only_boundaries=False):
+         """
+         Replaces each word label with an IOB or IOBES version, appending a prefix
+         to it.
+
+         :param scheme: IOB or IOBES (Inside, Outside, Begin, End, Single).
+         :param update_tag_dict: whether or not to update the tag dictionary after
+             converting the tags.
+         :param only_boundaries: if True, only the IOBES prefixes are kept and the
+             actual tag names are removed. Also, the tag dictionary is not updated.
+         """
+         scheme = scheme.lower()
+         if scheme not in ('iob', 'iobes'):
+             raise ValueError("Unknown tagging scheme: %s" % scheme)
+
+         for _, props in self.sentences:
+             for prop in props:
+
+                 last_tag = None
+                 for i, tag in enumerate(prop):
+
+                     if tag == 'O':
+                         # O tag is independent from IOBES
+                         last_tag = tag
+                         continue
+
+                     try:
+                         next_tag = prop[i + 1]
+                     except IndexError:
+                         # last word already
+                         next_tag = None
+
+                     if tag != last_tag:
+                         # a new block starts here.
+                         last_tag = tag
+                         if scheme == 'iob' or next_tag == tag:
+                             prop[i] = 'B-%s' % tag
+                         else:
+                             prop[i] = 'S-%s' % tag
+                     else:
+                         # the block continues.
+                         if scheme == 'iob' or next_tag == tag:
+                             prop[i] = 'I-%s' % tag
+                         else:
+                             prop[i] = 'E-%s' % tag
+
+         if only_boundaries:
+             self._remove_tag_names()
+         elif update_tag_dict:
+             self.generate_tag_dict()
+         else:
+             # treat any tag not appearing in the tag dictionary as O
+             actual_tagset = {tag for _, props in self.sentences for prop in props for tag in prop}
+             for tag in actual_tagset:
+                 if tag not in self.tag_dict:
+                     self.tag_dict[tag] = self.tag_dict[self.rare_tag]
+
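
Note on the role notation: _read_role in the file above decodes the CoNLL-2005-style bracket notation used in the semantic-role columns, where (A0* opens an argument, * continues the current span, *) closes it, and (V*) marks a single-token span. The snippet below is a minimal standalone sketch of that decoding; it does not import the package, and the helper name read_role is illustrative, shown only to make the expected per-column input and output concrete.

import re

def read_role(role, expected_role, remove_continuation=True):
    """Turn one bracket-notation cell into (tag for this token, tag expected next)."""
    if role == '*':                                    # continuation of the current block
        role = expected_role
    elif role == '*)':                                 # closes the current block
        role, expected_role = expected_role, 'O'
    else:
        single = re.search(r'\(([-\w]+)\*\)', role)    # e.g. (V*)
        opening = re.search(r'\(([-\w]+)\*', role)     # e.g. (A0*
        if single:
            role, expected_role = single.group(1), 'O'
        elif opening:
            role = expected_role = opening.group(1)
        else:
            raise ValueError('Unexpected role data: %s' % role)
    if remove_continuation and role.startswith('C-'):
        role = role[2:]                                # C-A0 -> A0
    return role, expected_role

# One predicate's column for a five-token sentence.
column = ['(A0*', '*)', '(V*)', '(A1*', '*)']
tags, expected = [], 'O'
for cell in column:
    tag, expected = read_role(cell, expected)
    tags.append(tag)
print(tags)   # ['A0', 'A0', 'V', 'A1', 'A1']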
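Similarly, convert_tags rewrites the flat per-token labels produced by that step into IOB or IOBES form. The short standalone sketch below (the function name to_scheme is illustrative, not part of the package) shows the intended mapping on a small example.

def to_scheme(tags, scheme='iobes'):
    """Rewrite flat SRL tags into IOB or IOBES prefixes."""
    out, last = [], None
    for i, tag in enumerate(tags):
        if tag == 'O':                                 # O is left untouched
            out.append(tag)
            last = tag
            continue
        nxt = tags[i + 1] if i + 1 < len(tags) else None
        if tag != last:                                # a new block starts here
            prefix = 'B' if scheme == 'iob' or nxt == tag else 'S'
        else:                                          # the block continues
            prefix = 'I' if scheme == 'iob' or nxt == tag else 'E'
        out.append('%s-%s' % (prefix, tag))
        last = tag
    return out

print(to_scheme(['A0', 'A0', 'O', 'V', 'A1', 'A1']))
# ['B-A0', 'E-A0', 'O', 'S-V', 'B-A1', 'E-A1']
print(to_scheme(['A0', 'A0', 'O', 'V', 'A1', 'A1'], scheme='iob'))
# ['B-A0', 'I-A0', 'O', 'B-V', 'B-A1', 'I-A1']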