nltkor-1.2.14-cp311-cp311-macosx_13_0_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nltkor/Kor_char.py +193 -0
- nltkor/__init__.py +16 -0
- nltkor/alignment/__init__.py +1315 -0
- nltkor/cider/__init__.py +2 -0
- nltkor/cider/cider.py +55 -0
- nltkor/cider/cider_scorer.py +207 -0
- nltkor/distance/__init__.py +441 -0
- nltkor/distance/wasserstein.py +126 -0
- nltkor/etc.py +22 -0
- nltkor/lazyimport.py +144 -0
- nltkor/make_requirement.py +11 -0
- nltkor/metrics/__init__.py +63 -0
- nltkor/metrics/bartscore.py +301 -0
- nltkor/metrics/bertscore.py +331 -0
- nltkor/metrics/bleu_tensor.py +20 -0
- nltkor/metrics/classical.py +847 -0
- nltkor/metrics/entment.py +24 -0
- nltkor/metrics/eval.py +517 -0
- nltkor/metrics/mauve.py +273 -0
- nltkor/metrics/mauve_utils.py +131 -0
- nltkor/misc/__init__.py +11 -0
- nltkor/misc/string2string_basic_functions.py +59 -0
- nltkor/misc/string2string_default_tokenizer.py +83 -0
- nltkor/misc/string2string_hash_functions.py +159 -0
- nltkor/misc/string2string_word_embeddings.py +503 -0
- nltkor/search/__init__.py +10 -0
- nltkor/search/classical.py +569 -0
- nltkor/search/faiss_search.py +787 -0
- nltkor/search/kobert_tokenizer.py +181 -0
- nltkor/sejong/__init__.py +3 -0
- nltkor/sejong/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/sejong/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/sejong/__pycache__/sejong_download.cpython-38.pyc +0 -0
- nltkor/sejong/__pycache__/sejong_download.cpython-39.pyc +0 -0
- nltkor/sejong/__pycache__/ssem.cpython-38.pyc +0 -0
- nltkor/sejong/__pycache__/ssem.cpython-39.pyc +0 -0
- nltkor/sejong/ch.py +12 -0
- nltkor/sejong/dict_semClassNum.txt +491 -0
- nltkor/sejong/layer.txt +630 -0
- nltkor/sejong/sejong_download.py +87 -0
- nltkor/sejong/ssem.py +684 -0
- nltkor/similarity/__init__.py +3 -0
- nltkor/similarity/bartscore____.py +337 -0
- nltkor/similarity/bertscore____.py +339 -0
- nltkor/similarity/classical.py +245 -0
- nltkor/similarity/cosine_similarity.py +175 -0
- nltkor/tag/__init__.py +71 -0
- nltkor/tag/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/__pycache__/espresso_tag.cpython-38.pyc +0 -0
- nltkor/tag/__pycache__/espresso_tag.cpython-39.pyc +0 -0
- nltkor/tag/espresso_tag.py +220 -0
- nltkor/tag/libs/__init__.py +10 -0
- nltkor/tag/libs/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/attributes.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/attributes.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/config.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/config.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/metadata.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/metadata.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/taggers.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/taggers.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/utils.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/utils.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/word_dictionary.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/word_dictionary.cpython-39.pyc +0 -0
- nltkor/tag/libs/arguments.py +280 -0
- nltkor/tag/libs/attributes.py +231 -0
- nltkor/tag/libs/config.py +159 -0
- nltkor/tag/libs/metadata.py +129 -0
- nltkor/tag/libs/ner/__init__.py +2 -0
- nltkor/tag/libs/ner/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/ner/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/ner/macmorphoreader.py +7 -0
- nltkor/tag/libs/ner/ner_reader.py +92 -0
- nltkor/tag/libs/network.c +72325 -0
- nltkor/tag/libs/network.cpython-311-darwin.so +0 -0
- nltkor/tag/libs/network.pyx +878 -0
- nltkor/tag/libs/networkconv.pyx +1028 -0
- nltkor/tag/libs/networkdependencyconv.pyx +451 -0
- nltkor/tag/libs/parse/__init__.py +1 -0
- nltkor/tag/libs/parse/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/parse/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/parse/parse_reader.py +283 -0
- nltkor/tag/libs/pos/__init__.py +2 -0
- nltkor/tag/libs/pos/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/pos/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/pos/macmorphoreader.py +7 -0
- nltkor/tag/libs/pos/pos_reader.py +97 -0
- nltkor/tag/libs/reader.py +485 -0
- nltkor/tag/libs/srl/__init__.py +3 -0
- nltkor/tag/libs/srl/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/train_srl.cpython-38.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/train_srl.cpython-39.pyc +0 -0
- nltkor/tag/libs/srl/__srl_reader_.py +535 -0
- nltkor/tag/libs/srl/srl_reader.py +436 -0
- nltkor/tag/libs/srl/train_srl.py +87 -0
- nltkor/tag/libs/taggers.py +926 -0
- nltkor/tag/libs/utils.py +384 -0
- nltkor/tag/libs/word_dictionary.py +239 -0
- nltkor/tag/libs/wsd/__init__.py +2 -0
- nltkor/tag/libs/wsd/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/wsd/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/wsd/macmorphoreader.py +7 -0
- nltkor/tag/libs/wsd/wsd_reader.py +93 -0
- nltkor/tokenize/__init__.py +62 -0
- nltkor/tokenize/ko_tokenize.py +115 -0
- nltkor/trans.py +121 -0
- nltkor-1.2.14.dist-info/LICENSE.txt +1093 -0
- nltkor-1.2.14.dist-info/METADATA +41 -0
- nltkor-1.2.14.dist-info/RECORD +127 -0
- nltkor-1.2.14.dist-info/WHEEL +5 -0
- nltkor-1.2.14.dist-info/top_level.txt +1 -0
nltkor/tag/libs/srl/__srl_reader_.py
@@ -0,0 +1,535 @@
# -*- coding: utf-8 -*-

"""
Class for dealing with SRL data.
"""

from collections import defaultdict
import pickle
import logging
import re
import os
import numpy as np

from .. import attributes
from .. import utils
from ..word_dictionary import WordDictionary
from .. import reader

class ConllPos(object):
    """
    Dummy class for storing the position of each field in a
    CoNLL data file.
    """
    id = 0
    word = 1
    lemma = 2
    pos = 3
    morph = 4
    parse = 7
    pred = 8
    semantic_role = 9

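# Illustrative only: a hypothetical whitespace-separated CoNLL-style row and how
# the ConllPos indices above would slice it (the column values are made up; the
# actual corpus layout may differ):
#
#   fields = "1 먹었다 먹다 VV _ _ _ (S*) 먹다 (V*)".split()
#   fields[ConllPos.word]            -> '먹었다'
#   fields[ConllPos.lemma]           -> '먹다'
#   fields[ConllPos.pred]            -> '먹다'    # '-' would mean "not a predicate"
#   fields[ConllPos.semantic_role:]  -> ['(V*)']  # one role column per predicate
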
class SRLReader(reader.TaggerReader):

    def __init__(self, md=None, filename=None, only_boundaries=False,
                 only_classify=False, only_predicates=False):
        """
        The reader will read sentences from a given file. This file must
        be in the correct format (one token per line, columns indicating
        which tokens are predicates and their argument structure).

        :param filename: a file with CoNLL-like format data. If it is None,
            the reader will be created with no data.
        :param only_boundaries: train to identify only argument boundaries
        :param only_classify: train to classify pre-determined arguments
        :param only_predicates: train to identify only predicates
        """

        if only_boundaries:
            self.taskname = 'srl_boundary'
            self._generate_iobes_dictionary()
        elif only_classify:
            self.taskname = 'srl_classify'
        elif only_predicates:
            self.taskname = 'srl_predicates'
            self._generate_predicate_id_dictionary()
        else:
            self.taskname = 'srl'

        self.rare_tag = 'O'
        if filename is not None:
            self._read_conll(filename)
            self._clean_text()

        super(SRLReader, self).__init__(md)

    @property
    def task(self):
        """
        Abstract Base Class (ABC) attribute.
        """
        return self.taskname

    def _read_conll(self, filename):
        '''
        Read a file in CoNLL format and extract the semantic role tags
        for each token.
        '''
        lines = []
        with open(filename, 'r', encoding='utf-8') as f:
            for line in f:
                lines.append(line.strip())

        self.sentences = []
        self.predicates = []
        tokens = []
        sent_predicates = []
        sent_tags = []
        token_number = 0

        for line in lines:
            line = line.strip()

            if line == '':
                # blank line between sentences
                if len(tokens) > 0:
                    sentence = (tokens, sent_tags)
                    self.sentences.append(sentence)
                    self.predicates.append(np.array(sent_predicates))
                    tokens = []
                    sent_predicates = []
                    sent_tags = []
                    token_number = 0

                continue

            fields = line.split()
            word = fields[ConllPos.word]
            lemma = fields[ConllPos.lemma]
            pos = fields[ConllPos.pos].lower()
            is_predicate = fields[ConllPos.pred] != '-'
            tags = fields[ConllPos.semantic_role:]

            # if this is the first token in the sentence, find out how many
            # predicates there are and initialize a tag list for each of them.
            if sent_tags == []:
                expected_roles = []
                for tag in tags:
                    tag, expected_role = self._read_role(tag, 'O', True)
                    sent_tags.append([tag])
                    expected_roles.append(expected_role)
            else:
                for i, tag in enumerate(tags):
                    expected_role = expected_roles[i]
                    tag, expected_role = self._read_role(tag, expected_role, True)
                    sent_tags[i].append(tag)
                    expected_roles[i] = expected_role

            token = attributes.Token(word, lemma, pos)
            tokens.append(token)
            if is_predicate:
                sent_predicates.append(token_number)

            token_number += 1

        if len(tokens) > 0:
            # last sentence
            sentence = (tokens, sent_tags)
            self.sentences.append(sentence)
            self.predicates.append(np.array(sent_predicates))

    @classmethod
    def _read_role(cls, role, expected_role, remove_continuation):
        """
        Reads the next semantic role from a CoNLL-style file.

        :param role: what is read from the conll file (something like
            *, (A0* or *)
        :param expected_role: the expected role if a * is found
        :param remove_continuation: removes the C- from non-continuous
            arguments. C-A0 becomes A0.
        :return: a tuple (role, expected next role)
        """
        if role == '*':
            # signals continuation of the last block
            role = expected_role
        elif role == '*)':
            # finishes block
            role = expected_role
            expected_role = 'O'
        else:
            # verifies if it is a single argument
            match = re.search(r'\(([-\w]+)\*\)', role)
            if match:
                role = match.group(1)
                expected_role = 'O'
            else:
                # verifies if it opens an argument
                match = re.search(r'\(([-\w]+)\*', role)
                if match:
                    role = match.group(1)
                    expected_role = role
                else:
                    raise ValueError('Unexpected role data: %s' % role)

        if role.startswith('C-') and remove_continuation:
            # removes C-
            role = role[2:]

        return (role, expected_role)

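    # A rough sketch of the bracket notation handled above (values traced from the
    # logic in _read_role, not taken from corpus data):
    #
    #   _read_role('(A0*',    'O',  True) -> ('A0', 'A0')  # opens an A0 block
    #   _read_role('*',       'A0', True) -> ('A0', 'A0')  # continues the open block
    #   _read_role('*)',      'A0', True) -> ('A0', 'O')   # closes the open block
    #   _read_role('(A1*)',   'O',  True) -> ('A1', 'O')   # single-token argument
    #   _read_role('(C-A0*)', 'O',  True) -> ('A0', 'O')   # C- prefix stripped
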
    def extend(self, data):
        """
        Adds more data to the reader.
        :param data: a list of tuples in the format (tokens, tags, predicates),
            one for each sentence.
        """
        self.sentences.extend([(sent, tags) for sent, tags, _ in data])
        self.predicates.extend([np.array(preds) for _, _, preds in data])

    def load_or_create_tag_dict(self):
        """
        In the case of SRL argument classification or one-step SRL, try to
        load the tag dictionary. If the file with the tags is not present,
        a new one is created from the available sentences.

        In the case of argument detection or predicate detection,
        this function does nothing.
        """
        if self.task == 'srl_predicates' or self.task == 'srl_boundary':
            return

        # only SRL as one step uses IOB tags
        iob = self.task == 'srl'
        if os.path.isfile(self.md.paths['srl_tags']):
            self.load_tag_dict(iob=iob)
            return

        self._create_tag_dict(iob)
        logger = logging.getLogger('Logger')
        logger.info('Created SRL tag dictionary')

    def _create_tag_dict(self, iob=False):
        """
        Examine the available sentences and create a tag dictionary.

        :param iob: If True, this function will generate an entry for B-[tag]
            and one for I-[tag], except for the tag 'O'.
        """
        logger = logging.getLogger("Logger")
        tags = {tag
                for _, tag_groups in self.sentences
                for prop_tags in tag_groups
                for tag in prop_tags}

        # create a dictionary now even if using IOB, in order to save it in
        # a deterministic order
        self.tag_dict = {tag: code for code, tag in enumerate(tags)}
        reader.save_tag_dict(self.md.paths['srl_tags'], self.tag_dict)
        logger.debug("Saved SRL tag dictionary.")
        if not iob:
            return

        # insert I- and B- preserving the ordering
        new_dict = {}
        code = 0
        for tag in sorted(self.tag_dict, key=self.tag_dict.get):
            if tag == 'O':
                new_dict[tag] = code
            else:
                new_dict['B-%s' % tag] = code
                code += 1
                new_dict['I-%s' % tag] = code

            code += 1

        self.tag_dict = new_dict

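    # Worked example of the IOB expansion above, assuming a toy tag set (not
    # actual corpus tags): if the flat dictionary were {'A0': 0, 'O': 1}, the
    # loop rewrites it as {'B-A0': 0, 'I-A0': 1, 'O': 2}, keeping the original
    # ordering while giving every non-'O' tag a B-/I- pair.
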
    def load_tag_dict(self, filename=None, iob=False):
        """
        Loads the tag dictionary from the default file. The dictionary file should
        have one tag per line.

        :param iob: If True, this function will generate an entry for B-[tag]
            and one for I-[tag], except for the tag 'O'.
        """
        if self.task == 'srl_predicates' or self.task == 'srl_boundary':
            return

        if filename is None:
            filename = self.md.paths['srl_tags']

        if not iob:
            super(SRLReader, self).load_tag_dict(filename)
            return

        self.tag_dict = {}
        code = 0
        with open(filename, 'r', encoding='utf-8') as f:
            for tag in f:
                tag = tag.strip()
                if tag == '':
                    continue

                if tag == 'O':
                    self.tag_dict[tag] = code
                else:
                    self.tag_dict['B-%s' % tag] = code
                    code += 1
                    self.tag_dict['I-%s' % tag] = code

                code += 1

        if 'O' not in self.tag_dict:
            self.tag_dict['O'] = code

    def _generate_iobes_dictionary(self):
        """
        Generate the reader's tag dictionary mapping the IOBES tags to numeric codes.
        """
        self.tag_dict = {tag: code for code, tag in enumerate('IOBES')}

    def _generate_predicate_id_dictionary(self):
        """
        Generate a tag dictionary for identifying predicates.
        It has two tags: V for predicates and O for others.
        """
        self.tag_dict = {'O': 0, 'V': 1}

    def generate_dictionary(self, dict_size=None, minimum_occurrences=2):
        """
        Generates a token dictionary based on the given sentences.

        :param dict_size: Max number of tokens to be included in the dictionary.
        :param minimum_occurrences: Minimum number of times that a token must
            appear in the text in order to be included in the dictionary.
        """
        logger = logging.getLogger("Logger")
        all_tokens = [token.word
                      for tokens, _ in self.sentences
                      for token in tokens]
        self.word_dict = WordDictionary(all_tokens, dict_size, minimum_occurrences)
        logger.info("Created dictionary with %d tokens" % self.word_dict.num_tokens)

    def _clean_text(self):
        """
        Cleans the sentence text, replacing numbers with a keyword, normalizing
        different kinds of quotation marks to a single one, etc.
        """
        for sent, _ in self.sentences:
            for i, token in enumerate(sent):
                new_word = utils.clean_text(token.word, correct=False)
                new_lemma = utils.clean_text(token.lemma, correct=False)
                token.word = new_word
                token.lemma = new_lemma
                sent[i] = token

    def create_converter(self):
        """
        This function overrides the TextReader's one in order to deal with Token
        objects instead of raw strings.
        """
        self.converter = attributes.TokenConverter()

        if self.md.use_lemma:
            # look up word lemmas
            word_lookup = lambda t: self.word_dict.get(t.lemma)
        else:
            # look up the word itself
            word_lookup = lambda t: self.word_dict.get(t.word)

        self.converter.add_extractor(word_lookup)

        if self.md.use_caps:
            caps_lookup = lambda t: attributes.get_capitalization(t.word)
            self.converter.add_extractor(caps_lookup)

        if self.md.use_pos:
            with open(self.md.paths['pos_tag_dict'], 'rb') as f:
                pos_dict = pickle.load(f)

            pos_def_dict = defaultdict(lambda: pos_dict['other'])
            pos_def_dict.update(pos_dict)
            pos_lookup = lambda t: pos_def_dict[t.pos]
            self.converter.add_extractor(pos_lookup)

        if self.md.use_chunk:
            with open(self.md.paths['chunk_tag_dict'], 'rb') as f:
                chunk_dict = pickle.load(f)

            chunk_def_dict = defaultdict(lambda: chunk_dict['O'])
            chunk_def_dict.update(chunk_dict)
            chunk_lookup = lambda t: chunk_def_dict[t.chunk]
            self.converter.add_extractor(chunk_lookup)

    def generate_tag_dict(self):
        """
        Generates a tag dictionary that converts the tag itself
        to an index to be used in the neural network.
        """
        self.tagset = set(tag
                          for _, props in self.sentences
                          for prop in props
                          for tag in prop)

        self.tag_dict = dict(zip(self.tagset,
                                 range(len(self.tagset))))

    def _remove_tag_names(self):
        """Removes the actual tag names, leaving only IOB or IOBES block delimiters."""
        for _, propositions in self.sentences:
            for tags in propositions:
                for i, tag in enumerate(tags):
                    tags[i] = tag[0]

    def _codify_sentences(self):
        """Internal helper function."""
        new_sentences = []
        self.tags = []

        for (sent, props), preds in zip(self.sentences, self.predicates):
            new_sent = []
            sentence_tags = []

            for token in sent:
                new_token = self.converter.convert(token)
                new_sent.append(new_token)

            if self.task == 'srl_predicates':
                sentence_tags = np.zeros(len(sent), int)
                if len(preds) > 0:
                    sentence_tags[preds] = 1
            else:
                for prop in props:
                    # for classifying arguments, leave the names. they will be changed later
                    if self.task == 'srl_classify':
                        prop_tags = prop
                    else:
                        prop_tags = np.array([self.tag_dict[tag] for tag in prop])
                    sentence_tags.append(prop_tags)

            new_sentences.append(np.array(new_sent))
            self.tags.append(sentence_tags)

        self.sentences = new_sentences
        self.codified = True

    def codify_sentences(self):
        """
        Converts each token in each sequence into indices to their feature vectors
        in feature matrices. The previous sentences as text are not accessible anymore.
        Tags are also encoded. This function takes care of the case of classifying
        pre-delimited arguments.
        """
        if self.converter is None:
            self.create_converter()

        self._codify_sentences()
        self.arg_limits = []

        if self.task == 'srl_classify':
            # generate the tags for each argument
            start = 0
            end = 0

            for i, propositions in enumerate(self.tags):
                new_sent_tags = []
                sent_args = []

                for prop_tags in propositions:

                    new_prop_tags = []
                    prop_args = []
                    last_tag = 'O'

                    for j, tag in enumerate(prop_tags):
                        if tag != last_tag:
                            # if we were inside an argument, it ended;
                            # we may have started a new one
                            if last_tag != 'O':
                                end = j - 1
                                prop_args.append(np.array([start, end]))

                            if tag != 'O':
                                start = j
                                new_prop_tags.append(self.tag_dict[tag])

                        last_tag = tag
                    else:
                        # after the last iteration, check the last tag
                        if last_tag != 'O':
                            end = j
                            prop_args.append(np.array([start, end]))

                    sent_args.append(np.array(prop_args))
                    new_sent_tags.append(np.array(new_prop_tags))

                self.arg_limits.append(sent_args)
                self.tags[i] = new_sent_tags

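    # Sketch of the srl_classify bookkeeping above, using a toy proposition
    # (tag codes depend on the loaded tag dictionary, so they are shown symbolically):
    #
    #   prop_tags = ['A0', 'A0', 'O', 'A1']
    #   -> prop_args     = [[0, 1], [3, 3]]              # [start, end] per argument
    #   -> new_prop_tags = [tag_dict['A0'], tag_dict['A1']]
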
    def convert_tags(self, scheme, update_tag_dict=True, only_boundaries=False):
        """
        Replaces each word label with an IOB or IOBES version, appending a prefix
        to them.

        :param scheme: IOB or IOBES (Inside, Outside, Begin, End, Single).
        :param update_tag_dict: whether to update the tag dictionary after
            converting the tags.
        :param only_boundaries: if True, only leave the IOBES prefixes and remove
            the actual tag names. Also, avoid updating the tag dict.
        """
        scheme = scheme.lower()
        if scheme not in ('iob', 'iobes'):
            raise ValueError("Unknown tagging scheme: %s" % scheme)

        for _, props in self.sentences:
            for prop in props:

                last_tag = None
                for i, tag in enumerate(prop):

                    if tag == 'O':
                        # the O tag is independent of the IOB/IOBES scheme
                        last_tag = tag
                        continue

                    try:
                        next_tag = prop[i + 1]
                    except IndexError:
                        # last word already
                        next_tag = None

                    if tag != last_tag:
                        # a new block starts here.
                        last_tag = tag
                        if scheme == 'iob' or next_tag == tag:
                            prop[i] = 'B-%s' % tag
                        else:
                            prop[i] = 'S-%s' % tag
                    else:
                        # the block continues.
                        if scheme == 'iob' or next_tag == tag:
                            prop[i] = 'I-%s' % tag
                        else:
                            prop[i] = 'E-%s' % tag

        if only_boundaries:
            self._remove_tag_names()
        elif update_tag_dict:
            self.generate_tag_dict()
        else:
            # treat any tag not appearing in the tag dictionary as O
            actual_tagset = {tag for _, props in self.sentences for prop in props for tag in prop}
            for tag in actual_tagset:
                if tag not in self.tag_dict:
                    self.tag_dict[tag] = self.tag_dict[self.rare_tag]
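# A small worked example of the conversion performed by convert_tags, assuming a
# toy proposition (not taken from the corpus):
#
#   prop = ['A0', 'A0', 'O', 'V', 'O']
#   convert_tags('iob')   rewrites it in place to ['B-A0', 'I-A0', 'O', 'B-V', 'O']
#   convert_tags('iobes') rewrites it in place to ['B-A0', 'E-A0', 'O', 'S-V', 'O']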