nltkor-1.2.14-cp311-cp311-macosx_13_0_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nltkor/Kor_char.py +193 -0
- nltkor/__init__.py +16 -0
- nltkor/alignment/__init__.py +1315 -0
- nltkor/cider/__init__.py +2 -0
- nltkor/cider/cider.py +55 -0
- nltkor/cider/cider_scorer.py +207 -0
- nltkor/distance/__init__.py +441 -0
- nltkor/distance/wasserstein.py +126 -0
- nltkor/etc.py +22 -0
- nltkor/lazyimport.py +144 -0
- nltkor/make_requirement.py +11 -0
- nltkor/metrics/__init__.py +63 -0
- nltkor/metrics/bartscore.py +301 -0
- nltkor/metrics/bertscore.py +331 -0
- nltkor/metrics/bleu_tensor.py +20 -0
- nltkor/metrics/classical.py +847 -0
- nltkor/metrics/entment.py +24 -0
- nltkor/metrics/eval.py +517 -0
- nltkor/metrics/mauve.py +273 -0
- nltkor/metrics/mauve_utils.py +131 -0
- nltkor/misc/__init__.py +11 -0
- nltkor/misc/string2string_basic_functions.py +59 -0
- nltkor/misc/string2string_default_tokenizer.py +83 -0
- nltkor/misc/string2string_hash_functions.py +159 -0
- nltkor/misc/string2string_word_embeddings.py +503 -0
- nltkor/search/__init__.py +10 -0
- nltkor/search/classical.py +569 -0
- nltkor/search/faiss_search.py +787 -0
- nltkor/search/kobert_tokenizer.py +181 -0
- nltkor/sejong/__init__.py +3 -0
- nltkor/sejong/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/sejong/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/sejong/__pycache__/sejong_download.cpython-38.pyc +0 -0
- nltkor/sejong/__pycache__/sejong_download.cpython-39.pyc +0 -0
- nltkor/sejong/__pycache__/ssem.cpython-38.pyc +0 -0
- nltkor/sejong/__pycache__/ssem.cpython-39.pyc +0 -0
- nltkor/sejong/ch.py +12 -0
- nltkor/sejong/dict_semClassNum.txt +491 -0
- nltkor/sejong/layer.txt +630 -0
- nltkor/sejong/sejong_download.py +87 -0
- nltkor/sejong/ssem.py +684 -0
- nltkor/similarity/__init__.py +3 -0
- nltkor/similarity/bartscore____.py +337 -0
- nltkor/similarity/bertscore____.py +339 -0
- nltkor/similarity/classical.py +245 -0
- nltkor/similarity/cosine_similarity.py +175 -0
- nltkor/tag/__init__.py +71 -0
- nltkor/tag/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/__pycache__/espresso_tag.cpython-38.pyc +0 -0
- nltkor/tag/__pycache__/espresso_tag.cpython-39.pyc +0 -0
- nltkor/tag/espresso_tag.py +220 -0
- nltkor/tag/libs/__init__.py +10 -0
- nltkor/tag/libs/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/attributes.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/attributes.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/config.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/config.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/metadata.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/metadata.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/taggers.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/taggers.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/utils.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/utils.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/word_dictionary.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/word_dictionary.cpython-39.pyc +0 -0
- nltkor/tag/libs/arguments.py +280 -0
- nltkor/tag/libs/attributes.py +231 -0
- nltkor/tag/libs/config.py +159 -0
- nltkor/tag/libs/metadata.py +129 -0
- nltkor/tag/libs/ner/__init__.py +2 -0
- nltkor/tag/libs/ner/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/ner/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/ner/macmorphoreader.py +7 -0
- nltkor/tag/libs/ner/ner_reader.py +92 -0
- nltkor/tag/libs/network.c +72325 -0
- nltkor/tag/libs/network.cpython-311-darwin.so +0 -0
- nltkor/tag/libs/network.pyx +878 -0
- nltkor/tag/libs/networkconv.pyx +1028 -0
- nltkor/tag/libs/networkdependencyconv.pyx +451 -0
- nltkor/tag/libs/parse/__init__.py +1 -0
- nltkor/tag/libs/parse/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/parse/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/parse/parse_reader.py +283 -0
- nltkor/tag/libs/pos/__init__.py +2 -0
- nltkor/tag/libs/pos/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/pos/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/pos/macmorphoreader.py +7 -0
- nltkor/tag/libs/pos/pos_reader.py +97 -0
- nltkor/tag/libs/reader.py +485 -0
- nltkor/tag/libs/srl/__init__.py +3 -0
- nltkor/tag/libs/srl/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/train_srl.cpython-38.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/train_srl.cpython-39.pyc +0 -0
- nltkor/tag/libs/srl/__srl_reader_.py +535 -0
- nltkor/tag/libs/srl/srl_reader.py +436 -0
- nltkor/tag/libs/srl/train_srl.py +87 -0
- nltkor/tag/libs/taggers.py +926 -0
- nltkor/tag/libs/utils.py +384 -0
- nltkor/tag/libs/word_dictionary.py +239 -0
- nltkor/tag/libs/wsd/__init__.py +2 -0
- nltkor/tag/libs/wsd/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/wsd/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/wsd/macmorphoreader.py +7 -0
- nltkor/tag/libs/wsd/wsd_reader.py +93 -0
- nltkor/tokenize/__init__.py +62 -0
- nltkor/tokenize/ko_tokenize.py +115 -0
- nltkor/trans.py +121 -0
- nltkor-1.2.14.dist-info/LICENSE.txt +1093 -0
- nltkor-1.2.14.dist-info/METADATA +41 -0
- nltkor-1.2.14.dist-info/RECORD +127 -0
- nltkor-1.2.14.dist-info/WHEEL +5 -0
- nltkor-1.2.14.dist-info/top_level.txt +1 -0
nltkor/tag/libs/srl/srl_reader.py
@@ -0,0 +1,436 @@
# -*- coding: utf-8 -*-

"""
Class for dealing with SRL data.
"""

from collections import defaultdict
import _pickle as cPickle
import logging
import re
import os
import numpy as np
#from itertools import izip

from .. import attributes
from .. import utils
from ..word_dictionary import WordDictionary
from .. import reader


class ConllPos(object):
    """
    Dummy class for storing the position of each field in a
    CoNLL data file.
    """
    id = 0
    word = 1
    lemma = 2
    hmorph = 3         # head morph
    hpos = 4           # head pos
    tmorph = 5         # tail morph
    tpos = 6           # tail pos
    parse = 7
    rel = 8
    semantic_role = 9
    SEP = '\t'
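As an aside, a toy illustration of the column layout ConllPos describes; the line below is invented for illustration and is not data from the package. SRLReader._read_conll (in the class that follows) splits each line on ConllPos.SEP and reads these fields by index.

# Hypothetical CoNLL-style token line with the ten tab-separated fields
# that ConllPos indexes (id, word, lemma, head morph/POS, tail morph/POS,
# parse head, dependency relation, semantic role); all values made up.
line = '1\tWORD\tLEMMA\tHMORPH\tHPOS\tTMORPH\tTPOS\t3\tV\tARG0'
fields = line.split('\t')          # ConllPos.SEP
print(fields[8][:1] == 'V')        # ConllPos.rel -> True: a predicate token
print(int(fields[7]) - 1)          # ConllPos.parse -> 2, zero-based head index
print(fields[9])                   # ConllPos.semantic_role -> 'ARG0'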
class SRLReader(reader.TaggerReader):

    def __init__(self, md=None, filename=None):
        """
        The reader will read sentences from a given file. This file must
        be in the correct format (one token per line, columns indicating
        which tokens are predicates and their argument structure).

        :param md: task metadata, passed on to TaggerReader.
        :param filename: a file with CoNLL-like format data. If it is None,
            the reader will be created with no data.
        """

        self.taskname = 'srl'
        self.pos_dict = {}

        if filename is not None:
            self._read_conll(filename)
            #self._clean_text()

        super(SRLReader, self).__init__(md)

    @property
    def task(self):
        """
        Abstract Base Class (ABC) attribute.
        """
        return self.taskname

    def _read_conll(self, filename):
        '''
        Reads a file in CoNLL format and extracts the semantic role tag
        for each token.
        '''
        lines = []
        with open(filename, 'rt') as f:
            for line in f:
                line = line.strip()
                lines.append(line)

        self.sentences = []
        self.predicates = []
        tokens = []
        sent_predicates = []
        sent_tags = []
        token_number = 0

        for line in lines:
            line = line.strip()

            if line == '':
                # blank line between sentences
                if len(tokens) > 0:
                    sentence = (tokens, sent_tags)
                    self.sentences.append(sentence)
                    self.predicates.append(np.array(sent_predicates))
                    tokens = []
                    sent_predicates = []
                    sent_tags = []
                    token_number = 0

                continue

            fields = line.split(ConllPos.SEP)
            idx = fields[ConllPos.id]
            word = fields[ConllPos.word]
            lemma = fields[ConllPos.lemma]
            hmorph = fields[ConllPos.hmorph]
            hpos = fields[ConllPos.hpos].lower()
            tmorph = fields[ConllPos.tmorph]
            tpos = fields[ConllPos.tpos].lower()
            parse = fields[ConllPos.parse]
            rel = fields[ConllPos.rel]
            is_predicate = (rel[:1] == 'V')
            tag = fields[ConllPos.semantic_role]

            tag = self._read_role(tag)
            sent_tags.append((int(parse) - 1, tag))  # note: codify_sentences

            token = attributes.Token(word=word, morph_h=hmorph, morph_t=tmorph, pos_t=tpos, chunk=rel)
            #token = attributes.Token(word, morph_h=hmorph, pos_h=hpos, morph_t=tmorph, pos_t=tpos, chunk=rel)
            tokens.append(token)
            if is_predicate:
                sent_predicates.append(token_number)

            token_number += 1

        if len(tokens) > 0:
            # last sentence
            sentence = (tokens, sent_tags)
            self.sentences.append(sentence)
            self.predicates.append(np.array(sent_predicates))

    @classmethod
    def _read_role(cls, role):
        '''
        Reads the semantic role from a CoNLL-style file.

        :param role: the role string as read from the CoNLL file.
        '''
        return role

    def extend(self, data):
        """
        Adds more data to the reader.

        :param data: a list of tuples in the format (tokens, tags, predicates),
            one for each sentence.
        """
        self.sentences.extend([(sent, tags) for sent, tags, _ in data])
        self.predicates.extend([np.array(preds) for _, _, preds in data])

    def load_or_create_tag_dict(self):
        """
        In the case of SRL argument classification or one-step SRL, tries
        to load the tag dictionary. If the file with the tags is not
        present, a new one is created from the available sentences.

        In the case of argument detection or predicate detection,
        this function does nothing.
        """
        if os.path.isfile(self.md.paths['srl_tags']):
            self.load_tag_dict()
            return

        self._create_tag_dict()
        logger = logging.getLogger('Logger')
        logger.info('Created SRL tag dictionary')

    def _create_tag_dict(self):
        """
        Examines the available sentences and creates a tag dictionary.
        """
        logger = logging.getLogger("Logger")
        tags = {tag
                for _, tags in self.sentences
                for rel, tag in tags}

        # create a dictionary now, even when using IOB, in order to save
        # it in a deterministic order
        self.tag_dict = {tag: code for code, tag in enumerate(tags)}
        reader.save_tag_dict(self.md.paths['srl_tags'], self.tag_dict)
        logger.debug("Saved SRL tag dictionary.")

    def load_tag_dict(self, filename=None, iob=False):
        """
        Loads the tag dictionary from the default file. The dictionary
        file should have one tag per line.
        """
        if filename is None:
            filename = self.md.paths['srl_tags']

        self.tag_dict = {}
        code = 0
        with open(filename, 'rt') as f:
            for tag in f:
                tag = tag.strip()
                if tag == '':
                    continue

                self.tag_dict[tag] = code
                code += 1

    def _generate_iobes_dictionary(self):
        """
        Generates the reader's tag dictionary, mapping the IOBES tags to
        numeric codes.
        """
        self.tag_dict = {tag: code for code, tag in enumerate('IOBES')}

    def _generate_predicate_id_dictionary(self):
        """
        Generates a tag dictionary for identifying predicates.
        It has two tags: 'V' for predicates and '-' for all other tokens.
        """
        self.tag_dict = {'-': 0, 'V': 1}
        #self.tag_dict = {'O': 0, 'V': 1}

    def generate_dictionary(self, dict_size=None, minimum_occurrences=2):
        """
        Generates a token dictionary based on the given sentences.

        :param dict_size: Max number of tokens to be included in the dictionary.
        :param minimum_occurrences: Minimum number of times that a token must
            appear in the text in order to be included in the dictionary.
        """
        logger = logging.getLogger("Logger")
        all_tokens = [token.word
                      for tokens, _ in self.sentences
                      for token in tokens]
        self.word_dict = WordDictionary(all_tokens, dict_size, minimum_occurrences)
        logger.info("Created dictionary with %d tokens" % self.word_dict.num_tokens)

    def _clean_text(self):
        """
        Cleans the sentence text, replacing numbers with a keyword,
        different kinds of quotation marks with a single one, etc.
        """
        for sent, _ in self.sentences:
            for i, token in enumerate(sent):
                new_word = utils.clean_text(token.word, correct=False)
                new_lemma = utils.clean_text(token.lemma, correct=False)
                token.word = new_word
                token.lemma = new_lemma
                sent[i] = token

    def create_converter(self):
        """
        This function overrides TextReader's in order to deal with Token
        objects instead of raw strings.
        """
        self.converter = attributes.TokenConverter()

        if self.md.use_lemma:
            # look up word lemmas
            word_lookup = lambda t: self.word_dict.get(t.lemma)
        else:
            # look up the word itself
            word_lookup = lambda t: self.word_dict.get(t.word)

        self.converter.add_extractor(word_lookup)

        #if self.md.use_caps:
        #    caps_lookup = lambda t: attributes.get_capitalization(t.word)
        #    self.converter.add_extractor(caps_lookup)

        if self.md.use_pos:
            with open(self.md.paths['pos_tag_dict']) as f:
                #pos_dict = cPickle.load(f)
                buf = f.readlines()
                for i, line in enumerate(buf):
                    line = line.strip()
                    self.pos_dict[line] = i

            pos_def_dict = defaultdict(lambda: self.pos_dict['NN'])
            pos_def_dict.update(self.pos_dict)
            pos_lookup = lambda t: pos_def_dict[t.pos_t]
            self.converter.add_extractor(pos_lookup)

        #if self.md.use_chunk:
        #    with open(self.md.paths['chunk_tag_dict']) as f:
        #        chunk_dict = cPickle.load(f)
        #
        #    chunk_def_dict = defaultdict(lambda: chunk_dict['O'])
        #    chunk_def_dict.update(chunk_dict)
        #    chunk_lookup = lambda t: chunk_def_dict[t.chunk]
        #    self.converter.add_extractor(chunk_lookup)

    def get_num_pos_tags(self):
        return len(self.pos_dict)

    def generate_tag_dict(self):
        """
        Generates a tag dictionary that converts the tag itself
        to an index to be used in the neural network.
        """
        self.tagset = set(tag
                          for _, props in self.sentences
                          for prop in props
                          for _, tag in prop)

        self.tag_dict = dict(zip(self.tagset,
                                 range(len(self.tagset))))

    def _remove_tag_names(self):
        """Removes the actual tag names, leaving only IOB or IOBES block delimiters."""
        for _, propositions in self.sentences:
            for tags in propositions:
                for i, (_, tag) in enumerate(tags):
                    tags[i] = tag[0]

    def _codify_sentences(self):
        """Internal helper function."""
        new_sentences = []
        self.tags = []

        for (sent, props) in self.sentences:
            new_sent = []
            sentence_tags = []

            for token in sent:
                new_token = self.converter.convert(token)
                new_sent.append(new_token)

            for prop in props:
                sentence_tags.append(prop)

            new_sentences.append(np.array(new_sent))
            self.tags.append(sentence_tags)
            #print(new_sentences, flush=True)
            #print(self.tags, flush=True)

        self.sentences = new_sentences
        self.codified = True

    def codify_sentences(self):
        """
        Converts each token in each sequence into indices to their feature
        vectors in feature matrices. The previous sentences as text are not
        accessible anymore. Tags are also encoded. This function takes care
        of the case of classifying pre-delimited arguments.
        """
        if self.converter is None:
            self.create_converter()

        self._codify_sentences()
        self.arg_limits = []

        for i, propositions in enumerate(self.tags):
            new_sent_tags = []
            sent_args = []

            for j, (rel, prop_tags) in enumerate(propositions):

                new_prop_tags = []
                prop_args = []

                #if prop_tags != '-' and j == rel:
                if prop_tags != '-':
                    prop_args.append(np.array([j, j + 1]))
                    new_prop_tags.append(self.tag_dict[prop_tags])

                sent_args.append(np.array(prop_args))
                new_sent_tags.append(np.array(new_prop_tags))

            self.arg_limits.append(sent_args)
            self.tags[i] = new_sent_tags

    def convert_tags(self, scheme, update_tag_dict=True, only_boundaries=False):
        """
        Replaces each word label with an IOB or IOBES version, prepending
        a prefix to it.

        :param scheme: IOB or IOBES (Inside, Outside, Begin, End, Single).
        :param update_tag_dict: whether or not to update the tag dictionary
            after converting the tags.
        :param only_boundaries: if True, only keeps the IOBES prefixes and
            removes the actual tags. Also avoids updating the tag dict.
        """
        scheme = scheme.lower()
        if scheme not in ('iob', 'iobes'):
            raise ValueError("Unknown tagging scheme: %s" % scheme)

        for _, props in self.sentences:
            for prop in props:

                last_tag = None
                for i, tag in enumerate(prop):

                    if tag == 'O':
                        # the O tag is the same in IOB and IOBES
                        last_tag = tag
                        continue

                    try:
                        next_tag = prop[i + 1]
                    except IndexError:
                        # last word already
                        next_tag = None

                    if tag != last_tag:
                        # a new block starts here
                        last_tag = tag
                        if scheme == 'iob' or next_tag == tag:
                            prop[i] = 'B-%s' % tag
                        else:
                            prop[i] = 'S-%s' % tag
                    else:
                        # the block continues
                        if scheme == 'iob' or next_tag == tag:
                            prop[i] = 'I-%s' % tag
                        else:
                            prop[i] = 'E-%s' % tag

        if only_boundaries:
            self._remove_tag_names()
        elif update_tag_dict:
            self.generate_tag_dict()
        else:
            # treat any tag not appearing in the tag dictionary as O
            actual_tagset = {tag for _, props in self.sentences for prop in props for tag in prop}
            for tag in actual_tagset:
                if tag not in self.tag_dict:
                    self.tag_dict[tag] = self.tag_dict[self.rare_tag]
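A self-contained sketch of the IOBES labelling rule that convert_tags applies above; to_iobes is a hypothetical standalone helper mirroring the method's inner loop on a toy tag sequence, not part of the package.

def to_iobes(prop):
    # Mirrors convert_tags with scheme='iobes': a label that differs from
    # its predecessor starts a block (B-, or S- if it also ends there);
    # a repeated label continues it (I-, or E- on the block's last token).
    # 'O' is left untouched and resets the tracker.
    out = list(prop)
    last_tag = None
    for i, tag in enumerate(prop):
        if tag == 'O':
            last_tag = tag
            continue
        next_tag = prop[i + 1] if i + 1 < len(prop) else None
        if tag != last_tag:
            last_tag = tag
            out[i] = ('B-%s' if next_tag == tag else 'S-%s') % tag
        else:
            out[i] = ('I-%s' if next_tag == tag else 'E-%s') % tag
    return out

print(to_iobes(['A0', 'A0', 'O', 'V', 'A1']))
# ['B-A0', 'E-A0', 'O', 'S-V', 'S-A1']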
nltkor/tag/libs/srl/train_srl.py
@@ -0,0 +1,87 @@
# -*- coding: utf-8 -*-

"""
Auxiliary functions for SRL training.
"""

import re
import numpy as np


def init_transitions_simplified(tag_dict):
    """
    Initializes a tag transition table containing only the
    IOBES boundary tags.
    """
    tags = sorted(tag_dict, key=tag_dict.get)
    transitions = []

    for tag in tags:
        if tag in 'OES':
            trans = lambda x: 0 if x in 'BOS' else -1000
        elif tag in 'IB':
            trans = lambda x: 0 if x in 'IE' else -1000
        else:
            raise ValueError('Unexpected tag: %s' % tag)

        transitions.append([trans(next_tag) for next_tag in tags])

    # initial transition
    trans = lambda x: 0 if x in 'BOS' else -1000
    transitions.append([trans(next_tag) for next_tag in tags])

    return np.array(transitions, float)


def init_transitions(tag_dict, scheme):
    """
    Initializes the tag transition table, setting very low values
    for impossible transitions.

    :param tag_dict: The tag dictionary mapping tag names to the
        network output number.
    :param scheme: either iob or iobes.
    """
    scheme = scheme.lower()
    assert scheme in ('iob', 'iobes'), 'Unknown tagging scheme: %s' % scheme
    transitions = []

    # take the tags in the order given by their numeric codes
    tags = sorted(tag_dict, key=tag_dict.get)

    # transitions between tags
    for tag in tags:

        if tag == 'O':
            # next tag can be O, V or any B/S
            trans = lambda x: 0 if re.match('B|S|V', x) \
                else -1 if x == 'O' else -1000

        elif tag[0] in 'IB':
            block = tag[2:]
            if scheme == 'iobes':
                # next tag can be I or E (same block)
                trans = lambda x: 0 if re.match('(I|E)-%s' % block, x) else -1000
            else:
                # next tag can be O, I (same block) or B (different block)
                trans = lambda x: 0 if re.match('I-%s' % block, x) or re.match('B-(?!%s)' % block, x) \
                    else -1 if x == 'O' else -1000

        elif tag[0] in 'ES':
            # next tag can be O, or S/B of a different block
            block = tag[2:]
            trans = lambda x: 0 if re.match('(S|B)-(?!%s)' % block, x) \
                else -1 if x == 'O' else -1000

        else:
            raise ValueError('Unknown tag: %s' % tag)

        transitions.append([trans(next_tag) for next_tag in tags])

    # starting tag: it can be O or any B/S
    trans = lambda x: 0 if x[0] in 'OBS' else -1000
    transitions.append([trans(next_tag) for next_tag in tags])

    return np.array(transitions, float)
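A minimal usage sketch for the two transition-table builders, assuming the nltkor wheel is installed so the internal module path shown in this diff is importable; the labelled tag set in the second call is a toy example.

from nltkor.tag.libs.srl.train_srl import init_transitions, init_transitions_simplified

# Boundary-only dictionary, as built by SRLReader._generate_iobes_dictionary().
tag_dict = {tag: code for code, tag in enumerate('IOBES')}

# One row per previous tag plus a final sentence-start row; 0 marks an
# allowed transition and -1000 an effectively forbidden one.
table = init_transitions_simplified(tag_dict)
print(table.shape)  # (6, 5)

# Full version over labelled IOBES tags (toy single-argument tag set).
full = init_transitions({'B-A0': 0, 'I-A0': 1, 'E-A0': 2, 'S-A0': 3, 'O': 4}, 'iobes')
print(full.shape)   # (6, 5)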