opener-coreference-base 2.0.1 → 2.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/core/vendor/dictconfig/__init__.py +1 -0
- data/core/vendor/dictconfig/dictconfig.py +549 -0
- data/core/vendor/pykaf/LICENSE.txt +13 -0
- data/core/vendor/pykaf/__init__.py +427 -0
- data/lib/opener/coreferences/base/version.rb +1 -1
- data/opener-coreference-base.gemspec +1 -1
- metadata +5 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c35ea079c7c50da42f7bd17cddabf9cb79d1b93b
|
4
|
+
data.tar.gz: f836ce7b54e055bae722fda4890cd9a9a7cf8f39
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e835457582de3c321d41649f19c9c7682351931d03cd8e526ae5aa3b217833b626a386727702b371743ee267d93dbbf1fbea97167d45223e312067412c200169
|
7
|
+
data.tar.gz: 67d0658e73efc718196354b34d6ecee09cb52ef2d32f2bea821b008fae8f066a43a3ce5ea967b56421de8e543b2a4867f82a55b69fe0995d5bcc8bf794f53ff4
|
@@ -0,0 +1 @@
|
|
1
|
+
from dictconfig import *
|
@@ -0,0 +1,549 @@
|
|
1
|
+
# Copyright 2009-2010 by Vinay Sajip. All Rights Reserved.
|
2
|
+
#
|
3
|
+
# Permission to use, copy, modify, and distribute this software and its
|
4
|
+
# documentation for any purpose and without fee is hereby granted,
|
5
|
+
# provided that the above copyright notice appear in all copies and that
|
6
|
+
# both that copyright notice and this permission notice appear in
|
7
|
+
# supporting documentation, and that the name of Vinay Sajip
|
8
|
+
# not be used in advertising or publicity pertaining to distribution
|
9
|
+
# of the software without specific, written prior permission.
|
10
|
+
# VINAY SAJIP DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING
|
11
|
+
# ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL
|
12
|
+
# VINAY SAJIP BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR
|
13
|
+
# ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
|
14
|
+
# IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
|
15
|
+
# OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
16
|
+
|
17
|
+
import logging.handlers
|
18
|
+
import re
|
19
|
+
import sys
|
20
|
+
import types
|
21
|
+
|
22
|
+
IDENTIFIER = re.compile('^[a-z_][a-z0-9_]*$', re.I)
|
23
|
+
|
24
|
+
def valid_ident(s):
|
25
|
+
m = IDENTIFIER.match(s)
|
26
|
+
if not m:
|
27
|
+
raise ValueError('Not a valid Python identifier: %r' % s)
|
28
|
+
return True
|
29
|
+
|
30
|
+
#
|
31
|
+
# This function is defined in logging only in recent versions of Python
|
32
|
+
#
|
33
|
+
try:
|
34
|
+
from logging import _checkLevel
|
35
|
+
except ImportError:
|
36
|
+
def _checkLevel(level):
|
37
|
+
if isinstance(level, int):
|
38
|
+
rv = level
|
39
|
+
elif str(level) == level:
|
40
|
+
if level not in logging._levelNames:
|
41
|
+
raise ValueError('Unknown level: %r' % level)
|
42
|
+
rv = logging._levelNames[level]
|
43
|
+
else:
|
44
|
+
raise TypeError('Level not an integer or a '
|
45
|
+
'valid string: %r' % level)
|
46
|
+
return rv
|
47
|
+
|
48
|
+
# The ConvertingXXX classes are wrappers around standard Python containers,
|
49
|
+
# and they serve to convert any suitable values in the container. The
|
50
|
+
# conversion converts base dicts, lists and tuples to their wrapped
|
51
|
+
# equivalents, whereas strings which match a conversion format are converted
|
52
|
+
# appropriately.
|
53
|
+
#
|
54
|
+
# Each wrapper should have a configurator attribute holding the actual
|
55
|
+
# configurator to use for conversion.
|
56
|
+
|
57
|
+
class ConvertingDict(dict):
|
58
|
+
"""A converting dictionary wrapper."""
|
59
|
+
|
60
|
+
def __getitem__(self, key):
|
61
|
+
value = dict.__getitem__(self, key)
|
62
|
+
result = self.configurator.convert(value)
|
63
|
+
#If the converted value is different, save for next time
|
64
|
+
if value is not result:
|
65
|
+
self[key] = result
|
66
|
+
if type(result) in (ConvertingDict, ConvertingList,
|
67
|
+
ConvertingTuple):
|
68
|
+
result.parent = self
|
69
|
+
result.key = key
|
70
|
+
return result
|
71
|
+
|
72
|
+
def get(self, key, default=None):
|
73
|
+
value = dict.get(self, key, default)
|
74
|
+
result = self.configurator.convert(value)
|
75
|
+
#If the converted value is different, save for next time
|
76
|
+
if value is not result:
|
77
|
+
self[key] = result
|
78
|
+
if type(result) in (ConvertingDict, ConvertingList,
|
79
|
+
ConvertingTuple):
|
80
|
+
result.parent = self
|
81
|
+
result.key = key
|
82
|
+
return result
|
83
|
+
|
84
|
+
def pop(self, key, default=None):
|
85
|
+
value = dict.pop(self, key, default)
|
86
|
+
result = self.configurator.convert(value)
|
87
|
+
if value is not result:
|
88
|
+
if type(result) in (ConvertingDict, ConvertingList,
|
89
|
+
ConvertingTuple):
|
90
|
+
result.parent = self
|
91
|
+
result.key = key
|
92
|
+
return result
|
93
|
+
|
94
|
+
class ConvertingList(list):
|
95
|
+
"""A converting list wrapper."""
|
96
|
+
def __getitem__(self, key):
|
97
|
+
value = list.__getitem__(self, key)
|
98
|
+
result = self.configurator.convert(value)
|
99
|
+
#If the converted value is different, save for next time
|
100
|
+
if value is not result:
|
101
|
+
self[key] = result
|
102
|
+
if type(result) in (ConvertingDict, ConvertingList,
|
103
|
+
ConvertingTuple):
|
104
|
+
result.parent = self
|
105
|
+
result.key = key
|
106
|
+
return result
|
107
|
+
|
108
|
+
def pop(self, idx=-1):
|
109
|
+
value = list.pop(self, idx)
|
110
|
+
result = self.configurator.convert(value)
|
111
|
+
if value is not result:
|
112
|
+
if type(result) in (ConvertingDict, ConvertingList,
|
113
|
+
ConvertingTuple):
|
114
|
+
result.parent = self
|
115
|
+
return result
|
116
|
+
|
117
|
+
class ConvertingTuple(tuple):
|
118
|
+
"""A converting tuple wrapper."""
|
119
|
+
def __getitem__(self, key):
|
120
|
+
value = tuple.__getitem__(self, key)
|
121
|
+
result = self.configurator.convert(value)
|
122
|
+
if value is not result:
|
123
|
+
if type(result) in (ConvertingDict, ConvertingList,
|
124
|
+
ConvertingTuple):
|
125
|
+
result.parent = self
|
126
|
+
result.key = key
|
127
|
+
return result
|
128
|
+
|
129
|
+
class BaseConfigurator(object):
|
130
|
+
"""
|
131
|
+
The configurator base class which defines some useful defaults.
|
132
|
+
"""
|
133
|
+
|
134
|
+
CONVERT_PATTERN = re.compile(r'^(?P<prefix>[a-z]+)://(?P<suffix>.*)$')
|
135
|
+
|
136
|
+
WORD_PATTERN = re.compile(r'^\s*(\w+)\s*')
|
137
|
+
DOT_PATTERN = re.compile(r'^\.\s*(\w+)\s*')
|
138
|
+
INDEX_PATTERN = re.compile(r'^\[\s*(\w+)\s*\]\s*')
|
139
|
+
DIGIT_PATTERN = re.compile(r'^\d+$')
|
140
|
+
|
141
|
+
value_converters = {
|
142
|
+
'ext' : 'ext_convert',
|
143
|
+
'cfg' : 'cfg_convert',
|
144
|
+
}
|
145
|
+
|
146
|
+
# We might want to use a different one, e.g. importlib
|
147
|
+
importer = __import__
|
148
|
+
|
149
|
+
def __init__(self, config):
|
150
|
+
self.config = ConvertingDict(config)
|
151
|
+
self.config.configurator = self
|
152
|
+
|
153
|
+
def resolve(self, s):
|
154
|
+
"""
|
155
|
+
Resolve strings to objects using standard import and attribute
|
156
|
+
syntax.
|
157
|
+
"""
|
158
|
+
name = s.split('.')
|
159
|
+
used = name.pop(0)
|
160
|
+
try:
|
161
|
+
found = self.importer(used)
|
162
|
+
for frag in name:
|
163
|
+
used += '.' + frag
|
164
|
+
try:
|
165
|
+
found = getattr(found, frag)
|
166
|
+
except AttributeError:
|
167
|
+
self.importer(used)
|
168
|
+
found = getattr(found, frag)
|
169
|
+
return found
|
170
|
+
except ImportError:
|
171
|
+
e, tb = sys.exc_info()[1:]
|
172
|
+
v = ValueError('Cannot resolve %r: %s' % (s, e))
|
173
|
+
v.__cause__, v.__traceback__ = e, tb
|
174
|
+
raise v
|
175
|
+
|
176
|
+
def ext_convert(self, value):
|
177
|
+
"""Default converter for the ext:// protocol."""
|
178
|
+
return self.resolve(value)
|
179
|
+
|
180
|
+
def cfg_convert(self, value):
|
181
|
+
"""Default converter for the cfg:// protocol."""
|
182
|
+
rest = value
|
183
|
+
m = self.WORD_PATTERN.match(rest)
|
184
|
+
if m is None:
|
185
|
+
raise ValueError("Unable to convert %r" % value)
|
186
|
+
else:
|
187
|
+
rest = rest[m.end():]
|
188
|
+
d = self.config[m.groups()[0]]
|
189
|
+
#print d, rest
|
190
|
+
while rest:
|
191
|
+
m = self.DOT_PATTERN.match(rest)
|
192
|
+
if m:
|
193
|
+
d = d[m.groups()[0]]
|
194
|
+
else:
|
195
|
+
m = self.INDEX_PATTERN.match(rest)
|
196
|
+
if m:
|
197
|
+
idx = m.groups()[0]
|
198
|
+
if not self.DIGIT_PATTERN.match(idx):
|
199
|
+
d = d[idx]
|
200
|
+
else:
|
201
|
+
try:
|
202
|
+
n = int(idx) # try as number first (most likely)
|
203
|
+
d = d[n]
|
204
|
+
except TypeError:
|
205
|
+
d = d[idx]
|
206
|
+
if m:
|
207
|
+
rest = rest[m.end():]
|
208
|
+
else:
|
209
|
+
raise ValueError('Unable to convert '
|
210
|
+
'%r at %r' % (value, rest))
|
211
|
+
#rest should be empty
|
212
|
+
return d
|
213
|
+
|
214
|
+
def convert(self, value):
|
215
|
+
"""
|
216
|
+
Convert values to an appropriate type. dicts, lists and tuples are
|
217
|
+
replaced by their converting alternatives. Strings are checked to
|
218
|
+
see if they have a conversion format and are converted if they do.
|
219
|
+
"""
|
220
|
+
if not isinstance(value, ConvertingDict) and isinstance(value, dict):
|
221
|
+
value = ConvertingDict(value)
|
222
|
+
value.configurator = self
|
223
|
+
elif not isinstance(value, ConvertingList) and isinstance(value, list):
|
224
|
+
value = ConvertingList(value)
|
225
|
+
value.configurator = self
|
226
|
+
elif not isinstance(value, ConvertingTuple) and\
|
227
|
+
isinstance(value, tuple):
|
228
|
+
value = ConvertingTuple(value)
|
229
|
+
value.configurator = self
|
230
|
+
elif isinstance(value, basestring): # str for py3k
|
231
|
+
m = self.CONVERT_PATTERN.match(value)
|
232
|
+
if m:
|
233
|
+
d = m.groupdict()
|
234
|
+
prefix = d['prefix']
|
235
|
+
converter = self.value_converters.get(prefix, None)
|
236
|
+
if converter:
|
237
|
+
suffix = d['suffix']
|
238
|
+
converter = getattr(self, converter)
|
239
|
+
value = converter(suffix)
|
240
|
+
return value
|
241
|
+
|
242
|
+
def configure_custom(self, config):
|
243
|
+
"""Configure an object with a user-supplied factory."""
|
244
|
+
c = config.pop('()')
|
245
|
+
if not hasattr(c, '__call__') and hasattr(types, 'ClassType') and type(c) != types.ClassType:
|
246
|
+
c = self.resolve(c)
|
247
|
+
props = config.pop('.', None)
|
248
|
+
# Check for valid identifiers
|
249
|
+
kwargs = dict([(k, config[k]) for k in config if valid_ident(k)])
|
250
|
+
result = c(**kwargs)
|
251
|
+
if props:
|
252
|
+
for name, value in props.items():
|
253
|
+
setattr(result, name, value)
|
254
|
+
return result
|
255
|
+
|
256
|
+
def as_tuple(self, value):
|
257
|
+
"""Utility function which converts lists to tuples."""
|
258
|
+
if isinstance(value, list):
|
259
|
+
value = tuple(value)
|
260
|
+
return value
|
261
|
+
|
262
|
+
class DictConfigurator(BaseConfigurator):
|
263
|
+
"""
|
264
|
+
Configure logging using a dictionary-like object to describe the
|
265
|
+
configuration.
|
266
|
+
"""
|
267
|
+
|
268
|
+
def configure(self):
|
269
|
+
"""Do the configuration."""
|
270
|
+
|
271
|
+
config = self.config
|
272
|
+
if 'version' not in config:
|
273
|
+
raise ValueError("dictionary doesn't specify a version")
|
274
|
+
if config['version'] != 1:
|
275
|
+
raise ValueError("Unsupported version: %s" % config['version'])
|
276
|
+
incremental = config.pop('incremental', False)
|
277
|
+
EMPTY_DICT = {}
|
278
|
+
logging._acquireLock()
|
279
|
+
try:
|
280
|
+
if incremental:
|
281
|
+
handlers = config.get('handlers', EMPTY_DICT)
|
282
|
+
# incremental handler config only if handler name
|
283
|
+
# ties in to logging._handlers (Python 2.7)
|
284
|
+
if sys.version_info[:2] == (2, 7):
|
285
|
+
for name in handlers:
|
286
|
+
if name not in logging._handlers:
|
287
|
+
raise ValueError('No handler found with '
|
288
|
+
'name %r' % name)
|
289
|
+
else:
|
290
|
+
try:
|
291
|
+
handler = logging._handlers[name]
|
292
|
+
handler_config = handlers[name]
|
293
|
+
level = handler_config.get('level', None)
|
294
|
+
if level:
|
295
|
+
handler.setLevel(_checkLevel(level))
|
296
|
+
except StandardError, e:
|
297
|
+
raise ValueError('Unable to configure handler '
|
298
|
+
'%r: %s' % (name, e))
|
299
|
+
loggers = config.get('loggers', EMPTY_DICT)
|
300
|
+
for name in loggers:
|
301
|
+
try:
|
302
|
+
self.configure_logger(name, loggers[name], True)
|
303
|
+
except StandardError, e:
|
304
|
+
raise ValueError('Unable to configure logger '
|
305
|
+
'%r: %s' % (name, e))
|
306
|
+
root = config.get('root', None)
|
307
|
+
if root:
|
308
|
+
try:
|
309
|
+
self.configure_root(root, True)
|
310
|
+
except StandardError, e:
|
311
|
+
raise ValueError('Unable to configure root '
|
312
|
+
'logger: %s' % e)
|
313
|
+
else:
|
314
|
+
disable_existing = config.pop('disable_existing_loggers', True)
|
315
|
+
|
316
|
+
logging._handlers.clear()
|
317
|
+
del logging._handlerList[:]
|
318
|
+
|
319
|
+
# Do formatters first - they don't refer to anything else
|
320
|
+
formatters = config.get('formatters', EMPTY_DICT)
|
321
|
+
for name in formatters:
|
322
|
+
try:
|
323
|
+
formatters[name] = self.configure_formatter(
|
324
|
+
formatters[name])
|
325
|
+
except StandardError, e:
|
326
|
+
raise ValueError('Unable to configure '
|
327
|
+
'formatter %r: %s' % (name, e))
|
328
|
+
# Next, do filters - they don't refer to anything else, either
|
329
|
+
filters = config.get('filters', EMPTY_DICT)
|
330
|
+
for name in filters:
|
331
|
+
try:
|
332
|
+
filters[name] = self.configure_filter(filters[name])
|
333
|
+
except StandardError, e:
|
334
|
+
raise ValueError('Unable to configure '
|
335
|
+
'filter %r: %s' % (name, e))
|
336
|
+
|
337
|
+
# Next, do handlers - they refer to formatters and filters
|
338
|
+
# As handlers can refer to other handlers, sort the keys
|
339
|
+
# to allow a deterministic order of configuration
|
340
|
+
handlers = config.get('handlers', EMPTY_DICT)
|
341
|
+
for name in sorted(handlers):
|
342
|
+
try:
|
343
|
+
handler = self.configure_handler(handlers[name])
|
344
|
+
handler.name = name
|
345
|
+
handlers[name] = handler
|
346
|
+
except StandardError, e:
|
347
|
+
raise ValueError('Unable to configure handler '
|
348
|
+
'%r: %s' % (name, e))
|
349
|
+
# Next, do loggers - they refer to handlers and filters
|
350
|
+
|
351
|
+
#we don't want to lose the existing loggers,
|
352
|
+
#since other threads may have pointers to them.
|
353
|
+
#existing is set to contain all existing loggers,
|
354
|
+
#and as we go through the new configuration we
|
355
|
+
#remove any which are configured. At the end,
|
356
|
+
#what's left in existing is the set of loggers
|
357
|
+
#which were in the previous configuration but
|
358
|
+
#which are not in the new configuration.
|
359
|
+
root = logging.root
|
360
|
+
existing = root.manager.loggerDict.keys()
|
361
|
+
#The list needs to be sorted so that we can
|
362
|
+
#avoid disabling child loggers of explicitly
|
363
|
+
#named loggers. With a sorted list it is easier
|
364
|
+
#to find the child loggers.
|
365
|
+
existing.sort()
|
366
|
+
#We'll keep the list of existing loggers
|
367
|
+
#which are children of named loggers here...
|
368
|
+
child_loggers = []
|
369
|
+
#now set up the new ones...
|
370
|
+
loggers = config.get('loggers', EMPTY_DICT)
|
371
|
+
for name in loggers:
|
372
|
+
if name in existing:
|
373
|
+
i = existing.index(name)
|
374
|
+
prefixed = name + "."
|
375
|
+
pflen = len(prefixed)
|
376
|
+
num_existing = len(existing)
|
377
|
+
i = i + 1 # look at the entry after name
|
378
|
+
while (i < num_existing) and\
|
379
|
+
(existing[i][:pflen] == prefixed):
|
380
|
+
child_loggers.append(existing[i])
|
381
|
+
i = i + 1
|
382
|
+
existing.remove(name)
|
383
|
+
try:
|
384
|
+
self.configure_logger(name, loggers[name])
|
385
|
+
except StandardError, e:
|
386
|
+
raise ValueError('Unable to configure logger '
|
387
|
+
'%r: %s' % (name, e))
|
388
|
+
|
389
|
+
#Disable any old loggers. There's no point deleting
|
390
|
+
#them as other threads may continue to hold references
|
391
|
+
#and by disabling them, you stop them doing any logging.
|
392
|
+
#However, don't disable children of named loggers, as that's
|
393
|
+
#probably not what was intended by the user.
|
394
|
+
for log in existing:
|
395
|
+
logger = root.manager.loggerDict[log]
|
396
|
+
if log in child_loggers:
|
397
|
+
logger.level = logging.NOTSET
|
398
|
+
logger.handlers = []
|
399
|
+
logger.propagate = True
|
400
|
+
elif disable_existing:
|
401
|
+
logger.disabled = True
|
402
|
+
|
403
|
+
# And finally, do the root logger
|
404
|
+
root = config.get('root', None)
|
405
|
+
if root:
|
406
|
+
try:
|
407
|
+
self.configure_root(root)
|
408
|
+
except StandardError, e:
|
409
|
+
raise ValueError('Unable to configure root '
|
410
|
+
'logger: %s' % e)
|
411
|
+
finally:
|
412
|
+
logging._releaseLock()
|
413
|
+
|
414
|
+
def configure_formatter(self, config):
|
415
|
+
"""Configure a formatter from a dictionary."""
|
416
|
+
if '()' in config:
|
417
|
+
factory = config['()'] # for use in exception handler
|
418
|
+
try:
|
419
|
+
result = self.configure_custom(config)
|
420
|
+
except TypeError, te:
|
421
|
+
if "'format'" not in str(te):
|
422
|
+
raise
|
423
|
+
#Name of parameter changed from fmt to format.
|
424
|
+
#Retry with old name.
|
425
|
+
#This is so that code can be used with older Python versions
|
426
|
+
#(e.g. by Django)
|
427
|
+
config['fmt'] = config.pop('format')
|
428
|
+
config['()'] = factory
|
429
|
+
result = self.configure_custom(config)
|
430
|
+
else:
|
431
|
+
fmt = config.get('format', None)
|
432
|
+
dfmt = config.get('datefmt', None)
|
433
|
+
result = logging.Formatter(fmt, dfmt)
|
434
|
+
return result
|
435
|
+
|
436
|
+
def configure_filter(self, config):
|
437
|
+
"""Configure a filter from a dictionary."""
|
438
|
+
if '()' in config:
|
439
|
+
result = self.configure_custom(config)
|
440
|
+
else:
|
441
|
+
name = config.get('name', '')
|
442
|
+
result = logging.Filter(name)
|
443
|
+
return result
|
444
|
+
|
445
|
+
def add_filters(self, filterer, filters):
|
446
|
+
"""Add filters to a filterer from a list of names."""
|
447
|
+
for f in filters:
|
448
|
+
try:
|
449
|
+
filterer.addFilter(self.config['filters'][f])
|
450
|
+
except StandardError, e:
|
451
|
+
raise ValueError('Unable to add filter %r: %s' % (f, e))
|
452
|
+
|
453
|
+
def configure_handler(self, config):
|
454
|
+
"""Configure a handler from a dictionary."""
|
455
|
+
formatter = config.pop('formatter', None)
|
456
|
+
if formatter:
|
457
|
+
try:
|
458
|
+
formatter = self.config['formatters'][formatter]
|
459
|
+
except StandardError, e:
|
460
|
+
raise ValueError('Unable to set formatter '
|
461
|
+
'%r: %s' % (formatter, e))
|
462
|
+
level = config.pop('level', None)
|
463
|
+
filters = config.pop('filters', None)
|
464
|
+
if '()' in config:
|
465
|
+
c = config.pop('()')
|
466
|
+
if not hasattr(c, '__call__') and hasattr(types, 'ClassType') and type(c) != types.ClassType:
|
467
|
+
c = self.resolve(c)
|
468
|
+
factory = c
|
469
|
+
else:
|
470
|
+
klass = self.resolve(config.pop('class'))
|
471
|
+
#Special case for handler which refers to another handler
|
472
|
+
if issubclass(klass, logging.handlers.MemoryHandler) and\
|
473
|
+
'target' in config:
|
474
|
+
try:
|
475
|
+
config['target'] = self.config['handlers'][config['target']]
|
476
|
+
except StandardError, e:
|
477
|
+
raise ValueError('Unable to set target handler '
|
478
|
+
'%r: %s' % (config['target'], e))
|
479
|
+
elif issubclass(klass, logging.handlers.SMTPHandler) and\
|
480
|
+
'mailhost' in config:
|
481
|
+
config['mailhost'] = self.as_tuple(config['mailhost'])
|
482
|
+
elif issubclass(klass, logging.handlers.SysLogHandler) and\
|
483
|
+
'address' in config:
|
484
|
+
config['address'] = self.as_tuple(config['address'])
|
485
|
+
factory = klass
|
486
|
+
kwargs = dict([(k, config[k]) for k in config if valid_ident(k)])
|
487
|
+
try:
|
488
|
+
result = factory(**kwargs)
|
489
|
+
except TypeError, te:
|
490
|
+
if "'stream'" not in str(te):
|
491
|
+
raise
|
492
|
+
#The argument name changed from strm to stream
|
493
|
+
#Retry with old name.
|
494
|
+
#This is so that code can be used with older Python versions
|
495
|
+
#(e.g. by Django)
|
496
|
+
kwargs['strm'] = kwargs.pop('stream')
|
497
|
+
result = factory(**kwargs)
|
498
|
+
if formatter:
|
499
|
+
result.setFormatter(formatter)
|
500
|
+
if level is not None:
|
501
|
+
result.setLevel(_checkLevel(level))
|
502
|
+
if filters:
|
503
|
+
self.add_filters(result, filters)
|
504
|
+
return result
|
505
|
+
|
506
|
+
def add_handlers(self, logger, handlers):
|
507
|
+
"""Add handlers to a logger from a list of names."""
|
508
|
+
for h in handlers:
|
509
|
+
try:
|
510
|
+
logger.addHandler(self.config['handlers'][h])
|
511
|
+
except StandardError, e:
|
512
|
+
raise ValueError('Unable to add handler %r: %s' % (h, e))
|
513
|
+
|
514
|
+
def common_logger_config(self, logger, config, incremental=False):
|
515
|
+
"""
|
516
|
+
Perform configuration which is common to root and non-root loggers.
|
517
|
+
"""
|
518
|
+
level = config.get('level', None)
|
519
|
+
if level is not None:
|
520
|
+
logger.setLevel(_checkLevel(level))
|
521
|
+
if not incremental:
|
522
|
+
#Remove any existing handlers
|
523
|
+
for h in logger.handlers[:]:
|
524
|
+
logger.removeHandler(h)
|
525
|
+
handlers = config.get('handlers', None)
|
526
|
+
if handlers:
|
527
|
+
self.add_handlers(logger, handlers)
|
528
|
+
filters = config.get('filters', None)
|
529
|
+
if filters:
|
530
|
+
self.add_filters(logger, filters)
|
531
|
+
|
532
|
+
def configure_logger(self, name, config, incremental=False):
|
533
|
+
"""Configure a non-root logger from a dictionary."""
|
534
|
+
logger = logging.getLogger(name)
|
535
|
+
self.common_logger_config(logger, config, incremental)
|
536
|
+
propagate = config.get('propagate', None)
|
537
|
+
if propagate is not None:
|
538
|
+
logger.propagate = propagate
|
539
|
+
|
540
|
+
def configure_root(self, config, incremental=False):
|
541
|
+
"""Configure a root logger from a dictionary."""
|
542
|
+
root = logging.getLogger()
|
543
|
+
self.common_logger_config(root, config, incremental)
|
544
|
+
|
545
|
+
dictConfigClass = DictConfigurator
|
546
|
+
|
547
|
+
def dictConfig(config):
|
548
|
+
"""Configure logging using a dictionary."""
|
549
|
+
dictConfigClass(config).configure()
|
@@ -0,0 +1,13 @@
|
|
1
|
+
Copyright 2013 Josu Bermudez Galbarriatu <josu.bermudez@deusto.es>.
|
2
|
+
|
3
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
you may not use this file except in compliance with the License.
|
5
|
+
You may obtain a copy of the License at
|
6
|
+
|
7
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
|
9
|
+
Unless required by applicable law or agreed to in writing, software
|
10
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
See the License for the specific language governing permissions and
|
13
|
+
limitations under the License.
|
@@ -0,0 +1,427 @@
|
|
1
|
+
# coding=utf-8
|
2
|
+
""" Module for manage KAF formatted files. """
|
3
|
+
|
4
|
+
from __future__ import unicode_literals
|
5
|
+
|
6
|
+
__author__ = 'Josu Bermudez <josu.bermudez@deusto.es>'
|
7
|
+
|
8
|
+
from lxml import etree
|
9
|
+
|
10
|
+
# CONSTANT TEXT VALUES USED TO CONSTRUCT KAF
|
11
|
+
KAF_TAG = "KAF"
|
12
|
+
LANGUAGE_ATTRIBUTE = "{http://www.w3.org/XML/1998/namespace}lang"
|
13
|
+
VERSION_ATTRIBUTE = "version"
|
14
|
+
NS = {}
|
15
|
+
|
16
|
+
KAF_HEADER_TAG = "kafHeader"
|
17
|
+
NAME_ATTRIBUTE = "name"
|
18
|
+
LINGUISTIC_PROCESSOR_HEAD = "linguisticProcessors"
|
19
|
+
LAYER_ATTRIBUTE = "layer"
|
20
|
+
LINGUISTIC_PROCESSOR_OCCURRENCE_TAG = "lp"
|
21
|
+
TIMESTAMP_ATTRIBUTE = "timestamp"
|
22
|
+
|
23
|
+
SPAN_TAG = "span"
|
24
|
+
TARGET_ID_ATTRIBUTE = "id"
|
25
|
+
TARGET_TAG = "target"
|
26
|
+
|
27
|
+
TEXT_LAYER_TAG = "text"
|
28
|
+
WORD_OCCURRENCE_TAG = "wf"
|
29
|
+
WORD_ID_ATTRIBUTE = "wid"
|
30
|
+
|
31
|
+
TERMS_LAYER_TAG = "terms"
|
32
|
+
TERM_OCCURRENCE_TAG = "term"
|
33
|
+
TERM_ID_ATTRIBUTE = "tid"
|
34
|
+
NER_ATTRIBUTE = "ner"
|
35
|
+
TYPE_ATTRIBUTE = "type"
|
36
|
+
LEMMA_ATTRIBUTE = "lemma"
|
37
|
+
POS_ATTRIBUTE = "pos"
|
38
|
+
MORPHOFEAT_ATTRIBUTE = "morphofeat"
|
39
|
+
|
40
|
+
NAMED_ENTITIES_LAYER_TAG = "entities"
|
41
|
+
NAMED_ENTITY_OCCURRENCE_TAG = "entity"
|
42
|
+
NAMED_ENTITY_ID_ATTRIBUTE = "eid"
|
43
|
+
NAMED_ENTITY_TYPE_ATTRIBUTE = "type"
|
44
|
+
NAMED_ENTITY_REFERENCES_GROUP_TAG = "references"
|
45
|
+
|
46
|
+
CONSTITUENCY_LAYER = "constituency"
|
47
|
+
CONSTITUENCY_TREE_TAG = "tree"
|
48
|
+
CONSTITUENCY_NON_TERMINALS = "nt"
|
49
|
+
CONSTITUENCY_TERMINALS = "t"
|
50
|
+
CONSTITUENCY_EDGES = "edge"
|
51
|
+
|
52
|
+
CHUNKS_LAYER_TAG = "chunks"
|
53
|
+
CHUNK_OCCURRENCE_TAG = "chunk"
|
54
|
+
CHUNK_CASE_ATTRIBUTE = "case"
|
55
|
+
CHUNK_PHRASE_ATTRIBUTE = "phrase"
|
56
|
+
CHUNK_HEAD_ATTRIBUTE = "head"
|
57
|
+
CHUNK_ID_ATTRIBUTE = "cid"
|
58
|
+
|
59
|
+
DEPENDENCY_LAYER_TAG = "deps"
|
60
|
+
DEPENDENCY_OCCURRENCE_TAG = "dep"
|
61
|
+
DEPENDENCY_FROM_ATTRIBUTE = "from"
|
62
|
+
DEPENDENCY_FUNCTION_ATTRIBUTE = "rfunc"
|
63
|
+
DEPENDENCY_TO_ATTRIBUTE = "to"
|
64
|
+
|
65
|
+
EXTERNAL_REFERENCE_OCCURRENCE_TAG = "externalRef"
|
66
|
+
EXTERNAL_REFERENCES_TAG = "externalReferences"
|
67
|
+
|
68
|
+
COREFERENCE_LAYER_TAG = "coreferences"
|
69
|
+
COREFERENCE_ID_ATTRIBUTE = "coid"
|
70
|
+
COREFERENCE_OCCURRENCE_TAG = "coref"
|
71
|
+
|
72
|
+
|
73
|
+
class KafDocument:
|
74
|
+
""" Manage a KAF document.
|
75
|
+
"""
|
76
|
+
valid_word_attributes = ("sent", "para", "offset", "length", "page")
|
77
|
+
valid_external_attributes = ("resource", "reference", "reftype", "status", "source", "confidence")
|
78
|
+
valid_externalRef_attributes = ("resource", "reference")
|
79
|
+
|
80
|
+
def __init__(self, file_name=None, input_stream=None, language=None, version="2.0", header=None):
|
81
|
+
""" Prepare the document basic structure.
|
82
|
+
"""
|
83
|
+
#parser = etree.XMLParser(remove_blank_text=True)
|
84
|
+
|
85
|
+
if file_name:
|
86
|
+
self.tree = etree.parse(file_name)#, parser=parser)
|
87
|
+
self.root = self.tree.getroot()
|
88
|
+
elif input_stream:
|
89
|
+
self.root = etree.fromstring(input_stream)#, parser=parser)
|
90
|
+
self.tree = etree.ElementTree(self.root)
|
91
|
+
else:
|
92
|
+
self.root = etree.Element(KAF_TAG, NS)
|
93
|
+
self.tree = etree.ElementTree(self.root)
|
94
|
+
if language:
|
95
|
+
self.root.attrib[LANGUAGE_ATTRIBUTE] = language
|
96
|
+
|
97
|
+
if version:
|
98
|
+
self.root.set(VERSION_ATTRIBUTE, version)
|
99
|
+
|
100
|
+
headers = self.tree.find(KAF_HEADER_TAG)
|
101
|
+
if headers is not None and len(headers):
|
102
|
+
self.kaf_header = headers
|
103
|
+
else:
|
104
|
+
self.kaf_header = None
|
105
|
+
|
106
|
+
if header:
|
107
|
+
self.set_header(header)
|
108
|
+
|
109
|
+
text_layer = self.tree.find(TEXT_LAYER_TAG)
|
110
|
+
if text_layer is not None and len(text_layer):
|
111
|
+
self.text = text_layer
|
112
|
+
else:
|
113
|
+
self.text = etree.SubElement(self.root, TEXT_LAYER_TAG)
|
114
|
+
|
115
|
+
terms_layer = self.tree.find(TERMS_LAYER_TAG)
|
116
|
+
if text_layer is not None and len(terms_layer):
|
117
|
+
self.terms = terms_layer
|
118
|
+
else:
|
119
|
+
self.terms = None
|
120
|
+
|
121
|
+
dependencies_layer = self.tree.find(DEPENDENCY_LAYER_TAG)
|
122
|
+
if dependencies_layer is not None and len(dependencies_layer):
|
123
|
+
self.dependencies = dependencies_layer
|
124
|
+
else:
|
125
|
+
self.dependencies = None
|
126
|
+
|
127
|
+
chunks_layer = self.tree.find(CHUNKS_LAYER_TAG)
|
128
|
+
if chunks_layer is not None and len(chunks_layer):
|
129
|
+
self.chunks = chunks_layer
|
130
|
+
else:
|
131
|
+
self.chunks = None
|
132
|
+
|
133
|
+
constituency_layer = self.tree.find(CONSTITUENCY_LAYER)
|
134
|
+
if constituency_layer is not None and len(constituency_layer):
|
135
|
+
self.constituency = constituency_layer
|
136
|
+
else:
|
137
|
+
self.constituency = None
|
138
|
+
|
139
|
+
named_entities_layer = self.tree.find(NAMED_ENTITIES_LAYER_TAG)
|
140
|
+
if named_entities_layer is not None and len(named_entities_layer):
|
141
|
+
self.entities = named_entities_layer
|
142
|
+
else:
|
143
|
+
self.entities = None
|
144
|
+
|
145
|
+
coreference_layer = self.tree.find(COREFERENCE_LAYER_TAG)
|
146
|
+
if coreference_layer is not None and len(coreference_layer):
|
147
|
+
self.coreferences = coreference_layer
|
148
|
+
else:
|
149
|
+
self.coreferences = None
|
150
|
+
|
151
|
+
def clear_header(self):
|
152
|
+
self.root.remove(self.kaf_header)
|
153
|
+
self.kaf_header = None
|
154
|
+
|
155
|
+
def set_header(self, kaf_header):
|
156
|
+
if self.kaf_header:
|
157
|
+
for element in kaf_header:
|
158
|
+
self.kaf_header.append(element)
|
159
|
+
self.kaf_header.attrib.update(kaf_header.attrib)
|
160
|
+
else:
|
161
|
+
self.kaf_header = kaf_header
|
162
|
+
self.root.append(self.kaf_header)
|
163
|
+
|
164
|
+
def add_linguistic_processors(self, layer, name, version, time_stamp):
|
165
|
+
if not self.kaf_header:
|
166
|
+
self.kaf_header = etree.SubElement(self.root, KAF_HEADER_TAG)
|
167
|
+
|
168
|
+
layer_find = self.kaf_header.find("./{0}..[@{1}='{2}']".format(LINGUISTIC_PROCESSOR_HEAD, LAYER_ATTRIBUTE, layer))
|
169
|
+
if layer_find:
|
170
|
+
layer = layer_find[0]
|
171
|
+
else:
|
172
|
+
layer = etree.SubElement(self.kaf_header, LINGUISTIC_PROCESSOR_HEAD, {LAYER_ATTRIBUTE: layer})
|
173
|
+
|
174
|
+
etree.SubElement(layer, LINGUISTIC_PROCESSOR_OCCURRENCE_TAG,
|
175
|
+
{NAME_ATTRIBUTE: name, VERSION_ATTRIBUTE: version, TIMESTAMP_ATTRIBUTE: time_stamp})
|
176
|
+
|
177
|
+
def add_word(self, word, wid, **kwargs):
|
178
|
+
"""Add a word to the KAF file.
|
179
|
+
A word have the next parameters/attributes;
|
180
|
+
+ wid: the unique id for the word form.
|
181
|
+
+ sent: sentence id of the token (optional)
|
182
|
+
+ para: paragraph id (optional)
|
183
|
+
+ offset: the offset of the word form (optional)
|
184
|
+
+ length: the length of the original word form (optional)
|
185
|
+
+ page: page id (optional)
|
186
|
+
"""
|
187
|
+
# Prepare the word attributes
|
188
|
+
word_attributes = dict((k, v) for (k, v) in kwargs.iteritems() if k in self.valid_word_attributes)
|
189
|
+
word_attributes[WORD_ID_ATTRIBUTE] = wid
|
190
|
+
# Create a text subnode for the word and set its attributes
|
191
|
+
element = etree.SubElement(self.text, WORD_OCCURRENCE_TAG, word_attributes)
|
192
|
+
try:
|
193
|
+
element.text = word
|
194
|
+
except:
|
195
|
+
element.text = "XXXXXX"
|
196
|
+
return element
|
197
|
+
|
198
|
+
def get_words(self):
|
199
|
+
""" Return all the words in the document"""
|
200
|
+
return self.text[:]
|
201
|
+
|
202
|
+
def get_words_by_id(self, wid):
|
203
|
+
""" Return all the words in the document"""
|
204
|
+
results = self.text.find("{0}[@{1}='{2}']".format(WORD_OCCURRENCE_TAG, WORD_ID_ATTRIBUTE, wid))
|
205
|
+
return results and results[0]
|
206
|
+
|
207
|
+
def add_term(self, tid, pos=None, lemma=None, morphofeat=None, term_type=None, words=(), ner=None,
             external_refs=()):
    """Add a term to the kaf file and return the created element.

    A Term has the next parameters/attributes:
        tid: unique identifier
        type: type of the term. Currently, 3 values are possible:
            + open: open category term
            + close: close category term
        lemma: lemma of the term
        pos: part of speech
        morphofeat: PennTreebank part of speech tag
        words: a list of ids of the bounded words.
        external_refs: A list of dictionaries that contain the external
            references. Each reference must carry every attribute listed in
            self.valid_externalRef_attributes (e.g. resource, reference).

    :raises Exception: when an external reference lacks a mandatory attribute.
    """
    # Lazily create the terms layer on first use.
    if self.terms is None:
        self.terms = etree.SubElement(self.root, TERMS_LAYER_TAG)

    # TODO Complete external references

    # Fix: renamed the misleading local `word_attributes` (these are term
    # attributes) and the reuse of `span` for the external-references node.
    term_attributes = {TERM_ID_ATTRIBUTE: tid}
    if pos:
        term_attributes[POS_ATTRIBUTE] = pos
    if lemma:
        term_attributes[LEMMA_ATTRIBUTE] = lemma
    if term_type:
        term_attributes[TYPE_ATTRIBUTE] = term_type
    if morphofeat:
        term_attributes[MORPHOFEAT_ATTRIBUTE] = morphofeat
    if ner:
        term_attributes[NER_ATTRIBUTE] = ner
    term = etree.SubElement(self.terms, TERM_OCCURRENCE_TAG, term_attributes)
    if words:
        span = etree.SubElement(term, SPAN_TAG)
        for word in words:
            etree.SubElement(span, TARGET_TAG, {TARGET_ID_ATTRIBUTE: word})
    if external_refs:
        refs_node = etree.SubElement(term, EXTERNAL_REFERENCES_TAG)
        for external_ref in external_refs:
            ref_attributes = dict((k, v) for (k, v) in external_ref.iteritems()
                                  if k in self.valid_externalRef_attributes)
            # Fail fast when a mandatory attribute is missing.
            # Fix: `attribute not in ref_attributes` instead of the
            # `not attribute in keys` idiom over a materialised key list.
            for attribute in self.valid_externalRef_attributes:
                if attribute not in ref_attributes:
                    raise Exception("External resource not have {0}".format(attribute))
            etree.SubElement(refs_node, EXTERNAL_REFERENCE_OCCURRENCE_TAG, ref_attributes)
    return term
def get_terms(self):
    """Return every term element of the terms layer."""
    return self.root.findall(TERMS_LAYER_TAG + "/" + TERM_OCCURRENCE_TAG)
def get_terms_words(self, term):
    """Return the target elements spanned by *term* (its span/target children)."""
    return term.findall(SPAN_TAG + "/" + TARGET_TAG)
def get_terms_references(self, term):
    """Return the external-reference elements attached to *term*."""
    return term.findall(EXTERNAL_REFERENCES_TAG + "/" + EXTERNAL_REFERENCE_OCCURRENCE_TAG)
def add_dependency(self, origen, to, rfunc):
    """Add a new dependency relation in the text and return its element.

    :param origen: term id of the source ("from") element.
    :param to: term id of the target element.
    :param rfunc: relational function. One of:
        - mod: indicates the word introducing the dependent in a
          head-modifier relation.
        - subj: indicates the subject in the grammatical relation
          Subject-Predicate.
        - csubj, xsubj, ncsubj: csubj and xsubj may be used for clausal
          subjects, controlled from within, or without, respectively.
          ncsubj is a non-clausal subject.
        - dobj: the object in the relation between a predicate and its
          direct object.
        - iobj: the relation between a predicate and a non-clausal
          complement introduced by a preposition; type indicates the
          preposition introducing the dependent.
        - obj2: the relation between a predicate and the second non-clausal
          complement in ditransitive constructions.
    """
    # Fix: compare against None instead of relying on element truthiness
    # (an element with no children is falsy and would be recreated,
    # duplicating the layer). Also consistent with add_term/add_entity.
    if self.dependencies is None:
        self.dependencies = etree.SubElement(self.root, DEPENDENCY_LAYER_TAG)

    dependency_attributes = {DEPENDENCY_FROM_ATTRIBUTE: origen,
                             DEPENDENCY_TO_ATTRIBUTE: to,
                             DEPENDENCY_FUNCTION_ATTRIBUTE: rfunc}
    return etree.SubElement(self.dependencies, DEPENDENCY_OCCURRENCE_TAG, dependency_attributes)
def get_dependencies(self):
    """Return every dependency element of the dependencies layer."""
    return self.root.findall(DEPENDENCY_LAYER_TAG + "/" + DEPENDENCY_OCCURRENCE_TAG)
def add_chunk(self, cid, head, phrase, case=None, terms=()):
    """Add a chunk to the kaf document and return its element.

    Chunks are noun or prepositional phrases, spanning terms. A chunk has
    the following parameters/attributes:
        + cid: unique identifier
        + head: the chunk head's term id
        + phrase: type of the phrase. Valid values are one of the following:
            - NP: noun phrase
            - VP: verbal phrase
            - PP: prepositional phrase
            - S: sentence
            - O: other
        + case (optional): declension case

    (Fixes the original quadruple-quote docstring typo.)
    """
    # Fix: compare against None instead of element truthiness (an element
    # with no children is falsy), consistent with add_term/add_entity.
    if self.chunks is None:
        self.chunks = etree.SubElement(self.root, CHUNKS_LAYER_TAG)
    # Prepare the attributes.
    chunk_attributes = {CHUNK_ID_ATTRIBUTE: cid, CHUNK_HEAD_ATTRIBUTE: head, CHUNK_PHRASE_ATTRIBUTE: phrase}
    if case:
        chunk_attributes[CHUNK_CASE_ATTRIBUTE] = case
    # Create, and attach, the chunk.
    chunk = etree.SubElement(self.chunks, CHUNK_OCCURRENCE_TAG, chunk_attributes)
    # Add the span terms.
    if terms:
        spans = etree.SubElement(chunk, SPAN_TAG)
        for term in terms:
            etree.SubElement(spans, TARGET_TAG, {TARGET_ID_ATTRIBUTE: term})
    return chunk
def get_chunks(self):
    """Return all the chunks of the text.

    Fix: the original searched the dependencies layer
    (DEPENDENCY_LAYER_TAG/DEPENDENCY_OCCURRENCE_TAG — a copy-paste from
    get_dependencies), so it never returned chunks. Chunks live under the
    chunks layer created by add_chunk.
    """
    return self.root.findall("{0}/{1}".format(CHUNKS_LAYER_TAG, CHUNK_OCCURRENCE_TAG))
def get_chunk_terms(self, chunk):
    """Return the target elements spanned by *chunk*."""
    return chunk.findall(SPAN_TAG + "/" + TARGET_TAG)
def add_entity(self, eid, entity_type, references=()):
    """Add an entity to the document and return its element.

    :param eid: the identification code of the entity.
    :param entity_type: the type of the entity (omitted when falsy).
    :param references: the references (ids of the terms) contained in the
        entity; each reference becomes one span of target elements.
    """
    # Lazily create the named-entities layer on first use.
    if self.entities is None:
        self.entities = etree.SubElement(self.root, NAMED_ENTITIES_LAYER_TAG)

    entity_attributes = {NAMED_ENTITY_ID_ATTRIBUTE: eid}
    if entity_type:
        entity_attributes[NAMED_ENTITY_TYPE_ATTRIBUTE] = entity_type
    entity = etree.SubElement(self.entities, NAMED_ENTITY_OCCURRENCE_TAG, entity_attributes)
    # NOTE(review): literal tag name — presumably the same value as
    # NAMED_ENTITY_REFERENCES_GROUP_TAG used by get_entity_references; confirm.
    references_tag = etree.SubElement(entity, "references")
    for reference in references:
        span = etree.SubElement(references_tag, SPAN_TAG)
        for token in reference:
            etree.SubElement(span, TARGET_TAG, {TARGET_ID_ATTRIBUTE: token})
    return entity
def get_constituency_trees(self):
    """Return every constituency tree in the document."""
    return self.root.findall(CONSTITUENCY_LAYER + "/" + CONSTITUENCY_TREE_TAG)
def get_contituent_tree_non_terminals(self, tree):
    """Return all the non-terminal constituent nodes of *tree*."""
    # (Method name typo "contituent" kept: it is part of the public API.)
    return tree.findall(CONSTITUENCY_NON_TERMINALS)
def get_contituent_tree_terminals(self, tree):
    """Return all the terminal constituent nodes of *tree*."""
    return tree.findall(CONSTITUENCY_TERMINALS)
def get_contituent_tree_edges(self, tree):
    """Return all the edges of *tree*."""
    return tree.findall(CONSTITUENCY_EDGES)
def get_contituent_terminal_words(self, chunk):
    """Return the target elements spanned by a terminal constituent."""
    return chunk.findall(SPAN_TAG + "/" + TARGET_TAG)
def get_entities(self):
    """Return every named entity in the document."""
    return self.root.findall(NAMED_ENTITIES_LAYER_TAG + "/" + NAMED_ENTITY_OCCURRENCE_TAG)
def get_entity_references(self, named_entity):
    """Return the reference span elements of *named_entity*."""
    return named_entity.findall(NAMED_ENTITY_REFERENCES_GROUP_TAG + "/" + SPAN_TAG)
def get_entity_reference_span(self, reference):
    """Return the target elements of one entity reference span."""
    return reference.findall(TARGET_TAG)
def add_coreference(self, coid, references=()):
    """Add a coreference cluster to the document and return its element.

    :param coid: the identification code of the cluster.
    :param references: the references contained in the cluster, as
        (token-id iterable, surface form bytes) pairs.
    """
    # Lazily create the coreference layer on first use.
    if self.coreferences is None:
        self.coreferences = etree.SubElement(self.root, COREFERENCE_LAYER_TAG)

    cluster = etree.SubElement(self.coreferences, COREFERENCE_OCCURRENCE_TAG,
                               {COREFERENCE_ID_ATTRIBUTE: coid})

    for reference, form in references:
        # Pad hyphens, presumably so the surface form can never contain
        # "--", which is illegal inside an XML comment.
        readable = form.decode("utf-8").replace("-", " - ")
        cluster.append(etree.Comment(readable))
        span = etree.SubElement(cluster, SPAN_TAG)
        for token in reference:
            etree.SubElement(span, TARGET_TAG, {TARGET_ID_ATTRIBUTE: token})
    return cluster
def indent(self, elem, level=0):
    """Recursively re-indent *elem* in place for human-readable output.

    Inserts newline-plus-padding into the text/tail of elements whose
    whitespace is empty or missing; existing non-blank text is preserved.
    """
    pad = "\n" + level * " "
    if len(elem):
        if not elem.text or not elem.text.strip():
            elem.text = pad + " "
        if not elem.tail or not elem.tail.strip():
            elem.tail = pad
        for child in elem:
            self.indent(child, level + 1)
        # Intentionally uses `child` after the loop: the LAST child's tail
        # must be dedented back to this element's level so the closing tag
        # lines up (per the original's own comment, this is not an
        # indentation error).
        if not child.tail or not child.tail.strip():
            child.tail = pad
    elif level and (not elem.tail or not elem.tail.strip()):
        elem.tail = pad
def write(self, output, encoding):
    """Serialise the document into *output*.

    :param output: a writable file-like object (anything with ``write``).
    :param encoding: the encoding handed to the XML serialiser.
    """
    # Indent the tree in place first so the serialised XML is readable.
    self.indent(self.root)
    serialized = etree.tostring(self.root, encoding=encoding)
    output.write(serialized)
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: opener-coreference-base
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.0.1
|
4
|
+
version: 2.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- development@olery.com
|
@@ -276,6 +276,10 @@ files:
|
|
276
276
|
- core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.py
|
277
277
|
- core/site-packages/pre_build/VUKafParserPy/KafParserMod.py
|
278
278
|
- core/site-packages/pre_build/VUKafParserPy/__init__.py
|
279
|
+
- core/vendor/dictconfig/__init__.py
|
280
|
+
- core/vendor/dictconfig/dictconfig.py
|
281
|
+
- core/vendor/pykaf/LICENSE.txt
|
282
|
+
- core/vendor/pykaf/__init__.py
|
279
283
|
- ext/hack/Rakefile
|
280
284
|
- ext/hack/support.rb
|
281
285
|
- lib/opener/coreferences/base.rb
|