opener-coreference-base 2.0.1 → 2.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 36e731f68489db9ea8890f201361498f0814e9de
4
- data.tar.gz: 65d7f509e9d0f85eb735b9f7925a41f17be78eb0
3
+ metadata.gz: c35ea079c7c50da42f7bd17cddabf9cb79d1b93b
4
+ data.tar.gz: f836ce7b54e055bae722fda4890cd9a9a7cf8f39
5
5
  SHA512:
6
- metadata.gz: e95ede174f6196081b58a779010e755ab84199636cdde86608218e972e6297be5e4de795b2fae43b8536cff70aa70783bf0c4e3f262428e28d164d43efeff9f7
7
- data.tar.gz: 2fa4a2e030929ff3810eb82a0fa8674bc1433cb026aac02650a2c895c1e22f61430894858ff426b4b71db48ce8a4146bf3c118f9197d49aefbd9ebb0a7111a64
6
+ metadata.gz: e835457582de3c321d41649f19c9c7682351931d03cd8e526ae5aa3b217833b626a386727702b371743ee267d93dbbf1fbea97167d45223e312067412c200169
7
+ data.tar.gz: 67d0658e73efc718196354b34d6ecee09cb52ef2d32f2bea821b008fae8f066a43a3ce5ea967b56421de8e543b2a4867f82a55b69fe0995d5bcc8bf794f53ff4
@@ -0,0 +1 @@
1
+ from dictconfig import *
@@ -0,0 +1,549 @@
1
+ # Copyright 2009-2010 by Vinay Sajip. All Rights Reserved.
2
+ #
3
+ # Permission to use, copy, modify, and distribute this software and its
4
+ # documentation for any purpose and without fee is hereby granted,
5
+ # provided that the above copyright notice appear in all copies and that
6
+ # both that copyright notice and this permission notice appear in
7
+ # supporting documentation, and that the name of Vinay Sajip
8
+ # not be used in advertising or publicity pertaining to distribution
9
+ # of the software without specific, written prior permission.
10
+ # VINAY SAJIP DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING
11
+ # ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL
12
+ # VINAY SAJIP BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR
13
+ # ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
14
+ # IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
15
+ # OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16
+
17
+ import logging.handlers
18
+ import re
19
+ import sys
20
+ import types
21
+
22
# Pattern for a legal ASCII Python identifier; re.I makes the single
# lower-case character classes cover upper-case letters as well.
IDENTIFIER = re.compile('^[a-z_][a-z0-9_]*$', re.I)

def valid_ident(s):
    """Return True if *s* is a valid Python identifier, else raise ValueError."""
    if IDENTIFIER.match(s) is None:
        raise ValueError('Not a valid Python identifier: %r' % s)
    return True
29
+
30
#
# logging._checkLevel is only defined in recent Python versions; import it
# when available, otherwise fall back to a local reimplementation.
#
try:
    from logging import _checkLevel
except ImportError:
    def _checkLevel(level):
        """Validate *level* and return the corresponding numeric level.

        An int is returned unchanged; a level-name string is looked up in
        logging._levelNames.  Raises ValueError for an unknown name and
        TypeError for any other type.
        """
        if isinstance(level, int):
            rv = level
        elif str(level) == level:
            # A genuine string (not merely something str()-able): map the
            # name to its numeric value via the logging module's table.
            if level not in logging._levelNames:
                raise ValueError('Unknown level: %r' % level)
            rv = logging._levelNames[level]
        else:
            raise TypeError('Level not an integer or a '
                            'valid string: %r' % level)
        return rv
47
+
48
+ # The ConvertingXXX classes are wrappers around standard Python containers,
49
+ # and they serve to convert any suitable values in the container. The
50
+ # conversion converts base dicts, lists and tuples to their wrapped
51
+ # equivalents, whereas strings which match a conversion format are converted
52
+ # appropriately.
53
+ #
54
+ # Each wrapper should have a configurator attribute holding the actual
55
+ # configurator to use for conversion.
56
+
57
class ConvertingDict(dict):
    """A dict wrapper that converts suitable values on access.

    The `configurator` attribute (set by the owner) supplies convert().
    """

    def __getitem__(self, key):
        raw = dict.__getitem__(self, key)
        converted = self.configurator.convert(raw)
        if raw is not converted:
            # Cache the converted value so the work happens only once.
            self[key] = converted
            if type(converted) in (ConvertingDict, ConvertingList,
                                   ConvertingTuple):
                converted.parent = self
                converted.key = key
        return converted

    def get(self, key, default=None):
        raw = dict.get(self, key, default)
        converted = self.configurator.convert(raw)
        if raw is not converted:
            # Cache the converted value so the work happens only once.
            self[key] = converted
            if type(converted) in (ConvertingDict, ConvertingList,
                                   ConvertingTuple):
                converted.parent = self
                converted.key = key
        return converted

    def pop(self, key, default=None):
        raw = dict.pop(self, key, default)
        converted = self.configurator.convert(raw)
        if raw is not converted and type(converted) in (
                ConvertingDict, ConvertingList, ConvertingTuple):
            converted.parent = self
            converted.key = key
        return converted
93
+
94
class ConvertingList(list):
    """A list wrapper that converts suitable values on access."""

    def __getitem__(self, key):
        raw = list.__getitem__(self, key)
        converted = self.configurator.convert(raw)
        if raw is not converted:
            # Store the converted value back so it is computed only once.
            self[key] = converted
            if type(converted) in (ConvertingDict, ConvertingList,
                                   ConvertingTuple):
                converted.parent = self
                converted.key = key
        return converted

    def pop(self, idx=-1):
        raw = list.pop(self, idx)
        converted = self.configurator.convert(raw)
        if raw is not converted and type(converted) in (
                ConvertingDict, ConvertingList, ConvertingTuple):
            converted.parent = self
        return converted
116
+
117
class ConvertingTuple(tuple):
    """A tuple wrapper that converts suitable values on access."""

    def __getitem__(self, key):
        raw = tuple.__getitem__(self, key)
        converted = self.configurator.convert(raw)
        # Tuples are immutable, so the converted value cannot be cached
        # back in place; only record the parent/key linkage.
        if raw is not converted and type(converted) in (
                ConvertingDict, ConvertingList, ConvertingTuple):
            converted.parent = self
            converted.key = key
        return converted
128
+
129
class BaseConfigurator(object):
    """
    The configurator base class which defines some useful defaults.

    Holds the configuration (wrapped in a ConvertingDict) and implements
    the 'ext://' (import-and-resolve) and 'cfg://' (config cross-reference)
    string conversion protocols.
    """

    # Matches strings of the form 'prefix://suffix' whose prefix selects an
    # entry in value_converters (e.g. 'ext://sys.stderr').
    CONVERT_PATTERN = re.compile(r'^(?P<prefix>[a-z]+)://(?P<suffix>.*)$')

    # Pieces of the cfg:// mini-language: a leading word, '.word' attribute
    # access, '[key]' indexing, and an all-digits test for index keys.
    WORD_PATTERN = re.compile(r'^\s*(\w+)\s*')
    DOT_PATTERN = re.compile(r'^\.\s*(\w+)\s*')
    INDEX_PATTERN = re.compile(r'^\[\s*(\w+)\s*\]\s*')
    DIGIT_PATTERN = re.compile(r'^\d+$')

    # Maps conversion prefixes to the names of their handler methods.
    value_converters = {
        'ext' : 'ext_convert',
        'cfg' : 'cfg_convert',
    }

    # We might want to use a different one, e.g. importlib
    importer = __import__

    def __init__(self, config):
        # Wrap the raw dict so contained values are converted lazily on access.
        self.config = ConvertingDict(config)
        self.config.configurator = self

    def resolve(self, s):
        """
        Resolve strings to objects using standard import and attribute
        syntax (e.g. 'logging.handlers.MemoryHandler').

        Raises ValueError (with the original cause attached) when the
        dotted path cannot be imported.
        """
        name = s.split('.')
        used = name.pop(0)
        try:
            found = self.importer(used)
            for frag in name:
                used += '.' + frag
                try:
                    found = getattr(found, frag)
                except AttributeError:
                    # The attribute may live in a not-yet-imported
                    # submodule; import it and retry once.
                    self.importer(used)
                    found = getattr(found, frag)
            return found
        except ImportError:
            e, tb = sys.exc_info()[1:]
            v = ValueError('Cannot resolve %r: %s' % (s, e))
            # Keep the original exception and traceback for debugging.
            v.__cause__, v.__traceback__ = e, tb
            raise v

    def ext_convert(self, value):
        """Default converter for the ext:// protocol."""
        return self.resolve(value)

    def cfg_convert(self, value):
        """Default converter for the cfg:// protocol.

        Walks a path such as 'handlers.console[0].level' through
        self.config, honouring '.attr' and '[key]' steps.
        """
        rest = value
        m = self.WORD_PATTERN.match(rest)
        if m is None:
            raise ValueError("Unable to convert %r" % value)
        else:
            rest = rest[m.end():]
            d = self.config[m.groups()[0]]
            while rest:
                m = self.DOT_PATTERN.match(rest)
                if m:
                    d = d[m.groups()[0]]
                else:
                    m = self.INDEX_PATTERN.match(rest)
                    if m:
                        idx = m.groups()[0]
                        if not self.DIGIT_PATTERN.match(idx):
                            d = d[idx]
                        else:
                            try:
                                n = int(idx) # try as number first (most likely)
                                d = d[n]
                            except TypeError:
                                # Container turned out to be string-keyed.
                                d = d[idx]
                if m:
                    rest = rest[m.end():]
                else:
                    raise ValueError('Unable to convert '
                                     '%r at %r' % (value, rest))
        #rest should be empty
        return d

    def convert(self, value):
        """
        Convert values to an appropriate type. dicts, lists and tuples are
        replaced by their converting alternatives. Strings are checked to
        see if they have a conversion format and are converted if they do.
        """
        if not isinstance(value, ConvertingDict) and isinstance(value, dict):
            value = ConvertingDict(value)
            value.configurator = self
        elif not isinstance(value, ConvertingList) and isinstance(value, list):
            value = ConvertingList(value)
            value.configurator = self
        elif not isinstance(value, ConvertingTuple) and\
                 isinstance(value, tuple):
            value = ConvertingTuple(value)
            value.configurator = self
        elif isinstance(value, basestring): # str for py3k
            m = self.CONVERT_PATTERN.match(value)
            if m:
                d = m.groupdict()
                prefix = d['prefix']
                converter = self.value_converters.get(prefix, None)
                if converter:
                    suffix = d['suffix']
                    converter = getattr(self, converter)
                    value = converter(suffix)
        return value

    def configure_custom(self, config):
        """Configure an object with a user-supplied factory.

        The '()' key supplies the factory (callable or dotted path), the
        '.' key an optional dict of attributes to set on the result, and
        the remaining keys become keyword arguments.
        """
        c = config.pop('()')
        # NOTE: types.ClassType only exists on Python 2 (old-style classes);
        # on Python 3 the hasattr() test is False, so a string factory is
        # never resolved by this branch.
        if not hasattr(c, '__call__') and hasattr(types, 'ClassType') and type(c) != types.ClassType:
            c = self.resolve(c)
        props = config.pop('.', None)
        # Check for valid identifiers
        kwargs = dict([(k, config[k]) for k in config if valid_ident(k)])
        result = c(**kwargs)
        if props:
            for name, value in props.items():
                setattr(result, name, value)
        return result

    def as_tuple(self, value):
        """Utility function which converts lists to tuples."""
        if isinstance(value, list):
            value = tuple(value)
        return value
261
+
262
class DictConfigurator(BaseConfigurator):
    """
    Configure logging using a dictionary-like object to describe the
    configuration.

    Expects the dictConfig schema: a mandatory 'version' of 1, optional
    'incremental' and 'disable_existing_loggers' flags, and 'formatters',
    'filters', 'handlers', 'loggers' and 'root' sections.
    """

    def configure(self):
        """Do the configuration."""

        config = self.config
        if 'version' not in config:
            raise ValueError("dictionary doesn't specify a version")
        if config['version'] != 1:
            raise ValueError("Unsupported version: %s" % config['version'])
        incremental = config.pop('incremental', False)
        EMPTY_DICT = {}
        # Hold the module-level logging lock for the whole reconfiguration
        # so other threads never observe a half-configured state.
        logging._acquireLock()
        try:
            if incremental:
                handlers = config.get('handlers', EMPTY_DICT)
                # incremental handler config only if handler name
                # ties in to logging._handlers (Python 2.7)
                if sys.version_info[:2] == (2, 7):
                    for name in handlers:
                        if name not in logging._handlers:
                            raise ValueError('No handler found with '
                                             'name %r' % name)
                        else:
                            try:
                                handler = logging._handlers[name]
                                handler_config = handlers[name]
                                level = handler_config.get('level', None)
                                if level:
                                    handler.setLevel(_checkLevel(level))
                            except StandardError, e:
                                raise ValueError('Unable to configure handler '
                                                 '%r: %s' % (name, e))
                loggers = config.get('loggers', EMPTY_DICT)
                for name in loggers:
                    try:
                        self.configure_logger(name, loggers[name], True)
                    except StandardError, e:
                        raise ValueError('Unable to configure logger '
                                         '%r: %s' % (name, e))
                root = config.get('root', None)
                if root:
                    try:
                        self.configure_root(root, True)
                    except StandardError, e:
                        raise ValueError('Unable to configure root '
                                         'logger: %s' % e)
            else:
                disable_existing = config.pop('disable_existing_loggers', True)

                # Non-incremental: discard every previously registered handler.
                logging._handlers.clear()
                del logging._handlerList[:]

                # Do formatters first - they don't refer to anything else
                formatters = config.get('formatters', EMPTY_DICT)
                for name in formatters:
                    try:
                        formatters[name] = self.configure_formatter(
                            formatters[name])
                    except StandardError, e:
                        raise ValueError('Unable to configure '
                                         'formatter %r: %s' % (name, e))
                # Next, do filters - they don't refer to anything else, either
                filters = config.get('filters', EMPTY_DICT)
                for name in filters:
                    try:
                        filters[name] = self.configure_filter(filters[name])
                    except StandardError, e:
                        raise ValueError('Unable to configure '
                                         'filter %r: %s' % (name, e))

                # Next, do handlers - they refer to formatters and filters
                # As handlers can refer to other handlers, sort the keys
                # to allow a deterministic order of configuration
                handlers = config.get('handlers', EMPTY_DICT)
                for name in sorted(handlers):
                    try:
                        handler = self.configure_handler(handlers[name])
                        handler.name = name
                        handlers[name] = handler
                    except StandardError, e:
                        raise ValueError('Unable to configure handler '
                                         '%r: %s' % (name, e))
                # Next, do loggers - they refer to handlers and filters

                #we don't want to lose the existing loggers,
                #since other threads may have pointers to them.
                #existing is set to contain all existing loggers,
                #and as we go through the new configuration we
                #remove any which are configured. At the end,
                #what's left in existing is the set of loggers
                #which were in the previous configuration but
                #which are not in the new configuration.
                root = logging.root
                existing = root.manager.loggerDict.keys()
                #The list needs to be sorted so that we can
                #avoid disabling child loggers of explicitly
                #named loggers. With a sorted list it is easier
                #to find the child loggers.
                existing.sort()
                #We'll keep the list of existing loggers
                #which are children of named loggers here...
                child_loggers = []
                #now set up the new ones...
                loggers = config.get('loggers', EMPTY_DICT)
                for name in loggers:
                    if name in existing:
                        i = existing.index(name)
                        prefixed = name + "."
                        pflen = len(prefixed)
                        num_existing = len(existing)
                        i = i + 1 # look at the entry after name
                        while (i < num_existing) and\
                              (existing[i][:pflen] == prefixed):
                            child_loggers.append(existing[i])
                            i = i + 1
                        existing.remove(name)
                    try:
                        self.configure_logger(name, loggers[name])
                    except StandardError, e:
                        raise ValueError('Unable to configure logger '
                                         '%r: %s' % (name, e))

                #Disable any old loggers. There's no point deleting
                #them as other threads may continue to hold references
                #and by disabling them, you stop them doing any logging.
                #However, don't disable children of named loggers, as that's
                #probably not what was intended by the user.
                for log in existing:
                    logger = root.manager.loggerDict[log]
                    if log in child_loggers:
                        # Reset children of configured loggers so they
                        # delegate to their (re)configured ancestors.
                        logger.level = logging.NOTSET
                        logger.handlers = []
                        logger.propagate = True
                    elif disable_existing:
                        logger.disabled = True

                # And finally, do the root logger
                root = config.get('root', None)
                if root:
                    try:
                        self.configure_root(root)
                    except StandardError, e:
                        raise ValueError('Unable to configure root '
                                         'logger: %s' % e)
        finally:
            logging._releaseLock()

    def configure_formatter(self, config):
        """Configure a formatter from a dictionary."""
        if '()' in config:
            factory = config['()'] # for use in exception handler
            try:
                result = self.configure_custom(config)
            except TypeError, te:
                if "'format'" not in str(te):
                    raise
                #Name of parameter changed from fmt to format.
                #Retry with old name.
                #This is so that code can be used with older Python versions
                #(e.g. by Django)
                config['fmt'] = config.pop('format')
                config['()'] = factory
                result = self.configure_custom(config)
        else:
            fmt = config.get('format', None)
            dfmt = config.get('datefmt', None)
            result = logging.Formatter(fmt, dfmt)
        return result

    def configure_filter(self, config):
        """Configure a filter from a dictionary."""
        if '()' in config:
            result = self.configure_custom(config)
        else:
            # Default: a plain logging.Filter on the (optional) logger name.
            name = config.get('name', '')
            result = logging.Filter(name)
        return result

    def add_filters(self, filterer, filters):
        """Add filters to a filterer from a list of names."""
        for f in filters:
            try:
                filterer.addFilter(self.config['filters'][f])
            except StandardError, e:
                raise ValueError('Unable to add filter %r: %s' % (f, e))

    def configure_handler(self, config):
        """Configure a handler from a dictionary.

        Resolves the 'formatter', 'level' and 'filters' keys, instantiates
        the handler from a '()' factory or a 'class' dotted path, and wires
        up special-cased keyword arguments for Memory/SMTP/SysLog handlers.
        """
        formatter = config.pop('formatter', None)
        if formatter:
            try:
                formatter = self.config['formatters'][formatter]
            except StandardError, e:
                raise ValueError('Unable to set formatter '
                                 '%r: %s' % (formatter, e))
        level = config.pop('level', None)
        filters = config.pop('filters', None)
        if '()' in config:
            c = config.pop('()')
            # NOTE: types.ClassType is Python 2 only; see configure_custom.
            if not hasattr(c, '__call__') and hasattr(types, 'ClassType') and type(c) != types.ClassType:
                c = self.resolve(c)
            factory = c
        else:
            klass = self.resolve(config.pop('class'))
            #Special case for handler which refers to another handler
            if issubclass(klass, logging.handlers.MemoryHandler) and\
                'target' in config:
                try:
                    config['target'] = self.config['handlers'][config['target']]
                except StandardError, e:
                    raise ValueError('Unable to set target handler '
                                     '%r: %s' % (config['target'], e))
            elif issubclass(klass, logging.handlers.SMTPHandler) and\
                'mailhost' in config:
                config['mailhost'] = self.as_tuple(config['mailhost'])
            elif issubclass(klass, logging.handlers.SysLogHandler) and\
                'address' in config:
                config['address'] = self.as_tuple(config['address'])
            factory = klass
        kwargs = dict([(k, config[k]) for k in config if valid_ident(k)])
        try:
            result = factory(**kwargs)
        except TypeError, te:
            if "'stream'" not in str(te):
                raise
            #The argument name changed from strm to stream
            #Retry with old name.
            #This is so that code can be used with older Python versions
            #(e.g. by Django)
            kwargs['strm'] = kwargs.pop('stream')
            result = factory(**kwargs)
        if formatter:
            result.setFormatter(formatter)
        if level is not None:
            result.setLevel(_checkLevel(level))
        if filters:
            self.add_filters(result, filters)
        return result

    def add_handlers(self, logger, handlers):
        """Add handlers to a logger from a list of names."""
        for h in handlers:
            try:
                logger.addHandler(self.config['handlers'][h])
            except StandardError, e:
                raise ValueError('Unable to add handler %r: %s' % (h, e))

    def common_logger_config(self, logger, config, incremental=False):
        """
        Perform configuration which is common to root and non-root loggers.
        """
        level = config.get('level', None)
        if level is not None:
            logger.setLevel(_checkLevel(level))
        if not incremental:
            #Remove any existing handlers
            for h in logger.handlers[:]:
                logger.removeHandler(h)
            handlers = config.get('handlers', None)
            if handlers:
                self.add_handlers(logger, handlers)
            filters = config.get('filters', None)
            if filters:
                self.add_filters(logger, filters)

    def configure_logger(self, name, config, incremental=False):
        """Configure a non-root logger from a dictionary."""
        logger = logging.getLogger(name)
        self.common_logger_config(logger, config, incremental)
        propagate = config.get('propagate', None)
        if propagate is not None:
            logger.propagate = propagate

    def configure_root(self, config, incremental=False):
        """Configure a root logger from a dictionary."""
        root = logging.getLogger()
        self.common_logger_config(root, config, incremental)
544
+
545
# The class used by dictConfig(); assign a subclass here to customise the
# configuration behaviour without replacing dictConfig itself.
dictConfigClass = DictConfigurator

def dictConfig(config):
    """Configure logging using a dictionary."""
    dictConfigClass(config).configure()
@@ -0,0 +1,13 @@
1
+ Copyright 2013 Josu Bermudez Galbarriatu <josu.bermudez@deusto.es>.
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License");
4
+ you may not use this file except in compliance with the License.
5
+ You may obtain a copy of the License at
6
+
7
+ http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ Unless required by applicable law or agreed to in writing, software
10
+ distributed under the License is distributed on an "AS IS" BASIS,
11
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ See the License for the specific language governing permissions and
13
+ limitations under the License.
@@ -0,0 +1,427 @@
1
+ # coding=utf-8
2
+ """ Module for manage KAF formatted files. """
3
+
4
+ from __future__ import unicode_literals
5
+
6
+ __author__ = 'Josu Bermudez <josu.bermudez@deusto.es>'
7
+
8
+ from lxml import etree
9
+
10
# CONSTANT TEXT VALUES USED TO CONSTRUCT KAF

# Root element and its attributes.
KAF_TAG = "KAF"
LANGUAGE_ATTRIBUTE = "{http://www.w3.org/XML/1998/namespace}lang"
VERSION_ATTRIBUTE = "version"
# Namespace map passed when creating the root element (currently empty).
NS = {}

# kafHeader and its linguisticProcessors bookkeeping.
KAF_HEADER_TAG = "kafHeader"
NAME_ATTRIBUTE = "name"
LINGUISTIC_PROCESSOR_HEAD = "linguisticProcessors"
LAYER_ATTRIBUTE = "layer"
LINGUISTIC_PROCESSOR_OCCURRENCE_TAG = "lp"
TIMESTAMP_ATTRIBUTE = "timestamp"

# Generic span/target markup shared by several layers.
SPAN_TAG = "span"
TARGET_ID_ATTRIBUTE = "id"
TARGET_TAG = "target"

# Text layer: one <wf> element per word form.
TEXT_LAYER_TAG = "text"
WORD_OCCURRENCE_TAG = "wf"
WORD_ID_ATTRIBUTE = "wid"

# Terms layer.
TERMS_LAYER_TAG = "terms"
TERM_OCCURRENCE_TAG = "term"
TERM_ID_ATTRIBUTE = "tid"
NER_ATTRIBUTE = "ner"
TYPE_ATTRIBUTE = "type"
LEMMA_ATTRIBUTE = "lemma"
POS_ATTRIBUTE = "pos"
MORPHOFEAT_ATTRIBUTE = "morphofeat"

# Named-entities layer.
NAMED_ENTITIES_LAYER_TAG = "entities"
NAMED_ENTITY_OCCURRENCE_TAG = "entity"
NAMED_ENTITY_ID_ATTRIBUTE = "eid"
NAMED_ENTITY_TYPE_ATTRIBUTE = "type"
NAMED_ENTITY_REFERENCES_GROUP_TAG = "references"

# Constituency parsing layer.
CONSTITUENCY_LAYER = "constituency"
CONSTITUENCY_TREE_TAG = "tree"
CONSTITUENCY_NON_TERMINALS = "nt"
CONSTITUENCY_TERMINALS = "t"
CONSTITUENCY_EDGES = "edge"

# Chunks layer.
CHUNKS_LAYER_TAG = "chunks"
CHUNK_OCCURRENCE_TAG = "chunk"
CHUNK_CASE_ATTRIBUTE = "case"
CHUNK_PHRASE_ATTRIBUTE = "phrase"
CHUNK_HEAD_ATTRIBUTE = "head"
CHUNK_ID_ATTRIBUTE = "cid"

# Dependencies layer.
DEPENDENCY_LAYER_TAG = "deps"
DEPENDENCY_OCCURRENCE_TAG = "dep"
DEPENDENCY_FROM_ATTRIBUTE = "from"
DEPENDENCY_FUNCTION_ATTRIBUTE = "rfunc"
DEPENDENCY_TO_ATTRIBUTE = "to"

# External references (attached to terms).
EXTERNAL_REFERENCE_OCCURRENCE_TAG = "externalRef"
EXTERNAL_REFERENCES_TAG = "externalReferences"

# Coreference layer.
COREFERENCE_LAYER_TAG = "coreferences"
COREFERENCE_ID_ATTRIBUTE = "coid"
COREFERENCE_OCCURRENCE_TAG = "coref"
71
+
72
+
73
+ class KafDocument:
74
+ """ Manage a KAF document.
75
+ """
76
+ valid_word_attributes = ("sent", "para", "offset", "length", "page")
77
+ valid_external_attributes = ("resource", "reference", "reftype", "status", "source", "confidence")
78
+ valid_externalRef_attributes = ("resource", "reference")
79
+
80
+ def __init__(self, file_name=None, input_stream=None, language=None, version="2.0", header=None):
81
+ """ Prepare the document basic structure.
82
+ """
83
+ #parser = etree.XMLParser(remove_blank_text=True)
84
+
85
+ if file_name:
86
+ self.tree = etree.parse(file_name)#, parser=parser)
87
+ self.root = self.tree.getroot()
88
+ elif input_stream:
89
+ self.root = etree.fromstring(input_stream)#, parser=parser)
90
+ self.tree = etree.ElementTree(self.root)
91
+ else:
92
+ self.root = etree.Element(KAF_TAG, NS)
93
+ self.tree = etree.ElementTree(self.root)
94
+ if language:
95
+ self.root.attrib[LANGUAGE_ATTRIBUTE] = language
96
+
97
+ if version:
98
+ self.root.set(VERSION_ATTRIBUTE, version)
99
+
100
+ headers = self.tree.find(KAF_HEADER_TAG)
101
+ if headers is not None and len(headers):
102
+ self.kaf_header = headers
103
+ else:
104
+ self.kaf_header = None
105
+
106
+ if header:
107
+ self.set_header(header)
108
+
109
+ text_layer = self.tree.find(TEXT_LAYER_TAG)
110
+ if text_layer is not None and len(text_layer):
111
+ self.text = text_layer
112
+ else:
113
+ self.text = etree.SubElement(self.root, TEXT_LAYER_TAG)
114
+
115
+ terms_layer = self.tree.find(TERMS_LAYER_TAG)
116
+ if text_layer is not None and len(terms_layer):
117
+ self.terms = terms_layer
118
+ else:
119
+ self.terms = None
120
+
121
+ dependencies_layer = self.tree.find(DEPENDENCY_LAYER_TAG)
122
+ if dependencies_layer is not None and len(dependencies_layer):
123
+ self.dependencies = dependencies_layer
124
+ else:
125
+ self.dependencies = None
126
+
127
+ chunks_layer = self.tree.find(CHUNKS_LAYER_TAG)
128
+ if chunks_layer is not None and len(chunks_layer):
129
+ self.chunks = chunks_layer
130
+ else:
131
+ self.chunks = None
132
+
133
+ constituency_layer = self.tree.find(CONSTITUENCY_LAYER)
134
+ if constituency_layer is not None and len(constituency_layer):
135
+ self.constituency = constituency_layer
136
+ else:
137
+ self.constituency = None
138
+
139
+ named_entities_layer = self.tree.find(NAMED_ENTITIES_LAYER_TAG)
140
+ if named_entities_layer is not None and len(named_entities_layer):
141
+ self.entities = named_entities_layer
142
+ else:
143
+ self.entities = None
144
+
145
+ coreference_layer = self.tree.find(COREFERENCE_LAYER_TAG)
146
+ if coreference_layer is not None and len(coreference_layer):
147
+ self.coreferences = coreference_layer
148
+ else:
149
+ self.coreferences = None
150
+
151
+ def clear_header(self):
152
+ self.root.remove(self.kaf_header)
153
+ self.kaf_header = None
154
+
155
+ def set_header(self, kaf_header):
156
+ if self.kaf_header:
157
+ for element in kaf_header:
158
+ self.kaf_header.append(element)
159
+ self.kaf_header.attrib.update(kaf_header.attrib)
160
+ else:
161
+ self.kaf_header = kaf_header
162
+ self.root.append(self.kaf_header)
163
+
164
    def add_linguistic_processors(self, layer, name, version, time_stamp):
        """Record a linguistic-processor run in the kafHeader.

        Adds an <lp name=... version=... timestamp=...> entry under the
        <linguisticProcessors layer=...> group for *layer*, creating the
        header and/or the group when they are missing.
        """
        if not self.kaf_header:
            self.kaf_header = etree.SubElement(self.root, KAF_HEADER_TAG)

        # NOTE(review): this ElementPath expression looks malformed — the
        # '..' after the tag name selects the parent element.  Presumably
        # "./linguisticProcessors[@layer='...']" was intended; confirm
        # against real KAF documents before changing it.
        layer_find = self.kaf_header.find("./{0}..[@{1}='{2}']".format(LINGUISTIC_PROCESSOR_HEAD, LAYER_ATTRIBUTE, layer))
        if layer_find:
            # Reuse the first element of the matched group.
            layer = layer_find[0]
        else:
            layer = etree.SubElement(self.kaf_header, LINGUISTIC_PROCESSOR_HEAD, {LAYER_ATTRIBUTE: layer})

        etree.SubElement(layer, LINGUISTIC_PROCESSOR_OCCURRENCE_TAG,
                         {NAME_ATTRIBUTE: name, VERSION_ATTRIBUTE: version, TIMESTAMP_ATTRIBUTE: time_stamp})
176
+
177
+ def add_word(self, word, wid, **kwargs):
178
+ """Add a word to the KAF file.
179
+ A word have the next parameters/attributes;
180
+ + wid: the unique id for the word form.
181
+ + sent: sentence id of the token (optional)
182
+ + para: paragraph id (optional)
183
+ + offset: the offset of the word form (optional)
184
+ + length: the length of the original word form (optional)
185
+ + page: page id (optional)
186
+ """
187
+ # Prepare the word attributes
188
+ word_attributes = dict((k, v) for (k, v) in kwargs.iteritems() if k in self.valid_word_attributes)
189
+ word_attributes[WORD_ID_ATTRIBUTE] = wid
190
+ # Create a text subnode for the word and set its attributes
191
+ element = etree.SubElement(self.text, WORD_OCCURRENCE_TAG, word_attributes)
192
+ try:
193
+ element.text = word
194
+ except:
195
+ element.text = "XXXXXX"
196
+ return element
197
+
198
+ def get_words(self):
199
+ """ Return all the words in the document"""
200
+ return self.text[:]
201
+
202
+ def get_words_by_id(self, wid):
203
+ """ Return all the words in the document"""
204
+ results = self.text.find("{0}[@{1}='{2}']".format(WORD_OCCURRENCE_TAG, WORD_ID_ATTRIBUTE, wid))
205
+ return results and results[0]
206
+
207
+ def add_term(self, tid, pos=None, lemma=None, morphofeat=None, term_type=None, words=(), ner=None,
208
+ external_refs=()):
209
+ """Add a term to the kaf file.
210
+ A Term have the next parameters/attributes:
211
+ tid: unique identifier
212
+ type: type of the term. Currently, 3 values are possible:
213
+ + open: open category term
214
+ + close: close category term
215
+ lemma: lemma of the term
216
+ pos: part of speech
217
+ morphofeat: PennTreebank part of speech tag
218
+ word: a list of id of the bounded words.
219
+ external_ref: A list of dictionaries that contains the external references.
220
+ Each reference have:
221
+ + resource
222
+ + reference
223
+ + INCOMPLETE
224
+ """
225
+ if self.terms is None:
226
+ self.terms = etree.SubElement(self.root, TERMS_LAYER_TAG)
227
+
228
+ #TODO Complete external references
229
+
230
+ word_attributes = {TERM_ID_ATTRIBUTE: tid}
231
+ if pos:
232
+ word_attributes[POS_ATTRIBUTE] = pos
233
+ if lemma:
234
+ word_attributes[LEMMA_ATTRIBUTE] = lemma
235
+ if term_type:
236
+ word_attributes[TYPE_ATTRIBUTE] = term_type
237
+ if morphofeat:
238
+ word_attributes[MORPHOFEAT_ATTRIBUTE] = morphofeat
239
+ if ner:
240
+ word_attributes[NER_ATTRIBUTE] = ner
241
+ term = etree.SubElement(self.terms, TERM_OCCURRENCE_TAG, word_attributes)
242
+ if words:
243
+ span = etree.SubElement(term, SPAN_TAG)
244
+ for word in words:
245
+ etree.SubElement(span, TARGET_TAG, {TARGET_ID_ATTRIBUTE: word})
246
+ if external_refs:
247
+ span = etree.SubElement(term, EXTERNAL_REFERENCES_TAG)
248
+ for external_ref in external_refs:
249
+ ref_attributes = dict((k, v) for (k, v) in external_ref.iteritems()
250
+ if k in self.valid_externalRef_attributes)
251
+ keys = ref_attributes.keys()
252
+ for attribute in self.valid_externalRef_attributes:
253
+ if not attribute in keys:
254
+ raise Exception("External resource not have {0}".format(attribute))
255
+ etree.SubElement(span, EXTERNAL_REFERENCE_OCCURRENCE_TAG, ref_attributes)
256
+ return term
257
+
258
+ def get_terms(self):
259
+ """ Return all the words in the document"""
260
+ return self.root.findall("{0}/{1}".format(TERMS_LAYER_TAG, TERM_OCCURRENCE_TAG))
261
+
262
+ def get_terms_words(self, term):
263
+ return term.findall("{0}/{1}".format(SPAN_TAG, TARGET_TAG))
264
+
265
+ def get_terms_references(self, term):
266
+ return term.findall("{0}/{1}".format(EXTERNAL_REFERENCES_TAG, EXTERNAL_REFERENCE_OCCURRENCE_TAG))
267
+
268
+ def add_dependency(self, origen, to, rfunc):
269
+ """Add a new dependency relation in the text.
270
+ The dependency have the next parameters/attributes:
271
+ + from: term id of the source element
272
+ + to: term id of the target element
273
+ + rfunc: relational function. One of:
274
+ - mod: indicates the word introducing the dependent in a head- modifier relation.
275
+ - subj: indicates the subject in the grammatical relation Subject-Predicate.
276
+ - csubj, xsubj, ncsubj: The Grammatical Relations (RL) csubj and xsubj may be used for clausal
277
+ subjects, controlled from within, or without, respectively. ncsubj is a non-clausal subject.
278
+ - dobj: Indicates the object in the grammatical relation between a predicate and its direct object.
279
+ - iobj: The relation between a predicate and a non-clausal complement introduced by a preposition;
280
+ type indicates the preposition introducing the dependent.
281
+ - obj2: The relation between a predicate and the second non-clausal complement in ditransitive
282
+ constructions.
283
+ """
284
+ if not self.dependencies:
285
+ self.dependencies = etree.SubElement(self.root, DEPENDENCY_LAYER_TAG)
286
+
287
+ dependency_attributes = {DEPENDENCY_FROM_ATTRIBUTE: origen,
288
+ DEPENDENCY_TO_ATTRIBUTE: to,
289
+ DEPENDENCY_FUNCTION_ATTRIBUTE: rfunc}
290
+ return etree.SubElement(self.dependencies, DEPENDENCY_OCCURRENCE_TAG, dependency_attributes)
291
+
292
+ def get_dependencies(self):
293
+ """Return all the words in the document"""
294
+ return self.root.findall("{0}/{1}".format(DEPENDENCY_LAYER_TAG, DEPENDENCY_OCCURRENCE_TAG))
295
+
296
+ def add_chunk(self, cid, head, phrase, case=None, terms=()):
297
+ """"Add a chunk to the kaf document.
298
+ Chunks are noun or prepositional phrases, spanning terms.A chunk have the following parameters/attributes:
299
+ + cid: unique identifier
300
+ + head: the chunk head's term id
301
+ + phrase: typo of the phrase.Valid values for the phrase elements are one of the following:
302
+ - NP: noun phrase
303
+ - VP: verbal phrase
304
+ - PP: prepositional phrase
305
+ - S: sentence
306
+ - O: other
307
+ + case (optional): declension case
308
+ """
309
+ # Secure the root
310
+ if not self.chunks:
311
+ self.chunks = etree.SubElement(self.root, CHUNKS_LAYER_TAG)
312
+ # Prepare the attributes
313
+ chunk_attributes = {CHUNK_ID_ATTRIBUTE: cid, CHUNK_HEAD_ATTRIBUTE: head, CHUNK_PHRASE_ATTRIBUTE: phrase}
314
+ if case:
315
+ chunk_attributes[CHUNK_CASE_ATTRIBUTE] = case
316
+ # Create , and attach, the chunk
317
+ chunk = etree.SubElement(self.chunks, CHUNK_OCCURRENCE_TAG, chunk_attributes)
318
+ # Add the span terms
319
+ if terms:
320
+ spans = etree.SubElement(chunk, SPAN_TAG)
321
+ for term in terms:
322
+ etree.SubElement(spans, TARGET_TAG, {TARGET_ID_ATTRIBUTE: term})
323
+ return chunk
324
+
325
+ def get_chunks(self):
326
+ """Return all the chunks of the text"""
327
+ return self.root.findall("{0}/{1}".format(DEPENDENCY_LAYER_TAG, DEPENDENCY_OCCURRENCE_TAG))
328
+
329
+ def get_chunk_terms(self, chunk):
330
+ """Return all the terms of a chunk."""
331
+ return chunk.findall("{0}/{1}".format(SPAN_TAG, TARGET_TAG))
332
+
333
+ def add_entity(self, eid, entity_type, references=()):
334
+ """ Add a entity in the document.
335
+ :param eid: The identification code of the entity.
336
+ :param references: The references (ids of the terms) contained in the entity.
337
+ :param entity_type: The type of the entity.
338
+ """
339
+
340
+ if self.entities is None:
341
+ self.entities = etree.SubElement(self.root, NAMED_ENTITIES_LAYER_TAG)
342
+
343
+ entity_attributes = {NAMED_ENTITY_ID_ATTRIBUTE: eid}
344
+ if entity_type:
345
+ entity_attributes[NAMED_ENTITY_TYPE_ATTRIBUTE] = entity_type
346
+ entity = etree.SubElement(self.entities, NAMED_ENTITY_OCCURRENCE_TAG, entity_attributes)
347
+ references_tag = etree.SubElement(entity, "references")
348
+ if references:
349
+ for reference in references:
350
+ span = etree.SubElement(references_tag, SPAN_TAG)
351
+ for token in reference:
352
+ etree.SubElement(span, TARGET_TAG, {TARGET_ID_ATTRIBUTE: token})
353
+ return entity
354
+
355
+ def get_constituency_trees(self):
356
+ """Return all the constituency trees in the document"""
357
+ return self.root.findall("{0}/{1}".format(CONSTITUENCY_LAYER, CONSTITUENCY_TREE_TAG))
358
+
359
+ def get_contituent_tree_non_terminals(self, tree):
360
+ """Get al the non terminal constituents of the tree."""
361
+ return tree.findall(CONSTITUENCY_NON_TERMINALS)
362
+
363
+ def get_contituent_tree_terminals(self, tree):
364
+ """Get al the terminal constituents of the tree."""
365
+ return tree.findall(CONSTITUENCY_TERMINALS)
366
+
367
+ def get_contituent_tree_edges(self, tree):
368
+ """Get al the edges of the tree."""
369
+ return tree.findall(CONSTITUENCY_EDGES)
370
+
371
+ def get_contituent_terminal_words(self, chunk):
372
+ """Return all the terms of a terminal constituent."""
373
+ return chunk.findall("{0}/{1}".format(SPAN_TAG, TARGET_TAG))
374
+
375
+ def get_entities(self):
376
+ """Return all the Named Entities in the document"""
377
+ return self.root.findall("{0}/{1}".format(NAMED_ENTITIES_LAYER_TAG, NAMED_ENTITY_OCCURRENCE_TAG))
378
+
379
+ def get_entity_references(self, named_entity):
380
+ """Return all the terms of a Named Entities in the document"""
381
+ return named_entity.findall("{0}/{1}".format(NAMED_ENTITY_REFERENCES_GROUP_TAG, SPAN_TAG))
382
+
383
+ def get_entity_reference_span(self, reference):
384
+ """Return all the terms of a Named Entities in the document"""
385
+ return reference.findall(TARGET_TAG)
386
+
387
+ def add_coreference(self, coid, references=()):
388
+ """ Add a coreference cluster to the document.
389
+ :param coid: The identification code of the cluster.
390
+ :param references: The references contained in the cluster
391
+ """
392
+ if self.coreferences is None:
393
+ self.coreferences = etree.SubElement(self.root, COREFERENCE_LAYER_TAG)
394
+
395
+ coref_attrib = {COREFERENCE_ID_ATTRIBUTE: coid}
396
+ entity = etree.SubElement(self.coreferences, COREFERENCE_OCCURRENCE_TAG, coref_attrib)
397
+
398
+ if references:
399
+ for reference, form in references:
400
+ comment = etree.Comment(form.decode("utf-8").replace("-", " - "))
401
+ entity.append(comment)
402
+ span = etree.SubElement(entity, SPAN_TAG)
403
+ for token in reference:
404
+ etree.SubElement(span, TARGET_TAG, {TARGET_ID_ATTRIBUTE: token})
405
+ return entity
406
+
407
+ def indent(self, elem, level=0):
408
+ i = "\n" + level * " "
409
+ if len(elem):
410
+ if not elem.text or not elem.text.strip():
411
+ elem.text = i + " "
412
+ if not elem.tail or not elem.tail.strip():
413
+ elem.tail = i
414
+ for child in elem:
415
+ self.indent(child, level+1)
416
+ # This seeks for the las child processed in for, is not a code identation error
417
+ if not child.tail or not child.tail.strip():
418
+ child.tail = i
419
+ else:
420
+ if level and (not elem.tail or not elem.tail.strip()):
421
+ elem.tail = i
422
+
423
+ def write(self, output, encoding):
424
+ """Write document into a file.
425
+ :param output: The output target for the document. May be a file type object or a file name."""
426
+ self.indent(self.root)
427
+ output.write(etree.tostring(self.root, encoding=encoding,))#, pretty_print=True, xml_declaration=True, with_comments=True))
@@ -1,7 +1,7 @@
1
1
  module Opener
2
2
  module Coreferences
3
3
  class Base
4
- VERSION = '2.0.1'
4
+ VERSION = '2.0.2'
5
5
  end # Base
6
6
  end # Coreferences
7
7
  end # Opener
@@ -14,7 +14,7 @@ Gem::Specification.new do |gem|
14
14
  gem.files = Dir.glob([
15
15
  'core/corefgraph/**/*',
16
16
  'core/site-packages/pre_build/**/*',
17
- 'vendor/**/*',
17
+ 'core/vendor/**/*',
18
18
  'ext/**/*',
19
19
  'lib/**/*',
20
20
  '*.gemspec',
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: opener-coreference-base
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.0.1
4
+ version: 2.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - development@olery.com
@@ -276,6 +276,10 @@ files:
276
276
  - core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.py
277
277
  - core/site-packages/pre_build/VUKafParserPy/KafParserMod.py
278
278
  - core/site-packages/pre_build/VUKafParserPy/__init__.py
279
+ - core/vendor/dictconfig/__init__.py
280
+ - core/vendor/dictconfig/dictconfig.py
281
+ - core/vendor/pykaf/LICENSE.txt
282
+ - core/vendor/pykaf/__init__.py
279
283
  - ext/hack/Rakefile
280
284
  - ext/hack/support.rb
281
285
  - lib/opener/coreferences/base.rb