opener-coreference-base 2.0.1 → 2.0.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 36e731f68489db9ea8890f201361498f0814e9de
4
- data.tar.gz: 65d7f509e9d0f85eb735b9f7925a41f17be78eb0
3
+ metadata.gz: c35ea079c7c50da42f7bd17cddabf9cb79d1b93b
4
+ data.tar.gz: f836ce7b54e055bae722fda4890cd9a9a7cf8f39
5
5
  SHA512:
6
- metadata.gz: e95ede174f6196081b58a779010e755ab84199636cdde86608218e972e6297be5e4de795b2fae43b8536cff70aa70783bf0c4e3f262428e28d164d43efeff9f7
7
- data.tar.gz: 2fa4a2e030929ff3810eb82a0fa8674bc1433cb026aac02650a2c895c1e22f61430894858ff426b4b71db48ce8a4146bf3c118f9197d49aefbd9ebb0a7111a64
6
+ metadata.gz: e835457582de3c321d41649f19c9c7682351931d03cd8e526ae5aa3b217833b626a386727702b371743ee267d93dbbf1fbea97167d45223e312067412c200169
7
+ data.tar.gz: 67d0658e73efc718196354b34d6ecee09cb52ef2d32f2bea821b008fae8f066a43a3ce5ea967b56421de8e543b2a4867f82a55b69fe0995d5bcc8bf794f53ff4
@@ -0,0 +1 @@
1
+ from dictconfig import *
@@ -0,0 +1,549 @@
1
+ # Copyright 2009-2010 by Vinay Sajip. All Rights Reserved.
2
+ #
3
+ # Permission to use, copy, modify, and distribute this software and its
4
+ # documentation for any purpose and without fee is hereby granted,
5
+ # provided that the above copyright notice appear in all copies and that
6
+ # both that copyright notice and this permission notice appear in
7
+ # supporting documentation, and that the name of Vinay Sajip
8
+ # not be used in advertising or publicity pertaining to distribution
9
+ # of the software without specific, written prior permission.
10
+ # VINAY SAJIP DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING
11
+ # ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL
12
+ # VINAY SAJIP BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR
13
+ # ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
14
+ # IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
15
+ # OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16
+
17
+ import logging.handlers
18
+ import re
19
+ import sys
20
+ import types
21
+
22
# Pattern for a legal (ASCII) Python identifier: a letter or underscore
# followed by letters, digits or underscores.  re.I makes it case-insensitive.
IDENTIFIER = re.compile('^[a-z_][a-z0-9_]*$', re.I)

def valid_ident(s):
    """Return True if *s* is a valid Python identifier; raise ValueError otherwise."""
    if IDENTIFIER.match(s) is None:
        raise ValueError('Not a valid Python identifier: %r' % s)
    return True
29
+
30
+ #
31
+ # This function is defined in logging only in recent versions of Python
32
+ #
33
+ try:
34
+ from logging import _checkLevel
35
+ except ImportError:
36
+ def _checkLevel(level):
37
+ if isinstance(level, int):
38
+ rv = level
39
+ elif str(level) == level:
40
+ if level not in logging._levelNames:
41
+ raise ValueError('Unknown level: %r' % level)
42
+ rv = logging._levelNames[level]
43
+ else:
44
+ raise TypeError('Level not an integer or a '
45
+ 'valid string: %r' % level)
46
+ return rv
47
+
48
+ # The ConvertingXXX classes are wrappers around standard Python containers,
49
+ # and they serve to convert any suitable values in the container. The
50
+ # conversion converts base dicts, lists and tuples to their wrapped
51
+ # equivalents, whereas strings which match a conversion format are converted
52
+ # appropriately.
53
+ #
54
+ # Each wrapper should have a configurator attribute holding the actual
55
+ # configurator to use for conversion.
56
+
57
class ConvertingDict(dict):
    """A dict wrapper that converts values on access.

    Each value read through the wrapper is passed to the attached
    ``configurator``'s ``convert()``.  When conversion yields a new object,
    it is written back so the work happens only once, and converted
    container wrappers are linked to their parent container and key.
    """

    def __getitem__(self, key):
        raw = dict.__getitem__(self, key)
        converted = self.configurator.convert(raw)
        if converted is not raw:
            # Cache the converted value for subsequent lookups.
            self[key] = converted
            if type(converted) in (ConvertingDict, ConvertingList,
                                   ConvertingTuple):
                converted.parent = self
                converted.key = key
        return converted

    def get(self, key, default=None):
        raw = dict.get(self, key, default)
        converted = self.configurator.convert(raw)
        if converted is not raw:
            self[key] = converted
            if type(converted) in (ConvertingDict, ConvertingList,
                                   ConvertingTuple):
                converted.parent = self
                converted.key = key
        return converted

    def pop(self, key, default=None):
        raw = dict.pop(self, key, default)
        converted = self.configurator.convert(raw)
        if converted is not raw:
            # A popped value is deliberately not written back, but the
            # parent/key links are still recorded on wrapped containers.
            if type(converted) in (ConvertingDict, ConvertingList,
                                   ConvertingTuple):
                converted.parent = self
                converted.key = key
        return converted
93
+
94
class ConvertingList(list):
    """A list wrapper that converts items on access (see ConvertingDict)."""

    def __getitem__(self, key):
        raw = list.__getitem__(self, key)
        converted = self.configurator.convert(raw)
        if converted is not raw:
            # Cache the converted value for subsequent lookups.
            self[key] = converted
            if type(converted) in (ConvertingDict, ConvertingList,
                                   ConvertingTuple):
                converted.parent = self
                converted.key = key
        return converted

    def pop(self, idx=-1):
        raw = list.pop(self, idx)
        converted = self.configurator.convert(raw)
        if converted is not raw:
            # Note: unlike __getitem__, pop records only the parent link,
            # not the key (matching the historical behavior).
            if type(converted) in (ConvertingDict, ConvertingList,
                                   ConvertingTuple):
                converted.parent = self
        return converted
116
+
117
class ConvertingTuple(tuple):
    """A tuple wrapper that converts items on access.

    Tuples are immutable, so converted values cannot be cached; the
    conversion is repeated on every access.
    """

    def __getitem__(self, key):
        raw = tuple.__getitem__(self, key)
        converted = self.configurator.convert(raw)
        if converted is not raw:
            if type(converted) in (ConvertingDict, ConvertingList,
                                   ConvertingTuple):
                converted.parent = self
                converted.key = key
        return converted
128
+
129
class BaseConfigurator(object):
    """
    The configurator base class which defines some useful defaults.

    Holds the configuration as a ConvertingDict and implements the
    string-to-object conversion protocols ('ext://' resolves an importable
    name, 'cfg://' looks a value up inside the configuration itself).
    """

    # Matches "prefix://suffix" conversion strings, e.g. "ext://sys.stderr".
    CONVERT_PATTERN = re.compile(r'^(?P<prefix>[a-z]+)://(?P<suffix>.*)$')

    # Tokens of the tiny cfg:// path grammar: word, ".word", "[index]".
    WORD_PATTERN = re.compile(r'^\s*(\w+)\s*')
    DOT_PATTERN = re.compile(r'^\.\s*(\w+)\s*')
    INDEX_PATTERN = re.compile(r'^\[\s*(\w+)\s*\]\s*')
    DIGIT_PATTERN = re.compile(r'^\d+$')

    # Maps a conversion prefix to the name of the method implementing it.
    value_converters = {
        'ext' : 'ext_convert',
        'cfg' : 'cfg_convert',
    }

    # We might want to use a different one, e.g. importlib
    importer = __import__

    def __init__(self, config):
        self.config = ConvertingDict(config)
        self.config.configurator = self

    def resolve(self, s):
        """
        Resolve strings to objects using standard import and attribute
        syntax.

        Imports the longest importable prefix of the dotted path *s* and
        walks the remaining fragments via getattr.  Raises ValueError
        (with the original cause attached) when resolution fails.
        """
        name = s.split('.')
        used = name.pop(0)
        try:
            found = self.importer(used)
            for frag in name:
                used += '.' + frag
                try:
                    found = getattr(found, frag)
                except AttributeError:
                    # The attribute may live in a not-yet-imported submodule:
                    # import it and retry the lookup once.
                    self.importer(used)
                    found = getattr(found, frag)
            return found
        except ImportError:
            e, tb = sys.exc_info()[1:]
            v = ValueError('Cannot resolve %r: %s' % (s, e))
            v.__cause__, v.__traceback__ = e, tb
            raise v

    def ext_convert(self, value):
        """Default converter for the ext:// protocol."""
        return self.resolve(value)

    def cfg_convert(self, value):
        """Default converter for the cfg:// protocol.

        Walks the configuration dict following a path such as
        "handlers.console" or "handlers[console][level]" and returns the
        value found there; raises ValueError on a malformed path.
        """
        rest = value
        m = self.WORD_PATTERN.match(rest)
        if m is None:
            raise ValueError("Unable to convert %r" % value)
        else:
            rest = rest[m.end():]
            d = self.config[m.groups()[0]]
            #print d, rest
            while rest:
                m = self.DOT_PATTERN.match(rest)
                if m:
                    d = d[m.groups()[0]]
                else:
                    m = self.INDEX_PATTERN.match(rest)
                    if m:
                        idx = m.groups()[0]
                        if not self.DIGIT_PATTERN.match(idx):
                            d = d[idx]
                        else:
                            try:
                                n = int(idx) # try as number first (most likely)
                                d = d[n]
                            except TypeError:
                                # Container is keyed by strings after all.
                                d = d[idx]
                if m:
                    rest = rest[m.end():]
                else:
                    raise ValueError('Unable to convert '
                                     '%r at %r' % (value, rest))
        #rest should be empty
        return d

    def convert(self, value):
        """
        Convert values to an appropriate type. dicts, lists and tuples are
        replaced by their converting alternatives. Strings are checked to
        see if they have a conversion format and are converted if they do.
        """
        if not isinstance(value, ConvertingDict) and isinstance(value, dict):
            value = ConvertingDict(value)
            value.configurator = self
        elif not isinstance(value, ConvertingList) and isinstance(value, list):
            value = ConvertingList(value)
            value.configurator = self
        elif not isinstance(value, ConvertingTuple) and\
                 isinstance(value, tuple):
            value = ConvertingTuple(value)
            value.configurator = self
        elif isinstance(value, basestring): # str for py3k
            m = self.CONVERT_PATTERN.match(value)
            if m:
                d = m.groupdict()
                prefix = d['prefix']
                converter = self.value_converters.get(prefix, None)
                if converter:
                    suffix = d['suffix']
                    converter = getattr(self, converter)
                    value = converter(suffix)
        return value

    def configure_custom(self, config):
        """Configure an object with a user-supplied factory.

        The '()' key names the factory (resolved from a string if needed),
        the optional '.' key maps attribute names to values set on the
        result, and every other valid-identifier key becomes a keyword
        argument to the factory call.
        """
        c = config.pop('()')
        if not hasattr(c, '__call__') and hasattr(types, 'ClassType') and type(c) != types.ClassType:
            c = self.resolve(c)
        props = config.pop('.', None)
        # Check for valid identifiers
        kwargs = dict([(k, config[k]) for k in config if valid_ident(k)])
        result = c(**kwargs)
        if props:
            for name, value in props.items():
                setattr(result, name, value)
        return result

    def as_tuple(self, value):
        """Utility function which converts lists to tuples."""
        if isinstance(value, list):
            value = tuple(value)
        return value
261
+
262
class DictConfigurator(BaseConfigurator):
    """
    Configure logging using a dictionary-like object to describe the
    configuration.

    NOTE: written for Python 2 (uses ``except StandardError, e`` syntax and
    private logging internals such as _acquireLock/_handlers/_handlerList).
    """

    def configure(self):
        """Do the configuration.

        Validates the schema version, then either applies an incremental
        update (levels/propagation of existing handlers and loggers only)
        or performs a full reconfiguration of formatters, filters,
        handlers, loggers and the root logger, all under the logging lock.
        Any underlying error is re-raised as ValueError with context.
        """

        config = self.config
        if 'version' not in config:
            raise ValueError("dictionary doesn't specify a version")
        if config['version'] != 1:
            raise ValueError("Unsupported version: %s" % config['version'])
        incremental = config.pop('incremental', False)
        EMPTY_DICT = {}
        logging._acquireLock()
        try:
            if incremental:
                handlers = config.get('handlers', EMPTY_DICT)
                # incremental handler config only if handler name
                # ties in to logging._handlers (Python 2.7)
                if sys.version_info[:2] == (2, 7):
                    for name in handlers:
                        if name not in logging._handlers:
                            raise ValueError('No handler found with '
                                             'name %r' % name)
                        else:
                            try:
                                handler = logging._handlers[name]
                                handler_config = handlers[name]
                                level = handler_config.get('level', None)
                                if level:
                                    handler.setLevel(_checkLevel(level))
                            except StandardError, e:
                                raise ValueError('Unable to configure handler '
                                                 '%r: %s' % (name, e))
                loggers = config.get('loggers', EMPTY_DICT)
                for name in loggers:
                    try:
                        self.configure_logger(name, loggers[name], True)
                    except StandardError, e:
                        raise ValueError('Unable to configure logger '
                                         '%r: %s' % (name, e))
                root = config.get('root', None)
                if root:
                    try:
                        self.configure_root(root, True)
                    except StandardError, e:
                        raise ValueError('Unable to configure root '
                                         'logger: %s' % e)
            else:
                disable_existing = config.pop('disable_existing_loggers', True)

                # Full reconfiguration: drop every registered handler first.
                logging._handlers.clear()
                del logging._handlerList[:]

                # Do formatters first - they don't refer to anything else
                formatters = config.get('formatters', EMPTY_DICT)
                for name in formatters:
                    try:
                        formatters[name] = self.configure_formatter(
                                                            formatters[name])
                    except StandardError, e:
                        raise ValueError('Unable to configure '
                                         'formatter %r: %s' % (name, e))
                # Next, do filters - they don't refer to anything else, either
                filters = config.get('filters', EMPTY_DICT)
                for name in filters:
                    try:
                        filters[name] = self.configure_filter(filters[name])
                    except StandardError, e:
                        raise ValueError('Unable to configure '
                                         'filter %r: %s' % (name, e))

                # Next, do handlers - they refer to formatters and filters
                # As handlers can refer to other handlers, sort the keys
                # to allow a deterministic order of configuration
                handlers = config.get('handlers', EMPTY_DICT)
                for name in sorted(handlers):
                    try:
                        handler = self.configure_handler(handlers[name])
                        handler.name = name
                        handlers[name] = handler
                    except StandardError, e:
                        raise ValueError('Unable to configure handler '
                                         '%r: %s' % (name, e))
                # Next, do loggers - they refer to handlers and filters

                #we don't want to lose the existing loggers,
                #since other threads may have pointers to them.
                #existing is set to contain all existing loggers,
                #and as we go through the new configuration we
                #remove any which are configured. At the end,
                #what's left in existing is the set of loggers
                #which were in the previous configuration but
                #which are not in the new configuration.
                root = logging.root
                existing = root.manager.loggerDict.keys()
                #The list needs to be sorted so that we can
                #avoid disabling child loggers of explicitly
                #named loggers. With a sorted list it is easier
                #to find the child loggers.
                existing.sort()
                #We'll keep the list of existing loggers
                #which are children of named loggers here...
                child_loggers = []
                #now set up the new ones...
                loggers = config.get('loggers', EMPTY_DICT)
                for name in loggers:
                    if name in existing:
                        i = existing.index(name)
                        prefixed = name + "."
                        pflen = len(prefixed)
                        num_existing = len(existing)
                        i = i + 1 # look at the entry after name
                        while (i < num_existing) and\
                              (existing[i][:pflen] == prefixed):
                            child_loggers.append(existing[i])
                            i = i + 1
                        existing.remove(name)
                    try:
                        self.configure_logger(name, loggers[name])
                    except StandardError, e:
                        raise ValueError('Unable to configure logger '
                                         '%r: %s' % (name, e))

                #Disable any old loggers. There's no point deleting
                #them as other threads may continue to hold references
                #and by disabling them, you stop them doing any logging.
                #However, don't disable children of named loggers, as that's
                #probably not what was intended by the user.
                for log in existing:
                    logger = root.manager.loggerDict[log]
                    if log in child_loggers:
                        logger.level = logging.NOTSET
                        logger.handlers = []
                        logger.propagate = True
                    elif disable_existing:
                        logger.disabled = True

                # And finally, do the root logger
                root = config.get('root', None)
                if root:
                    try:
                        self.configure_root(root)
                    except StandardError, e:
                        raise ValueError('Unable to configure root '
                                         'logger: %s' % e)
        finally:
            logging._releaseLock()

    def configure_formatter(self, config):
        """Configure a formatter from a dictionary.

        Supports a custom factory via '()' (retrying with the legacy 'fmt'
        parameter name for older Formatter signatures), otherwise builds a
        plain logging.Formatter from 'format'/'datefmt'.
        """
        if '()' in config:
            factory = config['()'] # for use in exception handler
            try:
                result = self.configure_custom(config)
            except TypeError, te:
                if "'format'" not in str(te):
                    raise
                #Name of parameter changed from fmt to format.
                #Retry with old name.
                #This is so that code can be used with older Python versions
                #(e.g. by Django)
                config['fmt'] = config.pop('format')
                config['()'] = factory
                result = self.configure_custom(config)
        else:
            fmt = config.get('format', None)
            dfmt = config.get('datefmt', None)
            result = logging.Formatter(fmt, dfmt)
        return result

    def configure_filter(self, config):
        """Configure a filter from a dictionary."""
        if '()' in config:
            result = self.configure_custom(config)
        else:
            name = config.get('name', '')
            result = logging.Filter(name)
        return result

    def add_filters(self, filterer, filters):
        """Add filters to a filterer from a list of names."""
        for f in filters:
            try:
                filterer.addFilter(self.config['filters'][f])
            except StandardError, e:
                raise ValueError('Unable to add filter %r: %s' % (f, e))

    def configure_handler(self, config):
        """Configure a handler from a dictionary.

        Resolves formatter/filters by name, instantiates either a custom
        factory ('()') or the named handler 'class' (with special handling
        for MemoryHandler targets and SMTP/syslog address tuples), then
        applies level, formatter and filters to the result.
        """
        formatter = config.pop('formatter', None)
        if formatter:
            try:
                formatter = self.config['formatters'][formatter]
            except StandardError, e:
                raise ValueError('Unable to set formatter '
                                 '%r: %s' % (formatter, e))
        level = config.pop('level', None)
        filters = config.pop('filters', None)
        if '()' in config:
            c = config.pop('()')
            if not hasattr(c, '__call__') and hasattr(types, 'ClassType') and type(c) != types.ClassType:
                c = self.resolve(c)
            factory = c
        else:
            klass = self.resolve(config.pop('class'))
            #Special case for handler which refers to another handler
            if issubclass(klass, logging.handlers.MemoryHandler) and\
                'target' in config:
                try:
                    config['target'] = self.config['handlers'][config['target']]
                except StandardError, e:
                    raise ValueError('Unable to set target handler '
                                     '%r: %s' % (config['target'], e))
            elif issubclass(klass, logging.handlers.SMTPHandler) and\
                'mailhost' in config:
                config['mailhost'] = self.as_tuple(config['mailhost'])
            elif issubclass(klass, logging.handlers.SysLogHandler) and\
                'address' in config:
                config['address'] = self.as_tuple(config['address'])
            factory = klass
        kwargs = dict([(k, config[k]) for k in config if valid_ident(k)])
        try:
            result = factory(**kwargs)
        except TypeError, te:
            if "'stream'" not in str(te):
                raise
            #The argument name changed from strm to stream
            #Retry with old name.
            #This is so that code can be used with older Python versions
            #(e.g. by Django)
            kwargs['strm'] = kwargs.pop('stream')
            result = factory(**kwargs)
        if formatter:
            result.setFormatter(formatter)
        if level is not None:
            result.setLevel(_checkLevel(level))
        if filters:
            self.add_filters(result, filters)
        return result

    def add_handlers(self, logger, handlers):
        """Add handlers to a logger from a list of names."""
        for h in handlers:
            try:
                logger.addHandler(self.config['handlers'][h])
            except StandardError, e:
                raise ValueError('Unable to add handler %r: %s' % (h, e))

    def common_logger_config(self, logger, config, incremental=False):
        """
        Perform configuration which is common to root and non-root loggers.

        Sets the level; on a full (non-incremental) pass also replaces the
        logger's handlers and filters with the configured ones.
        """
        level = config.get('level', None)
        if level is not None:
            logger.setLevel(_checkLevel(level))
        if not incremental:
            #Remove any existing handlers
            for h in logger.handlers[:]:
                logger.removeHandler(h)
            handlers = config.get('handlers', None)
            if handlers:
                self.add_handlers(logger, handlers)
            filters = config.get('filters', None)
            if filters:
                self.add_filters(logger, filters)

    def configure_logger(self, name, config, incremental=False):
        """Configure a non-root logger from a dictionary."""
        logger = logging.getLogger(name)
        self.common_logger_config(logger, config, incremental)
        propagate = config.get('propagate', None)
        if propagate is not None:
            logger.propagate = propagate

    def configure_root(self, config, incremental=False):
        """Configure a root logger from a dictionary."""
        root = logging.getLogger()
        self.common_logger_config(root, config, incremental)
544
+
545
# The configurator class used by dictConfig(); exposed so callers can
# substitute a subclass with customized behavior.
dictConfigClass = DictConfigurator

def dictConfig(config):
    """Configure logging using a dictionary."""
    dictConfigClass(config).configure()
@@ -0,0 +1,13 @@
1
+ Copyright 2013 Josu Bermudez Galbarriatu <josu.bermudez@deusto.es>.
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License");
4
+ you may not use this file except in compliance with the License.
5
+ You may obtain a copy of the License at
6
+
7
+ http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ Unless required by applicable law or agreed to in writing, software
10
+ distributed under the License is distributed on an "AS IS" BASIS,
11
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ See the License for the specific language governing permissions and
13
+ limitations under the License.
@@ -0,0 +1,427 @@
1
+ # coding=utf-8
2
+ """ Module for manage KAF formatted files. """
3
+
4
+ from __future__ import unicode_literals
5
+
6
+ __author__ = 'Josu Bermudez <josu.bermudez@deusto.es>'
7
+
8
+ from lxml import etree
9
+
10
+ # CONSTANT TEXT VALUES USED TO CONSTRUCT KAF
11
+ KAF_TAG = "KAF"
12
+ LANGUAGE_ATTRIBUTE = "{http://www.w3.org/XML/1998/namespace}lang"
13
+ VERSION_ATTRIBUTE = "version"
14
+ NS = {}
15
+
16
+ KAF_HEADER_TAG = "kafHeader"
17
+ NAME_ATTRIBUTE = "name"
18
+ LINGUISTIC_PROCESSOR_HEAD = "linguisticProcessors"
19
+ LAYER_ATTRIBUTE = "layer"
20
+ LINGUISTIC_PROCESSOR_OCCURRENCE_TAG = "lp"
21
+ TIMESTAMP_ATTRIBUTE = "timestamp"
22
+
23
+ SPAN_TAG = "span"
24
+ TARGET_ID_ATTRIBUTE = "id"
25
+ TARGET_TAG = "target"
26
+
27
+ TEXT_LAYER_TAG = "text"
28
+ WORD_OCCURRENCE_TAG = "wf"
29
+ WORD_ID_ATTRIBUTE = "wid"
30
+
31
+ TERMS_LAYER_TAG = "terms"
32
+ TERM_OCCURRENCE_TAG = "term"
33
+ TERM_ID_ATTRIBUTE = "tid"
34
+ NER_ATTRIBUTE = "ner"
35
+ TYPE_ATTRIBUTE = "type"
36
+ LEMMA_ATTRIBUTE = "lemma"
37
+ POS_ATTRIBUTE = "pos"
38
+ MORPHOFEAT_ATTRIBUTE = "morphofeat"
39
+
40
+ NAMED_ENTITIES_LAYER_TAG = "entities"
41
+ NAMED_ENTITY_OCCURRENCE_TAG = "entity"
42
+ NAMED_ENTITY_ID_ATTRIBUTE = "eid"
43
+ NAMED_ENTITY_TYPE_ATTRIBUTE = "type"
44
+ NAMED_ENTITY_REFERENCES_GROUP_TAG = "references"
45
+
46
+ CONSTITUENCY_LAYER = "constituency"
47
+ CONSTITUENCY_TREE_TAG = "tree"
48
+ CONSTITUENCY_NON_TERMINALS = "nt"
49
+ CONSTITUENCY_TERMINALS = "t"
50
+ CONSTITUENCY_EDGES = "edge"
51
+
52
+ CHUNKS_LAYER_TAG = "chunks"
53
+ CHUNK_OCCURRENCE_TAG = "chunk"
54
+ CHUNK_CASE_ATTRIBUTE = "case"
55
+ CHUNK_PHRASE_ATTRIBUTE = "phrase"
56
+ CHUNK_HEAD_ATTRIBUTE = "head"
57
+ CHUNK_ID_ATTRIBUTE = "cid"
58
+
59
+ DEPENDENCY_LAYER_TAG = "deps"
60
+ DEPENDENCY_OCCURRENCE_TAG = "dep"
61
+ DEPENDENCY_FROM_ATTRIBUTE = "from"
62
+ DEPENDENCY_FUNCTION_ATTRIBUTE = "rfunc"
63
+ DEPENDENCY_TO_ATTRIBUTE = "to"
64
+
65
+ EXTERNAL_REFERENCE_OCCURRENCE_TAG = "externalRef"
66
+ EXTERNAL_REFERENCES_TAG = "externalReferences"
67
+
68
+ COREFERENCE_LAYER_TAG = "coreferences"
69
+ COREFERENCE_ID_ATTRIBUTE = "coid"
70
+ COREFERENCE_OCCURRENCE_TAG = "coref"
71
+
72
+
73
+ class KafDocument:
74
+ """ Manage a KAF document.
75
+ """
76
+ valid_word_attributes = ("sent", "para", "offset", "length", "page")
77
+ valid_external_attributes = ("resource", "reference", "reftype", "status", "source", "confidence")
78
+ valid_externalRef_attributes = ("resource", "reference")
79
+
80
+ def __init__(self, file_name=None, input_stream=None, language=None, version="2.0", header=None):
81
+ """ Prepare the document basic structure.
82
+ """
83
+ #parser = etree.XMLParser(remove_blank_text=True)
84
+
85
+ if file_name:
86
+ self.tree = etree.parse(file_name)#, parser=parser)
87
+ self.root = self.tree.getroot()
88
+ elif input_stream:
89
+ self.root = etree.fromstring(input_stream)#, parser=parser)
90
+ self.tree = etree.ElementTree(self.root)
91
+ else:
92
+ self.root = etree.Element(KAF_TAG, NS)
93
+ self.tree = etree.ElementTree(self.root)
94
+ if language:
95
+ self.root.attrib[LANGUAGE_ATTRIBUTE] = language
96
+
97
+ if version:
98
+ self.root.set(VERSION_ATTRIBUTE, version)
99
+
100
+ headers = self.tree.find(KAF_HEADER_TAG)
101
+ if headers is not None and len(headers):
102
+ self.kaf_header = headers
103
+ else:
104
+ self.kaf_header = None
105
+
106
+ if header:
107
+ self.set_header(header)
108
+
109
+ text_layer = self.tree.find(TEXT_LAYER_TAG)
110
+ if text_layer is not None and len(text_layer):
111
+ self.text = text_layer
112
+ else:
113
+ self.text = etree.SubElement(self.root, TEXT_LAYER_TAG)
114
+
115
+ terms_layer = self.tree.find(TERMS_LAYER_TAG)
116
+ if text_layer is not None and len(terms_layer):
117
+ self.terms = terms_layer
118
+ else:
119
+ self.terms = None
120
+
121
+ dependencies_layer = self.tree.find(DEPENDENCY_LAYER_TAG)
122
+ if dependencies_layer is not None and len(dependencies_layer):
123
+ self.dependencies = dependencies_layer
124
+ else:
125
+ self.dependencies = None
126
+
127
+ chunks_layer = self.tree.find(CHUNKS_LAYER_TAG)
128
+ if chunks_layer is not None and len(chunks_layer):
129
+ self.chunks = chunks_layer
130
+ else:
131
+ self.chunks = None
132
+
133
+ constituency_layer = self.tree.find(CONSTITUENCY_LAYER)
134
+ if constituency_layer is not None and len(constituency_layer):
135
+ self.constituency = constituency_layer
136
+ else:
137
+ self.constituency = None
138
+
139
+ named_entities_layer = self.tree.find(NAMED_ENTITIES_LAYER_TAG)
140
+ if named_entities_layer is not None and len(named_entities_layer):
141
+ self.entities = named_entities_layer
142
+ else:
143
+ self.entities = None
144
+
145
+ coreference_layer = self.tree.find(COREFERENCE_LAYER_TAG)
146
+ if coreference_layer is not None and len(coreference_layer):
147
+ self.coreferences = coreference_layer
148
+ else:
149
+ self.coreferences = None
150
+
151
+ def clear_header(self):
152
+ self.root.remove(self.kaf_header)
153
+ self.kaf_header = None
154
+
155
+ def set_header(self, kaf_header):
156
+ if self.kaf_header:
157
+ for element in kaf_header:
158
+ self.kaf_header.append(element)
159
+ self.kaf_header.attrib.update(kaf_header.attrib)
160
+ else:
161
+ self.kaf_header = kaf_header
162
+ self.root.append(self.kaf_header)
163
+
164
    def add_linguistic_processors(self, layer, name, version, time_stamp):
        """Register a linguistic processor (lp) entry in the kafHeader.

        :param layer: name of the layer the processor produced.
        :param name: processor name.
        :param version: processor version string.
        :param time_stamp: processing timestamp string.
        """
        if not self.kaf_header:
            self.kaf_header = etree.SubElement(self.root, KAF_HEADER_TAG)

        # NOTE(review): the ".." in this path and the "[0]" indexing below
        # look suspicious — find() returns a single element (whose [0] is its
        # first child), not a result list.  Presumably the intent is "the
        # linguisticProcessors element with this layer attribute"; verify.
        layer_find = self.kaf_header.find("./{0}..[@{1}='{2}']".format(LINGUISTIC_PROCESSOR_HEAD, LAYER_ATTRIBUTE, layer))
        if layer_find:
            layer = layer_find[0]
        else:
            layer = etree.SubElement(self.kaf_header, LINGUISTIC_PROCESSOR_HEAD, {LAYER_ATTRIBUTE: layer})

        etree.SubElement(layer, LINGUISTIC_PROCESSOR_OCCURRENCE_TAG,
                         {NAME_ATTRIBUTE: name, VERSION_ATTRIBUTE: version, TIMESTAMP_ATTRIBUTE: time_stamp})
176
+
177
+ def add_word(self, word, wid, **kwargs):
178
+ """Add a word to the KAF file.
179
+ A word have the next parameters/attributes;
180
+ + wid: the unique id for the word form.
181
+ + sent: sentence id of the token (optional)
182
+ + para: paragraph id (optional)
183
+ + offset: the offset of the word form (optional)
184
+ + length: the length of the original word form (optional)
185
+ + page: page id (optional)
186
+ """
187
+ # Prepare the word attributes
188
+ word_attributes = dict((k, v) for (k, v) in kwargs.iteritems() if k in self.valid_word_attributes)
189
+ word_attributes[WORD_ID_ATTRIBUTE] = wid
190
+ # Create a text subnode for the word and set its attributes
191
+ element = etree.SubElement(self.text, WORD_OCCURRENCE_TAG, word_attributes)
192
+ try:
193
+ element.text = word
194
+ except:
195
+ element.text = "XXXXXX"
196
+ return element
197
+
198
+ def get_words(self):
199
+ """ Return all the words in the document"""
200
+ return self.text[:]
201
+
202
+ def get_words_by_id(self, wid):
203
+ """ Return all the words in the document"""
204
+ results = self.text.find("{0}[@{1}='{2}']".format(WORD_OCCURRENCE_TAG, WORD_ID_ATTRIBUTE, wid))
205
+ return results and results[0]
206
+
207
    def add_term(self, tid, pos=None, lemma=None, morphofeat=None, term_type=None, words=(), ner=None,
                 external_refs=()):
        """Add a term to the kaf file.
        A Term have the next parameters/attributes:
            tid: unique identifier
            type: type of the term. Currently, 3 values are possible:
                + open: open category term
                + close: close category term
            lemma: lemma of the term
            pos: part of speech
            morphofeat: PennTreebank part of speech tag
            word: a list of id of the bounded words.
            external_ref: A list of dictionaries that contains the external references.
                Each reference have:
                    + resource
                    + reference
                    + INCOMPLETE
        """
        # Create the terms layer lazily on first use.
        if self.terms is None:
            self.terms = etree.SubElement(self.root, TERMS_LAYER_TAG)

        #TODO Complete external references

        # Only attributes with a truthy value are emitted on the <term>.
        word_attributes = {TERM_ID_ATTRIBUTE: tid}
        if pos:
            word_attributes[POS_ATTRIBUTE] = pos
        if lemma:
            word_attributes[LEMMA_ATTRIBUTE] = lemma
        if term_type:
            word_attributes[TYPE_ATTRIBUTE] = term_type
        if morphofeat:
            word_attributes[MORPHOFEAT_ATTRIBUTE] = morphofeat
        if ner:
            word_attributes[NER_ATTRIBUTE] = ner
        term = etree.SubElement(self.terms, TERM_OCCURRENCE_TAG, word_attributes)
        if words:
            # The span lists the word ids this term covers.
            span = etree.SubElement(term, SPAN_TAG)
            for word in words:
                etree.SubElement(span, TARGET_TAG, {TARGET_ID_ATTRIBUTE: word})
        if external_refs:
            # NOTE(review): the name "span" is reused here for the
            # externalReferences container element; it is unrelated to the
            # word span above.
            span = etree.SubElement(term, EXTERNAL_REFERENCES_TAG)
            for external_ref in external_refs:
                ref_attributes = dict((k, v) for (k, v) in external_ref.iteritems()
                                      if k in self.valid_externalRef_attributes)
                keys = ref_attributes.keys()
                # Every attribute in valid_externalRef_attributes is mandatory.
                for attribute in self.valid_externalRef_attributes:
                    if not attribute in keys:
                        raise Exception("External resource not have {0}".format(attribute))
                etree.SubElement(span, EXTERNAL_REFERENCE_OCCURRENCE_TAG, ref_attributes)
        return term
257
+
258
+ def get_terms(self):
259
+ """ Return all the words in the document"""
260
+ return self.root.findall("{0}/{1}".format(TERMS_LAYER_TAG, TERM_OCCURRENCE_TAG))
261
+
262
+ def get_terms_words(self, term):
263
+ return term.findall("{0}/{1}".format(SPAN_TAG, TARGET_TAG))
264
+
265
+ def get_terms_references(self, term):
266
+ return term.findall("{0}/{1}".format(EXTERNAL_REFERENCES_TAG, EXTERNAL_REFERENCE_OCCURRENCE_TAG))
267
+
268
+ def add_dependency(self, origen, to, rfunc):
269
+ """Add a new dependency relation in the text.
270
+ The dependency have the next parameters/attributes:
271
+ + from: term id of the source element
272
+ + to: term id of the target element
273
+ + rfunc: relational function. One of:
274
+ - mod: indicates the word introducing the dependent in a head- modifier relation.
275
+ - subj: indicates the subject in the grammatical relation Subject-Predicate.
276
+ - csubj, xsubj, ncsubj: The Grammatical Relations (RL) csubj and xsubj may be used for clausal
277
+ subjects, controlled from within, or without, respectively. ncsubj is a non-clausal subject.
278
+ - dobj: Indicates the object in the grammatical relation between a predicate and its direct object.
279
+ - iobj: The relation between a predicate and a non-clausal complement introduced by a preposition;
280
+ type indicates the preposition introducing the dependent.
281
+ - obj2: The relation between a predicate and the second non-clausal complement in ditransitive
282
+ constructions.
283
+ """
284
+ if not self.dependencies:
285
+ self.dependencies = etree.SubElement(self.root, DEPENDENCY_LAYER_TAG)
286
+
287
+ dependency_attributes = {DEPENDENCY_FROM_ATTRIBUTE: origen,
288
+ DEPENDENCY_TO_ATTRIBUTE: to,
289
+ DEPENDENCY_FUNCTION_ATTRIBUTE: rfunc}
290
+ return etree.SubElement(self.dependencies, DEPENDENCY_OCCURRENCE_TAG, dependency_attributes)
291
+
292
+ def get_dependencies(self):
293
+ """Return all the words in the document"""
294
+ return self.root.findall("{0}/{1}".format(DEPENDENCY_LAYER_TAG, DEPENDENCY_OCCURRENCE_TAG))
295
+
296
+ def add_chunk(self, cid, head, phrase, case=None, terms=()):
297
+ """"Add a chunk to the kaf document.
298
+ Chunks are noun or prepositional phrases, spanning terms.A chunk have the following parameters/attributes:
299
+ + cid: unique identifier
300
+ + head: the chunk head's term id
301
+ + phrase: typo of the phrase.Valid values for the phrase elements are one of the following:
302
+ - NP: noun phrase
303
+ - VP: verbal phrase
304
+ - PP: prepositional phrase
305
+ - S: sentence
306
+ - O: other
307
+ + case (optional): declension case
308
+ """
309
+ # Secure the root
310
+ if not self.chunks:
311
+ self.chunks = etree.SubElement(self.root, CHUNKS_LAYER_TAG)
312
+ # Prepare the attributes
313
+ chunk_attributes = {CHUNK_ID_ATTRIBUTE: cid, CHUNK_HEAD_ATTRIBUTE: head, CHUNK_PHRASE_ATTRIBUTE: phrase}
314
+ if case:
315
+ chunk_attributes[CHUNK_CASE_ATTRIBUTE] = case
316
+ # Create , and attach, the chunk
317
+ chunk = etree.SubElement(self.chunks, CHUNK_OCCURRENCE_TAG, chunk_attributes)
318
+ # Add the span terms
319
+ if terms:
320
+ spans = etree.SubElement(chunk, SPAN_TAG)
321
+ for term in terms:
322
+ etree.SubElement(spans, TARGET_TAG, {TARGET_ID_ATTRIBUTE: term})
323
+ return chunk
324
+
325
+ def get_chunks(self):
326
+ """Return all the chunks of the text"""
327
+ return self.root.findall("{0}/{1}".format(DEPENDENCY_LAYER_TAG, DEPENDENCY_OCCURRENCE_TAG))
328
+
329
+ def get_chunk_terms(self, chunk):
330
+ """Return all the terms of a chunk."""
331
+ return chunk.findall("{0}/{1}".format(SPAN_TAG, TARGET_TAG))
332
+
333
+ def add_entity(self, eid, entity_type, references=()):
334
+ """ Add a entity in the document.
335
+ :param eid: The identification code of the entity.
336
+ :param references: The references (ids of the terms) contained in the entity.
337
+ :param entity_type: The type of the entity.
338
+ """
339
+
340
+ if self.entities is None:
341
+ self.entities = etree.SubElement(self.root, NAMED_ENTITIES_LAYER_TAG)
342
+
343
+ entity_attributes = {NAMED_ENTITY_ID_ATTRIBUTE: eid}
344
+ if entity_type:
345
+ entity_attributes[NAMED_ENTITY_TYPE_ATTRIBUTE] = entity_type
346
+ entity = etree.SubElement(self.entities, NAMED_ENTITY_OCCURRENCE_TAG, entity_attributes)
347
+ references_tag = etree.SubElement(entity, "references")
348
+ if references:
349
+ for reference in references:
350
+ span = etree.SubElement(references_tag, SPAN_TAG)
351
+ for token in reference:
352
+ etree.SubElement(span, TARGET_TAG, {TARGET_ID_ATTRIBUTE: token})
353
+ return entity
354
+
355
+ def get_constituency_trees(self):
356
+ """Return all the constituency trees in the document"""
357
+ return self.root.findall("{0}/{1}".format(CONSTITUENCY_LAYER, CONSTITUENCY_TREE_TAG))
358
+
359
+ def get_contituent_tree_non_terminals(self, tree):
360
+ """Get al the non terminal constituents of the tree."""
361
+ return tree.findall(CONSTITUENCY_NON_TERMINALS)
362
+
363
+ def get_contituent_tree_terminals(self, tree):
364
+ """Get al the terminal constituents of the tree."""
365
+ return tree.findall(CONSTITUENCY_TERMINALS)
366
+
367
+ def get_contituent_tree_edges(self, tree):
368
+ """Get al the edges of the tree."""
369
+ return tree.findall(CONSTITUENCY_EDGES)
370
+
371
+ def get_contituent_terminal_words(self, chunk):
372
+ """Return all the terms of a terminal constituent."""
373
+ return chunk.findall("{0}/{1}".format(SPAN_TAG, TARGET_TAG))
374
+
375
+ def get_entities(self):
376
+ """Return all the Named Entities in the document"""
377
+ return self.root.findall("{0}/{1}".format(NAMED_ENTITIES_LAYER_TAG, NAMED_ENTITY_OCCURRENCE_TAG))
378
+
379
+ def get_entity_references(self, named_entity):
380
+ """Return all the terms of a Named Entities in the document"""
381
+ return named_entity.findall("{0}/{1}".format(NAMED_ENTITY_REFERENCES_GROUP_TAG, SPAN_TAG))
382
+
383
+ def get_entity_reference_span(self, reference):
384
+ """Return all the terms of a Named Entities in the document"""
385
+ return reference.findall(TARGET_TAG)
386
+
387
+ def add_coreference(self, coid, references=()):
388
+ """ Add a coreference cluster to the document.
389
+ :param coid: The identification code of the cluster.
390
+ :param references: The references contained in the cluster
391
+ """
392
+ if self.coreferences is None:
393
+ self.coreferences = etree.SubElement(self.root, COREFERENCE_LAYER_TAG)
394
+
395
+ coref_attrib = {COREFERENCE_ID_ATTRIBUTE: coid}
396
+ entity = etree.SubElement(self.coreferences, COREFERENCE_OCCURRENCE_TAG, coref_attrib)
397
+
398
+ if references:
399
+ for reference, form in references:
400
+ comment = etree.Comment(form.decode("utf-8").replace("-", " - "))
401
+ entity.append(comment)
402
+ span = etree.SubElement(entity, SPAN_TAG)
403
+ for token in reference:
404
+ etree.SubElement(span, TARGET_TAG, {TARGET_ID_ATTRIBUTE: token})
405
+ return entity
406
+
407
+ def indent(self, elem, level=0):
408
+ i = "\n" + level * " "
409
+ if len(elem):
410
+ if not elem.text or not elem.text.strip():
411
+ elem.text = i + " "
412
+ if not elem.tail or not elem.tail.strip():
413
+ elem.tail = i
414
+ for child in elem:
415
+ self.indent(child, level+1)
416
+ # This seeks for the las child processed in for, is not a code identation error
417
+ if not child.tail or not child.tail.strip():
418
+ child.tail = i
419
+ else:
420
+ if level and (not elem.tail or not elem.tail.strip()):
421
+ elem.tail = i
422
+
423
+ def write(self, output, encoding):
424
+ """Write document into a file.
425
+ :param output: The output target for the document. May be a file type object or a file name."""
426
+ self.indent(self.root)
427
+ output.write(etree.tostring(self.root, encoding=encoding,))#, pretty_print=True, xml_declaration=True, with_comments=True))
@@ -1,7 +1,7 @@
1
1
  module Opener
2
2
  module Coreferences
3
3
  class Base
4
- VERSION = '2.0.1'
4
+ VERSION = '2.0.2'
5
5
  end # Base
6
6
  end # Coreferences
7
7
  end # Opener
@@ -14,7 +14,7 @@ Gem::Specification.new do |gem|
14
14
  gem.files = Dir.glob([
15
15
  'core/corefgraph/**/*',
16
16
  'core/site-packages/pre_build/**/*',
17
- 'vendor/**/*',
17
+ 'core/vendor/**/*',
18
18
  'ext/**/*',
19
19
  'lib/**/*',
20
20
  '*.gemspec',
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: opener-coreference-base
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.0.1
4
+ version: 2.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - development@olery.com
@@ -276,6 +276,10 @@ files:
276
276
  - core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.py
277
277
  - core/site-packages/pre_build/VUKafParserPy/KafParserMod.py
278
278
  - core/site-packages/pre_build/VUKafParserPy/__init__.py
279
+ - core/vendor/dictconfig/__init__.py
280
+ - core/vendor/dictconfig/dictconfig.py
281
+ - core/vendor/pykaf/LICENSE.txt
282
+ - core/vendor/pykaf/__init__.py
279
283
  - ext/hack/Rakefile
280
284
  - ext/hack/support.rb
281
285
  - lib/opener/coreferences/base.rb