dataknobs-xization 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dataknobs-xization might be problematic. Click here for more details.
- dataknobs_xization/0.readme.txt +66 -0
- dataknobs_xization/__init__.py +16 -0
- dataknobs_xization/annotations.py +1308 -0
- dataknobs_xization/authorities.py +766 -0
- dataknobs_xization/lexicon.py +596 -0
- dataknobs_xization/masking_tokenizer.py +697 -0
- dataknobs_xization/normalize.py +448 -0
- dataknobs_xization-1.0.0.dist-info/METADATA +58 -0
- dataknobs_xization-1.0.0.dist-info/RECORD +10 -0
- dataknobs_xization-1.0.0.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,766 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from abc import ABC, abstractmethod
|
|
3
|
+
from collections.abc import Callable
|
|
4
|
+
from typing import Any, Dict, List, Set, Union
|
|
5
|
+
|
|
6
|
+
import pandas as pd
|
|
7
|
+
|
|
8
|
+
import dataknobs_xization.annotations as dk_annots
|
|
9
|
+
|
|
10
|
+
# Key annotation column name constants
# "auth_id" is the default column name for the authority value ID and the key
# under which AuthorityAnnotationsMetaData looks that column name up.
|
|
11
|
+
KEY_AUTH_ID_COL = "auth_id"
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class DerivedFieldGroups(dk_annots.DerivedAnnotationColumns):
    """Defines derived column types:
    * "field_type" -- The column holding the type of field of an annotation row
    * "field_group" -- The column holding the group number(s) of the field
    * "field_record" -- The column holding record number(s) of the field
    """

    def __init__(
        self,
        field_type_suffix: str = "_field",
        field_group_suffix: str = "_num",
        field_record_suffix: str = "_recsnum",
    ):
        """Add derived column types/names: Given an annotation row,
        * field_type(row) == f'{row[ann_type_col]}_field'
        * field_group(row) == f'{row[ann_type_col]}_num'
        * field_record(row) == f'{row[ann_type_col]}_recsnum'

        Where:
        * A field_type column holds annotation "sub"- type values, or fields
        * A field_group column identifies groups of annotation fields
        * A field_record column identifies groups of annotation field groups

        :param field_type_suffix: The field_type col name suffix (if not _field)
        :param field_group_suffix: The field_group col name suffix (if not _num)
        :param field_record_suffix: field_record colname sfx (if not _recsnum)
        """
        self.field_type_suffix = field_type_suffix
        self.field_group_suffix = field_group_suffix
        self.field_record_suffix = field_record_suffix

    def get_col_value(
        self,
        metadata: dk_annots.AnnotationsMetaData,
        col_type: str,
        row: pd.Series,
        missing: str = None,
    ) -> str:
        """Get the value of the column in the given row derived from col_type,
        where col_type is one of:
        * "field_type" == f"{field}_field"
        * "field_group" == f"{field}_num"
        * "field_record" == f"{field}_recsnum"

        And "field" is the row's value in the metadata's "ann_type" column.

        :param metadata: The AnnotationsMetaData
        :param col_type: The type of column value to derive
        :param row: A row from which to get the value.
        :param missing: The value to return for unknown or missing column
        :return: The row value or the missing value
        """
        if metadata.ann_type_col not in row.index:
            return missing
        field = row[metadata.ann_type_col]
        if field is None:
            return missing

        # An unrecognized col_type has no derived column, so it falls through
        # to the "missing" value instead of touching an unbound name.
        col_name_getters = {
            "field_type": self.get_field_type_col,
            "field_group": self.get_field_group_col,
            "field_record": self.get_field_record_col,
        }
        get_col_name = col_name_getters.get(col_type)
        if get_col_name is None:
            return missing

        col_name = get_col_name(field)
        if col_name is not None and col_name in row.index:
            return row[col_name]
        return missing

    def unpack_field(self, field_value: str) -> str:
        """Given a field in any of its derivatives (like field type, field group
        or field record,) unpack and return the basic field value itself.

        Only the first matching trailing suffix is removed (checked in the
        order record, group, type); occurrences of the suffix text elsewhere
        in the value are left intact.

        :param field_value: A field name or a derived column name
        :return: The field name without its derived-column suffix
        """
        suffixes = (
            self.field_record_suffix,
            self.field_group_suffix,
            self.field_type_suffix,
        )
        for suffix in suffixes:
            if field_value.endswith(suffix):
                # Computing the cut point from both lengths keeps an empty
                # suffix a no-op (a plain [:-0] slice would return "").
                return field_value[: len(field_value) - len(suffix)]
        return field_value

    def get_field_name(self, field_value: str) -> str:
        """Given a field name or field col name, e.g., an annotation type col's
        value (the field name); or a field type, group, or record column name,
        get the field name.
        """
        return self.unpack_field(field_value)

    def get_field_type_col(self, field_value: str) -> str:
        """Given a field name or field col name, e.g., an annotation type col's
        value; or a field type, group, or record, get the name of the derived
        field type column.
        """
        field = self.unpack_field(field_value)
        return f"{field}{self.field_type_suffix}"

    def get_field_group_col(self, field_value: str) -> str:
        """Given a field name or field col name, e.g., an annotation type col's
        value; or a field type, group, or record, get the name of the derived
        field group column.
        """
        field = self.unpack_field(field_value)
        return f"{field}{self.field_group_suffix}"

    def get_field_record_col(self, field_value: str) -> str:
        """Given a field name or field col name, e.g., an annotation type col's
        value; or a field type, group, or record, get the name of the derived
        field record column.
        """
        field = self.unpack_field(field_value)
        return f"{field}{self.field_record_suffix}"
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
class AuthorityAnnotationsMetaData(dk_annots.AnnotationsMetaData):
    """AnnotationsMetaData extended with an 'auth_id_col' among the standard
    (key) annotation columns (attributes).
    """

    def __init__(
        self,
        start_pos_col: str = dk_annots.KEY_START_POS_COL,
        end_pos_col: str = dk_annots.KEY_END_POS_COL,
        text_col: str = dk_annots.KEY_TEXT_COL,
        ann_type_col: str = dk_annots.KEY_ANN_TYPE_COL,
        auth_id_col: str = KEY_AUTH_ID_COL,
        sort_fields: List[str] = (dk_annots.KEY_START_POS_COL, dk_annots.KEY_END_POS_COL),
        sort_fields_ascending: List[bool] = (True, False),
        **kwargs,
    ):
        """Set up the key (and any additional) column names.

        Key column types:
        * start_pos
        * end_pos
        * text
        * ann_type
        * auth_id

        Notes:
        * Actual table columns can be named arbitrarily
        * BUT: interactions through annotations classes and interfaces
        relating to the "key" columns must use the key column constants

        :param start_pos_col: Col name for the token starting position
        :param end_pos_col: Col name for the token ending position
        :param text_col: Col name for the token text
        :param ann_type_col: Col name for the annotation types
        :param auth_id_col: Col name for the authority value ID; forwarded to
            the base class as the extra "auth_id" column type
        :param sort_fields: The col types relevant for sorting annotation rows
        :param sort_fields_ascending: To specify sort order of sort_fields
        :param **kwargs: More column types mapped to column names
        """
        # "auth_id" is handed over ahead of the caller's extra column types,
        # exactly as a leading keyword followed by **kwargs would be.
        super().__init__(
            start_pos_col=start_pos_col,
            end_pos_col=end_pos_col,
            text_col=text_col,
            ann_type_col=ann_type_col,
            sort_fields=sort_fields,
            sort_fields_ascending=sort_fields_ascending,
            auth_id=auth_id_col,
            **kwargs,
        )

    @property
    def auth_id_col(self) -> str:
        """The name of the column holding the auth_id"""
        col_name = self.data[KEY_AUTH_ID_COL]
        return col_name
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
class AuthorityAnnotationsBuilder(dk_annots.AnnotationsBuilder):
    """An AnnotationsBuilder whose rows also carry the 'auth_id' column."""

    def __init__(
        self,
        metadata: AuthorityAnnotationsMetaData = None,
        data_defaults: Dict[str, Any] = None,
    ):
        """:param metadata: The authority annotations metadata; a default
            AuthorityAnnotationsMetaData is created when None
        :param data_defaults: Dict[ann_colname, default_value] with default
            values for annotation columns
        """
        if metadata is None:
            metadata = AuthorityAnnotationsMetaData()
        super().__init__(metadata, data_defaults)

    def build_annotation_row(
        self, start_pos: int, end_pos: int, text: str, ann_type: str, auth_id: str, **kwargs
    ) -> Dict[str, Any]:
        """Build an annotation row from the mandatory key values plus the
        remaining keyword arguments.

        The key values are keyed by the metadata's column names and handed,
        together with the keyword arguments, to do_build_row.

        :param start_pos: The token start position
        :param end_pos: The token end position
        :param text: The token text
        :param ann_type: The annotation type
        :param auth_id: The authority ID for the row
        :return: The built row dictionary
        """
        key_values = {
            self.metadata.start_pos_col: start_pos,
            self.metadata.end_pos_col: end_pos,
            self.metadata.text_col: text,
            self.metadata.ann_type_col: ann_type,
            self.metadata.auth_id_col: auth_id,
        }
        return self.do_build_row(key_values, **kwargs)
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
class AuthorityData:
    """Thin wrapper pairing an authority's dataframe with its name."""

    def __init__(self, df: pd.DataFrame, name: str):
        """:param df: The dataframe holding the authority data
        :param name: The authority name, also used as the value column name
            by lookup_values
        """
        self.name = name
        self._df = df

    @property
    def df(self) -> pd.DataFrame:
        """The dataframe holding the authority data"""
        return self._df

    def lookup_values(self, value: Any, is_id=False) -> pd.DataFrame:
        """Select the authority rows matching a value or a value id.
        :param value: A value or value_id for this authority
        :param is_id: True to compare against the dataframe index; otherwise
            the column named after this authority is compared
        :return: The applicable authority dataframe rows.
        """
        frame = self.df
        if is_id:
            candidates = frame.index
        else:
            candidates = frame[self.name]
        matches = candidates == value
        return frame[matches]
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
class Authority(dk_annots.Annotator):
    """A class for managing and defining tabular authoritative data for e.g.,
    taxonomies, etc., and using them to annotate instances within text.
    """

    def __init__(
        self,
        name: str,
        auth_anns_builder: AuthorityAnnotationsBuilder = None,
        authdata: AuthorityData = None,
        field_groups: DerivedFieldGroups = None,
        anns_validator: Callable[["Authority", List[Dict[str, Any]]], bool] = None,
        parent_auth: "Authority" = None,
    ):
        """Initialize with this authority's metadata.
        :param name: This authority's entity name
        :param auth_anns_builder: The authority annotations row builder to use
            for building annotation rows (a default builder when None).
        :param authdata: The authority data
        :param field_groups: The derived field groups to use (default
            DerivedFieldGroups when None)
        :param anns_validator: fn(auth, anns_dict_list) that returns True if
            the list of annotation row dicts are valid to be added as
            annotations for a single match or "entity".
        :param parent_auth: This authority's parent authority (if any)
        """
        super().__init__(name)
        self.anns_builder = (
            auth_anns_builder if auth_anns_builder is not None else AuthorityAnnotationsBuilder()
        )
        self.authdata = authdata
        self.field_groups = field_groups if field_groups is not None else DerivedFieldGroups()
        self.anns_validator = anns_validator
        self._parent = parent_auth

    @property
    def metadata(self) -> AuthorityAnnotationsMetaData:
        """Get the meta-data (taken from the annotations builder)"""
        return self.anns_builder.metadata

    @property
    def parent(self) -> "Authority":
        """Get this authority's parent, or None."""
        return self._parent

    @abstractmethod
    def has_value(self, value: Any) -> bool:
        """Determine whether the given value is in this authority.
        :param value: A possible authority value.
        :return: True if the value is a valid entity value.
        """
        raise NotImplementedError

    def annotate_input(
        self,
        text_obj: Union[dk_annots.AnnotatedText, str],
        **kwargs,
    ) -> dk_annots.Annotations:
        """Find and annotate this authority's entities in the document text
        as dictionaries like:
        [
            {
                'input_id': <id>,
                'start_pos': <start_char_pos>,
                'end_pos': <end_char_pos>,
                'entity_text': <entity_text>,
                'ann_type': <authority_name>,
                '<auth_id>': <auth_value_id_or_canonical_form>,
                'confidence': <confidence_if_available>,
            },
        ]
        :param text_obj: The text object or string to process. A string is
            wrapped in an AnnotatedText using this authority's metadata.
        :param **kwargs: Accepted for interface compatibility; not used here.
        :return: An Annotations instance, or None when there is nothing to
            annotate (text_obj is None or a blank string)
        """
        if text_obj is None:
            return None
        if isinstance(text_obj, str):
            if not text_obj.strip():
                # A blank string has no text to annotate; passing the raw str
                # on would break add_annotations, which needs AnnotatedText.
                return None
            text_obj = dk_annots.AnnotatedText(
                text_obj,
                annots_metadata=self.metadata,
            )
        return self.add_annotations(text_obj)

    @abstractmethod
    def add_annotations(
        self,
        text_obj: dk_annots.AnnotatedText,
    ) -> dk_annots.Annotations:
        """Method to do the work of finding, validating, and adding annotations.
        :param text_obj: The annotated text object to process and add annotations.
        :return: The added Annotations
        """
        raise NotImplementedError

    def validate_ann_dicts(self, ann_dicts: List[Dict[str, Any]]) -> bool:
        """The annotation row dictionaries are valid if:
        * They are non-empty
        * and
            * either there is no annotations validator
            * or they are valid according to the validator
        :param ann_dicts: Annotation dictionaries
        :return: True if valid
        """
        return len(ann_dicts) > 0 and (
            self.anns_validator is None or self.anns_validator(self, ann_dicts)
        )

    def compose(
        self,
        annotations: dk_annots.Annotations,
    ) -> dk_annots.Annotations:
        """Compose annotations into groups.

        This base implementation returns the annotations unchanged.

        :param annotations: The annotations
        :return: composed annotations
        """
        return annotations

    def build_annotation(
        self,
        start_pos: int = None,
        end_pos: int = None,
        entity_text: str = None,
        auth_value_id: Any = None,
        conf: float = 1.0,
        **kwargs,
    ) -> Dict[str, Any]:
        """Build an annotation row dict with the given components.

        The annotation type is this authority's name and the confidence is
        passed to the builder as the "auth_valconf" keyword.

        :param start_pos: The entity start position
        :param end_pos: The entity end position
        :param entity_text: The entity text
        :param auth_value_id: The authority value ID for the entity
        :param conf: The confidence value for the annotation
        :param **kwargs: Additional values forwarded to the row builder
        :return: The annotation row dictionary
        """
        return self.anns_builder.build_annotation_row(
            start_pos, end_pos, entity_text, self.name, auth_value_id, auth_valconf=conf, **kwargs
        )
|
|
378
|
+
|
|
379
|
+
|
|
380
|
+
class AnnotationsValidator(ABC):
    """Base class providing helpers for validating proposed annotation rows."""

    def __call__(
        self,
        auth: Authority,
        ann_row_dicts: List[Dict[str, Any]],
    ) -> bool:
        """Make instances usable as the anns_validator function of an
        Authority by wrapping the arguments and delegating to
        validate_annotation_rows.
        :param auth: The authority proposing annotations
        :param ann_row_dicts: The proposed annotations
        :return: True if the annotations are valid; otherwise, False
        """
        proposed = AnnotationsValidator.AuthAnnotations(auth, ann_row_dicts)
        return self.validate_annotation_rows(proposed)

    @abstractmethod
    def validate_annotation_rows(
        self,
        auth_annotations: "AnnotationsValidator.AuthAnnotations",
    ) -> bool:
        """Decide whether the proposed authority annotation rows are valid.
        :param auth_annotations: The AuthAnnotations instance with the
            proposed data.
        :return: True if valid; False if not.
        """
        raise NotImplementedError

    class AuthAnnotations:
        """Convenience wrapper around an authority and its proposed
        annotation row dicts; derived views are built lazily and cached.
        """

        def __init__(self, auth: Authority, ann_row_dicts: List[Dict[str, Any]]):
            self.auth = auth
            self.ann_row_dicts = ann_row_dicts
            # Lazily-built caches
            self._row_accessor = None  # AnnotationsRowAccessor
            self._anns = None  # Annotations
            self._atts = None  # Dict[str, str]

        @property
        def row_accessor(self) -> dk_annots.AnnotationsRowAccessor:
            """The (cached) row accessor for this instance's annotations."""
            if self._row_accessor is not None:
                return self._row_accessor
            self._row_accessor = dk_annots.AnnotationsRowAccessor(
                self.auth.metadata, derived_cols=self.auth.field_groups
            )
            return self._row_accessor

        @property
        def anns(self) -> dk_annots.Annotations:
            """This instance's annotation rows as a (cached) Annotations object"""
            if self._anns is not None:
                return self._anns
            self._anns = collected = dk_annots.Annotations(self.auth.metadata)
            for row_dict in self.ann_row_dicts:
                collected.add_dict(row_dict)
            return self._anns

        @property
        def df(self) -> pd.DataFrame:
            """The annotations' dataframe"""
            return self.anns.df

        def get_field_type(self, row: pd.Series) -> str:
            """Get the "field_type" column value for the row via the row
            accessor (None when unavailable)
            """
            return self.row_accessor.get_col_value("field_type", row, None)

        def get_text(self, row: pd.Series) -> str:
            """Get the row's value for the metadata's text column via the
            row accessor (None when unavailable)
            """
            text_col = self.auth.metadata.text_col
            return self.row_accessor.get_col_value(text_col, row, None)

        @property
        def attributes(self) -> Dict[str, str]:
            """This instance's (cached) entity attributes: a mapping of each
            row's field type to its text; later rows overwrite earlier ones
            with the same field type.
            """
            if self._atts is None:
                atts = {}
                for _, row in self.df.iterrows():
                    field_type = self.get_field_type(row)
                    text = self.get_text(row)
                    atts[field_type] = text
                self._atts = atts
            return self._atts

        def colval(self, col_name, row) -> Any:
            """Get the named column's value from the given row via the row
            accessor
            """
            return self.row_accessor.get_col_value(col_name, row)
|
|
465
|
+
|
|
466
|
+
|
|
467
|
+
class AuthorityFactory(ABC):
    """Abstract factory interface for building Authority instances."""

    @abstractmethod
    def build_authority(
        self,
        name: str,
        auth_anns_builder: AuthorityAnnotationsBuilder,
        authdata: AuthorityData,
        parent_auth: Authority = None,
    ) -> Authority:
        """Build an authority from a name and its data.

        Concrete factories must override this method; this base version
        always raises.

        :param name: The authority name
        :param auth_anns_builder: The authority annotations row builder to use
            for building annotation rows.
        :param authdata: The authority data
        :param parent_auth: The parent authority.
        :return: The authority
        :raises NotImplementedError: Always, in this base implementation
        """
        raise NotImplementedError
|
|
487
|
+
|
|
488
|
+
|
|
489
|
+
class LexicalAuthority(Authority):
    """Abstract authority for named entities managed by ID, each with
    associated values and variations.
    """

    def __init__(
        self,
        name: str,
        auth_anns_builder: AuthorityAnnotationsBuilder = None,
        authdata: AuthorityData = None,
        field_groups: DerivedFieldGroups = None,
        anns_validator: Callable[["Authority", Dict[str, Any]], bool] = None,
        parent_auth: "Authority" = None,
    ):
        """Initialize by forwarding every argument to Authority.
        :param name: This authority's entity name
        :param auth_anns_builder: The authority annotations row builder to use
            for building annotation rows.
        :param authdata: The authority data
        :param field_groups: The derived field groups to use
        :param anns_validator: fn(auth, anns_dict_list) that returns True if
            the list of annotation row dicts are valid to be added as
            annotations for a single match or "entity".
        :param parent_auth: This authority's parent authority (if any)
        """
        base_options = dict(
            auth_anns_builder=auth_anns_builder,
            authdata=authdata,
            field_groups=field_groups,
            anns_validator=anns_validator,
            parent_auth=parent_auth,
        )
        super().__init__(name, **base_options)

    @abstractmethod
    def get_value_ids(self, value: Any) -> Set[Any]:
        """Get all IDs associated with the given value. Typically a value
        has a single ID, but a set allows for inherent ambiguities in the
        authority.
        :param value: An authority value
        :return: The associated IDs or an empty set if the value is not valid.
        """
        raise NotImplementedError

    @abstractmethod
    def get_values_by_id(self, value_id: Any) -> Set[Any]:
        """Get all values for the given value ID. Typically an ID has a
        single value, but a set allows for inherent ambiguities in the
        authority.

        :param value_id: An authority value ID
        :return: The associated values or an empty set if the ID is not valid.
        """
        raise NotImplementedError

    @abstractmethod
    def get_id_by_variation(self, variation: str) -> Set[str]:
        """Get the IDs of the value(s) associated with the given variation.
        :param variation: Variation text
        :return: The possibly empty set of associated value IDs.
        """
        raise NotImplementedError

    @abstractmethod
    def find_variations(
        self,
        variation: str,
        starts_with: bool = False,
        ends_with: bool = False,
        scope: str = "fullmatch",
    ) -> pd.Series:
        """Find all matches to the given variation.
        :param variation: The text to find; treated as a regular expression
            unless either starts_with or ends_with is True.
        :param starts_with: When True, find all terms that start with the
            variation text.
        :param ends_with: When True, find all terms that end with the variation
            text.
        :param scope: 'fullmatch' (default), 'match', or 'contains' for
            strict, less strict, and least strict matching
        :return: The matching variations as a pd.Series

        Note only the first true of starts_with, ends_with, and scope will
        be applied. If none of these are true, a full match on the pattern
        is performed.
        """
        raise NotImplementedError
|
|
578
|
+
|
|
579
|
+
|
|
580
|
+
class RegexAuthority(Authority):
    """An authority whose entities are found by applying a regular
    expression to the text.
    """

    def __init__(
        self,
        name: str,
        regex: re.Pattern,
        canonical_fn: Callable[[str, str], Any] = None,
        auth_anns_builder: AuthorityAnnotationsBuilder = None,
        authdata: AuthorityData = None,
        field_groups: DerivedFieldGroups = None,
        anns_validator: Callable[[Authority, Dict[str, Any]], bool] = None,
        parent_auth: "Authority" = None,
    ):
        """Initialize with this authority's entity name.
        :param name: The authority name
        :param regex: The (compiled) regular expression to apply
        :param canonical_fn: A function, fn(match_text, entity_type), to
            transform input matches to a canonical form as a value_id.
            entity_type is the group name for named groups, the group
            number for unnamed groups, or the authority name (with the full
            match text) when no group participated. Note that the canonical
            form is computed before the anns_validator is applied and its
            value will be found as the value to the <auth_id> key.
        :param auth_anns_builder: The authority annotations row builder to use
            for building annotation rows.
        :param authdata: The authority data
        :param field_groups: The derived field groups to use
        :param anns_validator: A validation function for each regex match
            formed as a list of annotation row dictionaries, one row dictionary
            for each participating regex group. If the validator returns
            False, then the annotation rows will be rejected. For matches
            with groups, the <auth_name>_field key will hold the group name
            or number. Note that the validator function takes the regex
            authority instance as its first parameter to provide access to
            the field_groups, etc. The validation_fn signature is:
            fn(regexAuthority, ann_row_dicts) and returns a boolean.
        :param parent_auth: This authority's parent authority (if any)

        NOTE: If the regular expression has capturing groups, each group
        that takes part in a match will result in a separate entity, with
        the group name if provided in the regular expression as
        ...(?P<group_name>group_regex)...
        When the expression has any named groups, only the named groups
        produce entities.
        """
        super().__init__(
            name,
            auth_anns_builder=auth_anns_builder,
            authdata=authdata,
            field_groups=field_groups,
            anns_validator=anns_validator,
            parent_auth=parent_auth,
        )
        self.regex = regex
        self.canonical_fn = canonical_fn

    def has_value(self, value: Any) -> re.Match:
        """Determine whether the given value is in this authority.
        :param value: A possible authority value (converted with str()).
        :return: None if the regex does not match at the start of the value;
            otherwise, return the re.Match object.
        """
        return self.regex.match(str(value))

    def add_annotations(
        self,
        text_obj: dk_annots.AnnotatedText,
    ) -> dk_annots.Annotations:
        """Method to do the work of finding, validating, and adding annotations.

        Each regex match yields a list of annotation row dicts (one per
        participating group, or a single row for the whole match when no
        group participated); lists that pass validate_ann_dicts are added to
        the text object's annotations.

        :param text_obj: The annotated text object to process and add annotations.
        :return: The text object's Annotations
        """
        field_type_col = self.field_groups.get_field_type_col(self.name)
        named_groups = self.regex.groupindex
        for match in self.regex.finditer(text_obj.text):
            if match.lastindex is None:
                # No group took part in this match: annotate the full match.
                ann_dicts = [
                    self.build_annotation(
                        start_pos=match.start(),
                        end_pos=match.end(),
                        entity_text=match.group(),
                        auth_value_id=self.get_canonical_form(match.group(), self.name),
                    )
                ]
            else:
                if len(named_groups) > 0:  # we have named groups
                    labeled_groups = list(named_groups.items())
                else:  # we have only numbers for groups
                    labeled_groups = [
                        (group_num, group_num) for group_num in range(1, self.regex.groups + 1)
                    ]
                ann_dicts = self._build_group_annotations(match, labeled_groups, field_type_col)
            if self.validate_ann_dicts(ann_dicts):
                # Add non-empty, valid annotation dicts to the result
                text_obj.annotations.add_dicts(ann_dicts)
        return text_obj.annotations

    def _build_group_annotations(
        self,
        match: re.Match,
        labeled_groups: List[Any],
        field_type_col: str,
    ) -> List[Dict[str, Any]]:
        """Build one annotation row per group that took part in the match.
        :param match: The regex match
        :param labeled_groups: (label, group_number) pairs, where label is
            the group's name or its number
        :param field_type_col: The column in which to record the label
        :return: The list of annotation row dictionaries
        """
        ann_dicts = []
        for label, group_num in labeled_groups:
            group_text = match.group(group_num)
            if group_text is None:
                # Optional/alternative group that did not participate: it has
                # no text and its span would be reported as (-1, -1).
                continue
            ann_dicts.append(
                self.build_annotation(
                    start_pos=match.start(group_num),
                    end_pos=match.end(group_num),
                    entity_text=group_text,
                    auth_value_id=self.get_canonical_form(group_text, label),
                    **{field_type_col: label},
                )
            )
        return ann_dicts

    def get_canonical_form(self, entity_text: str, entity_type: str) -> Any:
        """Get the canonical form (value ID) for the matched text.
        :param entity_text: The matched text
        :param entity_type: The group name, group number, or authority name
            identifying what was matched
        :return: canonical_fn(entity_text, entity_type) when a canonical_fn
            was given; otherwise the entity_text unchanged
        """
        if self.canonical_fn is not None:
            entity_text = self.canonical_fn(entity_text, entity_type)
        return entity_text
|
|
702
|
+
|
|
703
|
+
|
|
704
|
+
class AuthoritiesBundle(Authority):
    """An authority that expresses values through several bundled
    "authorities", e.g. dictionary-based and/or multiple regular expression
    patterns.
    """

    def __init__(
        self,
        name: str,
        auth_anns_builder: AuthorityAnnotationsBuilder = None,
        authdata: AuthorityData = None,
        field_groups: DerivedFieldGroups = None,
        parent_auth: "Authority" = None,
        anns_validator: Callable[["Authority", Dict[str, Any]], bool] = None,
        auths: List[Authority] = None,
    ):
        """:param name: This authority's entity name
        :param auth_anns_builder: The authority annotations row builder to use
            for building annotation rows.
        :param authdata: The authority data
        :param field_groups: The derived field groups to use
        :param parent_auth: This authority's parent authority (if any)
        :param anns_validator: fn(auth, anns_dict_list) that returns True if
            the list of annotation row dicts are valid to be added as
            annotations for a single match or "entity".
        :param auths: The authorities to bundle together (copied, so later
            additions do not modify the caller's list).
        """
        super().__init__(
            name,
            auth_anns_builder=auth_anns_builder,
            authdata=authdata,
            field_groups=field_groups,
            anns_validator=anns_validator,
            parent_auth=parent_auth,
        )
        if auths is None:
            self.auths = list()
        else:
            self.auths = auths.copy()

    def add(self, auth: Authority):
        """Append an authority to this bundle
        :param auth: The authority to add.
        """
        self.auths.append(auth)

    def has_value(self, value: Any) -> bool:
        """Determine whether any bundled authority has the given value.
        :param value: A possible authority value.
        :return: True if the value is a valid entity value.
        """
        return any(auth.has_value(value) for auth in self.auths)

    def add_annotations(
        self,
        text_obj: dk_annots.AnnotatedText,
    ) -> dk_annots.Annotations:
        """Let each bundled authority, in order, annotate the text object.
        :param text_obj: The annotated text object to process and add annotations.
        :return: The text object's Annotations
        """
        for bundled_auth in self.auths:
            bundled_auth.annotate_input(text_obj)
        return text_obj.annotations
|