dataknobs-xization 1.2.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataknobs_xization/0.readme.txt +66 -0
- dataknobs_xization/__init__.py +110 -0
- dataknobs_xization/annotations.py +1476 -0
- dataknobs_xization/authorities.py +860 -0
- dataknobs_xization/content_transformer.py +570 -0
- dataknobs_xization/ingestion/__init__.py +27 -0
- dataknobs_xization/ingestion/config.py +352 -0
- dataknobs_xization/ingestion/processor.py +367 -0
- dataknobs_xization/json/__init__.py +17 -0
- dataknobs_xization/json/json_chunker.py +591 -0
- dataknobs_xization/lexicon.py +723 -0
- dataknobs_xization/markdown/__init__.py +72 -0
- dataknobs_xization/markdown/enrichment.py +260 -0
- dataknobs_xization/markdown/filters.py +236 -0
- dataknobs_xization/markdown/md_chunker.py +478 -0
- dataknobs_xization/markdown/md_parser.py +605 -0
- dataknobs_xization/markdown/md_streaming.py +302 -0
- dataknobs_xization/masking_tokenizer.py +768 -0
- dataknobs_xization/normalize.py +520 -0
- dataknobs_xization/py.typed +0 -0
- dataknobs_xization-1.2.3.dist-info/METADATA +170 -0
- dataknobs_xization-1.2.3.dist-info/RECORD +23 -0
- dataknobs_xization-1.2.3.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,860 @@
|
|
|
1
|
+
"""Authority-based annotation processing and field grouping.
|
|
2
|
+
|
|
3
|
+
Provides classes for managing authority-based annotations, field groups,
|
|
4
|
+
and derived annotation columns for structured text extraction.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import re
|
|
8
|
+
from abc import ABC, abstractmethod
|
|
9
|
+
from collections.abc import Callable
|
|
10
|
+
from typing import Any, Dict, List, Set, Union
|
|
11
|
+
|
|
12
|
+
import pandas as pd
|
|
13
|
+
|
|
14
|
+
import dataknobs_xization.annotations as dk_annots
|
|
15
|
+
|
|
16
|
+
# Key annotation column name constants
|
|
17
|
+
KEY_AUTH_ID_COL = "auth_id"
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class DerivedFieldGroups(dk_annots.DerivedAnnotationColumns):
    """Derived annotation columns for field types, groups, and records.

    Defines derived column types:
      * "field_type" -- The column holding the type of field of an annotation row
      * "field_group" -- The column holding the group number(s) of the field
      * "field_record" -- The column holding record number(s) of the field
    """

    def __init__(
        self,
        field_type_suffix: str = "_field",
        field_group_suffix: str = "_num",
        field_record_suffix: str = "_recsnum",
    ):
        """Add derived column types/names: Given an annotation row,
          * field_type(row) == f'{row[ann_type_col]}_field'
          * field_group(row) == f'{row[ann_type_col]}_num'
          * field_record(row) == f'{row[ann_type_col]}_recsnum'

        Where:
          * A field_type column holds annotation "sub"- type values, or fields
          * A field_group column identifies groups of annotation fields
          * A field_record column identifies groups of annotation field groups

        Args:
            field_type_suffix: The field_type col name suffix (if not _field).
            field_group_suffix: The field_group col name suffix (if not _num).
            field_record_suffix: field_record colname sfx (if not _recsnum).
        """
        self.field_type_suffix = field_type_suffix
        self.field_group_suffix = field_group_suffix
        self.field_record_suffix = field_record_suffix

    def get_col_value(
        self,
        metadata: dk_annots.AnnotationsMetaData,
        col_type: str,
        row: pd.Series,
        missing: str = None,
    ) -> str:
        """Get the value of the column in the given row derived from col_type,
        where col_type is one of:
          * "field_type" == f"{field}_field"
          * "field_group" == f"{field}_num"
          * "field_record" == f"{field}_recsnum"

        And "field" is the value of the metadata's "ann_type" column in the row.

        Args:
            metadata: The AnnotationsMetaData.
            col_type: The type of column value to derive.
            row: A row from which to get the value.
            missing: The value to return for unknown or missing column.

        Returns:
            The row value or the missing value.
        """
        if metadata.ann_type_col not in row.index:
            return missing
        field = row[metadata.ann_type_col]
        if field is None:
            return missing

        # Dispatch on the derived column type. An unrecognized col_type yields
        # the "missing" value instead of failing on an unbound column name.
        col_name_getters = {
            "field_type": self.get_field_type_col,
            "field_group": self.get_field_group_col,
            "field_record": self.get_field_record_col,
        }
        getter = col_name_getters.get(col_type)
        if getter is None:
            return missing

        col_name = getter(field)
        if col_name is not None and col_name in row.index:
            return row[col_name]
        return missing

    def unpack_field(self, field_value: str) -> str:
        """Given a field in any of its derivatives (like field type, field group,
        or field record), unpack and return the basic field value itself.

        Only a single trailing suffix is removed; occurrences of the suffix
        text elsewhere in the value are left intact. The suffixes are checked
        in the order record, group, type, and the first one that matches wins.

        Args:
            field_value: A field name or a derived field column name.

        Returns:
            The field name without its derived-column suffix.
        """
        suffixes = (
            self.field_record_suffix,
            self.field_group_suffix,
            self.field_type_suffix,
        )
        for suffix in suffixes:
            if field_value.endswith(suffix):
                # Slice from the front so that an empty suffix is a no-op.
                return field_value[: len(field_value) - len(suffix)]
        return field_value

    def get_field_name(self, field_value: str) -> str:
        """Given a field name or field col name, e.g., an annotation type col's
        value (the field name); or a field type, group, or record column name,
        get the field name.
        """
        return self.unpack_field(field_value)

    def get_field_type_col(self, field_value: str) -> str:
        """Given a field name or field col name, e.g., an annotation type col's
        value; or a field type, group, or record column name, get the name of
        the derived field type column.
        """
        field = self.unpack_field(field_value)
        return f"{field}{self.field_type_suffix}"

    def get_field_group_col(self, field_value: str) -> str:
        """Given a field name or field col name, e.g., an annotation type col's
        value; or a field type, group, or record column name, get the name of
        the derived field group column.
        """
        field = self.unpack_field(field_value)
        return f"{field}{self.field_group_suffix}"

    def get_field_record_col(self, field_value: str) -> str:
        """Given a field name or field col name, e.g., an annotation type col's
        value; or a field type, group, or record column name, get the name of
        the derived field record column.
        """
        field = self.unpack_field(field_value)
        return f"{field}{self.field_record_suffix}"
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
class AuthorityAnnotationsMetaData(dk_annots.AnnotationsMetaData):
    """Annotations metadata that also carries an 'auth_id' key column.

    Extends AnnotationsMetaData by adding 'auth_id_col' to the standard (key)
    annotation columns (attributes).
    """

    def __init__(
        self,
        start_pos_col: str = dk_annots.KEY_START_POS_COL,
        end_pos_col: str = dk_annots.KEY_END_POS_COL,
        text_col: str = dk_annots.KEY_TEXT_COL,
        ann_type_col: str = dk_annots.KEY_ANN_TYPE_COL,
        auth_id_col: str = KEY_AUTH_ID_COL,
        sort_fields: List[str] = (dk_annots.KEY_START_POS_COL, dk_annots.KEY_END_POS_COL),
        sort_fields_ascending: List[bool] = (True, False),
        **kwargs: Any,
    ):
        """Set up the key column names plus any additional column types.

        The key column types are:
          * start_pos
          * end_pos
          * text
          * ann_type
          * auth_id

        Note:
            The table's actual columns may carry any names, BUT code that
            works with the "key" columns through the annotations classes and
            interfaces must refer to them by the key column constants.

        Args:
            start_pos_col: Name of the column with the token start position.
            end_pos_col: Name of the column with the token end position.
            text_col: Name of the column with the token text.
            ann_type_col: Name of the column with the annotation types.
            auth_id_col: Name of the column with the authority value ID.
            sort_fields: Column types used to sort annotation rows.
            sort_fields_ascending: Sort direction for each of sort_fields.
            **kwargs: Additional column types mapped to their column names.
        """
        # The authority ID column is handed to the base class as one more
        # column type ("auth_id") alongside the caller's extra column types.
        super().__init__(
            start_pos_col=start_pos_col,
            end_pos_col=end_pos_col,
            text_col=text_col,
            ann_type_col=ann_type_col,
            sort_fields=sort_fields,
            sort_fields_ascending=sort_fields_ascending,
            auth_id=auth_id_col,
            **kwargs,
        )

    @property
    def auth_id_col(self) -> str:
        """The name of the column that holds the auth_id."""
        column_name = self.data[KEY_AUTH_ID_COL]
        return column_name
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
class AuthorityAnnotationsBuilder(dk_annots.AnnotationsBuilder):
    """An AnnotationsBuilder whose rows also include the 'auth_id' column."""

    def __init__(
        self,
        metadata: AuthorityAnnotationsMetaData = None,
        data_defaults: Dict[str, Any] = None,
    ):
        """Create the builder.

        Args:
            metadata: The authority annotations metadata; when None, a
                default AuthorityAnnotationsMetaData is created.
            data_defaults: Dict[ann_colname, default_value] giving default
                values for annotation columns.
        """
        if metadata is None:
            metadata = AuthorityAnnotationsMetaData()
        super().__init__(metadata, data_defaults)

    def build_annotation_row(
        self, start_pos: int, end_pos: int, text: str, ann_type: str, auth_id: str, **kwargs: Any
    ) -> Dict[str, Any]:
        """Build one annotation row from the mandatory key values and the
        remaining keyword arguments.

        Keyword arguments whose names match metadata column names override
        the data_defaults; the remaining data_default attributes are added.

        Args:
            start_pos: The token start position.
            end_pos: The token end position.
            text: The token text.
            ann_type: The annotation type.
            auth_id: The authority ID for the row.
            **kwargs: Additional keyword arguments.

        Returns:
            The result row dictionary.
        """
        columns = self.metadata
        # Key the mandatory values by their configured column names.
        key_values = {
            columns.start_pos_col: start_pos,
            columns.end_pos_col: end_pos,
            columns.text_col: text,
            columns.ann_type_col: ann_type,
            columns.auth_id_col: auth_id,
        }
        return self.do_build_row(key_values, **kwargs)
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
class AuthorityData:
    """Holds an authority's data as a named dataframe."""

    def __init__(self, df: pd.DataFrame, name: str):
        """Wrap the dataframe.

        Args:
            df: The authority data.
            name: The authority name; also the dataframe column that
                lookup_values searches when matching by value.
        """
        self.name = name
        self._df = df

    @property
    def df(self) -> pd.DataFrame:
        """The authority data as a dataframe."""
        return self._df

    def lookup_values(self, value: Any, is_id: bool = False) -> pd.DataFrame:
        """Find the authority row(s) for a value or for a value id.

        Args:
            value: A value or value_id for this authority.
            is_id: True to match value against the dataframe's index (IDs);
                otherwise it is matched against the column named self.name.

        Returns:
            The applicable authority dataframe rows.
        """
        frame = self.df
        if is_id:
            selector = frame.index == value
        else:
            selector = frame[self.name] == value
        return frame[selector]
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
class Authority(dk_annots.Annotator):
    """A class for managing and defining tabular authoritative data for e.g.,
    taxonomies, etc., and using them to annotate instances within text.
    """

    def __init__(
        self,
        name: str,
        auth_anns_builder: AuthorityAnnotationsBuilder = None,
        authdata: AuthorityData = None,
        field_groups: DerivedFieldGroups = None,
        anns_validator: Callable[["Authority", List[Dict[str, Any]]], bool] = None,
        parent_auth: "Authority" = None,
    ):
        """Initialize with this authority's metadata.

        Args:
            name: This authority's entity name.
            auth_anns_builder: The authority annotations row builder to use
                for building annotation rows. Defaults to a new
                AuthorityAnnotationsBuilder.
            authdata: The authority data.
            field_groups: The derived field groups to use. Defaults to a new
                DerivedFieldGroups.
            anns_validator: fn(auth, anns_dict_list) that returns True if
                the list of annotation row dicts are valid to be added as
                annotations for a single match or "entity".
            parent_auth: This authority's parent authority (if any).
        """
        super().__init__(name)
        self.anns_builder = (
            auth_anns_builder if auth_anns_builder is not None else AuthorityAnnotationsBuilder()
        )
        self.authdata = authdata
        self.field_groups = field_groups if field_groups is not None else DerivedFieldGroups()
        self.anns_validator = anns_validator
        self._parent = parent_auth

    @property
    def metadata(self) -> AuthorityAnnotationsMetaData:
        """Get the meta-data (that of the annotations builder)."""
        return self.anns_builder.metadata

    @property
    def parent(self) -> "Authority":
        """Get this authority's parent, or None."""
        return self._parent

    @abstractmethod
    def has_value(self, value: Any) -> bool:
        """Determine whether the given value is in this authority.

        Args:
            value: A possible authority value.

        Returns:
            True if the value is a valid entity value.
        """
        raise NotImplementedError

    def annotate_input(
        self,
        text_obj: Union[dk_annots.AnnotatedText, str],
        **kwargs: Any,
    ) -> Union[dk_annots.Annotations, None]:
        """Find and annotate this authority's entities in the document text
        as dictionaries like:
        [
            {
                'input_id': <id>,
                'start_pos': <start_char_pos>,
                'end_pos': <end_char_pos>,
                'entity_text': <entity_text>,
                'ann_type': <authority_name>,
                '<auth_id>': <auth_value_id_or_canonical_form>,
                'confidence': <confidence_if_available>,
            },
        ]

        Args:
            text_obj: The text object or string to process.
            **kwargs: Additional keyword arguments (not used here).

        Returns:
            The result of add_annotations for the text, or None when there
            is nothing to annotate (text_obj is None or a blank string).
        """
        if text_obj is None:
            return None
        if isinstance(text_obj, str):
            if not text_obj.strip():
                # A blank string has nothing to annotate; never hand a raw
                # str to add_annotations, which expects an AnnotatedText.
                return None
            text_obj = dk_annots.AnnotatedText(
                text_obj,
                annots_metadata=self.metadata,
            )
        return self.add_annotations(text_obj)

    @abstractmethod
    def add_annotations(
        self,
        text_obj: dk_annots.AnnotatedText,
    ) -> dk_annots.Annotations:
        """Method to do the work of finding, validating, and adding annotations.

        Args:
            text_obj: The annotated text object to process and add annotations.

        Returns:
            The added Annotations.
        """
        raise NotImplementedError

    def validate_ann_dicts(self, ann_dicts: List[Dict[str, Any]]) -> bool:
        """The annotation row dictionaries are valid if:
          * They are non-empty
          * and
             * either there is no annotations validator
             * or they are valid according to the validator

        Args:
            ann_dicts: Annotation dictionaries.

        Returns:
            True if valid.
        """
        return len(ann_dicts) > 0 and (
            self.anns_validator is None or self.anns_validator(self, ann_dicts)
        )

    def compose(
        self,
        annotations: dk_annots.Annotations,
    ) -> dk_annots.Annotations:
        """Compose annotations into groups.

        This base implementation returns the annotations unchanged.

        Args:
            annotations: The annotations.

        Returns:
            Composed annotations.
        """
        return annotations

    def build_annotation(
        self,
        start_pos: int = None,
        end_pos: int = None,
        entity_text: str = None,
        auth_value_id: Any = None,
        conf: float = 1.0,
        **kwargs,
    ) -> Dict[str, Any]:
        """Build an annotation row dict with the given components.

        The row's annotation type is this authority's name, and the
        confidence is passed to the row builder as "auth_valconf".

        Args:
            start_pos: The entity's start position.
            end_pos: The entity's end position.
            entity_text: The entity's text.
            auth_value_id: The authority value ID (or canonical form).
            conf: The confidence for the annotation.
            **kwargs: Additional values passed to the row builder.

        Returns:
            The annotation row dictionary.
        """
        return self.anns_builder.build_annotation_row(
            start_pos, end_pos, entity_text, self.name, auth_value_id, auth_valconf=conf, **kwargs
        )
|
|
422
|
+
|
|
423
|
+
|
|
424
|
+
class AnnotationsValidator(ABC):
    """Base class offering helpers for validating annotation rows."""

    def __call__(
        self,
        auth: Authority,
        ann_row_dicts: List[Dict[str, Any]],
    ) -> bool:
        """Make instances callable so they can be handed to an Authority as
        its anns_validator function.

        Args:
            auth: The authority proposing annotations.
            ann_row_dicts: The proposed annotations.

        Returns:
            True if the annotations are valid; otherwise, False.
        """
        proposed = AnnotationsValidator.AuthAnnotations(auth, ann_row_dicts)
        return self.validate_annotation_rows(proposed)

    @abstractmethod
    def validate_annotation_rows(
        self,
        auth_annotations: "AnnotationsValidator.AuthAnnotations",
    ) -> bool:
        """Decide whether the proposed authority annotation rows are valid.

        Args:
            auth_annotations: The AuthAnnotations instance wrapping the
                proposed data.

        Returns:
            True if valid; False if not.
        """
        raise NotImplementedError

    class AuthAnnotations:
        """Convenience wrapper around an authority's proposed annotation rows."""

        def __init__(self, auth: Authority, ann_row_dicts: List[Dict[str, Any]]):
            self.auth = auth
            self.ann_row_dicts = ann_row_dicts
            # Lazily built caches, filled on first access.
            self._row_accessor = None  # AnnotationsRowAccessor
            self._anns = None  # Annotations
            self._atts = None  # Dict[str, str]

        @property
        def row_accessor(self) -> dk_annots.AnnotationsRowAccessor:
            """The row accessor for this instance's annotations (built once)."""
            if self._row_accessor is None:
                authority = self.auth
                self._row_accessor = dk_annots.AnnotationsRowAccessor(
                    authority.metadata, derived_cols=authority.field_groups
                )
            return self._row_accessor

        @property
        def anns(self) -> dk_annots.Annotations:
            """This instance's annotation rows as an annotations object (built once)."""
            if self._anns is None:
                built = self._anns = dk_annots.Annotations(self.auth.metadata)
                for row_dict in self.ann_row_dicts:
                    built.add_dict(row_dict)
            return self._anns

        @property
        def df(self) -> pd.DataFrame:
            """The dataframe of the annotations."""
            return self.anns.df

        def get_field_type(self, row: pd.Series) -> str:
            """Fetch the row's "field_type" value (None when absent)."""
            return self.row_accessor.get_col_value("field_type", row, None)

        def get_text(self, row: pd.Series) -> str:
            """Fetch the row's entity text (None when absent)."""
            text_col = self.auth.metadata.text_col
            return self.row_accessor.get_col_value(text_col, row, None)

        @property
        def attributes(self) -> Dict[str, str]:
            """The entity attributes: each row's field type mapped to its text.

            Built once; a later row with the same field type replaces an
            earlier one.
            """
            if self._atts is None:
                collected = {}
                for _, row in self.df.iterrows():
                    field_type = self.get_field_type(row)
                    text = self.get_text(row)
                    collected[field_type] = text
                self._atts = collected
            return self._atts

        def colval(self, col_name, row) -> Any:
            """Fetch the named column's value from the given row."""
            return self.row_accessor.get_col_value(col_name, row)
|
|
517
|
+
|
|
518
|
+
|
|
519
|
+
class AuthorityFactory(ABC):
    """Abstract factory for constructing authorities."""

    @abstractmethod
    def build_authority(
        self,
        name: str,
        auth_anns_builder: AuthorityAnnotationsBuilder,
        authdata: AuthorityData,
        parent_auth: Authority = None,
    ) -> Authority:
        """Construct an authority from a name and its data.

        Subclasses must override this; the base implementation only raises.

        Args:
            name: The authority name.
            auth_anns_builder: The authority annotations row builder the
                authority should use to build annotation rows.
            authdata: The authority data.
            parent_auth: The parent authority.

        Returns:
            The authority.
        """
        raise NotImplementedError
|
|
543
|
+
|
|
544
|
+
|
|
545
|
+
class LexicalAuthority(Authority):
    """An authority of named entities identified by ID, each with associated
    values and variations.
    """

    def __init__(
        self,
        name: str,
        auth_anns_builder: AuthorityAnnotationsBuilder = None,
        authdata: AuthorityData = None,
        field_groups: DerivedFieldGroups = None,
        anns_validator: Callable[["Authority", Dict[str, Any]], bool] = None,
        parent_auth: "Authority" = None,
    ):
        """Set up the authority; every argument is forwarded to Authority.

        Args:
            name: This authority's entity name.
            auth_anns_builder: The authority annotations row builder used
                to build annotation rows.
            authdata: The authority data.
            field_groups: The derived field groups to use.
            anns_validator: fn(auth, anns_dict_list) returning True when the
                list of annotation row dicts may be added as the annotations
                for a single match or "entity".
            parent_auth: This authority's parent authority (if any).
        """
        super().__init__(
            name,
            auth_anns_builder=auth_anns_builder,
            authdata=authdata,
            field_groups=field_groups,
            anns_validator=anns_validator,
            parent_auth=parent_auth,
        )

    @abstractmethod
    def get_value_ids(self, value: Any) -> Set[Any]:
        """Collect every ID associated with a value.

        A value usually has exactly one ID; a set is returned to allow for
        inherent ambiguities in the authority.

        Args:
            value: An authority value.

        Returns:
            The associated IDs or an empty set if the value is not valid.
        """
        raise NotImplementedError

    @abstractmethod
    def get_values_by_id(self, value_id: Any) -> Set[Any]:
        """Collect every value associated with a value ID.

        An ID usually has exactly one value; a set is returned to allow for
        inherent ambiguities in the authority.

        Args:
            value_id: An authority value ID.

        Returns:
            The associated values or an empty set if the value is not valid.
        """
        raise NotImplementedError

    @abstractmethod
    def get_id_by_variation(self, variation: str) -> Set[str]:
        """Collect the IDs of the value(s) associated with a variation.

        Args:
            variation: Variation text.

        Returns:
            The possibly empty set of associated value IDs.
        """
        raise NotImplementedError

    @abstractmethod
    def find_variations(
        self,
        variation: str,
        starts_with: bool = False,
        ends_with: bool = False,
        scope: str = "fullmatch",
    ) -> pd.Series:
        """Search for all matches to a variation.

        Note:
            Of starts_with, ends_with, and scope, only the first one that is
            true is applied. When none of them is true, the pattern is
            matched in full.

        Args:
            variation: The text to find; it is treated as a regular
                expression unless starts_with or ends_with is True.
            starts_with: When True, find all terms that start with the
                variation text.
            ends_with: When True, find all terms that end with the variation
                text.
            scope: 'fullmatch' (default), 'match', or 'contains' for
                strict, less strict, and least strict matching.

        Returns:
            The matching variations as a pd.Series.
        """
        raise NotImplementedError
|
|
650
|
+
|
|
651
|
+
|
|
652
|
+
class RegexAuthority(Authority):
    """An authority whose entities are found by applying a regular expression,
    optionally mapping each match to a canonical form used as its value ID.
    """

    def __init__(
        self,
        name: str,
        regex: re.Pattern,
        canonical_fn: Callable[[str, str], Any] = None,
        auth_anns_builder: AuthorityAnnotationsBuilder = None,
        authdata: AuthorityData = None,
        field_groups: DerivedFieldGroups = None,
        anns_validator: Callable[[Authority, Dict[str, Any]], bool] = None,
        parent_auth: "Authority" = None,
    ):
        """Initialize with this authority's entity name.

        Note:
            If the regular expression has capturing groups, each group
            that takes part in a match will result in a separate entity,
            with the group name if provided in the regular expression as
            ...(?P<group_name>group_regex)...

        Args:
            name: The authority name.
            regex: The (compiled) regular expression to apply.
            canonical_fn: A function, fn(match_text, group), to transform
                input matches to a canonical form as a value_id. The second
                argument is the group name for named groups, the (1-based)
                group number for unnamed groups, or the authority name with
                the full match text when the match has no groups. Note that
                the canonical form is computed before the anns_validator is
                applied and its value will be found as the value to the
                <auth_id> key.
            auth_anns_builder: The authority annotations row builder to use
                for building annotation rows.
            authdata: The authority data.
            field_groups: The derived field groups to use.
            anns_validator: A validation function for each regex match
                formed as a list of annotation row dictionaries, one row
                dictionary for each matching regex group. If the validator
                returns False, then the annotation rows will be rejected.
                Each row holds the matched text and, when the match has
                groups, the derived field type column (e.g.
                <auth_name>_field) holds the group name or number. Rows for
                a match without groups carry no field type column.
                Note that the validator function takes the regex authority
                instance as its first parameter to provide access to the
                field_groups, etc. Its signature is:
                fn(regexAuthority, ann_row_dicts) and it returns a boolean.
            parent_auth: This authority's parent authority (if any).
        """
        super().__init__(
            name,
            auth_anns_builder=auth_anns_builder,
            authdata=authdata,
            field_groups=field_groups,
            anns_validator=anns_validator,
            parent_auth=parent_auth,
        )
        self.regex = regex
        self.canonical_fn = canonical_fn

    def has_value(self, value: Any) -> re.Match:
        """Determine whether the given value is in this authority.

        The value is converted to a string and matched from its start.

        Args:
            value: A possible authority value.

        Returns:
            None if the value is not a valid entity value; otherwise,
            return the re.Match object.
        """
        return self.regex.match(str(value))

    def add_annotations(
        self,
        text_obj: dk_annots.AnnotatedText,
    ) -> dk_annots.Annotations:
        """Method to do the work of finding, validating, and adding annotations.

        Each regex match yields one list of annotation row dicts: one row
        per participating group when the match has groups (named groups if
        the pattern defines any, otherwise numbered groups), or a single row
        for the whole match. Groups that did not take part in the match are
        skipped. The rows of a match are added only if they validate.

        Args:
            text_obj: The annotated text object to process and add annotations.

        Returns:
            The text object's Annotations.
        """
        for match in re.finditer(self.regex, text_obj.text):
            if match.lastindex is None:
                # No group took part: annotate the whole match (group 0).
                ann_dicts = [self._build_group_annotation(match, 0, self.name, {})]
            else:
                field_type_col = self.field_groups.get_field_type_col(self.name)
                if len(self.regex.groupindex) > 0:  # we have named groups
                    groups = list(self.regex.groupindex)
                else:  # we have only numbers for groups
                    groups = range(1, len(match.groups()) + 1)
                ann_dicts = [
                    self._build_group_annotation(match, group, group, {field_type_col: group})
                    for group in groups
                    # A group that did not participate has no text (None) and
                    # positions of -1, so it cannot be a real annotation.
                    if match.group(group) is not None
                ]
            if self.validate_ann_dicts(ann_dicts):
                # Add non-empty, valid annotation dicts to the result
                text_obj.annotations.add_dicts(ann_dicts)
        return text_obj.annotations

    def _build_group_annotation(
        self,
        match: re.Match,
        group: Union[str, int],
        entity_type: Union[str, int],
        extra: Dict[str, Any],
    ) -> Dict[str, Any]:
        """Build the annotation row dict for one group (0 == whole match)."""
        group_text = match.group(group)
        return self.build_annotation(
            start_pos=match.start(group),
            end_pos=match.end(group),
            entity_text=group_text,
            auth_value_id=self.get_canonical_form(group_text, entity_type),
            **extra,
        )

    def get_canonical_form(self, entity_text: str, entity_type: Union[str, int]) -> Any:
        """Get the canonical form (value ID) for the matched text.

        Args:
            entity_text: The matched text.
            entity_type: The group name, group number, or authority name
                the text was matched under.

        Returns:
            canonical_fn(entity_text, entity_type) when a canonical function
            was given; otherwise the entity text itself.
        """
        if self.canonical_fn is not None:
            entity_text = self.canonical_fn(entity_text, entity_type)
        return entity_text
|
|
783
|
+
|
|
784
|
+
|
|
785
|
+
class AuthoritiesBundle(Authority):
    """An authority that expresses its values through several bundled
    "authorities", such as dictionary-based ones and/or multiple regular
    expression patterns.
    """

    def __init__(
        self,
        name: str,
        auth_anns_builder: AuthorityAnnotationsBuilder = None,
        authdata: AuthorityData = None,
        field_groups: DerivedFieldGroups = None,
        parent_auth: "Authority" = None,
        anns_validator: Callable[["Authority", Dict[str, Any]], bool] = None,
        auths: List[Authority] = None,
    ):
        """Create the bundle.

        Args:
            name: This authority's entity name.
            auth_anns_builder: The authority annotations row builder used
                to build annotation rows.
            authdata: The authority data.
            field_groups: The derived field groups to use.
            parent_auth: This authority's parent authority (if any).
            anns_validator: fn(auth, anns_dict_list) returning True when the
                list of annotation row dicts may be added as the annotations
                for a single match or "entity".
            auths: The authorities to bundle together; the list is copied.
        """
        super().__init__(
            name,
            auth_anns_builder=auth_anns_builder,
            authdata=authdata,
            field_groups=field_groups,
            anns_validator=anns_validator,
            parent_auth=parent_auth,
        )
        # Keep a private copy so later add() calls leave the caller's list alone.
        self.auths = [] if auths is None else auths.copy()

    def add(self, auth: Authority):
        """Append an authority to this bundle.

        Args:
            auth: The authority to add.
        """
        self.auths.append(auth)

    def has_value(self, value: Any) -> bool:
        """Check whether any bundled authority has the given value.

        Args:
            value: A possible authority value.

        Returns:
            True if the value is a valid entity value.
        """
        return any(auth.has_value(value) for auth in self.auths)

    def add_annotations(
        self,
        text_obj: dk_annots.AnnotatedText,
    ) -> dk_annots.Annotations:
        """Let each bundled authority, in order, annotate the text object.

        Args:
            text_obj: The annotated text object to process and add annotations.

        Returns:
            The text object's Annotations.
        """
        for bundled in self.auths:
            bundled.annotate_input(text_obj)
        return text_obj.annotations
|