dataknobs-xization 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dataknobs-xization might be problematic. Click here for more details.

@@ -0,0 +1,766 @@
1
+ import re
2
+ from abc import ABC, abstractmethod
3
+ from collections.abc import Callable
4
+ from typing import Any, Dict, List, Set, Union
5
+
6
+ import pandas as pd
7
+
8
+ import dataknobs_xization.annotations as dk_annots
9
+
10
+ # Key annotation column name constants
11
+ KEY_AUTH_ID_COL = "auth_id"
12
+
13
+
14
class DerivedFieldGroups(dk_annots.DerivedAnnotationColumns):
    """Derived annotation columns whose names are built from a field name.

    Defines derived column types:
    * "field_type" -- The column holding the type of field of an annotation row
    * "field_group" -- The column holding the group number(s) of the field
    * "field_record" -- The column holding record number(s) of the field
    """

    def __init__(
        self,
        field_type_suffix: str = "_field",
        field_group_suffix: str = "_num",
        field_record_suffix: str = "_recsnum",
    ):
        """Add derived column types/names: Given an annotation row,
        * field_type(row) == f'{row[ann_type_col]}_field'
        * field_group(row) == f'{row[ann_type_col]}_num'
        * field_record(row) == f'{row[ann_type_col]}_recsnum'

        Where:
        * A field_type column holds annotation "sub"- type values, or fields
        * A field_group column identifies groups of annotation fields
        * A field_record column identifies groups of annotation field groups

        :param field_type_suffix: The field_type col name suffix (if not _field)
        :param field_group_suffix: The field_group col name suffix (if not _num)
        :param field_record_suffix: field_record colname sfx (if not _recsnum)
        """
        self.field_type_suffix = field_type_suffix
        self.field_group_suffix = field_group_suffix
        self.field_record_suffix = field_record_suffix

    def get_col_value(
        self,
        metadata: dk_annots.AnnotationsMetaData,
        col_type: str,
        row: pd.Series,
        missing: str = None,
    ) -> str:
        """Get the value of the column in the given row derived from col_type,
        where col_type is one of:
        * "field_type" == f"{field}_field"
        * "field_group" == f"{field}_num"
        * "field_record" == f"{field}_recsnum"

        And "field" is the value of the metadata's "ann_type" column in the row.

        :param metadata: The AnnotationsMetaData
        :param col_type: The type of column value to derive
        :param row: A row from which to get the value.
        :param missing: The value to return for unknown or missing column
        :return: The row value or the missing value
        """
        col_name_fns = {
            "field_type": self.get_field_type_col,
            "field_group": self.get_field_group_col,
            "field_record": self.get_field_record_col,
        }
        col_name_fn = col_name_fns.get(col_type)
        # An unrecognized col_type yields the missing value instead of
        # failing on an unbound local column name.
        if col_name_fn is None or metadata.ann_type_col not in row.index:
            return missing
        field = row[metadata.ann_type_col]
        if field is None:
            return missing
        col_name = col_name_fn(field)
        if col_name is not None and col_name in row.index:
            return row[col_name]
        return missing

    def unpack_field(self, field_value: str) -> str:
        """Given a field in any of its derivatives (like field type, field
        group, or field record), unpack and return the basic field value itself.

        Only the first matching suffix, checked in the order record, group,
        type, is removed, and only from the end of the string.

        :param field_value: A field name or derived field column name
        :return: The field name without its derived-column suffix
        """
        suffixes = (
            self.field_record_suffix,
            self.field_group_suffix,
            self.field_type_suffix,
        )
        for suffix in suffixes:
            if field_value.endswith(suffix):
                # Slice rather than str.replace so that occurrences of the
                # suffix text elsewhere in the field name are preserved.
                return field_value[: len(field_value) - len(suffix)]
        return field_value

    def get_field_name(self, field_value: str) -> str:
        """Given a field name or field col name, e.g., an annotation type col's
        value (the field name); or a field type, group, or record column name,
        get the field name.

        :param field_value: A field name or derived field column name
        :return: The field name
        """
        return self.unpack_field(field_value)

    def get_field_type_col(self, field_value: str) -> str:
        """Given a field name or field col name, e.g., an annotation type col's
        value; or a field type, group, or record, get the name of the derived
        field type column.

        :param field_value: A field name or derived field column name
        :return: The field type column name
        """
        field = self.unpack_field(field_value)
        return f"{field}{self.field_type_suffix}"

    def get_field_group_col(self, field_value: str) -> str:
        """Given a field name or field col name, e.g., an annotation type col's
        value; or a field type, group, or record, get the name of the derived
        field group column.

        :param field_value: A field name or derived field column name
        :return: The field group column name
        """
        field = self.unpack_field(field_value)
        return f"{field}{self.field_group_suffix}"

    def get_field_record_col(self, field_value: str) -> str:
        """Given a field name or field col name, e.g., an annotation type col's
        value; or a field type, group, or record, get the name of the derived
        field record column.

        :param field_value: A field name or derived field column name
        :return: The field record column name
        """
        field = self.unpack_field(field_value)
        return f"{field}{self.field_record_suffix}"
123
+
124
+
125
class AuthorityAnnotationsMetaData(dk_annots.AnnotationsMetaData):
    """Annotations metadata that also carries an 'auth_id' key column.

    Extends the standard (key) annotation columns of AnnotationsMetaData with
    the column holding an authority value's ID.
    """

    def __init__(
        self,
        start_pos_col: str = dk_annots.KEY_START_POS_COL,
        end_pos_col: str = dk_annots.KEY_END_POS_COL,
        text_col: str = dk_annots.KEY_TEXT_COL,
        ann_type_col: str = dk_annots.KEY_ANN_TYPE_COL,
        auth_id_col: str = KEY_AUTH_ID_COL,
        sort_fields: List[str] = (dk_annots.KEY_START_POS_COL, dk_annots.KEY_END_POS_COL),
        sort_fields_ascending: List[bool] = (True, False),
        **kwargs,
    ):
        """Set up the key column names (and any additional column mappings).

        Key column types:
        * start_pos
        * end_pos
        * text
        * ann_type
        * auth_id

        Notes:
        * The underlying table columns may carry any names
        * BUT: code working with the "key" columns through the annotations
          classes and interfaces must go through the key column constants

        :param start_pos_col: Col name for the token starting position
        :param end_pos_col: Col name for the token ending position
        :param text_col: Col name for the token text
        :param ann_type_col: Col name for the annotation types
        :param auth_id_col: Col name for the authority value ID
        :param sort_fields: The col types relevant for sorting annotation rows
        :param sort_fields_ascending: To specify sort order of sort_fields
        :param **kwargs: More column types mapped to column names
        """
        # The auth_id column travels to the base class as an extra
        # "column type -> column name" keyword, ahead of the caller's kwargs.
        super().__init__(
            start_pos_col=start_pos_col,
            end_pos_col=end_pos_col,
            text_col=text_col,
            ann_type_col=ann_type_col,
            sort_fields=sort_fields,
            sort_fields_ascending=sort_fields_ascending,
            auth_id=auth_id_col,
            **kwargs,
        )

    @property
    def auth_id_col(self) -> str:
        """The name of the column that holds the auth_id."""
        # Presumably the base class stores the extra keyword mappings in
        # self.data, keyed by column type -- confirm against AnnotationsMetaData.
        auth_id_colname = self.data[KEY_AUTH_ID_COL]
        return auth_id_colname
179
+
180
+
181
class AuthorityAnnotationsBuilder(dk_annots.AnnotationsBuilder):
    """An AnnotationsBuilder whose rows also carry the 'auth_id' column."""

    def __init__(
        self,
        metadata: AuthorityAnnotationsMetaData = None,
        data_defaults: Dict[str, Any] = None,
    ):
        """:param metadata: The authority annotations metadata; a default
            AuthorityAnnotationsMetaData is created when None
        :param data_defaults: Dict[ann_colname, default_value] with default
            values for annotation columns
        """
        if metadata is None:
            metadata = AuthorityAnnotationsMetaData()
        super().__init__(metadata, data_defaults)

    def build_annotation_row(
        self, start_pos: int, end_pos: int, text: str, ann_type: str, auth_id: str, **kwargs
    ) -> Dict[str, Any]:
        """Assemble one annotation row from the mandatory key values plus any
        additional keyword arguments.

        The key values are mapped onto this builder's metadata column names and
        handed, together with the kwargs, to do_build_row.

        :param start_pos: The token start position
        :param end_pos: The token end position
        :param text: The token text
        :param ann_type: The annotation type
        :param auth_id: The authority ID for the row
        :return: The row dictionary produced by do_build_row
        """
        meta = self.metadata
        key_values = {
            meta.start_pos_col: start_pos,
            meta.end_pos_col: end_pos,
            meta.text_col: text,
            meta.ann_type_col: ann_type,
            meta.auth_id_col: auth_id,
        }
        return self.do_build_row(key_values, **kwargs)
224
+
225
+
226
class AuthorityData:
    """Pairs an authority's dataframe with the authority's name."""

    def __init__(self, df: pd.DataFrame, name: str):
        """:param df: The dataframe holding the authority data
        :param name: The authority name, also used as the name of the
            dataframe column searched when looking up values
        """
        self._df = df
        self.name = name

    @property
    def df(self) -> pd.DataFrame:
        """The dataframe holding the authority data."""
        return self._df

    def lookup_values(self, value: Any, is_id=False) -> pd.DataFrame:
        """Find the authority rows matching a value or a value id.

        :param value: A value or value_id for this authority
        :param is_id: True to compare against the dataframe's index (the
            IDs); False to compare against the column named self.name
        :return: The applicable authority dataframe rows.
        """
        frame = self.df
        if is_id:
            candidates = frame.index
        else:
            candidates = frame[self.name]
        matches = candidates == value
        return frame[matches]
246
+
247
+
248
class Authority(dk_annots.Annotator):
    """A class for managing and defining tabular authoritative data for e.g.,
    taxonomies, etc., and using them to annotate instances within text.
    """

    def __init__(
        self,
        name: str,
        auth_anns_builder: AuthorityAnnotationsBuilder = None,
        authdata: AuthorityData = None,
        field_groups: DerivedFieldGroups = None,
        anns_validator: Callable[["Authority", List[Dict[str, Any]]], bool] = None,
        parent_auth: "Authority" = None,
    ):
        """Initialize with this authority's metadata.

        :param name: This authority's entity name
        :param auth_anns_builder: The authority annotations row builder to use
            for building annotation rows. A default AuthorityAnnotationsBuilder
            is created when None.
        :param authdata: The authority data
        :param field_groups: The derived field groups to use. A default
            DerivedFieldGroups is created when None.
        :param anns_validator: fn(auth, anns_dict_list) that returns True if
            the list of annotation row dicts are valid to be added as
            annotations for a single match or "entity".
        :param parent_auth: This authority's parent authority (if any)
        """
        super().__init__(name)
        self.anns_builder = (
            auth_anns_builder if auth_anns_builder is not None else AuthorityAnnotationsBuilder()
        )
        self.authdata = authdata
        self.field_groups = field_groups if field_groups is not None else DerivedFieldGroups()
        self.anns_validator = anns_validator
        self._parent = parent_auth

    @property
    def metadata(self) -> AuthorityAnnotationsMetaData:
        """Get the meta-data (that of this authority's annotations builder)"""
        return self.anns_builder.metadata

    @property
    def parent(self) -> "Authority":
        """Get this authority's parent, or None."""
        return self._parent

    @abstractmethod
    def has_value(self, value: Any) -> bool:
        """Determine whether the given value is in this authority.

        :param value: A possible authority value.
        :return: True if the value is a valid entity value.
        """
        raise NotImplementedError

    def annotate_input(
        self,
        text_obj: Union[dk_annots.AnnotatedText, str],
        **kwargs,
    ) -> dk_annots.Annotations:
        """Find and annotate this authority's entities in the document text
        as dictionaries like:
        [
            {
                'input_id': <id>,
                'start_pos': <start_char_pos>,
                'end_pos': <end_char_pos>,
                'entity_text': <entity_text>,
                'ann_type': <authority_name>,
                '<auth_id>': <auth_value_id_or_canonical_form>,
                'confidence': <confidence_if_available>,
            },
        ]

        :param text_obj: The text object or string to process. A non-blank
            string is wrapped in an AnnotatedText using this authority's
            metadata.
        :param **kwargs: Accepted but not used by this implementation
        :return: An Annotations instance, or None when text_obj is None or a
            blank string (there is nothing to annotate)
        """
        if text_obj is None:
            return None
        if isinstance(text_obj, str):
            if len(text_obj.strip()) == 0:
                # A blank string can't be wrapped or annotated; handing the
                # raw str to add_annotations would fail on attribute access.
                return None
            text_obj = dk_annots.AnnotatedText(
                text_obj,
                annots_metadata=self.metadata,
            )
        return self.add_annotations(text_obj)

    @abstractmethod
    def add_annotations(
        self,
        text_obj: dk_annots.AnnotatedText,
    ) -> dk_annots.Annotations:
        """Method to do the work of finding, validating, and adding annotations.

        :param text_obj: The annotated text object to process and add annotations.
        :return: The added Annotations
        """
        raise NotImplementedError

    def validate_ann_dicts(self, ann_dicts: List[Dict[str, Any]]) -> bool:
        """The annotation row dictionaries are valid if:
        * They are non-empty
        * and
            * either there is no annotations validator
            * or they are valid according to the validator

        :param ann_dicts: Annotation dictionaries
        :return: True if valid
        """
        return len(ann_dicts) > 0 and (
            self.anns_validator is None or self.anns_validator(self, ann_dicts)
        )

    def compose(
        self,
        annotations: dk_annots.Annotations,
    ) -> dk_annots.Annotations:
        """Compose annotations into groups.

        This base implementation returns the annotations unchanged.

        :param annotations: The annotations
        :return: composed annotations
        """
        return annotations

    def build_annotation(
        self,
        start_pos: int = None,
        end_pos: int = None,
        entity_text: str = None,
        auth_value_id: Any = None,
        conf: float = 1.0,
        **kwargs,
    ) -> Dict[str, Any]:
        """Build an annotation row dictionary with the given components.

        The row's annotation type is this authority's name and the confidence
        is passed to the builder under the "auth_valconf" keyword.

        :param start_pos: The entity's start position
        :param end_pos: The entity's end position
        :param entity_text: The entity's text
        :param auth_value_id: The authority value ID (or canonical form)
        :param conf: The confidence for the annotation
        :param **kwargs: Additional column values for the row
        :return: The annotation row dictionary from the annotations builder
        """
        return self.anns_builder.build_annotation_row(
            start_pos, end_pos, entity_text, self.name, auth_value_id, auth_valconf=conf, **kwargs
        )
378
+
379
+
380
class AnnotationsValidator(ABC):
    """Base class, with helpers, for validating proposed annotation rows."""

    def __call__(
        self,
        auth: Authority,
        ann_row_dicts: List[Dict[str, Any]],
    ) -> bool:
        """Make instances usable as an Authority's anns_validator function.

        The proposed rows are wrapped in an AuthAnnotations instance and passed
        to validate_annotation_rows.

        :param auth: The authority proposing annotations
        :param ann_row_dicts: The proposed annotations
        :return: True if the annotations are valid; otherwise, False
        """
        proposed = AnnotationsValidator.AuthAnnotations(auth, ann_row_dicts)
        return self.validate_annotation_rows(proposed)

    @abstractmethod
    def validate_annotation_rows(
        self,
        auth_annotations: "AnnotationsValidator.AuthAnnotations",
    ) -> bool:
        """Decide whether the proposed authority annotation rows are valid.

        :param auth_annotations: The AuthAnnotations instance with the
            proposed data.
        :return: True if valid; False if not.
        """
        raise NotImplementedError

    class AuthAnnotations:
        """Convenience wrapper around an authority's proposed annotation rows."""

        def __init__(self, auth: Authority, ann_row_dicts: List[Dict[str, Any]]):
            """:param auth: The authority proposing the annotations
            :param ann_row_dicts: The proposed annotation row dictionaries
            """
            self.auth = auth
            self.ann_row_dicts = ann_row_dicts
            # Lazily built caches, filled on first property access.
            self._row_accessor = None  # AnnotationsRowAccessor
            self._anns = None  # Annotations
            self._atts = None  # Dict[str, str]

        @property
        def row_accessor(self) -> dk_annots.AnnotationsRowAccessor:
            """The (lazily created) row accessor for these annotations."""
            if self._row_accessor is not None:
                return self._row_accessor
            self._row_accessor = dk_annots.AnnotationsRowAccessor(
                self.auth.metadata, derived_cols=self.auth.field_groups
            )
            return self._row_accessor

        @property
        def anns(self) -> dk_annots.Annotations:
            """The annotation rows as a (lazily created) Annotations object."""
            if self._anns is not None:
                return self._anns
            self._anns = dk_annots.Annotations(self.auth.metadata)
            for ann_row in self.ann_row_dicts:
                self._anns.add_dict(ann_row)
            return self._anns

        @property
        def df(self) -> pd.DataFrame:
            """The dataframe of the annotations object."""
            return self.anns.df

        def get_field_type(self, row: pd.Series) -> str:
            """Get the row's "field_type" value (None when unavailable)."""
            accessor = self.row_accessor
            return accessor.get_col_value("field_type", row, None)

        def get_text(self, row: pd.Series) -> str:
            """Get the row's entity text (None when unavailable)."""
            text_col = self.auth.metadata.text_col
            return self.row_accessor.get_col_value(text_col, row, None)

        @property
        def attributes(self) -> Dict[str, str]:
            """The entity attributes: each row's field type mapped to its text.

            Rows sharing a field type overwrite one another; the last row wins.
            """
            if self._atts is None:
                collected = {}
                for _, ann_row in self.df.iterrows():
                    field_type = self.get_field_type(ann_row)
                    collected[field_type] = self.get_text(ann_row)
                self._atts = collected
            return self._atts

        def colval(self, col_name, row) -> Any:
            """Get the named column's value from the given row."""
            accessor = self.row_accessor
            return accessor.get_col_value(col_name, row)
465
+
466
+
467
class AuthorityFactory(ABC):
    """Abstract factory for building Authority instances."""

    @abstractmethod
    def build_authority(
        self,
        name: str,
        auth_anns_builder: AuthorityAnnotationsBuilder,
        authdata: AuthorityData,
        parent_auth: Authority = None,
    ) -> Authority:
        """Create an authority from the given name and data.

        Concrete factories must override this method; this base
        implementation only raises NotImplementedError.

        :param name: The authority name
        :param auth_anns_builder: The authority annotations row builder to use
            for building annotation rows.
        :param authdata: The authority data
        :param parent_auth: The parent authority.
        :return: The authority
        """
        raise NotImplementedError
487
+
488
+
489
class LexicalAuthority(Authority):
    """Abstract authority over named entities identified by ID, each with
    associated values and variations.
    """

    def __init__(
        self,
        name: str,
        auth_anns_builder: AuthorityAnnotationsBuilder = None,
        authdata: AuthorityData = None,
        field_groups: DerivedFieldGroups = None,
        anns_validator: Callable[["Authority", Dict[str, Any]], bool] = None,
        parent_auth: "Authority" = None,
    ):
        """Set up this authority; every argument is forwarded to Authority.

        :param name: This authority's entity name
        :param auth_anns_builder: The authority annotations row builder to use
            for building annotation rows.
        :param authdata: The authority data
        :param field_groups: The derived field groups to use
        :param anns_validator: fn(auth, anns_dict_list) that returns True if
            the list of annotation row dicts are valid to be added as
            annotations for a single match or "entity".
        :param parent_auth: This authority's parent authority (if any)
        """
        super().__init__(
            name,
            auth_anns_builder=auth_anns_builder,
            authdata=authdata,
            field_groups=field_groups,
            anns_validator=anns_validator,
            parent_auth=parent_auth,
        )

    @abstractmethod
    def get_value_ids(self, value: Any) -> Set[Any]:
        """Get every ID associated with the given value.

        A value typically has a single ID, but a set is returned to allow for
        inherent ambiguities in the authority.

        :param value: An authority value
        :return: The associated IDs or an empty set if the value is not valid.
        """
        raise NotImplementedError

    @abstractmethod
    def get_values_by_id(self, value_id: Any) -> Set[Any]:
        """Get every value associated with the given value ID.

        An ID typically has a single value, but a set is returned to allow for
        inherent ambiguities in the authority.

        :param value_id: An authority value ID
        :return: The associated values or an empty set if the ID is not valid.
        """
        raise NotImplementedError

    @abstractmethod
    def get_id_by_variation(self, variation: str) -> Set[str]:
        """Get the IDs of the value(s) associated with the given variation.

        :param variation: Variation text
        :return: The possibly empty set of associated value IDs.
        """
        raise NotImplementedError

    @abstractmethod
    def find_variations(
        self,
        variation: str,
        starts_with: bool = False,
        ends_with: bool = False,
        scope: str = "fullmatch",
    ) -> pd.Series:
        """Find all matches to the given variation.

        :param variation: The text to find; treated as a regular expression
            unless either starts_with or ends_with is True.
        :param starts_with: When True, find all terms that start with the
            variation text.
        :param ends_with: When True, find all terms that end with the variation
            text.
        :param scope: 'fullmatch' (default), 'match', or 'contains' for
            strict, less strict, and least strict matching
        :return: The matching variations as a pd.Series

        Note only the first true of starts_with, ends_with, and scope will
        be applied. If none of these are true, a full match on the pattern
        is performed.
        """
        raise NotImplementedError
578
+
579
+
580
class RegexAuthority(Authority):
    """An authority whose entities are found by matching a regular expression
    against text.
    """

    def __init__(
        self,
        name: str,
        regex: re.Pattern,
        canonical_fn: Callable[[str, str], Any] = None,
        auth_anns_builder: AuthorityAnnotationsBuilder = None,
        authdata: AuthorityData = None,
        field_groups: DerivedFieldGroups = None,
        anns_validator: Callable[[Authority, Dict[str, Any]], bool] = None,
        parent_auth: "Authority" = None,
    ):
        """Initialize with this authority's entity name.

        :param name: The authority name
        :param regex: The (compiled) regular expression to apply
        :param canonical_fn: A function, fn(match_text, group_label), to
            transform input matches to a canonical form as a value_id.
            The group_label is the group name for named groups, the group
            number for unnamed groups, or the authority name (with the full
            match text) when the match is not reported by group. Note that the
            canonical form is computed before the anns_validator is applied
            and its value will be found as the value to the <auth_id> key.
        :param auth_anns_builder: The authority annotations row builder to use
            for building annotation rows.
        :param authdata: The authority data
        :param field_groups: The derived field groups to use
        :param anns_validator: A validation function for each regex match
            formed as a list of annotation row dictionaries, one row dictionary
            for each matching regex group. If the validator returns False,
            then the annotation rows will be rejected. The entity_text key
            will hold matched text and the <auth_name>_field key will hold
            the group name or number (if there are groups with or without
            names); rows for matches without groups carry no
            <auth_name>_field value from this class.
            Note that the validator function takes the regex authority instance
            as its first parameter to provide access to the field_groups, etc.
            The validation_fn signature is: fn(regexAuthority, ann_row_dicts)
            and returns a boolean.
        :param parent_auth: This authority's parent authority (if any)

        NOTE: If the regular expression has capturing groups, each group
        that participates in a match will result in a separate entity, with
        the group name if provided in the regular expression as
        ...(?P<group_name>group_regex)... When any named group exists, only
        the named groups are reported.
        """
        super().__init__(
            name,
            auth_anns_builder=auth_anns_builder,
            authdata=authdata,
            field_groups=field_groups,
            anns_validator=anns_validator,
            parent_auth=parent_auth,
        )
        self.regex = regex
        self.canonical_fn = canonical_fn

    def has_value(self, value: Any) -> re.Match:
        """Determine whether the given value is in this authority.

        The regex is applied with Pattern.match to str(value), so the match is
        anchored at the start of the value but need not consume all of it.

        :param value: A possible authority value.
        :return: None if the value is not a valid entity value; otherwise,
            return the re.Match object.
        """
        return self.regex.match(str(value))

    def add_annotations(
        self,
        text_obj: dk_annots.AnnotatedText,
    ) -> dk_annots.Annotations:
        """Method to do the work of finding, validating, and adding annotations.

        Each regex match yields a list of annotation row dicts that is added
        to the text object's annotations only if it passes validate_ann_dicts.

        :param text_obj: The annotated text object to process and add annotations.
        :return: The text object's Annotations
        """
        for match in self.regex.finditer(text_obj.text):
            ann_dicts = self._build_match_annotations(match)
            if self.validate_ann_dicts(ann_dicts):
                # Add non-empty, valid annotation dicts to the result
                text_obj.annotations.add_dicts(ann_dicts)
        return text_obj.annotations

    def _build_match_annotations(self, match: re.Match) -> List[Dict[str, Any]]:
        """Build the annotation row dicts for a single regex match."""
        if match.lastindex is None:
            # No capturing group took part in the match: annotate the whole
            # match, labeling the canonical form with the authority name.
            full_text = match.group()
            return [
                self.build_annotation(
                    start_pos=match.start(),
                    end_pos=match.end(),
                    entity_text=full_text,
                    auth_value_id=self.get_canonical_form(full_text, self.name),
                )
            ]

        if len(self.regex.groupindex) > 0:
            # Named groups: label each by name (unnamed groups are ignored).
            labeled_groups = list(self.regex.groupindex.items())
        else:
            # Only unnamed groups: label each by its 1-based group number.
            labeled_groups = [(num, num) for num in range(1, self.regex.groups + 1)]

        field_type_col = self.field_groups.get_field_type_col(self.name)
        ann_dicts = []
        for group_label, group_num in labeled_groups:
            group_text = match.group(group_num)
            if group_text is None:
                # The group did not participate in this match (e.g., an
                # optional group or an untaken alternative); its span is
                # (-1, -1), so there is no entity to annotate.
                continue
            ann_dicts.append(
                self.build_annotation(
                    start_pos=match.start(group_num),
                    end_pos=match.end(group_num),
                    entity_text=group_text,
                    auth_value_id=self.get_canonical_form(group_text, group_label),
                    **{field_type_col: group_label},
                )
            )
        return ann_dicts

    def get_canonical_form(self, entity_text: str, entity_type: Union[str, int]) -> Any:
        """Get the canonical form (value id) for the matched entity text.

        :param entity_text: The matched text
        :param entity_type: The group name, group number, or authority name
            identifying what was matched
        :return: The result of canonical_fn(entity_text, entity_type), or the
            entity_text itself when there is no canonical_fn
        """
        if self.canonical_fn is not None:
            entity_text = self.canonical_fn(entity_text, entity_type)
        return entity_text
702
+
703
+
704
class AuthoritiesBundle(Authority):
    """An authority that expresses values through several bundled
    "authorities", like dictionary-based and/or multiple regular expression
    patterns.
    """

    def __init__(
        self,
        name: str,
        auth_anns_builder: AuthorityAnnotationsBuilder = None,
        authdata: AuthorityData = None,
        field_groups: DerivedFieldGroups = None,
        parent_auth: "Authority" = None,
        anns_validator: Callable[["Authority", Dict[str, Any]], bool] = None,
        auths: List[Authority] = None,
    ):
        """:param name: This authority's entity name
        :param auth_anns_builder: The authority annotations row builder to use
            for building annotation rows.
        :param authdata: The authority data
        :param field_groups: The derived field groups to use
        :param parent_auth: This authority's parent authority (if any)
        :param anns_validator: fn(auth, anns_dict_list) that returns True if
            the list of annotation row dicts are valid to be added as
            annotations for a single match or "entity".
        :param auths: The authorities to bundle together; the list is copied.
        """
        super().__init__(
            name,
            auth_anns_builder=auth_anns_builder,
            authdata=authdata,
            field_groups=field_groups,
            anns_validator=anns_validator,
            parent_auth=parent_auth,
        )
        if auths is None:
            self.auths = list()
        else:
            # Copy so that later add() calls don't alter the caller's list.
            self.auths = auths.copy()

    def add(self, auth: Authority):
        """Append an authority to this bundle.

        :param auth: The authority to add.
        """
        self.auths.append(auth)

    def has_value(self, value: Any) -> bool:
        """Determine whether any bundled authority has the given value.

        :param value: A possible authority value.
        :return: True if the value is a valid entity value.
        """
        return any(member.has_value(value) for member in self.auths)

    def add_annotations(
        self,
        text_obj: dk_annots.AnnotatedText,
    ) -> dk_annots.Annotations:
        """Have each bundled authority, in order, annotate the text object.

        :param text_obj: The annotated text object to process and add annotations.
        :return: The text object's Annotations
        """
        for member in self.auths:
            member.annotate_input(text_obj)
        return text_obj.annotations