dataknobs-xization 1.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,860 @@
1
+ """Authority-based annotation processing and field grouping.
2
+
3
+ Provides classes for managing authority-based annotations, field groups,
4
+ and derived annotation columns for structured text extraction.
5
+ """
6
+
7
+ import re
8
+ from abc import ABC, abstractmethod
9
+ from collections.abc import Callable
10
+ from typing import Any, Dict, List, Set, Union
11
+
12
+ import pandas as pd
13
+
14
+ import dataknobs_xization.annotations as dk_annots
15
+
16
# Default column name (and metadata key) for the authority value ID.
KEY_AUTH_ID_COL = "auth_id"
18
+
19
+
20
class DerivedFieldGroups(dk_annots.DerivedAnnotationColumns):
    """Defines derived column types:
    * "field_type" -- The column holding the type of field of an annotation row
    * "field_group" -- The column holding the group number(s) of the field
    * "field_record" -- The column holding record number(s) of the field
    """

    def __init__(
        self,
        field_type_suffix: str = "_field",
        field_group_suffix: str = "_num",
        field_record_suffix: str = "_recsnum",
    ):
        """Add derived column types/names: Given an annotation row,
          * field_type(row) == f'{row[ann_type_col]}_field'
          * field_group(row) == f'{row[ann_type_col]}_num'
          * field_record(row) == f'{row[ann_type_col]}_recsnum'

        Where:
          * A field_type column holds annotation "sub"- type values, or fields
          * A field_group column identifies groups of annotation fields
          * A field_record column identifies groups of annotation field groups

        Args:
            field_type_suffix: The field_type col name suffix (if not _field).
            field_group_suffix: The field_group col name suffix (if not _num).
            field_record_suffix: field_record colname sfx (if not _recsnum).
        """
        self.field_type_suffix = field_type_suffix
        self.field_group_suffix = field_group_suffix
        self.field_record_suffix = field_record_suffix

    def get_col_value(
        self,
        metadata: dk_annots.AnnotationsMetaData,
        col_type: str,
        row: pd.Series,
        missing: str = None,
    ) -> str:
        """Get the value of the column in the given row derived from col_type,
        where col_type is one of:
          * "field_type" == f"{field}_field"
          * "field_group" == f"{field}_num"
          * "field_record" == f"{field}_recsnum"

        And "field" is the row's value in the metadata's "ann_type" column.

        Args:
            metadata: The AnnotationsMetaData.
            col_type: The type of column value to derive.
            row: A row from which to get the value.
            missing: The value to return for unknown or missing column.

        Returns:
            The row value, or the missing value when the annotation type
            column is absent, the col_type is not one of the derived types,
            or the derived column is not in the row.
        """
        value = missing
        if metadata.ann_type_col in row.index:
            field = row[metadata.ann_type_col]
            if field is not None:
                # Start from None so an unrecognized col_type falls through
                # to the "missing" value instead of raising UnboundLocalError.
                col_name = None
                if col_type == "field_type":
                    col_name = self.get_field_type_col(field)
                elif col_type == "field_group":
                    col_name = self.get_field_group_col(field)
                elif col_type == "field_record":
                    col_name = self.get_field_record_col(field)
                if col_name is not None and col_name in row.index:
                    value = row[col_name]
        return value

    def unpack_field(self, field_value: str) -> str:
        """Given a field in any of its derivatives (like field type, field group
        or field record), unpack and return the basic field value itself.

        Only a single trailing suffix is removed; occurrences of the suffix
        text elsewhere in the value are left intact. The suffixes are checked
        in the order: record, group, type.

        Args:
            field_value: A field name or a derived field column name.

        Returns:
            The field name with its derived-column suffix (if any) removed.
        """
        suffixes = (
            self.field_record_suffix,
            self.field_group_suffix,
            self.field_type_suffix,
        )
        for suffix in suffixes:
            if field_value.endswith(suffix):
                # Slice by computed length so that an empty suffix is a no-op.
                return field_value[: len(field_value) - len(suffix)]
        return field_value

    def get_field_name(self, field_value: str) -> str:
        """Given a field name or field col name, e.g., an annotation type col's
        value (the field name); or a field type, group, or record column name,
        get the field name.

        Args:
            field_value: A field name or a derived field column name.

        Returns:
            The basic field name.
        """
        return self.unpack_field(field_value)

    def get_field_type_col(self, field_value: str) -> str:
        """Given a field name or field col name, e.g., an annotation type col's
        value; or a field type, group, or record column name, get the name of
        the derived field type column.

        Args:
            field_value: A field name or a derived field column name.

        Returns:
            The field name followed by the field_type suffix.
        """
        field = self.unpack_field(field_value)
        return f"{field}{self.field_type_suffix}"

    def get_field_group_col(self, field_value: str) -> str:
        """Given a field name or field col name, e.g., an annotation type col's
        value; or a field type, group, or record, get the name of the derived
        field group column.

        Args:
            field_value: A field name or a derived field column name.

        Returns:
            The field name followed by the field_group suffix.
        """
        field = self.unpack_field(field_value)
        return f"{field}{self.field_group_suffix}"

    def get_field_record_col(self, field_value: str) -> str:
        """Given a field name or field col name, e.g., an annotation type col's
        value; or a field type, group, or record, get the name of the derived
        field record column.

        Args:
            field_value: A field name or a derived field column name.

        Returns:
            The field name followed by the field_record suffix.
        """
        field = self.unpack_field(field_value)
        return f"{field}{self.field_record_suffix}"
133
+
134
+
135
class AuthorityAnnotationsMetaData(dk_annots.AnnotationsMetaData):
    """AnnotationsMetaData extended with an 'auth_id_col' alongside the
    standard key annotation columns.
    """

    def __init__(
        self,
        start_pos_col: str = dk_annots.KEY_START_POS_COL,
        end_pos_col: str = dk_annots.KEY_END_POS_COL,
        text_col: str = dk_annots.KEY_TEXT_COL,
        ann_type_col: str = dk_annots.KEY_ANN_TYPE_COL,
        auth_id_col: str = KEY_AUTH_ID_COL,
        sort_fields: List[str] = (dk_annots.KEY_START_POS_COL, dk_annots.KEY_END_POS_COL),
        sort_fields_ascending: List[bool] = (True, False),
        **kwargs: Any,
    ):
        """Set up the key column names, registering the auth_id column as an
        additional column type with the base metadata.

        The key column types are: start_pos, end_pos, text, ann_type, and
        auth_id.

        Note:
            Table columns may carry arbitrary names, but code that works with
            the "key" columns through the annotations classes and interfaces
            must refer to them by the key column constants.

        Args:
            start_pos_col: Name of the token starting position column.
            end_pos_col: Name of the token ending position column.
            text_col: Name of the token text column.
            ann_type_col: Name of the annotation type column.
            auth_id_col: Name of the authority value ID column.
            sort_fields: Column types used to sort annotation rows.
            sort_fields_ascending: Sort direction for each of sort_fields.
            **kwargs: Additional column types mapped to column names.
        """
        # Keep the keyword order of the base-class call: standard columns,
        # sorting info, then the auth_id column ahead of any extra kwargs.
        base_args = {
            "start_pos_col": start_pos_col,
            "end_pos_col": end_pos_col,
            "text_col": text_col,
            "ann_type_col": ann_type_col,
            "sort_fields": sort_fields,
            "sort_fields_ascending": sort_fields_ascending,
            "auth_id": auth_id_col,
        }
        super().__init__(**base_args, **kwargs)

    @property
    def auth_id_col(self) -> str:
        """The name of the column holding the auth_id."""
        col_name = self.data[KEY_AUTH_ID_COL]
        return col_name
190
+
191
+
192
class AuthorityAnnotationsBuilder(dk_annots.AnnotationsBuilder):
    """An AnnotationsBuilder whose rows also carry the 'auth_id' column."""

    def __init__(
        self,
        metadata: AuthorityAnnotationsMetaData = None,
        data_defaults: Dict[str, Any] = None,
    ):
        """Create the builder, defaulting to fresh authority metadata.

        Args:
            metadata: The authority annotations metadata; when None, a
                default AuthorityAnnotationsMetaData is created.
            data_defaults: Dict[ann_colname, default_value] giving default
                values for annotation columns.
        """
        if metadata is None:
            metadata = AuthorityAnnotationsMetaData()
        super().__init__(metadata, data_defaults)

    def build_annotation_row(
        self, start_pos: int, end_pos: int, text: str, ann_type: str, auth_id: str, **kwargs: Any
    ) -> Dict[str, Any]:
        """Build an annotation row from the mandatory key values plus any
        extra keyword arguments.

        The key values are stored under the column names given by this
        builder's metadata and handed, with the kwargs, to do_build_row.

        Args:
            start_pos: The token start position.
            end_pos: The token end position.
            text: The token text.
            ann_type: The annotation type.
            auth_id: The authority ID for the row.
            **kwargs: Additional keyword arguments passed to do_build_row.

        Returns:
            The result row dictionary.
        """
        key_values = {}
        key_values[self.metadata.start_pos_col] = start_pos
        key_values[self.metadata.end_pos_col] = end_pos
        key_values[self.metadata.text_col] = text
        key_values[self.metadata.ann_type_col] = ann_type
        key_values[self.metadata.auth_id_col] = auth_id
        return self.do_build_row(key_values, **kwargs)
241
+
242
+
243
class AuthorityData:
    """Pairs an authority's dataframe with the authority's name."""

    def __init__(self, df: pd.DataFrame, name: str):
        """Hold on to the dataframe and the authority name.

        Args:
            df: The authority data.
            name: The authority name; lookup_values uses it as the name of
                the value column in df.
        """
        self._df = df
        self.name = name

    @property
    def df(self) -> pd.DataFrame:
        """The authority data as a dataframe."""
        return self._df

    def lookup_values(self, value: Any, is_id: bool = False) -> pd.DataFrame:
        """Select the authority rows matching a value or a value id.

        Args:
            value: A value or value_id for this authority.
            is_id: When True, compare value against the dataframe index;
                otherwise compare it against the column named self.name.

        Returns:
            The applicable authority dataframe rows.
        """
        frame = self.df
        if is_id:
            selector = frame.index == value
        else:
            selector = frame[self.name] == value
        return frame[selector]
267
+
268
+
269
class Authority(dk_annots.Annotator):
    """A class for managing and defining tabular authoritative data for e.g.,
    taxonomies, etc., and using them to annotate instances within text.
    """

    def __init__(
        self,
        name: str,
        auth_anns_builder: AuthorityAnnotationsBuilder = None,
        authdata: AuthorityData = None,
        field_groups: DerivedFieldGroups = None,
        anns_validator: Callable[["Authority", List[Dict[str, Any]]], bool] = None,
        parent_auth: "Authority" = None,
    ):
        """Initialize with this authority's metadata.

        Args:
            name: This authority's entity name.
            auth_anns_builder: The authority annotations row builder to use
                for building annotation rows. Defaults to a new
                AuthorityAnnotationsBuilder.
            authdata: The authority data.
            field_groups: The derived field groups to use. Defaults to a new
                DerivedFieldGroups.
            anns_validator: fn(auth, anns_dict_list) that returns True if
                the list of annotation row dicts are valid to be added as
                annotations for a single match or "entity".
            parent_auth: This authority's parent authority (if any).
        """
        super().__init__(name)
        self.anns_builder = (
            auth_anns_builder if auth_anns_builder is not None else AuthorityAnnotationsBuilder()
        )
        self.authdata = authdata
        self.field_groups = field_groups if field_groups is not None else DerivedFieldGroups()
        self.anns_validator = anns_validator
        self._parent = parent_auth

    @property
    def metadata(self) -> AuthorityAnnotationsMetaData:
        """Get the meta-data (from the annotations builder)."""
        return self.anns_builder.metadata

    @property
    def parent(self) -> "Authority":
        """Get this authority's parent, or None."""
        return self._parent

    @abstractmethod
    def has_value(self, value: Any) -> bool:
        """Determine whether the given value is in this authority.

        Args:
            value: A possible authority value.

        Returns:
            True if the value is a valid entity value.
        """
        raise NotImplementedError

    def annotate_input(
        self,
        text_obj: Union[dk_annots.AnnotatedText, str],
        **kwargs: Any,
    ) -> Union[dk_annots.Annotations, None]:
        """Find and annotate this authority's entities in the document text
        as dictionaries like:
            [
                {
                    'input_id': <id>,
                    'start_pos': <start_char_pos>,
                    'end_pos': <end_char_pos>,
                    'entity_text': <entity_text>,
                    'ann_type': <authority_name>,
                    '<auth_id>': <auth_value_id_or_canonical_form>,
                    'confidence': <confidence_if_available>,
                },
            ]

        A non-blank string is wrapped in an AnnotatedText (using this
        authority's metadata) before being handed to add_annotations.

        Args:
            text_obj: The text object or string to process.
            **kwargs: Additional keyword arguments (currently unused here).

        Returns:
            An Annotations instance, or None when text_obj is None or a
            blank (empty or whitespace-only) string.
        """
        if isinstance(text_obj, str):
            if len(text_obj.strip()) == 0:
                # Nothing to annotate; a raw string must not reach
                # add_annotations, which expects an AnnotatedText.
                return None
            text_obj = dk_annots.AnnotatedText(
                text_obj,
                annots_metadata=self.metadata,
            )
        if text_obj is None:
            return None
        return self.add_annotations(text_obj)

    @abstractmethod
    def add_annotations(
        self,
        text_obj: dk_annots.AnnotatedText,
    ) -> dk_annots.Annotations:
        """Method to do the work of finding, validating, and adding annotations.

        Args:
            text_obj: The annotated text object to process and add annotations.

        Returns:
            The added Annotations.
        """
        raise NotImplementedError

    def validate_ann_dicts(self, ann_dicts: List[Dict[str, Any]]) -> bool:
        """The annotation row dictionaries are valid if:
          * They are non-empty
          * and
             * either there is no annotations validator
             * or they are valid according to the validator

        Args:
            ann_dicts: Annotation dictionaries.

        Returns:
            True if valid.
        """
        return len(ann_dicts) > 0 and (
            self.anns_validator is None or self.anns_validator(self, ann_dicts)
        )

    def compose(
        self,
        annotations: dk_annots.Annotations,
    ) -> dk_annots.Annotations:
        """Compose annotations into groups.

        This base implementation returns the annotations unchanged.

        Args:
            annotations: The annotations.

        Returns:
            Composed annotations.
        """
        return annotations

    def build_annotation(
        self,
        start_pos: int = None,
        end_pos: int = None,
        entity_text: str = None,
        auth_value_id: Any = None,
        conf: float = 1.0,
        **kwargs,
    ) -> Dict[str, Any]:
        """Build an annotation row with the given components.

        The row's annotation type is this authority's name and the
        confidence is passed to the builder as 'auth_valconf'.

        Args:
            start_pos: The entity's start position.
            end_pos: The entity's end position.
            entity_text: The entity text.
            auth_value_id: The authority value ID for the entity.
            conf: The confidence value.
            **kwargs: Additional values passed to the row builder.

        Returns:
            The annotation row dictionary.
        """
        return self.anns_builder.build_annotation_row(
            start_pos, end_pos, entity_text, self.name, auth_value_id, auth_valconf=conf, **kwargs
        )
422
+
423
+
424
class AnnotationsValidator(ABC):
    """Base class offering helpers for validating proposed annotation rows."""

    def __call__(
        self,
        auth: Authority,
        ann_row_dicts: List[Dict[str, Any]],
    ) -> bool:
        """Make instances usable as the anns_validator function of an
        Authority.

        Args:
            auth: The authority proposing annotations.
            ann_row_dicts: The proposed annotations.

        Returns:
            True if the annotations are valid; otherwise, False.
        """
        proposed = AnnotationsValidator.AuthAnnotations(auth, ann_row_dicts)
        return self.validate_annotation_rows(proposed)

    @abstractmethod
    def validate_annotation_rows(
        self,
        auth_annotations: "AnnotationsValidator.AuthAnnotations",
    ) -> bool:
        """Decide whether the proposed authority annotation rows are valid.

        Args:
            auth_annotations: The AuthAnnotations instance wrapping the
                proposed data.

        Returns:
            True if valid; False if not.
        """
        raise NotImplementedError

    class AuthAnnotations:
        """Convenience wrapper around an authority and its proposed rows."""

        def __init__(self, auth: Authority, ann_row_dicts: List[Dict[str, Any]]):
            self.auth = auth
            self.ann_row_dicts = ann_row_dicts
            # Lazily-built caches, filled on first property access.
            self._row_accessor = None  # AnnotationsRowAccessor
            self._anns = None  # Annotations
            self._atts = None  # Dict[str, str]

        @property
        def row_accessor(self) -> dk_annots.AnnotationsRowAccessor:
            """The (cached) row accessor for this instance's annotations."""
            if self._row_accessor is not None:
                return self._row_accessor
            self._row_accessor = dk_annots.AnnotationsRowAccessor(
                self.auth.metadata, derived_cols=self.auth.field_groups
            )
            return self._row_accessor

        @property
        def anns(self) -> dk_annots.Annotations:
            """The annotation rows as a (cached) annotations object."""
            if self._anns is None:
                self._anns = collected = dk_annots.Annotations(self.auth.metadata)
                for row_dict in self.ann_row_dicts:
                    collected.add_dict(row_dict)
            return self._anns

        @property
        def df(self) -> pd.DataFrame:
            """The dataframe of the annotations object."""
            return self.anns.df

        def get_field_type(self, row: pd.Series) -> str:
            """Fetch the row's "field_type" value via the row accessor."""
            accessor = self.row_accessor
            return accessor.get_col_value("field_type", row, None)

        def get_text(self, row: pd.Series) -> str:
            """Fetch the row's entity text via the row accessor."""
            text_col = self.auth.metadata.text_col
            return self.row_accessor.get_col_value(text_col, row, None)

        @property
        def attributes(self) -> Dict[str, str]:
            """The (cached) mapping from each row's field type to its text."""
            if self._atts is None:
                collected = {}
                for _, row in self.df.iterrows():
                    field_type = self.get_field_type(row)
                    collected[field_type] = self.get_text(row)
                self._atts = collected
            return self._atts

        def colval(self, col_name, row) -> Any:
            """Fetch the named column's value from the row via the accessor."""
            accessor = self.row_accessor
            return accessor.get_col_value(col_name, row)
517
+
518
+
519
class AuthorityFactory(ABC):
    """Abstract factory whose implementations construct Authority objects."""

    @abstractmethod
    def build_authority(
        self,
        name: str,
        auth_anns_builder: AuthorityAnnotationsBuilder,
        authdata: AuthorityData,
        parent_auth: Authority = None,
    ) -> Authority:
        """Construct an authority from a name and its data.

        Subclasses must override this method; this base version always
        raises NotImplementedError.

        Args:
            name: The authority name.
            auth_anns_builder: The row builder the authority uses to build
                annotation rows.
            authdata: The authority data.
            parent_auth: The parent authority.

        Returns:
            The authority.
        """
        raise NotImplementedError
543
+
544
+
545
class LexicalAuthority(Authority):
    """An authority over named entities identified by ID, each with
    associated values and variations.
    """

    def __init__(
        self,
        name: str,
        auth_anns_builder: AuthorityAnnotationsBuilder = None,
        authdata: AuthorityData = None,
        field_groups: DerivedFieldGroups = None,
        anns_validator: Callable[["Authority", Dict[str, Any]], bool] = None,
        parent_auth: "Authority" = None,
    ):
        """Forward all of this authority's settings to Authority.

        Args:
            name: This authority's entity name.
            auth_anns_builder: The row builder used for building annotation
                rows.
            authdata: The authority data.
            field_groups: The derived field groups to use.
            anns_validator: fn(auth, anns_dict_list) returning True when the
                list of annotation row dicts may be added as annotations for
                a single match or "entity".
            parent_auth: This authority's parent authority (if any).
        """
        base_settings = {
            "auth_anns_builder": auth_anns_builder,
            "authdata": authdata,
            "field_groups": field_groups,
            "anns_validator": anns_validator,
            "parent_auth": parent_auth,
        }
        super().__init__(name, **base_settings)

    @abstractmethod
    def get_value_ids(self, value: Any) -> Set[Any]:
        """Collect every ID associated with a value.

        A value usually has exactly one ID; a set is returned to allow for
        inherent ambiguities in the authority.

        Args:
            value: An authority value.

        Returns:
            The associated IDs or an empty set if the value is not valid.
        """
        raise NotImplementedError

    @abstractmethod
    def get_values_by_id(self, value_id: Any) -> Set[Any]:
        """Collect every value associated with a value ID.

        An ID usually has exactly one value; a set is returned to allow for
        inherent ambiguities in the authority.

        Args:
            value_id: An authority value ID.

        Returns:
            The associated values or an empty set if the value is not valid.
        """
        raise NotImplementedError

    @abstractmethod
    def get_id_by_variation(self, variation: str) -> Set[str]:
        """Collect the IDs of the value(s) a variation is associated with.

        Args:
            variation: Variation text.

        Returns:
            The possibly empty set of associated value IDs.
        """
        raise NotImplementedError

    @abstractmethod
    def find_variations(
        self,
        variation: str,
        starts_with: bool = False,
        ends_with: bool = False,
        scope: str = "fullmatch",
    ) -> pd.Series:
        """Find every match to the given variation.

        Note:
            Of starts_with, ends_with, and scope, only the first that is
            true is applied. When none of them is true, a full match on the
            pattern is performed.

        Args:
            variation: The text to find; it is treated as a regular
                expression unless starts_with or ends_with is True.
            starts_with: When True, find all terms that start with the
                variation text.
            ends_with: When True, find all terms that end with the variation
                text.
            scope: 'fullmatch' (default), 'match', or 'contains' for
                strict, less strict, and least strict matching.

        Returns:
            The matching variations as a pd.Series.
        """
        raise NotImplementedError
650
+
651
+
652
class RegexAuthority(Authority):
    """An authority that identifies its entities in text by matching a
    regular expression, optionally mapping matches to a canonical form.
    """

    def __init__(
        self,
        name: str,
        regex: re.Pattern,
        canonical_fn: Callable[[str, str], Any] = None,
        auth_anns_builder: AuthorityAnnotationsBuilder = None,
        authdata: AuthorityData = None,
        field_groups: DerivedFieldGroups = None,
        anns_validator: Callable[[Authority, Dict[str, Any]], bool] = None,
        parent_auth: "Authority" = None,
    ):
        """Initialize with this authority's entity name.

        Note:
            If the regular expression has capturing groups, each group
            will result in a separate entity, with the group name if provided
            in the regular expression as ...(?P<group_name>group_regex)...

        Args:
            name: The authority name.
            regex: The regular expression to apply.
            canonical_fn: A function, fn(match_text, group_name), to
                transform input matches to a canonical form as a value_id.
                The group name (or group number for unnamed groups) is passed
                as the second argument; when no group is matched, the full
                match text and the authority name are passed in. Note that
                the canonical form is computed before the anns_validator is
                applied and its value will be found as the value to the
                <auth_id> key.
            auth_anns_builder: The authority annotations row builder to use
                for building annotation rows.
            authdata: The authority data.
            field_groups: The derived field groups to use.
            anns_validator: A validation function for each regex match
                formed as a list of annotation row dictionaries, one row
                dictionary for each matching regex group. If the validator
                returns False, then the annotation rows will be rejected.
                The entity_text key will hold matched text and the
                <auth_name>_field key will hold the group name or number
                (if there are groups with or without names). Note that the
                validator function takes the regex authority instance as its
                first parameter to provide access to the field_groups, etc.
                The validation_fn signature is:
                fn(regexAuthority, ann_row_dicts) and returns a boolean.
            parent_auth: This authority's parent authority (if any).
        """
        super().__init__(
            name,
            auth_anns_builder=auth_anns_builder,
            authdata=authdata,
            field_groups=field_groups,
            anns_validator=anns_validator,
            parent_auth=parent_auth,
        )
        self.regex = regex
        self.canonical_fn = canonical_fn

    def has_value(self, value: Any) -> re.Match:
        """Determine whether the given value is in this authority.

        The value is converted with str() and matched from its start using
        the regex's match method.

        Args:
            value: A possible authority value.

        Returns:
            None if the value is not a valid entity value; otherwise,
            return the re.Match object.
        """
        return self.regex.match(str(value))

    def add_annotations(
        self,
        text_obj: dk_annots.AnnotatedText,
    ) -> dk_annots.Annotations:
        """Method to do the work of finding, validating, and adding annotations.

        Each regex match yields a list of annotation row dicts (one per
        participating group, or one for the whole match when no group
        matched); the list is added only if it passes validate_ann_dicts.

        Args:
            text_obj: The annotated text object to process and add annotations.

        Returns:
            The added Annotations.
        """
        for match in re.finditer(self.regex, text_obj.text):
            ann_dicts = self._build_match_ann_dicts(match)
            if self.validate_ann_dicts(ann_dicts):
                # Add non-empty, valid annotation dicts to the result
                text_obj.annotations.add_dicts(ann_dicts)
        return text_obj.annotations

    def _build_match_ann_dicts(self, match: re.Match) -> List[Dict[str, Any]]:
        """Build the annotation row dicts for a single regex match."""
        if match.lastindex is None:
            # No group took part in the match: annotate the full match text.
            return [
                self.build_annotation(
                    start_pos=match.start(),
                    end_pos=match.end(),
                    entity_text=match.group(),
                    auth_value_id=self.get_canonical_form(match.group(), self.name),
                )
            ]

        if len(self.regex.groupindex) > 0:  # we have named groups
            labeled_groups = list(self.regex.groupindex.items())
        else:  # we have only numbers for groups
            labeled_groups = [(num, num) for num in range(1, len(match.groups()) + 1)]

        field_type_col = self.field_groups.get_field_type_col(self.name)
        ann_dicts = []
        for group_label, group_num in labeled_groups:
            group_text = match.group(group_num)
            if group_text is None:
                # The group did not participate in this match (e.g., an
                # optional group or an unused alternative); its text is None
                # and its span is (-1, -1), so there is nothing to annotate.
                continue
            ann_dicts.append(
                self.build_annotation(
                    start_pos=match.start(group_num),
                    end_pos=match.end(group_num),
                    entity_text=group_text,
                    auth_value_id=self.get_canonical_form(group_text, group_label),
                    **{field_type_col: group_label},
                )
            )
        return ann_dicts

    def get_canonical_form(self, entity_text: str, entity_type: Union[str, int]) -> Any:
        """Get the canonical form (value ID) for the matched entity text.

        Args:
            entity_text: The matched text.
            entity_type: The group name, group number, or authority name
                identifying what was matched.

        Returns:
            The result of canonical_fn(entity_text, entity_type) if a
            canonical_fn was given; otherwise, the entity_text unchanged.
        """
        if self.canonical_fn is not None:
            entity_text = self.canonical_fn(entity_text, entity_type)
        return entity_text
783
+
784
+
785
class AuthoritiesBundle(Authority):
    """An authority whose values are expressed through several bundled
    "authorities", e.g. dictionary-based ones and/or multiple regular
    expression patterns.
    """

    def __init__(
        self,
        name: str,
        auth_anns_builder: AuthorityAnnotationsBuilder = None,
        authdata: AuthorityData = None,
        field_groups: DerivedFieldGroups = None,
        parent_auth: "Authority" = None,
        anns_validator: Callable[["Authority", Dict[str, Any]], bool] = None,
        auths: List[Authority] = None,
    ):
        """Set up the bundle and its initial member authorities.

        Args:
            name: This authority's entity name.
            auth_anns_builder: The row builder used for building annotation
                rows.
            authdata: The authority data.
            field_groups: The derived field groups to use.
            parent_auth: This authority's parent authority (if any).
            anns_validator: fn(auth, anns_dict_list) returning True when the
                list of annotation row dicts may be added as annotations for
                a single match or "entity".
            auths: The authorities to bundle together; the list is copied.
        """
        super().__init__(
            name,
            auth_anns_builder=auth_anns_builder,
            authdata=authdata,
            field_groups=field_groups,
            anns_validator=anns_validator,
            parent_auth=parent_auth,
        )
        if auths is None:
            self.auths = []
        else:
            self.auths = auths.copy()

    def add(self, auth: Authority):
        """Append an authority to this bundle.

        Args:
            auth: The authority to add.
        """
        self.auths.append(auth)

    def has_value(self, value: Any) -> bool:
        """Report whether any bundled authority has the given value.

        Args:
            value: A possible authority value.

        Returns:
            True if the value is a valid entity value.
        """
        return any(member.has_value(value) for member in self.auths)

    def add_annotations(
        self,
        text_obj: dk_annots.AnnotatedText,
    ) -> dk_annots.Annotations:
        """Let each bundled authority, in order, annotate the text object.

        Args:
            text_obj: The annotated text object to process and add annotations.

        Returns:
            The text object's annotations.
        """
        for member in self.auths:
            member.annotate_input(text_obj)
        return text_obj.annotations