dataknobs-xization 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dataknobs-xization might be problematic. Click here for more details.

@@ -0,0 +1,1308 @@
1
+ import json
2
+ from abc import ABC, abstractmethod
3
+ from collections.abc import Callable
4
+ from typing import Any, Dict, List, Set
5
+
6
+ import numpy as np
7
+ import pandas as pd
8
+
9
+ import dataknobs_structures.document as dk_doc
10
+
11
+ # Key annotations column name constants for use across annotation interfaces
12
+ KEY_START_POS_COL = "start_pos"
13
+ KEY_END_POS_COL = "end_pos"
14
+ KEY_TEXT_COL = "text"
15
+ KEY_ANN_TYPE_COL = "ann_type"
16
+
17
+
18
class AnnotationsMetaData(dk_doc.MetaData):
    """Container for annotations meta-data, identifying key column names.

    NOTE: this object contains only information about annotation column names
    and not annotation table values.
    """

    def __init__(
        self,
        start_pos_col: str = KEY_START_POS_COL,
        end_pos_col: str = KEY_END_POS_COL,
        text_col: str = KEY_TEXT_COL,
        ann_type_col: str = KEY_ANN_TYPE_COL,
        sort_fields: List[str] = (KEY_START_POS_COL, KEY_END_POS_COL),
        sort_fields_ascending: List[bool] = (True, False),
        **kwargs,
    ):
        """Initialize with key (and more) column names and info.

        Key column types:
          * start_pos
          * end_pos
          * text
          * ann_type

        Notes:
          * Actual table columns can be named arbitrarily
          * BUT: interactions through annotations classes and interfaces
            relating to the "key" columns must use the key column constants

        :param start_pos_col: Col name for the token starting position
        :param end_pos_col: Col name for the token ending position
        :param text_col: Col name for the token text
        :param ann_type_col: Col name for the annotation types
        :param sort_fields: The fields passed to ``DataFrame.sort_values`` when
            sorting annotation rows, or None to disable sorting
        :param sort_fields_ascending: To specify sort order of sort_fields
        :param **kwargs: More column types mapped to column names
        """
        super().__init__(
            {
                KEY_START_POS_COL: start_pos_col,
                KEY_END_POS_COL: end_pos_col,
                KEY_TEXT_COL: text_col,
                KEY_ANN_TYPE_COL: ann_type_col,
            },
            **kwargs,
        )
        # sort_df treats None as "don't sort", so None must survive construction
        # instead of blowing up inside list(None).
        self.sort_fields = list(sort_fields) if sort_fields is not None else None
        self.ascending = sort_fields_ascending

    @property
    def start_pos_col(self) -> str:
        """Get the column name for the token starting position"""
        return self.data[KEY_START_POS_COL]

    @property
    def end_pos_col(self) -> str:
        """Get the column name for the token ending position"""
        return self.data[KEY_END_POS_COL]

    @property
    def text_col(self) -> str:
        """Get the column name for the token text"""
        return self.data[KEY_TEXT_COL]

    @property
    def ann_type_col(self) -> str:
        """Get the column name for the token annotation type"""
        return self.data[KEY_ANN_TYPE_COL]

    def get_col(self, col_type: str, missing: str = None) -> str:
        """Get the name of the column having the given type (including key column
        types but not derived,) or get the missing value.

        :param col_type: The type of column name to get
        :param missing: The value to return for unknown column types
        :return: The column name or the missing value
        """
        # Delegates to get_value, which is not defined in this class
        # (presumably inherited from dk_doc.MetaData -- verify).
        return self.get_value(col_type, missing)

    def sort_df(self, an_df: pd.DataFrame):
        """Sort an annotations dataframe according to this metadata.

        :param an_df: An annotations dataframe
        :return: The sorted annotations dataframe, or ``an_df`` unchanged when
            no sort fields are configured.
        """
        # Truthiness covers both None and an empty list; sort_values rejects
        # an empty "by" paired with a non-empty "ascending".
        if self.sort_fields:
            an_df = an_df.sort_values(self.sort_fields, ascending=self.ascending)
        return an_df
106
+
107
+
108
class DerivedAnnotationColumns(ABC):
    """Interface for injecting derived columns into AnnotationsMetaData.

    Implementations supply a value for a column type from a row; the only
    required method is get_col_value.
    """

    @abstractmethod
    def get_col_value(
        self,
        metadata: AnnotationsMetaData,
        col_type: str,
        row: pd.Series,
        missing: str = None,
    ) -> str:
        """Derive the value for ``col_type`` from the given row.

        :param metadata: The AnnotationsMetaData
        :param col_type: The type of column value to derive
        :param row: The row from which the value is taken
        :param missing: The value to hand back when the column is unknown or
            has no value
        :return: The derived row value, or ``missing``
        """
        # Abstract: concrete subclasses must provide the derivation.
        raise NotImplementedError
128
+
129
+
130
class AnnotationsRowAccessor:
    """A class that accesses row data according to the metadata and derived cols."""

    def __init__(
        self,
        metadata: "AnnotationsMetaData",
        derived_cols: "DerivedAnnotationColumns" = None,
    ):
        """:param metadata: The metadata for annotation columns
        :param derived_cols: A DerivedAnnotationColumns instance for injecting
            derived columns.
        """
        self.metadata = metadata
        self.derived_cols = derived_cols

    def get_col_value(
        self,
        col_type: str,
        row: pd.Series,
        missing: str = None,
    ) -> str:
        """Get the value of the column in the given row with the given type.

        The value comes from the first of these that applies:
          * The metadata.get_col(col_type) column, when present in the row
          * The col_type column itself, when col_type is a registered
            metadata column type and that column is present in the row
          * The derived columns handler, when one was supplied

        :param col_type: The type of column value to get
        :param row: A row from which to get the value.
        :param missing: The value to return for unknown or missing column
        :return: The row value or the missing value
        """
        col = self.metadata.get_col(col_type, None)
        if col is not None and col in row.index:
            return row[col]

        # Fall back to a column literally named after the type. The row must
        # actually carry it; otherwise row[col_type] would raise KeyError
        # instead of reaching the derived columns / missing value.
        if col_type in self.metadata.data and col_type in row.index:
            return row[col_type]

        if self.derived_cols is not None:
            return self.derived_cols.get_col_value(self.metadata, col_type, row, missing)

        return missing
171
+
172
+
173
class Annotations:
    """DAO for collecting and managing a table of annotations, where each row
    carries annotation information for an input token.

    The data in this class is maintained either as a list of dicts, each dict
    representing a "row," or as a pandas DataFrame, depending on the latest
    access. Changes in either the lists or dataframe will be reflected in the
    alternate data structure.
    """

    def __init__(
        self,
        metadata: "AnnotationsMetaData",
        df: pd.DataFrame = None,
    ):
        """Construct as empty or initialize with the dataframe form.

        :param metadata: The annotations metadata (used for sorting rows)
        :param df: A dataframe with annotation records.
        """
        self.metadata = metadata
        # At most one of these two holds the data at any time; the other is
        # rebuilt on demand by the matching property.
        self._annotations_list = None
        self._df = df

    @property
    def ann_row_dicts(self) -> List[Dict[str, Any]]:
        """Get the annotations as a list of dictionaries."""
        if self._annotations_list is None:
            self._annotations_list = self._build_list()
        return self._annotations_list

    @property
    def df(self) -> pd.DataFrame:
        """Get the annotations as a pandas dataframe.

        NOTE: This is None when there are no annotation rows.
        """
        if self._df is None:
            self._df = self._build_df()
        return self._df

    @property
    def as_df(self) -> pd.DataFrame:
        """Alias for ``df``, the name other code in this module uses."""
        return self.df

    def clear(self) -> pd.DataFrame:
        """Clear/empty out all annotations, returning the annotations df"""
        rv = self.df
        self._df = None
        self._annotations_list = None
        return rv

    def is_empty(self) -> bool:
        """Determine whether neither representation holds any rows."""
        return (self._df is None or len(self._df) == 0) and (
            self._annotations_list is None or len(self._annotations_list) == 0
        )

    def add_dict(self, annotation: Dict[str, Any]):
        """Add the annotation dict."""
        self.ann_row_dicts.append(annotation)

    def add_dicts(self, annotations: List[Dict[str, Any]]):
        """Add the annotation dicts."""
        self.ann_row_dicts.extend(annotations)

    def add_df(self, an_df: pd.DataFrame):
        """Add (concatenate) the annotation dataframe to the current annotations.

        :param an_df: The dataframe to add; None is ignored.
        """
        if an_df is None:
            # pd.concat raises when every input is None (empty container case).
            return
        # pd.concat silently drops a None self.df.
        df = self.metadata.sort_df(pd.concat([self.df, an_df]))
        self.set_df(df)

    def _build_list(self) -> List[Dict[str, Any]]:
        """Build the annotations list from the dataframe."""
        alist = None
        if self._df is not None:
            alist = self._df.to_dict(orient="records")
            # The list now owns the data.
            self._df = None
        return alist if alist is not None else list()

    def _build_df(self) -> pd.DataFrame:
        """Get the annotations as a df (None when there are no rows)."""
        df = None
        if self._annotations_list is not None:
            if len(self._annotations_list) > 0:
                df = self.metadata.sort_df(pd.DataFrame(self._annotations_list))
            # The dataframe now owns the data.
            self._annotations_list = None
        return df

    def set_df(self, df: pd.DataFrame):
        """Set (or reset) this annotation's dataframe.

        :param df: The new annotations dataframe.
        """
        self._df = df
        self._annotations_list = None
257
+
258
+
259
class AnnotationsBuilder:
    """Builds annotation row dictionaries from key values and defaults."""

    def __init__(
        self,
        metadata: AnnotationsMetaData,
        data_defaults: Dict[str, Any],
    ):
        """:param metadata: The annotations metadata; when None, a default
            AnnotationsMetaData is created
        :param data_defaults: Dict[ann_colname, default_value] with default
            values for annotation columns
        """
        if metadata is None:
            metadata = AnnotationsMetaData()
        self.metadata = metadata
        self.data_defaults = data_defaults

    def build_annotation_row(
        self, start_pos: int, end_pos: int, text: str, ann_type: str, **kwargs
    ) -> Dict[str, Any]:
        """Build a row dict from the four mandatory key values plus any
        extra keyword arguments.

        The key values are stored under the column names given by this
        builder's metadata; the row is then completed by do_build_row.

        :param start_pos: The token start position
        :param end_pos: The token end position
        :param text: The token text
        :param ann_type: The annotation type
        :param kwargs: Extra fields passed through to do_build_row
        :return: The annotation row dictionary
        """
        meta = self.metadata
        key_fields = {
            meta.start_pos_col: start_pos,
            meta.end_pos_col: end_pos,
            meta.text_col: text,
            meta.ann_type_col: ann_type,
        }
        return self.do_build_row(key_fields, **kwargs)

    def do_build_row(self, key_fields: Dict[str, Any], **kwargs) -> Dict[str, Any]:
        """Assemble a row in layers: key fields first, then the data defaults,
        then the extra kwargs (later layers overwrite earlier ones).

        :param key_fields: The dictionary of key fields
        :param kwargs: Any extra fields to add
        :return: The assembled row dictionary
        """
        row = dict(key_fields)
        defaults = self.data_defaults
        if defaults is not None:
            row.update(defaults)
        # kwargs is always a dict (possibly empty), so this is unconditional.
        row.update(kwargs)
        return row
315
+
316
+
317
class RowData:
    """Thin wrapper pairing an annotation row (pd.Series) with the metadata
    that names its key columns, to facilitate e.g., grouping.
    """

    def __init__(
        self,
        metadata: AnnotationsMetaData,
        row: pd.Series,
    ):
        """:param metadata: The annotations metadata naming the key columns
        :param row: The wrapped annotation row
        """
        self.metadata = metadata
        self.row = row

    @property
    def loc(self):
        """The wrapped row's ``name`` (its label in the source dataframe)."""
        return self.row.name

    def __repr__(self) -> str:
        return f'[{self.start_pos}:{self.end_pos})"{self.text}"'

    @property
    def start_pos(self) -> int:
        """The row's value in the metadata's start position column."""
        col = self.metadata.start_pos_col
        return self.row[col]

    @property
    def end_pos(self) -> int:
        """The row's value in the metadata's end position column."""
        col = self.metadata.end_pos_col
        return self.row[col]

    @property
    def text(self) -> str:
        """The row's value in the metadata's text column."""
        col = self.metadata.text_col
        return self.row[col]

    def is_subset(self, other_row: "RowData") -> bool:
        """Check whether this row's span lies within the other row's span.

        :param other_row: The other row
        :return: True when this row starts at or after, and ends at or
            before, the other row
        """
        starts_inside = self.start_pos >= other_row.start_pos
        if not starts_inside:
            return starts_inside
        return self.end_pos <= other_row.end_pos

    def is_subset_of_any(self, other_rows: List["RowData"]) -> bool:
        """Check whether this row's span lies within at least one of the
        given rows' spans.

        :param other_rows: The candidate covering rows
        :return: True if any of the rows covers this one
        """
        for candidate in other_rows:
            if self.is_subset(candidate):
                return True
        return False
364
+
365
+
366
class AnnotationsGroup:
    """Container for annotation rows that belong together as a (consistent) group.

    NOTE: An instance will only accept rows on condition of consistency per its
    acceptance function.
    """

    def __init__(
        self,
        row_accessor: "AnnotationsRowAccessor",
        field_col_type: str,
        accept_fn: Callable[["AnnotationsGroup", "RowData"], bool],
        group_type: str = None,
        group_num: int = None,
        valid: bool = True,
        autolock: bool = False,
    ):
        """:param row_accessor: The annotations row_accessor
        :param field_col_type: The col_type for the group field_type for retrieval
            using the annotations row accessor
        :param accept_fn: A fn(g, row_data) that returns True to accept the row
            data into this group g, or False to reject the row. If None, then
            all rows are always accepted.
        :param group_type: An optional (override) type for identifying this group.
        :param group_num: An optional number for identifying this group.
        :param valid: True if the group is valid, or False if not
        :param autolock: True to automatically lock this group when (1) at
            least one row has been added and (2) a row is rejected.
        """
        self.rows = list()  # List[RowData]
        self.row_accessor = row_accessor
        self.field_col_type = field_col_type
        self.accept_fn = accept_fn
        self._group_type = group_type
        self._group_num = group_num
        self._valid = valid
        self._autolock = autolock
        self._locked = False
        self._locs = None  # track loc's for recognizing dupes
        self._key = None  # a hash key using the _locs
        self._df = None
        self._ann_type = None

    @property
    def is_locked(self) -> bool:
        """Get whether this group is locked from adding more rows."""
        return self._locked

    @is_locked.setter
    def is_locked(self, value: bool):
        """Set this group as locked (value=True) or unlocked (value=False) to
        allow or disallow more rows from being added regardless of the accept
        function.

        Note that while unlocked only rows that pass the accept function will
        be added.

        :param value: True to lock or False to unlock this group.
        """
        self._locked = value

    @property
    def is_valid(self) -> bool:
        """Get whether this group is currently marked as valid."""
        return self._valid

    @is_valid.setter
    def is_valid(self, value: bool):
        """Mark this group as valid (value=True) or invalid (value=False).

        :param value: True for valid or False for invalid.
        """
        self._valid = value

    @property
    def autolock(self) -> bool:
        """Get whether this group is currently set to autolock."""
        return self._autolock

    @autolock.setter
    def autolock(self, value: bool):
        """Set this group to autolock (True) or not (False).

        :param value: True or False to autolock or not.
        """
        self._autolock = value

    def __repr__(self):
        return json.dumps(self.to_dict())

    @property
    def size(self) -> int:
        """Get the number of rows in this group."""
        return len(self.rows)

    @property
    def group_type(self) -> str:
        """Get this group's type, which is either an "override" value that has
        been set, or the "ann_type" value of the first row added.
        """
        return self._group_type if self._group_type is not None else self.ann_type

    @group_type.setter
    def group_type(self, value: str):
        """Set this group's type"""
        self._group_type = value

    @property
    def group_num(self) -> int:
        """Get this group's number"""
        return self._group_num

    @group_num.setter
    def group_num(self, value: int):
        """Set this group's num"""
        self._group_num = value

    @property
    def df(self) -> pd.DataFrame:
        """Get this group as a dataframe (cached until the rows change)."""
        if self._df is None:
            self._df = pd.DataFrame([r.row for r in self.rows])
        return self._df

    @property
    def ann_type(self) -> str:
        """Get this record's annotation type"""
        return self._ann_type

    @property
    def text(self) -> str:
        """Get the space-joined text of this group's rows."""
        return " ".join([row.text for row in self.rows])

    @property
    def locs(self) -> List[int]:
        """Get the loc of each row in this group (cached until rows change)."""
        if self._locs is None:
            self._locs = [r.loc for r in self.rows]
        return self._locs

    @property
    def key(self) -> str:
        """A hash key for this group."""
        if self._key is None:
            self._key = "_".join([str(x) for x in sorted(self.locs)])
        return self._key

    def copy(self) -> "AnnotationsGroup":
        """Make a copy of this group with its own (shallow-copied) rows list.

        :return: The new group
        """
        result = AnnotationsGroup(
            self.row_accessor,
            self.field_col_type,
            self.accept_fn,
            # Copy the raw override, not the resolved property, so a group
            # without an override keeps deriving its type from ann_type.
            group_type=self._group_type,
            group_num=self.group_num,
            valid=self.is_valid,
            autolock=self.autolock,
        )
        result.rows = self.rows.copy()
        result._locked = self._locked  # pylint: disable=protected-access
        result._ann_type = self._ann_type  # pylint: disable=protected-access
        return result

    def add(self, rowdata: "RowData") -> bool:
        """Add the row if the group is not locked and the row belongs in this
        group, or return False.

        If autolock is True and a row fails to be added (after the first
        row has been added,) "lock" the group and refuse to accept any more
        rows.

        :param rowdata: The row to add
        :return: True if the row belongs and was added; otherwise, False
        """
        result = False
        if self._locked:
            return result

        if self.accept_fn is None or self.accept_fn(self, rowdata):
            self.rows.append(rowdata)
            # Invalidate values cached from the previous set of rows
            self._df = None
            self._locs = None
            self._key = None
            if self._ann_type is None:
                self._ann_type = self.row_accessor.get_col_value(
                    KEY_ANN_TYPE_COL,
                    rowdata.row,
                    missing=None,
                )
            result = True

        if not result and self.size > 0 and self.autolock:
            self._locked = True

        return result

    def to_dict(self) -> Dict[str, str]:
        """Get this group (record) as a dictionary of field type to text values.

        Each row's key is its value for this group's field_col_type, as
        retrieved through the row accessor.
        """
        return {
            self.row_accessor.get_col_value(self.field_col_type, row.row, None): row.text
            for row in self.rows
        }

    def is_subset(self, other: "AnnotationsGroup") -> bool:
        """Determine whether this group's text is contained within the other's.

        :param other: The other group
        :return: True if every row here is a subset of some row in other
        """
        return all(my_row.is_subset_of_any(other.rows) for my_row in self.rows)

    def is_subset_of_any(self, groups: List["AnnotationsGroup"]) -> "AnnotationsGroup":
        """Determine whether this group is a subset of any of the given groups.

        :param groups: List of annotation groups
        :return: The first AnnotationsGroup that this group is a subset of, or
            None
        """
        for other_group in groups:
            if self.is_subset(other_group):
                return other_group
        return None

    def remove_row(
        self,
        row_idx: int,
    ) -> "RowData":
        """Remove the row at the given position from this group.

        :param row_idx: The positional index of the row to remove
        :return: The removed row data instance
        """
        rowdata = self.rows.pop(row_idx)

        # Reset cached values
        self._df = None
        self._locs = None
        self._key = None

        return rowdata
603
+
604
+
605
class MergeStrategy(ABC):
    """Pluggable strategy for merging an annotations group, to be injected
    based on the entity types being merged.
    """

    @abstractmethod
    def merge(self, group: AnnotationsGroup) -> List[Dict[str, Any]]:
        """Merge the annotations held by ``group``.

        :param group: The annotations group to process
        :return: The group's merged annotation dictionaries
        """
        # Abstract: concrete strategies must implement the merge.
        raise NotImplementedError
614
+
615
+
616
class PositionalAnnotationsGroup(AnnotationsGroup):
    """Container for annotations that either overlap with each other or don't."""

    def __init__(self, overlap: bool, rectype: str = None, gnum: int = -1):
        """:param overlap: If False, then only accept rows that don't overlap; else
            only accept rows that do overlap
        :param rectype: An optional type for identifying this group
        :param gnum: An optional number for identifying this group
        """
        # No row accessor, field type, or accept function: membership is
        # decided positionally through belongs().
        super().__init__(None, None, None, group_type=rectype, group_num=gnum)
        self.overlap = overlap
        # Half-open [start_pos, end_pos) span covered so far; -1 means unset.
        self.start_pos = -1
        self.end_pos = -1

    def __repr__(self) -> str:
        return f'nrows={len(self.rows)}[{self.start_pos},{self.end_pos})"{self.entity_text}"'

    @property
    def entity_text(self) -> str:
        """Get the joined text of this group's rows (" | " separated for
        overlapping groups, space separated otherwise).
        """
        jstr = " | " if self.overlap else " "
        # RowData exposes its text as "text"; it has no "entity_text".
        return jstr.join(r.text for r in self.rows)

    def belongs(self, rowdata: RowData) -> bool:
        """Determine if the row belongs in this instance based on its overlap
        or not.

        NOTE: When the row belongs, this group's span is widened to include
        it, but the row itself is not added to ``rows``.

        :param rowdata: The rowdata to test
        :return: True if the rowdata belongs in this instance
        """
        result = True  # Anything belongs to an empty group
        if len(self.rows) > 0:
            start_overlaps = self._is_in_bounds(rowdata.start_pos)
            end_overlaps = self._is_in_bounds(rowdata.end_pos - 1)
            # A row that strictly contains the current span overlaps it even
            # though neither of its ends falls inside the bounds.
            encloses = rowdata.start_pos < self.start_pos and rowdata.end_pos > self.end_pos
            result = start_overlaps or end_overlaps or encloses
            if not self.overlap:
                result = not result
        if result:
            if self.start_pos < 0:
                self.start_pos = rowdata.start_pos
                self.end_pos = rowdata.end_pos
            else:
                self.start_pos = min(self.start_pos, rowdata.start_pos)
                self.end_pos = max(self.end_pos, rowdata.end_pos)
        return result

    def _is_in_bounds(self, char_pos):
        """Test whether char_pos falls within [start_pos, end_pos)."""
        return char_pos >= self.start_pos and char_pos < self.end_pos

    def copy(self) -> "PositionalAnnotationsGroup":
        """Make a copy of this group with its own (shallow-copied) rows list.

        :return: The new group
        """
        result = PositionalAnnotationsGroup(
            self.overlap,
            rectype=self._group_type,
            gnum=self._group_num,
        )
        result.start_pos = self.start_pos
        result.end_pos = self.end_pos
        result.rows = self.rows.copy()
        # Carry over the remaining group state as well
        result._valid = self._valid  # pylint: disable=protected-access
        result._autolock = self._autolock  # pylint: disable=protected-access
        result._locked = self._locked  # pylint: disable=protected-access
        result._ann_type = self._ann_type  # pylint: disable=protected-access
        return result

    # TODO: Add comparison and merge functions
669
+
670
+
671
class OverlapGroupIterator:
    """Given:
      * annotation rows (dataframe)
        * in order sorted by
          * start_pos (increasing for input order), and
          * end_pos (decreasing for longest spans first)
    Collect:
      * overlapping consecutive annotations
      * for processing
    """

    def __init__(self, an_df: pd.DataFrame, metadata: AnnotationsMetaData = None):
        """:param an_df: An annotations DataFrame, sliced and sorted.
        :param metadata: The annotations metadata naming the position and
            text columns of an_df; when None, a default AnnotationsMetaData
            is used.
        """
        self.an_df = an_df
        # RowData reads positions through its metadata, so it can't be None.
        self.metadata = metadata if metadata is not None else AnnotationsMetaData()
        self._cur_iter = None
        self._queued_row_data = None  # the next row not yet placed in a group
        self.cur_group = None
        self.reset()

    def next_group(self) -> AnnotationsGroup:
        """Collect the next run of consecutive overlapping rows.

        :return: The next group, or None when the rows are exhausted
        """
        group = None
        if self.has_next:
            group = PositionalAnnotationsGroup(True)
            while self.has_next and group.belongs(self._queued_row_data):
                # belongs() only widens the group's span; the row must be
                # stored too, or the group stays empty and accepts everything.
                # (The group's add() is unusable without a row accessor.)
                group.rows.append(self._queued_row_data)
                self._queue_next()
        self.cur_group = group
        return group

    def reset(self):
        """Restart iteration from the first row of the dataframe."""
        self._cur_iter = self.an_df.iterrows() if self.an_df is not None else iter(())
        self._queue_next()
        self.cur_group = None

    @property
    def has_next(self) -> bool:
        """True while there is a queued row not yet placed in a group."""
        return self._queued_row_data is not None

    def _queue_next(self):
        """Queue the next dataframe row as RowData, or None at the end."""
        try:
            _loc, row = next(self._cur_iter)
            self._queued_row_data = RowData(self.metadata, row)
        except StopIteration:
            self._queued_row_data = None
714
+
715
+
716
def merge(
    annotations: Annotations,
    merge_strategy: MergeStrategy,
) -> Annotations:
    """Merge the overlapping groups according to the given strategy.

    :param annotations: The annotations whose rows are to be merged
    :param merge_strategy: The strategy applied to each overlap group
    :return: A new Annotations instance (sharing the input's metadata)
        holding the merged annotation rows
    """
    result = Annotations(annotations.metadata)
    # Annotations exposes its dataframe as "df"; it is None when empty.
    an_df = annotations.df
    if an_df is None:
        return result
    og_iter = OverlapGroupIterator(an_df)
    while og_iter.has_next:
        og = og_iter.next_group()
        result.add_dicts(merge_strategy.merge(og))
    return result
727
+
728
+
729
class AnnotationsGroupList:
    """Holds a list of annotation groups, gated by an acceptance function."""

    def __init__(
        self,
        groups: List[AnnotationsGroup] = None,
        accept_fn: Callable[["AnnotationsGroupList", AnnotationsGroup], bool] = (
            lambda glist, grp: glist.size == 0 or not grp.is_subset_of_any(glist.groups)
        ),
    ):
        """:param groups: The initial groups for this list
        :param accept_fn: A fn(l, g) that returns True to accept the group, g,
            into this list, l, or False to reject the group. If None, then all
            groups are always accepted. The default function will reject any
            group that is a subset of any existing group in the list.
        """
        if groups is None:
            groups = list()
        self.groups = groups
        self.accept_fn = accept_fn
        self._coverage = None  # cached row count; reset when a group is added

    def __repr__(self) -> str:
        return str(self.groups)

    @property
    def size(self) -> int:
        """The number of groups currently held"""
        return len(self.groups)

    @property
    def coverage(self) -> int:
        """The number of distinct row locs covered across all groups"""
        if self._coverage is None:
            distinct_locs = {loc for grp in self.groups for loc in grp.locs}
            self._coverage = len(distinct_locs)
        return self._coverage

    @property
    def df(self) -> pd.DataFrame:
        """A dataframe of every row of every group, in group order"""
        all_rows = []
        for grp in self.groups:
            for rowdata in grp.rows:
                all_rows.append(rowdata.row)
        return pd.DataFrame(all_rows)

    def copy(self) -> "AnnotationsGroupList":
        """Copy this list (shallow copy of the groups, same accept_fn)."""
        duplicate = AnnotationsGroupList(self.groups.copy(), accept_fn=self.accept_fn)
        duplicate._coverage = self._coverage  # pylint: disable=protected-access
        return duplicate

    def add(self, group: AnnotationsGroup) -> bool:
        """Append the group when the accept function allows it.

        :param group: The group to add
        :return: True if the group was accepted and added; otherwise, False
        """
        if self.accept_fn is not None and not self.accept_fn(self, group):
            return False
        self.groups.append(group)
        self._coverage = None
        return True

    def is_subset(self, other: "AnnotationsGroupList") -> bool:
        """Check whether every group here is a subset of some group in the
        other list.

        :param other: The other group list
        :return: True if each of this list's groups is covered by the other
        """
        return all(grp.is_subset_of_any(other.groups) for grp in self.groups)
799
+
800
+
801
+ class AnnotatedText(dk_doc.Text):
802
+ """A Text object that manages its own annotations."""
803
+
804
+ def __init__(
805
+ self,
806
+ text_str: str,
807
+ metadata: dk_doc.TextMetaData = None,
808
+ annots: Annotations = None,
809
+ bookmarks: Dict[str, pd.DataFrame] = None,
810
+ text_obj: dk_doc.Text = None,
811
+ annots_metadata: AnnotationsMetaData = None,
812
+ ):
813
+ """:param text_str: The text string
814
+ :param metadata: The text's metadata
815
+ :param annots: The annotations
816
+ :param bookmarks: The annotation bookmarks
817
+ :param text_obj: A text_obj to override text_str and metadata
818
+ initialization
819
+ :param annots_metadata: Override for default annotations metadata
820
+ (NOTE: ineffectual if an annots instance is provided.)
821
+ """
822
+ super().__init__(
823
+ text_obj.text if text_obj is not None else text_str,
824
+ text_obj.metadata if text_obj is not None else metadata,
825
+ )
826
+ self._annots = annots
827
+ self._bookmarks = bookmarks
828
+ self._annots_metadata = annots_metadata
829
+
830
+ @property
831
+ def annotations(self) -> Annotations:
832
+ """Get the this object's annotations"""
833
+ if self._annots is None:
834
+ self._annots = Annotations(self._annots_metadata or AnnotationsMetaData())
835
+ return self._annots
836
+
837
+ @property
838
+ def bookmarks(self) -> Dict[str, pd.DataFrame]:
839
+ """Get this object's bookmarks"""
840
+ if self._bookmarks is None:
841
+ self._bookmarks = dict()
842
+ return self._bookmarks
843
+
844
+ def get_text(
845
+ self,
846
+ annot2mask: Dict[str, str] = None,
847
+ annot_df: pd.DataFrame = None,
848
+ text: str = None,
849
+ ) -> str:
850
+ """Get the text object's string, masking if indicated.
851
+
852
+ :param annot2mask: Mapping from annotation column (e.g., _num or
853
+ _recsnum) to the replacement character(s) in the input text
854
+ for masking already managed input.
855
+ :param annot_df: Override annotations dataframe
856
+ :param text: Override text
857
+ :return: The (masked) text
858
+ """
859
+ if annot2mask is None:
860
+ return self.text
861
+ # Apply the mask
862
+ text_s = self.get_text_series(text=text) # no padding
863
+ if annot2mask is not None:
864
+ annot_df = self.annotations.as_df
865
+ text_s = self._apply_mask(text_s, annot2mask, annot_df)
866
+ return "".join(text_s)
867
+
868
+ def get_text_series(
869
+ self,
870
+ pad_len: int = 0,
871
+ text: str = None,
872
+ ) -> pd.Series:
873
+ """Get the input text as a (padded) pandas series.
874
+ :param pad_len: The number of spaces to pad both front ant back
875
+ :param text: Override text
876
+ :return: The (padded) pandas series of input characters.
877
+ """
878
+ if text is None:
879
+ text = self.text
880
+ return pd.Series(list(" " * pad_len + text + " " * pad_len))
881
+
882
+ def get_annot_mask(
883
+ self,
884
+ annot_col: str,
885
+ pad_len: int = 0,
886
+ annot_df: pd.DataFrame = None,
887
+ text: str = None,
888
+ ) -> pd.Series:
889
+ """Get a True/False series for the input such that start to end positions
890
+ for rows where the the annotation column is non-null and non-empty are
891
+ True.
892
+ :param annot_col: The annotation column identifying chars to mask
893
+ :param pad_len: The number of characters to pad the mask with False
894
+ values at both the front and back.
895
+ :param annot_df: Override annotations dataframe
896
+ :param text: Override text
897
+ :return: A pandas Series where annotated input character positions
898
+ are True and non-annotated positions are False.
899
+ """
900
+ if annot_df is None:
901
+ annot_df = self.annotations.as_df
902
+ if text is None:
903
+ text = self.text
904
+ textlen = len(text)
905
+ return self._get_annot_mask(annot_df, textlen, annot_col, pad_len=pad_len)
906
+
907
@staticmethod
def _get_annot_mask(
    annot_df: pd.DataFrame,
    textlen: int,
    annot_col: str,
    pad_len: int = 0,
) -> pd.Series:
    """Build a True/False series marking the annotated characters.

    Start to end positions of every row whose annotation column is
    non-null and non-empty are True in the result. The mask covers the
    padded text, i.e. it has ``textlen + 2 * pad_len`` entries, so it
    lines up with a text series padded by ``pad_len`` on both sides.

    :param annot_df: The annotations dataframe
    :param textlen: The length of the (unpadded) input text
    :param annot_col: The annotation column identifying chars to mask
    :param pad_len: The number of characters to pad the mask with False
        values at both the front and back.
    :return: A pandas Series where annotated input character positions
        are True and non-annotated positions are False, or None when
        annot_col is not a column of annot_df.
    """
    if annot_col not in annot_df.columns:
        return None

    values = annot_df[annot_col]
    rows = annot_df[values.notna() & (values != "")]

    # Size the mask for pad + text + pad; sizing it by textlen alone would
    # push shifted spans past the end of the mask and silently drop them.
    flags = np.zeros(max(textlen + 2 * pad_len, 0), dtype=bool)
    for start, end in zip(rows["start_pos"], rows["end_pos"]):
        # end_pos is exclusive; clamp at 0 so a negative offset cannot wrap
        # around to the end of the array.
        lo = max(int(start) + pad_len, 0)
        hi = max(int(end) + pad_len, 0)
        flags[lo:hi] = True
    return pd.Series(flags)
934
+
935
def _apply_mask(
    self,
    text_s: pd.Series,
    annot2mask: Dict[str, str],
    annot_df: pd.DataFrame,
) -> pd.Series:
    """Mask the characters of every annotated column listed in annot2mask.

    Columns are processed in the key order of annot2mask, so the result
    is reproducible when spans of different columns overlap (the last
    listed column wins on shared positions).

    :param text_s: The text series (one character per entry) to revise
    :param annot2mask: Mapping from annotation column to the mask
        character to substitute at that column's annotated positions
    :param annot_df: The annotations dataframe
    :return: The masked text series; text_s itself is returned untouched
        when it is empty or when annot2mask or annot_df is None.
    """
    if len(text_s) == 0 or annot2mask is None or annot_df is None:
        return text_s

    available = set(annot_df.columns)
    for col, repl_mask in annot2mask.items():
        # Only columns actually present in the dataframe can be masked.
        if col in available:
            text_s = self._substitute(text_s, col, repl_mask, annot_df)
    return text_s
952
+
953
def _substitute(
    self,
    text_s: pd.Series,
    col: str,
    repl_mask: str,
    annot_df: pd.DataFrame,
) -> pd.Series:
    """Substitute the "mask" char for "text" chars at "col"-annotated positions.

    :param text_s: The text series to revise
    :param col: The annotation col identifying positions to mask
    :param repl_mask: The mask character to inject at annotated positions
    :param annot_df: The annotations dataframe
    :return: The masked text series; text_s is returned unchanged when
        no mask can be built for col.
    """
    annot_mask = self._get_annot_mask(annot_df, len(text_s), col)
    if annot_mask is None:
        # _get_annot_mask yields None when col is not in annot_df;
        # Series.mask cannot take None as its condition.
        return text_s
    return text_s.mask(annot_mask, repl_mask)
970
+
971
def add_annotations(self, annotations: Annotations):
    """Add the annotations to this instance.

    Nothing happens when the given annotations are None or empty. If this
    instance holds no annotations object yet, the given object is adopted
    as is; otherwise its dataframe is copied into the (empty) held
    annotations or added to the (non-empty) held annotations.

    :param annotations: The annotations to add.
    """
    if annotations is None or annotations.is_empty():
        return

    incoming = annotations.df
    held = self._annots
    if held is None:
        # Nothing held yet: adopt the incoming annotations object itself.
        self._annots = annotations
        return

    held_is_empty = held.is_empty()
    if incoming is None:
        return
    if held_is_empty:
        # Copy so the held annotations do not share the caller's dataframe.
        held.set_df(incoming.copy())
    else:
        held.add_df(incoming)
985
+
986
+
987
class Annotator(ABC):
    """Abstract base class for objects that annotate text."""

    def __init__(self, name: str):
        """Remember the annotator's name.

        :param name: The name of this annotator
        """
        self.name = name

    @abstractmethod
    def annotate_input(self, text_obj: AnnotatedText, **kwargs) -> Annotations:
        """Annotate the text object, additively updating its annotations.

        Concrete annotators must override this method.

        :param text_obj: The text object to annotate
        :return: The annotations added
        """
        raise NotImplementedError
1009
+
1010
+
1011
class BasicAnnotator(Annotator):
    """Class for extracting basic (possibly multi -level or -part) entities."""

    def annotate_input(
        self,
        text_obj: AnnotatedText,
        **kwargs,
    ) -> Annotations:
        """Annotate the text obj, additively updating the annotations.

        :param text_obj: The text object to annotate
        :return: The annotations added to the text object, or None when
            annotate_text produced no annotations
        """
        # Build fresh annotations from the raw text only.
        annots = self.annotate_text(text_obj.text)

        # Add them to the text object's annotations. Skip when nothing came
        # back, mirroring the None check EntityAnnotator.annotate_input does.
        if annots is not None:
            text_obj.annotations.add_df(annots.as_df)

        return annots

    @abstractmethod
    def annotate_text(self, text_str: str) -> Annotations:
        """Build annotations for the text string.

        :param text_str: The text string to annotate
        :return: Annotations for the text
        """
        raise NotImplementedError
1039
+
1040
+
1041
+ # TODO: remove this if unused -- stanza_annotator isa Authority -vs- stanza_annotator isa SyntacticParser
1042
class SyntacticParser(BasicAnnotator):
    """Class for creating syntactic annotations for an input."""

    def annotate_input(
        self,
        text_obj: AnnotatedText,
        **kwargs,
    ) -> Annotations:
        """Annotate the text, recording the result as a bookmark.

        Unlike BasicAnnotator, the new annotations are not added to the
        text object's annotations; their dataframe is stored in the text
        object's bookmarks under this annotator's name.

        :param text_obj: The text object to annotate
        :return: The annotations built for the text, or None when
            annotate_text produced no annotations
        """
        # Get new annotation with just the syntax
        annots = self.annotate_text(text_obj.text)

        # Add syntactic annotations only as a bookmark; nothing to record
        # when no annotations came back.
        if annots is not None:
            text_obj.bookmarks[self.name] = annots.as_df

        return annots
1062
+
1063
+
1064
class EntityAnnotator(BasicAnnotator):
    """Class for extracting single (possibly multi-level or -part) entities."""

    def __init__(self, name: str, mask_char: str = " "):
        """Initialize with a name and a masking character.

        :param name: The name of this annotator
        :param mask_char: The character to use to mask out previously annotated
            spans of this annotator's text.
        """
        super().__init__(name)
        self.mask_char = mask_char

    @property
    @abstractmethod
    def annotation_cols(self) -> Set[str]:
        """Report the (final group or record) annotation columns that are filled
        by this annotator when its entities are annotated.
        """
        raise NotImplementedError

    @abstractmethod
    def mark_records(self, annotations: Annotations, largest_only: bool = True):
        """Collect and mark annotation records.

        :param annotations: The annotations
        :param largest_only: True to only mark (keep) the largest records.
        """
        raise NotImplementedError

    @abstractmethod
    def validate_records(self, annotations: Annotations):
        """Validate annotated records.

        :param annotations: The annotations
        """
        raise NotImplementedError

    @abstractmethod
    def compose_groups(self, annotations: Annotations) -> Annotations:
        """Compose annotation rows into groups.

        :param annotations: The annotations
        :return: The composed annotations
        """
        raise NotImplementedError

    def annotate_input(
        self,
        text_obj: AnnotatedText,
        annot_mask_cols: Set[str] = None,
        merge_strategies: Dict[str, MergeStrategy] = None,
        largest_only: bool = True,
        **kwargs,
    ) -> Annotations:
        """Annotate the text object, additively updating its annotations.

        Steps: annotate the text, optionally merge with bookmarked
        dataframes, compose groups, mark records, validate records, then
        add the resulting dataframe to the text object's annotations.

        NOTE: the column-to-mask-char mapping derived from annot_mask_cols
        is built but not currently applied; the unmasked text is annotated.

        :param text_obj: The text object to annotate
        :param annot_mask_cols: The (possible) previous annotations whose
            spans to ignore in the text
        :param merge_strategies: A dictionary of each input annotation bookmark
            tag mapped to a merge strategy for merging this annotator's
            annotations with the bookmarked dataframe. This is useful, for
            example, when merging syntactic information to refine ambiguities.
        :param largest_only: True to only mark largest records.
        :return: The annotations added to the text object, or None when
            annotate_text produced no annotations
        """
        # TODO: Use this?!
        annot2mask = None  # noqa: F841 -- kept for the pending masking step
        if annot_mask_cols is not None:
            annot2mask = {col: self.mask_char for col in annot_mask_cols}  # noqa: F841

        annots = self.annotate_text(text_obj.text)
        if annots is None:
            return None

        if merge_strategies is not None:
            bookmarks = text_obj.bookmarks
            if bookmarks is not None and len(bookmarks) > 0:
                for tag, merge_strategy in merge_strategies.items():
                    if tag not in bookmarks:
                        continue
                    # Record the dataframe as it was before this merge.
                    text_obj.bookmarks[f"{self.name}.pre-merge:{tag}"] = annots.df
                    annots.add_df(bookmarks[tag])
                    annots = merge(annots, merge_strategy)

        annots = self.compose_groups(annots)
        self.mark_records(annots, largest_only=largest_only)

        # NOTE: don't pass "text" here because it may be masked
        self.validate_records(annots)

        text_obj.annotations.add_df(annots.df)
        return annots

    @property
    @abstractmethod
    def highlight_fieldstyles(self) -> Dict[str, Dict[str, Dict[str, str]]]:
        """Get highlight field styles for this annotator's annotations of the form:
            {
                <field_col>: {
                    <field_value>: {
                        <css-attr>: <css-value>
                    }
                }
            }
        For css-attr's like 'background-color', 'foreground-color', etc.
        """
        raise NotImplementedError
1180
+
1181
+
1182
class HtmlHighlighter:
    """Helper class to add HTML markup for highlighting spans of text."""

    def __init__(
        self,
        field2style: Dict[str, Dict[str, Dict[str, str]]],
        tooltip_class="tooltip",
        tooltiptext_class="tooltiptext",
    ):
        """Initialize with the styles to apply per annotation column value.

        :param field2style: The annotation column to highlight with its
            associated style, for example:
                {
                    'car_model_field': {
                        'year': {'background-color': 'lightyellow'},
                        'make': {'background-color': 'lightgreen'},
                        'model': {'background-color': 'cyan'},
                        'style': {'background-color': 'magenta'},
                    },
                }
        :param tooltip_class: The css tooltip class
        :param tooltiptext_class: The css tooltiptext class
        """
        self.field2style = field2style
        self.tooltip_class = tooltip_class
        self.tooltiptext_class = tooltiptext_class

    def highlight(
        self,
        text_obj: AnnotatedText,
    ) -> str:
        """Return an html string with the given fields (annotation columns)
        highlighted with the associated styles.

        Each configured field that is a column of the annotations dataframe
        gets its own pass over the text: unannotated text, then a ``<mark>``
        per annotated span, then any text after the last span. When no
        configured field can be rendered, the plain text is emitted once.
        All text taken from the input is HTML-escaped.

        :param text_obj: The annotated text to markup
        :return: The html markup, pieces joined by newlines inside ``<p>``
        """
        # Local import keeps this block self-contained.
        from html import escape

        text = text_obj.text
        anns = text_obj.annotations
        an_df = anns.df

        # Only fields that exist as columns can be looked up in the dataframe.
        renderable = []
        if an_df is not None:
            renderable = [
                (field, styles)
                for field, styles in self.field2style.items()
                if field in an_df.columns
            ]

        result = ["<p>"]
        if not renderable and text:
            result.append(escape(text))

        for field, styles in renderable:
            start_col = anns.metadata.start_pos_col
            end_col = anns.metadata.end_pos_col
            # NOTE: the following line relies on an_df already being sorted
            df = an_df[an_df[field].isin(list(styles))]
            cur_pos = 0
            for _loc, row in df.iterrows():
                enttype = row[field]
                style = styles[enttype]
                style_str = " ".join(f"{key}: {value};" for key, value in style.items())
                start_pos = int(row[start_col])
                end_pos = int(row[end_col])
                if start_pos > cur_pos:
                    result.append(escape(text[cur_pos:start_pos]))
                result.append(f'<mark class="{self.tooltip_class}" style="{style_str}">')
                result.append(escape(text[start_pos:end_pos]))
                result.append(
                    f'<span class="{self.tooltiptext_class}">{escape(str(enttype))}</span>'
                )
                result.append("</mark>")
                cur_pos = end_pos
            # Emit the text that follows the last highlighted span.
            if cur_pos < len(text):
                result.append(escape(text[cur_pos:]))

        result.append("</p>")
        return "\n".join(result)
1238
+
1239
+
1240
class AnnotatorKernel(ABC):
    """Abstract holder of the core annotation logic shared by multiple annotators."""

    @property
    @abstractmethod
    def annotators(self) -> List[EntityAnnotator]:
        """Get the entity annotators.

        :return: The entity annotators of this kernel
        """
        raise NotImplementedError

    @abstractmethod
    def annotate_input(self, text_obj: AnnotatedText) -> Annotations:
        """Execute all annotations on the text_obj.

        :param text_obj: The text object to annotate
        :return: The resulting annotations
        """
        raise NotImplementedError
1253
+
1254
+
1255
class CompoundAnnotator(Annotator):
    """Class to apply a series of annotators through an AnnotatorKernel"""

    def __init__(
        self,
        kernel: AnnotatorKernel,
        name: str = "entity",
    ):
        """Initialize with the annotators and this extractor's name.

        :param kernel: The annotations kernel to use
        :param name: The name of this information extractor to be the
            annotations base column name for <name>_num and <name>_recsnum
        """
        super().__init__(name=name)
        self.kernel = kernel

    def annotate_input(
        self,
        text_obj: AnnotatedText,
        reset: bool = True,
        **kwargs,
    ) -> Annotations:
        """Annotate the text.

        :param text_obj: The AnnotatedText object to annotate.
        :param reset: When True, reset and rebuild any existing annotations
        :return: The annotations added to the text_obj
        """
        if reset:
            text_obj.annotations.clear()
        return self.kernel.annotate_input(text_obj)

    def get_html_highlighted_text(
        self,
        text_obj: AnnotatedText,
        annotator_names: List[str] = None,
    ) -> str:
        """Get html-highlighted text for the identified input's annotations
        from the given annotators (or all).

        Each selected annotator's highlight_fieldstyles (field column ->
        field value -> css style) is merged into one field-level mapping,
        which is the shape HtmlHighlighter expects. Keying the styles by
        annotator name instead would add a nesting level and make the
        highlighter look up annotator names as dataframe columns.

        :param text_obj: The input text to highlight
        :param annotator_names: The subset of annotators to highlight.
        :return: The html markup for the highlighted text
        """
        selected = None if annotator_names is None else set(annotator_names)

        field2style = {}
        for ann in self.kernel.annotators:
            if selected is not None and ann.name not in selected:
                continue
            # Merge per field so annotators sharing a column combine styles.
            for field, styles in (ann.highlight_fieldstyles or {}).items():
                field2style.setdefault(field, {}).update(styles)

        return HtmlHighlighter(field2style).highlight(text_obj)