PyPI - dataknobs-xization - Versions diffs - 1.0.0__py3-none-any.whl - Mend

dataknobs-xization 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of dataknobs-xization might be problematic. Click here for more details.

Files changed (10) hide show

dataknobs_xization/0.readme.txt +66 -0
dataknobs_xization/__init__.py +16 -0
dataknobs_xization/annotations.py +1308 -0
dataknobs_xization/authorities.py +766 -0
dataknobs_xization/lexicon.py +596 -0
dataknobs_xization/masking_tokenizer.py +697 -0
dataknobs_xization/normalize.py +448 -0
dataknobs_xization-1.0.0.dist-info/METADATA +58 -0
dataknobs_xization-1.0.0.dist-info/RECORD +10 -0
dataknobs_xization-1.0.0.dist-info/WHEEL +4 -0

dataknobs_xization/annotations.py ADDED Viewed

@@ -0,0 +1,1308 @@
+import json
+from abc import ABC, abstractmethod
+from collections.abc import Callable
+from typing import Any, Dict, List, Set
+import numpy as np
+import pandas as pd
+import dataknobs_structures.document as dk_doc
+# Key annotation column-type constants for use across annotation interfaces.
+# Each value identifies a "key" column type and doubles as the default column
+# name used by AnnotationsMetaData when no override is given.
+KEY_START_POS_COL = "start_pos"
+KEY_END_POS_COL = "end_pos"
+KEY_TEXT_COL = "text"
+KEY_ANN_TYPE_COL = "ann_type"
+class AnnotationsMetaData(dk_doc.MetaData):
+    """Container for annotations meta-data, identifying key column names.
+    NOTE: this object contains only information about annotation column names
+    and not annotation table values.
+    """
+    def __init__(
+        self,
+        start_pos_col: str = KEY_START_POS_COL,
+        end_pos_col: str = KEY_END_POS_COL,
+        text_col: str = KEY_TEXT_COL,
+        ann_type_col: str = KEY_ANN_TYPE_COL,
+        sort_fields: List[str] = (KEY_START_POS_COL, KEY_END_POS_COL),
+        sort_fields_ascending: List[bool] = (True, False),
+        **kwargs,
+    ):
+        """Initialize with key (and more) column names and info.
+        Key column types:
+          * start_pos
+          * end_pos
+          * text
+          * ann_type
+        Notes:
+          * Actual table columns can be named arbitrarily
+             * BUT: interactions through annotations classes and interfaces
+               relating to the "key" columns must use the key column constants
+        :param start_pos_col: Col name for the token starting position
+        :param end_pos_col: Col name for the token ending position
+        :param text_col: Col name for the token text
+        :param ann_type_col: Col name for the annotation types
+        :param sort_fields: The col types relevant for sorting annotation rows
+        :param sort_fields_ascending: To specify sort order of sort_fields
+        :param **kwargs: More column types mapped to column names
+        """
+        super().__init__(
+            {
+                KEY_START_POS_COL: start_pos_col,
+                KEY_END_POS_COL: end_pos_col,
+                KEY_TEXT_COL: text_col,
+                KEY_ANN_TYPE_COL: ann_type_col,
+            },
+            **kwargs,
+        )
+        self.sort_fields = list(sort_fields)
+        self.ascending = sort_fields_ascending
+    @property
+    def start_pos_col(self) -> str:
+        """Get the column name for the token starting postition"""
+        return self.data[KEY_START_POS_COL]
+    @property
+    def end_pos_col(self) -> str:
+        """Get the column name for the token ending position"""
+        return self.data[KEY_END_POS_COL]
+    @property
+    def text_col(self) -> str:
+        """Get the column name for the token text"""
+        return self.data[KEY_TEXT_COL]
+    @property
+    def ann_type_col(self) -> str:
+        """Get the column name for the token annotation type"""
+        return self.data[KEY_ANN_TYPE_COL]
+    def get_col(self, col_type: str, missing: str = None) -> str:
+        """Get the name of the column having the given type (including key column
+        types but not derived,) or get the missing value.
+        :param col_type: The type of column name to get
+        :param missing: The value to return for unknown column types
+        :return: The column name or the missing value
+        """
+        return self.get_value(col_type, missing)
+    def sort_df(self, an_df: pd.DataFrame):
+        """Sort an annotations dataframe according to this metadata.
+        :param an_df: An annotations dataframe
+        :return: The sorted annotations dataframe.
+        """
+        if self.sort_fields is not None:
+            an_df = an_df.sort_values(self.sort_fields, ascending=self.ascending)
+        return an_df
+class DerivedAnnotationColumns(ABC):
+    """Interface for injecting derived columns into AnnotationsMetaData."""
+    @abstractmethod
+    def get_col_value(
+        self,
+        metadata: AnnotationsMetaData,
+        col_type: str,
+        row: pd.Series,
+        missing: str = None,
+    ) -> str:
+        """Get the value of the column in the given row derived from col_type.
+        :param metadata: The AnnotationsMetaData
+        :param col_type: The type of column value to derive
+        :param row: A row from which to get the value.
+        :param missing: The value to return for unknown or missing column
+        :return: The row value or the missing value
+        """
+        raise NotImplementedError
+class AnnotationsRowAccessor:
+    """A class that accesses row data according to the metadata and derived cols."""
+    def __init__(
+        self, metadata: AnnotationsMetaData, derived_cols: DerivedAnnotationColumns = None
+    ):
+        """:param metadata: The metadata for annotation columns
+        :param derived_cols: A DerivedAnnotationColumns instance for injecting
+            derived columns.
+        """
+        self.metadata = metadata
+        self.derived_cols = derived_cols
+    def get_col_value(
+        self,
+        col_type: str,
+        row: pd.Series,
+        missing: str = None,
+    ) -> str:
+        """Get the value of the column in the given row with the given type.
+        This gets the value from the first existing column in the row from:
+          * The metadata.get_col(col_type) column
+          * col_type itself
+          * The columns derived from col_type
+        :param col_type: The type of column value to get
+        :param row: A row from which to get the value.
+        :param missing: The value to return for unknown or missing column
+        :return: The row value or the missing value
+        """
+        value = missing
+        col = self.metadata.get_col(col_type, None)
+        if col is None or col not in row.index:
+            if col_type in self.metadata.data:
+                value = row[col_type]
+            elif self.derived_cols is not None:
+                value = self.derived_cols.get_col_value(self.metadata, col_type, row, missing)
+        else:
+            value = row[col]
+        return value
+class Annotations:
+    """DAO for collecting and managing a table of annotations, where each row
+    carries annotation information for an input token.
+    The data in this class is maintained either as a list of dicts, each dict
+    representing a "row," or as a pandas DataFrame, depending on the latest
+    access. Changes in either the lists or dataframe will be reflected in the
+    alternate data structure.
+    """
+    def __init__(
+        self,
+        metadata: AnnotationsMetaData,
+        df: pd.DataFrame = None,
+    ):
+        """Construct as empty or initialize with the dataframe form.
+        :param df: A dataframe with annotation records.
+        """
+        self.metadata = metadata
+        self._annotations_list = None
+        self._df = df
+    @property
+    def ann_row_dicts(self) -> List[Dict[str, Any]]:
+        """Get the annotations as a list of dictionaries."""
+        if self._annotations_list is None:
+            self._annotations_list = self._build_list()
+        return self._annotations_list
+    @property
+    def df(self) -> pd.DataFrame:
+        """Get the annotations as a pandas dataframe."""
+        if self._df is None:
+            self._df = self._build_df()
+        return self._df
+    def clear(self) -> pd.DataFrame:
+        """Clear/empty out all annotations, returning the annotations df"""
+        rv = self.df
+        self._df = None
+        self._annotations_list = None
+        return rv
+    def is_empty(self) -> bool:
+        return (self._df is None or len(self._df) == 0) and (
+            self._annotations_list is None or len(self._annotations_list) == 0
+        )
+    def add_dict(self, annotation: Dict[str, Any]):
+        """Add the annotation dict."""
+        self.ann_row_dicts.append(annotation)
+    def add_dicts(self, annotations: List[Dict[str, Any]]):
+        """Add the annotation dicts."""
+        self.ann_row_dicts.extend(annotations)
+    def add_df(self, an_df: pd.DataFrame):
+        """Add (concatentate) the annotation dataframe to the current annotations."""
+        df = self.metadata.sort_df(pd.concat([self.df, an_df]))
+        self.set_df(df)
+    def _build_list(self) -> List[Dict[str, Any]]:
+        """Build the annotations list from the dataframe."""
+        alist = None
+        if self._df is not None:
+            alist = self._df.to_dict(orient="records")
+            self._df = None
+        return alist if alist is not None else list()
+    def _build_df(self) -> pd.DataFrame:
+        """Get the annotations as a df."""
+        df = None
+        if self._annotations_list is not None:
+            if len(self._annotations_list) > 0:
+                df = self.metadata.sort_df(pd.DataFrame(self._annotations_list))
+            self._annotations_list = None
+        return df
+    def set_df(self, df: pd.DataFrame):
+        """Set (or reset) this annotation's dataframe.
+        :param df: The new annotations dataframe.
+        """
+        self._df = df
+        self._annotations_list = None
+class AnnotationsBuilder:
+    """A class for building annotations."""
+    def __init__(
+        self,
+        metadata: AnnotationsMetaData,
+        data_defaults: Dict[str, Any],
+    ):
+        """:param metadata: The annotations metadata
+        :param data_defaults: Dict[ann_colname, default_value] with default
+            values for annotation columns
+        """
+        self.metadata = metadata if metadata is not None else AnnotationsMetaData()
+        self.data_defaults = data_defaults
+    def build_annotation_row(
+        self, start_pos: int, end_pos: int, text: str, ann_type: str, **kwargs
+    ) -> Dict[str, Any]:
+        """Build an annotation row with the mandatory key values and those from
+        the remaining keyword arguments.
+        For those kwargs whose names match metadata column names, override the
+        data_defaults and add remaining data_default attributes.
+        :param result_row_dict: The result row dictionary being built
+        :param start_pos: The token start position
+        :param end_pos: The token end position
+        :param text: The token text
+        :param ann_type: The annotation type
+        :return: The result_row_dict
+        """
+        return self.do_build_row(
+            {
+                self.metadata.start_pos_col: start_pos,
+                self.metadata.end_pos_col: end_pos,
+                self.metadata.text_col: text,
+                self.metadata.ann_type_col: ann_type,
+            },
+            **kwargs,
+        )
+    def do_build_row(self, key_fields: Dict[str, Any], **kwargs) -> Dict[str, Any]:
+        """Do the row building with the key fields, followed by data defaults,
+        followed by any extra kwargs.
+        :param key_fields: The dictionary of key fields
+        :param kwargs: Any extra fields to add
+        """
+        result = dict()
+        result.update(key_fields)
+        if self.data_defaults is not None:
+            # Add data_defaults
+            result.update(self.data_defaults)
+        if kwargs is not None:
+            # Override with extra kwargs
+            result.update(kwargs)
+        return result
+class RowData:
+    """A wrapper for an annotation row (pd.Series) to facilitate e.g., grouping."""
+    def __init__(
+        self,
+        metadata: AnnotationsMetaData,
+        row: pd.Series,
+    ):
+        self.metadata = metadata
+        self.row = row
+    @property
+    def loc(self):
+        return self.row.name
+    def __repr__(self) -> str:
+        return f'[{self.start_pos}:{self.end_pos})"{self.text}"'
+    @property
+    def start_pos(self) -> int:
+        return self.row[self.metadata.start_pos_col]
+    @property
+    def end_pos(self) -> int:
+        return self.row[self.metadata.end_pos_col]
+    @property
+    def text(self) -> str:
+        return self.row[self.metadata.text_col]
+    def is_subset(self, other_row: "RowData") -> bool:
+        """Determine whether this row's span is a subset of the other.
+        :param other_row: The other row
+        """
+        return self.start_pos >= other_row.start_pos and self.end_pos <= other_row.end_pos
+    def is_subset_of_any(self, other_rows: List["RowData"]) -> bool:
+        """Determine whether this row is a subset of any of the others
+        according to text span coverage.
+        :param other_rows: The rows to test for this to be a subset of any
+        """
+        result = False
+        for other_row in other_rows:
+            if self.is_subset(other_row):
+                result = True
+                break
+        return result
+class AnnotationsGroup:
+    """Container for annotation rows that belong together as a (consistent) group.
+    NOTE: An instance will only accept rows on condition of consistency per its
+    acceptance function.
+    """
+    def __init__(
+        self,
+        row_accessor: AnnotationsRowAccessor,
+        field_col_type: str,
+        accept_fn: Callable[["AnnotationsGroup", RowData], bool],
+        group_type: str = None,
+        group_num: int = None,
+        valid: bool = True,
+        autolock: bool = False,
+    ):
+        """:param row_accessor: The annotations row_accessor
+        :param field_col_type: The col_type for the group field_type for retrieval
+           using the annotations row accessor
+        :param accept_fn: A fn(g, row_data) that returns True to accept the row
+            data into this group g, or False to reject the row. If None, then
+            all rows are always accepted.
+        :param group_type: An optional (override) type for identifying this group.
+        :param group_num: An optional number for identifying this group.
+        :param valid: True if the group is valid, or False if not
+        :param autolock: True to automatically lock this group when (1) at
+            least one row has been added and (2) a row is rejected.
+        """
+        self.rows = list()  # List[RowData]
+        self.row_accessor = row_accessor
+        self.field_col_type = field_col_type
+        self.accept_fn = accept_fn
+        self._group_type = group_type
+        self._group_num = group_num
+        self._valid = valid
+        self._autolock = autolock
+        self._locked = False
+        self._locs = None  # track loc's for recognizing dupes
+        self._key = None  # a hash key using the _locs
+        self._df = None
+        self._ann_type = None
+    @property
+    def is_locked(self) -> bool:
+        """Get whether this group is locked from adding more rows."""
+        return self._locked
+    @is_locked.setter
+    def is_locked(self, value: bool):
+        """Set this group as locked (value=True) or unlocked (value=False) to
+        allow or disallow more rows from being added regardless of the accept
+        function.
+        Note that while unlocked only rows that pass the accept function will
+        be added.
+        :param value: True to lock or False to unlock this group.
+        """
+        self._locked = value
+    @property
+    def is_valid(self) -> bool:
+        """Get whether this group is currently marked as valid."""
+        return self._valid
+    @is_valid.setter
+    def is_valid(self, value: bool):
+        """Mark this group as valid (value=True) or invalid (value=False).
+        :param value: True for valid or False for invalid.
+        """
+        self._valid = value
+    @property
+    def autolock(self) -> bool:
+        """Get whether this group is currently set to autolock."""
+        return self._autolock
+    @autolock.setter
+    def autolock(self, value: bool):
+        """Set this group to autolock (True) or not (False).
+        :param value: True for False to autolock or not.
+        """
+        self._autolock = value
+    def __repr__(self):
+        return json.dumps(self.to_dict())
+    @property
+    def size(self) -> int:
+        """Get the number of rows in this group."""
+        return len(self.rows)
+    @property
+    def group_type(self) -> str:
+        """Get this group's type, which is either an "override" value that has
+        been set, or the "ann_type" value of the first row added.
+        """
+        return self._group_type if self._group_type is not None else self.ann_type
+    @group_type.setter
+    def group_type(self, value: str):
+        """Set this group's type"""
+        self._group_type = value
+    @property
+    def group_num(self) -> int:
+        """Get this group's number"""
+        return self._group_num
+    @group_num.setter
+    def group_num(self, value: int):
+        """Set this group's num"""
+        self._group_num = value
+    @property
+    def df(self) -> pd.DataFrame:
+        """Get this group as a dataframe"""
+        if self._df is None:
+            self._df = pd.DataFrame([r.row for r in self.rows])
+        return self._df
+    @property
+    def ann_type(self) -> str:
+        """Get this record's annotation type"""
+        return self._ann_type
+    @property
+    def text(self) -> str:
+        return " ".join([row.text for row in self.rows])
+    @property
+    def locs(self) -> List[int]:
+        if self._locs is None:
+            self._locs = [r.loc for r in self.rows]
+        return self._locs
+    @property
+    def key(self) -> str:
+        """A hash key for this group."""
+        if self._key is None:
+            self._key = "_".join([str(x) for x in sorted(self.locs)])
+        return self._key
+    def copy(self) -> "AnnotationsGroup":
+        result = AnnotationsGroup(
+            self.row_accessor,
+            self.field_col_type,
+            self.accept_fn,
+            group_type=self.group_type,
+            group_num=self.group_num,
+            valid=self.is_valid,
+            autolock=self.autolock,
+        )
+        result.rows = self.rows.copy()
+        result._locked = self._locked  # pylint: disable=protected-access
+        result._ann_type = self._ann_type  # pylint: disable=protected-access
+    def add(self, rowdata: RowData) -> bool:
+        """Add the row if the group is not locked and the row belongs in this
+        group, or return False.
+        If autolock is True and a row fails to be added (after the first
+        row has been added,) "lock" the group and refuse to accept any more
+        rows.
+        :param rowdata: The row to add
+        :return: True if the row belongs and was added; otherwise, False
+        """
+        result = False
+        if self._locked:
+            return result
+        if self.accept_fn is None or self.accept_fn(self, rowdata):
+            self.rows.append(rowdata)
+            self._df = None
+            self._locs = None
+            self._key = None
+            if self._ann_type is None:
+                self._ann_type = self.row_accessor.get_col_value(
+                    KEY_ANN_TYPE_COL,
+                    rowdata.row,
+                    missing=None,
+                )
+            result = True
+        if not result and self.size > 0 and self.autolock:
+            self._locked = True
+        return result
+    def to_dict(self) -> Dict[str, str]:
+        """Get this group (record) as a dictionary of field type to text values."""
+        return {self.row_accessor.get_col_value(self.field_col_type): row.text for row in self.rows}
+    def is_subset(self, other: "AnnotationsGroup") -> bool:
+        """Determine whether the this group's text is contained within the others.
+        :param other: The other group
+        """
+        result = True
+        for my_row in self.rows:
+            if not my_row.is_subset_of_any(other.rows):
+                result = False
+                break
+        return result
+    def is_subset_of_any(self, groups: List["AnnotationsGroup"]) -> "AnnotationsGroup":
+        """Determine whether this group is a subset of any of the given groups.
+        :param groups: List of annotation groups
+        :return: The first AnnotationsGroup that this group is a subset of, or
+            None
+        """
+        result = None
+        for other_group in groups:
+            if self.is_subset(other_group):
+                result = other_group
+                break
+        return result
+    def remove_row(
+        self,
+        row_idx: int,
+    ) -> RowData:
+        """Remove the row from this group and optionally update the annotations
+        accordingly.
+        :param row_idx: The positional index of the row to remove
+        :return: The removed row data instance
+        """
+        rowdata = self.rows.pop(row_idx)
+        # Reset cached values
+        self._df = None
+        self._locs = None
+        self._key = None
+        return rowdata
+class MergeStrategy(ABC):
+    """A merge strategy to be injected based on entity types being merged."""
+    @abstractmethod
+    def merge(self, group: AnnotationsGroup) -> List[Dict[str, Any]]:
+        """Process the annotations in the given annotations group, returning the
+        group's merged annotation dictionaries.
+        """
+        raise NotImplementedError
+class PositionalAnnotationsGroup(AnnotationsGroup):
+    """Container for annotations that either overlap with each other or don't."""
+    def __init__(self, overlap: bool, rectype: str = None, gnum: int = -1):
+        """:param overlap: If False, then only accept rows that don't overlap; else
+        only accept rows that do ovelap
+        """
+        super().__init__(None, None, None, group_type=rectype, group_num=gnum)
+        self.overlap = overlap
+        self.start_pos = -1
+        self.end_pos = -1
+    def __repr__(self) -> str:
+        return f'nrows={len(self.rows)}[{self.start_pos},{self.end_pos})"{self.entity_text}"'
+    @property
+    def entity_text(self) -> str:
+        jstr = " | " if self.overlap else " "
+        return jstr.join(r.entity_text for r in self.rows)
+    def belongs(self, rowdata: RowData) -> bool:
+        """Determine if the row belongs in this instance based on its overlap
+        or not.
+        :param rowdata: The rowdata to test
+        :return: True if the rowdata belongs in this instance
+        """
+        result = True  # Anything belongs to an empty group
+        if len(self.rows) > 0:
+            start_overlaps = self._is_in_bounds(rowdata.start_pos)
+            end_overlaps = self._is_in_bounds(rowdata.end_pos - 1)
+            result = start_overlaps or end_overlaps
+            if not self.overlap:
+                result = not result
+        if result:
+            if self.start_pos < 0:
+                self.start_pos = rowdata.start_pos
+                self.end_pos = rowdata.end_pos
+            else:
+                self.start_pos = min(self.start_pos, rowdata.start_pos)
+                self.end_pos = max(self.end_pos, rowdata.end_pos)
+        return result
+    def _is_in_bounds(self, char_pos):
+        return char_pos >= self.start_pos and char_pos < self.end_pos
+    def copy(self) -> "PositionalAnnotationsGroup":
+        result = PositionalAnnotationsGroup(self.overlap)
+        result.start_pos = self.start_pos
+        result.end_pos = self.end_pos
+        result.rows = self.rows.copy()
+        return result
+    # TODO: Add comparison and merge functions
+class OverlapGroupIterator:
+    """Given:
+      * annotation rows (dataframe)
+        * in order sorted by
+          * start_pos (increasing for input order), and
+          * end_pos (decreasing for longest spans first)
+    Collect:
+      * overlapping consecutive annotations
+      * for processing
+    """
+    def __init__(self, an_df: pd.DataFrame):
+        """:param an_df: An annotations.as_df DataFrame, sliced and sorted."""
+        self.an_df = an_df
+        self._cur_iter = None
+        self._queued_row_data = None
+        self.cur_group = None
+        self.reset()
+    def next_group(self) -> AnnotationsGroup:
+        group = None
+        if self.has_next:
+            group = PositionalAnnotationsGroup(True)
+            while self.has_next and group.belongs(self._queued_row_data):
+                self._queue_next()
+            self.cur_group = group
+        return group
+    def reset(self):
+        self._cur_iter = self.an_df.iterrows()
+        self._queue_next()
+        self.cur_group = None
+    @property
+    def has_next(self) -> bool:
+        return self._queued_row_data is not None
+    def _queue_next(self):
+        try:
+            _loc, row = next(self._cur_iter)
+            self._queued_row_data = RowData(None, row)  # TODO: add metadata
+        except StopIteration:
+            self._queued_row_data = None
+def merge(
+    annotations: Annotations,
+    merge_strategy: MergeStrategy,
+) -> Annotations:
+    """Merge the overlapping groups according to the given strategy."""
+    og_iter = OverlapGroupIterator(annotations.as_df)
+    result = Annotations(annotations.metadata)
+    while og_iter.has_next:
+        og = og_iter.next_group()
+        result.add_dicts(merge_strategy.merge(og))
+    return result
+class AnnotationsGroupList:
+    """Container for a list of annotation groups."""
+    def __init__(
+        self,
+        groups: List[AnnotationsGroup] = None,
+        accept_fn: Callable[["AnnotationsGroupList", AnnotationsGroup], bool] = lambda l, g: l.size
+        == 0
+        or not g.is_subset_of_any(l.groups),
+    ):
+        """:param groups: The initial groups for this list
+        :param accept_fn: A fn(l, g) that returns True to accept the group, g,
+            into this list, l, or False to reject the group. If None, then all
+            groups are always accepted. The default function will reject any
+            group that is a subset of any existing group in the list.
+        """
+        self.groups = groups if groups is not None else list()
+        self.accept_fn = accept_fn
+        self._coverage = None
+    def __repr__(self) -> str:
+        return str(self.groups)
+    @property
+    def size(self) -> int:
+        """Get the number of groups in this list"""
+        return len(self.groups)
+    @property
+    def coverage(self) -> int:
+        """Get the total number of (token) rows covered by the groups"""
+        if self._coverage is None:
+            locs = set()
+            for group in self.groups:
+                locs.update(set(group.locs))
+            self._coverage = len(locs)
+        return self._coverage
+    @property
+    def df(self) -> pd.DataFrame:
+        return pd.DataFrame([r.row for g in self.groups for r in g.rows])
+    def copy(self) -> "AnnotationsGroupList":
+        result = AnnotationsGroupList(self.groups.copy(), accept_fn=self.accept_fn)
+        result._coverage = self._coverage  # pylint: disable=protected-access
+        return result
+    def add(self, group: AnnotationsGroup) -> bool:
+        """Add the group if it belongs in this group list or return False.
+        :param group: The group to add
+        :return: True if the group belongs and was added; otherwise, False
+        """
+        result = False
+        if self.accept_fn is None or self.accept_fn(self, group):
+            self.groups.append(group)
+            self._coverage = None
+            result = True
+        return result
+    def is_subset(self, other: "AnnotationsGroupList") -> bool:
+        """Determine whether the this group's text spans are contained within all
+        of the other's.
+        :param other: The other group list
+        """
+        result = True
+        for my_group in self.groups:
+            if not my_group.is_subset_of_any(other.groups):
+                result = False
+                break
+        return result
+class AnnotatedText(dk_doc.Text):
+    """A Text object that manages its own annotations."""
+    def __init__(
+        self,
+        text_str: str,
+        metadata: dk_doc.TextMetaData = None,
+        annots: Annotations = None,
+        bookmarks: Dict[str, pd.DataFrame] = None,
+        text_obj: dk_doc.Text = None,
+        annots_metadata: AnnotationsMetaData = None,
+    ):
+        """:param text_str: The text string
+        :param metadata: The text's metadata
+        :param annots: The annotations
+        :param bookmarks: The annotation bookmarks
+        :param text_obj: A text_obj to override text_str and metadata
+            initialization
+        :param annots_metadata: Override for default annotations metadata
+            (NOTE: ineffectual if an annots instance is provided.)
+        """
+        super().__init__(
+            text_obj.text if text_obj is not None else text_str,
+            text_obj.metadata if text_obj is not None else metadata,
+        )
+        self._annots = annots
+        self._bookmarks = bookmarks
+        self._annots_metadata = annots_metadata
+    @property
+    def annotations(self) -> Annotations:
+        """Get the this object's annotations"""
+        if self._annots is None:
+            self._annots = Annotations(self._annots_metadata or AnnotationsMetaData())
+        return self._annots
+    @property
+    def bookmarks(self) -> Dict[str, pd.DataFrame]:
+        """Get this object's bookmarks"""
+        if self._bookmarks is None:
+            self._bookmarks = dict()
+        return self._bookmarks
+    def get_text(
+        self,
+        annot2mask: Dict[str, str] = None,
+        annot_df: pd.DataFrame = None,
+        text: str = None,
+    ) -> str:
+        """Get the text object's string, masking if indicated.
+        :param annot2mask: Mapping from annotation column (e.g., _num or
+            _recsnum) to the replacement character(s) in the input text
+            for masking already managed input.
+        :param annot_df: Override annotations dataframe
+        :param text: Override text
+        :return: The (masked) text
+        """
+        if annot2mask is None:
+            return self.text
+        # Apply the mask
+        text_s = self.get_text_series(text=text)  # no padding
+        if annot2mask is not None:
+            annot_df = self.annotations.as_df
+            text_s = self._apply_mask(text_s, annot2mask, annot_df)
+        return "".join(text_s)
+    def get_text_series(
+        self,
+        pad_len: int = 0,
+        text: str = None,
+    ) -> pd.Series:
+        """Get the input text as a (padded) pandas series.
+        :param pad_len: The number of spaces to pad both front ant back
+        :param text: Override text
+        :return: The (padded) pandas series of input characters.
+        """
+        if text is None:
+            text = self.text
+        return pd.Series(list(" " * pad_len + text + " " * pad_len))
+    def get_annot_mask(
+        self,
+        annot_col: str,
+        pad_len: int = 0,
+        annot_df: pd.DataFrame = None,
+        text: str = None,
+    ) -> pd.Series:
+        """Get a True/False series for the input such that start to end positions
+        for rows where the the annotation column is non-null and non-empty are
+        True.
+        :param annot_col: The annotation column identifying chars to mask
+        :param pad_len: The number of characters to pad the mask with False
+            values at both the front and back.
+        :param annot_df: Override annotations dataframe
+        :param text: Override text
+        :return: A pandas Series where annotated input character positions
+            are True and non-annotated positions are False.
+        """
+        if annot_df is None:
+            annot_df = self.annotations.as_df
+        if text is None:
+            text = self.text
+        textlen = len(text)
+        return self._get_annot_mask(annot_df, textlen, annot_col, pad_len=pad_len)
+    @staticmethod
+    def _get_annot_mask(
+        annot_df: pd.DataFrame,
+        textlen: int,
+        annot_col: str,
+        pad_len: int = 0,
+    ) -> pd.Series:
+        """Get a True/False series for the input such that start to end positions
+        for rows where the the annotation column is non-null and non-empty are
+        True.
+        :param annot_df: The annotations dataframe
+        :param textlen: The length of the input text
+        :param annot_col: The annotation column identifying chars to mask
+        :param pad_len: The number of characters to pad the mask with False
+            values at both the front and back.
+        :return: A pandas Series where annotated input character positions
+            are True and non-annotated positions are False.
+        """
+        mask = None
+        df = annot_df
+        if annot_col in df.columns:
+            df = df[np.logical_and(df[annot_col].notna(), df[annot_col] != "")]
+            mask = pd.Series([False] * textlen)
+            for _, row in df.iterrows():
+                mask.loc[row["start_pos"] + pad_len : row["end_pos"] - 1 + pad_len] = True
+        return mask
+    def _apply_mask(
+        self,
+        text_s: pd.Series,
+        annot2mask: Dict[str, str],
+        annot_df: pd.DataFrame,
+    ) -> str:
+        if len(text_s) > 0 and annot2mask is not None and annot_df is not None:
+            cols = set(annot_df.columns).intersection(annot2mask.keys())
+            if len(cols) > 0:
+                for col in cols:
+                    text_s = self._substitute(
+                        text_s,
+                        col,
+                        annot2mask[col],
+                        annot_df,
+                    )
+        return text_s
+    def _substitute(
+        self,
+        text_s: pd.Series,
+        col: str,
+        repl_mask: str,
+        annot_df: pd.DataFrame,
+    ) -> str:
+        """Substitute the "mask" char for "text" chars at "col"-annotated positions
+        :param text_s: The text series to revise
+        :param col: The annotation col identifying positions to mask
+        :param repl_mask: The mask character to inject at annotated positions
+        :param annot_df: The annotations dataframe
+        :return: The masked text
+        """
+        annot_mask = self._get_annot_mask(annot_df, len(text_s), col)
+        text_s = text_s.mask(annot_mask, repl_mask)
+        return text_s
+    def add_annotations(self, annotations: Annotations):
+        """Add the annotations to this instance.
+        :param annotations: The annotations to add.
+        """
+        if annotations is not None and not annotations.is_empty():
+            df = annotations.df
+            if self._annots is None:
+                self._annots = annotations
+            elif self._annots.is_empty():
+                if df is not None:
+                    self._annots.set_df(df.copy())
+            elif df is not None:
+                self._annots.add_df(df)
+class Annotator(ABC):
+    """Class for annotating text"""
+    def __init__(
+        self,
+        name: str,
+    ):
+        """:param name: The name of this annotator"""
+        self.name = name
+    @abstractmethod
+    def annotate_input(
+        self,
+        text_obj: AnnotatedText,
+        **kwargs,
+    ) -> Annotations:
+        """Annotate this instance's text, additively updating its annotations.
+        :param text_obj: The text object to annotate
+        :return: The annotations added
+        """
+        raise NotImplementedError
+class BasicAnnotator(Annotator):
+    """Class for extracting basic (possibly multi -level or -part) entities."""
+    def annotate_input(
+        self,
+        text_obj: AnnotatedText,
+        **kwargs,
+    ) -> Annotations:
+        """Annotate the text obj, additively updating the annotations
+        :param text: The text to annotate
+        :return: The annotations added to the text
+        """
+        # Get new annotation with just the syntax
+        annots = self.annotate_text(text_obj.text)
+        # Add syntactic annotations only as a bookmark
+        text_obj.annotations.add_df(annots.as_df)
+        return annots
+    @abstractmethod
+    def annotate_text(self, text_str: str) -> Annotations:
+        """Build annotations for the text string.
+        :param text_str: The text string to annotate
+        :return: Annotations for the text
+        """
+        raise NotImplementedError
+# TODO: remove this if unused -- stanza_annotator isa Authority -vs- stanza_annotator isa SyntacticParser
+class SyntacticParser(BasicAnnotator):
+    """Class for creating syntactic annotations for an input."""
+    def annotate_input(
+        self,
+        text_obj: AnnotatedText,
+        **kwargs,
+    ) -> Annotations:
+        """Annotate the text, additively updating the annotations
+        :param text: The text to annotate
+        :return: The annotations added to the text
+        """
+        # Get new annotation with just the syntax
+        annots = self.annotate_text(text_obj.text)
+        # Add syntactic annotations only as a bookmark
+        text_obj.bookmarks[self.name] = annots.as_df
+        return annots
+class EntityAnnotator(BasicAnnotator):
+    """Class for extracting single (possibly multi-level or -part) entities."""
+    def __init__(
+        self,
+        name: str,
+        mask_char: str = " ",
+    ):
+        """:param name: The name of this annotator
+        :param mask_char: The character to use to mask out previously annotated
+            spans of this annotator's text.
+        """
+        super().__init__(name)
+        self.mask_char = mask_char
+    @property
+    @abstractmethod
+    def annotation_cols(self) -> Set[str]:
+        """Report the (final group or record) annotation columns that are filled
+        by this annotator when its entities are annotated.
+        """
+        raise NotImplementedError
+    @abstractmethod
+    def mark_records(self, annotations: Annotations, largest_only: bool = True):
+        """Collect and mark annotation records.
+        :param annotations: The annotations
+        :param largest_only: True to only mark (keep) the largest records.
+        """
+        raise NotImplementedError
+    @abstractmethod
+    def validate_records(
+        self,
+        annotations: Annotations,
+    ):
+        """Validate annotated records.
+        :param annotations: The annotations
+        """
+        raise NotImplementedError
+    @abstractmethod
+    def compose_groups(self, annotations: Annotations) -> Annotations:
+        """Compose annotation rows into groups.
+        :param annotations: The annotations
+        :return: The composed annotations
+        """
+        raise NotImplementedError
+    def annotate_input(
+        self,
+        text_obj: AnnotatedText,
+        annot_mask_cols: Set[str] = None,
+        merge_strategies: Dict[str, MergeStrategy] = None,
+        largest_only: bool = True,
+        **kwargs,
+    ) -> Annotations:
+        """Annotate the text object (optionally) after masking out previously
+        annotated spans, additively updating the annotations in the text
+        object.
+        :param text_obj: The text object to annotate
+        :param annot_mask_cols: The (possible) previous annotations whose
+            spans to ignore in the text
+        :param merge_strategies: A dictionary of each input annotation bookmark
+            tag mapped to a merge strategy for merging this annotator's
+            annotations with the bookmarked dataframe. This is useful, for
+            example, when merging syntactic information to refine ambiguities.
+        :param largest_only: True to only mark largest records.
+        :return: The annotations added to the text object
+        """
+        annot2mask = (
+            None
+            if annot_mask_cols is None
+            else {  # TODO: Use this?!
+                col: self.mask_char for col in annot_mask_cols
+            }
+        )
+        annots = self.annotate_text(text_obj.text)
+        if annots is None:
+            return annots
+        if merge_strategies is not None:
+            bookmarks = text_obj.bookmarks
+            if bookmarks is not None and len(bookmarks) > 0:
+                for tag, merge_strategy in merge_strategies.items():
+                    if tag in bookmarks:
+                        text_obj.bookmarks[f"{self.name}.pre-merge:{tag}"] = annots.df
+                        annots.add_df(bookmarks[tag])
+                        annots = merge(annots, merge_strategy)
+        annots = self.compose_groups(annots)
+        self.mark_records(annots, largest_only=largest_only)
+        # NOTE: don't pass "text" here because it may be masked
+        self.validate_records(annots)
+        text_obj.annotations.add_df(annots.df)
+        return annots
+    @property
+    @abstractmethod
+    def highlight_fieldstyles(self) -> Dict[str, Dict[str, Dict[str, str]]]:
+        """Get highlight field styles for this annotator's annotations of the form:
+        {
+            <field_col>: {
+                <field_value>: {
+                    <css-attr>: <css-value>
+                }
+            }
+        }
+        For css-attr's like 'background-color', 'foreground-color', etc.
+        """
+        raise NotImplementedError
+class HtmlHighlighter:
+    """Helper class to add HTML markup for highlighting spans of text."""
+    def __init__(
+        self,
+        field2style: Dict[str, Dict[str, str]],
+        tooltip_class="tooltip",
+        tooltiptext_class="tooltiptext",
+    ):
+        """:param field2style: The annotation column to highlight with its
+            associated style, for example:
+                {
+                    'car_model_field': {
+                        'year': {'background-color': 'lightyellow'},
+                        'make': {'background-color': 'lightgreen'},
+                        'model': {'background-color': 'cyan'},
+                        'style': {'background-color': 'magenta'},
+                    },
+                }
+        :param tooltip_class: The css tooltip class
+        :param tooltiptext_class: The css tooltiptext class
+        """
+        self.field2style = field2style
+        self.tooltip_class = tooltip_class
+        self.tooltiptext_class = tooltiptext_class
+    def highlight(
+        self,
+        text_obj: AnnotatedText,
+    ) -> str:
+        """Return an html string with the given fields (annotation columns)
+        highlighted with the associated styles.
+        :param text_obj: The annotated text to markup
+        """
+        result = ["<p>"]
+        anns = text_obj.annotations
+        an_df = anns.df
+        for field, styles in self.field2style.items():
+            # NOTE: the following line relies on an_df already being sorted
+            df = an_df[an_df[field].isin(styles)]
+            cur_pos = 0
+            for _loc, row in df.iterrows():
+                enttype = row[field]
+                style = styles[enttype]
+                style_str = " ".join([f"{key}: {value};" for key, value in style.items()])
+                start_pos = row[anns.metadata.start_pos_col]
+                if start_pos > cur_pos:
+                    result.append(text_obj.text[cur_pos:start_pos])
+                end_pos = row[anns.metadata.end_pos_col]
+                result.append(f'<mark class="{self.tooltip_class}" style="{style_str}">')
+                result.append(text_obj.text[start_pos:end_pos])
+                result.append(f'<span class="{self.tooltiptext_class}">{enttype}</span>')
+                result.append("</mark>")
+                cur_pos = end_pos
+        result.append("</p>")
+        return "\n".join(result)
+class AnnotatorKernel(ABC):
+    """Class for encapsulating core annotation logic for multiple annotators"""
+    @property
+    @abstractmethod
+    def annotators(self) -> List[EntityAnnotator]:
+        """Get the entity annotators"""
+        raise NotImplementedError
+    @abstractmethod
+    def annotate_input(self, text_obj: AnnotatedText) -> Annotations:
+        """Execute all annotations on the text_obj"""
+        raise NotImplementedError
+class CompoundAnnotator(Annotator):
+    """Class to apply a series of annotators through an AnnotatorKernel"""
+    def __init__(
+        self,
+        kernel: AnnotatorKernel,
+        name: str = "entity",
+    ):
+        """Initialize with the annotators and this extractor's name.
+        :param kernel: The annotations kernel to use
+        :param name: The name of this information extractor to be the
+            annotations base column name for <name>_num and <name>_recsnum
+        """
+        super().__init__(name=name)
+        self.kernel = kernel
+    def annotate_input(
+        self,
+        text_obj: AnnotatedText,
+        reset: bool = True,
+        **kwargs,
+    ) -> Annotations:
+        """Annotate the text.
+        :param text_obj: The AnnotatedText object to annotate.
+        :param reset: When True, reset and rebuild any existing annotations
+        :return: The annotations added to the text_obj
+        """
+        if reset:
+            text_obj.annotations.clear()
+        annots = self.kernel.annotate_input(text_obj)
+        return annots
+    def get_html_highlighted_text(
+        self,
+        text_obj: AnnotatedText,
+        annotator_names: List[str] = None,
+    ) -> str:
+        """Get html-hilighted text for the identified input's annotations
+        from the given annotators (or all).
+        :param text_obj: The input text to highlight
+        :param annotator_names: The subset of annotators to highlight.
+        """
+        if annotator_names is None:
+            annotator_names = [ann.name for ann in self.kernel.annotators]
+        hfs = {
+            ann.name: ann.highlight_fieldstyles
+            for ann in self.kernel.annotators
+            if ann.name in annotator_names
+        }
+        hh = HtmlHighlighter(hfs)
+        return hh.highlight(text_obj)