kodexa 7.0.10350737552__py3-none-any.whl → 7.0.10402571165__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kodexa/dataclasses/__init__.py +339 -0
- kodexa/dataclasses/llm_data_class.j2 +21 -0
- kodexa/utils/__init__.py +178 -0
- {kodexa-7.0.10350737552.dist-info → kodexa-7.0.10402571165.dist-info}/METADATA +1 -1
- {kodexa-7.0.10350737552.dist-info → kodexa-7.0.10402571165.dist-info}/RECORD +7 -4
- {kodexa-7.0.10350737552.dist-info → kodexa-7.0.10402571165.dist-info}/LICENSE +0 -0
- {kodexa-7.0.10350737552.dist-info → kodexa-7.0.10402571165.dist-info}/WHEEL +0 -0
@@ -0,0 +1,339 @@
|
|
1
|
+
import logging
|
2
|
+
import os
|
3
|
+
import uuid
|
4
|
+
from typing import Optional, List
|
5
|
+
|
6
|
+
import jinja2
|
7
|
+
from pydantic import BaseModel
|
8
|
+
|
9
|
+
from kodexa import ContentNode
|
10
|
+
from kodexa.model.model import Tag
|
11
|
+
from kodexa.model.objects import ContentException, Taxon, Taxonomy, Assistant
|
12
|
+
from kodexa.utils import taxon_to_property_name, taxon_to_class_name, taxon_to_group_path, snake_to_camel, \
|
13
|
+
to_snake
|
14
|
+
|
15
|
+
logger = logging.getLogger()
|
16
|
+
|
17
|
+
|
18
|
+
class LLMDataAttribute(BaseModel):
    """
    Holds a single attribute value returned by the LLM so it can be mapped
    back onto the Kodexa document.

    We use this as a base building block for classes that are generated to
    align with a taxonomy.
    """

    value: Optional[str] = None
    line_ids: Optional[list[str]] = None
    taxon_path: Optional[str] = None
    data_type: Optional[str] = None
    value_path: Optional[str] = None
    normalized_text: Optional[str] = None
    node_uuid_list: Optional[List[int]] = None
    tag_uuid: Optional[str] = None
    exceptions: Optional[list[ContentException]] = None

    def create_exception(
        self,
        exception_type_id: str,
        exception_type: str,
        normalized_text: str,
        message: str,
        exception_detail: str,
    ):
        """
        Record a content exception against this attribute.

        Args:
            exception_type_id: identifier of the exception type (currently unused)
            exception_type: the type of the exception
            normalized_text: the normalized text involved (currently unused)
            message: a human-readable message
            exception_detail: detailed description of the exception
        """
        content_exception = ContentException(
            exception_type=exception_type,
            # Keyword aligned with LLMDataObject.create_exception; apply_labels
            # also reads exception.exception_details (was "exception_detail")
            exception_details=exception_detail,
            message=message,
            tag_uuid=self.tag_uuid,
        )
        # The field defaults to None, so initialize the list lazily before
        # appending (previously this raised AttributeError when unset)
        if self.exceptions is None:
            self.exceptions = []
        self.exceptions.append(content_exception)
|
50
|
+
|
51
|
+
|
52
|
+
class LLMDataObject(BaseModel):
    """
    A class to represent a LLM (Large Language Model) data object.

    ...

    Attributes
    ----------
    group_uuid : str, optional
        A unique identifier for the group, by default None
    cell_index : int, optional
        The index of the cell which is really the row, by default 0
    exceptions : list[ContentException], optional
        Content exceptions recorded against this data object, by default None

    Methods
    -------
    __init__(self, group_uuid: str = None, cell_index: int = 0):
        Initializes the LLMDataObject with an optional group UUID and cell index.
    apply_labels(self, document: KodexaDocumentLLMWrapper, parent_group_uuid: str = None):
        Applies labels to the document if it exists.
    """

    group_uuid: Optional[str] = None
    cell_index: int = 0
    exceptions: Optional[list[ContentException]] = None

    class Config:
        arbitrary_types_allowed = True

    def __init__(self, group_uuid: str = None, cell_index: int = 0):
        """
        Initializes the LLMDataObject, generating a fresh group UUID when
        none is supplied.
        """
        super().__init__()
        self.cell_index = cell_index
        # Every data object belongs to a group; create one when the caller
        # did not hand us an existing group UUID
        if group_uuid is None:
            self.group_uuid = str(uuid.uuid4())
        else:
            self.group_uuid = group_uuid

    def create_exception(
        self,
        exception_type_id: str,
        exception_type: str,
        message: str,
        exception_detail: str,
        severity: str = "ERROR",
    ):
        """
        Record a content exception against this data object.

        Args:
            exception_type_id: identifier of the exception type (currently unused)
            exception_type: the type of the exception
            message: a human-readable message
            exception_detail: detailed description of the exception
            severity: the severity of the exception, by default "ERROR"
        """
        content_exception = ContentException(
            exception_type=exception_type,
            exception_details=exception_detail,
            message=message,
            group_uuid=self.group_uuid,
            severity=severity,
        )
        # The field defaults to None, so initialize it lazily
        if self.exceptions is None:
            self.exceptions = []

        self.exceptions.append(content_exception)

    def apply_labels(
        self, document: "KodexaDocumentLLMWrapper", parent_group_uuid: str = None,
        assistant: Optional["Assistant"] = None
    ):
        """
        Applies labels to the document if it exists.

        If a document has been assigned to the LLMDataObject, it calls the
        apply_labels method of the document with the current LLMDataObject and
        the parent group uuid.

        Parameters
        ----------
        document : KodexaDocumentLLMWrapper
            The Kodexa document LLM wrapper
        parent_group_uuid : str, optional
            A unique identifier for the parent group, by default None
        assistant : Assistant, optional
            The assistant that owns the applied tags, by default None
        """

        # Lets make sure we add all the content exceptions
        if self.exceptions is not None:
            for exception in self.exceptions:
                # We have two types of exception, one in the API and one in the
                # document
                from kodexa.model import ContentException as KodexaContentException
                internal_exception = KodexaContentException(
                    exception_type=exception.exception_type,
                    message=exception.message,
                    exception_details=exception.exception_details,
                    severity=exception.severity,
                    group_uuid=exception.group_uuid,
                    tag_uuid=exception.tag_uuid,
                )
                document.doc.add_exception(internal_exception)

        # Let's go through this data object and find all the attributes that have a value
        # then we will apply the labels to the document
        for field in self.__fields__:
            logger.info(f"Processing field {field}")
            value = getattr(self, field)

            if isinstance(value, list):
                logger.info(f"Processing as a list {value}")
                for item in value:
                    self.process_child(item, document, parent_group_uuid, assistant)
            else:
                logger.info(f"Processing as a single value {value}")
                self.process_child(value, document, parent_group_uuid, assistant)

    def process_child(self, value, document, parent_group_uuid, assistant):
        """
        Apply labels for a single child of this data object: either an
        LLMDataAttribute (tag its nodes) or a nested LLMDataObject (recurse).
        """
        logger.info(f"Processing child {value}")
        if isinstance(value, LLMDataAttribute):
            # We need to add the label to the document for this attribute

            tag = value.taxon_path

            # TODO need to work out why we are missing them?
            logger.info(f"Value: {value.normalized_text}, node_uuid_list: {value.node_uuid_list}")
            if value.node_uuid_list is None:
                value.node_uuid_list = value.line_ids
            logger.info(f"Applying label {tag} to node UUIDs {value.node_uuid_list}")

            if isinstance(value.node_uuid_list, int):
                value.node_uuid_list = [value.node_uuid_list]

            # Resolve node UUIDs to nodes, skipping "0" placeholder IDs
            nodes_to_label: list[ContentNode] = (
                [
                    document.doc.get_persistence().get_node(node_uuid)
                    for node_uuid in value.node_uuid_list if (node_uuid != '0' and node_uuid != 0)
                ]
                if value.node_uuid_list
                else []
            )

            tag_uuid = str(uuid.uuid4())
            for node in nodes_to_label:
                if node:
                    if not node.has_tag(tag):
                        try:
                            # A DERIVED value was not read directly off the page
                            confidence = -1 if value.value_path == 'DERIVED' else 1
                            node.tag(
                                tag_to_apply=tag,
                                value=value.normalized_text,
                                tag_uuid=tag_uuid,
                                cell_index=self.cell_index,
                                selector="//word",
                                confidence=confidence,
                                group_uuid=self.group_uuid,
                                parent_group_uuid=parent_group_uuid,
                                owner_uri=f"assistant://{assistant.id}" if assistant else "model://taxonomy-llm",
                            )
                        except Exception:
                            # Narrowed from a bare except; log the traceback too
                            logger.exception(f"Error tagging node {node.uuid} with tag {tag}")
                    else:
                        # Node already carries this tag: append another Tag
                        # feature value instead of re-tagging
                        current_value = node.get_feature_values("tag", tag)
                        new_tag = Tag(cell_index=self.cell_index,
                                      uuid=tag_uuid,
                                      value=value.normalized_text,
                                      confidence=-1,
                                      group_uuid=self.group_uuid,
                                      parent_group_uuid=parent_group_uuid,
                                      owner_uri=f"assistant://{assistant.id}" if assistant else "model://taxonomy-llm")
                        current_value.append(new_tag)
                        node.remove_feature("tag", tag)
                        node.add_feature("tag", tag, current_value, single=False)

            logger.info(f"Applied label {tag} to {len(nodes_to_label)} nodes")
        if isinstance(value, LLMDataObject):
            # Recurse into nested data objects, propagating the assistant so
            # nested tags carry the same owner URI (previously dropped)
            value.apply_labels(document, parent_group_uuid=self.group_uuid, assistant=assistant)
|
224
|
+
|
225
|
+
|
226
|
+
def find_nearby_word_to_tag(node, tag):
    """
    Find the word closest to *node* (by line distance on the same page) that
    does not yet carry *tag*.

    Args:
        node: a line node, or a node whose parent line can be selected
        tag: the tag name to check for

    Returns:
        The first untagged word on the nearest line, or None if every word
        on the page already carries the tag.
    """
    logger.info(f"find_nearby_word_to_tag: {tag}")
    # Create an ordered list of the lines on the page, sorted by distance from the target node
    target_line_index = node.index if node.node_type == 'line' else node.select('parent::line')[0].index
    all_lines_on_page = node.select('parent::page')[0].select('//line')

    # Leftover debug print() replaced with a debug-level log entry
    logger.debug(f"target_line_index={target_line_index}, lines_on_page={len(all_lines_on_page)}")
    sorted_lines = sorted(all_lines_on_page, key=lambda line: abs(target_line_index - line.index))
    # Find the first word that isn't yet tagged by this tag
    for line in sorted_lines:
        for word in line.select('//word'):
            if not word.has_tag(tag):
                return word
    return None
|
240
|
+
|
241
|
+
|
242
|
+
def get_template_env():
    """Get the Jinja2 template environment.

    Builds an environment whose loader looks for templates first in the
    current working directory, then in this package's directory (where
    llm_data_class.j2 ships), and exposes the naming helper functions to
    templates as globals.

    Returns:
        jinja2.Environment: the configured template environment
    """
    package_location = os.path.dirname(os.path.abspath(__file__))
    template_loader = jinja2.FileSystemLoader([os.getcwd(), package_location])
    # NOTE(review): autoescape=True HTML-escapes rendered variables; since
    # these templates generate Python source, confirm escaping is wanted here
    env = jinja2.Environment(loader=template_loader, autoescape=True)
    env.globals["snake_to_camel"] = snake_to_camel
    env.globals["to_snake"] = to_snake
    env.globals['taxon_to_property_name'] = taxon_to_property_name
    env.globals['taxon_to_class_name'] = taxon_to_class_name
    env.globals['taxon_to_group_path'] = taxon_to_group_path
    return env
|
261
|
+
|
262
|
+
|
263
|
+
def write_template(template, output_location, output_filename, context):
    """
    Render the named template with *context* and write the result to a file.

    The output directory is created if it does not already exist.

    Args:
        template: the name of the template
        output_location: the location to write the output
        output_filename: the name of the output file
        context: the context passed to the template
    """
    rendered = get_template_env().get_template(template).render(context)

    from pathlib import Path

    output_dir = Path(output_location)
    output_dir.mkdir(parents=True, exist_ok=True)
    # Explicit encoding keeps generated files stable across platforms
    (output_dir / output_filename).write_text(rendered, encoding="utf-8")
|
281
|
+
|
282
|
+
|
283
|
+
def build_llm_data_classes_for_taxonomy(
    taxonomy: Taxonomy, output_dir: str, output_file: str, use_labels: bool = False
):
    """
    This function will use jinja templates to build a set of classes that represent a taxonomy,
    these classes will extend the LLMData class and therefore have the ability to take an LLM
    response and map it to the Kodexa Document identifying and labeling the nodes as needed

    :param taxonomy: the taxonomy to generate classes for
    :param output_dir: the directory to write the generated module into
    :param output_file: the filename of the generated module
    :param use_labels: whether the templates should use labels
    :return:
    """

    # Assign a full slash-separated path to every taxon from its ancestry
    def set_path(taxon: Taxon, parent_path: Optional[str] = None):
        taxon.path = taxon.name if parent_path is None else parent_path + "/" + taxon.name
        for child in taxon.children or []:
            set_path(child, taxon.path)

    for root_taxon in taxonomy.taxons:
        set_path(root_taxon, None)

    def collect_group_taxons(taxons: list[Taxon]) -> list[Taxon]:
        """
        Recursively collects all group taxons from a list of taxons.

        Args:
            taxons (list[Taxon]): The list of taxons to collect group taxons from.

        Returns:
            list[Taxon]: A list of group taxons.

        """
        found = []
        for candidate in taxons:
            if candidate.group:
                found.append(candidate)
            if candidate.children:
                found.extend(collect_group_taxons(candidate.children))
        return found

    group_taxons = collect_group_taxons(taxonomy.taxons)
    # Reverse so leaf groups are emitted before the classes that reference them
    group_taxons.reverse()
    write_template(
        "llm_data_class.j2",
        output_dir,
        output_file,
        {"taxons": group_taxons, "use_labels": use_labels},
    )

    # Lets log what we created
    logger.info(f"Created the following classes in {output_dir}/{output_file}")
    with open(f"{output_dir}/{output_file}", "r") as file:
        logger.info(file.read())
|
@@ -0,0 +1,21 @@
|
|
1
|
+
from typing import Optional, List
|
2
|
+
from kodexa_langchain.data_class import LLMDataAttribute, LLMDataObject
|
3
|
+
from kodexa_langchain.llm import deserialize_llm_data
|
4
|
+
from kodexa_langchain.document import KodexaDocumentLLMWrapper
|
5
|
+
|
6
|
+
{%- for taxon in taxons %}
|
7
|
+
|
8
|
+
class {{ taxon_to_class_name(taxon) }}(LLMDataObject):
|
9
|
+
|
10
|
+
{%- for child_taxon in taxon.children %}{%- if child_taxon.group %}
|
11
|
+
{{ taxon_to_property_name(child_taxon) }}: Optional[List[{{ taxon_to_class_name(child_taxon) }}]] = None
|
12
|
+
{%- else %}
|
13
|
+
{{ taxon_to_property_name(child_taxon) }}: Optional[LLMDataAttribute] = LLMDataAttribute(taxon_path='{{ child_taxon.path }}', data_type='{{ child_taxon.taxon_type.title() }}', value_path='{{ child_taxon.value_path.title() }}')
|
14
|
+
{%- endif %}
|
15
|
+
{%- endfor %}
|
16
|
+
|
17
|
+
def __init__(self, data: dict, document: Optional[KodexaDocumentLLMWrapper] = None, group_uuid=None, cell_index: int = 0, taxon=None, extraction_context=None):
|
18
|
+
super().__init__(group_uuid, cell_index)
|
19
|
+
deserialize_llm_data(self, data, document, f'{{ taxon_to_group_path(taxon) }}', group_uuid, taxon, extraction_context)
|
20
|
+
|
21
|
+
{%- endfor %}
|
kodexa/utils/__init__.py
ADDED
@@ -0,0 +1,178 @@
|
|
1
|
+
"""
|
2
|
+
This module provides a set of functions to manipulate and convert taxonomy objects for use within a data model.
|
3
|
+
It includes functions to convert taxonomy names to various naming conventions such as property names, class names,
|
4
|
+
and group paths. Additionally, it offers utility functions for string manipulation, like converting snake case strings
|
5
|
+
to camel case or title case, making string names safe for use as attribute names, converting strings to hexadecimal
|
6
|
+
color codes, estimating the token count of a text, and recursively finding all non-abstract subclasses of a given class.
|
7
|
+
"""
|
8
|
+
|
9
|
+
import keyword
|
10
|
+
import logging
|
11
|
+
import re
|
12
|
+
from inspect import isabstract
|
13
|
+
|
14
|
+
from kodexa.model.objects import Taxon
|
15
|
+
|
16
|
+
logger = logging.getLogger(__name__)
|
17
|
+
|
18
|
+
|
19
|
+
def taxon_to_property_name(taxon: Taxon):
    """
    Derive a snake_case property name from the taxon's label.

    The derived name is also recorded on the taxon as ``external_name``.
    """
    property_name = to_snake(safe_name(taxon.label))
    taxon.external_name = property_name
    return property_name
|
26
|
+
|
27
|
+
|
28
|
+
def taxon_to_class_name(taxon: Taxon):
    """
    Derive a CamelCase class name from the taxon's label.

    The derived name is also recorded on the taxon as ``external_name``.
    """
    class_name = snake_to_camel(safe_name(taxon.label))
    taxon.external_name = class_name
    return class_name
|
35
|
+
|
36
|
+
|
37
|
+
def taxon_to_group_path(taxon: Taxon):
|
38
|
+
# We need to get the "group_name" from one of the taxons
|
39
|
+
# Which is the first part of the taxon path
|
40
|
+
return taxon.path.split('/')[0]
|
41
|
+
|
42
|
+
|
43
|
+
def snake_to_camel(snake_str):
    """Convert a snake_case (or space-separated) string to CamelCase."""
    # Spaces act as word separators, exactly like underscores
    words = snake_str.replace(" ", "_").split("_")
    # Title-case every word and glue them back together
    return "".join(word.strip().title() for word in words)
|
49
|
+
|
50
|
+
|
51
|
+
def to_snake(base_str):
    """Convert a string with spaces, hyphens, or underscores to snake_case."""
    words = base_str.replace(" ", "_").replace("-", "_").split("_")

    # Identifiers cannot start with a digit, so a purely-numeric first word
    # gets an "n_" prefix (checked before lowercasing, as in the original)
    if words[0].isdigit():
        words[0] = "n_" + words[0]

    return "_".join(word.strip().lower() for word in words)
|
60
|
+
|
61
|
+
|
62
|
+
def make_safe_attribute_name(name):
    """
    Make *name* usable as a Python attribute name.

    Invalid characters are replaced with underscores, a leading digit gets an
    underscore prefix, and Python keywords get an underscore suffix.

    :param name: the candidate attribute name
    :return: a safe attribute name
    """
    # Replace invalid characters (anything not a letter, digit, or underscore)
    candidate = ''.join(ch if ch.isalnum() or ch == '_' else '_' for ch in name)

    # Guard the empty string (previously raised IndexError) and prepend an
    # underscore when the name starts with a digit
    if not candidate or candidate[0].isdigit():
        candidate = '_' + candidate

    # Append an underscore if the name is a Python keyword
    if keyword.iskeyword(candidate):
        candidate += '_'

    return candidate
|
75
|
+
|
76
|
+
|
77
|
+
def safe_name(string):
    """
    Removes invalid characters from a string, replaces spaces with underscores,
    removes leading/trailing underscores and hyphens, and makes the string
    lowercase. If the resulting string starts with a number, it prefixes it
    with "n_".

    :param string: The string to be transformed.
    :return: The transformed string.
    """
    # Trim surrounding whitespace first
    string = string.strip()

    # Remove invalid characters (anything not a word char, whitespace, or hyphen)
    string = re.sub(r"[^\w\s-]", "", string)

    # Replace whitespace runs with underscores
    string = re.sub(r"\s+", "_", string)

    # Remove leading/trailing underscores and hyphens
    string = string.strip("_-")

    # Make it lowercase
    string = string.lower()

    # Guard strings that are empty after cleanup — previously this raised
    # IndexError on inputs like "!!!"
    if not string:
        return "_"

    if string[0].isdigit():
        # can't have things starting with a number
        string = "n_" + string

    # make sure we don't collide with a python keyword
    return make_safe_attribute_name(string)
|
107
|
+
|
108
|
+
|
109
|
+
def snake_case_to_title_case(snake_case_string):
    """Convert a snake_case string to space-separated Title Case."""
    return " ".join(part.capitalize() for part in snake_case_string.split("_"))
|
113
|
+
|
114
|
+
|
115
|
+
def string_to_hex_color(string):
    """
    Map a string to a deterministic (per-process) hex color code.

    Built on hash(), so the color may differ between interpreter runs when
    string hash randomization is enabled.
    """
    # Ignore surrounding whitespace so padded inputs map to the same color
    normalized = string.strip()

    # Keep the low 24 bits of the hash as an RGB color code
    return "#{:06x}".format(hash(normalized) & 0xFFFFFF)
|
126
|
+
|
127
|
+
|
128
|
+
def get_is_square_bracket_first(string):
    """
    Determine whether "[" appears before "{" in *string*.

    Returns True if "[" comes first, False if "{" comes first, and None if
    neither character is present.
    """
    square_pos = string.find("[")
    curly_pos = string.find("{")

    # Guard clauses for the absent cases, then compare positions
    if square_pos == -1 and curly_pos == -1:
        return None
    if square_pos == -1:
        return False
    if curly_pos == -1:
        return True
    return square_pos < curly_pos
|
145
|
+
|
146
|
+
|
147
|
+
def cosine_similarity(v1, v2):
    """
    Compute the cosine similarity between two vectors.

    Returns 0.0 when either vector has zero magnitude (previously this raised
    ZeroDivisionError).

    :param v1: the first vector (sequence of numbers)
    :param v2: the second vector (sequence of numbers)
    :return: the cosine similarity, in [-1.0, 1.0]
    """
    dot_product = sum(a * b for a, b in zip(v1, v2))
    norm_a = sum(a * a for a in v1) ** 0.5
    norm_b = sum(b * b for b in v2) ** 0.5
    # A zero vector has no direction; define its similarity as 0.0 rather
    # than dividing by zero
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return dot_product / (norm_a * norm_b)
|
153
|
+
|
154
|
+
|
155
|
+
def estimate_token_count(text, avg_token_length=1):
    """
    Roughly estimate the number of tokens in *text*.

    Spaces are excluded; the remaining character count is divided by the
    average token length.

    :param text: the text to estimate
    :param avg_token_length: average characters per token (default 1)
    :return: the estimated token count, rounded to an int
    """
    meaningful_chars = len(text.replace(" ", ""))
    return round(meaningful_chars / avg_token_length)
|
161
|
+
|
162
|
+
|
163
|
+
def get_all_concrete_subclasses(cls):
    """
    Recursively find all non-abstract subclasses of a given class.

    Parameters:
        cls (class): The parent class to find subclasses for.

    Returns:
        list: A list of all non-abstract subclasses of cls.
    """
    concrete_subclasses = []
    for subclass in cls.__subclasses__():
        if not isabstract(subclass):
            concrete_subclasses.append(subclass)
        # Always descend: concrete classes may sit below abstract
        # intermediates, which must not cut off the subtree
        concrete_subclasses.extend(get_all_concrete_subclasses(subclass))
    return concrete_subclasses
|
@@ -3,6 +3,8 @@ kodexa/assistant/__init__.py,sha256=nlXm_YnV_50hgn0TIT2Fkc2fQ-86OjmctY_j8My9nc4,
|
|
3
3
|
kodexa/assistant/assistant.py,sha256=5KFdbqFSLIZJyDRyZdpcfr448fT-CW4JhYu9A6B9DGY,14663
|
4
4
|
kodexa/connectors/__init__.py,sha256=WF6G_MUeU32TlKSUKkpNoNX7dq8iBPliFMep4E8BmZc,328
|
5
5
|
kodexa/connectors/connectors.py,sha256=FpUZDkSyHld2b9eYRuVOWzaFtuGoaRuPXXicJB7THbc,10413
|
6
|
+
kodexa/dataclasses/__init__.py,sha256=gM1meK2rltv3OO9oJGtuLG7It0L-JS8rMmSAg44Wbp8,12815
|
7
|
+
kodexa/dataclasses/llm_data_class.j2,sha256=1l30_Men0_cPEd6FCzbwsrWUi1QZidNEFXR06WudYlk,1127
|
6
8
|
kodexa/model/__init__.py,sha256=rtLXYJBxB-rnukhslN9rlqoB3--1H3253HyHGbD_Gc8,796
|
7
9
|
kodexa/model/base.py,sha256=CaZK8nMhT1LdCpt4aLhebJGcorjq9qRID1FjnXnP14M,521
|
8
10
|
kodexa/model/entities/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -39,7 +41,8 @@ kodexa/testing/test_components.py,sha256=g5lP-GY0nTHuH5cIEw45vIejEeBaWkPKQGHL36j
|
|
39
41
|
kodexa/testing/test_utils.py,sha256=DrLCkHxdb6AbZ-X3WmTMbQmnVIm55VEBL8MjtUK9POs,14021
|
40
42
|
kodexa/training/__init__.py,sha256=xs2L62YpRkIRfslQwtQZ5Yxjhm7sLzX2TrVX6EuBnZQ,52
|
41
43
|
kodexa/training/train_utils.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
42
|
-
kodexa
|
43
|
-
kodexa-7.0.
|
44
|
-
kodexa-7.0.
|
45
|
-
kodexa-7.0.
|
44
|
+
kodexa/utils/__init__.py,sha256=Pnim1o9_db5YEnNvDTxpM7HG-qTlL6n8JwFwOafU9wo,5928
|
45
|
+
kodexa-7.0.10402571165.dist-info/LICENSE,sha256=WNHhf_5RCaeuKWyq_K39vmp9F28LxKsB4SpomwSZ2L0,11357
|
46
|
+
kodexa-7.0.10402571165.dist-info/METADATA,sha256=_A6adX98QOX0rTPw7Mq_5Ev-ssdlQAXeTorZJx9Eyh4,3533
|
47
|
+
kodexa-7.0.10402571165.dist-info/WHEEL,sha256=Zb28QaM1gQi8f4VCBhsUklF61CTlNYfs9YAZn-TOGFk,88
|
48
|
+
kodexa-7.0.10402571165.dist-info/RECORD,,
|
File without changes
|
File without changes
|