data-science-document-ai 1.13.0__py3-none-any.whl → 1.56.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. {data_science_document_ai-1.13.0.dist-info → data_science_document_ai-1.56.1.dist-info}/METADATA +7 -2
  2. data_science_document_ai-1.56.1.dist-info/RECORD +60 -0
  3. {data_science_document_ai-1.13.0.dist-info → data_science_document_ai-1.56.1.dist-info}/WHEEL +1 -1
  4. src/constants.py +42 -12
  5. src/constants_sandbox.py +2 -22
  6. src/docai.py +18 -7
  7. src/docai_processor_config.yaml +0 -64
  8. src/excel_processing.py +34 -15
  9. src/io.py +74 -6
  10. src/llm.py +12 -34
  11. src/pdf_processing.py +228 -78
  12. src/postprocessing/common.py +495 -618
  13. src/postprocessing/postprocess_partner_invoice.py +383 -27
  14. src/prompts/library/arrivalNotice/other/placeholders.json +70 -0
  15. src/prompts/library/arrivalNotice/other/prompt.txt +40 -0
  16. src/prompts/library/bookingConfirmation/evergreen/placeholders.json +17 -17
  17. src/prompts/library/bookingConfirmation/evergreen/prompt.txt +1 -0
  18. src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +18 -18
  19. src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +1 -1
  20. src/prompts/library/bookingConfirmation/maersk/placeholders.json +17 -17
  21. src/prompts/library/bookingConfirmation/maersk/prompt.txt +1 -1
  22. src/prompts/library/bookingConfirmation/msc/placeholders.json +17 -17
  23. src/prompts/library/bookingConfirmation/msc/prompt.txt +1 -1
  24. src/prompts/library/bookingConfirmation/oocl/placeholders.json +17 -17
  25. src/prompts/library/bookingConfirmation/oocl/prompt.txt +3 -1
  26. src/prompts/library/bookingConfirmation/other/placeholders.json +17 -17
  27. src/prompts/library/bookingConfirmation/other/prompt.txt +1 -1
  28. src/prompts/library/bookingConfirmation/yangming/placeholders.json +17 -17
  29. src/prompts/library/bookingConfirmation/yangming/prompt.txt +1 -1
  30. src/prompts/library/bundeskasse/other/placeholders.json +113 -0
  31. src/prompts/library/bundeskasse/other/prompt.txt +48 -0
  32. src/prompts/library/commercialInvoice/other/placeholders.json +125 -0
  33. src/prompts/library/commercialInvoice/other/prompt.txt +2 -1
  34. src/prompts/library/customsAssessment/other/placeholders.json +67 -16
  35. src/prompts/library/customsAssessment/other/prompt.txt +24 -37
  36. src/prompts/library/customsInvoice/other/placeholders.json +205 -0
  37. src/prompts/library/customsInvoice/other/prompt.txt +105 -0
  38. src/prompts/library/deliveryOrder/other/placeholders.json +79 -28
  39. src/prompts/library/deliveryOrder/other/prompt.txt +26 -40
  40. src/prompts/library/draftMbl/other/placeholders.json +33 -33
  41. src/prompts/library/draftMbl/other/prompt.txt +34 -44
  42. src/prompts/library/finalMbL/other/placeholders.json +34 -34
  43. src/prompts/library/finalMbL/other/prompt.txt +34 -44
  44. src/prompts/library/packingList/other/placeholders.json +98 -0
  45. src/prompts/library/packingList/other/prompt.txt +1 -1
  46. src/prompts/library/partnerInvoice/other/placeholders.json +165 -45
  47. src/prompts/library/partnerInvoice/other/prompt.txt +82 -44
  48. src/prompts/library/preprocessing/carrier/placeholders.json +0 -16
  49. src/prompts/library/shippingInstruction/other/placeholders.json +115 -0
  50. src/prompts/library/shippingInstruction/other/prompt.txt +28 -15
  51. src/setup.py +73 -63
  52. src/utils.py +207 -30
  53. data_science_document_ai-1.13.0.dist-info/RECORD +0 -55
  54. src/prompts/library/draftMbl/hapag-lloyd/prompt.txt +0 -44
  55. src/prompts/library/draftMbl/maersk/prompt.txt +0 -17
  56. src/prompts/library/finalMbL/hapag-lloyd/prompt.txt +0 -44
  57. src/prompts/library/finalMbL/maersk/prompt.txt +0 -17
src/postprocessing/common.py
@@ -1,414 +1,22 @@
 import asyncio
-import datetime
 import json
 import os
 import re
 from datetime import timezone
 
-import numpy as np
 import pandas as pd
-import requests
-from vertexai.preview.language_models import TextEmbeddingModel
+from nltk.corpus import stopwords
+from rapidfuzz import process
 
 from src.constants import formatting_rules
-from src.io import get_storage_client, logger
+from src.io import logger
+from src.postprocessing.postprocess_partner_invoice import process_partner_invoice
 from src.prompts.prompt_library import prompt_library
-from src.tms import call_tms, set_tms_service_token
+from src.utils import batch_fetch_all_mappings, get_tms_mappings
 
 tms_domain = os.environ["TMS_DOMAIN"]
 
 
-class EmbeddingsManager:  # noqa: D101
-    def __init__(self, params):  # noqa: D107
-        self.params = params
-        self.embeddings_dict = {}
-        self.embed_model = setup_embed_model()
-        self.bucket = self.get_bucket_storage()
-        self.embedding_folder = self.embed_model._model_id
-        self.embedding_dimension = 768  # TODO: to be reduced
-
-    def get_bucket_storage(self):
-        """
-        Retrieve the bucket storage object.
-
-        Returns:
-            The bucket storage object.
-        """
-        params = self.params
-        storage_client = get_storage_client(params)
-        bucket = storage_client.bucket(params["doc_ai_bucket_name"])
-        return bucket
-
-    def _find_most_similar_option(self, input_string, option_ids, option_embeddings):
-        """
-        Find the most similar option to the given input string based on embeddings.
-
-        Args:
-            model: The model used for generating embeddings.
-            input_string (str): The input string to find the most similar option for.
-            option_ids (list): The list of option IDs.
-            option_embeddings (np.ndarray): The embeddings of the options.
-
-        Returns:
-            The ID of the most similar option.
-        """
-        try:
-            input_embedding = self.embed_model.get_embeddings(
-                [input_string], output_dimensionality=self.embedding_dimension
-            )[0].values
-            similarities = np.dot(option_embeddings, input_embedding)
-            idx = np.argmax(similarities)
-            return option_ids[idx]
-        except Exception as e:
-            logger.error(f"Embeddings error: {e}")
-            return None
-
-    def load_embeddings(self):
-        """
-        Load embeddings for container types, ports, and terminals.
-
-        Returns:
-            None
-        """
-        for data_field in [
-            "container_types",
-            "ports",
-            "terminals",
-            "depots",
-            "item_codes",
-        ]:
-            self.embeddings_dict[data_field] = load_embed_by_data_field(
-                self.bucket,
-                f"{self.embedding_folder}/{data_field}/output",
-                self.embedding_dimension,
-            )
-
-    async def update_embeddings(self):
-        """
-        Update the embeddings dictionary.
-
-        Returns:
-            dict: The updated embeddings dictionary with the following keys:
-                - "container_types": A tuple containing the container types and their embeddings.
-                - "ports": A tuple containing the ports and their embeddings.
-                - "terminals": A tuple containing the terminal IDs and their embeddings.
-        """
-        # Update embeddings dict here.
-        # Ensure this method is async if you're calling async operations.
-        set_tms_service_token()
-        (
-            container_types,
-            container_type_embeddings,
-        ) = self.setup_container_type_embeddings(
-            *self.embeddings_dict.get("container_types", ([], []))
-        )
-
-        ports, port_embeddings = self.setup_ports_embeddings(
-            *self.embeddings_dict.get("ports", ([], []))
-        )
-
-        # Setup terminal embeddings
-        # Since retrieving terminal attributes requires calling TMS' api to extract terminals by each port,
-        # we only do it for new ports.
-        prev_port_ids, _ = self.embeddings_dict.get("ports", ([], []))
-        added_port_ids = [port for port in ports if port not in prev_port_ids]
-        if added_port_ids:
-            terminal_ids, terminal_embeddings = self.setup_terminal_embeddings(
-                added_port_ids
-            )
-        else:
-            terminal_ids, terminal_embeddings = self.embeddings_dict["terminals"]
-
-        depot_names, depot_embeddings = self.setup_depot_embeddings(
-            *self.embeddings_dict.get("depots", ([], []))
-        )
-
-        item_code_names, item_code_embeddings = self.setup_item_code_embeddings(
-            *self.embeddings_dict.get("item_codes", ([], []))
-        )
-
-        self.embeddings_dict = {
-            "container_types": (container_types, container_type_embeddings),
-            "ports": (ports, port_embeddings),
-            "terminals": (terminal_ids, terminal_embeddings),
-            "depots": (depot_names, depot_embeddings),
-            "item_codes": (item_code_names, item_code_embeddings),
-        }
-        return self.embeddings_dict
-
-    def batch_embed(self, option_strings: list[dict], suffix: str):
-        """
-        Compute embeddings for a batch of option strings and uploads them to a cloud storage bucket.
-
-        Args:
-            option_strings (list): A list of option strings to compute embeddings for.
-            suffix (str): A suffix to be used in the storage path for the embeddings:
-                input & output will be stored under "{bucket}/{parent_folder}/{suffix}/"
-
-        Returns:
-            tuple: A tuple containing the option IDs and embeddings.
-        """
-        now = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
-        input_path = f"{self.embedding_folder}/{suffix}/input/{now}.jsonl"
-        blob = self.bucket.blob(input_path)
-
-        # Convert each dictionary to a JSON string and join them with newlines
-        option_strings = [
-            {**option, "task_type": "SEMANTIC_SIMILARITY", "output_dimensionality": 256}
-            for option in option_strings
-        ]
-        jsonl_string = "\n".join(json.dumps(d) for d in option_strings)
-
-        # Convert the combined string to bytes
-        jsonl_bytes = jsonl_string.encode("utf-8")
-
-        # Upload the bytes to the blob
-        blob.upload_from_string(jsonl_bytes, content_type="text/plain")
-
-        # Compute embeddings for the options
-        embedding_path = f"{self.embedding_folder}/{suffix}/output"
-        assert len(option_strings) <= 30000  # Limit for batch embedding
-        batch_resp = self.embed_model.batch_predict(
-            dataset=f"gs://{self.bucket.name}/{input_path}",  # noqa
-            destination_uri_prefix=f"gs://{self.bucket.name}/{embedding_path}",  # noqa
-        )
-
-        if batch_resp.state.name != "JOB_STATE_SUCCEEDED":
-            logger.warning(
-                f"Batch prediction job failed with state {batch_resp.state.name}"
-            )
-        else:
-            logger.info(f"Embeddings for {suffix} computed successfully.")
-
-        option_ids, option_embeddings = load_embed_by_data_field(
-            self.bucket, embedding_path, self.embedding_dimension
-        )
-        return option_ids, option_embeddings
-
-    def setup_container_type_embeddings(
-        self, computed_container_type_ids, computed_container_type_embeddings
-    ):
-        """
-        Set up container type embeddings.
-
-        Args:
-            computed_container_type_ids (list): The list of already computed container type IDs.
-            computed_container_type_embeddings (list): The list of already computed container type embeddings.
-
-        Returns:
-            tuple: A tuple containing the updated container type IDs and embeddings.
-        """
-        url = (
-            f"https://tms.forto.{tms_domain}/api/transport-units/api/types/list"  # noqa
-        )
-        resp = call_tms(requests.get, url)
-        container_types = resp.json()
-
-        container_attribute_strings = [
-            dict(
-                title=container_type["code"],
-                content=" | ".join(
-                    ["container type"]
-                    + [
-                        f"{k}: {v}"
-                        for k, v in container_type["containerAttributes"].items()
-                        if k in ["isoSizeType", "isoTypeGroup"]
-                    ]
-                    + [container_type[k] for k in ["displayName", "notes"]]
-                ),
-            )
-            for container_type in container_types
-            if container_type["isActive"]
-            and container_type["code"] not in computed_container_type_ids
-        ]
-        if not container_attribute_strings:
-            logger.info("No new container types found.")
-            return computed_container_type_ids, computed_container_type_embeddings
-
-        logger.info("Computing embeddings for container types...")
-        container_type_ids, container_type_embeddings = self.batch_embed(
-            container_attribute_strings, "container_types"
-        )
-        return container_type_ids, container_type_embeddings
-
-    def setup_ports_embeddings(self, computed_port_ids, computed_port_embeddings):
-        """
-        Set up port embeddings.
-
-        Steps:
-        - Retrieve active ports from the TMS API
-        - Compute embeddings for new tradelane-enabled ports
-        - Return ALL port IDs and embeddings.
-
-        Args:
-            computed_port_ids (list): The list of previously computed port IDs.
-            computed_port_embeddings (list): The list of previously computed port embeddings.
-
-        Returns:
-            tuple: A tuple containing ALL port IDs and embeddings.
-        """
-        url = f"https://tms.forto.{tms_domain}/api/transport-network/api/ports?pageSize=1000000&status=active"  # noqa
-        resp = call_tms(requests.get, url)
-        resp_json = resp.json()
-        if len(resp_json["data"]) != resp_json["_paging"]["totalRecords"]:
-            logger.error("Not all ports were returned.")
-
-        new_sea_ports = [
-            port
-            for port in resp_json["data"]
-            if "sea" in port["modes"] and port["id"] not in computed_port_ids
-        ]
-        if not new_sea_ports:
-            logger.info("No new ports found.")
-            return computed_port_ids, computed_port_embeddings
-
-        port_attribute_strings = [
-            dict(
-                title=port["id"],
-                content=" ".join(
-                    [
-                        "port for shipping",
-                        add_text_without_space(
-                            port["name"]
-                        ),  # for cases like QUINHON - Quinhon
-                        port["id"],
-                    ]
-                ),
-            )
-            for port in new_sea_ports
-        ]
-
-        logger.info("Computing embeddings for ports.")
-        port_ids, port_embeddings = self.batch_embed(port_attribute_strings, "ports")
-        return port_ids, port_embeddings
-
-    def setup_depot_embeddings(self, computed_depot_names, computed_depot_embeddings):
-        """
-        Set up depot embeddings.
-
-        Steps:
-        - Retrieve active depot from the TMS API
-        - Compute embeddings for new tdepot
-        - Return ALL depot names and embeddings.
-
-        Args:
-            computed_depot_names (list): The list of previously computed depot names.
-            computed_depot_embeddings (list): The list of previously computed depot embeddings.
-
-        Returns:
-            tuple: A tuple containing ALL depot names and embeddings.
-        """
-        url = f"https://tms.forto.{tms_domain}/api/transport-network/api/depots?pageSize=1000000"  # noqa
-        resp = call_tms(requests.get, url)
-        resp_json = resp.json()
-
-        new_depots = [
-            depot
-            for depot in resp_json["data"]
-            if depot["name"] not in computed_depot_names
-        ]
-        if not new_depots:
-            logger.info("No new depots found.")
-            return computed_depot_names, computed_depot_embeddings
-
-        depot_attribute_strings = [
-            dict(
-                title=depot["name"],
-                content=" | ".join(
-                    [
-                        "depot",
-                        "name - " + depot["name"],
-                        "address - " + depot["address"]["fullAddress"],
-                    ]
-                ),
-            )
-            for depot in resp_json["data"]
-        ]
-
-        logger.info("Computing embeddings for depots.")
-        depot_names, depot_embeddings = self.batch_embed(
-            depot_attribute_strings, "depots"
-        )
-        return depot_names, depot_embeddings
-
-    def setup_terminal_embeddings(self, added_port_ids):
-        """
-        Set up terminal embeddings for `added_port_ids`, using `model`, uploaded to `bucket`.
-
-        Args:
-            added_port_ids (list): A list of added port IDs.
-
-        Returns:
-            tuple: A tuple containing the ALL terminal IDs and terminal embeddings.
-                Not just for the added port IDs.
-        """
-        terminal_attibute_strings = [
-            setup_terminal_attributes(port_id) for port_id in added_port_ids
-        ]
-        terminal_attibute_strings = sum(terminal_attibute_strings, [])
-        if not terminal_attibute_strings:
-            logger.info("No new terminals found.")
-            return [], np.array([])
-
-        terminal_ids, terminal_embeddings = self.batch_embed(
-            terminal_attibute_strings, "terminals"
-        )
-        return terminal_ids, terminal_embeddings
-
-    def setup_item_code_embeddings(
-        self, computed_item_code_names, computed_item_code_embeddings
-    ):
-        """
-        Set up item_code embeddings.
-
-        Steps:
-        - Retrieve active item_code from the TMS API
-        - Compute embeddings for new titem_code
-        - Return ALL item_code names and embeddings.
-
-        Args:
-            computed_item_code_names (list): The list of previously computed item_code names.
-            computed_item_code_embeddings (list): The list of previously computed item_code embeddings.
-
-        Returns:
-            tuple: A tuple containing ALL item_code names and embeddings.
-        """
-        url = f"https://tms.forto.{tms_domain}/api/catalog/item-codes?pageSize=1000000"  # noqa
-        resp = call_tms(requests.get, url)
-        resp_json = resp.json()
-
-        new_item_codes = [
-            item_code
-            for item_code in resp_json["results"]
-            if item_code["id"] not in computed_item_code_names
-        ]
-        if not new_item_codes:
-            logger.info("No new item_codes found.")
-            return computed_item_code_names, computed_item_code_embeddings
-
-        item_code_attribute_strings = [
-            dict(title=item_code["id"], content=item_code["description"])
-            for item_code in resp_json["results"]
-        ]
-
-        logger.info("Computing embeddings for item_codes.")
-        item_code_names, item_code_embeddings = self.batch_embed(
-            item_code_attribute_strings, "item_codes"
-        )
-        return item_code_names, item_code_embeddings
-
-
-def setup_embed_model():
-    """
-    Set up and return a text embedding model.
-
-    Returns:
-        TextEmbeddingModel: The initialized text embedding model.
-    """
-    model = TextEmbeddingModel.from_pretrained("text-multilingual-embedding-002")
-    return model
-
-
 def convert_container_number(container_number):
     """
     Convert a container number to ISO standard.
@@ -423,7 +31,13 @@ def convert_container_number(container_number):
         return
     # 'FFAU2932130--FX34650895-40HC' -> 'FFAU2932130'
     match = re.findall(r"[A-Z]{4}\d{7}", container_number)
-    stripped_value = match[0] if match else None
+    stripped_value = match if match else None
+
+    # LLMs do extract all the container numbers as a list of strings
+    if stripped_value and len(stripped_value) > 1:
+        return stripped_value
+    else:
+        stripped_value = stripped_value[0] if stripped_value else None
 
     if not stripped_value:
         stripped_value = "".join(
@@ -440,6 +54,51 @@ def convert_container_number(container_number):
     return formatted_value
 
 
+def clean_invoice_number(invoice_number):
+    """Post process invoice number
+
+    Args:
+        invoice_number (str): The invoice number to be cleaned.
+
+    Returns:
+        str: The cleaned invoice number if it is valid, None otherwise.
+    """
+    if not invoice_number:
+        return
+
+    # Remove all non-alphanumeric characters
+    stripped_value = re.sub(r"[^\w]", "", invoice_number)
+
+    return stripped_value
+
+
+def clean_shipment_id(shipment_id):
+    """
+    Convert shipment_id to Forto standard.
+
+    Args:
+        shipment_id (str): The Shipment ID to be converted.
+
+    Returns:
+        str: The formatted shipment_id if it is valid, None otherwise.
+    """
+    if not shipment_id:
+        return
+    # '#S1234565@-1' -> 'S1234565'
+    # Find the pattern of a shipment ID that starts with 'S' followed by 7 to 8 digits
+    match = re.findall(r"S\d{6,8}", shipment_id)
+    stripped_value = match[0] if match else None
+
+    if not stripped_value:
+        return None
+
+    # Check if length is valid (should be either 7 or 8)
+    if len(stripped_value) not in (7, 8, 9):
+        return None
+
+    return stripped_value
+
+
 # Clean the date for date obj parse in tms formatting
 def clean_date_string(date_str):
     """Remove hours and timezone information from the date string."""
@@ -475,9 +134,12 @@ def extract_number(data_field_value):
     formatted_value: string
 
     """
+    # Remove container size pattern like 20FT, 40HC, etc from 1 x 40HC
+    value = remove_unwanted_patterns(data_field_value)
+
     formatted_value = ""
-    for c in data_field_value:
-        if c.isnumeric() or c in [",", "."]:
+    for c in value:
+        if c.isnumeric() or c in [",", ".", "-"]:
             formatted_value += c
 
     # First and last characters should not be [",", "."]
@@ -507,106 +169,6 @@ def extract_string(data_field_value):
     return formatted_value if formatted_value not in ["''", ""] else None
 
 
-def extract_google_embed_resp(prediction_string, embedding_dimension):
-    """
-    Extract relevant information from the Google Embed API response.
-
-    Args:
-        prediction_string (str): The prediction string returned by the Google Embed API.
-
-    Returns:
-        dict: A dictionary containing the extracted information.
-            - _id (str): The title of the instance.
-            - attr_text (str): The content of the instance.
-            - embedding (list): The embeddings values from the predictions.
-
-    """
-    res = json.loads(prediction_string)
-    return dict(
-        _id=res["instance"]["title"],
-        attr_text=res["instance"]["content"],
-        embedding=res["predictions"][0]["embeddings"]["values"][:embedding_dimension],
-    )
-
-
-def load_embed_by_data_field(bucket, embedding_path, embedding_dimension):
-    """
-    Load embeddings by data field from the specified bucket and embedding path.
-
-    Args:
-        bucket (Bucket): The bucket object representing the storage bucket.
-        embedding_path (str): The path to the embeddings in the bucket (different by data_field).
-
-    Returns:
-        tuple: A tuple containing the option IDs and option embeddings.
-            - option_ids (list): A list of option IDs.
-            - option_embeddings (ndarray): An array of option embeddings.
-    """
-    # Retrieve the embeddings from the output files
-    blobs = bucket.list_blobs(prefix=embedding_path)
-    all_blob_data = []
-    for blob in blobs:
-        blob_data = blob.download_as_bytes().decode("utf-8").splitlines()
-        embeddings = [
-            extract_google_embed_resp(data, embedding_dimension) for data in blob_data
-        ]
-        all_blob_data.extend(embeddings)
-    option_ids = [embed["_id"] for embed in all_blob_data]
-    option_embeddings = np.stack([embed["embedding"] for embed in all_blob_data])
-    return option_ids, option_embeddings
-
-
-def setup_terminal_attributes(port_id: str):
-    """
-    Retrieve and format the attributes of active terminals at a given port.
-
-    Args:
-        port_id (str): The ID of the port.
-
-    Returns:
-        list: A list of dictionaries containing the formatted attributes of active terminals.
-            Each dictionary has the following keys:
-            - title: The terminal's short code.
-            - content: A string representation of the terminal's attributes, including its name,
-              searchable name, and full address.
-    """
-    url = f"https://gateway.forto.{tms_domain}/api/transport-network/api/ports/{port_id}/terminals/list"  # noqa
-    resp = call_tms(requests.get, url)
-    terminals = resp.json()
-    if len(terminals) == 0:
-        return []
-    active_terminals = [term for term in terminals if term["isActive"]]
-    if len(active_terminals) == 0:
-        logger.warning(f"No active terminals found at port {port_id}.")
-        return []
-
-    terminal_attibute_strings = [
-        dict(
-            title=term["name"],
-            content=" | ".join(
-                [
-                    "shipping terminal",
-                    "code - " + term["terminalShortCode"],
-                    "name - " + modify_terminal_name(term["searchableName"]),
-                    "country - " + term["address"]["country"],
-                ]
-            ),
-        )
-        for term in active_terminals
-    ]
-    return terminal_attibute_strings
-
-
-def modify_terminal_name(text):
-    # Find the first occurrence of a word starting with 'K' followed by a number
-    # and replace it with 'KAAI' - meaning Quay in Dutch
-    match = re.search(r"K(\d+)", text)
-    if match:
-        # Append "KAAI" followed by the number if a match is found
-        text += f" KAAI {match.group(1)}"
-    return text
-
-
 def remove_none_values(d):
     if isinstance(d, dict):
         # Create a new dictionary to store non-None values
@@ -641,33 +203,137 @@ def check_formatting_rule(entity_key, document_type_code, rule):
     return False
 
 
-def convert_invoice_type(data_field_value):
-    keyword_classification = {
-        "invoice": "invoice",
-        "rechnung": "invoice",
-        "factura": "invoice",
-        "fattura": "invoice",
-        "faktura": "invoice",
-        "debit": "invoice",
-        "factuur": "invoice",
-        "credit": "creditNote",
-        "credito": "creditNote",
-        "crédito": "creditNote",
-        "creditnota": "creditNote",
-        "kreditnota": "creditNote",
-        "kredytowa": "creditNote",
-        "rechnungskorrektur": "creditNote",
-        "stornobeleg": "creditNote",
-    }
+def convert_invoice_type(data_field_value, params):
+    """
+    Converts a raw invoice type string to either invoice or creditNote using fuzzy matching.
 
-    # TODO: sort according to lenght of the keys
-    for keyword, classification in keyword_classification.items():
-        if keyword in data_field_value.lower():
-            return classification
+    Args:
+        data_field_value (str): The raw invoice type string from the data.
+        params (dict): A dictionary of parameters, including:
+            - "lookup_data": A dictionary containing lookup tables.
+            - "fuzzy_threshold_invoice_classification": The minimum fuzzy matching score.
+
+    Returns:
+        str or None: The standardized invoice type if a match is found, otherwise None.
+    """
+    lookup_data = params["lookup_data"]["invoice_classification"]
+    keywords = list(lookup_data.keys())
+
+    best_match = process.extractOne(
+        data_field_value.lower(),
+        keywords,
+    )
+    if best_match:
+        best_match_key, score, _ = best_match
+        if score >= params["fuzzy_threshold_invoice_classification"]:
+            return lookup_data[best_match_key]
     return None
 
 
-def clean_item_description(lineitem: str):
+# Function to create KVP dictionary using apply method
+def create_kvp_dictionary(df_raw: pd.DataFrame):
+    """Create a key-value pair dictionary from the given DataFrame.
+
+    Args:
+        df_raw (pd.DataFrame): The input DataFrame containing 'lineitem' and 'Forto SLI' columns.
+
+    return:
+        A key-value pair dictionary with 'Processed Lineitem' as key and 'Forto SLI' as value.
+    """
+    df = df_raw.copy()
+    df["Processed Lineitem"] = df["lineitem"].apply(clean_item_description)
+    kvp_dict = df.set_index("Processed Lineitem")["Forto SLI"].to_dict()
+
+    return kvp_dict
+
+
+def remove_dates(lineitem: str):
+    """
+    This function removes dates in the format "dd Month yyyy" from the given line item string.
+
+    Args:
+        lineitem (str): The input string from which dates will be removed.
+
+    Returns:
+        str: The string with dates removed.
+    """
+    # Remove dates in the format dd.mm.yy or dd.mm.yyyy
+    lineitem = re.sub(r"\b\d{1,2}\.\d{1,2}\.\d{2,4}\b", "", lineitem)
+
+    # Remove dates in the format "dd Month yyyy"
+    lineitem = re.sub(
+        r"\b\d{1,2} (?:january|february|march|april|may|june|july|august|september|october|november|december|januar|"
+        r"februar|märz|mai|juni|juli|oktober|dezember) \d{4}\b",
+        "",
+        lineitem,
+        flags=re.IGNORECASE,
+    )
+
+    # Define a list of month abbreviations in English and German
+    month_abbreviations = [
+        "JAN",
+        "FEB",
+        "MAR",
+        "APR",
+        "MAY",
+        "JUN",
+        "JUL",
+        "AUG",
+        "SEP",
+        "OCT",
+        "NOV",
+        "DEC",
+        "JAN",
+        "FEB",
+        "MRZ",
+        "APR",
+        "MAI",
+        "JUN",
+        "JUL",
+        "AUG",
+        "SEP",
+        "OKT",
+        "NOV",
+        "DEZ",
+    ]
+
+    # Create a regular expression pattern to match month abbreviations
+    pattern = r"\b(?:{})\b".format("|".join(month_abbreviations))
+
+    # Remove month abbreviations
+    lineitem = re.sub(pattern, "", lineitem, flags=re.IGNORECASE)
+
+    return lineitem
+
+
+def remove_unwanted_patterns(lineitem: str):
+    """
+    This function removes dates, month names, and container numbers from the given line item string.
+
+    Args:
+        lineitem (str): The input string from which unwanted patterns will be removed.
+
+    Returns:
+        str: The string with dates, month names, and container numbers removed.
+    """
+    # Remove container numbers (4 letters followed by 7 digits)
+    lineitem = re.sub(r"\b[A-Z]{4}\d{7}\b", "", lineitem)
+
+    # Remove "HIGH CUBE"
+    lineitem = lineitem.replace("HIGH CUBE", "")
+
+    # Remove container size e.g., 20FT, 40HC, etc.
+    pattern = [
+        f"{s}{t}"
+        for s in ("20|22|40|45".split("|"))
+        for t in ("FT|HC|DC|HD|GP|OT|RF|FR|TK|DV".split("|"))
+    ]
+    lineitem = re.sub(r"|".join(pattern), "", lineitem, flags=re.IGNORECASE).strip()
+
+    return lineitem
+
+
+def clean_item_description(lineitem: str, remove_numbers: bool = True):
     """
     This function removes dates, month names, whitespaces, currency patterns and container numbers from the given line item string.  # noqa
 
@@ -677,16 +343,13 @@ def clean_item_description(lineitem: str):
     Returns:
         str: The cleaned string removed.
     """
-    # Patterns
-    date_explicit_patterns = r"\b\d{1,2} (?:january|february|march|april|may|june|july|august|september|october|november|december|januar|februar|märz|mai|juni|juli|oktober|dezember) \d{4}\b"  # noqa
-    date_abbrevation_patterns = r"\b(?:JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC|JAN|FEB|MRZ|APR|MAI|JUN|JUL|AUG|SEP|OKT|NOV|DEZ)\b"  # noqa
     currency_codes_pattern = r"\b(USD|EUR|JPY|GBP|CAD|AUD|CHF|CNY|SEK|NZD|KRW|SGD|INR|BRL|ZAR|RUB|MXN|HKD|NOK|TRY|IDR|MYR|PHP|THB|VND|PLN|CZK|HUF|ILS|AED|SAR|QAR|KWD|EGP|NGN|ARS|CLP|COP|PEN|UYU|VEF|INR|PKR|BDT|LKR|NPR|MMK)\b"  # noqa
-    other_patterns = r"\b(HIGH CUBE|HIGHCUBE|FROM|TO|day s|Free|HQ)\b"
+
+    # Remove stopwords
+    lineitem = remove_stop_words(lineitem)
 
     # remove dates
-    lineitem = re.sub(r"\b\d{1,2}\.\d{1,2}\.\d{2,4}\b", "", lineitem)
-    lineitem = re.sub(date_explicit_patterns, "", lineitem, flags=re.IGNORECASE)
-    lineitem = re.sub(date_abbrevation_patterns, "", lineitem, flags=re.IGNORECASE)
+    lineitem = remove_dates(lineitem)
 
     # remove whitespaces
    lineitem = re.sub(r"\s{2,}", " ", lineitem)
@@ -694,30 +357,50 @@ def clean_item_description(lineitem: str):
     # remove newlines
     lineitem = re.sub(r"\\n|\n", " ", lineitem)
 
-    # remove special chars
-    lineitem = re.sub(r"[^A-Za-z0-9\s]", " ", lineitem).strip()
-
-    # remove other patterns
-    lineitem = re.sub(other_patterns, "", lineitem, flags=re.IGNORECASE)
-
     # Remove the currency codes
     lineitem = re.sub(currency_codes_pattern, "", lineitem, flags=re.IGNORECASE)
 
-    # Remove container numbers (4 letters followed by 7 digits)
-    lineitem = re.sub(r"\b[A-Z]{4}\d{7}\b", "", lineitem)
+    # remove other patterns
+    lineitem = remove_unwanted_patterns(lineitem)
 
     # Remove numbers from the line item
-    lineitem = re.sub(r"\d+", "", lineitem)
+    if (
+        remove_numbers
+    ):  # Do not remove numbers for the reverse charge sentence as it contains Article number
+        lineitem = re.sub(r"\d+", "", lineitem)
+
+    # remove special chars
+    lineitem = re.sub(r"[^A-Za-z0-9\s]", " ", lineitem).strip()
+
+    # Remove x from lineitem like 10 x
+    lineitem = re.sub(r"\b[xX]\b", " ", lineitem).strip()
 
     return re.sub(r"\s{2,}", " ", lineitem).strip()
 
 
 async def format_label(
-    entity_k, entity_value, embed_manager, document_type_code, llm_client
+    entity_k,
+    entity_value,
+    document_type_code,
+    params,
+    mime_type,
+    container_map,
+    terminal_map,
+    depot_map,
 ):
+    llm_client = params["LlmClient"]
     if isinstance(entity_value, dict):  # if it's a nested entity
         format_tasks = [
-            format_label(sub_k, sub_v, embed_manager, document_type_code, llm_client)
+            format_label(
+                sub_k,
+                sub_v,
+                document_type_code,
+                params,
+                mime_type,
+                container_map,
+                terminal_map,
+                depot_map,
+            )
             for sub_k, sub_v in entity_value.items()
        ]
         return entity_k, {k: v for k, v in await asyncio.gather(*format_tasks)}
@@ -725,35 +408,45 @@ async def format_label(
         format_tasks = await asyncio.gather(
             *[
                 format_label(
-                    entity_k, sub_v, embed_manager, document_type_code, llm_client
+                    entity_k,
+                    sub_v,
+                    document_type_code,
+                    params,
+                    mime_type,
+                    container_map,
+                    terminal_map,
+                    depot_map,
                 )
                 for sub_v in entity_value
            ]
        )
        return entity_k, [v for _, v in format_tasks]
+
+    if mime_type == "application/pdf":
+        if isinstance(entity_value, tuple):
+            page = entity_value[1]
+            entity_value = entity_value[0]
+        else:
+            page = -1
+
     entity_key = entity_k.lower()
-    embeddings_dict = embed_manager.embeddings_dict
     formatted_value = None
+
     if entity_key.startswith("port"):
         formatted_value = await get_port_code_ai(
-            entity_value, llm_client, embed_manager, *embeddings_dict["ports"]
+            entity_value, llm_client, doc_type=document_type_code
        )
+
     elif (entity_key == "containertype") or (entity_key == "containersize"):
-        formatted_value = embed_manager._find_most_similar_option(
-            "container type " + entity_value,
-            *embeddings_dict["container_types"],
-        )
+        formatted_value = container_map.get(entity_value)
+
     elif check_formatting_rule(entity_k, document_type_code, "terminal"):
-        formatted_value = embed_manager._find_most_similar_option(
-            "shipping terminal " + str(entity_value),
-            *embeddings_dict["terminals"],
-        )
+        formatted_value = terminal_map.get(entity_value)
+
     elif check_formatting_rule(entity_k, document_type_code, "depot"):
-        formatted_value = embed_manager._find_most_similar_option(
-            "depot " + str(entity_value),
-            *embeddings_dict["depots"],
-        )
-    elif entity_key.startswith(("eta", "etd", "duedate", "issuedate")):
+        formatted_value = depot_map.get(entity_value)
+
+    elif entity_key.startswith(("eta", "etd", "duedate", "issuedate", "servicedate")):
         try:
             cleaned_data_field_value = clean_date_string(entity_value)
             dt_obj = extract_date(cleaned_data_field_value)
@@ -771,28 +464,27 @@ async def format_label(
             formatted_value = dt_obj.strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3] + "Z"
         except ValueError as e:
             logger.info(f"ParserError: {e}")
+
+    elif (
+        entity_key in ["invoicenumber", "creditnoteinvoicenumber"]
+        and document_type_code == "bundeskasse"
+    ):
+        formatted_value = clean_invoice_number(entity_value)
+
+    elif entity_key in ("shipmentid", "partnerreference"):
+        # Clean the shipment ID to match Forto's standard (starts with 'S' followed by 7 or 8 digits)
+        formatted_value = clean_shipment_id(entity_value)
+
     elif entity_key == "containernumber":
         # Remove all non-alphanumeric characters like ' ', '-', etc.
         formatted_value = convert_container_number(entity_value)
+
     elif any(
         numeric_indicator in entity_key
-        for numeric_indicator in [
-            "weight",
-            "quantity",
-            "value",
-            "amount",
-            "totalamount",
-            "totalamounteuro",
-            "vatamount",
-            "vatApplicableAmount",
-            "grandTotal",
-        ]
-    ):
-        formatted_value = extract_number(entity_value)
-    elif (
-        document_type_code in ["finalMbL", "draftMbl"] and entity_key == "measurements"
+        for numeric_indicator in ["measurements", "weight"]
     ):
         formatted_value = extract_number(entity_value)
+
     elif any(
         packaging_type in entity_key
         for packaging_type in ["packagingtype", "packagetype", "currency"]
@@ -802,27 +494,57 @@ async def format_label(
     elif "lineitemdescription" in entity_key:
         formatted_value = clean_item_description(entity_value)
     elif "documenttype" in entity_key:
-        formatted_value = convert_invoice_type(entity_value)
+        formatted_value = convert_invoice_type(entity_value, params)
+
+    # Handle reverseChargeSentence
+    elif "reversechargesentence" in entity_key:
+        formatted_value = clean_item_description(entity_value, remove_numbers=False)
+
+    elif "quantity" in entity_key:
+        if document_type_code in ["partnerInvoice", "customsInvoice", "bundeskasse"]:
+            # For partner invoice, quantity can be mentioned as whole number
+            # Apply decimal convertor for 46,45 --> 46.45 but not for 1.000 --> 1000
+            formatted_value = decimal_convertor(
+                extract_number(entity_value), quantity=True
+            )
+        else:
+            formatted_value = extract_number(entity_value)
+
+    elif any(
+        numeric_indicator in entity_key
+        for numeric_indicator in [
+            "value",
+            "amount",
+            "price",
+            "totalamount",
+            "totalamounteuro",
+            "vatamount",
+            "vatapplicableamount",
+            "grandtotal",
+        ]
+    ):
+        # Convert EU values to English values (e.g., 4.123,45 -> 4123.45)
+        formatted_value = decimal_convertor(extract_number(entity_value))
 
     result = {
         "documentValue": entity_value,
         "formattedValue": formatted_value,
     }
+    if mime_type == "application/pdf":
+        result["page"] = page
+
     return entity_k, result
 
 
-async def get_port_code_ai(
-    port: str, llm_client, embed_manager, port_ids, port_embeddings
-):
-    port_llm = await get_port_code_llm(port, llm_client)
+async def get_port_code_ai(port: str, llm_client, doc_type=None):
+    """Get port code using AI model."""
+    port_llm = await get_port_code_llm(port, llm_client, doc_type=doc_type)
 
-    if port_llm in port_ids:
-        return port_llm
-    port_text = f"port for shipping {port}"
-    return embed_manager._find_most_similar_option(port_text, port_ids, port_embeddings)
+    result = await get_tms_mappings(port, "ports", port_llm)
+    return result.get(port, None)
 
 
-async def get_port_code_llm(port: str, llm_client):
+async def get_port_code_llm(port: str, llm_client, doc_type=None):
     if (
         "postprocessing" in prompt_library.library.keys()
         and "port_code" in prompt_library.library["postprocessing"].keys()
@@ -849,7 +571,7 @@ async def get_port_code_llm(port: str, llm_client):
     }
 
     response = await llm_client.get_unified_json_genai(
-        prompt, response_schema=response_schema, model="chatgpt"
+        prompt, response_schema=response_schema, model="chatgpt", doc_type=doc_type
    )
     try:
         mapped_port = response["port"]
@@ -859,25 +581,149 @@ async def get_port_code_llm(port: str, llm_client):
         return None
 
 
-async def format_all_entities(result, embed_manager, document_type_code, llm_client):
-    # remove None values from dict
+def decimal_convertor(value, quantity=False):
+    """Convert EU values to English values."""
+    if value is None:
+        return None
+
+    # Remove spaces
+    value = value.strip().replace(" ", "")
+
+    # Check "-" and remove it for processing
+    is_negative, value = (True, value[1:]) if value.startswith("-") else (False, value)
+
+    if not quantity:
+        # Convert comma to dot for decimal point (e.g., 4.123,45 -> 4123.45)
+        if re.match(r"^\d{1,3}(\.\d{3})*,\d{1,2}$", value):
+            value = value.replace(".", "").replace(",", ".")
+
+        # European style integer with thousand separator: 2.500
+        elif re.match(r"^\d{1,3}(\.\d{3})+$", value):
+            value = value.replace(".", "")
+
+        # Format english values as well for consistency (e.g., 4,123.45 -> 4123.45)
+        elif re.match(r"^\d{1,3}(,\d{3})*\.\d{1,2}$", value):
+            value = value.replace(",", "")
+
+        # English style integer with thousand separator: 2,500
+        elif re.match(r"^\d{1,3}(,\d{3})+$", value):
+            value = value.replace(",", "")
+
+        # Just replace comma decimals with dot (e.g., 65,45 -> 65.45)
+        if re.match(r"^\d+,\d{1,2}$", value):
+            value = value.replace(",", ".")
+
+        # If there are more than 3 0s after decimal point, consider only 2 decimal points (e.g., 8.500000 -> 8.50)
+        elif re.match(r"^\d+\.\d{3,}$", value):
+            value = value[: value.index(".") + 3]
+
+    else:  # quantity=True → only last two
+        # Just replace comma decimals with dot (e.g., 65,45 -> 65.45)
+        if re.match(r"^\d+,\d{1,2}$", value):
+            value = value.replace(",", ".")
+
+        # If there are more than 3 0s after decimal point, consider only 2 decimal points (e.g., 8.500000 -> 8.50)
+        elif re.match(r"^\d+\.\d{3,}$", value):
+            value = value[: value.index(".") + 3]
+
+    # Re-add negative sign if applicable
+    value = "-" + value if is_negative else value
+
+    return value
+
+
+async def collect_mapping_requests(entity_value, document_type_code):
+    """Collect all unique container types, terminals, and depots from the entity value."""
+    # Sets to store unique values
+    container_types = set()
+    terminals = set()
+    depots = set()
+
+    def walk(key, value):
+        key_lower = key.lower()
+
+        # nested dict
+        if isinstance(value, dict):
+            for k, v in value.items():
+                walk(k, v)
+
+        # list of values
+        elif isinstance(value, list):
+            for item in value:
+                walk(key, item)
+
+        # leaf node
+        else:
+            if key_lower in ("containertype", "containersize"):
+                # Take only "20DV" from ('20DV', 0) if it's a tuple
+                container_types.add(value[0]) if isinstance(
+                    value, tuple
+                ) else container_types.add(value)
+
+            elif check_formatting_rule(key, document_type_code, "terminal"):
+                terminals.add(value[0]) if isinstance(value, tuple) else terminals.add(
+                    value
+                )
+
+            elif check_formatting_rule(key, document_type_code, "depot"):
+                depots.add(value[0]) if isinstance(value, tuple) else depots.add(value)
+
+    walk("root", entity_value)
+
+    return container_types, terminals, depots
+
+
+async def format_all_labels(entity_data, document_type_code, params, mime_type):
+    """Format all labels in the entity data using cached mappings."""
+    # Collect all mapping values needed
+    container_req, terminal_req, depot_req = await collect_mapping_requests(
+        entity_data, document_type_code
+    )
+
+    # Batch fetch mappings
+    container_map, terminal_map, depot_map = await batch_fetch_all_mappings(
+        container_req, terminal_req, depot_req
+    )
+
+    # Format labels using cached mappings
+    _, result = await format_label(
+        "root",
+        entity_data,
+        document_type_code,
+        params,
+        mime_type,
+        container_map,
+        terminal_map,
+        depot_map,
+    )
+
+    return _, result
+
+
+async def format_all_entities(result, document_type_code, params, mime_type):
+    """Format the entity values in the result dictionary."""
+    # Since we treat `customsInvoice` same as `partnerInvoice`
+    document_type_code = (
+        "partnerInvoice"
+        if document_type_code == "customsInvoice"
+        else document_type_code
+    )
+    # Remove None values from the dictionary
     result = remove_none_values(result)
     if result is None:
         logger.info("No data was extracted.")
         return {}
-    _, aggregated_data = await format_label(
-        None, result, embed_manager, document_type_code, llm_client
+
+    # Format all entities recursively
+    _, aggregated_data = await format_all_labels(
+        result, document_type_code, params, mime_type
    )
-    # TODO: this is hardcoded cuz its an exception
-    # It will stay this way until a better implementation will be found
-    if document_type_code == "partnerInvoice":
-        for line_item in aggregated_data.get("lineItem", []):
-            line_item["itemCode"] = associate_forto_item_code(
-                embed_manager, line_item["lineItemDescription"]["formattedValue"]
-            )
 
-    logger.info("Data Extraction completed successfully")
+    # Process partner invoice on lineitem mapping and reverse charge sentence
+    if document_type_code in ["partnerInvoice", "bundeskasse"]:
+        await process_partner_invoice(params, aggregated_data, document_type_code)
 
+    logger.info("Data Extraction completed successfully")
     return aggregated_data
 
 
@@ -890,32 +736,63 @@ def add_text_without_space(text):
     return text
 
 
-def associate_forto_item_code(embed_manager, line_item_description):
-    """
-    Associates a given line item description with the most similar Forto item code.
-
-    This function utilizes an embedding manager to find the most similar Forto item code
-    from a pre-computed set of embeddings, based on the provided line item description.
+def remove_stop_words(lineitem: str):
+    """Remove stop words in English and German from the given line item string.
 
     Args:
-        embed_manager: An instance of an embedding manager that contains the item code embeddings.
-        line_item_description: The description of the line item to associate with an item code.
+        lineitem (str): The input string from which stop words will be removed.
 
     Returns:
-        A dictionary containing the original line item description and the most similar
-        formatted item code. The dictionary has the following structure:
-        {
-            "documentValue": line_item_description,
-            "formattedValue": formatted_value,
-        }
-        where formatted_value is the most similar item code found.
+        str: The string with stop words removed.
     """
-    embeddings_dict = embed_manager.embeddings_dict
-    formatted_value = embed_manager._find_most_similar_option(
-        line_item_description, *embeddings_dict["item_codes"]
+    stop_words = set(stopwords.words("english") + stopwords.words("german")) - {"off"}
+    return (
+        " ".join(word for word in lineitem.split() if word.lower() not in stop_words)
+        .upper()
+        .strip()
    )
-    result = {
-        "documentValue": line_item_description,
-        "formattedValue": formatted_value,
-    }
-    return result
+
+
+def llm_prediction_to_tuples(llm_prediction, number_of_pages=-1, page_number=None):
+    """Convert LLM prediction dictionary to tuples of (value, page_number)."""
+    # If only 1 page, simply pair each value with page number 0
+    if number_of_pages == 1:
+        effective_page = 0 if page_number is None else page_number
+        if isinstance(llm_prediction, dict):
+            return {
+                k: llm_prediction_to_tuples(
+                    v, number_of_pages, page_number=effective_page
+                )
+                for k, v in llm_prediction.items()
+            }
+        elif isinstance(llm_prediction, list):
+            return [
+                llm_prediction_to_tuples(v, number_of_pages, page_number=effective_page)
+                for v in llm_prediction
+            ]
+        else:
+            return (llm_prediction, effective_page) if llm_prediction else None
+
+    # logic for multi-page predictions
+    if isinstance(llm_prediction, dict):
+        if "page_number" in llm_prediction.keys() and "value" in llm_prediction.keys():
+            if llm_prediction["value"]:
+                try:
+                    _page_number = int(llm_prediction["page_number"])
+                except:  # noqa: E722
+                    _page_number = -1
+                return (llm_prediction["value"], _page_number)
+            return None
+
+        for key, value in llm_prediction.items():
+            llm_prediction[key] = llm_prediction_to_tuples(
+                llm_prediction.get(key, value), number_of_pages, page_number
+            )
+
    elif isinstance(llm_prediction, list):
+        for i, item in enumerate(llm_prediction):
+            llm_prediction[i] = llm_prediction_to_tuples(
+                item, number_of_pages, page_number
+            )
+
+    return llm_prediction
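
For orientation on the two helpers introduced at the end of this hunk, decimal_convertor and llm_prediction_to_tuples, here is a minimal usage sketch of the behavior documented in their inline comments. It is not part of the package: the import path assumes the hunk above is src/postprocessing/common.py (file 12 in the list), and it has to run inside the package environment (TMS_DOMAIN set, nltk stopword data available) because the module pulls those in at import time.

# Usage sketch -- import path and environment are assumptions, see note above.
from src.postprocessing.common import decimal_convertor, llm_prediction_to_tuples

# EU-style amounts are normalized to English decimal notation.
assert decimal_convertor("4.123,45") == "4123.45"  # dot thousands, comma decimal
assert decimal_convertor("4,123.45") == "4123.45"  # English thousands stripped too
assert decimal_convertor("2.500") == "2500"        # EU integer thousand separator
assert decimal_convertor("-65,45") == "-65.45"     # sign survives the rewrite
assert decimal_convertor("8.500000") == "8.50"     # long decimal tails cut to two places

# With quantity=True only the last two rules apply, so thousand separators
# in whole-number quantities are not stripped.
assert decimal_convertor("46,45", quantity=True) == "46.45"

# Multi-page LLM predictions ({"value": ..., "page_number": ...}) collapse
# recursively into (value, page_number) tuples.
pred = {"containerNumber": {"value": "FFAU2932130", "page_number": "2"}}
assert llm_prediction_to_tuples(pred) == {"containerNumber": ("FFAU2932130", 2)}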