PyPI - data-science-document-ai - Versions diffs - 1.42.5__py3-none-any.whl → 1.57.0__py3-none-any.whl - Mend

data-science-document-ai 1.42.5__py3-none-any.whl → 1.57.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (53)

{data_science_document_ai-1.42.5.dist-info → data_science_document_ai-1.57.0.dist-info}/METADATA +2 -2
data_science_document_ai-1.57.0.dist-info/RECORD +60 -0
src/constants.py +13 -34
src/docai_processor_config.yaml +0 -69
src/excel_processing.py +24 -14
src/io.py +23 -0
src/llm.py +0 -29
src/pdf_processing.py +183 -76
src/postprocessing/common.py +172 -28
src/postprocessing/postprocess_partner_invoice.py +194 -59
src/prompts/library/arrivalNotice/other/placeholders.json +70 -0
src/prompts/library/arrivalNotice/other/prompt.txt +40 -0
src/prompts/library/bookingConfirmation/evergreen/placeholders.json +135 -21
src/prompts/library/bookingConfirmation/evergreen/prompt.txt +21 -17
src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +136 -22
src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +52 -58
src/prompts/library/bookingConfirmation/maersk/placeholders.json +135 -21
src/prompts/library/bookingConfirmation/maersk/prompt.txt +10 -1
src/prompts/library/bookingConfirmation/msc/placeholders.json +135 -21
src/prompts/library/bookingConfirmation/msc/prompt.txt +10 -1
src/prompts/library/bookingConfirmation/oocl/placeholders.json +149 -21
src/prompts/library/bookingConfirmation/oocl/prompt.txt +11 -3
src/prompts/library/bookingConfirmation/other/placeholders.json +149 -21
src/prompts/library/bookingConfirmation/other/prompt.txt +56 -57
src/prompts/library/bookingConfirmation/yangming/placeholders.json +149 -21
src/prompts/library/bookingConfirmation/yangming/prompt.txt +11 -1
src/prompts/library/bundeskasse/other/placeholders.json +5 -5
src/prompts/library/bundeskasse/other/prompt.txt +7 -5
src/prompts/library/commercialInvoice/other/placeholders.json +125 -0
src/prompts/library/commercialInvoice/other/prompt.txt +1 -1
src/prompts/library/customsAssessment/other/placeholders.json +70 -0
src/prompts/library/customsAssessment/other/prompt.txt +24 -37
src/prompts/library/customsInvoice/other/prompt.txt +4 -3
src/prompts/library/deliveryOrder/other/placeholders.json +80 -27
src/prompts/library/deliveryOrder/other/prompt.txt +26 -40
src/prompts/library/draftMbl/other/placeholders.json +33 -33
src/prompts/library/draftMbl/other/prompt.txt +34 -44
src/prompts/library/finalMbL/other/placeholders.json +80 -0
src/prompts/library/finalMbL/other/prompt.txt +34 -44
src/prompts/library/packingList/other/placeholders.json +98 -0
src/prompts/library/partnerInvoice/other/prompt.txt +8 -7
src/prompts/library/preprocessing/carrier/placeholders.json +0 -16
src/prompts/library/shippingInstruction/other/placeholders.json +115 -0
src/prompts/library/shippingInstruction/other/prompt.txt +26 -14
src/prompts/prompt_library.py +0 -4
src/setup.py +25 -24
src/utils.py +120 -68
data_science_document_ai-1.42.5.dist-info/RECORD +0 -57
src/prompts/library/draftMbl/hapag-lloyd/prompt.txt +0 -45
src/prompts/library/draftMbl/maersk/prompt.txt +0 -19
src/prompts/library/finalMbL/hapag-lloyd/prompt.txt +0 -44
src/prompts/library/finalMbL/maersk/prompt.txt +0 -19
{data_science_document_ai-1.42.5.dist-info → data_science_document_ai-1.57.0.dist-info}/WHEEL +0 -0

src/postprocessing/common.py CHANGED Viewed

@@ -12,7 +12,7 @@ from src.constants import formatting_rules
 from src.io import logger
 from src.postprocessing.postprocess_partner_invoice import process_partner_invoice
 from src.prompts.prompt_library import prompt_library
-from src.utils import get_tms_mappings
+from src.utils import batch_fetch_all_mappings, get_tms_mappings
 tms_domain = os.environ["TMS_DOMAIN"]
@@ -134,8 +134,11 @@ def extract_number(data_field_value):
         formatted_value: string
     """
+    # Remove container size pattern like 20FT, 40HC, etc from 1 x 40HC
+    value = remove_unwanted_patterns(data_field_value)
     formatted_value = ""
-    for c in data_field_value:
+    for c in value:
         if c.isnumeric() or c in [",", ".", "-"]:
             formatted_value += c
@@ -319,6 +322,14 @@ def remove_unwanted_patterns(lineitem: str):
     # Remove "HIGH CUBE"
     lineitem = lineitem.replace("HIGH CUBE", "")
+    # Remove container size e.g., 20FT, 40HC, etc.
+    pattern = [
+        f"{s}{t}"
+        for s in ("20|22|40|45".split("|"))
+        for t in ("FT|HC|DC|HD|GP|OT|RF|FR|TK|DV".split("|"))
+    ]
+    lineitem = re.sub(r"|".join(pattern), "", lineitem, flags=re.IGNORECASE).strip()
     return lineitem
@@ -349,42 +360,75 @@ def clean_item_description(lineitem: str, remove_numbers: bool = True):
     # Remove the currency codes
     lineitem = re.sub(currency_codes_pattern, "", lineitem, flags=re.IGNORECASE)
+    # remove other patterns
+    lineitem = remove_unwanted_patterns(lineitem)
     # Remove numbers from the line item
     if (
         remove_numbers
     ):  # Do not remove numbers for the reverse charge sentence as it contains Article number
         lineitem = re.sub(r"\d+", "", lineitem)
-    # remove other patterns
-    lineitem = remove_unwanted_patterns(lineitem)
     # remove special chars
     lineitem = re.sub(r"[^A-Za-z0-9\s]", " ", lineitem).strip()
+    # Remove x from lineitem like 10 x
+    lineitem = re.sub(r"\b[xX]\b", " ", lineitem).strip()
     return re.sub(r"\s{2,}", " ", lineitem).strip()
-async def format_label(entity_k, entity_value, document_type_code, params):
+async def format_label(
+    entity_k,
+    entity_value,
+    document_type_code,
+    params,
+    mime_type,
+    container_map,
+    terminal_map,
+    depot_map,
+):
     llm_client = params["LlmClient"]
     if isinstance(entity_value, dict):  # if it's a nested entity
         format_tasks = [
-            format_label(sub_k, sub_v, document_type_code, params)
+            format_label(
+                sub_k,
+                sub_v,
+                document_type_code,
+                params,
+                mime_type,
+                container_map,
+                terminal_map,
+                depot_map,
+            )
             for sub_k, sub_v in entity_value.items()
         ]
         return entity_k, {k: v for k, v in await asyncio.gather(*format_tasks)}
     if isinstance(entity_value, list):
         format_tasks = await asyncio.gather(
             *[
-                format_label(entity_k, sub_v, document_type_code, params)
+                format_label(
+                    entity_k,
+                    sub_v,
+                    document_type_code,
+                    params,
+                    mime_type,
+                    container_map,
+                    terminal_map,
+                    depot_map,
+                )
                 for sub_v in entity_value
             ]
         )
         return entity_k, [v for _, v in format_tasks]
-    if isinstance(entity_value, tuple):
-        page = entity_value[1]
-        entity_value = entity_value[0]
-    else:
-        page = -1
+    if mime_type == "application/pdf":
+        if isinstance(entity_value, tuple):
+            page = entity_value[1]
+            entity_value = entity_value[0]
+        else:
+            page = -1
     entity_key = entity_k.lower()
     formatted_value = None
@@ -394,13 +438,13 @@ async def format_label(entity_k, entity_value, document_type_code, params):
         )
     elif (entity_key == "containertype") or (entity_key == "containersize"):
-        formatted_value = get_tms_mappings(entity_value, "container_types")
+        formatted_value = container_map.get(entity_value)
     elif check_formatting_rule(entity_k, document_type_code, "terminal"):
-        formatted_value = get_tms_mappings(entity_value, "terminals")
+        formatted_value = terminal_map.get(entity_value)
     elif check_formatting_rule(entity_k, document_type_code, "depot"):
-        formatted_value = get_tms_mappings(entity_value, "depots")
+        formatted_value = depot_map.get(entity_value)
     elif entity_key.startswith(("eta", "etd", "duedate", "issuedate", "servicedate")):
         try:
@@ -421,7 +465,10 @@ async def format_label(entity_k, entity_value, document_type_code, params):
         except ValueError as e:
             logger.info(f"ParserError: {e}")
-    elif entity_key in ["invoicenumber", "creditnoteinvoicenumber"]:
+    elif (
+        entity_key in ["invoicenumber", "creditnoteinvoicenumber"]
+        and document_type_code == "bundeskasse"
+    ):
         formatted_value = clean_invoice_number(entity_value)
     elif entity_key in ("shipmentid", "partnerreference"):
@@ -482,8 +529,10 @@ async def format_label(entity_k, entity_value, document_type_code, params):
     result = {
         "documentValue": entity_value,
         "formattedValue": formatted_value,
-        "page": page,
     }
+    if mime_type == "application/pdf":
+        result["page"] = page
     return entity_k, result
@@ -491,7 +540,8 @@ async def get_port_code_ai(port: str, llm_client, doc_type=None):
     """Get port code using AI model."""
     port_llm = await get_port_code_llm(port, llm_client, doc_type=doc_type)
-    return get_tms_mappings(port, "ports", port_llm)
+    result = await get_tms_mappings(port, "ports", port_llm)
+    return result.get(port, None)
 async def get_port_code_llm(port: str, llm_client, doc_type=None):
@@ -582,7 +632,75 @@ def decimal_convertor(value, quantity=False):
     return value
-async def format_all_entities(result, document_type_code, params):
+async def collect_mapping_requests(entity_value, document_type_code):
+    """Collect all unique container types, terminals, and depots from the entity value."""
+    # Sets to store unique values
+    container_types = set()
+    terminals = set()
+    depots = set()
+    def walk(key, value):
+        key_lower = key.lower()
+        # nested dict
+        if isinstance(value, dict):
+            for k, v in value.items():
+                walk(k, v)
+        # list of values
+        elif isinstance(value, list):
+            for item in value:
+                walk(key, item)
+        # leaf node
+        else:
+            if key_lower in ("containertype", "containersize"):
+                # Take only "20DV" from ('20DV', 0) if it's a tuple
+                container_types.add(value[0]) if isinstance(
+                    value, tuple
+                ) else container_types.add(value)
+            elif check_formatting_rule(key, document_type_code, "terminal"):
+                terminals.add(value[0]) if isinstance(value, tuple) else terminals.add(
+                    value
+                )
+            elif check_formatting_rule(key, document_type_code, "depot"):
+                depots.add(value[0]) if isinstance(value, tuple) else depots.add(value)
+    walk("root", entity_value)
+    return container_types, terminals, depots
+async def format_all_labels(entity_data, document_type_code, params, mime_type):
+    """Format all labels in the entity data using cached mappings."""
+    # Collect all mapping values needed
+    container_req, terminal_req, depot_req = await collect_mapping_requests(
+        entity_data, document_type_code
+    )
+    # Batch fetch mappings
+    container_map, terminal_map, depot_map = await batch_fetch_all_mappings(
+        container_req, terminal_req, depot_req
+    )
+    # Format labels using cached mappings
+    _, result = await format_label(
+        "root",
+        entity_data,
+        document_type_code,
+        params,
+        mime_type,
+        container_map,
+        terminal_map,
+        depot_map,
+    )
+    return _, result
+async def format_all_entities(result, document_type_code, params, mime_type):
     """Format the entity values in the result dictionary."""
     # Since we treat `customsInvoice` same as `partnerInvoice`
     document_type_code = (
@@ -597,11 +715,13 @@ async def format_all_entities(result, document_type_code, params):
         return {}
     # Format all entities recursively
-    _, aggregated_data = await format_label(None, result, document_type_code, params)
+    _, aggregated_data = await format_all_labels(
+        result, document_type_code, params, mime_type
+    )
     # Process partner invoice on lineitem mapping and reverse charge sentence
     if document_type_code in ["partnerInvoice", "bundeskasse"]:
-        process_partner_invoice(params, aggregated_data, document_type_code)
+        await process_partner_invoice(params, aggregated_data, document_type_code)
     logger.info("Data Extraction completed successfully")
     return aggregated_data
@@ -633,22 +753,46 @@ def remove_stop_words(lineitem: str):
     )
-def llm_prediction_to_tuples(llm_prediction):
+def llm_prediction_to_tuples(llm_prediction, number_of_pages=-1, page_number=None):
     """Convert LLM prediction dictionary to tuples of (value, page_number)."""
+    # If only 1 page, simply pair each value with page number 0
+    if number_of_pages == 1:
+        effective_page = 0 if page_number is None else page_number
+        if isinstance(llm_prediction, dict):
+            return {
+                k: llm_prediction_to_tuples(
+                    v, number_of_pages, page_number=effective_page
+                )
+                for k, v in llm_prediction.items()
+            }
+        elif isinstance(llm_prediction, list):
+            return [
+                llm_prediction_to_tuples(v, number_of_pages, page_number=effective_page)
+                for v in llm_prediction
+            ]
+        else:
+            return (llm_prediction, effective_page) if llm_prediction else None
+    # logic for multi-page predictions
     if isinstance(llm_prediction, dict):
         if "page_number" in llm_prediction.keys() and "value" in llm_prediction.keys():
             if llm_prediction["value"]:
                 try:
-                    page_number = int(llm_prediction["page_number"])
+                    _page_number = int(llm_prediction["page_number"])
                 except:  # noqa: E722
-                    page_number = -1
-                return (llm_prediction["value"], page_number)
+                    _page_number = -1
+                return (llm_prediction["value"], _page_number)
             return None
         for key, value in llm_prediction.items():
             llm_prediction[key] = llm_prediction_to_tuples(
-                llm_prediction.get(key, value)
+                llm_prediction.get(key, value), number_of_pages, page_number
             )
     elif isinstance(llm_prediction, list):
         for i, item in enumerate(llm_prediction):
-            llm_prediction[i] = llm_prediction_to_tuples(item)
+            llm_prediction[i] = llm_prediction_to_tuples(
+                item, number_of_pages, page_number
+            )
     return llm_prediction

src/postprocessing/postprocess_partner_invoice.py CHANGED Viewed

@@ -1,7 +1,7 @@
 """This module contains the postprocessing functions for the partner invoice."""
-from concurrent.futures import ThreadPoolExecutor
+from collections import defaultdict
-from fuzzywuzzy import fuzz
+from rapidfuzz import fuzz, process
 from src.io import logger
 from src.utils import get_tms_mappings
@@ -105,9 +105,18 @@ def post_process_bundeskasse(aggregated_data):
             )
         # Check if the deferredDutyPayer is forto
-        deferredDutyPayer = line_item.get("deferredDutyPayer", {})
-        lower = deferredDutyPayer.get("documentValue", "").lower()
-        if any(key in lower for key in ["de789147263644738", "forto"]):
+        KEYWORDS = {"de789147263644738", "forto", "009812"}
+        def is_forto_recipient(line_item: dict) -> bool:
+            values_to_check = [
+                line_item.get("deferredDutyPayer", {}).get("documentValue", ""),
+                line_item.get("vatId", {}).get("documentValue", ""),
+            ]
+            combined = " ".join(values_to_check).lower()
+            return any(keyword in combined for keyword in KEYWORDS)
+        if is_forto_recipient(line_item):
             is_recipient_forto = True
     update_recipient_and_vendor(aggregated_data, is_recipient_forto)
@@ -136,13 +145,32 @@ def update_recipient_and_vendor(aggregated_data, is_recipient_forto):
     ] = "Dasbachstraße 15, 54292 Trier, Germany"
-def process_partner_invoice(params, aggregated_data, document_type_code):
+def select_unique_bank_account(bank_account):
+    # Select the unique bank account if multiple are present
+    if isinstance(bank_account, list) and bank_account:
+        best = defaultdict(lambda: None)
+        for item in bank_account:
+            dv = item["documentValue"]
+            if best[dv] is None or item["page"] < best[dv]["page"]:
+                best[dv] = item
+        unique = list(best.values())
+        return unique
+async def process_partner_invoice(params, aggregated_data, document_type_code):
     """Process the partner invoice data."""
     # Post process bundeskasse invoices
     if document_type_code == "bundeskasse":
         post_process_bundeskasse(aggregated_data)
         return
+    if "bankAccount" in aggregated_data:
+        aggregated_data["bankAccount"] = select_unique_bank_account(
+            aggregated_data["bankAccount"]
+        )
     line_items = aggregated_data.get("lineItem", [])
     # Add debug logging
     logger.info(f"Processing partnerInvoice with {len(line_items)} line items")
@@ -160,27 +188,78 @@ def process_partner_invoice(params, aggregated_data, document_type_code):
         reverse_charge_info["formattedValue"] = reverse_charge_value
         reverse_charge = aggregated_data.pop("reverseChargeSentence", None)
-    # Process each line item
-    for line_item in line_items:
-        if line_item.get("lineItemDescription", None) is not None:
-            line_item["itemCode"] = associate_forto_item_code(
-                line_item["lineItemDescription"]["formattedValue"],
-                params,
-            )
+    # Partner Name
+    partner_name = aggregated_data.get("vendorName", {}).get("documentValue", None)
+    # Process everything in one go
+    processed_items = await process_line_items_batch(
+        params, line_items, reverse_charge, partner_name
+    )
-            # Add page number for the consistency
-            line_item["itemCode"]["page"] = line_item["lineItemDescription"]["page"]
+    # Update your main data structure
+    aggregated_data["lineItem"] = processed_items
-        if reverse_charge:
-            # Distribute reverseChargeSentence to all line items
-            line_item["reverseChargeSentence"] = reverse_charge
-            line_item["reverseChargeSentence"]["page"] = reverse_charge["page"]
+async def process_line_items_batch(
+    params: dict, line_items: list[dict], reverse_charge=None, partner_name=None
+):
+    """
+    Processes all line items efficiently using a "Split-Apply-Combine" strategy.
+    """
+    # To store items that need external API lookup
+    pending_line_items = {}
+    # Check Fuzzy Matching
+    logger.info(f"Mapping line item codes with Fuzzy matching....")
+    for i, item in enumerate(line_items):
+        description_obj = item.get("lineItemDescription")
+        if not description_obj or not description_obj.get("formattedValue"):
+            continue
+        # Get the formatted description text
+        desc = description_obj["formattedValue"]
+        # Find Fuzzy Match
+        matched_code = find_matching_lineitem(
+            desc,
+            params["lookup_data"]["item_code"],
+            params["fuzzy_threshold_item_code"],
+        )
+        if matched_code:
+            # Set the code to the line item
+            item["itemCode"] = {
+                "documentValue": desc,
+                "formattedValue": matched_code,
+                "page": description_obj.get("page"),
+            }
+        else:
+            # Store for batch API call
+            pending_line_items[i] = desc
+    # Batch API Call for Embedding lookups
+    if pending_line_items:
+        code_map = await fetch_line_item_codes(pending_line_items, partner_name, params)
+        for index, desc in pending_line_items.items():
+            line_items[index]["itemCode"] = {
+                "documentValue": desc,
+                "formattedValue": code_map.get(desc),
+                "page": line_items[index]["lineItemDescription"].get("page"),
+            }
+    # Add reverse charge here if exists
+    if reverse_charge:
+        [
+            item.update({"reverseChargeSentence": reverse_charge})
+            for item in line_items
+            if (
+                (item.get("itemCode") and item["itemCode"]["formattedValue"] != "CDU")
+                or not item.get("itemCode")
+            )
+        ]
-def compute_score(args):
-    """Compute the fuzzy matching score between a new line item and a key."""
-    new_lineitem, key = args
-    return key, fuzz.ratio(new_lineitem, key)
+    return line_items
 def get_fuzzy_match_score(target: str, sentences: list, threshold: int):
@@ -195,16 +274,18 @@ def get_fuzzy_match_score(target: str, sentences: list, threshold: int):
         tuple: (best_match, score) if above threshold, else (None, 0)
     """
     # Use multiprocessing to find the best match
-    with ThreadPoolExecutor() as executor:
-        results = executor.map(compute_score, [(target, s) for s in sentences])
+    result = process.extractOne(
+        target, sentences, scorer=fuzz.WRatio, score_cutoff=threshold
+    )
+    if result is None:
+        return None, False
-    # Find the best match and score
-    best_match, best_score = max(results, key=lambda x: x[1], default=(None, 0))
+    match, score, index = result
-    # return best_match, best_score
-    # If the best match score is above a threshold (e.g., 80), return it
-    if best_score >= threshold:
-        return best_match, True
+    # return best_match if the best match score is above a threshold (e.g., 80)
+    if match:
+        return match, True
     return None, False
@@ -219,11 +300,14 @@ def if_reverse_charge_sentence(sentence: str, params):
         return False
     # Check if the sentence is similar to any of the reverse charge sentences
-    _, is_reverse_charge = get_fuzzy_match_score(
-        sentence, reverse_charge_sentences, threshold
+    match, _ = get_fuzzy_match_score(
+        sentence, list(reverse_charge_sentences.keys()), threshold
     )
-    return is_reverse_charge
+    if match:
+        return reverse_charge_sentences[match]
+    return False
 def find_matching_lineitem(new_lineitem: str, kvp_dict: dict, threshold=90):
@@ -236,46 +320,97 @@ def find_matching_lineitem(new_lineitem: str, kvp_dict: dict, threshold=90):
     Returns:
         str: The best matching 'Forto SLI' value from the dictionary.
     """
-    new_lineitem = new_lineitem.upper()
     # Check if the new line item is already in the dictionary
     if new_lineitem in kvp_dict:
         return kvp_dict[new_lineitem]
     # Get the best fuzzy match score for the extracted line item
-    best_match, _ = get_fuzzy_match_score(
-        new_lineitem, list(kvp_dict.keys()), threshold
+    match, _ = get_fuzzy_match_score(
+        new_lineitem,
+        list(kvp_dict.keys()),
+        threshold,
     )
-    return kvp_dict.get(best_match, None)
+    if match:
+        # find the code from the kvp_dict
+        return kvp_dict[match]
-def associate_forto_item_code(input_string, params):
-    """
-    Finds a match for the input string using fuzzy matching first, then embedding fallback.
+    return None
-    1. Tries to find a fuzzy match for input_string against the keys in
-       mapping_data using RapidFuzz, requiring a score >= fuzzy_threshold.
-    2. If found, returns the corresponding value from mapping_data.
-    3. If not found above threshold, calls the embedding_fallback function.
+async def associate_forto_item_code(line_item_data, params, partner_name=None):
+    """
+    Associates Forto item codes to a list of line item descriptions.
     Args:
-        input_string: The string to find a match for.
-        params: Parameters containing the lookup data and fuzzy threshold.
+        line_item_data (dict): A dictionary where keys are original descriptions and values are cleaned descriptions.
+        params (dict): Parameters containing lookup data and thresholds.
+        partner_name (str, optional): The name of the partner for context in matching. Defaults to None.
     Returns:
-        The matched value (from fuzzy match or embedding), or None if no match found.
+        list: A list of dictionaries with 'description' and 'itemCode' keys.
     """
-    # Get the Forto item code using fuzzy matching
-    forto_item_code = find_matching_lineitem(
-        new_lineitem=input_string,
-        kvp_dict=params["lookup_data"]["item_code"],  # TODO: Parse the KVP dictionary
-        threshold=params["fuzzy_threshold_item_code"],
+    result = []
+    pending_line_items = {}
+    for desc, f_desc in line_item_data.items():
+        # Get the Forto item code using fuzzy matching
+        code = find_matching_lineitem(
+            new_lineitem=f_desc,
+            kvp_dict=params["lookup_data"]["item_code"],
+            threshold=params["fuzzy_threshold_item_code"],
+        )
+        if code:
+            result.append({"description": desc, "itemCode": code})
+        else:
+            pending_line_items[desc] = f_desc
+    # Batch API Call for Embedding lookups
+    if pending_line_items:
+        code_map = await fetch_line_item_codes(pending_line_items, partner_name, params)
+        for desc, f_desc in pending_line_items.items():
+            result.append(
+                {
+                    "description": desc,
+                    "itemCode": code_map.get(f_desc),
+                }
+            )
+    return result
+async def fetch_line_item_codes(
+    pending_line_items: dict,
+    partner_name: str | None,
+    params: dict,
+):
+    """Returns: {original_description: mapped_code_or_None}"""
+    t_mode = (
+        find_matching_lineitem(
+            partner_name.upper(),
+            params["lookup_data"]["intermodal_partners"],
+            threshold=87,
+        )
+        if partner_name
+        else None
     )
-    if forto_item_code is None:
-        # 2. Fallback to embedding function if no good fuzzy match
-        forto_item_code = get_tms_mappings(input_string, "line_items")
+    unique_descs = list(set(pending_line_items.values()))
+    logger.info(f"Mapping {len(unique_descs)} line items from Embedding API...")
-    result = {"documentValue": input_string, "formattedValue": forto_item_code}
+    # Build API input map
+    api_input_map = {
+        desc: f"{t_mode} - {desc}" if t_mode else desc for desc in unique_descs
+    }
+    api_results = await get_tms_mappings(
+        input_list=list(api_input_map.values()),
+        embedding_type="line_items",
+    )
+    # Normalize response back to original descriptions
+    result = {
+        original_desc: api_results.get(api_desc)
+        for original_desc, api_desc in api_input_map.items()
+    }
     return result

data-science-document-ai 1.42.5__py3-none-any.whl → 1.57.0__py3-none-any.whl

data-science-document-ai 1.42.5__py3-none-any.whl → 1.57.0__py3-none-any.whl