PyPI - data-science-document-ai - Versions diffs - 1.44.0__py3-none-any.whl → 1.45.1__py3-none-any.whl - Mend

data-science-document-ai 1.44.0__py3-none-any.whl → 1.45.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

{data_science_document_ai-1.44.0.dist-info → data_science_document_ai-1.45.1.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: data-science-document-ai
-Version: 1.44.0
+Version: 1.45.1
 Summary: "Document AI repo for data science"
 Author: Naomi Nguyen
 Author-email: naomi.nguyen@forto.com

{data_science_document_ai-1.44.0.dist-info → data_science_document_ai-1.45.1.dist-info}/RECORD RENAMED Viewed

@@ -6,11 +6,11 @@ src/excel_processing.py,sha256=_vP2q1xEIeyjO8TvZlSTeEM-M1PMceyDSuYGfyZeceY,3361
 src/io.py,sha256=rYjXVLlriEacw1uNuPIYhg12bXNu48Qs9GYMY2YcVTE,5563
 src/llm.py,sha256=OE4IEIqcM-hYK9U7e0x1rAfcqdpeo4iXPHBp64L5Qz0,8199
 src/log_setup.py,sha256=RhHnpXqcl-ii4EJzRt47CF2R-Q3YPF68tepg_Kg7tkw,2895
-src/pdf_processing.py,sha256=7ZNC-OCf3OlvmfzCqrY4Simv_Pofac-mgFyCi7WYUB0,20274
-src/postprocessing/common.py,sha256=KhXDxJ2AKfBrvYovA5ZyvW9IX76EFoTD4L6wnVCzxQ4,23322
+src/pdf_processing.py,sha256=lzvoza9itpEyl-rcBQbIcWuFxUAvF_Qyc-OpuPQWWMk,20354
+src/postprocessing/common.py,sha256=dagAg0hZGuZc03bXdfOolxekewMEVUfz917IGCiAtWI,26118
 src/postprocessing/postprocess_booking_confirmation.py,sha256=nK32eDiBNbauyQz0oCa9eraysku8aqzrcoRFoWVumDU,4827
 src/postprocessing/postprocess_commercial_invoice.py,sha256=3I8ijluTZcOs_sMnFZxfkAPle0UFQ239EMuvZfDZVPg,1028
-src/postprocessing/postprocess_partner_invoice.py,sha256=Hm9frILlIOvCWVcFNpyh0jLi6QEN9eBbHseZShYiISQ,12562
+src/postprocessing/postprocess_partner_invoice.py,sha256=LZcMZfJeLdcbYqPemO8gn9SmJxv-NPmb4uVCT3lKg18,12341
 src/prompts/library/bookingConfirmation/evergreen/placeholders.json,sha256=IpM9nmSPdyroliZfXB1-NDCjiHZX_Ff5BH7-scNhGqE,1406
 src/prompts/library/bookingConfirmation/evergreen/prompt.txt,sha256=5ivskCG831M2scW3oqQaoltXIyHV-n6DYUygWycXxjw,2755
 src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json,sha256=hMPNt9s3LuxR85AxYy7bPcCDleug6gSwVjefm3ismWY,1405
@@ -31,7 +31,7 @@ src/prompts/library/commercialInvoice/other/placeholders.json,sha256=zUK2mg9MnHi
 src/prompts/library/commercialInvoice/other/prompt.txt,sha256=CJapcVrmcvynJUanETDklkzU-0N9hHdhq5wL4MK7OIY,2683
 src/prompts/library/customsAssessment/other/prompt.txt,sha256=XSqWa3k9LM7dTiJtX8AKTp_0x5Z0pCNRKNUWaywwBlY,2191
 src/prompts/library/customsInvoice/other/placeholders.json,sha256=BnWYtl4sPooTHb_EHRIlrPawBrfHI8_QVas8zytbqyY,12172
-src/prompts/library/customsInvoice/other/prompt.txt,sha256=daSRssY8zcboCJCuqbLqehGR5dJs_wp4hOZHRol3KqU,9595
+src/prompts/library/customsInvoice/other/prompt.txt,sha256=1dR73TQZJAfO9dKl-h7VhiJkdli498IV4e5JgBlOoYw,9695
 src/prompts/library/deliveryOrder/other/placeholders.json,sha256=7fjqag3kCVMV4mJ52dTjAcLtaBX0paXrDrW48vQVZSk,1250
 src/prompts/library/deliveryOrder/other/prompt.txt,sha256=y3QjN54e8PplEJngNlxoykbdrToBefS3r8gWixCbjfE,2468
 src/prompts/library/draftMbl/hapag-lloyd/prompt.txt,sha256=4FxiO1eHkimZVQZXU6gGNikuDVAWNniYvY8FUdVhpvk,2327
@@ -44,7 +44,7 @@ src/prompts/library/finalMbL/other/prompt.txt,sha256=pj-kgPV51upLhDppSKfhc2s5ylg
 src/prompts/library/packingList/other/placeholders.json,sha256=cGUUvEFoi4Lm0BAiyD29KbNFbUgzO1s7eit_qK3F0ig,4478
 src/prompts/library/packingList/other/prompt.txt,sha256=6Q9d0KBG6YWmNtzFivvmtQmitaUE2jytfwwc5YwsUgQ,2872
 src/prompts/library/partnerInvoice/other/placeholders.json,sha256=NX6ADT4gxLpP90uoNCYDbmfBvROxxVWRKK0lRFy1n9s,10897
-src/prompts/library/partnerInvoice/other/prompt.txt,sha256=bn1_CXrQy38DI7MXl6r40Cp-70w5cfXY6CQyBntvaX8,7944
+src/prompts/library/partnerInvoice/other/prompt.txt,sha256=vMk-FBq9XkWiFiCf36t43DcIKNYh7IcGAsnfXq8vqio,8052
 src/prompts/library/postprocessing/port_code/placeholders.json,sha256=2TiXf3zSzrglOMPtDOlCntIa5RSvyZQAKG2-IgrCY5A,22
 src/prompts/library/postprocessing/port_code/prompt_port_code.txt,sha256=--1wunSqEr2ox958lEhjO-0JFBfOLzA3qfKYIzG_Iok,884
 src/prompts/library/preprocessing/carrier/placeholders.json,sha256=1UmrQNqBEsjLIpOO-a39Az6bQ_g1lxDGlwqZFU3IEt0,408
@@ -53,7 +53,7 @@ src/prompts/library/shippingInstruction/other/prompt.txt,sha256=dT2e-dPuvuz0rVYp
 src/prompts/prompt_library.py,sha256=VJWHeXN-s501C2GiidIIvQQuZdU6T1R27hE2dKBiI40,2555
 src/setup.py,sha256=M-p5c8M9ejKcSZ9N86VtmtPc4TYLxe1_4_dxf6jpfVc,7262
 src/tms.py,sha256=UXbIo1QE--hIX6NZi5Qyp2R_CP338syrY9pCTPrfgnE,1741
-src/utils.py,sha256=OqEu1apmN428_RgzqjRU5sZdEbECgBH0YiMpyys4Q5E,16947
-data_science_document_ai-1.44.0.dist-info/METADATA,sha256=jLyTuN383EQ-WdVsShEIsoj-t_ubnQ9VTSfSTKV3g9o,2152
-data_science_document_ai-1.44.0.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
-data_science_document_ai-1.44.0.dist-info/RECORD,,
+src/utils.py,sha256=Ow5_Jals88o8mbZ1BoHfZpHZoCfig_UQb5aalH-mpWE,17278
+data_science_document_ai-1.45.1.dist-info/METADATA,sha256=U2ASt9xmLqXeWIDx7cr0LBJFV9yJC4yh398R25jkWvs,2152
+data_science_document_ai-1.45.1.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
+data_science_document_ai-1.45.1.dist-info/RECORD,,

src/pdf_processing.py CHANGED Viewed

@@ -200,6 +200,7 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
         file_content = extract_top_pages(file_content, num_pages=5)
     number_of_pages = get_pdf_page_count(file_content)
+    logger.info(f"processing {input_doc_type} with {number_of_pages} pages...")
     # get the schema placeholder from the Doc AI and generate the response structure
     response_schema = (

src/postprocessing/common.py CHANGED Viewed

@@ -12,7 +12,7 @@ from src.constants import formatting_rules
 from src.io import logger
 from src.postprocessing.postprocess_partner_invoice import process_partner_invoice
 from src.prompts.prompt_library import prompt_library
-from src.utils import get_tms_mappings
+from src.utils import batch_fetch_all_mappings, get_tms_mappings
 tms_domain = os.environ["TMS_DOMAIN"]
@@ -134,8 +134,11 @@ def extract_number(data_field_value):
         formatted_value: string
     """
+    # Remove container size pattern like 20FT, 40HC, etc from 1 x 40HC
+    value = remove_unwanted_patterns(data_field_value)
     formatted_value = ""
-    for c in data_field_value:
+    for c in value:
         if c.isnumeric() or c in [",", ".", "-"]:
             formatted_value += c
@@ -320,9 +323,12 @@ def remove_unwanted_patterns(lineitem: str):
     lineitem = lineitem.replace("HIGH CUBE", "")
     # Remove container size e.g., 20FT, 40HC, etc.
-    lineitem = re.sub(
-        r"\b(20|22|40|45)(FT|HC|DC|HD|GP|OT|RF|FR|TK|DV)?\b", "", lineitem
-    ).strip()
+    pattern = [
+        f"{s}{t}"
+        for s in ("20|22|40|45".split("|"))
+        for t in ("FT|HC|DC|HD|GP|OT|RF|FR|TK|DV".split("|"))
+    ]
+    lineitem = re.sub(r"|".join(pattern), "", lineitem, flags=re.IGNORECASE).strip()
     return lineitem
@@ -372,18 +378,45 @@ def clean_item_description(lineitem: str, remove_numbers: bool = True):
     return re.sub(r"\s{2,}", " ", lineitem).strip()
-async def format_label(entity_k, entity_value, document_type_code, params, mime_type):
+async def format_label(
+    entity_k,
+    entity_value,
+    document_type_code,
+    params,
+    mime_type,
+    container_map,
+    terminal_map,
+    depot_map,
+):
     llm_client = params["LlmClient"]
     if isinstance(entity_value, dict):  # if it's a nested entity
         format_tasks = [
-            format_label(sub_k, sub_v, document_type_code, params, mime_type)
+            format_label(
+                sub_k,
+                sub_v,
+                document_type_code,
+                params,
+                mime_type,
+                container_map,
+                terminal_map,
+                depot_map,
+            )
             for sub_k, sub_v in entity_value.items()
         ]
         return entity_k, {k: v for k, v in await asyncio.gather(*format_tasks)}
     if isinstance(entity_value, list):
         format_tasks = await asyncio.gather(
             *[
-                format_label(entity_k, sub_v, document_type_code, params, mime_type)
+                format_label(
+                    entity_k,
+                    sub_v,
+                    document_type_code,
+                    params,
+                    mime_type,
+                    container_map,
+                    terminal_map,
+                    depot_map,
+                )
                 for sub_v in entity_value
             ]
         )
@@ -405,13 +438,13 @@ async def format_label(entity_k, entity_value, document_type_code, params, mime_
         )
     elif (entity_key == "containertype") or (entity_key == "containersize"):
-        formatted_value = await get_tms_mappings(entity_value, "container_types")
+        formatted_value = container_map.get(entity_value)
     elif check_formatting_rule(entity_k, document_type_code, "terminal"):
-        formatted_value = await get_tms_mappings(entity_value, "terminals")
+        formatted_value = terminal_map.get(entity_value)
     elif check_formatting_rule(entity_k, document_type_code, "depot"):
-        formatted_value = await get_tms_mappings(entity_value, "depots")
+        formatted_value = depot_map.get(entity_value)
     elif entity_key.startswith(("eta", "etd", "duedate", "issuedate", "servicedate")):
         try:
@@ -507,7 +540,8 @@ async def get_port_code_ai(port: str, llm_client, doc_type=None):
     """Get port code using AI model."""
     port_llm = await get_port_code_llm(port, llm_client, doc_type=doc_type)
-    return await get_tms_mappings(port, "ports", port_llm)
+    result = await get_tms_mappings(port, "ports", port_llm)
+    return result.get(port, None)
 async def get_port_code_llm(port: str, llm_client, doc_type=None):
@@ -598,6 +632,74 @@ def decimal_convertor(value, quantity=False):
     return value
+async def collect_mapping_requests(entity_value, document_type_code):
+    """Collect all unique container types, terminals, and depots from the entity value."""
+    # Sets to store unique values
+    container_types = set()
+    terminals = set()
+    depots = set()
+    def walk(key, value):
+        key_lower = key.lower()
+        # nested dict
+        if isinstance(value, dict):
+            for k, v in value.items():
+                walk(k, v)
+        # list of values
+        elif isinstance(value, list):
+            for item in value:
+                walk(key, item)
+        # leaf node
+        else:
+            if key_lower in ("containertype", "containersize"):
+                # Take only "20DV" from ('20DV', 0) if it's a tuple
+                container_types.add(value[0]) if isinstance(
+                    value, tuple
+                ) else container_types.add(value)
+            elif check_formatting_rule(key, document_type_code, "terminal"):
+                terminals.add(value[0]) if isinstance(value, tuple) else terminals.add(
+                    value
+                )
+            elif check_formatting_rule(key, document_type_code, "depot"):
+                depots.add(value[0]) if isinstance(value, tuple) else depots.add(value)
+    walk("root", entity_value)
+    return container_types, terminals, depots
+async def format_all_labels(entity_data, document_type_code, params, mime_type):
+    """Format all labels in the entity data using cached mappings."""
+    # Collect all mapping values needed
+    container_req, terminal_req, depot_req = await collect_mapping_requests(
+        entity_data, document_type_code
+    )
+    # Batch fetch mappings
+    container_map, terminal_map, depot_map = await batch_fetch_all_mappings(
+        container_req, terminal_req, depot_req
+    )
+    # Format labels using cached mappings
+    _, result = await format_label(
+        "root",
+        entity_data,
+        document_type_code,
+        params,
+        mime_type,
+        container_map,
+        terminal_map,
+        depot_map,
+    )
+    return _, result
 async def format_all_entities(result, document_type_code, params, mime_type):
     """Format the entity values in the result dictionary."""
     # Since we treat `customsInvoice` same as `partnerInvoice`
@@ -613,8 +715,8 @@ async def format_all_entities(result, document_type_code, params, mime_type):
         return {}
     # Format all entities recursively
-    _, aggregated_data = await format_label(
-        None, result, document_type_code, params, mime_type
+    _, aggregated_data = await format_all_labels(
+        result, document_type_code, params, mime_type
     )
     # Process partner invoice on lineitem mapping and reverse charge sentence

src/postprocessing/postprocess_partner_invoice.py CHANGED Viewed

@@ -1,7 +1,5 @@
 """This module contains the postprocessing functions for the partner invoice."""
-from concurrent.futures import ThreadPoolExecutor
-from fuzzywuzzy import fuzz
+from rapidfuzz import fuzz, process
 from src.io import logger
 from src.utils import get_tms_mappings
@@ -177,6 +175,7 @@ async def process_line_items_batch(
     pending_line_items = {}
     # Check Fuzzy Matching
+    logger.info(f"Mapping line item codes with Fuzzy matching....")
     for i, item in enumerate(line_items):
         description_obj = item.get("lineItemDescription")
@@ -231,12 +230,6 @@ async def process_line_items_batch(
     return line_items
-def compute_score(args):
-    """Compute the fuzzy matching score between a new line item and a key."""
-    new_lineitem, key = args
-    return key, fuzz.ratio(new_lineitem, key)
 def get_fuzzy_match_score(target: str, sentences: list, threshold: int):
     """Get the best fuzzy match for a target string from a list of candidates.
@@ -249,16 +242,18 @@ def get_fuzzy_match_score(target: str, sentences: list, threshold: int):
         tuple: (best_match, score) if above threshold, else (None, 0)
     """
     # Use multiprocessing to find the best match
-    with ThreadPoolExecutor() as executor:
-        results = executor.map(compute_score, [(target, s) for s in sentences])
+    result = process.extractOne(
+        target, sentences, scorer=fuzz.WRatio, score_cutoff=threshold
+    )
-    # Find the best match and score
-    best_match, best_score = max(results, key=lambda x: x[1], default=(None, 0))
+    if result is None:
+        return None, False
-    # return best_match, best_score
-    # If the best match score is above a threshold (e.g., 80), return it
-    if best_score >= threshold:
-        return best_match, True
+    match, score, index = result
+    # return best_match if the best match score is above a threshold (e.g., 80)
+    if match:
+        return match, True
     return None, False
@@ -290,18 +285,22 @@ def find_matching_lineitem(new_lineitem: str, kvp_dict: dict, threshold=90):
     Returns:
         str: The best matching 'Forto SLI' value from the dictionary.
     """
-    new_lineitem = new_lineitem.upper()
     # Check if the new line item is already in the dictionary
     if new_lineitem in kvp_dict:
         return kvp_dict[new_lineitem]
     # Get the best fuzzy match score for the extracted line item
-    best_match, _ = get_fuzzy_match_score(
-        new_lineitem, list(kvp_dict.keys()), threshold
+    match, _ = get_fuzzy_match_score(
+        new_lineitem,
+        list(kvp_dict.keys()),
+        threshold,
     )
-    return kvp_dict.get(best_match, None)
+    if match:
+        # find the code from the kvp_dict
+        return kvp_dict[match]
+    return None
 async def associate_forto_item_code(line_item_data, params):

src/prompts/library/customsInvoice/other/prompt.txt CHANGED Viewed

@@ -54,7 +54,7 @@ Your role is to accurately extract specific entities from these invoices to supp
     - unitPrice: Even if the quantity is not mentioned, you can still extract the unit price. Check the naming of the columns in a different languages, it can be "Unit Price", "Prezzo unitario", "Prix Unitaire", "Unitario", etc. Refer to "Prezzo unitario" field in the italian invoice example.
     - totalAmount: The total amount for the item. It can be in different currencies, so ensure to capture the currency as well for the totalAmountCurrency.
     - totalAmountEuro: Few line items contains a total amount in Euro. You can find it by looking for the term "Total EUR" or "Amount in Euro" in the line item but it's always in the EURO / € currency. Sometimes, it can be same as totalAmount if the line item is already in Euro.
-    - quantity: The quantity of the item or service provided in the line item.
+    - quantity: The quantity of the item or service provided in the line item. Pay attention to 2 x 40HC or 2x40HC. It means, quantity is 2 and 40HC is containerSize but not 240.
     - containerNumber: Container Number always starts with 4 letters and is followed by 7 digits (e.g., ABCD1234567).
 - hblNumber and mblNumber:

src/prompts/library/partnerInvoice/other/prompt.txt CHANGED Viewed

@@ -52,7 +52,7 @@ Your role is to accurately extract specific entities from these invoices to supp
     - unitPrice: Even if the quantity is not mentioned, you can still extract the unit price. Check the naming of the columns in a different languages, it can be "Unit Price", "Prezzo unitario", "Prix Unitaire", "Unitario", etc. Refer to "Prezzo unitario" field in the italian invoice example.
     - totalAmount: The total amount for the item. It can be in different currencies, so ensure to capture the currency as well for the totalAmountCurrency.
     - totalAmountEuro: Few line items contains a total amount in Euro. You can find it by looking for the term "Total EUR" or "Amount in Euro" in the line item but it's always in the EURO / € currency. Sometimes, it can be same as totalAmount if the line item is already in Euro.
-    - quantity: The quantity of the item or service provided in the line item. Pay attention to 2x40HC. It means, quantity is 2 and containerSize is 40HC but not 240.
+    - quantity: The quantity of the item or service provided in the line item. Pay attention to 2 x 40HC or 2x40HC. It means, quantity is 2 and 40HC is containerSize but not 240.
     - containerNumber: Container Number always starts with 4 letters and is followed by 7 digits (e.g., ABCD1234567, XALU 8593678).
 - hblNumber and mblNumber:
@@ -68,6 +68,7 @@ Your role is to accurately extract specific entities from these invoices to supp
     - Example:
         - "COSCO SHIPPING Lines Italy, Poland, or France S.R.L. – Genova Office – As Agent For COSCO SHIPPING Lines Co.,Ltd."
         - vendorName: COSCO SHIPPING Lines Co.,Ltd.
+    - From Hapag-Lloyd invoices, look for "Ballindamm 25" address to extract the vendorAddress.
 - agentName: Name of the agent. Agencies are offices authorized to act on behalf of a company. This details usually available including the branch name of the parent company name in the invoice.
 - agentKeyWord:

src/utils.py CHANGED Viewed

@@ -406,16 +406,7 @@ async def get_tms_mappings(
             response.raise_for_status()
             # Structure expected: {"response": {"data": {"desc1": "code1", "desc2": "code2"}}}
-            if embedding_type == "line_items":
-                # For line_items, return the full data mapping
-                return response.json().get("response", {}).get("data", {})
-            else:
-                return (
-                    response.json()
-                    .get("response", {})
-                    .get("data", {})
-                    .get(input_list[0], None)
-                )
+            return response.json().get("response", {}).get("data", {})
         except httpx.HTTPStatusError as exc:
             logger.error(
@@ -424,6 +415,25 @@ async def get_tms_mappings(
             return {}
+async def batch_fetch_all_mappings(container_types, terminals, depots):
+    """Batch fetch all mappings for container types, terminals, and depots."""
+    # run batch calls concurrently
+    results = await asyncio.gather(
+        get_tms_mappings(list(container_types), "container_types"),
+        get_tms_mappings(list(terminals), "terminals"),
+        get_tms_mappings(list(depots), "depots"),
+    )
+    batch_container_map, batch_terminal_map, batch_depot_map = results
+    # Convert lists of tuples to dicts if necessary
+    return (
+        dict(batch_container_map or {}),
+        dict(batch_terminal_map or {}),
+        dict(batch_depot_map or {}),
+    )
 def transform_schema_strings(schema):
     """
     Recursively transforms a schema dictionary, replacing all "type": "STRING"

{data_science_document_ai-1.44.0.dist-info → data_science_document_ai-1.45.1.dist-info}/WHEEL RENAMED Viewed

File without changes

data-science-document-ai 1.44.0__py3-none-any.whl → 1.45.1__py3-none-any.whl

data-science-document-ai 1.44.0__py3-none-any.whl → 1.45.1__py3-none-any.whl