PyPI - data-science-document-ai - Versions diffs - 1.40.3__py3-none-any.whl → 1.51.0__py3-none-any.whl - Mend

data-science-document-ai 1.40.3__py3-none-any.whl → 1.51.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (56) hide show

{data_science_document_ai-1.40.3.dist-info → data_science_document_ai-1.51.0.dist-info}/METADATA +2 -2
data_science_document_ai-1.51.0.dist-info/RECORD +60 -0
src/constants.py +6 -10
src/docai.py +14 -5
src/docai_processor_config.yaml +0 -56
src/excel_processing.py +34 -13
src/io.py +69 -1
src/llm.py +10 -32
src/pdf_processing.py +192 -54
src/postprocessing/common.py +246 -44
src/postprocessing/postprocess_partner_invoice.py +139 -85
src/prompts/library/arrivalNotice/other/placeholders.json +70 -0
src/prompts/library/arrivalNotice/other/prompt.txt +40 -0
src/prompts/library/bookingConfirmation/evergreen/placeholders.json +17 -17
src/prompts/library/bookingConfirmation/evergreen/prompt.txt +1 -0
src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +18 -18
src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +1 -1
src/prompts/library/bookingConfirmation/maersk/placeholders.json +17 -17
src/prompts/library/bookingConfirmation/maersk/prompt.txt +1 -1
src/prompts/library/bookingConfirmation/msc/placeholders.json +17 -17
src/prompts/library/bookingConfirmation/msc/prompt.txt +1 -1
src/prompts/library/bookingConfirmation/oocl/placeholders.json +17 -17
src/prompts/library/bookingConfirmation/oocl/prompt.txt +3 -1
src/prompts/library/bookingConfirmation/other/placeholders.json +17 -17
src/prompts/library/bookingConfirmation/other/prompt.txt +1 -1
src/prompts/library/bookingConfirmation/yangming/placeholders.json +17 -17
src/prompts/library/bookingConfirmation/yangming/prompt.txt +1 -1
src/prompts/library/bundeskasse/other/placeholders.json +25 -25
src/prompts/library/bundeskasse/other/prompt.txt +8 -6
src/prompts/library/commercialInvoice/other/placeholders.json +125 -0
src/prompts/library/commercialInvoice/other/prompt.txt +2 -1
src/prompts/library/customsAssessment/other/placeholders.json +67 -16
src/prompts/library/customsAssessment/other/prompt.txt +24 -37
src/prompts/library/customsInvoice/other/placeholders.json +20 -20
src/prompts/library/customsInvoice/other/prompt.txt +4 -4
src/prompts/library/deliveryOrder/other/placeholders.json +79 -28
src/prompts/library/deliveryOrder/other/prompt.txt +26 -40
src/prompts/library/draftMbl/other/placeholders.json +33 -33
src/prompts/library/draftMbl/other/prompt.txt +34 -44
src/prompts/library/finalMbL/other/placeholders.json +34 -34
src/prompts/library/finalMbL/other/prompt.txt +34 -44
src/prompts/library/packingList/other/placeholders.json +98 -0
src/prompts/library/packingList/other/prompt.txt +1 -1
src/prompts/library/partnerInvoice/other/placeholders.json +2 -23
src/prompts/library/partnerInvoice/other/prompt.txt +7 -18
src/prompts/library/preprocessing/carrier/placeholders.json +0 -16
src/prompts/library/shippingInstruction/other/placeholders.json +115 -0
src/prompts/library/shippingInstruction/other/prompt.txt +28 -15
src/setup.py +13 -16
src/utils.py +157 -45
data_science_document_ai-1.40.3.dist-info/RECORD +0 -59
src/prompts/library/draftMbl/hapag-lloyd/prompt.txt +0 -44
src/prompts/library/draftMbl/maersk/prompt.txt +0 -17
src/prompts/library/finalMbL/hapag-lloyd/prompt.txt +0 -44
src/prompts/library/finalMbL/maersk/prompt.txt +0 -17
{data_science_document_ai-1.40.3.dist-info → data_science_document_ai-1.51.0.dist-info}/WHEEL +0 -0

src/postprocessing/common.py CHANGED Viewed

@@ -12,7 +12,7 @@ from src.constants import formatting_rules
 from src.io import logger
 from src.postprocessing.postprocess_partner_invoice import process_partner_invoice
 from src.prompts.prompt_library import prompt_library
-from src.utils import get_tms_mappings
+from src.utils import batch_fetch_all_mappings, get_tms_mappings
 tms_domain = os.environ["TMS_DOMAIN"]
@@ -84,16 +84,16 @@ def clean_shipment_id(shipment_id):
     """
     if not shipment_id:
         return
-    # '#S123456@-1' -> 'S123456'
-    # Find the pattern of a shipment ID that starts with 'S' followed by 5 to 7 digits
-    match = re.findall(r"S\d{5,7}", shipment_id)
+    # '#S1234565@-1' -> 'S1234565'
+    # Find the pattern of a shipment ID that starts with 'S' followed by 7 to 8 digits
+    match = re.findall(r"S\d{6,8}", shipment_id)
     stripped_value = match[0] if match else None
     if not stripped_value:
         return None
     # Check if length is valid (should be either 7 or 8)
-    if len(stripped_value) not in (6, 7, 8):
+    if len(stripped_value) not in (7, 8, 9):
         return None
     return stripped_value
@@ -134,9 +134,12 @@ def extract_number(data_field_value):
         formatted_value: string
     """
+    # Remove container size pattern like 20FT, 40HC, etc from 1 x 40HC
+    value = remove_unwanted_patterns(data_field_value)
     formatted_value = ""
-    for c in data_field_value:
-        if c.isnumeric() or c in [",", "."]:
+    for c in value:
+        if c.isnumeric() or c in [",", ".", "-"]:
             formatted_value += c
     # First and last characters should not be  [",", "."]
@@ -319,6 +322,14 @@ def remove_unwanted_patterns(lineitem: str):
     # Remove "HIGH CUBE"
     lineitem = lineitem.replace("HIGH CUBE", "")
+    # Remove container size e.g., 20FT, 40HC, etc.
+    pattern = [
+        f"{s}{t}"
+        for s in ("20|22|40|45".split("|"))
+        for t in ("FT|HC|DC|HD|GP|OT|RF|FR|TK|DV".split("|"))
+    ]
+    lineitem = re.sub(r"|".join(pattern), "", lineitem, flags=re.IGNORECASE).strip()
     return lineitem
@@ -349,51 +360,91 @@ def clean_item_description(lineitem: str, remove_numbers: bool = True):
     # Remove the currency codes
     lineitem = re.sub(currency_codes_pattern, "", lineitem, flags=re.IGNORECASE)
+    # remove other patterns
+    lineitem = remove_unwanted_patterns(lineitem)
     # Remove numbers from the line item
     if (
         remove_numbers
     ):  # Do not remove numbers for the reverse charge sentence as it contains Article number
         lineitem = re.sub(r"\d+", "", lineitem)
-    # remove other patterns
-    lineitem = remove_unwanted_patterns(lineitem)
     # remove special chars
     lineitem = re.sub(r"[^A-Za-z0-9\s]", " ", lineitem).strip()
+    # Remove x from lineitem like 10 x
+    lineitem = re.sub(r"\b[xX]\b", " ", lineitem).strip()
     return re.sub(r"\s{2,}", " ", lineitem).strip()
-async def format_label(entity_k, entity_value, document_type_code, params):
+async def format_label(
+    entity_k,
+    entity_value,
+    document_type_code,
+    params,
+    mime_type,
+    container_map,
+    terminal_map,
+    depot_map,
+):
     llm_client = params["LlmClient"]
     if isinstance(entity_value, dict):  # if it's a nested entity
         format_tasks = [
-            format_label(sub_k, sub_v, document_type_code, params)
+            format_label(
+                sub_k,
+                sub_v,
+                document_type_code,
+                params,
+                mime_type,
+                container_map,
+                terminal_map,
+                depot_map,
+            )
             for sub_k, sub_v in entity_value.items()
         ]
         return entity_k, {k: v for k, v in await asyncio.gather(*format_tasks)}
     if isinstance(entity_value, list):
         format_tasks = await asyncio.gather(
             *[
-                format_label(entity_k, sub_v, document_type_code, params)
+                format_label(
+                    entity_k,
+                    sub_v,
+                    document_type_code,
+                    params,
+                    mime_type,
+                    container_map,
+                    terminal_map,
+                    depot_map,
+                )
                 for sub_v in entity_value
             ]
         )
         return entity_k, [v for _, v in format_tasks]
+    if mime_type == "application/pdf":
+        if isinstance(entity_value, tuple):
+            page = entity_value[1]
+            entity_value = entity_value[0]
+        else:
+            page = -1
     entity_key = entity_k.lower()
     formatted_value = None
     if entity_key.startswith("port"):
-        formatted_value = await get_port_code_ai(entity_value, llm_client)
+        formatted_value = await get_port_code_ai(
+            entity_value, llm_client, doc_type=document_type_code
+        )
     elif (entity_key == "containertype") or (entity_key == "containersize"):
-        formatted_value = get_tms_mappings(entity_value, "container_types")
+        formatted_value = container_map.get(entity_value)
     elif check_formatting_rule(entity_k, document_type_code, "terminal"):
-        formatted_value = get_tms_mappings(entity_value, "terminals")
+        formatted_value = terminal_map.get(entity_value)
     elif check_formatting_rule(entity_k, document_type_code, "depot"):
-        formatted_value = get_tms_mappings(entity_value, "depots")
+        formatted_value = depot_map.get(entity_value)
     elif entity_key.startswith(("eta", "etd", "duedate", "issuedate", "servicedate")):
         try:
@@ -414,11 +465,14 @@ async def format_label(entity_k, entity_value, document_type_code, params):
         except ValueError as e:
             logger.info(f"ParserError: {e}")
-    elif entity_key in ["invoicenumber", "creditnoteinvoicenumber"]:
+    elif (
+        entity_key in ["invoicenumber", "creditnoteinvoicenumber"]
+        and document_type_code == "bundeskasse"
+    ):
         formatted_value = clean_invoice_number(entity_value)
     elif entity_key in ("shipmentid", "partnerreference"):
-        # Clean the shipment ID to match Forto's standard (starts with 'S' followed by 5 to 7 digits)
+        # Clean the shipment ID to match Forto's standard (starts with 'S' followed by 7 or 8 digits)
         formatted_value = clean_shipment_id(entity_value)
     elif entity_key == "containernumber":
@@ -446,10 +500,19 @@ async def format_label(entity_k, entity_value, document_type_code, params):
     elif "reversechargesentence" in entity_key:
         formatted_value = clean_item_description(entity_value, remove_numbers=False)
+    elif "quantity" in entity_key:
+        if document_type_code in ["partnerInvoice", "customsInvoice", "bundeskasse"]:
+            # For partner invoice, quantity can be mentioned as whole number
+            # Apply decimal convertor for 46,45 --> 46.45 but not for 1.000 --> 1000
+            formatted_value = decimal_convertor(
+                extract_number(entity_value), quantity=True
+            )
+        else:
+            formatted_value = extract_number(entity_value)
     elif any(
         numeric_indicator in entity_key
         for numeric_indicator in [
-            "quantity",
             "value",
             "amount",
             "price",
@@ -467,17 +530,21 @@ async def format_label(entity_k, entity_value, document_type_code, params):
         "documentValue": entity_value,
         "formattedValue": formatted_value,
     }
+    if mime_type == "application/pdf":
+        result["page"] = page
     return entity_k, result
-async def get_port_code_ai(port: str, llm_client):
+async def get_port_code_ai(port: str, llm_client, doc_type=None):
     """Get port code using AI model."""
-    port_llm = await get_port_code_llm(port, llm_client)
+    port_llm = await get_port_code_llm(port, llm_client, doc_type=doc_type)
-    return get_tms_mappings(port, "ports", port_llm)
+    result = await get_tms_mappings(port, "ports", port_llm)
+    return result.get(port, None)
-async def get_port_code_llm(port: str, llm_client):
+async def get_port_code_llm(port: str, llm_client, doc_type=None):
     if (
         "postprocessing" in prompt_library.library.keys()
         and "port_code" in prompt_library.library["postprocessing"].keys()
@@ -504,7 +571,7 @@ async def get_port_code_llm(port: str, llm_client):
         }
         response = await llm_client.get_unified_json_genai(
-            prompt, response_schema=response_schema, model="chatgpt"
+            prompt, response_schema=response_schema, model="chatgpt", doc_type=doc_type
         )
         try:
             mapped_port = response["port"]
@@ -514,7 +581,7 @@ async def get_port_code_llm(port: str, llm_client):
             return None
-def decimal_convertor(value):
+def decimal_convertor(value, quantity=False):
     """Convert EU values to English values."""
     if value is None:
         return None
@@ -522,30 +589,118 @@ def decimal_convertor(value):
     # Remove spaces
     value = value.strip().replace(" ", "")
-    # Convert comma to dot for decimal point (e.g., 4.123,45 -> 4123.45)
-    if re.match(r"^\d{1,3}(\.\d{3})*,\d{1,2}$", value):
-        value = value.replace(".", "").replace(",", ".")
+    # Check "-" and remove it for processing
+    is_negative, value = (True, value[1:]) if value.startswith("-") else (False, value)
+    if not quantity:
+        # Convert comma to dot for decimal point (e.g., 4.123,45 -> 4123.45)
+        if re.match(r"^\d{1,3}(\.\d{3})*,\d{1,2}$", value):
+            value = value.replace(".", "").replace(",", ".")
+        # European style integer with thousand separator: 2.500
+        elif re.match(r"^\d{1,3}(\.\d{3})+$", value):
+            value = value.replace(".", "")
-    # European style integer with thousand separator: 2.500
-    elif re.match(r"^\d{1,3}(\.\d{3})+$", value):
-        value = value.replace(".", "")
+        # Format english values as well for consistency (e.g., 4,123.45 -> 4123.45)
+        elif re.match(r"^\d{1,3}(,\d{3})*\.\d{1,2}$", value):
+            value = value.replace(",", "")
-    # Format english values as well for consistency (e.g., 4,123.45 -> 4123.45)
-    elif re.match(r"^\d{1,3}(,\d{3})*\.\d{1,2}$", value):
-        value = value.replace(",", "")
+        # English style integer with thousand separator: 2,500
+        elif re.match(r"^\d{1,3}(,\d{3})+$", value):
+            value = value.replace(",", "")
-    # English style integer with thousand separator: 2,500
-    elif re.match(r"^\d{1,3}(,\d{3})+$", value):
-        value = value.replace(",", "")
+        # Just replace comma decimals with dot (e.g., 65,45 -> 65.45)
+        if re.match(r"^\d+,\d{1,2}$", value):
+            value = value.replace(",", ".")
-    # Just replace comma decimals with dot (e.g., 65,45 -> 65.45)
-    elif re.match(r"^\d+,\d{1,2}$", value):
-        value = value.replace(",", ".")
+        # If there are more than 3 0s after decimal point, consider only 2 decimal points (e.g., 8.500000 -> 8.50)
+        elif re.match(r"^\d+\.\d{3,}$", value):
+            value = value[: value.index(".") + 3]
+    else:  # quantity=True → only last two
+        # Just replace comma decimals with dot (e.g., 65,45 -> 65.45)
+        if re.match(r"^\d+,\d{1,2}$", value):
+            value = value.replace(",", ".")
+        # If there are more than 3 0s after decimal point, consider only 2 decimal points (e.g., 8.500000 -> 8.50)
+        elif re.match(r"^\d+\.\d{3,}$", value):
+            value = value[: value.index(".") + 3]
+    # Re-add negative sign if applicable
+    value = "-" + value if is_negative else value
     return value
-async def format_all_entities(result, document_type_code, params):
+async def collect_mapping_requests(entity_value, document_type_code):
+    """Collect all unique container types, terminals, and depots from the entity value."""
+    # Sets to store unique values
+    container_types = set()
+    terminals = set()
+    depots = set()
+    def walk(key, value):
+        key_lower = key.lower()
+        # nested dict
+        if isinstance(value, dict):
+            for k, v in value.items():
+                walk(k, v)
+        # list of values
+        elif isinstance(value, list):
+            for item in value:
+                walk(key, item)
+        # leaf node
+        else:
+            if key_lower in ("containertype", "containersize"):
+                # Take only "20DV" from ('20DV', 0) if it's a tuple
+                container_types.add(value[0]) if isinstance(
+                    value, tuple
+                ) else container_types.add(value)
+            elif check_formatting_rule(key, document_type_code, "terminal"):
+                terminals.add(value[0]) if isinstance(value, tuple) else terminals.add(
+                    value
+                )
+            elif check_formatting_rule(key, document_type_code, "depot"):
+                depots.add(value[0]) if isinstance(value, tuple) else depots.add(value)
+    walk("root", entity_value)
+    return container_types, terminals, depots
+async def format_all_labels(entity_data, document_type_code, params, mime_type):
+    """Format all labels in the entity data using cached mappings."""
+    # Collect all mapping values needed
+    container_req, terminal_req, depot_req = await collect_mapping_requests(
+        entity_data, document_type_code
+    )
+    # Batch fetch mappings
+    container_map, terminal_map, depot_map = await batch_fetch_all_mappings(
+        container_req, terminal_req, depot_req
+    )
+    # Format labels using cached mappings
+    _, result = await format_label(
+        "root",
+        entity_data,
+        document_type_code,
+        params,
+        mime_type,
+        container_map,
+        terminal_map,
+        depot_map,
+    )
+    return _, result
+async def format_all_entities(result, document_type_code, params, mime_type):
     """Format the entity values in the result dictionary."""
     # Since we treat `customsInvoice` same as `partnerInvoice`
     document_type_code = (
@@ -560,11 +715,13 @@ async def format_all_entities(result, document_type_code, params):
         return {}
     # Format all entities recursively
-    _, aggregated_data = await format_label(None, result, document_type_code, params)
+    _, aggregated_data = await format_all_labels(
+        result, document_type_code, params, mime_type
+    )
     # Process partner invoice on lineitem mapping and reverse charge sentence
     if document_type_code in ["partnerInvoice", "bundeskasse"]:
-        process_partner_invoice(params, aggregated_data, document_type_code)
+        await process_partner_invoice(params, aggregated_data, document_type_code)
     logger.info("Data Extraction completed successfully")
     return aggregated_data
@@ -594,3 +751,48 @@ def remove_stop_words(lineitem: str):
         .upper()
         .strip()
     )
+def llm_prediction_to_tuples(llm_prediction, number_of_pages=-1, page_number=None):
+    """Convert LLM prediction dictionary to tuples of (value, page_number)."""
+    # If only 1 page, simply pair each value with page number 0
+    if number_of_pages == 1:
+        effective_page = 0 if page_number is None else page_number
+        if isinstance(llm_prediction, dict):
+            return {
+                k: llm_prediction_to_tuples(
+                    v, number_of_pages, page_number=effective_page
+                )
+                for k, v in llm_prediction.items()
+            }
+        elif isinstance(llm_prediction, list):
+            return [
+                llm_prediction_to_tuples(v, number_of_pages, page_number=effective_page)
+                for v in llm_prediction
+            ]
+        else:
+            return (llm_prediction, effective_page) if llm_prediction else None
+    # logic for multi-page predictions
+    if isinstance(llm_prediction, dict):
+        if "page_number" in llm_prediction.keys() and "value" in llm_prediction.keys():
+            if llm_prediction["value"]:
+                try:
+                    _page_number = int(llm_prediction["page_number"])
+                except:  # noqa: E722
+                    _page_number = -1
+                return (llm_prediction["value"], _page_number)
+            return None
+        for key, value in llm_prediction.items():
+            llm_prediction[key] = llm_prediction_to_tuples(
+                llm_prediction.get(key, value), number_of_pages, page_number
+            )
+    elif isinstance(llm_prediction, list):
+        for i, item in enumerate(llm_prediction):
+            llm_prediction[i] = llm_prediction_to_tuples(
+                item, number_of_pages, page_number
+            )
+    return llm_prediction

data-science-document-ai 1.40.3__py3-none-any.whl → 1.51.0__py3-none-any.whl

data-science-document-ai 1.40.3__py3-none-any.whl → 1.51.0__py3-none-any.whl