PyPI - data-science-document-ai - Versions diffs - 1.44.0__tar.gz → 1.45.0__tar.gz - Mend

data-science-document-ai 1.44.0.tar.gz → 1.45.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (58)

{data_science_document_ai-1.44.0 → data_science_document_ai-1.45.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: data-science-document-ai
-Version: 1.44.0
+Version: 1.45.0
 Summary: "Document AI repo for data science"
 Author: Naomi Nguyen
 Author-email: naomi.nguyen@forto.com

{data_science_document_ai-1.44.0 → data_science_document_ai-1.45.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "data-science-document-ai"
-version = "1.44.0"
+version = "1.45.0"
 description = "\"Document AI repo for data science\""
 authors = ["Naomi Nguyen <naomi.nguyen@forto.com>", "Kumar Rajendrababu <kumar.rajendrababu@forto.com>", "Igor Tonko <igor.tonko@forto.com>", "Osman Demirel <osman.demirel@forto.com>"]
 packages = [

{data_science_document_ai-1.44.0 → data_science_document_ai-1.45.0}/src/pdf_processing.py RENAMED Viewed

@@ -200,6 +200,7 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
         file_content = extract_top_pages(file_content, num_pages=5)
     number_of_pages = get_pdf_page_count(file_content)
+    logger.info(f"processing {input_doc_type} with {number_of_pages} pages...")
     # get the schema placeholder from the Doc AI and generate the response structure
     response_schema = (

{data_science_document_ai-1.44.0 → data_science_document_ai-1.45.0}/src/postprocessing/common.py RENAMED Viewed

@@ -12,7 +12,7 @@ from src.constants import formatting_rules
 from src.io import logger
 from src.postprocessing.postprocess_partner_invoice import process_partner_invoice
 from src.prompts.prompt_library import prompt_library
-from src.utils import get_tms_mappings
+from src.utils import batch_fetch_all_mappings, get_tms_mappings
 tms_domain = os.environ["TMS_DOMAIN"]
@@ -372,18 +372,45 @@ def clean_item_description(lineitem: str, remove_numbers: bool = True):
     return re.sub(r"\s{2,}", " ", lineitem).strip()
-async def format_label(entity_k, entity_value, document_type_code, params, mime_type):
+async def format_label(
+    entity_k,
+    entity_value,
+    document_type_code,
+    params,
+    mime_type,
+    container_map,
+    terminal_map,
+    depot_map,
+):
     llm_client = params["LlmClient"]
     if isinstance(entity_value, dict):  # if it's a nested entity
         format_tasks = [
-            format_label(sub_k, sub_v, document_type_code, params, mime_type)
+            format_label(
+                sub_k,
+                sub_v,
+                document_type_code,
+                params,
+                mime_type,
+                container_map,
+                terminal_map,
+                depot_map,
+            )
             for sub_k, sub_v in entity_value.items()
         ]
         return entity_k, {k: v for k, v in await asyncio.gather(*format_tasks)}
     if isinstance(entity_value, list):
         format_tasks = await asyncio.gather(
             *[
-                format_label(entity_k, sub_v, document_type_code, params, mime_type)
+                format_label(
+                    entity_k,
+                    sub_v,
+                    document_type_code,
+                    params,
+                    mime_type,
+                    container_map,
+                    terminal_map,
+                    depot_map,
+                )
                 for sub_v in entity_value
             ]
         )
@@ -405,13 +432,13 @@ async def format_label(entity_k, entity_value, document_type_code, params, mime_
         )
     elif (entity_key == "containertype") or (entity_key == "containersize"):
-        formatted_value = await get_tms_mappings(entity_value, "container_types")
+        formatted_value = container_map.get(entity_value)
     elif check_formatting_rule(entity_k, document_type_code, "terminal"):
-        formatted_value = await get_tms_mappings(entity_value, "terminals")
+        formatted_value = terminal_map.get(entity_value)
     elif check_formatting_rule(entity_k, document_type_code, "depot"):
-        formatted_value = await get_tms_mappings(entity_value, "depots")
+        formatted_value = depot_map.get(entity_value)
     elif entity_key.startswith(("eta", "etd", "duedate", "issuedate", "servicedate")):
         try:
@@ -507,7 +534,8 @@ async def get_port_code_ai(port: str, llm_client, doc_type=None):
     """Get port code using AI model."""
     port_llm = await get_port_code_llm(port, llm_client, doc_type=doc_type)
-    return await get_tms_mappings(port, "ports", port_llm)
+    result = await get_tms_mappings(port, "ports", port_llm)
+    return result.get(port, None)
 async def get_port_code_llm(port: str, llm_client, doc_type=None):
@@ -598,6 +626,74 @@ def decimal_convertor(value, quantity=False):
     return value
+async def collect_mapping_requests(entity_value, document_type_code):
+    """Collect all unique container types, terminals, and depots from the entity value."""
+    # Sets to store unique values
+    container_types = set()
+    terminals = set()
+    depots = set()
+    def walk(key, value):
+        key_lower = key.lower()
+        # nested dict
+        if isinstance(value, dict):
+            for k, v in value.items():
+                walk(k, v)
+        # list of values
+        elif isinstance(value, list):
+            for item in value:
+                walk(key, item)
+        # leaf node
+        else:
+            if key_lower in ("containertype", "containersize"):
+                # Take only "20DV" from ('20DV', 0) if it's a tuple
+                container_types.add(value[0]) if isinstance(
+                    value, tuple
+                ) else container_types.add(value)
+            elif check_formatting_rule(key, document_type_code, "terminal"):
+                terminals.add(value[0]) if isinstance(value, tuple) else terminals.add(
+                    value
+                )
+            elif check_formatting_rule(key, document_type_code, "depot"):
+                depots.add(value[0]) if isinstance(value, tuple) else depots.add(value)
+    walk("root", entity_value)
+    return container_types, terminals, depots
+async def format_all_labels(entity_data, document_type_code, params, mime_type):
+    """Format all labels in the entity data using cached mappings."""
+    # Collect all mapping values needed
+    container_req, terminal_req, depot_req = await collect_mapping_requests(
+        entity_data, document_type_code
+    )
+    # Batch fetch mappings
+    container_map, terminal_map, depot_map = await batch_fetch_all_mappings(
+        container_req, terminal_req, depot_req
+    )
+    # Format labels using cached mappings
+    _, result = await format_label(
+        "root",
+        entity_data,
+        document_type_code,
+        params,
+        mime_type,
+        container_map,
+        terminal_map,
+        depot_map,
+    )
+    return _, result
 async def format_all_entities(result, document_type_code, params, mime_type):
     """Format the entity values in the result dictionary."""
     # Since we treat `customsInvoice` same as `partnerInvoice`
@@ -613,8 +709,8 @@ async def format_all_entities(result, document_type_code, params, mime_type):
         return {}
     # Format all entities recursively
-    _, aggregated_data = await format_label(
-        None, result, document_type_code, params, mime_type
+    _, aggregated_data = await format_all_labels(
+        result, document_type_code, params, mime_type
     )
     # Process partner invoice on lineitem mapping and reverse charge sentence

{data_science_document_ai-1.44.0 → data_science_document_ai-1.45.0}/src/postprocessing/postprocess_partner_invoice.py RENAMED Viewed

@@ -1,7 +1,5 @@
 """This module contains the postprocessing functions for the partner invoice."""
-from concurrent.futures import ThreadPoolExecutor
-from fuzzywuzzy import fuzz
+from rapidfuzz import fuzz, process
 from src.io import logger
 from src.utils import get_tms_mappings
@@ -177,6 +175,7 @@ async def process_line_items_batch(
     pending_line_items = {}
     # Check Fuzzy Matching
+    logger.info(f"Mapping line item codes with Fuzzy matching....")
     for i, item in enumerate(line_items):
         description_obj = item.get("lineItemDescription")
@@ -231,12 +230,6 @@ async def process_line_items_batch(
     return line_items
-def compute_score(args):
-    """Compute the fuzzy matching score between a new line item and a key."""
-    new_lineitem, key = args
-    return key, fuzz.ratio(new_lineitem, key)
 def get_fuzzy_match_score(target: str, sentences: list, threshold: int):
     """Get the best fuzzy match for a target string from a list of candidates.
@@ -249,16 +242,18 @@ def get_fuzzy_match_score(target: str, sentences: list, threshold: int):
         tuple: (best_match, score) if above threshold, else (None, 0)
     """
     # Use multiprocessing to find the best match
-    with ThreadPoolExecutor() as executor:
-        results = executor.map(compute_score, [(target, s) for s in sentences])
+    result = process.extractOne(
+        target, sentences, scorer=fuzz.WRatio, score_cutoff=threshold
+    )
-    # Find the best match and score
-    best_match, best_score = max(results, key=lambda x: x[1], default=(None, 0))
+    if result is None:
+        return None, False
-    # return best_match, best_score
-    # If the best match score is above a threshold (e.g., 80), return it
-    if best_score >= threshold:
-        return best_match, True
+    match, score, index = result
+    # return best_match if the best match score is above a threshold (e.g., 80)
+    if match:
+        return match, True
     return None, False
@@ -290,18 +285,22 @@ def find_matching_lineitem(new_lineitem: str, kvp_dict: dict, threshold=90):
     Returns:
         str: The best matching 'Forto SLI' value from the dictionary.
     """
-    new_lineitem = new_lineitem.upper()
     # Check if the new line item is already in the dictionary
     if new_lineitem in kvp_dict:
         return kvp_dict[new_lineitem]
     # Get the best fuzzy match score for the extracted line item
-    best_match, _ = get_fuzzy_match_score(
-        new_lineitem, list(kvp_dict.keys()), threshold
+    match, _ = get_fuzzy_match_score(
+        new_lineitem,
+        list(kvp_dict.keys()),
+        threshold,
     )
-    return kvp_dict.get(best_match, None)
+    if match:
+        # find the code from the kvp_dict
+        return kvp_dict[match]
+    return None
 async def associate_forto_item_code(line_item_data, params):

{data_science_document_ai-1.44.0 → data_science_document_ai-1.45.0}/src/utils.py RENAMED Viewed

@@ -406,16 +406,7 @@ async def get_tms_mappings(
             response.raise_for_status()
             # Structure expected: {"response": {"data": {"desc1": "code1", "desc2": "code2"}}}
-            if embedding_type == "line_items":
-                # For line_items, return the full data mapping
-                return response.json().get("response", {}).get("data", {})
-            else:
-                return (
-                    response.json()
-                    .get("response", {})
-                    .get("data", {})
-                    .get(input_list[0], None)
-                )
+            return response.json().get("response", {}).get("data", {})
         except httpx.HTTPStatusError as exc:
             logger.error(
@@ -424,6 +415,25 @@ async def get_tms_mappings(
             return {}
+async def batch_fetch_all_mappings(container_types, terminals, depots):
+    """Batch fetch all mappings for container types, terminals, and depots."""
+    # run batch calls concurrently
+    results = await asyncio.gather(
+        get_tms_mappings(list(container_types), "container_types"),
+        get_tms_mappings(list(terminals), "terminals"),
+        get_tms_mappings(list(depots), "depots"),
+    )
+    batch_container_map, batch_terminal_map, batch_depot_map = results
+    # Convert lists of tuples to dicts if necessary
+    return (
+        dict(batch_container_map or {}),
+        dict(batch_terminal_map or {}),
+        dict(batch_depot_map or {}),
+    )
 def transform_schema_strings(schema):
     """
     Recursively transforms a schema dictionary, replacing all "type": "STRING"