data-science-document-ai 1.44.0__tar.gz → 1.45.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. {data_science_document_ai-1.44.0 → data_science_document_ai-1.45.0}/PKG-INFO +1 -1
  2. {data_science_document_ai-1.44.0 → data_science_document_ai-1.45.0}/pyproject.toml +1 -1
  3. {data_science_document_ai-1.44.0 → data_science_document_ai-1.45.0}/src/pdf_processing.py +1 -0
  4. {data_science_document_ai-1.44.0 → data_science_document_ai-1.45.0}/src/postprocessing/common.py +106 -10
  5. {data_science_document_ai-1.44.0 → data_science_document_ai-1.45.0}/src/postprocessing/postprocess_partner_invoice.py +21 -22
  6. {data_science_document_ai-1.44.0 → data_science_document_ai-1.45.0}/src/utils.py +20 -10
  7. {data_science_document_ai-1.44.0 → data_science_document_ai-1.45.0}/src/constants.py +0 -0
  8. {data_science_document_ai-1.44.0 → data_science_document_ai-1.45.0}/src/constants_sandbox.py +0 -0
  9. {data_science_document_ai-1.44.0 → data_science_document_ai-1.45.0}/src/docai.py +0 -0
  10. {data_science_document_ai-1.44.0 → data_science_document_ai-1.45.0}/src/docai_processor_config.yaml +0 -0
  11. {data_science_document_ai-1.44.0 → data_science_document_ai-1.45.0}/src/excel_processing.py +0 -0
  12. {data_science_document_ai-1.44.0 → data_science_document_ai-1.45.0}/src/io.py +0 -0
  13. {data_science_document_ai-1.44.0 → data_science_document_ai-1.45.0}/src/llm.py +0 -0
  14. {data_science_document_ai-1.44.0 → data_science_document_ai-1.45.0}/src/log_setup.py +0 -0
  15. {data_science_document_ai-1.44.0 → data_science_document_ai-1.45.0}/src/postprocessing/postprocess_booking_confirmation.py +0 -0
  16. {data_science_document_ai-1.44.0 → data_science_document_ai-1.45.0}/src/postprocessing/postprocess_commercial_invoice.py +0 -0
  17. {data_science_document_ai-1.44.0 → data_science_document_ai-1.45.0}/src/prompts/library/bookingConfirmation/evergreen/placeholders.json +0 -0
  18. {data_science_document_ai-1.44.0 → data_science_document_ai-1.45.0}/src/prompts/library/bookingConfirmation/evergreen/prompt.txt +0 -0
  19. {data_science_document_ai-1.44.0 → data_science_document_ai-1.45.0}/src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +0 -0
  20. {data_science_document_ai-1.44.0 → data_science_document_ai-1.45.0}/src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +0 -0
  21. {data_science_document_ai-1.44.0 → data_science_document_ai-1.45.0}/src/prompts/library/bookingConfirmation/maersk/placeholders.json +0 -0
  22. {data_science_document_ai-1.44.0 → data_science_document_ai-1.45.0}/src/prompts/library/bookingConfirmation/maersk/prompt.txt +0 -0
  23. {data_science_document_ai-1.44.0 → data_science_document_ai-1.45.0}/src/prompts/library/bookingConfirmation/msc/placeholders.json +0 -0
  24. {data_science_document_ai-1.44.0 → data_science_document_ai-1.45.0}/src/prompts/library/bookingConfirmation/msc/prompt.txt +0 -0
  25. {data_science_document_ai-1.44.0 → data_science_document_ai-1.45.0}/src/prompts/library/bookingConfirmation/oocl/placeholders.json +0 -0
  26. {data_science_document_ai-1.44.0 → data_science_document_ai-1.45.0}/src/prompts/library/bookingConfirmation/oocl/prompt.txt +0 -0
  27. {data_science_document_ai-1.44.0 → data_science_document_ai-1.45.0}/src/prompts/library/bookingConfirmation/other/placeholders.json +0 -0
  28. {data_science_document_ai-1.44.0 → data_science_document_ai-1.45.0}/src/prompts/library/bookingConfirmation/other/prompt.txt +0 -0
  29. {data_science_document_ai-1.44.0 → data_science_document_ai-1.45.0}/src/prompts/library/bookingConfirmation/yangming/placeholders.json +0 -0
  30. {data_science_document_ai-1.44.0 → data_science_document_ai-1.45.0}/src/prompts/library/bookingConfirmation/yangming/prompt.txt +0 -0
  31. {data_science_document_ai-1.44.0 → data_science_document_ai-1.45.0}/src/prompts/library/bundeskasse/other/placeholders.json +0 -0
  32. {data_science_document_ai-1.44.0 → data_science_document_ai-1.45.0}/src/prompts/library/bundeskasse/other/prompt.txt +0 -0
  33. {data_science_document_ai-1.44.0 → data_science_document_ai-1.45.0}/src/prompts/library/commercialInvoice/other/placeholders.json +0 -0
  34. {data_science_document_ai-1.44.0 → data_science_document_ai-1.45.0}/src/prompts/library/commercialInvoice/other/prompt.txt +0 -0
  35. {data_science_document_ai-1.44.0 → data_science_document_ai-1.45.0}/src/prompts/library/customsAssessment/other/prompt.txt +0 -0
  36. {data_science_document_ai-1.44.0 → data_science_document_ai-1.45.0}/src/prompts/library/customsInvoice/other/placeholders.json +0 -0
  37. {data_science_document_ai-1.44.0 → data_science_document_ai-1.45.0}/src/prompts/library/customsInvoice/other/prompt.txt +0 -0
  38. {data_science_document_ai-1.44.0 → data_science_document_ai-1.45.0}/src/prompts/library/deliveryOrder/other/placeholders.json +0 -0
  39. {data_science_document_ai-1.44.0 → data_science_document_ai-1.45.0}/src/prompts/library/deliveryOrder/other/prompt.txt +0 -0
  40. {data_science_document_ai-1.44.0 → data_science_document_ai-1.45.0}/src/prompts/library/draftMbl/hapag-lloyd/prompt.txt +0 -0
  41. {data_science_document_ai-1.44.0 → data_science_document_ai-1.45.0}/src/prompts/library/draftMbl/maersk/prompt.txt +0 -0
  42. {data_science_document_ai-1.44.0 → data_science_document_ai-1.45.0}/src/prompts/library/draftMbl/other/placeholders.json +0 -0
  43. {data_science_document_ai-1.44.0 → data_science_document_ai-1.45.0}/src/prompts/library/draftMbl/other/prompt.txt +0 -0
  44. {data_science_document_ai-1.44.0 → data_science_document_ai-1.45.0}/src/prompts/library/finalMbL/hapag-lloyd/prompt.txt +0 -0
  45. {data_science_document_ai-1.44.0 → data_science_document_ai-1.45.0}/src/prompts/library/finalMbL/maersk/prompt.txt +0 -0
  46. {data_science_document_ai-1.44.0 → data_science_document_ai-1.45.0}/src/prompts/library/finalMbL/other/prompt.txt +0 -0
  47. {data_science_document_ai-1.44.0 → data_science_document_ai-1.45.0}/src/prompts/library/packingList/other/placeholders.json +0 -0
  48. {data_science_document_ai-1.44.0 → data_science_document_ai-1.45.0}/src/prompts/library/packingList/other/prompt.txt +0 -0
  49. {data_science_document_ai-1.44.0 → data_science_document_ai-1.45.0}/src/prompts/library/partnerInvoice/other/placeholders.json +0 -0
  50. {data_science_document_ai-1.44.0 → data_science_document_ai-1.45.0}/src/prompts/library/partnerInvoice/other/prompt.txt +0 -0
  51. {data_science_document_ai-1.44.0 → data_science_document_ai-1.45.0}/src/prompts/library/postprocessing/port_code/placeholders.json +0 -0
  52. {data_science_document_ai-1.44.0 → data_science_document_ai-1.45.0}/src/prompts/library/postprocessing/port_code/prompt_port_code.txt +0 -0
  53. {data_science_document_ai-1.44.0 → data_science_document_ai-1.45.0}/src/prompts/library/preprocessing/carrier/placeholders.json +0 -0
  54. {data_science_document_ai-1.44.0 → data_science_document_ai-1.45.0}/src/prompts/library/preprocessing/carrier/prompt.txt +0 -0
  55. {data_science_document_ai-1.44.0 → data_science_document_ai-1.45.0}/src/prompts/library/shippingInstruction/other/prompt.txt +0 -0
  56. {data_science_document_ai-1.44.0 → data_science_document_ai-1.45.0}/src/prompts/prompt_library.py +0 -0
  57. {data_science_document_ai-1.44.0 → data_science_document_ai-1.45.0}/src/setup.py +0 -0
  58. {data_science_document_ai-1.44.0 → data_science_document_ai-1.45.0}/src/tms.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: data-science-document-ai
3
- Version: 1.44.0
3
+ Version: 1.45.0
4
4
  Summary: "Document AI repo for data science"
5
5
  Author: Naomi Nguyen
6
6
  Author-email: naomi.nguyen@forto.com
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "data-science-document-ai"
3
- version = "1.44.0"
3
+ version = "1.45.0"
4
4
  description = "\"Document AI repo for data science\""
5
5
  authors = ["Naomi Nguyen <naomi.nguyen@forto.com>", "Kumar Rajendrababu <kumar.rajendrababu@forto.com>", "Igor Tonko <igor.tonko@forto.com>", "Osman Demirel <osman.demirel@forto.com>"]
6
6
  packages = [
@@ -200,6 +200,7 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
200
200
  file_content = extract_top_pages(file_content, num_pages=5)
201
201
 
202
202
  number_of_pages = get_pdf_page_count(file_content)
203
+ logger.info(f"processing {input_doc_type} with {number_of_pages} pages...")
203
204
 
204
205
  # get the schema placeholder from the Doc AI and generate the response structure
205
206
  response_schema = (
@@ -12,7 +12,7 @@ from src.constants import formatting_rules
12
12
  from src.io import logger
13
13
  from src.postprocessing.postprocess_partner_invoice import process_partner_invoice
14
14
  from src.prompts.prompt_library import prompt_library
15
- from src.utils import get_tms_mappings
15
+ from src.utils import batch_fetch_all_mappings, get_tms_mappings
16
16
 
17
17
  tms_domain = os.environ["TMS_DOMAIN"]
18
18
 
@@ -372,18 +372,45 @@ def clean_item_description(lineitem: str, remove_numbers: bool = True):
372
372
  return re.sub(r"\s{2,}", " ", lineitem).strip()
373
373
 
374
374
 
375
- async def format_label(entity_k, entity_value, document_type_code, params, mime_type):
375
+ async def format_label(
376
+ entity_k,
377
+ entity_value,
378
+ document_type_code,
379
+ params,
380
+ mime_type,
381
+ container_map,
382
+ terminal_map,
383
+ depot_map,
384
+ ):
376
385
  llm_client = params["LlmClient"]
377
386
  if isinstance(entity_value, dict): # if it's a nested entity
378
387
  format_tasks = [
379
- format_label(sub_k, sub_v, document_type_code, params, mime_type)
388
+ format_label(
389
+ sub_k,
390
+ sub_v,
391
+ document_type_code,
392
+ params,
393
+ mime_type,
394
+ container_map,
395
+ terminal_map,
396
+ depot_map,
397
+ )
380
398
  for sub_k, sub_v in entity_value.items()
381
399
  ]
382
400
  return entity_k, {k: v for k, v in await asyncio.gather(*format_tasks)}
383
401
  if isinstance(entity_value, list):
384
402
  format_tasks = await asyncio.gather(
385
403
  *[
386
- format_label(entity_k, sub_v, document_type_code, params, mime_type)
404
+ format_label(
405
+ entity_k,
406
+ sub_v,
407
+ document_type_code,
408
+ params,
409
+ mime_type,
410
+ container_map,
411
+ terminal_map,
412
+ depot_map,
413
+ )
387
414
  for sub_v in entity_value
388
415
  ]
389
416
  )
@@ -405,13 +432,13 @@ async def format_label(entity_k, entity_value, document_type_code, params, mime_
405
432
  )
406
433
 
407
434
  elif (entity_key == "containertype") or (entity_key == "containersize"):
408
- formatted_value = await get_tms_mappings(entity_value, "container_types")
435
+ formatted_value = container_map.get(entity_value)
409
436
 
410
437
  elif check_formatting_rule(entity_k, document_type_code, "terminal"):
411
- formatted_value = await get_tms_mappings(entity_value, "terminals")
438
+ formatted_value = terminal_map.get(entity_value)
412
439
 
413
440
  elif check_formatting_rule(entity_k, document_type_code, "depot"):
414
- formatted_value = await get_tms_mappings(entity_value, "depots")
441
+ formatted_value = depot_map.get(entity_value)
415
442
 
416
443
  elif entity_key.startswith(("eta", "etd", "duedate", "issuedate", "servicedate")):
417
444
  try:
@@ -507,7 +534,8 @@ async def get_port_code_ai(port: str, llm_client, doc_type=None):
507
534
  """Get port code using AI model."""
508
535
  port_llm = await get_port_code_llm(port, llm_client, doc_type=doc_type)
509
536
 
510
- return await get_tms_mappings(port, "ports", port_llm)
537
+ result = await get_tms_mappings(port, "ports", port_llm)
538
+ return result.get(port, None)
511
539
 
512
540
 
513
541
  async def get_port_code_llm(port: str, llm_client, doc_type=None):
@@ -598,6 +626,74 @@ def decimal_convertor(value, quantity=False):
598
626
  return value
599
627
 
600
628
 
629
async def collect_mapping_requests(entity_value, document_type_code):
    """Collect the unique container types, terminals, and depots to look up.

    Recursively walks ``entity_value`` (dicts, lists, and leaf values) and
    sorts every leaf into one of three sets according to the key it sits under.

    Args:
        entity_value: Nested extraction result made of dicts, lists and leaves.
        document_type_code: Document type passed on to ``check_formatting_rule``
            to decide whether a key is a terminal or a depot field.

    Returns:
        tuple: ``(container_types, terminals, depots)``, each a ``set`` of the
        distinct leaf values found for that category.
    """
    container_types = set()
    terminals = set()
    depots = set()

    def bucket_for(key):
        """Return the set that leaves stored under *key* belong to, or None."""
        if key.lower() in ("containertype", "containersize"):
            return container_types
        if check_formatting_rule(key, document_type_code, "terminal"):
            return terminals
        if check_formatting_rule(key, document_type_code, "depot"):
            return depots
        return None

    def walk(key, value):
        """Descend into containers and record leaves in the matching set."""
        if isinstance(value, dict):
            for sub_key, sub_value in value.items():
                walk(sub_key, sub_value)
            return

        if isinstance(value, list):
            # List items inherit the key of the list they belong to.
            for item in value:
                walk(key, item)
            return

        # Take only "20DV" from ('20DV', 0) if the leaf is a tuple.
        if isinstance(value, tuple):
            value = value[0] if value else None

        # Missing or empty leaves cannot be mapped, so do not request them.
        if value is None or value == "":
            return

        bucket = bucket_for(key)
        if bucket is not None:
            bucket.add(value)

    walk("root", entity_value)

    return container_types, terminals, depots
668
+
669
+
670
async def format_all_labels(entity_data, document_type_code, params, mime_type):
    """Format every label in ``entity_data`` with pre-fetched TMS mappings.

    The container type, terminal, and depot values are first gathered from the
    whole entity tree and resolved in one batch; the resulting lookup dicts are
    then handed to ``format_label`` so it does not have to fetch them itself.

    Args:
        entity_data: Nested extraction result to format.
        document_type_code: Document type used to pick the formatting rules.
        params: Pipeline parameters forwarded unchanged to ``format_label``.
        mime_type: MIME type forwarded unchanged to ``format_label``.

    Returns:
        tuple: The ``(key, formatted_value)`` pair produced by ``format_label``
        for the top-level call, which is made with the key ``"root"``.
    """
    # Step 1: find out which values need a mapping lookup.
    wanted_containers, wanted_terminals, wanted_depots = (
        await collect_mapping_requests(entity_data, document_type_code)
    )

    # Step 2: resolve all of them in a single batch.
    container_map, terminal_map, depot_map = await batch_fetch_all_mappings(
        wanted_containers, wanted_terminals, wanted_depots
    )

    # Step 3: format the tree, reusing the lookups fetched above.
    root_key, formatted = await format_label(
        "root",
        entity_data,
        document_type_code,
        params,
        mime_type,
        container_map,
        terminal_map,
        depot_map,
    )

    return root_key, formatted
695
+
696
+
601
697
  async def format_all_entities(result, document_type_code, params, mime_type):
602
698
  """Format the entity values in the result dictionary."""
603
699
  # Since we treat `customsInvoice` same as `partnerInvoice`
@@ -613,8 +709,8 @@ async def format_all_entities(result, document_type_code, params, mime_type):
613
709
  return {}
614
710
 
615
711
  # Format all entities recursively
616
- _, aggregated_data = await format_label(
617
- None, result, document_type_code, params, mime_type
712
+ _, aggregated_data = await format_all_labels(
713
+ result, document_type_code, params, mime_type
618
714
  )
619
715
 
620
716
  # Process partner invoice on lineitem mapping and reverse charge sentence
@@ -1,7 +1,5 @@
1
1
  """This module contains the postprocessing functions for the partner invoice."""
2
- from concurrent.futures import ThreadPoolExecutor
3
-
4
- from fuzzywuzzy import fuzz
2
+ from rapidfuzz import fuzz, process
5
3
 
6
4
  from src.io import logger
7
5
  from src.utils import get_tms_mappings
@@ -177,6 +175,7 @@ async def process_line_items_batch(
177
175
  pending_line_items = {}
178
176
 
179
177
  # Check Fuzzy Matching
178
+ logger.info(f"Mapping line item codes with Fuzzy matching....")
180
179
  for i, item in enumerate(line_items):
181
180
  description_obj = item.get("lineItemDescription")
182
181
 
@@ -231,12 +230,6 @@ async def process_line_items_batch(
231
230
  return line_items
232
231
 
233
232
 
234
- def compute_score(args):
235
- """Compute the fuzzy matching score between a new line item and a key."""
236
- new_lineitem, key = args
237
- return key, fuzz.ratio(new_lineitem, key)
238
-
239
-
240
233
  def get_fuzzy_match_score(target: str, sentences: list, threshold: int):
241
234
  """Get the best fuzzy match for a target string from a list of candidates.
242
235
 
@@ -249,16 +242,18 @@ def get_fuzzy_match_score(target: str, sentences: list, threshold: int):
249
242
  tuple: (best_match, score) if above threshold, else (None, 0)
250
243
  """
251
244
  # Use multiprocessing to find the best match
252
- with ThreadPoolExecutor() as executor:
253
- results = executor.map(compute_score, [(target, s) for s in sentences])
245
+ result = process.extractOne(
246
+ target, sentences, scorer=fuzz.WRatio, score_cutoff=threshold
247
+ )
254
248
 
255
- # Find the best match and score
256
- best_match, best_score = max(results, key=lambda x: x[1], default=(None, 0))
249
+ if result is None:
250
+ return None, False
257
251
 
258
- # return best_match, best_score
259
- # If the best match score is above a threshold (e.g., 80), return it
260
- if best_score >= threshold:
261
- return best_match, True
252
+ match, score, index = result
253
+
254
+ # return best_match if the best match score is above a threshold (e.g., 80)
255
+ if match:
256
+ return match, True
262
257
 
263
258
  return None, False
264
259
 
@@ -290,18 +285,22 @@ def find_matching_lineitem(new_lineitem: str, kvp_dict: dict, threshold=90):
290
285
  Returns:
291
286
  str: The best matching 'Forto SLI' value from the dictionary.
292
287
  """
293
- new_lineitem = new_lineitem.upper()
294
-
295
288
  # Check if the new line item is already in the dictionary
296
289
  if new_lineitem in kvp_dict:
297
290
  return kvp_dict[new_lineitem]
298
291
 
299
292
  # Get the best fuzzy match score for the extracted line item
300
- best_match, _ = get_fuzzy_match_score(
301
- new_lineitem, list(kvp_dict.keys()), threshold
293
+ match, _ = get_fuzzy_match_score(
294
+ new_lineitem,
295
+ list(kvp_dict.keys()),
296
+ threshold,
302
297
  )
303
298
 
304
- return kvp_dict.get(best_match, None)
299
+ if match:
300
+ # find the code from the kvp_dict
301
+ return kvp_dict[match]
302
+
303
+ return None
305
304
 
306
305
 
307
306
  async def associate_forto_item_code(line_item_data, params):
@@ -406,16 +406,7 @@ async def get_tms_mappings(
406
406
  response.raise_for_status()
407
407
 
408
408
  # Structure expected: {"response": {"data": {"desc1": "code1", "desc2": "code2"}}}
409
- if embedding_type == "line_items":
410
- # For line_items, return the full data mapping
411
- return response.json().get("response", {}).get("data", {})
412
- else:
413
- return (
414
- response.json()
415
- .get("response", {})
416
- .get("data", {})
417
- .get(input_list[0], None)
418
- )
409
+ return response.json().get("response", {}).get("data", {})
419
410
 
420
411
  except httpx.HTTPStatusError as exc:
421
412
  logger.error(
@@ -424,6 +415,25 @@ async def get_tms_mappings(
424
415
  return {}
425
416
 
426
417
 
418
async def batch_fetch_all_mappings(container_types, terminals, depots):
    """Batch fetch the TMS mappings for container types, terminals, and depots.

    The three lookups run concurrently. A category with nothing to look up is
    answered with an empty dict without calling ``get_tms_mappings``.

    Args:
        container_types: Iterable of container type values to map (may be
            empty or None).
        terminals: Iterable of terminal values to map (may be empty or None).
        depots: Iterable of depot values to map (may be empty or None).

    Returns:
        tuple: ``(container_map, terminal_map, depot_map)``, three dicts holding
        whatever ``get_tms_mappings`` returned for each category.
        NOTE(review): presumably keyed by the requested value - confirm
        against the TMS response.
    """

    async def fetch(values, embedding_type):
        """Fetch one category, tolerating empty input and empty responses."""
        if not values:
            # Nothing requested for this category: skip the round trip.
            return {}
        mapping = await get_tms_mappings(list(values), embedding_type)
        # Normalise a missing response to an empty dict so callers can
        # always use ``.get`` on the result.
        return dict(mapping or {})

    # Run the batch calls concurrently.
    container_map, terminal_map, depot_map = await asyncio.gather(
        fetch(container_types, "container_types"),
        fetch(terminals, "terminals"),
        fetch(depots, "depots"),
    )

    return container_map, terminal_map, depot_map
435
+
436
+
427
437
  def transform_schema_strings(schema):
428
438
  """
429
439
  Recursively transforms a schema dictionary, replacing all "type": "STRING"