data-science-document-ai 1.51.0__tar.gz → 1.58.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/PKG-INFO +1 -1
  2. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/pyproject.toml +1 -1
  3. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/constants.py +10 -27
  4. data_science_document_ai-1.58.0/src/docai_processor_config.yaml +9 -0
  5. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/pdf_processing.py +34 -29
  6. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/postprocessing/common.py +35 -0
  7. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/postprocessing/postprocess_partner_invoice.py +82 -26
  8. data_science_document_ai-1.58.0/src/prompts/library/bookingConfirmation/evergreen/placeholders.json +146 -0
  9. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/prompts/library/bookingConfirmation/evergreen/prompt.txt +21 -17
  10. data_science_document_ai-1.58.0/src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +146 -0
  11. data_science_document_ai-1.58.0/src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +61 -0
  12. data_science_document_ai-1.58.0/src/prompts/library/bookingConfirmation/maersk/placeholders.json +146 -0
  13. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/prompts/library/bookingConfirmation/maersk/prompt.txt +10 -1
  14. data_science_document_ai-1.58.0/src/prompts/library/bookingConfirmation/msc/placeholders.json +146 -0
  15. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/prompts/library/bookingConfirmation/msc/prompt.txt +10 -1
  16. data_science_document_ai-1.58.0/src/prompts/library/bookingConfirmation/oocl/placeholders.json +160 -0
  17. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/prompts/library/bookingConfirmation/oocl/prompt.txt +11 -3
  18. data_science_document_ai-1.58.0/src/prompts/library/bookingConfirmation/other/placeholders.json +160 -0
  19. data_science_document_ai-1.58.0/src/prompts/library/bookingConfirmation/other/prompt.txt +57 -0
  20. data_science_document_ai-1.58.0/src/prompts/library/bookingConfirmation/yangming/placeholders.json +160 -0
  21. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/prompts/library/bookingConfirmation/yangming/prompt.txt +11 -1
  22. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/prompts/library/customsInvoice/other/prompt.txt +2 -1
  23. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/prompts/library/partnerInvoice/other/prompt.txt +3 -4
  24. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/setup.py +17 -9
  25. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/utils.py +6 -2
  26. data_science_document_ai-1.51.0/src/docai_processor_config.yaml +0 -22
  27. data_science_document_ai-1.51.0/src/prompts/library/bookingConfirmation/evergreen/placeholders.json +0 -32
  28. data_science_document_ai-1.51.0/src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +0 -32
  29. data_science_document_ai-1.51.0/src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +0 -65
  30. data_science_document_ai-1.51.0/src/prompts/library/bookingConfirmation/maersk/placeholders.json +0 -32
  31. data_science_document_ai-1.51.0/src/prompts/library/bookingConfirmation/msc/placeholders.json +0 -32
  32. data_science_document_ai-1.51.0/src/prompts/library/bookingConfirmation/oocl/placeholders.json +0 -32
  33. data_science_document_ai-1.51.0/src/prompts/library/bookingConfirmation/other/placeholders.json +0 -32
  34. data_science_document_ai-1.51.0/src/prompts/library/bookingConfirmation/other/prompt.txt +0 -58
  35. data_science_document_ai-1.51.0/src/prompts/library/bookingConfirmation/yangming/placeholders.json +0 -32
  36. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/constants_sandbox.py +0 -0
  37. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/docai.py +0 -0
  38. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/excel_processing.py +0 -0
  39. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/io.py +0 -0
  40. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/llm.py +0 -0
  41. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/log_setup.py +0 -0
  42. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/postprocessing/postprocess_booking_confirmation.py +0 -0
  43. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/postprocessing/postprocess_commercial_invoice.py +0 -0
  44. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/prompts/library/arrivalNotice/other/placeholders.json +0 -0
  45. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/prompts/library/arrivalNotice/other/prompt.txt +0 -0
  46. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/prompts/library/bundeskasse/other/placeholders.json +0 -0
  47. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/prompts/library/bundeskasse/other/prompt.txt +0 -0
  48. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/prompts/library/commercialInvoice/other/placeholders.json +0 -0
  49. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/prompts/library/commercialInvoice/other/prompt.txt +0 -0
  50. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/prompts/library/customsAssessment/other/placeholders.json +0 -0
  51. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/prompts/library/customsAssessment/other/prompt.txt +0 -0
  52. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/prompts/library/customsInvoice/other/placeholders.json +0 -0
  53. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/prompts/library/deliveryOrder/other/placeholders.json +0 -0
  54. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/prompts/library/deliveryOrder/other/prompt.txt +0 -0
  55. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/prompts/library/draftMbl/other/placeholders.json +0 -0
  56. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/prompts/library/draftMbl/other/prompt.txt +0 -0
  57. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/prompts/library/finalMbL/other/placeholders.json +0 -0
  58. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/prompts/library/finalMbL/other/prompt.txt +0 -0
  59. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/prompts/library/packingList/other/placeholders.json +0 -0
  60. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/prompts/library/packingList/other/prompt.txt +0 -0
  61. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/prompts/library/partnerInvoice/other/placeholders.json +0 -0
  62. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/prompts/library/postprocessing/port_code/placeholders.json +0 -0
  63. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/prompts/library/postprocessing/port_code/prompt_port_code.txt +0 -0
  64. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/prompts/library/preprocessing/carrier/placeholders.json +0 -0
  65. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/prompts/library/preprocessing/carrier/prompt.txt +0 -0
  66. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/prompts/library/shippingInstruction/other/placeholders.json +0 -0
  67. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/prompts/library/shippingInstruction/other/prompt.txt +0 -0
  68. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/prompts/prompt_library.py +0 -0
  69. {data_science_document_ai-1.51.0 → data_science_document_ai-1.58.0}/src/tms.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: data-science-document-ai
3
- Version: 1.51.0
3
+ Version: 1.58.0
4
4
  Summary: "Document AI repo for data science"
5
5
  Author: Naomi Nguyen
6
6
  Author-email: naomi.nguyen@forto.com
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "data-science-document-ai"
3
- version = "1.51.0"
3
+ version = "1.58.0"
4
4
  description = "\"Document AI repo for data science\""
5
5
  authors = ["Naomi Nguyen <naomi.nguyen@forto.com>", "Kumar Rajendrababu <kumar.rajendrababu@forto.com>", "Igor Tonko <igor.tonko@forto.com>", "Osman Demirel <osman.demirel@forto.com>"]
6
6
  packages = [
@@ -20,10 +20,11 @@ project_parameters = {
20
20
  # Fuzzy lookup
21
21
  "g_model_fuzzy_lookup_folder": "fuzzy_lookup",
22
22
  "item_code_lookup": "line_item_kvp_table.json",
23
+ "intermodal_partners": "intermodal_partners.json",
23
24
  "invoice_classification_lookup": "invoice_classification.json",
24
25
  "reverse_charge_sentence_lookup": "reverse_charge_sentences.json",
25
26
  # Fuzzy logic params
26
- "fuzzy_threshold_item_code": 90,
27
+ "fuzzy_threshold_item_code": 92,
27
28
  "fuzzy_threshold_reverse_charge": 80,
28
29
  "fuzzy_threshold_invoice_classification": 70,
29
30
  # Chunking params
@@ -36,6 +37,8 @@ project_parameters = {
36
37
  # models metadata (confidence),
37
38
  "g_model_data_folder": "models",
38
39
  "local_model_data_folder": "data",
40
+ "if_use_docai": False,
41
+ "if_use_llm": True, # Keep it always True
39
42
  "released_doc_types": {
40
43
  "bookingConfirmation",
41
44
  "packingList",
@@ -50,16 +53,6 @@ project_parameters = {
50
53
  "customsInvoice",
51
54
  "bundeskasse",
52
55
  },
53
- "model_selector": {
54
- "stable": {
55
- "bookingConfirmation": 1,
56
- },
57
- "beta": {
58
- "bookingConfirmation": 0,
59
- },
60
- },
61
- # this is the model selector for the model to be used from the model_config.yaml
62
- # file based on the environment, 0 mean the first model in the list
63
56
  # LLM model parameters
64
57
  "gemini_params": {
65
58
  "temperature": 0,
@@ -77,25 +70,15 @@ project_parameters = {
77
70
  "seed": 42,
78
71
  "model_id": "gemini-2.5-flash",
79
72
  },
80
- # Key to combine the LLM results with the Doc Ai results
81
- "key_to_combine": {
82
- "bookingConfirmation": ["transportLegs"],
83
- "arrivalNotice": ["containers"],
84
- "finalMbL": ["containers"],
85
- "draftMbl": ["containers"],
86
- "deliveryOrder": ["Equipment", "TransportLeg"],
87
- "customsAssessment": ["containers"],
88
- "packingList": ["skuData"],
89
- "commercialInvoice": ["skus"],
90
- "shippingInstruction": ["containers"],
91
- "partnerInvoice": ["lineItem"],
92
- "customsInvoice": ["lineItem"],
93
- "bundeskasse": ["lineItem"],
94
- },
95
73
  }
96
74
 
97
75
  # Hardcoded rules for data points formatting that can't be based on label name alone
98
76
  formatting_rules = {
99
- "bookingConfirmation": {"pickUpTerminal": "depot", "gateInTerminal": "terminal"},
77
+ "bookingConfirmation": {
78
+ "pickUpDepotCode": "depot",
79
+ "dropOffDepotCode": "depot",
80
+ "gateInTerminalCode": "terminal",
81
+ "pickUpTerminalCode": "terminal",
82
+ },
100
83
  "deliveryOrder": {"pickUpTerminal": "terminal", "EmptyContainerDepot": "depot"},
101
84
  }
@@ -0,0 +1,9 @@
1
+ models_project_id: "738250249861"
2
+ model_config:
3
+ stable:
4
+ bookingConfirmation:
5
+ - id: "dc3e714cd168aeaa"
6
+ details:
7
+ display_name: "doc_cap_bookingConfirmation"
8
+ author: "reet.kanjilal@forto.com"
9
+ created_date: ""
@@ -32,7 +32,6 @@ from src.postprocessing.postprocess_partner_invoice import (
32
32
  from src.prompts.prompt_library import prompt_library
33
33
  from src.utils import (
34
34
  extract_top_pages,
35
- generate_schema_structure,
36
35
  get_pdf_page_count,
37
36
  get_processor_name,
38
37
  run_background_tasks,
@@ -202,9 +201,6 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
202
201
  number_of_pages = get_pdf_page_count(file_content)
203
202
  logger.info(f"processing {input_doc_type} with {number_of_pages} pages...")
204
203
 
205
- # get the schema placeholder
206
- response_schema = prompt_library.library[input_doc_type]["other"]["placeholders"]
207
-
208
204
  carrier = "other"
209
205
  carrier_schema = (
210
206
  prompt_library.library.get("preprocessing", {})
@@ -241,6 +237,9 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
241
237
  # get the related prompt from predefined prompt library
242
238
  prompt = prompt_library.library[input_doc_type][carrier]["prompt"]
243
239
 
240
+ # get the schema placeholder
241
+ response_schema = prompt_library.library[input_doc_type][carrier]["placeholders"]
242
+
244
243
  # Add page-number extraction for moderately large docs
245
244
  use_chunking = number_of_pages >= params["chunk_after"]
246
245
 
@@ -258,7 +257,11 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
258
257
  ):
259
258
  tasks.append(
260
259
  process_chunk_with_retry(
261
- chunk, prompt, response_schema, llm_client, input_doc_type
260
+ chunk,
261
+ prompt,
262
+ response_schema,
263
+ llm_client,
264
+ input_doc_type,
262
265
  )
263
266
  )
264
267
 
@@ -350,8 +353,7 @@ async def extract_data_from_pdf_w_llm(params, input_doc_type, file_content, llm_
350
353
  # Add currency from the amount field
351
354
  if input_doc_type in ["commercialInvoice"]:
352
355
  result = postprocessing_commercial_invoice(result, params, input_doc_type)
353
- elif input_doc_type == "bookingConfirmation":
354
- result = postprocess_booking_confirmation(result)
356
+
355
357
  return result, llm_client.model_id
356
358
 
357
359
 
@@ -370,13 +372,14 @@ def combine_llm_results_w_doc_ai(
370
372
  Returns:
371
373
  combined result
372
374
  """
373
- result = doc_ai.copy()
374
- llm = remove_none_values(llm)
375
- if not llm:
375
+ result = remove_none_values(llm)
376
+
377
+ docAi = doc_ai.copy()
378
+ if not docAi:
376
379
  return result
377
380
 
378
381
  # Merge top-level keys
379
- result.update({k: v for k, v in llm.items() if k not in result})
382
+ result.update({k: v for k, v in docAi.items() if k not in result})
380
383
 
381
384
  if (
382
385
  input_doc_type
@@ -384,28 +387,28 @@ def combine_llm_results_w_doc_ai(
384
387
  and keys_to_combine
385
388
  ):
386
389
  result.update(
387
- {key: llm.get(key) for key in keys_to_combine if key in llm.keys()}
390
+ {key: docAi.get(key) for key in keys_to_combine if key in docAi.keys()}
388
391
  )
389
392
  return result
390
393
 
391
394
  # Handle specific key-based merging logic for multiple keys
392
395
  if keys_to_combine:
393
396
  for key in keys_to_combine:
394
- if key in llm.keys():
397
+ if key in docAi.keys():
395
398
  # Merge the list of dictionaries
396
- # If the length of the LLM list is less than the Doc AI result, replace with the LLM list
397
- if len(llm[key]) < len(result[key]):
398
- result[key] = llm[key]
399
+ # If the length of the docAi list is less than the LLM result, replace with the docAi list
400
+ if len(docAi[key]) < len(result[key]):
401
+ result[key] = docAi[key]
399
402
  else:
400
- # If the length of the LLM list is greater than or equal to the Doc AI result,
403
+ # If the length of the docAi list is greater than or equal to the LLM result,
401
404
  # add & merge the dictionaries
402
- if isinstance(llm[key], list):
403
- for i in range(len(llm[key])):
405
+ if isinstance(docAi[key], list):
406
+ for i in range(len(docAi[key])):
404
407
  if i == len(result[key]):
405
- result[key].append(llm[key][i])
408
+ result[key].append(docAi[key][i])
406
409
  else:
407
- for sub_key in llm[key][i].keys():
408
- result[key][i][sub_key] = llm[key][i][sub_key]
410
+ for sub_key in docAi[key][i].keys():
411
+ result[key][i][sub_key] = docAi[key][i][sub_key]
409
412
  return result
410
413
 
411
414
 
@@ -499,13 +502,15 @@ async def data_extraction_manual_flow(
499
502
  page_count = None
500
503
  # Validate the file type
501
504
  if mime_type == "application/pdf":
505
+ if_use_docai = params["if_use_docai"]
506
+
502
507
  # Enable Doc Ai only for certain document types.
503
- if_use_docai = (
504
- True if meta.documentTypeCode in params["model_config"]["stable"] else False
505
- )
506
- if_use_llm = (
507
- True if meta.documentTypeCode in params["key_to_combine"].keys() else False
508
- )
508
+ if params["if_use_docai"]:
509
+ if_use_docai = (
510
+ True
511
+ if meta.documentTypeCode in params["model_config"]["stable"]
512
+ else False
513
+ )
509
514
 
510
515
  (
511
516
  extracted_data,
@@ -517,7 +522,7 @@ async def data_extraction_manual_flow(
517
522
  meta.documentTypeCode,
518
523
  processor_client,
519
524
  if_use_docai=if_use_docai,
520
- if_use_llm=if_use_llm,
525
+ if_use_llm=params["if_use_llm"],
521
526
  llm_client=llm_client,
522
527
  isBetaTest=False,
523
528
  )
@@ -723,10 +723,45 @@ async def format_all_entities(result, document_type_code, params, mime_type):
723
723
  if document_type_code in ["partnerInvoice", "bundeskasse"]:
724
724
  await process_partner_invoice(params, aggregated_data, document_type_code)
725
725
 
726
+ if document_type_code in ["bookingConfirmation"]:
727
+ aggregated_data["legalEntity"] = await get_legal_entity(
728
+ aggregated_data.get("carrierName", {}).get("documentValue", None),
729
+ aggregated_data.get("carrierAddress", {}).get("documentValue", None),
730
+ )
731
+
726
732
  logger.info("Data Extraction completed successfully")
727
733
  return aggregated_data
728
734
 
729
735
 
736
+ async def get_legal_entity(name, address):
737
+ """Get legal entity mapping from TMS mappings.
738
+
739
+ Args:
740
+ name (str): The name of the legal entity. Mandatory.
741
+ address (str): The address of the legal entity. Optional for better matching.
742
+
743
+ Returns:
744
+ dict or None: The mapping result from TMS embeddings, or None if not found.
745
+ """
746
+ # Name is mandatory for legal entity mapping
747
+ if not name:
748
+ return {"documentValue": None, "formattedValue": None}
749
+
750
+ # Build input safely
751
+ input_text = name if not address else f"{name} | {address}"
752
+
753
+ api_results = await get_tms_mappings(
754
+ input_list=[input_text],
755
+ embedding_type="legal_entities",
756
+ input_key="partnerNameAddress",
757
+ )
758
+
759
+ return {
760
+ "documentValue": None,
761
+ "formattedValue": api_results.get(input_text),
762
+ }
763
+
764
+
730
765
  def add_text_without_space(text):
731
766
  """If the cleaned text is different from the original text, append it.
732
767
  Useful for port names like QUINHON - Quinhon"""
@@ -1,4 +1,6 @@
1
1
  """This module contains the postprocessing functions for the partner invoice."""
2
+ from collections import defaultdict
3
+
2
4
  from rapidfuzz import fuzz, process
3
5
 
4
6
  from src.io import logger
@@ -143,6 +145,20 @@ def update_recipient_and_vendor(aggregated_data, is_recipient_forto):
143
145
  ] = "Dasbachstraße 15, 54292 Trier, Germany"
144
146
 
145
147
 
148
+ def select_unique_bank_account(bank_account):
149
+ # Select the unique bank account if multiple are present
150
+ if isinstance(bank_account, list) and bank_account:
151
+ best = defaultdict(lambda: None)
152
+
153
+ for item in bank_account:
154
+ dv = item["documentValue"]
155
+ if best[dv] is None or item["page"] < best[dv]["page"]:
156
+ best[dv] = item
157
+
158
+ unique = list(best.values())
159
+ return unique
160
+
161
+
146
162
  async def process_partner_invoice(params, aggregated_data, document_type_code):
147
163
  """Process the partner invoice data."""
148
164
  # Post process bundeskasse invoices
@@ -150,6 +166,11 @@ async def process_partner_invoice(params, aggregated_data, document_type_code):
150
166
  post_process_bundeskasse(aggregated_data)
151
167
  return
152
168
 
169
+ if "bankAccount" in aggregated_data:
170
+ aggregated_data["bankAccount"] = select_unique_bank_account(
171
+ aggregated_data["bankAccount"]
172
+ )
173
+
153
174
  line_items = aggregated_data.get("lineItem", [])
154
175
  # Add debug logging
155
176
  logger.info(f"Processing partnerInvoice with {len(line_items)} line items")
@@ -167,15 +188,20 @@ async def process_partner_invoice(params, aggregated_data, document_type_code):
167
188
  reverse_charge_info["formattedValue"] = reverse_charge_value
168
189
  reverse_charge = aggregated_data.pop("reverseChargeSentence", None)
169
190
 
191
+ # Partner Name
192
+ partner_name = aggregated_data.get("vendorName", {}).get("documentValue", None)
193
+
170
194
  # Process everything in one go
171
- processed_items = await process_line_items_batch(params, line_items, reverse_charge)
195
+ processed_items = await process_line_items_batch(
196
+ params, line_items, reverse_charge, partner_name
197
+ )
172
198
 
173
199
  # Update your main data structure
174
200
  aggregated_data["lineItem"] = processed_items
175
201
 
176
202
 
177
203
  async def process_line_items_batch(
178
- params: dict, line_items: list[dict], reverse_charge=None
204
+ params: dict, line_items: list[dict], reverse_charge=None, partner_name=None
179
205
  ):
180
206
  """
181
207
  Processes all line items efficiently using a "Split-Apply-Combine" strategy.
@@ -213,23 +239,12 @@ async def process_line_items_batch(
213
239
 
214
240
  # Batch API Call for Embedding lookups
215
241
  if pending_line_items:
216
- values_to_fetch = list(set(pending_line_items.values()))
217
- logger.info(f"Mapping {len(values_to_fetch)} line items from Embedding API...")
218
-
219
- # Await the batch response {"desc1": "code1", "desc2": "code2"}
220
- api_results = await get_tms_mappings(
221
- input_list=values_to_fetch, embedding_type="line_items"
222
- )
242
+ code_map = await fetch_line_item_codes(pending_line_items, partner_name, params)
223
243
 
224
- # Merge API results back into original list
225
244
  for index, desc in pending_line_items.items():
226
- # Get result from API response, or None if API failed for that item
227
- forto_code = api_results.get(desc)
228
-
229
- # Update the original item
230
245
  line_items[index]["itemCode"] = {
231
246
  "documentValue": desc,
232
- "formattedValue": forto_code, # Might be None if API failed
247
+ "formattedValue": code_map.get(desc),
233
248
  "page": line_items[index]["lineItemDescription"].get("page"),
234
249
  }
235
250
 
@@ -285,11 +300,14 @@ def if_reverse_charge_sentence(sentence: str, params):
285
300
  return False
286
301
 
287
302
  # Check if the sentence is similar to any of the reverse charge sentences
288
- _, is_reverse_charge = get_fuzzy_match_score(
289
- sentence, reverse_charge_sentences, threshold
303
+ match, _ = get_fuzzy_match_score(
304
+ sentence, list(reverse_charge_sentences.keys()), threshold
290
305
  )
291
306
 
292
- return is_reverse_charge
307
+ if match:
308
+ return reverse_charge_sentences[match]
309
+
310
+ return False
293
311
 
294
312
 
295
313
  def find_matching_lineitem(new_lineitem: str, kvp_dict: dict, threshold=90):
@@ -320,12 +338,13 @@ def find_matching_lineitem(new_lineitem: str, kvp_dict: dict, threshold=90):
320
338
  return None
321
339
 
322
340
 
323
- async def associate_forto_item_code(line_item_data, params):
341
+ async def associate_forto_item_code(line_item_data, params, partner_name=None):
324
342
  """
325
343
  Associates Forto item codes to a list of line item descriptions.
326
344
  Args:
327
345
  line_item_data (dict): A dictionary where keys are original descriptions and values are cleaned descriptions.
328
346
  params (dict): Parameters containing lookup data and thresholds.
347
+ partner_name (str, optional): The name of the partner for context in matching. Defaults to None.
329
348
 
330
349
  Returns:
331
350
  list: A list of dictionaries with 'description' and 'itemCode' keys.
@@ -347,14 +366,51 @@ async def associate_forto_item_code(line_item_data, params):
347
366
 
348
367
  # Batch API Call for Embedding lookups
349
368
  if pending_line_items:
350
- api_results = await get_tms_mappings(
351
- input_list=list(pending_line_items.values()),
352
- embedding_type="line_items",
353
- )
369
+ code_map = await fetch_line_item_codes(pending_line_items, partner_name, params)
354
370
 
355
- # Merge API results back into original list
356
371
  for desc, f_desc in pending_line_items.items():
357
- code = api_results.get(f_desc)
358
- result.append({"description": desc, "itemCode": code})
372
+ result.append(
373
+ {
374
+ "description": desc,
375
+ "itemCode": code_map.get(f_desc),
376
+ }
377
+ )
378
+
379
+ return result
359
380
 
381
+
382
+ async def fetch_line_item_codes(
383
+ pending_line_items: dict,
384
+ partner_name: str | None,
385
+ params: dict,
386
+ ):
387
+ """Returns: {original_description: mapped_code_or_None}"""
388
+ t_mode = (
389
+ find_matching_lineitem(
390
+ partner_name.upper(),
391
+ params["lookup_data"]["intermodal_partners"],
392
+ threshold=87,
393
+ )
394
+ if partner_name
395
+ else None
396
+ )
397
+
398
+ unique_descs = list(set(pending_line_items.values()))
399
+ logger.info(f"Mapping {len(unique_descs)} line items from Embedding API...")
400
+
401
+ # Build API input map
402
+ api_input_map = {
403
+ desc: f"{t_mode} - {desc}" if t_mode else desc for desc in unique_descs
404
+ }
405
+
406
+ api_results = await get_tms_mappings(
407
+ input_list=list(api_input_map.values()),
408
+ embedding_type="line_items",
409
+ )
410
+
411
+ # Normalize response back to original descriptions
412
+ result = {
413
+ original_desc: api_results.get(api_desc)
414
+ for original_desc, api_desc in api_input_map.items()
415
+ }
360
416
  return result
@@ -0,0 +1,146 @@
1
+ {
2
+ "type": "OBJECT",
3
+ "properties": {
4
+ "bookingNumber": {
5
+ "type": "STRING",
6
+ "nullable": true,
7
+ "description": "A unique identifier assigned to the shipment booking, used for tracking and reference. They are often referred to as 'Booking No.', 'Booking Reference', 'Our Reference', or 'Order Ref'."
8
+ },
9
+ "contractNumber": {
10
+ "type": "STRING",
11
+ "nullable": true,
12
+ "description": "It's a contract number between the carrier and Forto Logistics SE & Co KG."
13
+ },
14
+ "pickUpTerminalCode": {
15
+ "type": "STRING",
16
+ "nullable": true,
17
+ "description": "The specific terminal for cargo pickup during the import shipment."
18
+ },
19
+ "gateInTerminalCode": {
20
+ "type": "STRING",
21
+ "nullable": true,
22
+ "description": "The specific terminal where cargo is gated in especially Export terminal delivery address. E.g., Export terminal delivery address, Export terminal location, or Export terminal name."
23
+ },
24
+ "performaDate": {
25
+ "type": "STRING",
26
+ "nullable": true,
27
+ "description": "The date considered to apply the rates and charges specified in the booking confirmation"
28
+ },
29
+ "cyCutOff": {
30
+ "type": "STRING",
31
+ "nullable": true,
32
+ "description": "The datetime by which the cargo to be delivered to the Container Yard. It can be found with keys FCL delivery cut-off, FCL DG delivery cut-off, CY CUT OFF, CY Closing."
33
+ },
34
+ "gateInReference": {
35
+ "type": "STRING",
36
+ "nullable": true,
37
+ "description": "A reference code for cargo entering the terminal to drop the loaded cargo for Export. Sometimes it can be 'Our Reference'."
38
+ },
39
+ "mblNumber": {
40
+ "type": "STRING",
41
+ "nullable": true,
42
+ "description": "Bill of Lading number (B/L NO.), a document issued by the carrier."
43
+ },
44
+ "pickUpReference": {
45
+ "type": "STRING",
46
+ "nullable": true,
47
+ "description": "A reference code for cargo pickup during the import shipment. Sometimes it can be 'Our Reference'."
48
+ },
49
+ "siCutOff": {
50
+ "type": "STRING",
51
+ "nullable": true,
52
+ "description": "The deadline datetime for submitting the Shipping Instructions (SI) to the carrier. It can be found with keys Shipping Instruction Closing."
53
+ },
54
+ "vgmCutOff": {
55
+ "type": "STRING",
56
+ "nullable": true,
57
+ "description": "The deadline datetime for submitting the Verified Gross Mass (VGM) to the carrier. It can be found with keys VGM DEADLINE, VGM DUE, VGM CUT OFF."
58
+ },
59
+ "containers": {
60
+ "type": "ARRAY",
61
+ "items": {
62
+ "type": "OBJECT",
63
+ "properties": {
64
+ "containerType": {
65
+ "type": "STRING",
66
+ "nullable": true,
67
+ "description": "The size / type of the container, such as 20ft, 40ft, 40HC, 20DC etc under Type/Size column."
68
+ },
69
+ "pickUpDepotCode": {
70
+ "type": "STRING",
71
+ "nullable": true,
72
+ "description": "The depot code where the empty container will be picked up. It is identified as Empty Pick Up Depot or Export Empty Pick Up Depot(s)."
73
+ },
74
+ "dropOffDepotCode": {
75
+ "type": "STRING",
76
+ "nullable": true,
77
+ "description": "The depot code where the empty container will be dropped off."
78
+ }
79
+ }
80
+ },
81
+ "required": ["containerType", "pickupDepotCode", "dropoffDepotCode"]
82
+ },
83
+ "transportLegs": {
84
+ "type": "ARRAY",
85
+ "items": {
86
+ "type": "OBJECT",
87
+ "properties": {
88
+ "eta": {
89
+ "type": "STRING",
90
+ "nullable": true,
91
+ "description": "Estimated Time of Arrival (ETA) is the expected date when the shipment will arrive at its destination."
92
+ },
93
+ "etd": {
94
+ "type": "STRING",
95
+ "nullable": true,
96
+ "description": "Estimated Time of Departure (ETD) is the expected date when the shipment will leave the origin port."
97
+ },
98
+ "imoNumber": {
99
+ "type": "STRING",
100
+ "nullable": true,
101
+ "description": "The International Maritime Organization number for a specific leg. It can be found as IMO No, IMO number."
102
+ },
103
+ "portOfDischarge": {
104
+ "type": "STRING",
105
+ "nullable": true,
106
+ "description": "The port where the goods are discharged from the vessel. This is the destination port for the shipment. It can be found at POD, Port of Discharge, To, Discharge Port"
107
+ },
108
+ "portOfLoading": {
109
+ "type": "STRING",
110
+ "nullable": true,
111
+ "description": "The port where the goods are loaded onto the vessel. This is the origin port for the shipment. It can be found at POL, Port of Loading, From, Load Port"
112
+ },
113
+ "vesselName": {
114
+ "type": "STRING",
115
+ "nullable": true,
116
+ "description": "The name of the vessel carrying the shipment. It can be found at vessel, INTENDED VESSEL/VOYAGE"
117
+ },
118
+ "voyage": {
119
+ "type": "STRING",
120
+ "nullable": true,
121
+ "description": "The journey or route taken by the vessel for a specific leg. It can be found at Voy. no, INTENDED VESSEL/VOYAGE"
122
+ }
123
+ }
124
+ },
125
+ "required": [
126
+ "eta",
127
+ "etd",
128
+ "portOfDischarge",
129
+ "portOfLoading",
130
+ "vesselName",
131
+ "voyage"
132
+ ]
133
+ },
134
+ "carrierAddress": {
135
+ "type": "STRING",
136
+ "nullable": true,
137
+ "description": "The address of the carrier who provides service and issued the document."
138
+ },
139
+ "carrierName": {
140
+ "type": "STRING",
141
+ "nullable": true,
142
+ "description": "The name of the carrier who issued the document, e.g., Hapag-Lloyd."
143
+ }
144
+ },
145
+ "required": ["bookingNumber", "transportLegs", "containers", "cyCutOff", "vgmCutOff", "siCutOff"]
146
+ }
@@ -1,6 +1,14 @@
1
- your task is to extract the text value of the following entities and page numbers starting from 0 where the value was found in the document:
2
- ```json
3
- {
1
+ <PERSONA> You are an efficient document entity data extraction specialist working for a Freight Forwarding company. </PERSONA>
2
+
3
+ <TASK> Your task is to extract data from Booking Confirmation documents as per the given response schema structure. </TASK>
4
+
5
+ <CONTEXT>
6
+ The Freight Forwarding company receives Booking Confirmation from EverGreen Carrier (Shipping Lines) partner.
7
+ These Booking Confirmations contain various details related to booking, container pick up and drop off depot details, vessel details, as well as other transport Legs data.
8
+ They may be written in different languages such as English, German, Vietnamese, Chinese, and other European languages, and can appear in a variety of formats and layouts.
9
+ Your role is to accurately extract specific entities from these Booking Confirmations to support efficient processing and accurate record-keeping.
10
+ </CONTEXT>
11
+
4
12
  "mblNumber": "Extract the value after the label 'BOOKING NO.'.",
5
13
  "gateInReference": "Extract the value after the label 'BOOKING NO.'.",
6
14
  "pickUpReference": "Extract the value after the label 'BOOKING NO.'.",
@@ -14,23 +22,19 @@ your task is to extract the text value of the following entities and page number
14
22
  "portOfDischarge": "Extract the text after the label 'PORT OF DISCHARGING:' and before 'FINAL DESTINATION'.",
15
23
  "pickUpTerminal": "Extract the text after the label 'EMPTY PICK UP AT:' removing any extra spaces or line breaks.",
16
24
  "gateInTerminal": "Extract the text after the label 'FULL RETURN TO:' removing any extra spaces or line breaks.",
17
- "transportLegs": [
18
- {
19
- "portOfLoading": "For the first leg, use the extracted 'portOfLoading'.",
20
- "portOfDischarge": "Extract the text after the label 'T/S PORT OF LOADING:'.",
21
- "vesselName": "For the first leg, use the extracted 'vesselName'.",
22
- "voyage": "Voyage is a code of numbers and letters sometimes separated by '-'. For the first leg, use the extracted 'voyage'.",
23
- "eta": "Extract the date after the label 'ETA DATE' that appears within the section starting with 'FINAL DESTINATION:' and ending with 'T/S PORT OF LOADING:'.",
24
- "etd": "Extract the date after the label 'ETD DATE' that appears within the section starting with 'PORT OF LOADING:' and ending with 'FINAL DESTINATION:'.",
25
- },
26
- {
25
+
26
+ "transportLegs":
27
+ "portOfLoading": "For the first leg, use the extracted 'portOfLoading'.",
28
+ "portOfDischarge": "Extract the text after the label 'T/S PORT OF LOADING:'.",
29
+ "vesselName": "For the first leg, use the extracted 'vesselName'.",
30
+ "voyage": "Voyage is a code of numbers and letters sometimes separated by '-'. For the first leg, use the extracted 'voyage'.",
31
+ "eta": "Extract the date after the label 'ETA DATE' that appears within the section starting with 'FINAL DESTINATION:' and ending with 'T/S PORT OF LOADING:'.",
32
+ "etd": "Extract the date after the label 'ETD DATE' that appears within the section starting with 'PORT OF LOADING:' and ending with 'FINAL DESTINATION:'.",
33
+
34
+
27
35
  "portOfLoading": "For the second leg, use the 'portOfDischarge' from the previous leg.",
28
36
  "portOfDischarge": "For the second leg, use the extracted 'portOfDischarge' from the main extraction.",
29
37
  "vesselName": "Extract the text after the label 'EST. CONNECT VSL/VOY:' and before the hyphen and numbers.",
30
38
  "voyage": "Voyage is a code of numbers and letters sometimes separated by '-'. Extract the code after the label 'EST. CONNECT VSL/VOY:' and after the vessel name.",
31
39
  "eta": "Extract the date after the label 'ETA DATE' that is after the line that contains 'T/S PORT OF LOADING'",
32
40
  "etd": "Extract the date after the label 'ETD DATE' that is related to the 'EST. CONNECT VSL/VOY:'. "
33
- }
34
- ]
35
- }
36
- ```