data-science-document-ai 1.53.0__py3-none-any.whl → 1.55.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: data-science-document-ai
3
- Version: 1.53.0
3
+ Version: 1.55.0
4
4
  Summary: "Document AI repo for data science"
5
5
  Author: Naomi Nguyen
6
6
  Author-email: naomi.nguyen@forto.com
@@ -1,4 +1,4 @@
1
- src/constants.py,sha256=k5bBnJN-kmXiAtIAlz6Kg6fDyR9n0DuIudCZ9ZHO_Jw,3528
1
+ src/constants.py,sha256=H43Az9AtoBKfcq9yY4TQQJY8DfdILV5kXy8EMtRaWYA,3583
2
2
  src/constants_sandbox.py,sha256=Iu6HdjCoNSmOX0AwoL9qUQkhq_ZnIN5U9e-Q2UfNuGc,547
3
3
  src/docai.py,sha256=dHuR0ehVjUi1CnoNvdp_yxJtpU_HFXqAZ61ywdz7BEo,5655
4
4
  src/docai_processor_config.yaml,sha256=4yKKZPvFCA-3S56jDYSqMGKXGFND-768OiU2seRiAzE,604
@@ -6,11 +6,11 @@ src/excel_processing.py,sha256=TRgAzSHvL1WKbUgjHtpXL701bPhiWGH7kk3S6e1UPaA,3074
6
6
  src/io.py,sha256=rYjXVLlriEacw1uNuPIYhg12bXNu48Qs9GYMY2YcVTE,5563
7
7
  src/llm.py,sha256=a7UYA4ITUNjzct_2fHgM-bma_XWc28VC0FV71g9tnUI,7137
8
8
  src/log_setup.py,sha256=RhHnpXqcl-ii4EJzRt47CF2R-Q3YPF68tepg_Kg7tkw,2895
9
- src/pdf_processing.py,sha256=ER-gwh_YfJ-bMqh3nI8K89CZPAdPNnwjLmJ-5cnn1Rk,20469
9
+ src/pdf_processing.py,sha256=81fS2xL36n9QgB7DpXe7SCS-Lyz11cFDgccYMK3ZVkk,20026
10
10
  src/postprocessing/common.py,sha256=dagAg0hZGuZc03bXdfOolxekewMEVUfz917IGCiAtWI,26118
11
11
  src/postprocessing/postprocess_booking_confirmation.py,sha256=nK32eDiBNbauyQz0oCa9eraysku8aqzrcoRFoWVumDU,4827
12
12
  src/postprocessing/postprocess_commercial_invoice.py,sha256=3I8ijluTZcOs_sMnFZxfkAPle0UFQ239EMuvZfDZVPg,1028
13
- src/postprocessing/postprocess_partner_invoice.py,sha256=V9ANqlSBBO2_YEyfyCms7vjhUi9pjlGeRfUhMQQ4F6c,13507
13
+ src/postprocessing/postprocess_partner_invoice.py,sha256=WuaTQK5D09dV_QNrh29ZoKX9IvQn2Ub-WnAMyRjCsvI,14240
14
14
  src/prompts/library/arrivalNotice/other/placeholders.json,sha256=1vzly1amgyKt3jr2JJQbb24kNZsnI289iduvoUo5dJU,3061
15
15
  src/prompts/library/arrivalNotice/other/prompt.txt,sha256=QNuU-BvMA8VbdupVNapad4O3WmCotH5cKNxImRMbKDk,2906
16
16
  src/prompts/library/bookingConfirmation/evergreen/placeholders.json,sha256=IpM9nmSPdyroliZfXB1-NDCjiHZX_Ff5BH7-scNhGqE,1406
@@ -34,7 +34,7 @@ src/prompts/library/commercialInvoice/other/prompt.txt,sha256=CJapcVrmcvynJUanET
34
34
  src/prompts/library/customsAssessment/other/placeholders.json,sha256=scIV--C9HNWAQbU9zEz3GT_FoAvJqbfuY85YUtt7t-Q,3850
35
35
  src/prompts/library/customsAssessment/other/prompt.txt,sha256=z3FuoHZ588Pz1WBJDW7ISAC3J6n7hPJCcS92CdHDTFw,2494
36
36
  src/prompts/library/customsInvoice/other/placeholders.json,sha256=BnWYtl4sPooTHb_EHRIlrPawBrfHI8_QVas8zytbqyY,12172
37
- src/prompts/library/customsInvoice/other/prompt.txt,sha256=1dR73TQZJAfO9dKl-h7VhiJkdli498IV4e5JgBlOoYw,9695
37
+ src/prompts/library/customsInvoice/other/prompt.txt,sha256=MtDx5UIJYbOfBqfMELZposrZmLGiBpcZ8EE5PwRBRG8,9783
38
38
  src/prompts/library/deliveryOrder/other/placeholders.json,sha256=j-9F4V3yDg4610PPsOwU3oOj_S9vAvAB9Ix155WGIwc,3827
39
39
  src/prompts/library/deliveryOrder/other/prompt.txt,sha256=RD076vq0x0IjoEVQfh-G0u4nxITCpgKZGrwMlR9YAvk,2695
40
40
  src/prompts/library/draftMbl/other/placeholders.json,sha256=Gn8kQ8cMmrzRGLSFH7_8wO1_j2jxhqHd4zeivZP2SjU,4304
@@ -44,7 +44,7 @@ src/prompts/library/finalMbL/other/prompt.txt,sha256=cyeKjK94sepqXiLEeZKB4VpmT0-
44
44
  src/prompts/library/packingList/other/placeholders.json,sha256=cGUUvEFoi4Lm0BAiyD29KbNFbUgzO1s7eit_qK3F0ig,4478
45
45
  src/prompts/library/packingList/other/prompt.txt,sha256=6Q9d0KBG6YWmNtzFivvmtQmitaUE2jytfwwc5YwsUgQ,2872
46
46
  src/prompts/library/partnerInvoice/other/placeholders.json,sha256=NX6ADT4gxLpP90uoNCYDbmfBvROxxVWRKK0lRFy1n9s,10897
47
- src/prompts/library/partnerInvoice/other/prompt.txt,sha256=vMk-FBq9XkWiFiCf36t43DcIKNYh7IcGAsnfXq8vqio,8052
47
+ src/prompts/library/partnerInvoice/other/prompt.txt,sha256=8dUZy516GwJ2EXVU5HJHBm0bbGMqxyqKacfCuSGtBxY,8067
48
48
  src/prompts/library/postprocessing/port_code/placeholders.json,sha256=2TiXf3zSzrglOMPtDOlCntIa5RSvyZQAKG2-IgrCY5A,22
49
49
  src/prompts/library/postprocessing/port_code/prompt_port_code.txt,sha256=--1wunSqEr2ox958lEhjO-0JFBfOLzA3qfKYIzG_Iok,884
50
50
  src/prompts/library/preprocessing/carrier/placeholders.json,sha256=tQeVDtvembhVqvel9vGoy4qcKp1hOvg-bLCgZRdQj0g,192
@@ -52,9 +52,9 @@ src/prompts/library/preprocessing/carrier/prompt.txt,sha256=NLvRZQCZ6aWC1yTr7Q93
52
52
  src/prompts/library/shippingInstruction/other/placeholders.json,sha256=eK4AeMfORkGMWVYcqH7NjB56Zb4swHTvcQD5UQbTryg,6374
53
53
  src/prompts/library/shippingInstruction/other/prompt.txt,sha256=CbrqlKMtB-sVY-8E460KP1KNmz169YVPMrH3-uEldPg,2135
54
54
  src/prompts/prompt_library.py,sha256=VJWHeXN-s501C2GiidIIvQQuZdU6T1R27hE2dKBiI40,2555
55
- src/setup.py,sha256=EHfAl3Pvb082dl_s6Tk93IjtE3vBmrW_fp2GW4955HQ,6952
55
+ src/setup.py,sha256=yb0Pz1RI-uId5lEjgQrj1Pqo9FvwG9vs0HXRVbyST2M,7186
56
56
  src/tms.py,sha256=UXbIo1QE--hIX6NZi5Qyp2R_CP338syrY9pCTPrfgnE,1741
57
57
  src/utils.py,sha256=Ow5_Jals88o8mbZ1BoHfZpHZoCfig_UQb5aalH-mpWE,17278
58
- data_science_document_ai-1.53.0.dist-info/METADATA,sha256=s3O_vRgHuHypiv9A6R2NWiwexz4SYjjmuKARhcpwgCo,2152
59
- data_science_document_ai-1.53.0.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
60
- data_science_document_ai-1.53.0.dist-info/RECORD,,
58
+ data_science_document_ai-1.55.0.dist-info/METADATA,sha256=PRbleO7DbAfZHyjX5H-9hs2e7c6k0h5SQ9PbPA2Q6IY,2152
59
+ data_science_document_ai-1.55.0.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
60
+ data_science_document_ai-1.55.0.dist-info/RECORD,,
src/constants.py CHANGED
@@ -20,10 +20,11 @@ project_parameters = {
20
20
  # Fuzzy lookup
21
21
  "g_model_fuzzy_lookup_folder": "fuzzy_lookup",
22
22
  "item_code_lookup": "line_item_kvp_table.json",
23
+ "intermodal_partners": "intermodal_partners.json",
23
24
  "invoice_classification_lookup": "invoice_classification.json",
24
25
  "reverse_charge_sentence_lookup": "reverse_charge_sentences.json",
25
26
  # Fuzzy logic params
26
- "fuzzy_threshold_item_code": 90,
27
+ "fuzzy_threshold_item_code": 92,
27
28
  "fuzzy_threshold_reverse_charge": 80,
28
29
  "fuzzy_threshold_invoice_classification": 70,
29
30
  # Chunking params
src/pdf_processing.py CHANGED
@@ -32,7 +32,6 @@ from src.postprocessing.postprocess_partner_invoice import (
32
32
  from src.prompts.prompt_library import prompt_library
33
33
  from src.utils import (
34
34
  extract_top_pages,
35
- generate_schema_structure,
36
35
  get_pdf_page_count,
37
36
  get_processor_name,
38
37
  run_background_tasks,
@@ -250,7 +249,6 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
250
249
  prompt += "\nFor each field, provide the page number where the information was found. The page numbering starts from 0."
251
250
 
252
251
  tasks = []
253
- semaphore = asyncio.Semaphore(50)
254
252
  # Process in chunks if number of pages exceeds threshold and Process all chunks concurrently
255
253
  for chunk in (
256
254
  split_pdf_into_chunks(file_content, chunk_size=params["chunk_size"])
@@ -258,8 +256,12 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
258
256
  else [file_content]
259
257
  ):
260
258
  tasks.append(
261
- process_chunk_with_semaphore(
262
- semaphore, chunk, prompt, response_schema, llm_client, input_doc_type
259
+ process_chunk_with_retry(
260
+ chunk,
261
+ prompt,
262
+ response_schema,
263
+ llm_client,
264
+ input_doc_type,
263
265
  )
264
266
  )
265
267
 
@@ -271,25 +273,6 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
271
273
  return llm_prediction_to_tuples(results[0], number_of_pages=number_of_pages)
272
274
 
273
275
 
274
- async def process_chunk_with_semaphore(
275
- semaphore,
276
- chunk_content,
277
- prompt,
278
- response_schema,
279
- llm_client,
280
- input_doc_type,
281
- ):
282
- """Process a chunk with a semaphore to limit concurrency."""
283
- async with semaphore:
284
- return await process_chunk_with_retry(
285
- chunk_content,
286
- prompt,
287
- response_schema,
288
- llm_client,
289
- input_doc_type,
290
- )
291
-
292
-
293
276
  async def process_chunk_with_retry(
294
277
  chunk_content, prompt, response_schema, llm_client, input_doc_type, retries=2
295
278
  ):
@@ -188,15 +188,20 @@ async def process_partner_invoice(params, aggregated_data, document_type_code):
188
188
  reverse_charge_info["formattedValue"] = reverse_charge_value
189
189
  reverse_charge = aggregated_data.pop("reverseChargeSentence", None)
190
190
 
191
+ # Partner Name
192
+ partner_name = aggregated_data.get("vendorName", {}).get("documentValue", None)
193
+
191
194
  # Process everything in one go
192
- processed_items = await process_line_items_batch(params, line_items, reverse_charge)
195
+ processed_items = await process_line_items_batch(
196
+ params, line_items, reverse_charge, partner_name
197
+ )
193
198
 
194
199
  # Update your main data structure
195
200
  aggregated_data["lineItem"] = processed_items
196
201
 
197
202
 
198
203
  async def process_line_items_batch(
199
- params: dict, line_items: list[dict], reverse_charge=None
204
+ params: dict, line_items: list[dict], reverse_charge=None, partner_name=None
200
205
  ):
201
206
  """
202
207
  Processes all line items efficiently using a "Split-Apply-Combine" strategy.
@@ -234,23 +239,12 @@ async def process_line_items_batch(
234
239
 
235
240
  # Batch API Call for Embedding lookups
236
241
  if pending_line_items:
237
- values_to_fetch = list(set(pending_line_items.values()))
238
- logger.info(f"Mapping {len(values_to_fetch)} line items from Embedding API...")
239
-
240
- # Await the batch response {"desc1": "code1", "desc2": "code2"}
241
- api_results = await get_tms_mappings(
242
- input_list=values_to_fetch, embedding_type="line_items"
243
- )
242
+ code_map = await fetch_line_item_codes(pending_line_items, partner_name, params)
244
243
 
245
- # Merge API results back into original list
246
244
  for index, desc in pending_line_items.items():
247
- # Get result from API response, or None if API failed for that item
248
- forto_code = api_results.get(desc)
249
-
250
- # Update the original item
251
245
  line_items[index]["itemCode"] = {
252
246
  "documentValue": desc,
253
- "formattedValue": forto_code, # Might be None if API failed
247
+ "formattedValue": code_map.get(desc),
254
248
  "page": line_items[index]["lineItemDescription"].get("page"),
255
249
  }
256
250
 
@@ -344,12 +338,13 @@ def find_matching_lineitem(new_lineitem: str, kvp_dict: dict, threshold=90):
344
338
  return None
345
339
 
346
340
 
347
- async def associate_forto_item_code(line_item_data, params):
341
+ async def associate_forto_item_code(line_item_data, params, partner_name=None):
348
342
  """
349
343
  Associates Forto item codes to a list of line item descriptions.
350
344
  Args:
351
345
  line_item_data (dict): A dictionary where keys are original descriptions and values are cleaned descriptions.
352
346
  params (dict): Parameters containing lookup data and thresholds.
347
+ partner_name (str, optional): The name of the partner for context in matching. Defaults to None.
353
348
 
354
349
  Returns:
355
350
  list: A list of dictionaries with 'description' and 'itemCode' keys.
@@ -371,14 +366,51 @@ async def associate_forto_item_code(line_item_data, params):
371
366
 
372
367
  # Batch API Call for Embedding lookups
373
368
  if pending_line_items:
374
- api_results = await get_tms_mappings(
375
- input_list=list(pending_line_items.values()),
376
- embedding_type="line_items",
377
- )
369
+ code_map = await fetch_line_item_codes(pending_line_items, partner_name, params)
378
370
 
379
- # Merge API results back into original list
380
371
  for desc, f_desc in pending_line_items.items():
381
- code = api_results.get(f_desc)
382
- result.append({"description": desc, "itemCode": code})
372
+ result.append(
373
+ {
374
+ "description": desc,
375
+ "itemCode": code_map.get(f_desc),
376
+ }
377
+ )
378
+
379
+ return result
380
+
383
381
 
382
+ async def fetch_line_item_codes(
383
+ pending_line_items: dict,
384
+ partner_name: str | None,
385
+ params: dict,
386
+ ):
387
+ """Returns: {original_description: mapped_code_or_None}"""
388
+ t_mode = (
389
+ find_matching_lineitem(
390
+ partner_name.upper(),
391
+ params["lookup_data"]["intermodal_partners"],
392
+ threshold=87,
393
+ )
394
+ if partner_name
395
+ else None
396
+ )
397
+
398
+ unique_descs = list(set(pending_line_items.values()))
399
+ logger.info(f"Mapping {len(unique_descs)} line items from Embedding API...")
400
+
401
+ # Build API input map
402
+ api_input_map = {
403
+ desc: f"{t_mode} - {desc}" if t_mode else desc for desc in unique_descs
404
+ }
405
+
406
+ api_results = await get_tms_mappings(
407
+ input_list=list(api_input_map.values()),
408
+ embedding_type="line_items",
409
+ )
410
+
411
+ # Normalize response back to original descriptions
412
+ result = {
413
+ original_desc: api_results.get(api_desc)
414
+ for original_desc, api_desc in api_input_map.items()
415
+ }
384
416
  return result
@@ -48,7 +48,7 @@ Your role is to accurately extract specific entities from these invoices to supp
48
48
  - issueDate: The date the document was issued.
49
49
  - dueDate: The date by which the payment should be made. Do Not calculate dueDate based on issueDate or any other date. Extract it directly from the invoice.
50
50
 
51
- - lineItem: Details of each COGS and Customs line item on the invoice. Make sure to extract each amount and currency separately.
51
+ - lineItem: Details of each COGS and Customs line item on the invoice from each page. Make sure to extract each amount and currency separately.
52
52
  - uniqueId: A unique id which associated with the lineItem as each line item can belong to a different shipment. Extract only if its available in the line item. Either a shipmentId starting with an S and followed by 6 or 8 numeric values or a mblNumber. If shipmentId or mblNumber does not exist, set it to containerNumber.
53
53
  - lineItemDescription: The name or description of the item. Usually, it will be a one line sentence.
54
54
  - unitPrice: Even if the quantity is not mentioned, you can still extract the unit price. Check the naming of the columns in a different languages, it can be "Unit Price", "Prezzo unitario", "Prix Unitaire", "Unitario", etc. Refer to "Prezzo unitario" field in the italian invoice example.
@@ -92,6 +92,7 @@ Your role is to accurately extract specific entities from these invoices to supp
92
92
 
93
93
  IMPORTANT NOTE:
94
94
  - Ensure all extracted values are directly from the document. Do not make assumptions or modifications.
95
+ - Extract line items from each page if the invoice spans multiple pages.
95
96
  - Do not normalize or modify any entity values.
96
97
  - Pay attention to the line item details and paymentInformation, as they may vary significantly across different invoices.
97
98
 
@@ -46,7 +46,7 @@ Your role is to accurately extract specific entities from these invoices to supp
46
46
 
47
47
  - eta and etd: Few invoices contains same date for ARRIVED/DEPARTED or ETA/ETD. Extract it for both eta and etd.
48
48
 
49
- - lineItem: Details of each COGS and Customs line item on the invoice. Make sure to extract each amount and currency separately.
49
+ - lineItem: Details of each COGS and Customs line item on the invoice from each page. Make sure to extract each amount and currency separately.
50
50
  - uniqueId: A unique id which associated with the lineItem as each line item can belong to a different shipment. Extract only if its available in the line item. Either a shipmentId starting with an S and followed by 6 or 8 numeric values or a mblNumber. If shipmentId or mblNumber does not exist, set it to containerNumber.
51
51
  - lineItemDescription: The name or description of the item. Usually, it will be a one line sentence.
52
52
  - unitPrice: Even if the quantity is not mentioned, you can still extract the unit price. Check the naming of the columns in a different languages, it can be "Unit Price", "Prezzo unitario", "Prix Unitaire", "Unitario", etc. Refer to "Prezzo unitario" field in the italian invoice example.
src/setup.py CHANGED
@@ -184,6 +184,9 @@ def setup_lookup_data(params):
184
184
  input_path_item_code = (
185
185
  f'{params["g_model_fuzzy_lookup_folder"]}/{params["item_code_lookup"]}'
186
186
  )
187
+ input_path_intermodal_partners = (
188
+ f'{params["g_model_fuzzy_lookup_folder"]}/{params["intermodal_partners"]}'
189
+ )
187
190
  input_path_invoice_classification = f'{params["g_model_fuzzy_lookup_folder"]}/{params["invoice_classification_lookup"]}' # noqa: E501
188
191
  input_path_reverse_charge = f'{params["g_model_fuzzy_lookup_folder"]}/{params["reverse_charge_sentence_lookup"]}'
189
192
 
@@ -194,6 +197,9 @@ def setup_lookup_data(params):
194
197
  return json.loads(downloaded_data)
195
198
 
196
199
  data["item_code"] = download_json_from_bucket(input_path_item_code)
200
+ data["intermodal_partners"] = download_json_from_bucket(
201
+ input_path_intermodal_partners
202
+ )
197
203
  data["invoice_classification"] = download_json_from_bucket(
198
204
  input_path_invoice_classification
199
205
  )