data-science-document-ai 1.53.0__tar.gz → 1.54.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59):
  1. {data_science_document_ai-1.53.0 → data_science_document_ai-1.54.0}/PKG-INFO +1 -1
  2. {data_science_document_ai-1.53.0 → data_science_document_ai-1.54.0}/pyproject.toml +1 -1
  3. {data_science_document_ai-1.53.0 → data_science_document_ai-1.54.0}/src/constants.py +2 -1
  4. {data_science_document_ai-1.53.0 → data_science_document_ai-1.54.0}/src/pdf_processing.py +6 -23
  5. {data_science_document_ai-1.53.0 → data_science_document_ai-1.54.0}/src/postprocessing/postprocess_partner_invoice.py +55 -23
  6. {data_science_document_ai-1.53.0 → data_science_document_ai-1.54.0}/src/setup.py +6 -0
  7. {data_science_document_ai-1.53.0 → data_science_document_ai-1.54.0}/src/constants_sandbox.py +0 -0
  8. {data_science_document_ai-1.53.0 → data_science_document_ai-1.54.0}/src/docai.py +0 -0
  9. {data_science_document_ai-1.53.0 → data_science_document_ai-1.54.0}/src/docai_processor_config.yaml +0 -0
  10. {data_science_document_ai-1.53.0 → data_science_document_ai-1.54.0}/src/excel_processing.py +0 -0
  11. {data_science_document_ai-1.53.0 → data_science_document_ai-1.54.0}/src/io.py +0 -0
  12. {data_science_document_ai-1.53.0 → data_science_document_ai-1.54.0}/src/llm.py +0 -0
  13. {data_science_document_ai-1.53.0 → data_science_document_ai-1.54.0}/src/log_setup.py +0 -0
  14. {data_science_document_ai-1.53.0 → data_science_document_ai-1.54.0}/src/postprocessing/common.py +0 -0
  15. {data_science_document_ai-1.53.0 → data_science_document_ai-1.54.0}/src/postprocessing/postprocess_booking_confirmation.py +0 -0
  16. {data_science_document_ai-1.53.0 → data_science_document_ai-1.54.0}/src/postprocessing/postprocess_commercial_invoice.py +0 -0
  17. {data_science_document_ai-1.53.0 → data_science_document_ai-1.54.0}/src/prompts/library/arrivalNotice/other/placeholders.json +0 -0
  18. {data_science_document_ai-1.53.0 → data_science_document_ai-1.54.0}/src/prompts/library/arrivalNotice/other/prompt.txt +0 -0
  19. {data_science_document_ai-1.53.0 → data_science_document_ai-1.54.0}/src/prompts/library/bookingConfirmation/evergreen/placeholders.json +0 -0
  20. {data_science_document_ai-1.53.0 → data_science_document_ai-1.54.0}/src/prompts/library/bookingConfirmation/evergreen/prompt.txt +0 -0
  21. {data_science_document_ai-1.53.0 → data_science_document_ai-1.54.0}/src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +0 -0
  22. {data_science_document_ai-1.53.0 → data_science_document_ai-1.54.0}/src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +0 -0
  23. {data_science_document_ai-1.53.0 → data_science_document_ai-1.54.0}/src/prompts/library/bookingConfirmation/maersk/placeholders.json +0 -0
  24. {data_science_document_ai-1.53.0 → data_science_document_ai-1.54.0}/src/prompts/library/bookingConfirmation/maersk/prompt.txt +0 -0
  25. {data_science_document_ai-1.53.0 → data_science_document_ai-1.54.0}/src/prompts/library/bookingConfirmation/msc/placeholders.json +0 -0
  26. {data_science_document_ai-1.53.0 → data_science_document_ai-1.54.0}/src/prompts/library/bookingConfirmation/msc/prompt.txt +0 -0
  27. {data_science_document_ai-1.53.0 → data_science_document_ai-1.54.0}/src/prompts/library/bookingConfirmation/oocl/placeholders.json +0 -0
  28. {data_science_document_ai-1.53.0 → data_science_document_ai-1.54.0}/src/prompts/library/bookingConfirmation/oocl/prompt.txt +0 -0
  29. {data_science_document_ai-1.53.0 → data_science_document_ai-1.54.0}/src/prompts/library/bookingConfirmation/other/placeholders.json +0 -0
  30. {data_science_document_ai-1.53.0 → data_science_document_ai-1.54.0}/src/prompts/library/bookingConfirmation/other/prompt.txt +0 -0
  31. {data_science_document_ai-1.53.0 → data_science_document_ai-1.54.0}/src/prompts/library/bookingConfirmation/yangming/placeholders.json +0 -0
  32. {data_science_document_ai-1.53.0 → data_science_document_ai-1.54.0}/src/prompts/library/bookingConfirmation/yangming/prompt.txt +0 -0
  33. {data_science_document_ai-1.53.0 → data_science_document_ai-1.54.0}/src/prompts/library/bundeskasse/other/placeholders.json +0 -0
  34. {data_science_document_ai-1.53.0 → data_science_document_ai-1.54.0}/src/prompts/library/bundeskasse/other/prompt.txt +0 -0
  35. {data_science_document_ai-1.53.0 → data_science_document_ai-1.54.0}/src/prompts/library/commercialInvoice/other/placeholders.json +0 -0
  36. {data_science_document_ai-1.53.0 → data_science_document_ai-1.54.0}/src/prompts/library/commercialInvoice/other/prompt.txt +0 -0
  37. {data_science_document_ai-1.53.0 → data_science_document_ai-1.54.0}/src/prompts/library/customsAssessment/other/placeholders.json +0 -0
  38. {data_science_document_ai-1.53.0 → data_science_document_ai-1.54.0}/src/prompts/library/customsAssessment/other/prompt.txt +0 -0
  39. {data_science_document_ai-1.53.0 → data_science_document_ai-1.54.0}/src/prompts/library/customsInvoice/other/placeholders.json +0 -0
  40. {data_science_document_ai-1.53.0 → data_science_document_ai-1.54.0}/src/prompts/library/customsInvoice/other/prompt.txt +0 -0
  41. {data_science_document_ai-1.53.0 → data_science_document_ai-1.54.0}/src/prompts/library/deliveryOrder/other/placeholders.json +0 -0
  42. {data_science_document_ai-1.53.0 → data_science_document_ai-1.54.0}/src/prompts/library/deliveryOrder/other/prompt.txt +0 -0
  43. {data_science_document_ai-1.53.0 → data_science_document_ai-1.54.0}/src/prompts/library/draftMbl/other/placeholders.json +0 -0
  44. {data_science_document_ai-1.53.0 → data_science_document_ai-1.54.0}/src/prompts/library/draftMbl/other/prompt.txt +0 -0
  45. {data_science_document_ai-1.53.0 → data_science_document_ai-1.54.0}/src/prompts/library/finalMbL/other/placeholders.json +0 -0
  46. {data_science_document_ai-1.53.0 → data_science_document_ai-1.54.0}/src/prompts/library/finalMbL/other/prompt.txt +0 -0
  47. {data_science_document_ai-1.53.0 → data_science_document_ai-1.54.0}/src/prompts/library/packingList/other/placeholders.json +0 -0
  48. {data_science_document_ai-1.53.0 → data_science_document_ai-1.54.0}/src/prompts/library/packingList/other/prompt.txt +0 -0
  49. {data_science_document_ai-1.53.0 → data_science_document_ai-1.54.0}/src/prompts/library/partnerInvoice/other/placeholders.json +0 -0
  50. {data_science_document_ai-1.53.0 → data_science_document_ai-1.54.0}/src/prompts/library/partnerInvoice/other/prompt.txt +0 -0
  51. {data_science_document_ai-1.53.0 → data_science_document_ai-1.54.0}/src/prompts/library/postprocessing/port_code/placeholders.json +0 -0
  52. {data_science_document_ai-1.53.0 → data_science_document_ai-1.54.0}/src/prompts/library/postprocessing/port_code/prompt_port_code.txt +0 -0
  53. {data_science_document_ai-1.53.0 → data_science_document_ai-1.54.0}/src/prompts/library/preprocessing/carrier/placeholders.json +0 -0
  54. {data_science_document_ai-1.53.0 → data_science_document_ai-1.54.0}/src/prompts/library/preprocessing/carrier/prompt.txt +0 -0
  55. {data_science_document_ai-1.53.0 → data_science_document_ai-1.54.0}/src/prompts/library/shippingInstruction/other/placeholders.json +0 -0
  56. {data_science_document_ai-1.53.0 → data_science_document_ai-1.54.0}/src/prompts/library/shippingInstruction/other/prompt.txt +0 -0
  57. {data_science_document_ai-1.53.0 → data_science_document_ai-1.54.0}/src/prompts/prompt_library.py +0 -0
  58. {data_science_document_ai-1.53.0 → data_science_document_ai-1.54.0}/src/tms.py +0 -0
  59. {data_science_document_ai-1.53.0 → data_science_document_ai-1.54.0}/src/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: data-science-document-ai
3
- Version: 1.53.0
3
+ Version: 1.54.0
4
4
  Summary: "Document AI repo for data science"
5
5
  Author: Naomi Nguyen
6
6
  Author-email: naomi.nguyen@forto.com
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "data-science-document-ai"
3
- version = "1.53.0"
3
+ version = "1.54.0"
4
4
  description = "\"Document AI repo for data science\""
5
5
  authors = ["Naomi Nguyen <naomi.nguyen@forto.com>", "Kumar Rajendrababu <kumar.rajendrababu@forto.com>", "Igor Tonko <igor.tonko@forto.com>", "Osman Demirel <osman.demirel@forto.com>"]
6
6
  packages = [
@@ -20,10 +20,11 @@ project_parameters = {
20
20
  # Fuzzy lookup
21
21
  "g_model_fuzzy_lookup_folder": "fuzzy_lookup",
22
22
  "item_code_lookup": "line_item_kvp_table.json",
23
+ "intermodal_partners": "intermodal_partners.json",
23
24
  "invoice_classification_lookup": "invoice_classification.json",
24
25
  "reverse_charge_sentence_lookup": "reverse_charge_sentences.json",
25
26
  # Fuzzy logic params
26
- "fuzzy_threshold_item_code": 90,
27
+ "fuzzy_threshold_item_code": 92,
27
28
  "fuzzy_threshold_reverse_charge": 80,
28
29
  "fuzzy_threshold_invoice_classification": 70,
29
30
  # Chunking params
@@ -32,7 +32,6 @@ from src.postprocessing.postprocess_partner_invoice import (
32
32
  from src.prompts.prompt_library import prompt_library
33
33
  from src.utils import (
34
34
  extract_top_pages,
35
- generate_schema_structure,
36
35
  get_pdf_page_count,
37
36
  get_processor_name,
38
37
  run_background_tasks,
@@ -250,7 +249,6 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
250
249
  prompt += "\nFor each field, provide the page number where the information was found. The page numbering starts from 0."
251
250
 
252
251
  tasks = []
253
- semaphore = asyncio.Semaphore(50)
254
252
  # Process in chunks if number of pages exceeds threshold and Process all chunks concurrently
255
253
  for chunk in (
256
254
  split_pdf_into_chunks(file_content, chunk_size=params["chunk_size"])
@@ -258,8 +256,12 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
258
256
  else [file_content]
259
257
  ):
260
258
  tasks.append(
261
- process_chunk_with_semaphore(
262
- semaphore, chunk, prompt, response_schema, llm_client, input_doc_type
259
+ process_chunk_with_retry(
260
+ chunk,
261
+ prompt,
262
+ response_schema,
263
+ llm_client,
264
+ input_doc_type,
263
265
  )
264
266
  )
265
267
 
@@ -271,25 +273,6 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
271
273
  return llm_prediction_to_tuples(results[0], number_of_pages=number_of_pages)
272
274
 
273
275
 
274
- async def process_chunk_with_semaphore(
275
- semaphore,
276
- chunk_content,
277
- prompt,
278
- response_schema,
279
- llm_client,
280
- input_doc_type,
281
- ):
282
- """Process a chunk with a semaphore to limit concurrency."""
283
- async with semaphore:
284
- return await process_chunk_with_retry(
285
- chunk_content,
286
- prompt,
287
- response_schema,
288
- llm_client,
289
- input_doc_type,
290
- )
291
-
292
-
293
276
  async def process_chunk_with_retry(
294
277
  chunk_content, prompt, response_schema, llm_client, input_doc_type, retries=2
295
278
  ):
@@ -188,15 +188,20 @@ async def process_partner_invoice(params, aggregated_data, document_type_code):
188
188
  reverse_charge_info["formattedValue"] = reverse_charge_value
189
189
  reverse_charge = aggregated_data.pop("reverseChargeSentence", None)
190
190
 
191
+ # Partner Name
192
+ partner_name = aggregated_data.get("vendorName", {}).get("documentValue", None)
193
+
191
194
  # Process everything in one go
192
- processed_items = await process_line_items_batch(params, line_items, reverse_charge)
195
+ processed_items = await process_line_items_batch(
196
+ params, line_items, reverse_charge, partner_name
197
+ )
193
198
 
194
199
  # Update your main data structure
195
200
  aggregated_data["lineItem"] = processed_items
196
201
 
197
202
 
198
203
  async def process_line_items_batch(
199
- params: dict, line_items: list[dict], reverse_charge=None
204
+ params: dict, line_items: list[dict], reverse_charge=None, partner_name=None
200
205
  ):
201
206
  """
202
207
  Processes all line items efficiently using a "Split-Apply-Combine" strategy.
@@ -234,23 +239,12 @@ async def process_line_items_batch(
234
239
 
235
240
  # Batch API Call for Embedding lookups
236
241
  if pending_line_items:
237
- values_to_fetch = list(set(pending_line_items.values()))
238
- logger.info(f"Mapping {len(values_to_fetch)} line items from Embedding API...")
239
-
240
- # Await the batch response {"desc1": "code1", "desc2": "code2"}
241
- api_results = await get_tms_mappings(
242
- input_list=values_to_fetch, embedding_type="line_items"
243
- )
242
+ code_map = await fetch_line_item_codes(pending_line_items, partner_name, params)
244
243
 
245
- # Merge API results back into original list
246
244
  for index, desc in pending_line_items.items():
247
- # Get result from API response, or None if API failed for that item
248
- forto_code = api_results.get(desc)
249
-
250
- # Update the original item
251
245
  line_items[index]["itemCode"] = {
252
246
  "documentValue": desc,
253
- "formattedValue": forto_code, # Might be None if API failed
247
+ "formattedValue": code_map.get(desc),
254
248
  "page": line_items[index]["lineItemDescription"].get("page"),
255
249
  }
256
250
 
@@ -344,12 +338,13 @@ def find_matching_lineitem(new_lineitem: str, kvp_dict: dict, threshold=90):
344
338
  return None
345
339
 
346
340
 
347
- async def associate_forto_item_code(line_item_data, params):
341
+ async def associate_forto_item_code(line_item_data, params, partner_name=None):
348
342
  """
349
343
  Associates Forto item codes to a list of line item descriptions.
350
344
  Args:
351
345
  line_item_data (dict): A dictionary where keys are original descriptions and values are cleaned descriptions.
352
346
  params (dict): Parameters containing lookup data and thresholds.
347
+ partner_name (str, optional): The name of the partner for context in matching. Defaults to None.
353
348
 
354
349
  Returns:
355
350
  list: A list of dictionaries with 'description' and 'itemCode' keys.
@@ -371,14 +366,51 @@ async def associate_forto_item_code(line_item_data, params):
371
366
 
372
367
  # Batch API Call for Embedding lookups
373
368
  if pending_line_items:
374
- api_results = await get_tms_mappings(
375
- input_list=list(pending_line_items.values()),
376
- embedding_type="line_items",
377
- )
369
+ code_map = await fetch_line_item_codes(pending_line_items, partner_name, params)
378
370
 
379
- # Merge API results back into original list
380
371
  for desc, f_desc in pending_line_items.items():
381
- code = api_results.get(f_desc)
382
- result.append({"description": desc, "itemCode": code})
372
+ result.append(
373
+ {
374
+ "description": desc,
375
+ "itemCode": code_map.get(f_desc),
376
+ }
377
+ )
378
+
379
+ return result
380
+
383
381
 
382
+ async def fetch_line_item_codes(
383
+ pending_line_items: dict,
384
+ partner_name: str | None,
385
+ params: dict,
386
+ ):
387
+ """Returns: {original_description: mapped_code_or_None}"""
388
+ t_mode = (
389
+ find_matching_lineitem(
390
+ partner_name.upper(),
391
+ params["lookup_data"]["intermodal_partners"],
392
+ threshold=87,
393
+ )
394
+ if partner_name
395
+ else None
396
+ )
397
+
398
+ unique_descs = list(set(pending_line_items.values()))
399
+ logger.info(f"Mapping {len(unique_descs)} line items from Embedding API...")
400
+
401
+ # Build API input map
402
+ api_input_map = {
403
+ desc: f"{t_mode} - {desc}" if t_mode else desc for desc in unique_descs
404
+ }
405
+
406
+ api_results = await get_tms_mappings(
407
+ input_list=list(api_input_map.values()),
408
+ embedding_type="line_items",
409
+ )
410
+
411
+ # Normalize response back to original descriptions
412
+ result = {
413
+ original_desc: api_results.get(api_desc)
414
+ for original_desc, api_desc in api_input_map.items()
415
+ }
384
416
  return result
@@ -184,6 +184,9 @@ def setup_lookup_data(params):
184
184
  input_path_item_code = (
185
185
  f'{params["g_model_fuzzy_lookup_folder"]}/{params["item_code_lookup"]}'
186
186
  )
187
+ input_path_intermodal_partners = (
188
+ f'{params["g_model_fuzzy_lookup_folder"]}/{params["intermodal_partners"]}'
189
+ )
187
190
  input_path_invoice_classification = f'{params["g_model_fuzzy_lookup_folder"]}/{params["invoice_classification_lookup"]}' # noqa: E501
188
191
  input_path_reverse_charge = f'{params["g_model_fuzzy_lookup_folder"]}/{params["reverse_charge_sentence_lookup"]}'
189
192
 
@@ -194,6 +197,9 @@ def setup_lookup_data(params):
194
197
  return json.loads(downloaded_data)
195
198
 
196
199
  data["item_code"] = download_json_from_bucket(input_path_item_code)
200
+ data["intermodal_partners"] = download_json_from_bucket(
201
+ input_path_intermodal_partners
202
+ )
197
203
  data["invoice_classification"] = download_json_from_bucket(
198
204
  input_path_invoice_classification
199
205
  )