data-science-document-ai 1.52.1__tar.gz → 1.53.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. {data_science_document_ai-1.52.1 → data_science_document_ai-1.53.0}/PKG-INFO +1 -1
  2. {data_science_document_ai-1.52.1 → data_science_document_ai-1.53.0}/pyproject.toml +1 -1
  3. {data_science_document_ai-1.52.1 → data_science_document_ai-1.53.0}/src/pdf_processing.py +22 -2
  4. {data_science_document_ai-1.52.1 → data_science_document_ai-1.53.0}/src/postprocessing/postprocess_partner_invoice.py +21 -0
  5. {data_science_document_ai-1.52.1 → data_science_document_ai-1.53.0}/src/constants.py +0 -0
  6. {data_science_document_ai-1.52.1 → data_science_document_ai-1.53.0}/src/constants_sandbox.py +0 -0
  7. {data_science_document_ai-1.52.1 → data_science_document_ai-1.53.0}/src/docai.py +0 -0
  8. {data_science_document_ai-1.52.1 → data_science_document_ai-1.53.0}/src/docai_processor_config.yaml +0 -0
  9. {data_science_document_ai-1.52.1 → data_science_document_ai-1.53.0}/src/excel_processing.py +0 -0
  10. {data_science_document_ai-1.52.1 → data_science_document_ai-1.53.0}/src/io.py +0 -0
  11. {data_science_document_ai-1.52.1 → data_science_document_ai-1.53.0}/src/llm.py +0 -0
  12. {data_science_document_ai-1.52.1 → data_science_document_ai-1.53.0}/src/log_setup.py +0 -0
  13. {data_science_document_ai-1.52.1 → data_science_document_ai-1.53.0}/src/postprocessing/common.py +0 -0
  14. {data_science_document_ai-1.52.1 → data_science_document_ai-1.53.0}/src/postprocessing/postprocess_booking_confirmation.py +0 -0
  15. {data_science_document_ai-1.52.1 → data_science_document_ai-1.53.0}/src/postprocessing/postprocess_commercial_invoice.py +0 -0
  16. {data_science_document_ai-1.52.1 → data_science_document_ai-1.53.0}/src/prompts/library/arrivalNotice/other/placeholders.json +0 -0
  17. {data_science_document_ai-1.52.1 → data_science_document_ai-1.53.0}/src/prompts/library/arrivalNotice/other/prompt.txt +0 -0
  18. {data_science_document_ai-1.52.1 → data_science_document_ai-1.53.0}/src/prompts/library/bookingConfirmation/evergreen/placeholders.json +0 -0
  19. {data_science_document_ai-1.52.1 → data_science_document_ai-1.53.0}/src/prompts/library/bookingConfirmation/evergreen/prompt.txt +0 -0
  20. {data_science_document_ai-1.52.1 → data_science_document_ai-1.53.0}/src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +0 -0
  21. {data_science_document_ai-1.52.1 → data_science_document_ai-1.53.0}/src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +0 -0
  22. {data_science_document_ai-1.52.1 → data_science_document_ai-1.53.0}/src/prompts/library/bookingConfirmation/maersk/placeholders.json +0 -0
  23. {data_science_document_ai-1.52.1 → data_science_document_ai-1.53.0}/src/prompts/library/bookingConfirmation/maersk/prompt.txt +0 -0
  24. {data_science_document_ai-1.52.1 → data_science_document_ai-1.53.0}/src/prompts/library/bookingConfirmation/msc/placeholders.json +0 -0
  25. {data_science_document_ai-1.52.1 → data_science_document_ai-1.53.0}/src/prompts/library/bookingConfirmation/msc/prompt.txt +0 -0
  26. {data_science_document_ai-1.52.1 → data_science_document_ai-1.53.0}/src/prompts/library/bookingConfirmation/oocl/placeholders.json +0 -0
  27. {data_science_document_ai-1.52.1 → data_science_document_ai-1.53.0}/src/prompts/library/bookingConfirmation/oocl/prompt.txt +0 -0
  28. {data_science_document_ai-1.52.1 → data_science_document_ai-1.53.0}/src/prompts/library/bookingConfirmation/other/placeholders.json +0 -0
  29. {data_science_document_ai-1.52.1 → data_science_document_ai-1.53.0}/src/prompts/library/bookingConfirmation/other/prompt.txt +0 -0
  30. {data_science_document_ai-1.52.1 → data_science_document_ai-1.53.0}/src/prompts/library/bookingConfirmation/yangming/placeholders.json +0 -0
  31. {data_science_document_ai-1.52.1 → data_science_document_ai-1.53.0}/src/prompts/library/bookingConfirmation/yangming/prompt.txt +0 -0
  32. {data_science_document_ai-1.52.1 → data_science_document_ai-1.53.0}/src/prompts/library/bundeskasse/other/placeholders.json +0 -0
  33. {data_science_document_ai-1.52.1 → data_science_document_ai-1.53.0}/src/prompts/library/bundeskasse/other/prompt.txt +0 -0
  34. {data_science_document_ai-1.52.1 → data_science_document_ai-1.53.0}/src/prompts/library/commercialInvoice/other/placeholders.json +0 -0
  35. {data_science_document_ai-1.52.1 → data_science_document_ai-1.53.0}/src/prompts/library/commercialInvoice/other/prompt.txt +0 -0
  36. {data_science_document_ai-1.52.1 → data_science_document_ai-1.53.0}/src/prompts/library/customsAssessment/other/placeholders.json +0 -0
  37. {data_science_document_ai-1.52.1 → data_science_document_ai-1.53.0}/src/prompts/library/customsAssessment/other/prompt.txt +0 -0
  38. {data_science_document_ai-1.52.1 → data_science_document_ai-1.53.0}/src/prompts/library/customsInvoice/other/placeholders.json +0 -0
  39. {data_science_document_ai-1.52.1 → data_science_document_ai-1.53.0}/src/prompts/library/customsInvoice/other/prompt.txt +0 -0
  40. {data_science_document_ai-1.52.1 → data_science_document_ai-1.53.0}/src/prompts/library/deliveryOrder/other/placeholders.json +0 -0
  41. {data_science_document_ai-1.52.1 → data_science_document_ai-1.53.0}/src/prompts/library/deliveryOrder/other/prompt.txt +0 -0
  42. {data_science_document_ai-1.52.1 → data_science_document_ai-1.53.0}/src/prompts/library/draftMbl/other/placeholders.json +0 -0
  43. {data_science_document_ai-1.52.1 → data_science_document_ai-1.53.0}/src/prompts/library/draftMbl/other/prompt.txt +0 -0
  44. {data_science_document_ai-1.52.1 → data_science_document_ai-1.53.0}/src/prompts/library/finalMbL/other/placeholders.json +0 -0
  45. {data_science_document_ai-1.52.1 → data_science_document_ai-1.53.0}/src/prompts/library/finalMbL/other/prompt.txt +0 -0
  46. {data_science_document_ai-1.52.1 → data_science_document_ai-1.53.0}/src/prompts/library/packingList/other/placeholders.json +0 -0
  47. {data_science_document_ai-1.52.1 → data_science_document_ai-1.53.0}/src/prompts/library/packingList/other/prompt.txt +0 -0
  48. {data_science_document_ai-1.52.1 → data_science_document_ai-1.53.0}/src/prompts/library/partnerInvoice/other/placeholders.json +0 -0
  49. {data_science_document_ai-1.52.1 → data_science_document_ai-1.53.0}/src/prompts/library/partnerInvoice/other/prompt.txt +0 -0
  50. {data_science_document_ai-1.52.1 → data_science_document_ai-1.53.0}/src/prompts/library/postprocessing/port_code/placeholders.json +0 -0
  51. {data_science_document_ai-1.52.1 → data_science_document_ai-1.53.0}/src/prompts/library/postprocessing/port_code/prompt_port_code.txt +0 -0
  52. {data_science_document_ai-1.52.1 → data_science_document_ai-1.53.0}/src/prompts/library/preprocessing/carrier/placeholders.json +0 -0
  53. {data_science_document_ai-1.52.1 → data_science_document_ai-1.53.0}/src/prompts/library/preprocessing/carrier/prompt.txt +0 -0
  54. {data_science_document_ai-1.52.1 → data_science_document_ai-1.53.0}/src/prompts/library/shippingInstruction/other/placeholders.json +0 -0
  55. {data_science_document_ai-1.52.1 → data_science_document_ai-1.53.0}/src/prompts/library/shippingInstruction/other/prompt.txt +0 -0
  56. {data_science_document_ai-1.52.1 → data_science_document_ai-1.53.0}/src/prompts/prompt_library.py +0 -0
  57. {data_science_document_ai-1.52.1 → data_science_document_ai-1.53.0}/src/setup.py +0 -0
  58. {data_science_document_ai-1.52.1 → data_science_document_ai-1.53.0}/src/tms.py +0 -0
  59. {data_science_document_ai-1.52.1 → data_science_document_ai-1.53.0}/src/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: data-science-document-ai
3
- Version: 1.52.1
3
+ Version: 1.53.0
4
4
  Summary: "Document AI repo for data science"
5
5
  Author: Naomi Nguyen
6
6
  Author-email: naomi.nguyen@forto.com
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "data-science-document-ai"
3
- version = "1.52.1"
3
+ version = "1.53.0"
4
4
  description = "\"Document AI repo for data science\""
5
5
  authors = ["Naomi Nguyen <naomi.nguyen@forto.com>", "Kumar Rajendrababu <kumar.rajendrababu@forto.com>", "Igor Tonko <igor.tonko@forto.com>", "Osman Demirel <osman.demirel@forto.com>"]
6
6
  packages = [
@@ -250,6 +250,7 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
250
250
  prompt += "\nFor each field, provide the page number where the information was found. The page numbering starts from 0."
251
251
 
252
252
  tasks = []
253
+ semaphore = asyncio.Semaphore(50)
253
254
  # Process in chunks if number of pages exceeds threshold and Process all chunks concurrently
254
255
  for chunk in (
255
256
  split_pdf_into_chunks(file_content, chunk_size=params["chunk_size"])
@@ -257,8 +258,8 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
257
258
  else [file_content]
258
259
  ):
259
260
  tasks.append(
260
- process_chunk_with_retry(
261
- chunk, prompt, response_schema, llm_client, input_doc_type
261
+ process_chunk_with_semaphore(
262
+ semaphore, chunk, prompt, response_schema, llm_client, input_doc_type
262
263
  )
263
264
  )
264
265
 
@@ -270,6 +271,25 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
270
271
  return llm_prediction_to_tuples(results[0], number_of_pages=number_of_pages)
271
272
 
272
273
 
274
+ async def process_chunk_with_semaphore(
275
+ semaphore,
276
+ chunk_content,
277
+ prompt,
278
+ response_schema,
279
+ llm_client,
280
+ input_doc_type,
281
+ ):
282
+ """Process a chunk with a semaphore to limit concurrency."""
283
+ async with semaphore:
284
+ return await process_chunk_with_retry(
285
+ chunk_content,
286
+ prompt,
287
+ response_schema,
288
+ llm_client,
289
+ input_doc_type,
290
+ )
291
+
292
+
273
293
  async def process_chunk_with_retry(
274
294
  chunk_content, prompt, response_schema, llm_client, input_doc_type, retries=2
275
295
  ):
@@ -1,4 +1,6 @@
1
1
  """This module contains the postprocessing functions for the partner invoice."""
2
+ from collections import defaultdict
3
+
2
4
  from rapidfuzz import fuzz, process
3
5
 
4
6
  from src.io import logger
@@ -143,6 +145,20 @@ def update_recipient_and_vendor(aggregated_data, is_recipient_forto):
143
145
  ] = "Dasbachstraße 15, 54292 Trier, Germany"
144
146
 
145
147
 
148
+ def select_unique_bank_account(bank_account):
149
+ # Select the unique bank account if multiple are present
150
+ if isinstance(bank_account, list) and bank_account:
151
+ best = defaultdict(lambda: None)
152
+
153
+ for item in bank_account:
154
+ dv = item["documentValue"]
155
+ if best[dv] is None or item["page"] < best[dv]["page"]:
156
+ best[dv] = item
157
+
158
+ unique = list(best.values())
159
+ return unique
160
+
161
+
146
162
  async def process_partner_invoice(params, aggregated_data, document_type_code):
147
163
  """Process the partner invoice data."""
148
164
  # Post process bundeskasse invoices
@@ -150,6 +166,11 @@ async def process_partner_invoice(params, aggregated_data, document_type_code):
150
166
  post_process_bundeskasse(aggregated_data)
151
167
  return
152
168
 
169
+ if "bankAccount" in aggregated_data:
170
+ aggregated_data["bankAccount"] = select_unique_bank_account(
171
+ aggregated_data["bankAccount"]
172
+ )
173
+
153
174
  line_items = aggregated_data.get("lineItem", [])
154
175
  # Add debug logging
155
176
  logger.info(f"Processing partnerInvoice with {len(line_items)} line items")