data-science-document-ai 1.51.1__py3-none-any.whl → 1.53.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: data-science-document-ai
-Version: 1.51.1
+Version: 1.53.0
 Summary: "Document AI repo for data science"
 Author: Naomi Nguyen
 Author-email: naomi.nguyen@forto.com
@@ -6,11 +6,11 @@ src/excel_processing.py,sha256=TRgAzSHvL1WKbUgjHtpXL701bPhiWGH7kk3S6e1UPaA,3074
 src/io.py,sha256=rYjXVLlriEacw1uNuPIYhg12bXNu48Qs9GYMY2YcVTE,5563
 src/llm.py,sha256=a7UYA4ITUNjzct_2fHgM-bma_XWc28VC0FV71g9tnUI,7137
 src/log_setup.py,sha256=RhHnpXqcl-ii4EJzRt47CF2R-Q3YPF68tepg_Kg7tkw,2895
-src/pdf_processing.py,sha256=oKVPnIu_keiN17XLOGImeyJ4iMT2H51x4OD1Tp9yw1s,19992
+src/pdf_processing.py,sha256=ER-gwh_YfJ-bMqh3nI8K89CZPAdPNnwjLmJ-5cnn1Rk,20469
 src/postprocessing/common.py,sha256=dagAg0hZGuZc03bXdfOolxekewMEVUfz917IGCiAtWI,26118
 src/postprocessing/postprocess_booking_confirmation.py,sha256=nK32eDiBNbauyQz0oCa9eraysku8aqzrcoRFoWVumDU,4827
 src/postprocessing/postprocess_commercial_invoice.py,sha256=3I8ijluTZcOs_sMnFZxfkAPle0UFQ239EMuvZfDZVPg,1028
-src/postprocessing/postprocess_partner_invoice.py,sha256=Fv4Y6Lc8e6aFFcwX0kLOal2y4TrR-XfAzjtuQnBwo0o,12815
+src/postprocessing/postprocess_partner_invoice.py,sha256=V9ANqlSBBO2_YEyfyCms7vjhUi9pjlGeRfUhMQQ4F6c,13507
 src/prompts/library/arrivalNotice/other/placeholders.json,sha256=1vzly1amgyKt3jr2JJQbb24kNZsnI289iduvoUo5dJU,3061
 src/prompts/library/arrivalNotice/other/prompt.txt,sha256=QNuU-BvMA8VbdupVNapad4O3WmCotH5cKNxImRMbKDk,2906
 src/prompts/library/bookingConfirmation/evergreen/placeholders.json,sha256=IpM9nmSPdyroliZfXB1-NDCjiHZX_Ff5BH7-scNhGqE,1406
@@ -55,6 +55,6 @@ src/prompts/prompt_library.py,sha256=VJWHeXN-s501C2GiidIIvQQuZdU6T1R27hE2dKBiI40
 src/setup.py,sha256=EHfAl3Pvb082dl_s6Tk93IjtE3vBmrW_fp2GW4955HQ,6952
 src/tms.py,sha256=UXbIo1QE--hIX6NZi5Qyp2R_CP338syrY9pCTPrfgnE,1741
 src/utils.py,sha256=Ow5_Jals88o8mbZ1BoHfZpHZoCfig_UQb5aalH-mpWE,17278
-data_science_document_ai-1.51.1.dist-info/METADATA,sha256=vJ9ivHgPOvyMkfqtL2893McNMupkNvNHYJd95IF4CMQ,2152
-data_science_document_ai-1.51.1.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
-data_science_document_ai-1.51.1.dist-info/RECORD,,
+data_science_document_ai-1.53.0.dist-info/METADATA,sha256=s3O_vRgHuHypiv9A6R2NWiwexz4SYjjmuKARhcpwgCo,2152
+data_science_document_ai-1.53.0.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
+data_science_document_ai-1.53.0.dist-info/RECORD,,
src/pdf_processing.py CHANGED
@@ -250,6 +250,7 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
     prompt += "\nFor each field, provide the page number where the information was found. The page numbering starts from 0."

     tasks = []
+    semaphore = asyncio.Semaphore(50)
     # Process in chunks if number of pages exceeds threshold and Process all chunks concurrently
     for chunk in (
         split_pdf_into_chunks(file_content, chunk_size=params["chunk_size"])
@@ -257,8 +258,8 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
         else [file_content]
     ):
         tasks.append(
-            process_chunk_with_retry(
-                chunk, prompt, response_schema, llm_client, input_doc_type
+            process_chunk_with_semaphore(
+                semaphore, chunk, prompt, response_schema, llm_client, input_doc_type
             )
         )

@@ -270,6 +271,25 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
     return llm_prediction_to_tuples(results[0], number_of_pages=number_of_pages)


+async def process_chunk_with_semaphore(
+    semaphore,
+    chunk_content,
+    prompt,
+    response_schema,
+    llm_client,
+    input_doc_type,
+):
+    """Process a chunk with a semaphore to limit concurrency."""
+    async with semaphore:
+        return await process_chunk_with_retry(
+            chunk_content,
+            prompt,
+            response_schema,
+            llm_client,
+            input_doc_type,
+        )
+
+
 async def process_chunk_with_retry(
     chunk_content, prompt, response_schema, llm_client, input_doc_type, retries=2
 ):
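
Note: the new `process_chunk_with_semaphore` wrapper bounds how many chunk requests hit the LLM concurrently. `asyncio.gather` still schedules every chunk task at once, but only 50 of them can hold the semaphore, and therefore be mid-request, at any moment. Below is a minimal, self-contained sketch of the same pattern; the names and the `process_chunk` stub are illustrative, not the package's code.

```python
import asyncio


async def process_chunk(chunk: str) -> str:
    # Stand-in for the per-chunk LLM call.
    await asyncio.sleep(0.1)
    return f"processed {chunk}"


async def process_with_limit(semaphore: asyncio.Semaphore, chunk: str) -> str:
    # Only tasks currently holding the semaphore run the chunk call;
    # the rest wait here instead of opening additional requests.
    async with semaphore:
        return await process_chunk(chunk)


async def main() -> None:
    semaphore = asyncio.Semaphore(50)  # same cap the diff hardcodes
    chunks = [f"chunk-{i}" for i in range(200)]
    tasks = [process_with_limit(semaphore, c) for c in chunks]
    results = await asyncio.gather(*tasks)  # all scheduled, at most 50 in flight
    print(len(results))


if __name__ == "__main__":
    asyncio.run(main())
```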
src/postprocessing/postprocess_partner_invoice.py CHANGED
@@ -1,4 +1,6 @@
 """This module contains the postprocessing functions for the partner invoice."""
+from collections import defaultdict
+
 from rapidfuzz import fuzz, process

 from src.io import logger
@@ -143,6 +145,20 @@ def update_recipient_and_vendor(aggregated_data, is_recipient_forto):
     ] = "Dasbachstraße 15, 54292 Trier, Germany"


+def select_unique_bank_account(bank_account):
+    # Select the unique bank account if multiple are present
+    if isinstance(bank_account, list) and bank_account:
+        best = defaultdict(lambda: None)
+
+        for item in bank_account:
+            dv = item["documentValue"]
+            if best[dv] is None or item["page"] < best[dv]["page"]:
+                best[dv] = item
+
+        unique = list(best.values())
+        return unique
+
+
 async def process_partner_invoice(params, aggregated_data, document_type_code):
     """Process the partner invoice data."""
     # Post process bundeskasse invoices
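
Note: `select_unique_bank_account` deduplicates the extracted `bankAccount` entries by their `documentValue`, keeping the occurrence with the lowest page number, so the same account detected on several pages collapses to a single entry; as written, inputs that are not non-empty lists fall through and yield `None`. A small illustration of the keep-earliest-page rule with made-up data (a plain dict stands in for the module's `defaultdict`):

```python
# Made-up extraction output: the same IBAN detected on pages 2 and 0.
bank_account = [
    {"documentValue": "DE89 3704 0044 0532 0130 00", "page": 2},
    {"documentValue": "DE89 3704 0044 0532 0130 00", "page": 0},
    {"documentValue": "NL91 ABNA 0417 1643 00", "page": 1},
]

best = {}
for item in bank_account:
    dv = item["documentValue"]
    # Keep, per documentValue, the entry with the smallest page number.
    if dv not in best or item["page"] < best[dv]["page"]:
        best[dv] = item

print(list(best.values()))
# [{'documentValue': 'DE89 3704 0044 0532 0130 00', 'page': 0},
#  {'documentValue': 'NL91 ABNA 0417 1643 00', 'page': 1}]
```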
@@ -150,6 +166,11 @@ async def process_partner_invoice(params, aggregated_data, document_type_code):
         post_process_bundeskasse(aggregated_data)
         return

+    if "bankAccount" in aggregated_data:
+        aggregated_data["bankAccount"] = select_unique_bank_account(
+            aggregated_data["bankAccount"]
+        )
+
     line_items = aggregated_data.get("lineItem", [])
     # Add debug logging
     logger.info(f"Processing partnerInvoice with {len(line_items)} line items")
@@ -285,11 +306,14 @@ def if_reverse_charge_sentence(sentence: str, params):
         return False

     # Check if the sentence is similar to any of the reverse charge sentences
-    _, is_reverse_charge = get_fuzzy_match_score(
-        sentence, reverse_charge_sentences, threshold
+    match, _ = get_fuzzy_match_score(
+        sentence, list(reverse_charge_sentences.keys()), threshold
     )

-    return is_reverse_charge
+    if match:
+        return reverse_charge_sentences[match]
+
+    return False


 def find_matching_lineitem(new_lineitem: str, kvp_dict: dict, threshold=90):
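
Note: `if_reverse_charge_sentence` previously returned a bare boolean from the fuzzy match; it now looks up the best-matching key in the `reverse_charge_sentences` mapping and returns that key's associated value, falling back to `False` when nothing clears the threshold. A rough sketch of the same lookup using rapidfuzz directly; the sentence catalogue, mapped values, and threshold below are invented for illustration, and the repo's own `get_fuzzy_match_score` helper is not reproduced here.

```python
from rapidfuzz import fuzz, process, utils

# Illustrative mapping: fuzzy-matchable sentence -> value returned on a hit.
reverse_charge_sentences = {
    "VAT reverse charge applies": "reverseCharge",
    "Steuerschuldnerschaft des Leistungsempfängers": "reverseCharge",
}


def if_reverse_charge_sentence(sentence: str, threshold: int = 85):
    # extractOne returns (choice, score, index), or None when no choice
    # reaches score_cutoff.
    hit = process.extractOne(
        sentence,
        list(reverse_charge_sentences.keys()),
        scorer=fuzz.token_set_ratio,
        processor=utils.default_process,
        score_cutoff=threshold,
    )
    if hit:
        return reverse_charge_sentences[hit[0]]
    return False


print(if_reverse_charge_sentence("Reverse charge: VAT applies"))  # "reverseCharge"
print(if_reverse_charge_sentence("Total amount due"))             # False
```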