data-science-document-ai 1.51.1__py3-none-any.whl → 1.53.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_science_document_ai-1.51.1.dist-info → data_science_document_ai-1.53.0.dist-info}/METADATA +1 -1
- {data_science_document_ai-1.51.1.dist-info → data_science_document_ai-1.53.0.dist-info}/RECORD +5 -5
- src/pdf_processing.py +22 -2
- src/postprocessing/postprocess_partner_invoice.py +27 -3
- {data_science_document_ai-1.51.1.dist-info → data_science_document_ai-1.53.0.dist-info}/WHEEL +0 -0
{data_science_document_ai-1.51.1.dist-info → data_science_document_ai-1.53.0.dist-info}/RECORD
RENAMED
|
@@ -6,11 +6,11 @@ src/excel_processing.py,sha256=TRgAzSHvL1WKbUgjHtpXL701bPhiWGH7kk3S6e1UPaA,3074
|
|
|
6
6
|
src/io.py,sha256=rYjXVLlriEacw1uNuPIYhg12bXNu48Qs9GYMY2YcVTE,5563
|
|
7
7
|
src/llm.py,sha256=a7UYA4ITUNjzct_2fHgM-bma_XWc28VC0FV71g9tnUI,7137
|
|
8
8
|
src/log_setup.py,sha256=RhHnpXqcl-ii4EJzRt47CF2R-Q3YPF68tepg_Kg7tkw,2895
|
|
9
|
-
src/pdf_processing.py,sha256=
|
|
9
|
+
src/pdf_processing.py,sha256=ER-gwh_YfJ-bMqh3nI8K89CZPAdPNnwjLmJ-5cnn1Rk,20469
|
|
10
10
|
src/postprocessing/common.py,sha256=dagAg0hZGuZc03bXdfOolxekewMEVUfz917IGCiAtWI,26118
|
|
11
11
|
src/postprocessing/postprocess_booking_confirmation.py,sha256=nK32eDiBNbauyQz0oCa9eraysku8aqzrcoRFoWVumDU,4827
|
|
12
12
|
src/postprocessing/postprocess_commercial_invoice.py,sha256=3I8ijluTZcOs_sMnFZxfkAPle0UFQ239EMuvZfDZVPg,1028
|
|
13
|
-
src/postprocessing/postprocess_partner_invoice.py,sha256=
|
|
13
|
+
src/postprocessing/postprocess_partner_invoice.py,sha256=V9ANqlSBBO2_YEyfyCms7vjhUi9pjlGeRfUhMQQ4F6c,13507
|
|
14
14
|
src/prompts/library/arrivalNotice/other/placeholders.json,sha256=1vzly1amgyKt3jr2JJQbb24kNZsnI289iduvoUo5dJU,3061
|
|
15
15
|
src/prompts/library/arrivalNotice/other/prompt.txt,sha256=QNuU-BvMA8VbdupVNapad4O3WmCotH5cKNxImRMbKDk,2906
|
|
16
16
|
src/prompts/library/bookingConfirmation/evergreen/placeholders.json,sha256=IpM9nmSPdyroliZfXB1-NDCjiHZX_Ff5BH7-scNhGqE,1406
|
|
@@ -55,6 +55,6 @@ src/prompts/prompt_library.py,sha256=VJWHeXN-s501C2GiidIIvQQuZdU6T1R27hE2dKBiI40
|
|
|
55
55
|
src/setup.py,sha256=EHfAl3Pvb082dl_s6Tk93IjtE3vBmrW_fp2GW4955HQ,6952
|
|
56
56
|
src/tms.py,sha256=UXbIo1QE--hIX6NZi5Qyp2R_CP338syrY9pCTPrfgnE,1741
|
|
57
57
|
src/utils.py,sha256=Ow5_Jals88o8mbZ1BoHfZpHZoCfig_UQb5aalH-mpWE,17278
|
|
58
|
-
data_science_document_ai-1.
|
|
59
|
-
data_science_document_ai-1.
|
|
60
|
-
data_science_document_ai-1.
|
|
58
|
+
data_science_document_ai-1.53.0.dist-info/METADATA,sha256=s3O_vRgHuHypiv9A6R2NWiwexz4SYjjmuKARhcpwgCo,2152
|
|
59
|
+
data_science_document_ai-1.53.0.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
|
|
60
|
+
data_science_document_ai-1.53.0.dist-info/RECORD,,
|
src/pdf_processing.py
CHANGED
|
@@ -250,6 +250,7 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
|
|
|
250
250
|
prompt += "\nFor each field, provide the page number where the information was found. The page numbering starts from 0."
|
|
251
251
|
|
|
252
252
|
tasks = []
|
|
253
|
+
semaphore = asyncio.Semaphore(50)
|
|
253
254
|
# Process in chunks if number of pages exceeds threshold and Process all chunks concurrently
|
|
254
255
|
for chunk in (
|
|
255
256
|
split_pdf_into_chunks(file_content, chunk_size=params["chunk_size"])
|
|
@@ -257,8 +258,8 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
|
|
|
257
258
|
else [file_content]
|
|
258
259
|
):
|
|
259
260
|
tasks.append(
|
|
260
|
-
|
|
261
|
-
chunk, prompt, response_schema, llm_client, input_doc_type
|
|
261
|
+
process_chunk_with_semaphore(
|
|
262
|
+
semaphore, chunk, prompt, response_schema, llm_client, input_doc_type
|
|
262
263
|
)
|
|
263
264
|
)
|
|
264
265
|
|
|
@@ -270,6 +271,25 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
|
|
|
270
271
|
return llm_prediction_to_tuples(results[0], number_of_pages=number_of_pages)
|
|
271
272
|
|
|
272
273
|
|
|
274
|
+
async def process_chunk_with_semaphore(
    semaphore,
    chunk_content,
    prompt,
    response_schema,
    llm_client,
    input_doc_type,
):
    """Await ``process_chunk_with_retry`` for one chunk while holding ``semaphore``.

    ``semaphore`` is entered before the retry helper is awaited and exited
    when it returns or raises, so callers sharing one semaphore cap how many
    chunks are processed at the same time. The remaining arguments are
    forwarded positionally and unchanged; the helper's result is returned
    as-is.
    """
    # Collect the pass-through arguments once so the guarded section is a
    # single awaited call.
    forwarded = (
        chunk_content,
        prompt,
        response_schema,
        llm_client,
        input_doc_type,
    )
    async with semaphore:
        return await process_chunk_with_retry(*forwarded)
|
|
291
|
+
|
|
292
|
+
|
|
273
293
|
async def process_chunk_with_retry(
|
|
274
294
|
chunk_content, prompt, response_schema, llm_client, input_doc_type, retries=2
|
|
275
295
|
):
|
|
@@ -1,4 +1,6 @@
|
|
|
1
1
|
"""This module contains the postprocessing functions for the partner invoice."""
|
|
2
|
+
from collections import defaultdict
|
|
3
|
+
|
|
2
4
|
from rapidfuzz import fuzz, process
|
|
3
5
|
|
|
4
6
|
from src.io import logger
|
|
@@ -143,6 +145,20 @@ def update_recipient_and_vendor(aggregated_data, is_recipient_forto):
|
|
|
143
145
|
] = "Dasbachstraße 15, 54292 Trier, Germany"
|
|
144
146
|
|
|
145
147
|
|
|
148
|
+
def select_unique_bank_account(bank_account):
    """Collapse duplicate bank-account entries, keeping the earliest page.

    Args:
        bank_account: Either a list of dicts that each carry a
            ``"documentValue"`` and a ``"page"`` key, or any other value.

    Returns:
        For a non-empty list: a new list with one entry per distinct
        ``"documentValue"``, ordered by first appearance. Among duplicates the
        entry with the smallest ``"page"`` wins; on a tie the first one seen
        is kept. Any other input (single dict, ``None``, empty list) is
        returned unchanged, so a caller that assigns the result back does not
        overwrite its existing value with ``None``.

    Raises:
        KeyError: If a list entry lacks ``"documentValue"`` or ``"page"``.
    """
    # Nothing to de-duplicate: hand the value back untouched instead of
    # falling through to an implicit ``None``.
    if not isinstance(bank_account, list) or not bank_account:
        return bank_account

    # Plain dicts preserve insertion order, so the result keeps the order in
    # which each distinct documentValue first appeared.
    best = {}
    for item in bank_account:
        dv = item["documentValue"]
        current = best.get(dv)
        # Strict "<" keeps the first-seen entry when pages are equal.
        if current is None or item["page"] < current["page"]:
            best[dv] = item

    return list(best.values())
|
|
160
|
+
|
|
161
|
+
|
|
146
162
|
async def process_partner_invoice(params, aggregated_data, document_type_code):
|
|
147
163
|
"""Process the partner invoice data."""
|
|
148
164
|
# Post process bundeskasse invoices
|
|
@@ -150,6 +166,11 @@ async def process_partner_invoice(params, aggregated_data, document_type_code):
|
|
|
150
166
|
post_process_bundeskasse(aggregated_data)
|
|
151
167
|
return
|
|
152
168
|
|
|
169
|
+
if "bankAccount" in aggregated_data:
|
|
170
|
+
aggregated_data["bankAccount"] = select_unique_bank_account(
|
|
171
|
+
aggregated_data["bankAccount"]
|
|
172
|
+
)
|
|
173
|
+
|
|
153
174
|
line_items = aggregated_data.get("lineItem", [])
|
|
154
175
|
# Add debug logging
|
|
155
176
|
logger.info(f"Processing partnerInvoice with {len(line_items)} line items")
|
|
@@ -285,11 +306,14 @@ def if_reverse_charge_sentence(sentence: str, params):
|
|
|
285
306
|
return False
|
|
286
307
|
|
|
287
308
|
# Check if the sentence is similar to any of the reverse charge sentences
|
|
288
|
-
|
|
289
|
-
sentence, reverse_charge_sentences, threshold
|
|
309
|
+
match, _ = get_fuzzy_match_score(
|
|
310
|
+
sentence, list(reverse_charge_sentences.keys()), threshold
|
|
290
311
|
)
|
|
291
312
|
|
|
292
|
-
|
|
313
|
+
if match:
|
|
314
|
+
return reverse_charge_sentences[match]
|
|
315
|
+
|
|
316
|
+
return False
|
|
293
317
|
|
|
294
318
|
|
|
295
319
|
def find_matching_lineitem(new_lineitem: str, kvp_dict: dict, threshold=90):
|
{data_science_document_ai-1.51.1.dist-info → data_science_document_ai-1.53.0.dist-info}/WHEEL
RENAMED
|
File without changes
|