data-science-document-ai 1.42.5__py3-none-any.whl → 1.57.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_science_document_ai-1.42.5.dist-info → data_science_document_ai-1.57.0.dist-info}/METADATA +2 -2
- data_science_document_ai-1.57.0.dist-info/RECORD +60 -0
- src/constants.py +13 -34
- src/docai_processor_config.yaml +0 -69
- src/excel_processing.py +24 -14
- src/io.py +23 -0
- src/llm.py +0 -29
- src/pdf_processing.py +183 -76
- src/postprocessing/common.py +172 -28
- src/postprocessing/postprocess_partner_invoice.py +194 -59
- src/prompts/library/arrivalNotice/other/placeholders.json +70 -0
- src/prompts/library/arrivalNotice/other/prompt.txt +40 -0
- src/prompts/library/bookingConfirmation/evergreen/placeholders.json +135 -21
- src/prompts/library/bookingConfirmation/evergreen/prompt.txt +21 -17
- src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +136 -22
- src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +52 -58
- src/prompts/library/bookingConfirmation/maersk/placeholders.json +135 -21
- src/prompts/library/bookingConfirmation/maersk/prompt.txt +10 -1
- src/prompts/library/bookingConfirmation/msc/placeholders.json +135 -21
- src/prompts/library/bookingConfirmation/msc/prompt.txt +10 -1
- src/prompts/library/bookingConfirmation/oocl/placeholders.json +149 -21
- src/prompts/library/bookingConfirmation/oocl/prompt.txt +11 -3
- src/prompts/library/bookingConfirmation/other/placeholders.json +149 -21
- src/prompts/library/bookingConfirmation/other/prompt.txt +56 -57
- src/prompts/library/bookingConfirmation/yangming/placeholders.json +149 -21
- src/prompts/library/bookingConfirmation/yangming/prompt.txt +11 -1
- src/prompts/library/bundeskasse/other/placeholders.json +5 -5
- src/prompts/library/bundeskasse/other/prompt.txt +7 -5
- src/prompts/library/commercialInvoice/other/placeholders.json +125 -0
- src/prompts/library/commercialInvoice/other/prompt.txt +1 -1
- src/prompts/library/customsAssessment/other/placeholders.json +70 -0
- src/prompts/library/customsAssessment/other/prompt.txt +24 -37
- src/prompts/library/customsInvoice/other/prompt.txt +4 -3
- src/prompts/library/deliveryOrder/other/placeholders.json +80 -27
- src/prompts/library/deliveryOrder/other/prompt.txt +26 -40
- src/prompts/library/draftMbl/other/placeholders.json +33 -33
- src/prompts/library/draftMbl/other/prompt.txt +34 -44
- src/prompts/library/finalMbL/other/placeholders.json +80 -0
- src/prompts/library/finalMbL/other/prompt.txt +34 -44
- src/prompts/library/packingList/other/placeholders.json +98 -0
- src/prompts/library/partnerInvoice/other/prompt.txt +8 -7
- src/prompts/library/preprocessing/carrier/placeholders.json +0 -16
- src/prompts/library/shippingInstruction/other/placeholders.json +115 -0
- src/prompts/library/shippingInstruction/other/prompt.txt +26 -14
- src/prompts/prompt_library.py +0 -4
- src/setup.py +25 -24
- src/utils.py +120 -68
- data_science_document_ai-1.42.5.dist-info/RECORD +0 -57
- src/prompts/library/draftMbl/hapag-lloyd/prompt.txt +0 -45
- src/prompts/library/draftMbl/maersk/prompt.txt +0 -19
- src/prompts/library/finalMbL/hapag-lloyd/prompt.txt +0 -44
- src/prompts/library/finalMbL/maersk/prompt.txt +0 -19
- {data_science_document_ai-1.42.5.dist-info → data_science_document_ai-1.57.0.dist-info}/WHEEL +0 -0
src/pdf_processing.py
CHANGED
@@ -9,6 +9,7 @@ logger = logging.getLogger(__name__)
 import asyncio
 from collections import defaultdict
 
+from ddtrace import tracer
 from fastapi import HTTPException
 from google.cloud.documentai_v1 import Document as docaiv1_document
 
@@ -31,9 +32,10 @@ from src.postprocessing.postprocess_partner_invoice import (
 from src.prompts.prompt_library import prompt_library
 from src.utils import (
     extract_top_pages,
-    generate_schema_structure,
+    get_pdf_page_count,
     get_processor_name,
     run_background_tasks,
+    split_pdf_into_chunks,
     transform_schema_strings,
     validate_based_on_schema,
 )
@@ -193,38 +195,29 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
         result (dict): The structured data extracted from the document, formatted as JSON.
     """
     # Bundeskasse invoices contains all the required information in the first 3 pages.
-    file_content = (
-        extract_top_pages(file_content, num_pages=5)
-        if input_doc_type == "bundeskasse"
-        else file_content
-    )
-
-    # convert file_content to required document
-    document = llm_client.prepare_document_for_gemini(file_content)
+    if input_doc_type == "bundeskasse":
+        file_content = extract_top_pages(file_content, num_pages=5)
 
-
-    response_schema = (
-        prompt_library.library[input_doc_type]["other"]["placeholders"]
-        if input_doc_type in ["partnerInvoice", "customsInvoice", "bundeskasse"]
-        else generate_schema_structure(params, input_doc_type)
-    )
+    number_of_pages = get_pdf_page_count(file_content)
+    logger.info(f"processing {input_doc_type} with {number_of_pages} pages...")
 
     carrier = "other"
-
-
-
-
-
-    )
-    carrier_schema = prompt_library.library["preprocessing"]["carrier"][
-        "placeholders"
-    ][input_doc_type]
+    carrier_schema = (
+        prompt_library.library.get("preprocessing", {})
+        .get("carrier", {})
+        .get("placeholders", {})
+        .get(input_doc_type)
+    )
 
+    if carrier_schema:
         carrier_prompt = prompt_library.library["preprocessing"]["carrier"]["prompt"]
         carrier_prompt = carrier_prompt.replace(
             "DOCUMENT_TYPE_PLACEHOLDER", input_doc_type
         )
 
+        # convert file_content to required document
+        document = llm_client.prepare_document_for_gemini(file_content)
+
         # identify carrier for customized prompting
         carrier = await identify_carrier(
             document,
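Note: the carrier_schema rewrite in the hunk above replaces chained `[...]` indexing, which raises KeyError as soon as any level is missing, with a `.get()` chain that falls through to None so the carrier-identification step can be skipped cleanly. A minimal sketch of the difference, using a made-up library dict rather than the package's real prompt_library:

    # Hypothetical stand-in for prompt_library.library
    library = {"preprocessing": {"carrier": {"placeholders": {"bookingConfirmation": {}}}}}

    # Chained indexing raises KeyError for an unknown doc type:
    #   library["preprocessing"]["carrier"]["placeholders"]["draftMbl"]

    # The .get() chain returns None instead, which the caller branches on:
    schema = (
        library.get("preprocessing", {})
        .get("carrier", {})
        .get("placeholders", {})
        .get("draftMbl")
    )
    print(schema)  # None -> no carrier detection for this doc type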
@@ -234,30 +227,122 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
             doc_type=input_doc_type,
         )
 
-
-    response_schema = prompt_library.library[input_doc_type][carrier][
-        "placeholders"
-    ]
-
+    # Select prompt
     if (
-        input_doc_type in prompt_library.library
-
+        input_doc_type not in prompt_library.library
+        or carrier not in prompt_library.library[input_doc_type]
     ):
-
-
-
-
-
-
-
-
-
+        return {}
+
+    # get the related prompt from predefined prompt library
+    prompt = prompt_library.library[input_doc_type][carrier]["prompt"]
+
+    # get the schema placeholder
+    response_schema = prompt_library.library[input_doc_type][carrier]["placeholders"]
+
+    # Add page-number extraction for moderately large docs
+    use_chunking = number_of_pages >= params["chunk_after"]
+
+    # Update schema and prompt to extract value-page_number pairs
+    if not use_chunking and number_of_pages > 1:
+        response_schema = transform_schema_strings(response_schema)
+        prompt += "\nFor each field, provide the page number where the information was found. The page numbering starts from 0."
+
+    tasks = []
+    # Process in chunks if number of pages exceeds threshold and Process all chunks concurrently
+    for chunk in (
+        split_pdf_into_chunks(file_content, chunk_size=params["chunk_size"])
+        if use_chunking
+        else [file_content]
+    ):
+        tasks.append(
+            process_chunk_with_retry(
+                chunk,
+                prompt,
+                response_schema,
+                llm_client,
+                input_doc_type,
+            )
         )
 
-
+    results = await asyncio.gather(*tasks, return_exceptions=True)
 
-
-
+    if use_chunking:
+        return merge_llm_results(results, response_schema)
+    else:
+        return llm_prediction_to_tuples(results[0], number_of_pages=number_of_pages)
+
+
+async def process_chunk_with_retry(
+    chunk_content, prompt, response_schema, llm_client, input_doc_type, retries=2
+):
+    """Process a chunk with retries in case of failure."""
+    for attempt in range(1, retries + 1):
+        try:
+            return await process_chunk(
+                chunk_content=chunk_content,
+                prompt=prompt,
+                response_schema=response_schema,
+                llm_client=llm_client,
+                input_doc_type=input_doc_type,
+            )
+        except Exception as e:
+            logger.error(f"Chunk failed on attempt {attempt}: {e}")
+            if attempt == retries:
+                raise
+            await asyncio.sleep(1)  # small backoff
+
+
+async def process_chunk(
+    chunk_content, prompt, response_schema, llm_client, input_doc_type
+):
+    """Process a chunk with Gemini."""
+    document = llm_client.prepare_document_for_gemini(chunk_content)
+    return await llm_client.get_unified_json_genai(
+        prompt=prompt,
+        document=document,
+        response_schema=response_schema,
+        doc_type=input_doc_type,
+    )
+
+
+def merge_llm_results(results, response_schema):
+    """Merge LLM results from multiple chunks."""
+    merged = {}
+    for i, result in enumerate(results):
+        if not isinstance(result, dict):
+            continue
+        # Add page number to all values coming from this chunk
+        result = llm_prediction_to_tuples(result, number_of_pages=1, page_number=i)
+
+        # Merge the result into the final merged dictionary
+        for key, value in result.items():
+            field_type = (
+                response_schema["properties"].get(key, {}).get("type", "").upper()
+            )
+
+            if key not in merged:
+                if field_type == "ARRAY":
+                    # append the values as a list
+                    merged[key] = (
+                        value if isinstance(value, list) else ([value] if value else [])
+                    )
+                else:
+                    merged[key] = value
+                continue
+
+            if field_type == "ARRAY":
+                # append list contents across chunks
+                if isinstance(value, list):
+                    merged[key].extend(value)
+                else:
+                    merged[key].append(value)
+
+            # take first non-null value only
+            if merged[key] in (None, "", [], {}):
+                merged[key] = value
+
+    return merged
 
 
 async def extract_data_from_pdf_w_llm(params, input_doc_type, file_content, llm_client):
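Note: the chunked path added above fans each chunk out through process_chunk_with_retry, gathers with return_exceptions=True so a single failed chunk cannot sink the whole document, and folds the per-chunk dicts together in merge_llm_results. A self-contained sketch of that fan-out/merge shape, with a stub coroutine standing in for the Gemini call (names below are illustrative, not the package's API):

    import asyncio

    async def extract_chunk(idx):  # stand-in for process_chunk_with_retry
        return {"containers": [f"C{idx}"], "carrier": "msc" if idx == 0 else None}

    async def main():
        chunks = [0, 1, 2]  # stand-in for split_pdf_into_chunks(...)
        results = await asyncio.gather(
            *(extract_chunk(i) for i in chunks), return_exceptions=True
        )
        merged = {}
        for res in results:
            if not isinstance(res, dict):  # failed chunks arrive as exceptions
                continue
            for key, value in res.items():
                if isinstance(value, list):
                    merged.setdefault(key, []).extend(value)  # arrays concatenate
                elif merged.get(key) in (None, "", [], {}):
                    merged[key] = value  # scalars keep the first non-empty value
        print(merged)  # {'containers': ['C0', 'C1', 'C2'], 'carrier': 'msc'}

    asyncio.run(main())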
@@ -268,8 +353,7 @@ async def extract_data_from_pdf_w_llm(params, input_doc_type, file_content, llm_
     # Add currency from the amount field
     if input_doc_type in ["commercialInvoice"]:
         result = postprocessing_commercial_invoice(result, params, input_doc_type)
-
-        result = postprocess_booking_confirmation(result)
+
     return result, llm_client.model_id
 
 
@@ -288,13 +372,14 @@ def combine_llm_results_w_doc_ai(
     Returns:
         combined result
     """
-    result =
-
-
+    result = remove_none_values(llm)
+
+    docAi = doc_ai.copy()
+    if not docAi:
         return result
 
     # Merge top-level keys
-    result.update({k: v for k, v in
+    result.update({k: v for k, v in docAi.items() if k not in result})
 
     if (
         input_doc_type
@@ -302,28 +387,28 @@ def combine_llm_results_w_doc_ai(
         and keys_to_combine
     ):
         result.update(
-            {key:
+            {key: docAi.get(key) for key in keys_to_combine if key in docAi.keys()}
         )
         return result
 
     # Handle specific key-based merging logic for multiple keys
     if keys_to_combine:
         for key in keys_to_combine:
-            if key in
+            if key in docAi.keys():
                 # Merge the list of dictionaries
-                # If the length of the
-                if len(
-                    result[key] =
+                # If the length of the docAi list is less than the LLM result, replace with the docAi list
+                if len(docAi[key]) < len(result[key]):
+                    result[key] = docAi[key]
                 else:
-                    # If the length of the
+                    # If the length of the docAi list is greater than or equal to the LLM result,
                     # add & merge the dictionaries
-                    if isinstance(
-                        for i in range(len(
+                    if isinstance(docAi[key], list):
+                        for i in range(len(docAi[key])):
                             if i == len(result[key]):
-                                result[key].append(
+                                result[key].append(docAi[key][i])
                             else:
-                                for sub_key in
-                                    result[key][i][sub_key] =
+                                for sub_key in docAi[key][i].keys():
+                                    result[key][i][sub_key] = docAi[key][i][sub_key]
     return result
 
 
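Note: in the keys_to_combine branch above, a shorter Document AI list replaces the LLM list wholesale; otherwise the Document AI sub-keys are overlaid onto the LLM list element by element, with surplus elements appended. A small worked example of that overlay on toy data (dict.update stands in for the sub_key loop; the field names are invented):

    result = {"lineItems": [{"amount": "100", "currency": None}]}       # LLM output
    doc_ai = {"lineItems": [{"currency": "EUR"}, {"currency": "USD"}]}  # Doc AI output

    key = "lineItems"
    if len(doc_ai[key]) < len(result[key]):
        result[key] = doc_ai[key]  # shorter Doc AI list wins outright
    else:
        for i, item in enumerate(doc_ai[key]):
            if i == len(result[key]):
                result[key].append(item)  # Doc AI has extra elements: append
            else:
                result[key][i].update(item)  # overlay Doc AI sub-keys

    print(result)
    # {'lineItems': [{'amount': '100', 'currency': 'EUR'}, {'currency': 'USD'}]}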
@@ -334,15 +419,9 @@ async def extract_data_by_doctype(
     processor_client,
     if_use_docai,
     if_use_llm,
+    llm_client,
     isBetaTest=False,
 ):
-    # Select LLM client (Using 2.5 Flash model for Bundeskasse)
-    llm_client = (
-        params["LlmClient_Flash"]
-        if input_doc_type == "bundeskasse"
-        else params["LlmClient"]
-    )
-
     async def extract_w_docai():
         return await extract_data_from_pdf_w_docai(
             params=params,
@@ -391,6 +470,7 @@ async def data_extraction_manual_flow(
     meta,
     processor_client,
     schema_client,
+    use_default_logging=False,
 ):
     """
     Process a PDF file and extract data from it.
@@ -411,15 +491,26 @@ async def data_extraction_manual_flow(
     """
     # Get the start time for processing
    start_time = asyncio.get_event_loop().time()
+
+    # Select LLM client (Using 2.5 Pro model only for PI and customsInvoice)
+    llm_client = (
+        params["LlmClient_Flash"]
+        if meta.documentTypeCode not in ["customsInvoice", "partnerInvoice"]
+        else params["LlmClient"]
+    )
+
+    page_count = None
     # Validate the file type
     if mime_type == "application/pdf":
+        if_use_docai = params["if_use_docai"]
+
         # Enable Doc Ai only for certain document types.
-        if_use_docai
-
-
-
-
-
+        if params["if_use_docai"]:
+            if_use_docai = (
+                True
+                if meta.documentTypeCode in params["model_config"]["stable"]
+                else False
+            )
 
         (
             extracted_data,
@@ -431,9 +522,11 @@ async def data_extraction_manual_flow(
             meta.documentTypeCode,
             processor_client,
             if_use_docai=if_use_docai,
-            if_use_llm=if_use_llm,
+            if_use_llm=params["if_use_llm"],
+            llm_client=llm_client,
             isBetaTest=False,
         )
+        page_count = get_pdf_page_count(file_content)
 
     elif "excel" in mime_type or "spreadsheet" in mime_type:
         # Extract data from the Excel file
@@ -442,8 +535,19 @@ async def data_extraction_manual_flow(
             input_doc_type=meta.documentTypeCode,
             file_content=file_content,
             mime_type=mime_type,
+            llm_client=llm_client,
         )
 
+        # Get sheet count from dd-trace span (set in extract_data_from_excel)
+        # Note: we use the span metric instead of len(extracted_data) because
+        # some sheets may fail extraction and not appear in extracted_data
+        span = tracer.current_span()
+        page_count = span.get_metric("est_page_count") if span else len(extracted_data)
+        if page_count > 100:
+            logger.warning(
+                f"Check logic. Count of sheets in excel file is weirdly large: {page_count}"
+            )
+
     else:
         raise HTTPException(
             status_code=400,
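Note: the sheet count above is read back from a Datadog span metric rather than recomputed, because (per the inline comment) sheets that fail extraction never appear in extracted_data. A sketch of the producer and consumer halves of that pattern, assuming a configured ddtrace tracer; the metric name matches the diff, the span name is invented:

    from ddtrace import tracer

    # Producer side -- presumably inside extract_data_from_excel:
    with tracer.trace("excel.extract") as span:
        span.set_metric("est_page_count", 7)  # e.g. sheets successfully parsed

        # Consumer side, while the same span is still active:
        current = tracer.current_span()
        page_count = current.get_metric("est_page_count") if current else 0
        print(page_count)  # 7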
@@ -451,7 +555,7 @@
         )
     # Create the result dictionary with the extracted data
     extracted_data = await format_all_entities(
-        extracted_data, meta.documentTypeCode, params
+        extracted_data, meta.documentTypeCode, params, mime_type
     )
     result = {
         "id": meta.id,
@@ -466,7 +570,9 @@
     logger.info(f"Time taken to process the document: {round(elapsed_time, 4)} seconds")
 
     # Schedule background tasks without using FastAPI's BackgroundTasks
-    if
+    if (
+        os.getenv("CLUSTER") != "ode"
+    ) & use_default_logging:  # skip data export to bigquery in ODE environment
         asyncio.create_task(
             run_background_tasks(
                 params,
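Note: the guard added above joins its operands with the bitwise `&` rather than the logical `and`. For two plain bools the outcome is identical, but `&` does not short-circuit and raises TypeError for non-bool operands, which is worth keeping in mind when reading this condition:

    print(True & False)   # False -- same outcome as `and` for plain bools
    print(True and None)  # None  -- `and` passes non-bool operands through...
    # print(True & None)  # ...whereas `&` raises TypeError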
@@ -477,6 +583,7 @@ async def data_extraction_manual_flow(
                 processor_version,
                 mime_type,
                 elapsed_time,
+                page_count,
             )
         )
     return result