data-science-document-ai 1.42.5__py3-none-any.whl → 1.57.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. {data_science_document_ai-1.42.5.dist-info → data_science_document_ai-1.57.0.dist-info}/METADATA +2 -2
  2. data_science_document_ai-1.57.0.dist-info/RECORD +60 -0
  3. src/constants.py +13 -34
  4. src/docai_processor_config.yaml +0 -69
  5. src/excel_processing.py +24 -14
  6. src/io.py +23 -0
  7. src/llm.py +0 -29
  8. src/pdf_processing.py +183 -76
  9. src/postprocessing/common.py +172 -28
  10. src/postprocessing/postprocess_partner_invoice.py +194 -59
  11. src/prompts/library/arrivalNotice/other/placeholders.json +70 -0
  12. src/prompts/library/arrivalNotice/other/prompt.txt +40 -0
  13. src/prompts/library/bookingConfirmation/evergreen/placeholders.json +135 -21
  14. src/prompts/library/bookingConfirmation/evergreen/prompt.txt +21 -17
  15. src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +136 -22
  16. src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +52 -58
  17. src/prompts/library/bookingConfirmation/maersk/placeholders.json +135 -21
  18. src/prompts/library/bookingConfirmation/maersk/prompt.txt +10 -1
  19. src/prompts/library/bookingConfirmation/msc/placeholders.json +135 -21
  20. src/prompts/library/bookingConfirmation/msc/prompt.txt +10 -1
  21. src/prompts/library/bookingConfirmation/oocl/placeholders.json +149 -21
  22. src/prompts/library/bookingConfirmation/oocl/prompt.txt +11 -3
  23. src/prompts/library/bookingConfirmation/other/placeholders.json +149 -21
  24. src/prompts/library/bookingConfirmation/other/prompt.txt +56 -57
  25. src/prompts/library/bookingConfirmation/yangming/placeholders.json +149 -21
  26. src/prompts/library/bookingConfirmation/yangming/prompt.txt +11 -1
  27. src/prompts/library/bundeskasse/other/placeholders.json +5 -5
  28. src/prompts/library/bundeskasse/other/prompt.txt +7 -5
  29. src/prompts/library/commercialInvoice/other/placeholders.json +125 -0
  30. src/prompts/library/commercialInvoice/other/prompt.txt +1 -1
  31. src/prompts/library/customsAssessment/other/placeholders.json +70 -0
  32. src/prompts/library/customsAssessment/other/prompt.txt +24 -37
  33. src/prompts/library/customsInvoice/other/prompt.txt +4 -3
  34. src/prompts/library/deliveryOrder/other/placeholders.json +80 -27
  35. src/prompts/library/deliveryOrder/other/prompt.txt +26 -40
  36. src/prompts/library/draftMbl/other/placeholders.json +33 -33
  37. src/prompts/library/draftMbl/other/prompt.txt +34 -44
  38. src/prompts/library/finalMbL/other/placeholders.json +80 -0
  39. src/prompts/library/finalMbL/other/prompt.txt +34 -44
  40. src/prompts/library/packingList/other/placeholders.json +98 -0
  41. src/prompts/library/partnerInvoice/other/prompt.txt +8 -7
  42. src/prompts/library/preprocessing/carrier/placeholders.json +0 -16
  43. src/prompts/library/shippingInstruction/other/placeholders.json +115 -0
  44. src/prompts/library/shippingInstruction/other/prompt.txt +26 -14
  45. src/prompts/prompt_library.py +0 -4
  46. src/setup.py +25 -24
  47. src/utils.py +120 -68
  48. data_science_document_ai-1.42.5.dist-info/RECORD +0 -57
  49. src/prompts/library/draftMbl/hapag-lloyd/prompt.txt +0 -45
  50. src/prompts/library/draftMbl/maersk/prompt.txt +0 -19
  51. src/prompts/library/finalMbL/hapag-lloyd/prompt.txt +0 -44
  52. src/prompts/library/finalMbL/maersk/prompt.txt +0 -19
  53. {data_science_document_ai-1.42.5.dist-info → data_science_document_ai-1.57.0.dist-info}/WHEEL +0 -0
src/pdf_processing.py CHANGED
@@ -9,6 +9,7 @@ logger = logging.getLogger(__name__)
9
9
  import asyncio
10
10
  from collections import defaultdict
11
11
 
12
+ from ddtrace import tracer
12
13
  from fastapi import HTTPException
13
14
  from google.cloud.documentai_v1 import Document as docaiv1_document
14
15
 
@@ -31,9 +32,10 @@ from src.postprocessing.postprocess_partner_invoice import (
31
32
  from src.prompts.prompt_library import prompt_library
32
33
  from src.utils import (
33
34
  extract_top_pages,
34
- generate_schema_structure,
35
+ get_pdf_page_count,
35
36
  get_processor_name,
36
37
  run_background_tasks,
38
+ split_pdf_into_chunks,
37
39
  transform_schema_strings,
38
40
  validate_based_on_schema,
39
41
  )
@@ -193,38 +195,29 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
193
195
  result (dict): The structured data extracted from the document, formatted as JSON.
194
196
  """
195
197
  # Bundeskasse invoices contains all the required information in the first 3 pages.
196
- file_content = (
197
- extract_top_pages(file_content, num_pages=5)
198
- if input_doc_type == "bundeskasse"
199
- else file_content
200
- )
201
-
202
- # convert file_content to required document
203
- document = llm_client.prepare_document_for_gemini(file_content)
198
+ if input_doc_type == "bundeskasse":
199
+ file_content = extract_top_pages(file_content, num_pages=5)
204
200
 
205
- # get the schema placeholder from the Doc AI and generate the response structure
206
- response_schema = (
207
- prompt_library.library[input_doc_type]["other"]["placeholders"]
208
- if input_doc_type in ["partnerInvoice", "customsInvoice", "bundeskasse"]
209
- else generate_schema_structure(params, input_doc_type)
210
- )
201
+ number_of_pages = get_pdf_page_count(file_content)
202
+ logger.info(f"processing {input_doc_type} with {number_of_pages} pages...")
211
203
 
212
204
  carrier = "other"
213
- if (
214
- "preprocessing" in prompt_library.library.keys()
215
- and "carrier" in prompt_library.library["preprocessing"].keys()
216
- and input_doc_type
217
- in prompt_library.library["preprocessing"]["carrier"]["placeholders"].keys()
218
- ):
219
- carrier_schema = prompt_library.library["preprocessing"]["carrier"][
220
- "placeholders"
221
- ][input_doc_type]
205
+ carrier_schema = (
206
+ prompt_library.library.get("preprocessing", {})
207
+ .get("carrier", {})
208
+ .get("placeholders", {})
209
+ .get(input_doc_type)
210
+ )
222
211
 
212
+ if carrier_schema:
223
213
  carrier_prompt = prompt_library.library["preprocessing"]["carrier"]["prompt"]
224
214
  carrier_prompt = carrier_prompt.replace(
225
215
  "DOCUMENT_TYPE_PLACEHOLDER", input_doc_type
226
216
  )
227
217
 
218
+ # convert file_content to required document
219
+ document = llm_client.prepare_document_for_gemini(file_content)
220
+
228
221
  # identify carrier for customized prompting
229
222
  carrier = await identify_carrier(
230
223
  document,
@@ -234,30 +227,122 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
234
227
  doc_type=input_doc_type,
235
228
  )
236
229
 
237
- if input_doc_type == "bookingConfirmation":
238
- response_schema = prompt_library.library[input_doc_type][carrier][
239
- "placeholders"
240
- ]
241
-
230
+ # Select prompt
242
231
  if (
243
- input_doc_type in prompt_library.library.keys()
244
- and carrier in prompt_library.library[input_doc_type].keys()
232
+ input_doc_type not in prompt_library.library
233
+ or carrier not in prompt_library.library[input_doc_type]
245
234
  ):
246
- # get the related prompt from predefined prompt library
247
- prompt = prompt_library.library[input_doc_type][carrier]["prompt"]
248
-
249
- # generate the result with LLM (gemini)
250
- result = await llm_client.get_unified_json_genai(
251
- prompt=prompt,
252
- document=document,
253
- response_schema=response_schema,
254
- doc_type=input_doc_type,
235
+ return {}
236
+
237
+ # get the related prompt from predefined prompt library
238
+ prompt = prompt_library.library[input_doc_type][carrier]["prompt"]
239
+
240
+ # get the schema placeholder
241
+ response_schema = prompt_library.library[input_doc_type][carrier]["placeholders"]
242
+
243
+ # Add page-number extraction for moderately large docs
244
+ use_chunking = number_of_pages >= params["chunk_after"]
245
+
246
+ # Update schema and prompt to extract value-page_number pairs
247
+ if not use_chunking and number_of_pages > 1:
248
+ response_schema = transform_schema_strings(response_schema)
249
+ prompt += "\nFor each field, provide the page number where the information was found. The page numbering starts from 0."
250
+
251
+ tasks = []
252
+ # Process in chunks if number of pages exceeds threshold and Process all chunks concurrently
253
+ for chunk in (
254
+ split_pdf_into_chunks(file_content, chunk_size=params["chunk_size"])
255
+ if use_chunking
256
+ else [file_content]
257
+ ):
258
+ tasks.append(
259
+ process_chunk_with_retry(
260
+ chunk,
261
+ prompt,
262
+ response_schema,
263
+ llm_client,
264
+ input_doc_type,
265
+ )
255
266
  )
256
267
 
257
- result = llm_prediction_to_tuples(result)
268
+ results = await asyncio.gather(*tasks, return_exceptions=True)
258
269
 
259
- return result
260
- return {}
270
+ if use_chunking:
271
+ return merge_llm_results(results, response_schema)
272
+ else:
273
+ return llm_prediction_to_tuples(results[0], number_of_pages=number_of_pages)
274
+
275
+
276
+ async def process_chunk_with_retry(
277
+ chunk_content, prompt, response_schema, llm_client, input_doc_type, retries=2
278
+ ):
279
+ """Process a chunk with retries in case of failure."""
280
+ for attempt in range(1, retries + 1):
281
+ try:
282
+ return await process_chunk(
283
+ chunk_content=chunk_content,
284
+ prompt=prompt,
285
+ response_schema=response_schema,
286
+ llm_client=llm_client,
287
+ input_doc_type=input_doc_type,
288
+ )
289
+ except Exception as e:
290
+ logger.error(f"Chunk failed on attempt {attempt}: {e}")
291
+ if attempt == retries:
292
+ raise
293
+ await asyncio.sleep(1) # small backoff
294
+
295
+
296
+ async def process_chunk(
297
+ chunk_content, prompt, response_schema, llm_client, input_doc_type
298
+ ):
299
+ """Process a chunk with Gemini."""
300
+ document = llm_client.prepare_document_for_gemini(chunk_content)
301
+ return await llm_client.get_unified_json_genai(
302
+ prompt=prompt,
303
+ document=document,
304
+ response_schema=response_schema,
305
+ doc_type=input_doc_type,
306
+ )
307
+
308
+
309
+ def merge_llm_results(results, response_schema):
310
+ """Merge LLM results from multiple chunks."""
311
+ merged = {}
312
+ for i, result in enumerate(results):
313
+ if not isinstance(result, dict):
314
+ continue
315
+ # Add page number to all values coming from this chunk
316
+ result = llm_prediction_to_tuples(result, number_of_pages=1, page_number=i)
317
+
318
+ # Merge the result into the final merged dictionary
319
+ for key, value in result.items():
320
+ field_type = (
321
+ response_schema["properties"].get(key, {}).get("type", "").upper()
322
+ )
323
+
324
+ if key not in merged:
325
+ if field_type == "ARRAY":
326
+ # append the values as a list
327
+ merged[key] = (
328
+ value if isinstance(value, list) else ([value] if value else [])
329
+ )
330
+ else:
331
+ merged[key] = value
332
+ continue
333
+
334
+ if field_type == "ARRAY":
335
+ # append list contents across chunks
336
+ if isinstance(value, list):
337
+ merged[key].extend(value)
338
+ else:
339
+ merged[key].append(value)
340
+
341
+ # take first non-null value only
342
+ if merged[key] in (None, "", [], {}):
343
+ merged[key] = value
344
+
345
+ return merged
261
346
 
262
347
 
263
348
  async def extract_data_from_pdf_w_llm(params, input_doc_type, file_content, llm_client):
@@ -268,8 +353,7 @@ async def extract_data_from_pdf_w_llm(params, input_doc_type, file_content, llm_
268
353
  # Add currency from the amount field
269
354
  if input_doc_type in ["commercialInvoice"]:
270
355
  result = postprocessing_commercial_invoice(result, params, input_doc_type)
271
- elif input_doc_type == "bookingConfirmation":
272
- result = postprocess_booking_confirmation(result)
356
+
273
357
  return result, llm_client.model_id
274
358
 
275
359
 
@@ -288,13 +372,14 @@ def combine_llm_results_w_doc_ai(
288
372
  Returns:
289
373
  combined result
290
374
  """
291
- result = doc_ai.copy()
292
- llm = remove_none_values(llm)
293
- if not llm:
375
+ result = remove_none_values(llm)
376
+
377
+ docAi = doc_ai.copy()
378
+ if not docAi:
294
379
  return result
295
380
 
296
381
  # Merge top-level keys
297
- result.update({k: v for k, v in llm.items() if k not in result})
382
+ result.update({k: v for k, v in docAi.items() if k not in result})
298
383
 
299
384
  if (
300
385
  input_doc_type
@@ -302,28 +387,28 @@ def combine_llm_results_w_doc_ai(
302
387
  and keys_to_combine
303
388
  ):
304
389
  result.update(
305
- {key: llm.get(key) for key in keys_to_combine if key in llm.keys()}
390
+ {key: docAi.get(key) for key in keys_to_combine if key in docAi.keys()}
306
391
  )
307
392
  return result
308
393
 
309
394
  # Handle specific key-based merging logic for multiple keys
310
395
  if keys_to_combine:
311
396
  for key in keys_to_combine:
312
- if key in llm.keys():
397
+ if key in docAi.keys():
313
398
  # Merge the list of dictionaries
314
- # If the length of the LLM list is less than the Doc AI result, replace with the LLM list
315
- if len(llm[key]) < len(result[key]):
316
- result[key] = llm[key]
399
+ # If the length of the docAi list is less than the LLM result, replace with the docAi list
400
+ if len(docAi[key]) < len(result[key]):
401
+ result[key] = docAi[key]
317
402
  else:
318
- # If the length of the LLM list is greater than or equal to the Doc AI result,
403
+ # If the length of the docAi list is greater than or equal to the LLM result,
319
404
  # add & merge the dictionaries
320
- if isinstance(llm[key], list):
321
- for i in range(len(llm[key])):
405
+ if isinstance(docAi[key], list):
406
+ for i in range(len(docAi[key])):
322
407
  if i == len(result[key]):
323
- result[key].append(llm[key][i])
408
+ result[key].append(docAi[key][i])
324
409
  else:
325
- for sub_key in llm[key][i].keys():
326
- result[key][i][sub_key] = llm[key][i][sub_key]
410
+ for sub_key in docAi[key][i].keys():
411
+ result[key][i][sub_key] = docAi[key][i][sub_key]
327
412
  return result
328
413
 
329
414
 
@@ -334,15 +419,9 @@ async def extract_data_by_doctype(
334
419
  processor_client,
335
420
  if_use_docai,
336
421
  if_use_llm,
422
+ llm_client,
337
423
  isBetaTest=False,
338
424
  ):
339
- # Select LLM client (Using 2.5 Flash model for Bundeskasse)
340
- llm_client = (
341
- params["LlmClient_Flash"]
342
- if input_doc_type == "bundeskasse"
343
- else params["LlmClient"]
344
- )
345
-
346
425
  async def extract_w_docai():
347
426
  return await extract_data_from_pdf_w_docai(
348
427
  params=params,
@@ -391,6 +470,7 @@ async def data_extraction_manual_flow(
391
470
  meta,
392
471
  processor_client,
393
472
  schema_client,
473
+ use_default_logging=False,
394
474
  ):
395
475
  """
396
476
  Process a PDF file and extract data from it.
@@ -411,15 +491,26 @@ async def data_extraction_manual_flow(
411
491
  """
412
492
  # Get the start time for processing
413
493
  start_time = asyncio.get_event_loop().time()
494
+
495
+ # Select LLM client (Using 2.5 Pro model only for PI and customsInvoice)
496
+ llm_client = (
497
+ params["LlmClient_Flash"]
498
+ if meta.documentTypeCode not in ["customsInvoice", "partnerInvoice"]
499
+ else params["LlmClient"]
500
+ )
501
+
502
+ page_count = None
414
503
  # Validate the file type
415
504
  if mime_type == "application/pdf":
505
+ if_use_docai = params["if_use_docai"]
506
+
416
507
  # Enable Doc Ai only for certain document types.
417
- if_use_docai = (
418
- True if meta.documentTypeCode in params["model_config"]["stable"] else False
419
- )
420
- if_use_llm = (
421
- True if meta.documentTypeCode in params["key_to_combine"].keys() else False
422
- )
508
+ if params["if_use_docai"]:
509
+ if_use_docai = (
510
+ True
511
+ if meta.documentTypeCode in params["model_config"]["stable"]
512
+ else False
513
+ )
423
514
 
424
515
  (
425
516
  extracted_data,
@@ -431,9 +522,11 @@ async def data_extraction_manual_flow(
431
522
  meta.documentTypeCode,
432
523
  processor_client,
433
524
  if_use_docai=if_use_docai,
434
- if_use_llm=if_use_llm,
525
+ if_use_llm=params["if_use_llm"],
526
+ llm_client=llm_client,
435
527
  isBetaTest=False,
436
528
  )
529
+ page_count = get_pdf_page_count(file_content)
437
530
 
438
531
  elif "excel" in mime_type or "spreadsheet" in mime_type:
439
532
  # Extract data from the Excel file
@@ -442,8 +535,19 @@ async def data_extraction_manual_flow(
442
535
  input_doc_type=meta.documentTypeCode,
443
536
  file_content=file_content,
444
537
  mime_type=mime_type,
538
+ llm_client=llm_client,
445
539
  )
446
540
 
541
+ # Get sheet count from dd-trace span (set in extract_data_from_excel)
542
+ # Note: we use the span metric instead of len(extracted_data) because
543
+ # some sheets may fail extraction and not appear in extracted_data
544
+ span = tracer.current_span()
545
+ page_count = span.get_metric("est_page_count") if span else len(extracted_data)
546
+ if page_count > 100:
547
+ logger.warning(
548
+ f"Check logic. Count of sheets in excel file is weirdly large: {page_count}"
549
+ )
550
+
447
551
  else:
448
552
  raise HTTPException(
449
553
  status_code=400,
@@ -451,7 +555,7 @@ async def data_extraction_manual_flow(
451
555
  )
452
556
  # Create the result dictionary with the extracted data
453
557
  extracted_data = await format_all_entities(
454
- extracted_data, meta.documentTypeCode, params
558
+ extracted_data, meta.documentTypeCode, params, mime_type
455
559
  )
456
560
  result = {
457
561
  "id": meta.id,
@@ -466,7 +570,9 @@ async def data_extraction_manual_flow(
466
570
  logger.info(f"Time taken to process the document: {round(elapsed_time, 4)} seconds")
467
571
 
468
572
  # Schedule background tasks without using FastAPI's BackgroundTasks
469
- if os.getenv("CLUSTER") != "ode": # skip data export to bigquery in ODE environment
573
+ if (
574
+ os.getenv("CLUSTER") != "ode"
575
+ ) & use_default_logging: # skip data export to bigquery in ODE environment
470
576
  asyncio.create_task(
471
577
  run_background_tasks(
472
578
  params,
@@ -477,6 +583,7 @@ async def data_extraction_manual_flow(
477
583
  processor_version,
478
584
  mime_type,
479
585
  elapsed_time,
586
+ page_count,
480
587
  )
481
588
  )
482
589
  return result