data-science-document-ai 1.37.0__py3-none-any.whl → 1.51.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. {data_science_document_ai-1.37.0.dist-info → data_science_document_ai-1.51.0.dist-info}/METADATA +3 -3
  2. data_science_document_ai-1.51.0.dist-info/RECORD +60 -0
  3. {data_science_document_ai-1.37.0.dist-info → data_science_document_ai-1.51.0.dist-info}/WHEEL +1 -1
  4. src/constants.py +6 -10
  5. src/docai.py +14 -5
  6. src/docai_processor_config.yaml +0 -56
  7. src/excel_processing.py +34 -13
  8. src/io.py +69 -1
  9. src/llm.py +10 -32
  10. src/pdf_processing.py +192 -57
  11. src/postprocessing/common.py +252 -590
  12. src/postprocessing/postprocess_partner_invoice.py +139 -89
  13. src/prompts/library/arrivalNotice/other/placeholders.json +70 -0
  14. src/prompts/library/arrivalNotice/other/prompt.txt +40 -0
  15. src/prompts/library/bookingConfirmation/evergreen/placeholders.json +17 -17
  16. src/prompts/library/bookingConfirmation/evergreen/prompt.txt +1 -0
  17. src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +18 -18
  18. src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +1 -1
  19. src/prompts/library/bookingConfirmation/maersk/placeholders.json +17 -17
  20. src/prompts/library/bookingConfirmation/maersk/prompt.txt +1 -1
  21. src/prompts/library/bookingConfirmation/msc/placeholders.json +17 -17
  22. src/prompts/library/bookingConfirmation/msc/prompt.txt +1 -1
  23. src/prompts/library/bookingConfirmation/oocl/placeholders.json +17 -17
  24. src/prompts/library/bookingConfirmation/oocl/prompt.txt +3 -1
  25. src/prompts/library/bookingConfirmation/other/placeholders.json +17 -17
  26. src/prompts/library/bookingConfirmation/other/prompt.txt +1 -1
  27. src/prompts/library/bookingConfirmation/yangming/placeholders.json +17 -17
  28. src/prompts/library/bookingConfirmation/yangming/prompt.txt +1 -1
  29. src/prompts/library/bundeskasse/other/placeholders.json +25 -25
  30. src/prompts/library/bundeskasse/other/prompt.txt +8 -6
  31. src/prompts/library/commercialInvoice/other/placeholders.json +125 -0
  32. src/prompts/library/commercialInvoice/other/prompt.txt +2 -1
  33. src/prompts/library/customsAssessment/other/placeholders.json +67 -16
  34. src/prompts/library/customsAssessment/other/prompt.txt +24 -37
  35. src/prompts/library/customsInvoice/other/placeholders.json +29 -20
  36. src/prompts/library/customsInvoice/other/prompt.txt +9 -4
  37. src/prompts/library/deliveryOrder/other/placeholders.json +79 -28
  38. src/prompts/library/deliveryOrder/other/prompt.txt +26 -40
  39. src/prompts/library/draftMbl/other/placeholders.json +33 -33
  40. src/prompts/library/draftMbl/other/prompt.txt +34 -44
  41. src/prompts/library/finalMbL/other/placeholders.json +34 -34
  42. src/prompts/library/finalMbL/other/prompt.txt +34 -44
  43. src/prompts/library/packingList/other/placeholders.json +98 -0
  44. src/prompts/library/packingList/other/prompt.txt +1 -1
  45. src/prompts/library/partnerInvoice/other/placeholders.json +2 -23
  46. src/prompts/library/partnerInvoice/other/prompt.txt +7 -18
  47. src/prompts/library/preprocessing/carrier/placeholders.json +0 -16
  48. src/prompts/library/shippingInstruction/other/placeholders.json +115 -0
  49. src/prompts/library/shippingInstruction/other/prompt.txt +28 -15
  50. src/setup.py +13 -61
  51. src/utils.py +189 -29
  52. data_science_document_ai-1.37.0.dist-info/RECORD +0 -59
  53. src/prompts/library/draftMbl/hapag-lloyd/prompt.txt +0 -44
  54. src/prompts/library/draftMbl/maersk/prompt.txt +0 -17
  55. src/prompts/library/finalMbL/hapag-lloyd/prompt.txt +0 -44
  56. src/prompts/library/finalMbL/maersk/prompt.txt +0 -17
src/pdf_processing.py CHANGED
@@ -9,12 +9,17 @@ logger = logging.getLogger(__name__)
9
9
  import asyncio
10
10
  from collections import defaultdict
11
11
 
12
+ from ddtrace import tracer
12
13
  from fastapi import HTTPException
13
14
  from google.cloud.documentai_v1 import Document as docaiv1_document
14
15
 
15
16
  from src.docai import _batch_process_pdf_w_docai, _process_pdf_w_docai
16
17
  from src.excel_processing import extract_data_from_excel
17
- from src.postprocessing.common import format_all_entities, remove_none_values
18
+ from src.postprocessing.common import (
19
+ format_all_entities,
20
+ llm_prediction_to_tuples,
21
+ remove_none_values,
22
+ )
18
23
  from src.postprocessing.postprocess_booking_confirmation import (
19
24
  postprocess_booking_confirmation,
20
25
  )
@@ -28,13 +33,18 @@ from src.prompts.prompt_library import prompt_library
28
33
  from src.utils import (
29
34
  extract_top_pages,
30
35
  generate_schema_structure,
36
+ get_pdf_page_count,
31
37
  get_processor_name,
32
38
  run_background_tasks,
39
+ split_pdf_into_chunks,
40
+ transform_schema_strings,
33
41
  validate_based_on_schema,
34
42
  )
35
43
 
36
44
 
37
- async def process_file_w_docai(params, image_content, client, processor_name):
45
+ async def process_file_w_docai(
46
+ params, image_content, client, processor_name, doc_type=None
47
+ ):
38
48
  """
39
49
  Process a file using Document AI.
40
50
 
@@ -43,6 +53,7 @@ async def process_file_w_docai(params, image_content, client, processor_name):
43
53
  image_content (bytes): The file to be processed. It can be bytes object.
44
54
  client: The Document AI client.
45
55
  processor_name (str): The name of the processor to be used.
56
+ doc_type (str, optional): Document type for cost tracking labels.
46
57
 
47
58
  Returns:
48
59
  The processed document.
@@ -54,7 +65,9 @@ async def process_file_w_docai(params, image_content, client, processor_name):
54
65
 
55
66
  try:
56
67
  logger.info("Processing document...")
57
- result = await _process_pdf_w_docai(image_content, client, processor_name)
68
+ result = await _process_pdf_w_docai(
69
+ image_content, client, processor_name, doc_type=doc_type
70
+ )
58
71
  except Exception as e:
59
72
  if e.reason == "PAGE_LIMIT_EXCEEDED":
60
73
  logger.warning(
@@ -63,7 +76,7 @@ async def process_file_w_docai(params, image_content, client, processor_name):
63
76
  # Process the document in batch method (offline processing)
64
77
  try:
65
78
  result = await _batch_process_pdf_w_docai(
66
- params, image_content, client, processor_name
79
+ params, image_content, client, processor_name, doc_type=doc_type
67
80
  )
68
81
  except Exception as batch_e:
69
82
  logger.error(f"Error processing document {batch_e}.")
@@ -93,7 +106,7 @@ async def extract_data_from_pdf_w_docai(
93
106
  )
94
107
 
95
108
  result = await process_file_w_docai(
96
- params, file_content, processor_client, processor_name
109
+ params, file_content, processor_client, processor_name, doc_type=input_doc_type
97
110
  )
98
111
 
99
112
  # Create an entity object to store the result in gcs
@@ -104,9 +117,22 @@ async def extract_data_from_pdf_w_docai(
104
117
  # Extract entities from the result
105
118
  for entity in result.entities:
106
119
  value = (
107
- {child.type_: child.mention_text for child in entity.properties}
120
+ {
121
+ child.type_: (
122
+ child.mention_text,
123
+ child.page_anchor.page_refs[0].page
124
+ if hasattr(child.page_anchor.page_refs[0], "page")
125
+ else 0,
126
+ )
127
+ for child in entity.properties
128
+ }
108
129
  if entity.properties
109
- else entity.mention_text
130
+ else (
131
+ entity.mention_text,
132
+ entity.page_anchor.page_refs[0].page
133
+ if hasattr(entity.page_anchor.page_refs[0], "page")
134
+ else 0,
135
+ )
110
136
  )
111
137
  aggregated_data[entity.type_].append(value)
112
138
 
@@ -137,7 +163,9 @@ async def extract_data_from_pdf_w_docai(
137
163
  return aggregated_data, result_for_store, processor_version
138
164
 
139
165
 
140
- async def identify_carrier(document, llm_client, prompt, response_schema):
166
+ async def identify_carrier(
167
+ document, llm_client, prompt, response_schema, doc_type=None
168
+ ):
141
169
  """Identify the carrier from the Booking Confirmation document."""
142
170
 
143
171
  result = await llm_client.ask_gemini(
@@ -145,6 +173,7 @@ async def identify_carrier(document, llm_client, prompt, response_schema):
145
173
  document=document,
146
174
  response_schema=response_schema,
147
175
  response_mime_type="text/x.enum",
176
+ doc_type=doc_type,
148
177
  )
149
178
 
150
179
  if result:
@@ -167,61 +196,150 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
167
196
  result (dict): The structured data extracted from the document, formatted as JSON.
168
197
  """
169
198
  # Bundeskasse invoices contains all the required information in the first 3 pages.
170
- file_content = (
171
- extract_top_pages(file_content, num_pages=5)
172
- if input_doc_type == "bundeskasse"
173
- else file_content
174
- )
199
+ if input_doc_type == "bundeskasse":
200
+ file_content = extract_top_pages(file_content, num_pages=5)
175
201
 
176
- # convert file_content to required document
177
- document = llm_client.prepare_document_for_gemini(file_content)
202
+ number_of_pages = get_pdf_page_count(file_content)
203
+ logger.info(f"processing {input_doc_type} with {number_of_pages} pages...")
178
204
 
179
- # get the schema placeholder from the Doc AI and generate the response structure
180
- response_schema = (
181
- prompt_library.library[input_doc_type]["other"]["placeholders"]
182
- if input_doc_type in ["partnerInvoice", "customsInvoice", "bundeskasse"]
183
- else generate_schema_structure(params, input_doc_type)
184
- )
205
+ # get the schema placeholder
206
+ response_schema = prompt_library.library[input_doc_type]["other"]["placeholders"]
185
207
 
186
208
  carrier = "other"
187
- if (
188
- "preprocessing" in prompt_library.library.keys()
189
- and "carrier" in prompt_library.library["preprocessing"].keys()
190
- and input_doc_type
191
- in prompt_library.library["preprocessing"]["carrier"]["placeholders"].keys()
192
- ):
193
- carrier_schema = prompt_library.library["preprocessing"]["carrier"][
194
- "placeholders"
195
- ][input_doc_type]
209
+ carrier_schema = (
210
+ prompt_library.library.get("preprocessing", {})
211
+ .get("carrier", {})
212
+ .get("placeholders", {})
213
+ .get(input_doc_type)
214
+ )
196
215
 
216
+ if carrier_schema:
197
217
  carrier_prompt = prompt_library.library["preprocessing"]["carrier"]["prompt"]
198
218
  carrier_prompt = carrier_prompt.replace(
199
219
  "DOCUMENT_TYPE_PLACEHOLDER", input_doc_type
200
220
  )
201
221
 
222
+ # convert file_content to required document
223
+ document = llm_client.prepare_document_for_gemini(file_content)
224
+
202
225
  # identify carrier for customized prompting
203
226
  carrier = await identify_carrier(
204
- document, llm_client, carrier_prompt, carrier_schema
227
+ document,
228
+ llm_client,
229
+ carrier_prompt,
230
+ carrier_schema,
231
+ doc_type=input_doc_type,
205
232
  )
206
233
 
207
- if input_doc_type == "bookingConfirmation":
208
- response_schema = prompt_library.library[input_doc_type][carrier][
209
- "placeholders"
210
- ]
211
-
234
+ # Select prompt
212
235
  if (
213
- input_doc_type in prompt_library.library.keys()
214
- and carrier in prompt_library.library[input_doc_type].keys()
236
+ input_doc_type not in prompt_library.library
237
+ or carrier not in prompt_library.library[input_doc_type]
215
238
  ):
216
- # get the related prompt from predefined prompt library
217
- prompt = prompt_library.library[input_doc_type][carrier]["prompt"]
239
+ return {}
240
+
241
+ # get the related prompt from predefined prompt library
242
+ prompt = prompt_library.library[input_doc_type][carrier]["prompt"]
243
+
244
+ # Add page-number extraction for moderately large docs
245
+ use_chunking = number_of_pages >= params["chunk_after"]
218
246
 
219
- # generate the result with LLM (gemini)
220
- result = await llm_client.get_unified_json_genai(
221
- prompt=prompt, document=document, response_schema=response_schema
247
+ # Update schema and prompt to extract value-page_number pairs
248
+ if not use_chunking and number_of_pages > 1:
249
+ response_schema = transform_schema_strings(response_schema)
250
+ prompt += "\nFor each field, provide the page number where the information was found. The page numbering starts from 0."
251
+
252
+ tasks = []
253
+ # Process in chunks if number of pages exceeds threshold and Process all chunks concurrently
254
+ for chunk in (
255
+ split_pdf_into_chunks(file_content, chunk_size=params["chunk_size"])
256
+ if use_chunking
257
+ else [file_content]
258
+ ):
259
+ tasks.append(
260
+ process_chunk_with_retry(
261
+ chunk, prompt, response_schema, llm_client, input_doc_type
262
+ )
222
263
  )
223
- return result
224
- return {}
264
+
265
+ results = await asyncio.gather(*tasks, return_exceptions=True)
266
+
267
+ if use_chunking:
268
+ return merge_llm_results(results, response_schema)
269
+ else:
270
+ return llm_prediction_to_tuples(results[0], number_of_pages=number_of_pages)
271
+
272
+
273
+ async def process_chunk_with_retry(
274
+ chunk_content, prompt, response_schema, llm_client, input_doc_type, retries=2
275
+ ):
276
+ """Process a chunk with retries in case of failure."""
277
+ for attempt in range(1, retries + 1):
278
+ try:
279
+ return await process_chunk(
280
+ chunk_content=chunk_content,
281
+ prompt=prompt,
282
+ response_schema=response_schema,
283
+ llm_client=llm_client,
284
+ input_doc_type=input_doc_type,
285
+ )
286
+ except Exception as e:
287
+ logger.error(f"Chunk failed on attempt {attempt}: {e}")
288
+ if attempt == retries:
289
+ raise
290
+ await asyncio.sleep(1) # small backoff
291
+
292
+
293
+ async def process_chunk(
294
+ chunk_content, prompt, response_schema, llm_client, input_doc_type
295
+ ):
296
+ """Process a chunk with Gemini."""
297
+ document = llm_client.prepare_document_for_gemini(chunk_content)
298
+ return await llm_client.get_unified_json_genai(
299
+ prompt=prompt,
300
+ document=document,
301
+ response_schema=response_schema,
302
+ doc_type=input_doc_type,
303
+ )
304
+
305
+
306
+ def merge_llm_results(results, response_schema):
307
+ """Merge LLM results from multiple chunks."""
308
+ merged = {}
309
+ for i, result in enumerate(results):
310
+ if not isinstance(result, dict):
311
+ continue
312
+ # Add page number to all values coming from this chunk
313
+ result = llm_prediction_to_tuples(result, number_of_pages=1, page_number=i)
314
+
315
+ # Merge the result into the final merged dictionary
316
+ for key, value in result.items():
317
+ field_type = (
318
+ response_schema["properties"].get(key, {}).get("type", "").upper()
319
+ )
320
+
321
+ if key not in merged:
322
+ if field_type == "ARRAY":
323
+ # append the values as a list
324
+ merged[key] = (
325
+ value if isinstance(value, list) else ([value] if value else [])
326
+ )
327
+ else:
328
+ merged[key] = value
329
+ continue
330
+
331
+ if field_type == "ARRAY":
332
+ # append list contents across chunks
333
+ if isinstance(value, list):
334
+ merged[key].extend(value)
335
+ else:
336
+ merged[key].append(value)
337
+
338
+ # take first non-null value only
339
+ if merged[key] in (None, "", [], {}):
340
+ merged[key] = value
341
+
342
+ return merged
225
343
 
226
344
 
227
345
  async def extract_data_from_pdf_w_llm(params, input_doc_type, file_content, llm_client):
@@ -298,15 +416,9 @@ async def extract_data_by_doctype(
298
416
  processor_client,
299
417
  if_use_docai,
300
418
  if_use_llm,
419
+ llm_client,
301
420
  isBetaTest=False,
302
421
  ):
303
- # Select LLM client (Using 2.5 Flash model for Bundeskasse)
304
- llm_client = (
305
- params["LlmClient_Flash"]
306
- if input_doc_type == "bundeskasse"
307
- else params["LlmClient"]
308
- )
309
-
310
422
  async def extract_w_docai():
311
423
  return await extract_data_from_pdf_w_docai(
312
424
  params=params,
@@ -355,7 +467,7 @@ async def data_extraction_manual_flow(
355
467
  meta,
356
468
  processor_client,
357
469
  schema_client,
358
- embed_manager,
470
+ use_default_logging=False,
359
471
  ):
360
472
  """
361
473
  Process a PDF file and extract data from it.
@@ -367,7 +479,6 @@ async def data_extraction_manual_flow(
367
479
  meta (DocumentMeta): Metadata associated with the document.
368
480
  processor_client (DocumentProcessorClient): Client for the Document AI processor.
369
481
  schema_client (DocumentSchemaClient): Client for the Document AI schema.
370
- embed_manager (EmbeddingsManager): Manager for embeddings.
371
482
 
372
483
  Returns:
373
484
  dict: A dictionary containing the processed document information.
@@ -377,6 +488,15 @@ async def data_extraction_manual_flow(
377
488
  """
378
489
  # Get the start time for processing
379
490
  start_time = asyncio.get_event_loop().time()
491
+
492
+ # Select LLM client (Using 2.5 Pro model only for PI and customsInvoice)
493
+ llm_client = (
494
+ params["LlmClient_Flash"]
495
+ if meta.documentTypeCode not in ["customsInvoice", "partnerInvoice"]
496
+ else params["LlmClient"]
497
+ )
498
+
499
+ page_count = None
380
500
  # Validate the file type
381
501
  if mime_type == "application/pdf":
382
502
  # Enable Doc Ai only for certain document types.
@@ -398,8 +518,10 @@ async def data_extraction_manual_flow(
398
518
  processor_client,
399
519
  if_use_docai=if_use_docai,
400
520
  if_use_llm=if_use_llm,
521
+ llm_client=llm_client,
401
522
  isBetaTest=False,
402
523
  )
524
+ page_count = get_pdf_page_count(file_content)
403
525
 
404
526
  elif "excel" in mime_type or "spreadsheet" in mime_type:
405
527
  # Extract data from the Excel file
@@ -407,10 +529,20 @@ async def data_extraction_manual_flow(
407
529
  params=params,
408
530
  input_doc_type=meta.documentTypeCode,
409
531
  file_content=file_content,
410
- schema_client=schema_client,
411
532
  mime_type=mime_type,
533
+ llm_client=llm_client,
412
534
  )
413
535
 
536
+ # Get sheet count from dd-trace span (set in extract_data_from_excel)
537
+ # Note: we use the span metric instead of len(extracted_data) because
538
+ # some sheets may fail extraction and not appear in extracted_data
539
+ span = tracer.current_span()
540
+ page_count = span.get_metric("est_page_count") if span else len(extracted_data)
541
+ if page_count > 100:
542
+ logger.warning(
543
+ f"Check logic. Count of sheets in excel file is weirdly large: {page_count}"
544
+ )
545
+
414
546
  else:
415
547
  raise HTTPException(
416
548
  status_code=400,
@@ -418,7 +550,7 @@ async def data_extraction_manual_flow(
418
550
  )
419
551
  # Create the result dictionary with the extracted data
420
552
  extracted_data = await format_all_entities(
421
- extracted_data, embed_manager, meta.documentTypeCode, params
553
+ extracted_data, meta.documentTypeCode, params, mime_type
422
554
  )
423
555
  result = {
424
556
  "id": meta.id,
@@ -433,7 +565,9 @@ async def data_extraction_manual_flow(
433
565
  logger.info(f"Time taken to process the document: {round(elapsed_time, 4)} seconds")
434
566
 
435
567
  # Schedule background tasks without using FastAPI's BackgroundTasks
436
- if os.getenv("CLUSTER") != "ode": # skip data export to bigquery in ODE environment
568
+ if (
569
+ os.getenv("CLUSTER") != "ode"
570
+ ) & use_default_logging: # skip data export to bigquery in ODE environment
437
571
  asyncio.create_task(
438
572
  run_background_tasks(
439
573
  params,
@@ -444,6 +578,7 @@ async def data_extraction_manual_flow(
444
578
  processor_version,
445
579
  mime_type,
446
580
  elapsed_time,
581
+ page_count,
447
582
  )
448
583
  )
449
584
  return result