data-science-document-ai 1.43.4__py3-none-any.whl → 1.43.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: data-science-document-ai
-Version: 1.43.4
+Version: 1.43.6
 Summary: "Document AI repo for data science"
 Author: Naomi Nguyen
 Author-email: naomi.nguyen@forto.com
@@ -2,12 +2,12 @@ src/constants.py,sha256=rpYIecVLIBLh98YrJ8e5gdvM0bqrXJZWIKgFkUSn69g,3513
 src/constants_sandbox.py,sha256=Iu6HdjCoNSmOX0AwoL9qUQkhq_ZnIN5U9e-Q2UfNuGc,547
 src/docai.py,sha256=dHuR0ehVjUi1CnoNvdp_yxJtpU_HFXqAZ61ywdz7BEo,5655
 src/docai_processor_config.yaml,sha256=81NUGs-u8UFJm6mc0ZOeeNQlhe9h0f35GhjTcwErvTA,1717
-src/excel_processing.py,sha256=AppxrliVj7cLv1I_X7xC5bq4OPFAeiVNMNwcp-TZZDs,3466
+src/excel_processing.py,sha256=PdypkXHf-hln5cq5TyJ_IVybZk-rJF1NKZ50KXuOSdY,3390
 src/io.py,sha256=tOJpMyI-mP1AaXKG4UFudH47MHWzjWBgVahFJUcjGfs,4749
 src/llm.py,sha256=OE4IEIqcM-hYK9U7e0x1rAfcqdpeo4iXPHBp64L5Qz0,8199
 src/log_setup.py,sha256=RhHnpXqcl-ii4EJzRt47CF2R-Q3YPF68tepg_Kg7tkw,2895
-src/pdf_processing.py,sha256=sr41bSMbH-WwWQ9uF5WKnsrkXJzCDirziv3TaS8hoPQ,17164
-src/postprocessing/common.py,sha256=b0VpxM-levZi_8H5a9gDNjx-67W6F7xRFUDa7CJJGgQ,22214
+src/pdf_processing.py,sha256=DaFM8ioERj7YeC8Yjki_dfSnKt0lf7DB14ks9i4OAfA,17741
+src/postprocessing/common.py,sha256=fU3ECfnR0rpF21DnVYM2YM7kPEB4gRJuMasyrNupsaA,23026
 src/postprocessing/postprocess_booking_confirmation.py,sha256=nK32eDiBNbauyQz0oCa9eraysku8aqzrcoRFoWVumDU,4827
 src/postprocessing/postprocess_commercial_invoice.py,sha256=3I8ijluTZcOs_sMnFZxfkAPle0UFQ239EMuvZfDZVPg,1028
 src/postprocessing/postprocess_partner_invoice.py,sha256=koGR7dN37FqJcepdzkrzNBHuBBUuCp_3CrteScASqyE,10590
@@ -26,12 +26,12 @@ src/prompts/library/bookingConfirmation/other/prompt.txt,sha256=kUK7NgVNDYFMnqOc
 src/prompts/library/bookingConfirmation/yangming/placeholders.json,sha256=IpM9nmSPdyroliZfXB1-NDCjiHZX_Ff5BH7-scNhGqE,1406
 src/prompts/library/bookingConfirmation/yangming/prompt.txt,sha256=fYKfusDajDFw0v54-nv2iAqUSp2yCeOzc6G7AFe-h2w,3226
 src/prompts/library/bundeskasse/other/placeholders.json,sha256=1ll8AI58F2zRDSwQq_r0gxQdxlQB521l5CuiJ-8G6us,4068
-src/prompts/library/bundeskasse/other/prompt.txt,sha256=WV4D3ellIcB2cVmsZXCpbbHOShYY8VN_iZrYOuyoqzw,2937
+src/prompts/library/bundeskasse/other/prompt.txt,sha256=MBv4MIMASMstkzDS7H0q_pNJbPQeadP1vcmhCRrpjQ4,2906
 src/prompts/library/commercialInvoice/other/placeholders.json,sha256=zUK2mg9MnHiEQRYF6VgTiUiq68WGy5f7_4qL63CWyR0,4700
-src/prompts/library/commercialInvoice/other/prompt.txt,sha256=6sowYMzrKvgmTDpDnAzkeG4OqA44e6-8aUKWRKNziBY,2699
+src/prompts/library/commercialInvoice/other/prompt.txt,sha256=CJapcVrmcvynJUanETDklkzU-0N9hHdhq5wL4MK7OIY,2683
 src/prompts/library/customsAssessment/other/prompt.txt,sha256=XSqWa3k9LM7dTiJtX8AKTp_0x5Z0pCNRKNUWaywwBlY,2191
 src/prompts/library/customsInvoice/other/placeholders.json,sha256=BnWYtl4sPooTHb_EHRIlrPawBrfHI8_QVas8zytbqyY,12172
-src/prompts/library/customsInvoice/other/prompt.txt,sha256=Q5ihAVaZFToZ75D01ICEdCRB8nY_FD5DL3yuFvJ4418,9632
+src/prompts/library/customsInvoice/other/prompt.txt,sha256=daSRssY8zcboCJCuqbLqehGR5dJs_wp4hOZHRol3KqU,9595
 src/prompts/library/deliveryOrder/other/placeholders.json,sha256=7fjqag3kCVMV4mJ52dTjAcLtaBX0paXrDrW48vQVZSk,1250
 src/prompts/library/deliveryOrder/other/prompt.txt,sha256=y3QjN54e8PplEJngNlxoykbdrToBefS3r8gWixCbjfE,2468
 src/prompts/library/draftMbl/hapag-lloyd/prompt.txt,sha256=4FxiO1eHkimZVQZXU6gGNikuDVAWNniYvY8FUdVhpvk,2327
@@ -44,16 +44,16 @@ src/prompts/library/finalMbL/other/prompt.txt,sha256=pj-kgPV51upLhDppSKfhc2s5ylg
 src/prompts/library/packingList/other/placeholders.json,sha256=cGUUvEFoi4Lm0BAiyD29KbNFbUgzO1s7eit_qK3F0ig,4478
 src/prompts/library/packingList/other/prompt.txt,sha256=6Q9d0KBG6YWmNtzFivvmtQmitaUE2jytfwwc5YwsUgQ,2872
 src/prompts/library/partnerInvoice/other/placeholders.json,sha256=NX6ADT4gxLpP90uoNCYDbmfBvROxxVWRKK0lRFy1n9s,10897
-src/prompts/library/partnerInvoice/other/prompt.txt,sha256=fGUtMYWvhedmSiv9xShRv0cHXmEws1D9pQmZP1E2gl0,7806
+src/prompts/library/partnerInvoice/other/prompt.txt,sha256=4WGEQ6EiOtQxB7iwKy_Hg0PQzCEoFbjJUwEawwTgWiw,7775
 src/prompts/library/postprocessing/port_code/placeholders.json,sha256=2TiXf3zSzrglOMPtDOlCntIa5RSvyZQAKG2-IgrCY5A,22
 src/prompts/library/postprocessing/port_code/prompt_port_code.txt,sha256=--1wunSqEr2ox958lEhjO-0JFBfOLzA3qfKYIzG_Iok,884
 src/prompts/library/preprocessing/carrier/placeholders.json,sha256=1UmrQNqBEsjLIpOO-a39Az6bQ_g1lxDGlwqZFU3IEt0,408
 src/prompts/library/preprocessing/carrier/prompt.txt,sha256=NLvRZQCZ6aWC1yTr7Q93jK5z7Vi_b4HBaiFYYnIsO-w,134
 src/prompts/library/shippingInstruction/other/prompt.txt,sha256=dT2e-dPuvuz0rVYpwmok_1dWQ2Oa8Qy9NGZ6CCLOUI4,1468
-src/prompts/prompt_library.py,sha256=jPxybNPPGH7mzonqtAOqmw5WcT-RtbGP0pvMqqP22hg,2760
+src/prompts/prompt_library.py,sha256=VJWHeXN-s501C2GiidIIvQQuZdU6T1R27hE2dKBiI40,2555
 src/setup.py,sha256=M-p5c8M9ejKcSZ9N86VtmtPc4TYLxe1_4_dxf6jpfVc,7262
 src/tms.py,sha256=UXbIo1QE--hIX6NZi5Qyp2R_CP338syrY9pCTPrfgnE,1741
-src/utils.py,sha256=cTF2A12jugKjXxGlNXEZQtfgcsIoaTtaU7zhVOOvXXA,16634
-data_science_document_ai-1.43.4.dist-info/METADATA,sha256=bcmTXEnl4r0z7IqelSFuCyfxNJjnPvEY2snX1WViH9s,2152
-data_science_document_ai-1.43.4.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
-data_science_document_ai-1.43.4.dist-info/RECORD,,
+src/utils.py,sha256=iUFjfIKXl_MwkPXPMfK0ZAB9aZ__N6e8mWTBbBiPki4,16568
+data_science_document_ai-1.43.6.dist-info/METADATA,sha256=hyfRauOLmwLyBPOsJKBmKH70yWCvjZnXbeUkY6fX8aY,2152
+data_science_document_ai-1.43.6.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
+data_science_document_ai-1.43.6.dist-info/RECORD,,
src/excel_processing.py CHANGED
@@ -19,7 +19,7 @@ from src.utils import estimate_page_count, generate_schema_structure, get_excel_
 
 
 async def extract_data_from_sheet(
-    params, sheet_name, sheet, response_schema, doc_type=None
+    llm_client, sheet_name, sheet, response_schema, doc_type=None
 ):
     logger.info(f"Processing sheet: {sheet_name}")
     excel_content = pd.DataFrame(sheet.values).dropna(how="all", axis=1)
@@ -34,7 +34,7 @@ async def extract_data_from_sheet(
     prompt_docai = prompt_excel_extraction(worksheet)
 
     try:
-        result = await params["LlmClient"].get_unified_json_genai(
+        result = await llm_client.get_unified_json_genai(
             prompt_docai,
             response_schema=response_schema,
             doc_type=doc_type,
@@ -51,6 +51,7 @@ async def extract_data_from_excel(
     input_doc_type,
     file_content,
     mime_type,
+    llm_client,
 ):
     """Extract data from the Excel file.
 
@@ -59,6 +60,7 @@ async def extract_data_from_excel(
         input_doc_type (str): The type of the document.
         file_content (bytes): The content of the Excel file to process.
        mime_type (str): The MIME type of the file.
+        llm_client: The LLM client to use for data extraction.
 
     Returns:
         formatted_data (list): A list of dictionaries containing the extracted data.
@@ -95,7 +97,7 @@ async def extract_data_from_excel(
     # Excel files may contain multiple sheets. Extract data from each sheet
     sheet_extract_tasks = [
         extract_data_from_sheet(
-            params,
+            llm_client,
             sheet_name,
             workbook[sheet_name],
             response_schema,
@@ -105,7 +107,4 @@
     ]
     extracted_data = {k: v for k, v in await asyncio.gather(*sheet_extract_tasks)}
 
-    # Convert LLM prediction dictionary to tuples of (value, page_number).
-    extracted_data = llm_prediction_to_tuples(extracted_data)
-
-    return extracted_data, extracted_data, params["gemini_params"]["model_id"]
+    return extracted_data, extracted_data, llm_client.model_id
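Note: the change above threads the LLM client through explicitly instead of digging it out of the shared params dict. A minimal sketch of the new call path, assuming only what the diff shows (an object with an async get_unified_json_genai method and a model_id attribute); StubLlmClient is a hypothetical stand-in for the real Gemini wrapper in src/llm.py:

import asyncio

class StubLlmClient:
    # Hypothetical stand-in; only the two members the new signatures rely on.
    model_id = "stub-model"  # attribute name taken from the diff, value made up

    async def get_unified_json_genai(self, prompt, response_schema=None, doc_type=None):
        # A real client would call the LLM; this stub echoes a fixed payload.
        return {"total": "42"}

async def main():
    llm_client = StubLlmClient()
    # v1.43.6 style: pass the client itself, not params["LlmClient"].
    result = await llm_client.get_unified_json_genai(
        "Extract the worksheet fields", response_schema=None, doc_type="packingList"
    )
    # extract_data_from_excel now reports the model via llm_client.model_id
    # rather than params["gemini_params"]["model_id"].
    print(result, llm_client.model_id)

asyncio.run(main())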
src/pdf_processing.py CHANGED
@@ -36,6 +36,7 @@ from src.utils import (
     get_pdf_page_count,
     get_processor_name,
     run_background_tasks,
+    transform_schema_strings,
     validate_based_on_schema,
 )
 
@@ -199,6 +200,7 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
         if input_doc_type == "bundeskasse"
         else file_content
     )
+    number_of_pages = get_pdf_page_count(file_content)
 
     # convert file_content to required document
     document = llm_client.prepare_document_for_gemini(file_content)
@@ -254,6 +256,13 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
         # get the related prompt from predefined prompt library
         prompt = prompt_library.library[input_doc_type][carrier]["prompt"]
 
+        # Update schema to extract value-page_number pairs
+        if number_of_pages > 1:
+            response_schema = transform_schema_strings(response_schema)
+
+            # Update the prompt to instruct LLM to include page numbers
+            prompt += "\nFor each field, provide the page number where the information was found. The page numbering starts from 0."
+
         # generate the result with LLM (gemini)
         result = await llm_client.get_unified_json_genai(
             prompt=prompt,
@@ -262,7 +271,7 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
             doc_type=input_doc_type,
         )
 
-        result = llm_prediction_to_tuples(result)
+        result = llm_prediction_to_tuples(result, number_of_pages)
 
         return result
     return {}
@@ -342,15 +351,9 @@ async def extract_data_by_doctype(
     processor_client,
     if_use_docai,
     if_use_llm,
+    llm_client,
     isBetaTest=False,
 ):
-    # Select LLM client (Using 2.5 Pro model only for PI and customsInvoice)
-    llm_client = (
-        params["LlmClient_Flash"]
-        if input_doc_type not in ["customsInvoice", "partnerInvoice"]
-        else params["LlmClient"]
-    )
-
     async def extract_w_docai():
         return await extract_data_from_pdf_w_docai(
             params=params,
@@ -420,6 +423,14 @@ async def data_extraction_manual_flow(
     """
     # Get the start time for processing
    start_time = asyncio.get_event_loop().time()
+
+    # Select LLM client (Using 2.5 Pro model only for PI and customsInvoice)
+    llm_client = (
+        params["LlmClient_Flash"]
+        if meta.documentTypeCode not in ["customsInvoice", "partnerInvoice"]
+        else params["LlmClient"]
+    )
+
     page_count = None
     # Validate the file type
     if mime_type == "application/pdf":
@@ -442,6 +453,7 @@ async def data_extraction_manual_flow(
             processor_client,
             if_use_docai=if_use_docai,
             if_use_llm=if_use_llm,
+            llm_client=llm_client,
             isBetaTest=False,
         )
         page_count = get_pdf_page_count(file_content)
@@ -453,6 +465,7 @@ async def data_extraction_manual_flow(
             input_doc_type=meta.documentTypeCode,
             file_content=file_content,
             mime_type=mime_type,
+            llm_client=llm_client,
        )
 
         # Get sheet count from dd-trace span (set in extract_data_from_excel)
@@ -472,7 +485,7 @@ async def data_extraction_manual_flow(
     )
     # Create the result dictionary with the extracted data
     extracted_data = await format_all_entities(
-        extracted_data, meta.documentTypeCode, params
+        extracted_data, meta.documentTypeCode, params, mime_type
     )
     result = {
         "id": meta.id,
src/postprocessing/common.py CHANGED
@@ -372,27 +372,30 @@ def clean_item_description(lineitem: str, remove_numbers: bool = True):
     return re.sub(r"\s{2,}", " ", lineitem).strip()
 
 
-async def format_label(entity_k, entity_value, document_type_code, params):
+async def format_label(entity_k, entity_value, document_type_code, params, mime_type):
     llm_client = params["LlmClient"]
     if isinstance(entity_value, dict):  # if it's a nested entity
         format_tasks = [
-            format_label(sub_k, sub_v, document_type_code, params)
+            format_label(sub_k, sub_v, document_type_code, params, mime_type)
             for sub_k, sub_v in entity_value.items()
         ]
         return entity_k, {k: v for k, v in await asyncio.gather(*format_tasks)}
     if isinstance(entity_value, list):
         format_tasks = await asyncio.gather(
             *[
-                format_label(entity_k, sub_v, document_type_code, params)
+                format_label(entity_k, sub_v, document_type_code, params, mime_type)
                 for sub_v in entity_value
             ]
         )
         return entity_k, [v for _, v in format_tasks]
-    if isinstance(entity_value, tuple):
-        page = entity_value[1]
-        entity_value = entity_value[0]
-    else:
-        page = -1
+
+    if mime_type == "application/pdf":
+        if isinstance(entity_value, tuple):
+            page = entity_value[1]
+            entity_value = entity_value[0]
+        else:
+            page = -1
+
     entity_key = entity_k.lower()
     formatted_value = None
 
@@ -493,8 +496,10 @@ async def format_label(entity_k, entity_value, document_type_code, params):
     result = {
         "documentValue": entity_value,
         "formattedValue": formatted_value,
-        "page": page,
     }
+    if mime_type == "application/pdf":
+        result["page"] = page
+
     return entity_k, result
 
 
@@ -593,7 +598,7 @@ def decimal_convertor(value, quantity=False):
     return value
 
 
-async def format_all_entities(result, document_type_code, params):
+async def format_all_entities(result, document_type_code, params, mime_type):
     """Format the entity values in the result dictionary."""
     # Since we treat `customsInvoice` same as `partnerInvoice`
     document_type_code = (
@@ -608,7 +613,9 @@ async def format_all_entities(result, document_type_code, params):
         return {}
 
     # Format all entities recursively
-    _, aggregated_data = await format_label(None, result, document_type_code, params)
+    _, aggregated_data = await format_label(
+        None, result, document_type_code, params, mime_type
+    )
 
     # Process partner invoice on lineitem mapping and reverse charge sentence
     if document_type_code in ["partnerInvoice", "bundeskasse"]:
@@ -644,8 +651,24 @@ def remove_stop_words(lineitem: str):
     )
 
 
-def llm_prediction_to_tuples(llm_prediction):
+def llm_prediction_to_tuples(llm_prediction, number_of_pages=-1):
     """Convert LLM prediction dictionary to tuples of (value, page_number)."""
+
+    # If only 1 page, simply pair each value with page number 0
+    if number_of_pages == 1:
+        if isinstance(llm_prediction, dict):
+            return {
+                k: llm_prediction_to_tuples(v, number_of_pages)
+                for k, v in llm_prediction.items()
+            }
+        elif isinstance(llm_prediction, list):
+            return [
+                llm_prediction_to_tuples(v, number_of_pages) for v in llm_prediction
+            ]
+        else:
+            return (llm_prediction, 0) if llm_prediction else None
+
+    # logic for multi-page predictions
     if isinstance(llm_prediction, dict):
         if "page_number" in llm_prediction.keys() and "value" in llm_prediction.keys():
             if llm_prediction["value"]:
@@ -655,11 +678,14 @@ def llm_prediction_to_tuples(llm_prediction):
                    page_number = -1
                return (llm_prediction["value"], page_number)
            return None
+
        for key, value in llm_prediction.items():
            llm_prediction[key] = llm_prediction_to_tuples(
-                llm_prediction.get(key, value)
+                llm_prediction.get(key, value), number_of_pages
            )
+
    elif isinstance(llm_prediction, list):
        for i, item in enumerate(llm_prediction):
-            llm_prediction[i] = llm_prediction_to_tuples(item)
+            llm_prediction[i] = llm_prediction_to_tuples(item, number_of_pages)
+
    return llm_prediction
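Note: what the updated converter does at each arity, as a small worked example. The import path assumes these hunks live in src/postprocessing/common.py (the file whose hash changed in RECORD), and the field names are made up:

from src.postprocessing.common import llm_prediction_to_tuples

# Single-page document: plain leaves are paired with page 0; falsy leaves
# collapse to None.
single = {"invoiceNumber": "INV-001", "dueDate": None}
print(llm_prediction_to_tuples(single, number_of_pages=1))
# {'invoiceNumber': ('INV-001', 0), 'dueDate': None}

# Multi-page document: the LLM answered against the transformed schema, so
# each leaf arrives as a {"value", "page_number"} object and collapses into
# a (value, page_number) tuple.
multi = {"invoiceNumber": {"value": "INV-001", "page_number": "2"}}
print(llm_prediction_to_tuples(multi, number_of_pages=3))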
src/prompts/library/bundeskasse/other/prompt.txt CHANGED
@@ -1,6 +1,6 @@
 <PERSONA> You are an efficient document entity data extraction specialist working for a Freight Forwarding company. <PERSONA>
 
-<TASK>Your task is to extract data and page numbers starting from 0 from customs invoice documents as per the given response schema structure.<TASK>
+<TASK> Your task is to extract data from customs invoice documents as per the given response schema structure. <TASK>
 
 <CONTEXT>
 The Freight Forwarding company receives Customs invoices from Customs Brokers called Bundeskasse.
src/prompts/library/commercialInvoice/other/prompt.txt CHANGED
@@ -2,7 +2,7 @@ Task: You are a document entity extraction specialist. Given a document, your ta
 
 Extract all the data points from the given document.
 Each data point is part of a master field called skus. There may be multiple skus entries in a document.
-Your task is to extract the text value of the entities and page numbers starting from 0 starting from 0 where the value was found in the document.
+Your task is to extract the text value of the entities and page numbers starting from 0 where the value was found in the document.
 
 
 Instructions:
src/prompts/library/customsInvoice/other/prompt.txt CHANGED
@@ -1,6 +1,6 @@
 <PERSONA> You are an efficient document entity data extraction specialist working for a Freight Forwarding company. <PERSONA>
 
-<TASK>Your task is to extract data and their page numbers starting from 0 from invoice documents as per the given response schema structure.<TASK>
+<TASK> Your task is to extract data from invoice documents as per the given response schema structure. <TASK>
 
 <CONTEXT>
 The Freight Forwarding company receives invoices from Carrier (Shipping Lines) partners and Customs Brokers. These include Partner Invoices (COGS Invoices) and COGS Customs Invoices.
src/prompts/library/partnerInvoice/other/prompt.txt CHANGED
@@ -1,6 +1,6 @@
 <PERSONA> You are an efficient document entity data extraction specialist working for a Freight Forwarding company. <PERSONA>
 
-<TASK>Your task is to extract data and page numbers starting from 0 from invoice documents as per the given response schema structure.<TASK>
+<TASK> Your task is to extract data from invoice documents as per the given response schema structure. <TASK>
 
 <CONTEXT>
 The Freight Forwarding company receives invoices from Carrier (Shipping Lines) partners and Customs Brokers. These include Partner Invoices (COGS Invoices) and COGS Customs Invoices.
src/prompts/prompt_library.py CHANGED
@@ -4,8 +4,6 @@ import os
 from pathlib import Path
 from typing import Dict
 
-from src.utils import transform_schema_strings
-
 
 class PromptLibrary:
     """
@@ -43,8 +41,6 @@ class PromptLibrary:
                 if file == "placeholders.json":
                     with open(path_to_library / prompt_type / prompt_subtype / file) as f:
                         placeholders = json.load(f)
-                    if prompt_type not in ["postprocessing", "preprocessing"]:
-                        placeholders = transform_schema_strings(placeholders)
                     self.library[prompt_type][prompt_subtype][
                         "placeholders"
                     ] = placeholders
src/utils.py CHANGED
@@ -314,9 +314,6 @@ def generate_schema_structure(params, input_doc_type):
         "type": "string",
     }
 
-    # update schema to extract value-page_number pairs
-    response_schema = transform_schema_strings(response_schema)
-
     return response_schema
 
 
@@ -446,12 +443,23 @@ def transform_schema_strings(schema):
     Returns:
         dict: The transformed schema dictionary.
     """
-    # Base case: if the current schema definition is for a string
-    if isinstance(schema, dict) and schema.get("type").upper() == "STRING":
-        new_schema = {
+    if not isinstance(schema, dict):
+        return schema
+
+    schema_type = schema.get("type")
+    if not schema_type:
+        return schema
+
+    # Base case: STRING → OBJECT (only if not already transformed)
+    if schema_type.upper() == "STRING":
+        return {
             "type": "OBJECT",
             "properties": {
-                "value": {"type": "STRING"},
+                "value": {
+                    "type": "STRING",
+                    "nullable": schema.get("nullable", False),
+                    "description": schema.get("description", ""),
+                },
                 "page_number": {
                     "type": "STRING",
                     "description": "Number of a page where the value was found in the document starting from 0.",
@@ -460,29 +468,29 @@
             "required": [],
         }
 
-        # Preserve original properties like nullable and description on the new 'value' key
-        if "nullable" in schema:
-            new_schema["properties"]["value"]["nullable"] = schema["nullable"]
-        if "description" in schema:
-            new_schema["properties"]["value"]["description"] = schema["description"]
+    # Skip already transformed OBJECT (has both 'value' & 'page_number')
+    if (
+        schema_type.upper() == "OBJECT"
+        and "properties" in schema
+        and {"value", "page_number"}.issubset(schema["properties"].keys())
+    ):
+        return schema
 
+    # Recursive case for OBJECT
+    if schema_type.upper() == "OBJECT" and "properties" in schema:
+        new_schema = schema.copy()
+        new_schema["properties"] = {
+            k: transform_schema_strings(v) for k, v in schema["properties"].items()
+        }
         return new_schema
 
-    # Recursive case: if the schema is a dictionary
-    elif isinstance(schema, dict) and schema.get("type").upper() == "OBJECT":
-        transformed_schema = schema.copy()
-        for key, value in schema.get("properties").items():
-            transformed_schema["properties"][key] = transform_schema_strings(value)
-        return transformed_schema
-
-    # Recursive case: if the schema is a list
-    elif isinstance(schema, dict) and schema.get("type").upper() == "ARRAY":
-        schema["items"] = transform_schema_strings(schema["items"])
-        return schema
+    # Recursive case for ARRAY
+    if schema_type.upper() == "ARRAY" and "items" in schema:
+        new_schema = schema.copy()
+        new_schema["items"] = transform_schema_strings(schema["items"])
+        return new_schema
 
-    # Base case: for non-dict/list values (e.g., None, bool, str)
-    else:
-        return schema
+    return schema
 
 
 def estimate_page_count(sheet):
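Note: a worked example of the rewritten transformer against a made-up schema; both calls use the real transform_schema_strings from src/utils.py. The second pass is a no-op thanks to the new guard, which is what makes it safe to transform at request time even if a schema was already expanded:

from src.utils import transform_schema_strings

schema = {
    "type": "OBJECT",
    "properties": {
        "invoiceNumber": {
            "type": "STRING",
            "description": "Invoice identifier",
            "nullable": True,
        },
        "lineItems": {"type": "ARRAY", "items": {"type": "STRING"}},
    },
}

once = transform_schema_strings(schema)
# Each STRING leaf is now an OBJECT with a `value` property (carrying the
# original description/nullable) and a `page_number` STRING property.
assert once["properties"]["invoiceNumber"]["properties"]["value"]["nullable"] is True
assert "page_number" in once["properties"]["lineItems"]["items"]["properties"]

# Idempotence guard: nodes that already have both `value` and `page_number`
# are returned unchanged, so a second pass does not double-wrap.
assert transform_schema_strings(once) == once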