data-science-document-ai 1.43.4__py3-none-any.whl → 1.43.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_science_document_ai-1.43.4.dist-info → data_science_document_ai-1.43.6.dist-info}/METADATA +1 -1
- {data_science_document_ai-1.43.4.dist-info → data_science_document_ai-1.43.6.dist-info}/RECORD +12 -12
- src/excel_processing.py +6 -7
- src/pdf_processing.py +22 -9
- src/postprocessing/common.py +40 -14
- src/prompts/library/bundeskasse/other/prompt.txt +1 -1
- src/prompts/library/commercialInvoice/other/prompt.txt +1 -1
- src/prompts/library/customsInvoice/other/prompt.txt +1 -1
- src/prompts/library/partnerInvoice/other/prompt.txt +1 -1
- src/prompts/prompt_library.py +0 -4
- src/utils.py +34 -26
- {data_science_document_ai-1.43.4.dist-info → data_science_document_ai-1.43.6.dist-info}/WHEEL +0 -0
{data_science_document_ai-1.43.4.dist-info → data_science_document_ai-1.43.6.dist-info}/RECORD
RENAMED
@@ -2,12 +2,12 @@ src/constants.py,sha256=rpYIecVLIBLh98YrJ8e5gdvM0bqrXJZWIKgFkUSn69g,3513
 src/constants_sandbox.py,sha256=Iu6HdjCoNSmOX0AwoL9qUQkhq_ZnIN5U9e-Q2UfNuGc,547
 src/docai.py,sha256=dHuR0ehVjUi1CnoNvdp_yxJtpU_HFXqAZ61ywdz7BEo,5655
 src/docai_processor_config.yaml,sha256=81NUGs-u8UFJm6mc0ZOeeNQlhe9h0f35GhjTcwErvTA,1717
-src/excel_processing.py,sha256=
+src/excel_processing.py,sha256=PdypkXHf-hln5cq5TyJ_IVybZk-rJF1NKZ50KXuOSdY,3390
 src/io.py,sha256=tOJpMyI-mP1AaXKG4UFudH47MHWzjWBgVahFJUcjGfs,4749
 src/llm.py,sha256=OE4IEIqcM-hYK9U7e0x1rAfcqdpeo4iXPHBp64L5Qz0,8199
 src/log_setup.py,sha256=RhHnpXqcl-ii4EJzRt47CF2R-Q3YPF68tepg_Kg7tkw,2895
-src/pdf_processing.py,sha256=
-src/postprocessing/common.py,sha256=
+src/pdf_processing.py,sha256=DaFM8ioERj7YeC8Yjki_dfSnKt0lf7DB14ks9i4OAfA,17741
+src/postprocessing/common.py,sha256=fU3ECfnR0rpF21DnVYM2YM7kPEB4gRJuMasyrNupsaA,23026
 src/postprocessing/postprocess_booking_confirmation.py,sha256=nK32eDiBNbauyQz0oCa9eraysku8aqzrcoRFoWVumDU,4827
 src/postprocessing/postprocess_commercial_invoice.py,sha256=3I8ijluTZcOs_sMnFZxfkAPle0UFQ239EMuvZfDZVPg,1028
 src/postprocessing/postprocess_partner_invoice.py,sha256=koGR7dN37FqJcepdzkrzNBHuBBUuCp_3CrteScASqyE,10590
@@ -26,12 +26,12 @@ src/prompts/library/bookingConfirmation/other/prompt.txt,sha256=kUK7NgVNDYFMnqOc
 src/prompts/library/bookingConfirmation/yangming/placeholders.json,sha256=IpM9nmSPdyroliZfXB1-NDCjiHZX_Ff5BH7-scNhGqE,1406
 src/prompts/library/bookingConfirmation/yangming/prompt.txt,sha256=fYKfusDajDFw0v54-nv2iAqUSp2yCeOzc6G7AFe-h2w,3226
 src/prompts/library/bundeskasse/other/placeholders.json,sha256=1ll8AI58F2zRDSwQq_r0gxQdxlQB521l5CuiJ-8G6us,4068
-src/prompts/library/bundeskasse/other/prompt.txt,sha256=
+src/prompts/library/bundeskasse/other/prompt.txt,sha256=MBv4MIMASMstkzDS7H0q_pNJbPQeadP1vcmhCRrpjQ4,2906
 src/prompts/library/commercialInvoice/other/placeholders.json,sha256=zUK2mg9MnHiEQRYF6VgTiUiq68WGy5f7_4qL63CWyR0,4700
-src/prompts/library/commercialInvoice/other/prompt.txt,sha256=
+src/prompts/library/commercialInvoice/other/prompt.txt,sha256=CJapcVrmcvynJUanETDklkzU-0N9hHdhq5wL4MK7OIY,2683
 src/prompts/library/customsAssessment/other/prompt.txt,sha256=XSqWa3k9LM7dTiJtX8AKTp_0x5Z0pCNRKNUWaywwBlY,2191
 src/prompts/library/customsInvoice/other/placeholders.json,sha256=BnWYtl4sPooTHb_EHRIlrPawBrfHI8_QVas8zytbqyY,12172
-src/prompts/library/customsInvoice/other/prompt.txt,sha256=
+src/prompts/library/customsInvoice/other/prompt.txt,sha256=daSRssY8zcboCJCuqbLqehGR5dJs_wp4hOZHRol3KqU,9595
 src/prompts/library/deliveryOrder/other/placeholders.json,sha256=7fjqag3kCVMV4mJ52dTjAcLtaBX0paXrDrW48vQVZSk,1250
 src/prompts/library/deliveryOrder/other/prompt.txt,sha256=y3QjN54e8PplEJngNlxoykbdrToBefS3r8gWixCbjfE,2468
 src/prompts/library/draftMbl/hapag-lloyd/prompt.txt,sha256=4FxiO1eHkimZVQZXU6gGNikuDVAWNniYvY8FUdVhpvk,2327
@@ -44,16 +44,16 @@ src/prompts/library/finalMbL/other/prompt.txt,sha256=pj-kgPV51upLhDppSKfhc2s5ylg
 src/prompts/library/packingList/other/placeholders.json,sha256=cGUUvEFoi4Lm0BAiyD29KbNFbUgzO1s7eit_qK3F0ig,4478
 src/prompts/library/packingList/other/prompt.txt,sha256=6Q9d0KBG6YWmNtzFivvmtQmitaUE2jytfwwc5YwsUgQ,2872
 src/prompts/library/partnerInvoice/other/placeholders.json,sha256=NX6ADT4gxLpP90uoNCYDbmfBvROxxVWRKK0lRFy1n9s,10897
-src/prompts/library/partnerInvoice/other/prompt.txt,sha256=
+src/prompts/library/partnerInvoice/other/prompt.txt,sha256=4WGEQ6EiOtQxB7iwKy_Hg0PQzCEoFbjJUwEawwTgWiw,7775
 src/prompts/library/postprocessing/port_code/placeholders.json,sha256=2TiXf3zSzrglOMPtDOlCntIa5RSvyZQAKG2-IgrCY5A,22
 src/prompts/library/postprocessing/port_code/prompt_port_code.txt,sha256=--1wunSqEr2ox958lEhjO-0JFBfOLzA3qfKYIzG_Iok,884
 src/prompts/library/preprocessing/carrier/placeholders.json,sha256=1UmrQNqBEsjLIpOO-a39Az6bQ_g1lxDGlwqZFU3IEt0,408
 src/prompts/library/preprocessing/carrier/prompt.txt,sha256=NLvRZQCZ6aWC1yTr7Q93jK5z7Vi_b4HBaiFYYnIsO-w,134
 src/prompts/library/shippingInstruction/other/prompt.txt,sha256=dT2e-dPuvuz0rVYpwmok_1dWQ2Oa8Qy9NGZ6CCLOUI4,1468
-src/prompts/prompt_library.py,sha256=
+src/prompts/prompt_library.py,sha256=VJWHeXN-s501C2GiidIIvQQuZdU6T1R27hE2dKBiI40,2555
 src/setup.py,sha256=M-p5c8M9ejKcSZ9N86VtmtPc4TYLxe1_4_dxf6jpfVc,7262
 src/tms.py,sha256=UXbIo1QE--hIX6NZi5Qyp2R_CP338syrY9pCTPrfgnE,1741
-src/utils.py,sha256=
-data_science_document_ai-1.43.
-data_science_document_ai-1.43.
-data_science_document_ai-1.43.
+src/utils.py,sha256=iUFjfIKXl_MwkPXPMfK0ZAB9aZ__N6e8mWTBbBiPki4,16568
+data_science_document_ai-1.43.6.dist-info/METADATA,sha256=hyfRauOLmwLyBPOsJKBmKH70yWCvjZnXbeUkY6fX8aY,2152
+data_science_document_ai-1.43.6.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
+data_science_document_ai-1.43.6.dist-info/RECORD,,
|
src/excel_processing.py
CHANGED
@@ -19,7 +19,7 @@ from src.utils import estimate_page_count, generate_schema_structure, get_excel_
 
 
 async def extract_data_from_sheet(
-
+    llm_client, sheet_name, sheet, response_schema, doc_type=None
 ):
     logger.info(f"Processing sheet: {sheet_name}")
     excel_content = pd.DataFrame(sheet.values).dropna(how="all", axis=1)
@@ -34,7 +34,7 @@ async def extract_data_from_sheet(
     prompt_docai = prompt_excel_extraction(worksheet)
 
     try:
-        result = await
+        result = await llm_client.get_unified_json_genai(
             prompt_docai,
             response_schema=response_schema,
             doc_type=doc_type,
@@ -51,6 +51,7 @@ async def extract_data_from_excel(
     input_doc_type,
     file_content,
     mime_type,
+    llm_client,
 ):
     """Extract data from the Excel file.
 
@@ -59,6 +60,7 @@ async def extract_data_from_excel(
         input_doc_type (str): The type of the document.
         file_content (bytes): The content of the Excel file to process.
        mime_type (str): The MIME type of the file.
+        llm_client: The LLM client to use for data extraction.
 
     Returns:
         formatted_data (list): A list of dictionaries containing the extracted data.
@@ -95,7 +97,7 @@ async def extract_data_from_excel(
     # Excel files may contain multiple sheets. Extract data from each sheet
     sheet_extract_tasks = [
         extract_data_from_sheet(
-
+            llm_client,
             sheet_name,
             workbook[sheet_name],
             response_schema,
@@ -105,7 +107,4 @@ async def extract_data_from_excel(
     ]
     extracted_data = {k: v for k, v in await asyncio.gather(*sheet_extract_tasks)}
 
-
-    extracted_data = llm_prediction_to_tuples(extracted_data)
-
-    return extracted_data, extracted_data, params["gemini_params"]["model_id"]
+    return extracted_data, extracted_data, llm_client.model_id
src/pdf_processing.py
CHANGED
@@ -36,6 +36,7 @@ from src.utils import (
     get_pdf_page_count,
     get_processor_name,
     run_background_tasks,
+    transform_schema_strings,
     validate_based_on_schema,
 )
 
@@ -199,6 +200,7 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
         if input_doc_type == "bundeskasse"
         else file_content
     )
+    number_of_pages = get_pdf_page_count(file_content)
 
     # convert file_content to required document
     document = llm_client.prepare_document_for_gemini(file_content)
@@ -254,6 +256,13 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
         # get the related prompt from predefined prompt library
         prompt = prompt_library.library[input_doc_type][carrier]["prompt"]
 
+        # Update schema to extract value-page_number pairs
+        if number_of_pages > 1:
+            response_schema = transform_schema_strings(response_schema)
+
+            # Update the prompt to instruct LLM to include page numbers
+            prompt += "\nFor each field, provide the page number where the information was found. The page numbering starts from 0."
+
         # generate the result with LLM (gemini)
         result = await llm_client.get_unified_json_genai(
             prompt=prompt,
@@ -262,7 +271,7 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
             doc_type=input_doc_type,
         )
 
-        result = llm_prediction_to_tuples(result)
+        result = llm_prediction_to_tuples(result, number_of_pages)
 
         return result
     return {}
@@ -342,15 +351,9 @@ async def extract_data_by_doctype(
     processor_client,
     if_use_docai,
     if_use_llm,
+    llm_client,
     isBetaTest=False,
 ):
-    # Select LLM client (Using 2.5 Pro model only for PI and customsInvoice)
-    llm_client = (
-        params["LlmClient_Flash"]
-        if input_doc_type not in ["customsInvoice", "partnerInvoice"]
-        else params["LlmClient"]
-    )
-
     async def extract_w_docai():
         return await extract_data_from_pdf_w_docai(
             params=params,
@@ -420,6 +423,14 @@ async def data_extraction_manual_flow(
     """
     # Get the start time for processing
     start_time = asyncio.get_event_loop().time()
+
+    # Select LLM client (Using 2.5 Pro model only for PI and customsInvoice)
+    llm_client = (
+        params["LlmClient_Flash"]
+        if meta.documentTypeCode not in ["customsInvoice", "partnerInvoice"]
+        else params["LlmClient"]
+    )
+
     page_count = None
     # Validate the file type
     if mime_type == "application/pdf":
@@ -442,6 +453,7 @@ async def data_extraction_manual_flow(
             processor_client,
             if_use_docai=if_use_docai,
             if_use_llm=if_use_llm,
+            llm_client=llm_client,
             isBetaTest=False,
         )
         page_count = get_pdf_page_count(file_content)
@@ -453,6 +465,7 @@ async def data_extraction_manual_flow(
             input_doc_type=meta.documentTypeCode,
             file_content=file_content,
             mime_type=mime_type,
+            llm_client=llm_client,
        )
 
         # Get sheet count from dd-trace span (set in extract_data_from_excel)
@@ -472,7 +485,7 @@ async def data_extraction_manual_flow(
     )
     # Create the result dictionary with the extracted data
     extracted_data = await format_all_entities(
-        extracted_data, meta.documentTypeCode, params
+        extracted_data, meta.documentTypeCode, params, mime_type
     )
     result = {
         "id": meta.id,
src/postprocessing/common.py
CHANGED
@@ -372,27 +372,30 @@ def clean_item_description(lineitem: str, remove_numbers: bool = True):
     return re.sub(r"\s{2,}", " ", lineitem).strip()
 
 
-async def format_label(entity_k, entity_value, document_type_code, params):
+async def format_label(entity_k, entity_value, document_type_code, params, mime_type):
     llm_client = params["LlmClient"]
     if isinstance(entity_value, dict):  # if it's a nested entity
         format_tasks = [
-            format_label(sub_k, sub_v, document_type_code, params)
+            format_label(sub_k, sub_v, document_type_code, params, mime_type)
             for sub_k, sub_v in entity_value.items()
         ]
         return entity_k, {k: v for k, v in await asyncio.gather(*format_tasks)}
     if isinstance(entity_value, list):
         format_tasks = await asyncio.gather(
             *[
-                format_label(entity_k, sub_v, document_type_code, params)
+                format_label(entity_k, sub_v, document_type_code, params, mime_type)
                 for sub_v in entity_value
             ]
         )
         return entity_k, [v for _, v in format_tasks]
-
-
-    entity_value
-
-
+
+    if mime_type == "application/pdf":
+        if isinstance(entity_value, tuple):
+            page = entity_value[1]
+            entity_value = entity_value[0]
+        else:
+            page = -1
+
     entity_key = entity_k.lower()
     formatted_value = None
 
@@ -493,8 +496,10 @@ async def format_label(entity_k, entity_value, document_type_code, params):
     result = {
         "documentValue": entity_value,
         "formattedValue": formatted_value,
-        "page": page,
     }
+    if mime_type == "application/pdf":
+        result["page"] = page
+
     return entity_k, result
 
 
@@ -593,7 +598,7 @@ def decimal_convertor(value, quantity=False):
     return value
 
 
-async def format_all_entities(result, document_type_code, params):
+async def format_all_entities(result, document_type_code, params, mime_type):
     """Format the entity values in the result dictionary."""
     # Since we treat `customsInvoice` same as `partnerInvoice`
     document_type_code = (
@@ -608,7 +613,9 @@ async def format_all_entities(result, document_type_code, params):
         return {}
 
     # Format all entities recursively
-    _, aggregated_data = await format_label(
+    _, aggregated_data = await format_label(
+        None, result, document_type_code, params, mime_type
+    )
 
     # Process partner invoice on lineitem mapping and reverse charge sentence
     if document_type_code in ["partnerInvoice", "bundeskasse"]:
@@ -644,8 +651,24 @@ def remove_stop_words(lineitem: str):
     )
 
 
-def llm_prediction_to_tuples(llm_prediction):
+def llm_prediction_to_tuples(llm_prediction, number_of_pages=-1):
     """Convert LLM prediction dictionary to tuples of (value, page_number)."""
+
+    # If only 1 page, simply pair each value with page number 0
+    if number_of_pages == 1:
+        if isinstance(llm_prediction, dict):
+            return {
+                k: llm_prediction_to_tuples(v, number_of_pages)
+                for k, v in llm_prediction.items()
+            }
+        elif isinstance(llm_prediction, list):
+            return [
+                llm_prediction_to_tuples(v, number_of_pages) for v in llm_prediction
+            ]
+        else:
+            return (llm_prediction, 0) if llm_prediction else None
+
+    # logic for multi-page predictions
     if isinstance(llm_prediction, dict):
         if "page_number" in llm_prediction.keys() and "value" in llm_prediction.keys():
             if llm_prediction["value"]:
@@ -655,11 +678,14 @@ def llm_prediction_to_tuples(llm_prediction):
                 page_number = -1
             return (llm_prediction["value"], page_number)
         return None
+
         for key, value in llm_prediction.items():
             llm_prediction[key] = llm_prediction_to_tuples(
-                llm_prediction.get(key, value)
+                llm_prediction.get(key, value), number_of_pages
             )
+
     elif isinstance(llm_prediction, list):
         for i, item in enumerate(llm_prediction):
-            llm_prediction[i] = llm_prediction_to_tuples(item)
+            llm_prediction[i] = llm_prediction_to_tuples(item, number_of_pages)
+
     return llm_prediction
src/prompts/library/bundeskasse/other/prompt.txt
CHANGED
@@ -1,6 +1,6 @@
 <PERSONA> You are an efficient document entity data extraction specialist working for a Freight Forwarding company. <PERSONA>
 
-<TASK>Your task is to extract data
+<TASK> Your task is to extract data from customs invoice documents as per the given response schema structure. <TASK>
 
 <CONTEXT>
 The Freight Forwarding company receives Customs invoices from Customs Brokers called Bundeskasse.
src/prompts/library/commercialInvoice/other/prompt.txt
CHANGED
@@ -2,7 +2,7 @@ Task: You are a document entity extraction specialist. Given a document, your ta
 
 Extract all the data points from the given document.
 Each data point is part of a master field called skus. There may be multiple skus entries in a document.
-Your task is to extract the text value of the entities and page numbers starting from 0
+Your task is to extract the text value of the entities and page numbers starting from 0 where the value was found in the document.
 
 
 Instructions:
src/prompts/library/customsInvoice/other/prompt.txt
CHANGED
@@ -1,6 +1,6 @@
 <PERSONA> You are an efficient document entity data extraction specialist working for a Freight Forwarding company. <PERSONA>
 
-<TASK>Your task is to extract data
+<TASK> Your task is to extract data from invoice documents as per the given response schema structure. <TASK>
 
 <CONTEXT>
 The Freight Forwarding company receives invoices from Carrier (Shipping Lines) partners and Customs Brokers. These include Partner Invoices (COGS Invoices) and COGS Customs Invoices.
src/prompts/library/partnerInvoice/other/prompt.txt
CHANGED
@@ -1,6 +1,6 @@
 <PERSONA> You are an efficient document entity data extraction specialist working for a Freight Forwarding company. <PERSONA>
 
-<TASK>Your task is to extract data
+<TASK> Your task is to extract data from invoice documents as per the given response schema structure. <TASK>
 
 <CONTEXT>
 The Freight Forwarding company receives invoices from Carrier (Shipping Lines) partners and Customs Brokers. These include Partner Invoices (COGS Invoices) and COGS Customs Invoices.
src/prompts/prompt_library.py
CHANGED
@@ -4,8 +4,6 @@ import os
 from pathlib import Path
 from typing import Dict
 
-from src.utils import transform_schema_strings
-
 
 class PromptLibrary:
     """
@@ -43,8 +41,6 @@ class PromptLibrary:
                     if file == "placeholders.json":
                         with open(path_to_library / prompt_type / prompt_subtype / file) as f:
                             placeholders = json.load(f)
-                        if prompt_type not in ["postprocessing", "preprocessing"]:
-                            placeholders = transform_schema_strings(placeholders)
                         self.library[prompt_type][prompt_subtype][
                             "placeholders"
                         ] = placeholders
src/utils.py
CHANGED
@@ -314,9 +314,6 @@ def generate_schema_structure(params, input_doc_type):
             "type": "string",
         }
 
-    # update schema to extract value-page_number pairs
-    response_schema = transform_schema_strings(response_schema)
-
     return response_schema
 
 
@@ -446,12 +443,23 @@ def transform_schema_strings(schema):
     Returns:
         dict: The transformed schema dictionary.
     """
-
-
-
+    if not isinstance(schema, dict):
+        return schema
+
+    schema_type = schema.get("type")
+    if not schema_type:
+        return schema
+
+    # Base case: STRING → OBJECT (only if not already transformed)
+    if schema_type.upper() == "STRING":
+        return {
             "type": "OBJECT",
             "properties": {
-            "value": {
+                "value": {
+                    "type": "STRING",
+                    "nullable": schema.get("nullable", False),
+                    "description": schema.get("description", ""),
+                },
                 "page_number": {
                     "type": "STRING",
                     "description": "Number of a page where the value was found in the document starting from 0.",
@@ -460,29 +468,29 @@ def transform_schema_strings(schema):
             "required": [],
         }
 
-
-
-
-
-
+    # Skip already transformed OBJECT (has both 'value' & 'page_number')
+    if (
+        schema_type.upper() == "OBJECT"
+        and "properties" in schema
+        and {"value", "page_number"}.issubset(schema["properties"].keys())
+    ):
+        return schema
 
+    # Recursive case for OBJECT
+    if schema_type.upper() == "OBJECT" and "properties" in schema:
+        new_schema = schema.copy()
+        new_schema["properties"] = {
+            k: transform_schema_strings(v) for k, v in schema["properties"].items()
+        }
         return new_schema
 
-    # Recursive case
-
-
-
-
-        return transformed_schema
-
-    # Recursive case: if the schema is a list
-    elif isinstance(schema, dict) and schema.get("type").upper() == "ARRAY":
-        schema["items"] = transform_schema_strings(schema["items"])
-        return schema
+    # Recursive case for ARRAY
+    if schema_type.upper() == "ARRAY" and "items" in schema:
+        new_schema = schema.copy()
+        new_schema["items"] = transform_schema_strings(schema["items"])
+        return new_schema
 
-
-    else:
-        return schema
+    return schema
 
 
 def estimate_page_count(sheet):
{data_science_document_ai-1.43.4.dist-info → data_science_document_ai-1.43.6.dist-info}/WHEEL
RENAMED
File without changes