data-science-document-ai 1.43.3__py3-none-any.whl → 1.43.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_science_document_ai-1.43.3.dist-info → data_science_document_ai-1.43.5.dist-info}/METADATA +1 -1
- {data_science_document_ai-1.43.3.dist-info → data_science_document_ai-1.43.5.dist-info}/RECORD +12 -12
- src/excel_processing.py +19 -9
- src/pdf_processing.py +22 -9
- src/postprocessing/common.py +40 -14
- src/prompts/library/bundeskasse/other/prompt.txt +1 -1
- src/prompts/library/commercialInvoice/other/prompt.txt +1 -1
- src/prompts/library/customsInvoice/other/prompt.txt +1 -1
- src/prompts/library/partnerInvoice/other/prompt.txt +1 -1
- src/prompts/prompt_library.py +0 -4
- src/utils.py +0 -3
- {data_science_document_ai-1.43.3.dist-info → data_science_document_ai-1.43.5.dist-info}/WHEEL +0 -0
{data_science_document_ai-1.43.3.dist-info → data_science_document_ai-1.43.5.dist-info}/RECORD
RENAMED
@@ -2,12 +2,12 @@ src/constants.py,sha256=rpYIecVLIBLh98YrJ8e5gdvM0bqrXJZWIKgFkUSn69g,3513
 src/constants_sandbox.py,sha256=Iu6HdjCoNSmOX0AwoL9qUQkhq_ZnIN5U9e-Q2UfNuGc,547
 src/docai.py,sha256=dHuR0ehVjUi1CnoNvdp_yxJtpU_HFXqAZ61ywdz7BEo,5655
 src/docai_processor_config.yaml,sha256=81NUGs-u8UFJm6mc0ZOeeNQlhe9h0f35GhjTcwErvTA,1717
-src/excel_processing.py,sha256=
+src/excel_processing.py,sha256=PdypkXHf-hln5cq5TyJ_IVybZk-rJF1NKZ50KXuOSdY,3390
 src/io.py,sha256=tOJpMyI-mP1AaXKG4UFudH47MHWzjWBgVahFJUcjGfs,4749
 src/llm.py,sha256=OE4IEIqcM-hYK9U7e0x1rAfcqdpeo4iXPHBp64L5Qz0,8199
 src/log_setup.py,sha256=RhHnpXqcl-ii4EJzRt47CF2R-Q3YPF68tepg_Kg7tkw,2895
-src/pdf_processing.py,sha256=
-src/postprocessing/common.py,sha256=
+src/pdf_processing.py,sha256=DaFM8ioERj7YeC8Yjki_dfSnKt0lf7DB14ks9i4OAfA,17741
+src/postprocessing/common.py,sha256=fU3ECfnR0rpF21DnVYM2YM7kPEB4gRJuMasyrNupsaA,23026
 src/postprocessing/postprocess_booking_confirmation.py,sha256=nK32eDiBNbauyQz0oCa9eraysku8aqzrcoRFoWVumDU,4827
 src/postprocessing/postprocess_commercial_invoice.py,sha256=3I8ijluTZcOs_sMnFZxfkAPle0UFQ239EMuvZfDZVPg,1028
 src/postprocessing/postprocess_partner_invoice.py,sha256=koGR7dN37FqJcepdzkrzNBHuBBUuCp_3CrteScASqyE,10590
@@ -26,12 +26,12 @@ src/prompts/library/bookingConfirmation/other/prompt.txt,sha256=kUK7NgVNDYFMnqOc
 src/prompts/library/bookingConfirmation/yangming/placeholders.json,sha256=IpM9nmSPdyroliZfXB1-NDCjiHZX_Ff5BH7-scNhGqE,1406
 src/prompts/library/bookingConfirmation/yangming/prompt.txt,sha256=fYKfusDajDFw0v54-nv2iAqUSp2yCeOzc6G7AFe-h2w,3226
 src/prompts/library/bundeskasse/other/placeholders.json,sha256=1ll8AI58F2zRDSwQq_r0gxQdxlQB521l5CuiJ-8G6us,4068
-src/prompts/library/bundeskasse/other/prompt.txt,sha256=
+src/prompts/library/bundeskasse/other/prompt.txt,sha256=MBv4MIMASMstkzDS7H0q_pNJbPQeadP1vcmhCRrpjQ4,2906
 src/prompts/library/commercialInvoice/other/placeholders.json,sha256=zUK2mg9MnHiEQRYF6VgTiUiq68WGy5f7_4qL63CWyR0,4700
-src/prompts/library/commercialInvoice/other/prompt.txt,sha256=
+src/prompts/library/commercialInvoice/other/prompt.txt,sha256=CJapcVrmcvynJUanETDklkzU-0N9hHdhq5wL4MK7OIY,2683
 src/prompts/library/customsAssessment/other/prompt.txt,sha256=XSqWa3k9LM7dTiJtX8AKTp_0x5Z0pCNRKNUWaywwBlY,2191
 src/prompts/library/customsInvoice/other/placeholders.json,sha256=BnWYtl4sPooTHb_EHRIlrPawBrfHI8_QVas8zytbqyY,12172
-src/prompts/library/customsInvoice/other/prompt.txt,sha256=
+src/prompts/library/customsInvoice/other/prompt.txt,sha256=daSRssY8zcboCJCuqbLqehGR5dJs_wp4hOZHRol3KqU,9595
 src/prompts/library/deliveryOrder/other/placeholders.json,sha256=7fjqag3kCVMV4mJ52dTjAcLtaBX0paXrDrW48vQVZSk,1250
 src/prompts/library/deliveryOrder/other/prompt.txt,sha256=y3QjN54e8PplEJngNlxoykbdrToBefS3r8gWixCbjfE,2468
 src/prompts/library/draftMbl/hapag-lloyd/prompt.txt,sha256=4FxiO1eHkimZVQZXU6gGNikuDVAWNniYvY8FUdVhpvk,2327
@@ -44,16 +44,16 @@ src/prompts/library/finalMbL/other/prompt.txt,sha256=pj-kgPV51upLhDppSKfhc2s5ylg
 src/prompts/library/packingList/other/placeholders.json,sha256=cGUUvEFoi4Lm0BAiyD29KbNFbUgzO1s7eit_qK3F0ig,4478
 src/prompts/library/packingList/other/prompt.txt,sha256=6Q9d0KBG6YWmNtzFivvmtQmitaUE2jytfwwc5YwsUgQ,2872
 src/prompts/library/partnerInvoice/other/placeholders.json,sha256=NX6ADT4gxLpP90uoNCYDbmfBvROxxVWRKK0lRFy1n9s,10897
-src/prompts/library/partnerInvoice/other/prompt.txt,sha256=
+src/prompts/library/partnerInvoice/other/prompt.txt,sha256=4WGEQ6EiOtQxB7iwKy_Hg0PQzCEoFbjJUwEawwTgWiw,7775
 src/prompts/library/postprocessing/port_code/placeholders.json,sha256=2TiXf3zSzrglOMPtDOlCntIa5RSvyZQAKG2-IgrCY5A,22
 src/prompts/library/postprocessing/port_code/prompt_port_code.txt,sha256=--1wunSqEr2ox958lEhjO-0JFBfOLzA3qfKYIzG_Iok,884
 src/prompts/library/preprocessing/carrier/placeholders.json,sha256=1UmrQNqBEsjLIpOO-a39Az6bQ_g1lxDGlwqZFU3IEt0,408
 src/prompts/library/preprocessing/carrier/prompt.txt,sha256=NLvRZQCZ6aWC1yTr7Q93jK5z7Vi_b4HBaiFYYnIsO-w,134
 src/prompts/library/shippingInstruction/other/prompt.txt,sha256=dT2e-dPuvuz0rVYpwmok_1dWQ2Oa8Qy9NGZ6CCLOUI4,1468
-src/prompts/prompt_library.py,sha256=
+src/prompts/prompt_library.py,sha256=VJWHeXN-s501C2GiidIIvQQuZdU6T1R27hE2dKBiI40,2555
 src/setup.py,sha256=M-p5c8M9ejKcSZ9N86VtmtPc4TYLxe1_4_dxf6jpfVc,7262
 src/tms.py,sha256=UXbIo1QE--hIX6NZi5Qyp2R_CP338syrY9pCTPrfgnE,1741
-src/utils.py,sha256=
-data_science_document_ai-1.43.
-data_science_document_ai-1.43.
-data_science_document_ai-1.43.
+src/utils.py,sha256=Ro4FEYo28VgJwTy842MkNrK5MIAWglW0CmDcfDEhmAo,16514
+data_science_document_ai-1.43.5.dist-info/METADATA,sha256=2XHEh0gDLvzPfNKgt1mwIx4THUV5dgIFLK3K2tWFgqQ,2152
+data_science_document_ai-1.43.5.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
+data_science_document_ai-1.43.5.dist-info/RECORD,,

src/excel_processing.py
CHANGED
@@ -9,17 +9,17 @@ from src.postprocessing.common import llm_prediction_to_tuples
 logger = logging.getLogger(__name__)
 
 import asyncio
-import json
 
 import numpy as np
 import pandas as pd
 
 from src.llm import prompt_excel_extraction
+from src.prompts.prompt_library import prompt_library
 from src.utils import estimate_page_count, generate_schema_structure, get_excel_sheets
 
 
 async def extract_data_from_sheet(
-
+    llm_client, sheet_name, sheet, response_schema, doc_type=None
 ):
     logger.info(f"Processing sheet: {sheet_name}")
     excel_content = pd.DataFrame(sheet.values).dropna(how="all", axis=1)
@@ -34,7 +34,7 @@ async def extract_data_from_sheet(
     prompt_docai = prompt_excel_extraction(worksheet)
 
     try:
-        result = await
+        result = await llm_client.get_unified_json_genai(
             prompt_docai,
             response_schema=response_schema,
             doc_type=doc_type,
@@ -51,6 +51,7 @@ async def extract_data_from_excel(
     input_doc_type,
     file_content,
     mime_type,
+    llm_client,
 ):
     """Extract data from the Excel file.
 
@@ -59,6 +60,7 @@ async def extract_data_from_excel(
         input_doc_type (str): The type of the document.
         file_content (bytes): The content of the Excel file to process.
         mime_type (str): The MIME type of the file.
+        llm_client: The LLM client to use for data extraction.
 
     Returns:
         formatted_data (list): A list of dictionaries containing the extracted data.
@@ -67,7 +69,18 @@ async def extract_data_from_excel(
 
     """
     # Generate the response structure
-    response_schema =
+    response_schema = (
+        prompt_library.library[input_doc_type]["other"]["placeholders"]
+        if input_doc_type
+        in [
+            "partnerInvoice",
+            "customsInvoice",
+            "bundeskasse",
+            "commercialInvoice",
+            "packingList",
+        ]
+        else generate_schema_structure(params, input_doc_type)
+    )
 
     # Load the Excel file and get ONLY the "visible" sheet names
     sheets, workbook = get_excel_sheets(file_content, mime_type)
@@ -84,7 +97,7 @@ async def extract_data_from_excel(
     # Excel files may contain multiple sheets. Extract data from each sheet
     sheet_extract_tasks = [
         extract_data_from_sheet(
-
+            llm_client,
             sheet_name,
             workbook[sheet_name],
             response_schema,
@@ -94,7 +107,4 @@ async def extract_data_from_excel(
     ]
     extracted_data = {k: v for k, v in await asyncio.gather(*sheet_extract_tasks)}
 
-
-    extracted_data = llm_prediction_to_tuples(extracted_data)
-
-    return extracted_data, extracted_data, params["gemini_params"]["model_id"]
+    return extracted_data, extracted_data, llm_client.model_id

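The substantive change in excel_processing.py is that extract_data_from_excel no longer resolves the model from params["gemini_params"] itself: the caller now supplies an llm_client, and for the invoice-style document types the response schema comes straight from the prompt library placeholders rather than generate_schema_structure. A rough caller sketch under those assumptions (keyword names not visible in this diff, and the example MIME type, are illustrative only):

from src.excel_processing import extract_data_from_excel

async def extract_invoice_workbook(params, file_content):
    # The caller picks the client; in 1.43.3 the function derived the model id internally.
    llm_client = params["LlmClient_Flash"]  # assumed key, mirroring the selection in pdf_processing.py
    extracted, raw, model_id = await extract_data_from_excel(
        params=params,
        input_doc_type="commercialInvoice",  # schema taken from prompt_library placeholders
        file_content=file_content,
        mime_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        llm_client=llm_client,
    )
    return extracted, model_id

Note also that the llm_prediction_to_tuples call was dropped here, so Excel extractions are no longer converted to (value, page) tuples; that conversion now happens only in the PDF path shown below.
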
src/pdf_processing.py
CHANGED
@@ -36,6 +36,7 @@ from src.utils import (
     get_pdf_page_count,
     get_processor_name,
     run_background_tasks,
+    transform_schema_strings,
     validate_based_on_schema,
 )
 
@@ -199,6 +200,7 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
         if input_doc_type == "bundeskasse"
         else file_content
     )
+    number_of_pages = get_pdf_page_count(file_content)
 
     # convert file_content to required document
     document = llm_client.prepare_document_for_gemini(file_content)
@@ -254,6 +256,13 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
         # get the related prompt from predefined prompt library
         prompt = prompt_library.library[input_doc_type][carrier]["prompt"]
 
+        # Update schema to extract value-page_number pairs
+        if number_of_pages > 1:
+            response_schema = transform_schema_strings(response_schema)
+
+            # Update the prompt to instruct LLM to include page numbers
+            prompt += "\nFor each field, provide the page number where the information was found. The page numbering starts from 0."
+
         # generate the result with LLM (gemini)
         result = await llm_client.get_unified_json_genai(
             prompt=prompt,
@@ -262,7 +271,7 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
             doc_type=input_doc_type,
         )
 
-        result = llm_prediction_to_tuples(result)
+        result = llm_prediction_to_tuples(result, number_of_pages)
 
         return result
     return {}
@@ -342,15 +351,9 @@ async def extract_data_by_doctype(
     processor_client,
     if_use_docai,
     if_use_llm,
+    llm_client,
     isBetaTest=False,
 ):
-    # Select LLM client (Using 2.5 Pro model only for PI and customsInvoice)
-    llm_client = (
-        params["LlmClient_Flash"]
-        if input_doc_type not in ["customsInvoice", "partnerInvoice"]
-        else params["LlmClient"]
-    )
-
     async def extract_w_docai():
         return await extract_data_from_pdf_w_docai(
             params=params,
@@ -420,6 +423,14 @@ async def data_extraction_manual_flow(
     """
    # Get the start time for processing
    start_time = asyncio.get_event_loop().time()
+
+    # Select LLM client (Using 2.5 Pro model only for PI and customsInvoice)
+    llm_client = (
+        params["LlmClient_Flash"]
+        if meta.documentTypeCode not in ["customsInvoice", "partnerInvoice"]
+        else params["LlmClient"]
+    )
+
     page_count = None
     # Validate the file type
     if mime_type == "application/pdf":
@@ -442,6 +453,7 @@ async def data_extraction_manual_flow(
             processor_client,
             if_use_docai=if_use_docai,
             if_use_llm=if_use_llm,
+            llm_client=llm_client,
             isBetaTest=False,
         )
         page_count = get_pdf_page_count(file_content)
@@ -453,6 +465,7 @@ async def data_extraction_manual_flow(
             input_doc_type=meta.documentTypeCode,
             file_content=file_content,
             mime_type=mime_type,
+            llm_client=llm_client,
         )
 
         # Get sheet count from dd-trace span (set in extract_data_from_excel)
@@ -472,7 +485,7 @@ async def data_extraction_manual_flow(
     )
     # Create the result dictionary with the extracted data
     extracted_data = await format_all_entities(
-        extracted_data, meta.documentTypeCode, params
+        extracted_data, meta.documentTypeCode, params, mime_type
     )
     result = {
         "id": meta.id,

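The page-number plumbing above only applies to documents with more than one page: get_pdf_page_count is taken up front, transform_schema_strings rewrites the response schema (that function lives in src/utils.py, whose body is not part of this diff), and the prompt gains an explicit 0-based page-numbering instruction. Judging by how the output is consumed in llm_prediction_to_tuples, the rewrite presumably turns each string field of the schema into a value/page_number pair; a speculative sketch of that behaviour, not the packaged implementation:

def transform_schema_strings_sketch(schema):
    """Guess at the rewrite: wrap string leaves into {value, page_number} objects."""
    if isinstance(schema, dict):
        if schema.get("type") == "string":
            return {
                "type": "object",
                "properties": {
                    "value": schema,
                    "page_number": {"type": "integer"},
                },
            }
        # Recurse into nested properties / items of the schema.
        return {key: transform_schema_strings_sketch(value) for key, value in schema.items()}
    if isinstance(schema, list):
        return [transform_schema_strings_sketch(item) for item in schema]
    return schema

Single-page documents keep the untouched schema, and the page is filled in later as 0 by llm_prediction_to_tuples.
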
src/postprocessing/common.py
CHANGED
@@ -372,27 +372,30 @@ def clean_item_description(lineitem: str, remove_numbers: bool = True):
     return re.sub(r"\s{2,}", " ", lineitem).strip()
 
 
-async def format_label(entity_k, entity_value, document_type_code, params):
+async def format_label(entity_k, entity_value, document_type_code, params, mime_type):
     llm_client = params["LlmClient"]
     if isinstance(entity_value, dict):  # if it's a nested entity
         format_tasks = [
-            format_label(sub_k, sub_v, document_type_code, params)
+            format_label(sub_k, sub_v, document_type_code, params, mime_type)
             for sub_k, sub_v in entity_value.items()
         ]
         return entity_k, {k: v for k, v in await asyncio.gather(*format_tasks)}
     if isinstance(entity_value, list):
         format_tasks = await asyncio.gather(
             *[
-                format_label(entity_k, sub_v, document_type_code, params)
+                format_label(entity_k, sub_v, document_type_code, params, mime_type)
                 for sub_v in entity_value
             ]
         )
         return entity_k, [v for _, v in format_tasks]
-
-
-    entity_value
-
-
+
+    if mime_type == "application/pdf":
+        if isinstance(entity_value, tuple):
+            page = entity_value[1]
+            entity_value = entity_value[0]
+        else:
+            page = -1
+
     entity_key = entity_k.lower()
     formatted_value = None
 
@@ -493,8 +496,10 @@ async def format_label(entity_k, entity_value, document_type_code, params):
     result = {
         "documentValue": entity_value,
         "formattedValue": formatted_value,
-        "page": page,
     }
+    if mime_type == "application/pdf":
+        result["page"] = page
+
     return entity_k, result
 
 
@@ -593,7 +598,7 @@ def decimal_convertor(value, quantity=False):
     return value
 
 
-async def format_all_entities(result, document_type_code, params):
+async def format_all_entities(result, document_type_code, params, mime_type):
     """Format the entity values in the result dictionary."""
     # Since we treat `customsInvoice` same as `partnerInvoice`
     document_type_code = (
@@ -608,7 +613,9 @@ async def format_all_entities(result, document_type_code, params):
         return {}
 
     # Format all entities recursively
-    _, aggregated_data = await format_label(
+    _, aggregated_data = await format_label(
+        None, result, document_type_code, params, mime_type
+    )
 
     # Process partner invoice on lineitem mapping and reverse charge sentence
     if document_type_code in ["partnerInvoice", "bundeskasse"]:
@@ -644,8 +651,24 @@ def remove_stop_words(lineitem: str):
     )
 
 
-def llm_prediction_to_tuples(llm_prediction):
+def llm_prediction_to_tuples(llm_prediction, number_of_pages=-1):
     """Convert LLM prediction dictionary to tuples of (value, page_number)."""
+
+    # If only 1 page, simply pair each value with page number 0
+    if number_of_pages == 1:
+        if isinstance(llm_prediction, dict):
+            return {
+                k: llm_prediction_to_tuples(v, number_of_pages)
+                for k, v in llm_prediction.items()
+            }
+        elif isinstance(llm_prediction, list):
+            return [
+                llm_prediction_to_tuples(v, number_of_pages) for v in llm_prediction
+            ]
+        else:
+            return (llm_prediction, 0) if llm_prediction else None
+
+    # logic for multi-page predictions
     if isinstance(llm_prediction, dict):
         if "page_number" in llm_prediction.keys() and "value" in llm_prediction.keys():
             if llm_prediction["value"]:
@@ -655,11 +678,14 @@ def llm_prediction_to_tuples(llm_prediction):
                     page_number = -1
                 return (llm_prediction["value"], page_number)
             return None
+
         for key, value in llm_prediction.items():
             llm_prediction[key] = llm_prediction_to_tuples(
-                llm_prediction.get(key, value)
+                llm_prediction.get(key, value), number_of_pages
             )
+
     elif isinstance(llm_prediction, list):
         for i, item in enumerate(llm_prediction):
-            llm_prediction[i] = llm_prediction_to_tuples(item)
+            llm_prediction[i] = llm_prediction_to_tuples(item, number_of_pages)
+
     return llm_prediction

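llm_prediction_to_tuples therefore has two paths: for single-page documents every non-empty value is simply paired with page 0, while for multi-page documents the per-field value/page_number objects requested by the transformed schema are collapsed into (value, page) tuples, which format_label later unpacks into a "page" key for PDF inputs only. A small worked example, assuming the lines hidden by this hunk read the page from the "page_number" key as the visible code implies:

from src.postprocessing.common import llm_prediction_to_tuples

single_page = {"invoiceNumber": "INV-1", "amount": "", "skus": ["A", "B"]}
llm_prediction_to_tuples(single_page, number_of_pages=1)
# {"invoiceNumber": ("INV-1", 0), "amount": None, "skus": [("A", 0), ("B", 0)]}

multi_page = {
    "invoiceNumber": {"value": "INV-1", "page_number": 0},
    "skus": [{"value": "A", "page_number": 2}],
}
llm_prediction_to_tuples(multi_page, number_of_pages=3)
# {"invoiceNumber": ("INV-1", 0), "skus": [("A", 2)]}
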
src/prompts/library/bundeskasse/other/prompt.txt
CHANGED
@@ -1,6 +1,6 @@
 <PERSONA> You are an efficient document entity data extraction specialist working for a Freight Forwarding company. <PERSONA>
 
-<TASK>Your task is to extract data
+<TASK> Your task is to extract data from customs invoice documents as per the given response schema structure. <TASK>
 
 <CONTEXT>
 The Freight Forwarding company receives Customs invoices from Customs Brokers called Bundeskasse.

src/prompts/library/commercialInvoice/other/prompt.txt
CHANGED
@@ -2,7 +2,7 @@ Task: You are a document entity extraction specialist. Given a document, your ta
 
 Extract all the data points from the given document.
 Each data point is part of a master field called skus. There may be multiple skus entries in a document.
-Your task is to extract the text value of the entities and page numbers starting from 0
+Your task is to extract the text value of the entities and page numbers starting from 0 where the value was found in the document.
 
 
 Instructions:

src/prompts/library/customsInvoice/other/prompt.txt
CHANGED
@@ -1,6 +1,6 @@
 <PERSONA> You are an efficient document entity data extraction specialist working for a Freight Forwarding company. <PERSONA>
 
-<TASK>Your task is to extract data
+<TASK> Your task is to extract data from invoice documents as per the given response schema structure. <TASK>
 
 <CONTEXT>
 The Freight Forwarding company receives invoices from Carrier (Shipping Lines) partners and Customs Brokers. These include Partner Invoices (COGS Invoices) and COGS Customs Invoices.

src/prompts/library/partnerInvoice/other/prompt.txt
CHANGED
@@ -1,6 +1,6 @@
 <PERSONA> You are an efficient document entity data extraction specialist working for a Freight Forwarding company. <PERSONA>
 
-<TASK>Your task is to extract data
+<TASK> Your task is to extract data from invoice documents as per the given response schema structure. <TASK>
 
 <CONTEXT>
 The Freight Forwarding company receives invoices from Carrier (Shipping Lines) partners and Customs Brokers. These include Partner Invoices (COGS Invoices) and COGS Customs Invoices.

src/prompts/prompt_library.py
CHANGED
@@ -4,8 +4,6 @@ import os
 from pathlib import Path
 from typing import Dict
 
-from src.utils import transform_schema_strings
-
 
 class PromptLibrary:
     """
@@ -43,8 +41,6 @@ class PromptLibrary:
                 if file == "placeholders.json":
                     with open(path_to_library / prompt_type / prompt_subtype / file) as f:
                         placeholders = json.load(f)
-                    if prompt_type not in ["postprocessing", "preprocessing"]:
-                        placeholders = transform_schema_strings(placeholders)
                     self.library[prompt_type][prompt_subtype][
                         "placeholders"
                     ] = placeholders

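With the loader change above, placeholders are stored exactly as they appear in each placeholders.json; the value/page_number wrapping that used to happen once at import time is now applied per request in pdf_processing.py, and only when the document has more than one page. Illustrative access, using the library layout already relied on elsewhere in this diff:

from src.prompts.prompt_library import prompt_library

# Raw response schema, with no page_number wrapping applied at load time any more.
schema = prompt_library.library["partnerInvoice"]["other"]["placeholders"]
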
src/utils.py
CHANGED
{data_science_document_ai-1.43.3.dist-info → data_science_document_ai-1.43.5.dist-info}/WHEEL
RENAMED
File without changes