data-science-document-ai 1.43.6__tar.gz → 1.51.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/PKG-INFO +1 -1
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/pyproject.toml +1 -1
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/constants.py +6 -8
- data_science_document_ai-1.51.0/src/docai_processor_config.yaml +22 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/excel_processing.py +7 -18
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/io.py +23 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/llm.py +0 -29
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/pdf_processing.py +118 -53
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/postprocessing/common.py +132 -25
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/postprocessing/postprocess_partner_invoice.py +137 -58
- data_science_document_ai-1.51.0/src/prompts/library/arrivalNotice/other/placeholders.json +70 -0
- data_science_document_ai-1.51.0/src/prompts/library/arrivalNotice/other/prompt.txt +40 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/bundeskasse/other/placeholders.json +5 -5
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/bundeskasse/other/prompt.txt +6 -4
- data_science_document_ai-1.51.0/src/prompts/library/customsAssessment/other/placeholders.json +70 -0
- data_science_document_ai-1.51.0/src/prompts/library/customsAssessment/other/prompt.txt +29 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/customsInvoice/other/prompt.txt +1 -1
- data_science_document_ai-1.51.0/src/prompts/library/deliveryOrder/other/placeholders.json +82 -0
- data_science_document_ai-1.51.0/src/prompts/library/deliveryOrder/other/prompt.txt +36 -0
- data_science_document_ai-1.51.0/src/prompts/library/draftMbl/other/placeholders.json +80 -0
- data_science_document_ai-1.51.0/src/prompts/library/draftMbl/other/prompt.txt +34 -0
- data_science_document_ai-1.51.0/src/prompts/library/finalMbL/other/placeholders.json +80 -0
- data_science_document_ai-1.51.0/src/prompts/library/finalMbL/other/prompt.txt +34 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/partnerInvoice/other/prompt.txt +4 -2
- data_science_document_ai-1.51.0/src/prompts/library/preprocessing/carrier/placeholders.json +14 -0
- data_science_document_ai-1.51.0/src/prompts/library/shippingInstruction/other/placeholders.json +115 -0
- data_science_document_ai-1.51.0/src/prompts/library/shippingInstruction/other/prompt.txt +28 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/setup.py +9 -16
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/utils.py +63 -41
- data_science_document_ai-1.43.6/src/docai_processor_config.yaml +0 -64
- data_science_document_ai-1.43.6/src/prompts/library/customsAssessment/other/prompt.txt +0 -42
- data_science_document_ai-1.43.6/src/prompts/library/deliveryOrder/other/placeholders.json +0 -29
- data_science_document_ai-1.43.6/src/prompts/library/deliveryOrder/other/prompt.txt +0 -50
- data_science_document_ai-1.43.6/src/prompts/library/draftMbl/hapag-lloyd/prompt.txt +0 -45
- data_science_document_ai-1.43.6/src/prompts/library/draftMbl/maersk/prompt.txt +0 -19
- data_science_document_ai-1.43.6/src/prompts/library/draftMbl/other/placeholders.json +0 -80
- data_science_document_ai-1.43.6/src/prompts/library/draftMbl/other/prompt.txt +0 -44
- data_science_document_ai-1.43.6/src/prompts/library/finalMbL/hapag-lloyd/prompt.txt +0 -44
- data_science_document_ai-1.43.6/src/prompts/library/finalMbL/maersk/prompt.txt +0 -19
- data_science_document_ai-1.43.6/src/prompts/library/finalMbL/other/prompt.txt +0 -44
- data_science_document_ai-1.43.6/src/prompts/library/preprocessing/carrier/placeholders.json +0 -30
- data_science_document_ai-1.43.6/src/prompts/library/shippingInstruction/other/prompt.txt +0 -16
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/constants_sandbox.py +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/docai.py +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/log_setup.py +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/postprocessing/postprocess_booking_confirmation.py +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/postprocessing/postprocess_commercial_invoice.py +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/bookingConfirmation/evergreen/placeholders.json +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/bookingConfirmation/evergreen/prompt.txt +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/bookingConfirmation/maersk/placeholders.json +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/bookingConfirmation/maersk/prompt.txt +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/bookingConfirmation/msc/placeholders.json +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/bookingConfirmation/msc/prompt.txt +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/bookingConfirmation/oocl/placeholders.json +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/bookingConfirmation/oocl/prompt.txt +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/bookingConfirmation/other/placeholders.json +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/bookingConfirmation/other/prompt.txt +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/bookingConfirmation/yangming/placeholders.json +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/bookingConfirmation/yangming/prompt.txt +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/commercialInvoice/other/placeholders.json +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/commercialInvoice/other/prompt.txt +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/customsInvoice/other/placeholders.json +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/packingList/other/placeholders.json +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/packingList/other/prompt.txt +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/partnerInvoice/other/placeholders.json +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/postprocessing/port_code/placeholders.json +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/postprocessing/port_code/prompt_port_code.txt +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/library/preprocessing/carrier/prompt.txt +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/prompts/prompt_library.py +0 -0
- {data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/tms.py +0 -0
{data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/pyproject.toml
RENAMED

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "data-science-document-ai"
-version = "1.43.6"
+version = "1.51.0"
 description = "\"Document AI repo for data science\""
 authors = ["Naomi Nguyen <naomi.nguyen@forto.com>", "Kumar Rajendrababu <kumar.rajendrababu@forto.com>", "Igor Tonko <igor.tonko@forto.com>", "Osman Demirel <osman.demirel@forto.com>"]
 packages = [
{data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/constants.py
RENAMED

@@ -23,9 +23,12 @@ project_parameters = {
     "invoice_classification_lookup": "invoice_classification.json",
     "reverse_charge_sentence_lookup": "reverse_charge_sentences.json",
     # Fuzzy logic params
-    "fuzzy_threshold_item_code":
+    "fuzzy_threshold_item_code": 90,
     "fuzzy_threshold_reverse_charge": 80,
     "fuzzy_threshold_invoice_classification": 70,
+    # Chunking params
+    "chunk_size": 1,  # page (do not change this without changing the page number logic)
+    "chunk_after": 10,  # pages
     # Big Query
     "g_ai_gbq_db_schema": "document_ai",
     "g_ai_gbq_db_table_out": "document_ai_api_calls_v1",
@@ -50,13 +53,6 @@ project_parameters = {
     "model_selector": {
         "stable": {
             "bookingConfirmation": 1,
-            "finalMbL": 0,
-            "draftMbl": 0,
-            "arrivalNotice": 0,
-            "shippingInstruction": 0,
-            "customsAssessment": 0,
-            "deliveryOrder": 0,
-            "partnerInvoice": 0,
         },
         "beta": {
             "bookingConfirmation": 0,
@@ -84,8 +80,10 @@ project_parameters = {
     # Key to combine the LLM results with the Doc Ai results
     "key_to_combine": {
         "bookingConfirmation": ["transportLegs"],
+        "arrivalNotice": ["containers"],
         "finalMbL": ["containers"],
         "draftMbl": ["containers"],
+        "deliveryOrder": ["Equipment", "TransportLeg"],
         "customsAssessment": ["containers"],
         "packingList": ["skuData"],
         "commercialInvoice": ["skus"],
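For reference, the two new chunking knobs gate the page-chunked LLM path added in src/pdf_processing.py below; a minimal sketch of how they are consumed (the params dict is a stand-in for project_parameters):

    # Stand-in for project_parameters; see process_file_w_llm further down in this diff.
    params = {"chunk_size": 1, "chunk_after": 10}

    for number_of_pages in (3, 10, 25):
        use_chunking = number_of_pages >= params["chunk_after"]
        print(number_of_pages, "chunked" if use_chunking else "single pass")
    # 3 single pass, 10 chunked, 25 chunked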
data_science_document_ai-1.51.0/src/docai_processor_config.yaml
ADDED

@@ -0,0 +1,22 @@
+models_project_id: "738250249861"
+model_config:
+  stable:
+    bookingConfirmation:
+      - id: "dc3e714cd168aeaa"
+        details:
+          display_name: "doc_cap_bookingConfirmation"
+          author: "reet.kanjilal@forto.com"
+          created_date: ""
+      - id: "3c280b11bdb3ed89"
+        details:
+          display_name: "doc_cap_BC_mlg"
+          author: "igor.tonko@forto.com"
+          created_date: ""
+
+  beta:
+    bookingConfirmation:
+      - id: "3c280b11bdb3ed89"
+        details:
+          display_name: "doc_cap_BC_mlg"
+          author: "igor.tonko@forto.com"
+          created_date: ""
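This trimmed YAML registry replaces the 64-line version removed from 1.43.6 (see the deleted file in the list above). A minimal loading sketch, assuming PyYAML; the index-based selection mirrors the `model_selector` integers in src/constants.py, which plausibly index into these per-channel model lists:

    import yaml

    with open("src/docai_processor_config.yaml") as fh:
        cfg = yaml.safe_load(fh)

    channel, doc_type, index = "stable", "bookingConfirmation", 1  # model_selector value
    model = cfg["model_config"][channel][doc_type][index]
    print(model["id"], model["details"]["display_name"])  # 3c280b11bdb3ed89 doc_cap_BC_mlg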
{data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/excel_processing.py
RENAMED

@@ -4,8 +4,6 @@ import logging
 
 from ddtrace import tracer
 
-from src.postprocessing.common import llm_prediction_to_tuples
-
 logger = logging.getLogger(__name__)
 
 import asyncio
@@ -13,9 +11,8 @@ import asyncio
 import numpy as np
 import pandas as pd
 
-from src.llm import prompt_excel_extraction
 from src.prompts.prompt_library import prompt_library
-from src.utils import estimate_page_count,
+from src.utils import estimate_page_count, get_excel_sheets
 
 
 async def extract_data_from_sheet(
@@ -31,11 +28,14 @@ async def extract_data_from_sheet(
     )
 
     # Prompt for the LLM JSON
-
+    prompt = prompt_library.library[doc_type]["other"]["prompt"]
+
+    # Join the worksheet content with the prompt
+    prompt = worksheet + "\n" + prompt
 
     try:
         result = await llm_client.get_unified_json_genai(
-
+            prompt,
             response_schema=response_schema,
             doc_type=doc_type,
         )
@@ -69,18 +69,7 @@ async def extract_data_from_excel(
 
     """
     # Generate the response structure
-    response_schema = (
-        prompt_library.library[input_doc_type]["other"]["placeholders"]
-        if input_doc_type
-        in [
-            "partnerInvoice",
-            "customsInvoice",
-            "bundeskasse",
-            "commercialInvoice",
-            "packingList",
-        ]
-        else generate_schema_structure(params, input_doc_type)
-    )
+    response_schema = prompt_library.library[input_doc_type]["other"]["placeholders"]
 
     # Load the Excel file and get ONLY the "visible" sheet names
     sheets, workbook = get_excel_sheets(file_content, mime_type)
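Sheet extraction now builds its prompt directly from the prompt library instead of the deleted prompt_excel_extraction helper; a sketch of the assembly (the worksheet text is invented):

    from src.prompts.prompt_library import prompt_library

    doc_type = "partnerInvoice"
    worksheet = "Sheet1:\nInvoice No | Amount\nINV-1 | 100.00"  # structured sheet text

    # Document text first, extraction instructions after, as in extract_data_from_sheet.
    prompt = worksheet + "\n" + prompt_library.library[doc_type]["other"]["prompt"]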
{data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/io.py
RENAMED

@@ -156,4 +156,27 @@ def download_dir_from_bucket(bucket, directory_cloud, directory_local) -> bool:
     return result
 
 
+def bq_logs(data_to_insert, params):
+    """Insert logs into Google BigQuery.
+
+    Args:
+        data_to_insert (list): The data to insert into BigQuery.
+        params (dict): The parameters dictionary.
+    """
+    # Use the pre-initialized BigQuery client
+    bq_client = params["bq_client"]
+    # Get the table string
+    table_string = f"{params['g_ai_project_name']}.{params['g_ai_gbq_db_schema']}.{params['g_ai_gbq_db_table_out']}"
+
+    logger.info(f"Log table: {table_string}")
+    # Insert the rows into the table
+    insert_logs = bq_client.insert_rows_json(table_string, data_to_insert)
+
+    # Check if there were any errors inserting the rows
+    if not insert_logs:
+        logger.info("New rows have been added.")
+    else:
+        logger.info("Errors occurred while inserting rows: ", insert_logs)
+
+
 # type: ignore
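A usage sketch for the new helper (google-cloud-bigquery's insert_rows_json returns an empty list on success and per-row error dicts otherwise, which is what the `if not insert_logs` check relies on); project and row values are invented:

    from google.cloud import bigquery

    from src.io import bq_logs

    params = {
        "bq_client": bigquery.Client(),              # pre-initialized client
        "g_ai_project_name": "my-gcp-project",       # assumption: set elsewhere in params
        "g_ai_gbq_db_schema": "document_ai",
        "g_ai_gbq_db_table_out": "document_ai_api_calls_v1",
    }
    rows = [{"doc_type": "partnerInvoice", "status": "ok"}]  # hypothetical payload
    bq_logs(rows, params)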
{data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/llm.py
RENAMED

@@ -201,33 +201,4 @@ class LlmClient:
         return response
 
 
-def prompt_excel_extraction(excel_structured_text):
-    """Write a prompt to extract data from Excel files.
-
-    Args:
-        excel_structured_text (str): The structured text of the Excel file.
-
-    Returns:
-        prompt str: The prompt for common json.
-    """
-    prompt = f"""{excel_structured_text}
-
-    Task: Fill in the following dictionary from the information in the given in the above excel data.
-
-    Instructions:
-    - Do not change the keys of the following dictionary.
-    - The values should be filled in as per the schema provided below.
-    - If an entity contains a 'display_name', consider its properties as child data points in the below format.
-    {{'data-field': {{
-        'child-data-field': 'type -occurrence_type- description',
-        }}
-    }}
-    - The entity with 'display_name' can be extracted multiple times. Please pay attention to the occurrence_type.
-    - Ensure the schema reflects the hierarchical relationship.
-    - Use the data field description to understand the context of the data.
-
-    """
-    return prompt
-
-
 # pylint: enable=all
{data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/pdf_processing.py
RENAMED

@@ -36,6 +36,7 @@ from src.utils import (
     get_pdf_page_count,
     get_processor_name,
     run_background_tasks,
+    split_pdf_into_chunks,
     transform_schema_strings,
     validate_based_on_schema,
 )
@@ -195,46 +196,32 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
         result (dict): The structured data extracted from the document, formatted as JSON.
     """
     # Bundeskasse invoices contains all the required information in the first 3 pages.
-
-        extract_top_pages(file_content, num_pages=5)
-
-        else file_content
-    )
+    if input_doc_type == "bundeskasse":
+        file_content = extract_top_pages(file_content, num_pages=5)
+
     number_of_pages = get_pdf_page_count(file_content)
+    logger.info(f"processing {input_doc_type} with {number_of_pages} pages...")
 
-    #
-
-
-    # get the schema placeholder from the Doc AI and generate the response structure
-    response_schema = (
-        prompt_library.library[input_doc_type]["other"]["placeholders"]
-        if input_doc_type
-        in [
-            "partnerInvoice",
-            "customsInvoice",
-            "bundeskasse",
-            "commercialInvoice",
-            "packingList",
-        ]
-        else generate_schema_structure(params, input_doc_type)
-    )
+    # get the schema placeholder
+    response_schema = prompt_library.library[input_doc_type]["other"]["placeholders"]
 
     carrier = "other"
-
-
-
-
-
-    )
-    carrier_schema = prompt_library.library["preprocessing"]["carrier"][
-        "placeholders"
-    ][input_doc_type]
+    carrier_schema = (
+        prompt_library.library.get("preprocessing", {})
+        .get("carrier", {})
+        .get("placeholders", {})
+        .get(input_doc_type)
+    )
 
+    if carrier_schema:
         carrier_prompt = prompt_library.library["preprocessing"]["carrier"]["prompt"]
         carrier_prompt = carrier_prompt.replace(
             "DOCUMENT_TYPE_PLACEHOLDER", input_doc_type
         )
 
+        # convert file_content to required document
+        document = llm_client.prepare_document_for_gemini(file_content)
+
         # identify carrier for customized prompting
         carrier = await identify_carrier(
             document,
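The lookup is now failure-tolerant: document types without a carrier placeholder fall through to the default `carrier = "other"` instead of raising. A toy version of the change (library contents invented):

    library = {
        "preprocessing": {
            "carrier": {"placeholders": {"bookingConfirmation": {"type": "OBJECT"}}}
        }
    }

    for doc_type in ("bookingConfirmation", "partnerInvoice"):
        carrier_schema = (
            library.get("preprocessing", {})
            .get("carrier", {})
            .get("placeholders", {})
            .get(doc_type)  # None instead of KeyError when absent
        )
        print(doc_type, "->", "identify carrier" if carrier_schema else "keep 'other'")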
@@ -244,37 +231,115 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
         doc_type=input_doc_type,
     )
 
-
-    response_schema = prompt_library.library[input_doc_type][carrier][
-        "placeholders"
-    ]
-
+    # Select prompt
     if (
-        input_doc_type in prompt_library.library
-
+        input_doc_type not in prompt_library.library
+        or carrier not in prompt_library.library[input_doc_type]
     ):
-
-        prompt = prompt_library.library[input_doc_type][carrier]["prompt"]
+        return {}
 
-
-
-        response_schema = transform_schema_strings(response_schema)
+    # get the related prompt from predefined prompt library
+    prompt = prompt_library.library[input_doc_type][carrier]["prompt"]
 
-
-
+    # Add page-number extraction for moderately large docs
+    use_chunking = number_of_pages >= params["chunk_after"]
 
-
-
-
-
-
-
+    # Update schema and prompt to extract value-page_number pairs
+    if not use_chunking and number_of_pages > 1:
+        response_schema = transform_schema_strings(response_schema)
+        prompt += "\nFor each field, provide the page number where the information was found. The page numbering starts from 0."
+
+    tasks = []
+    # Process in chunks if number of pages exceeds threshold and Process all chunks concurrently
+    for chunk in (
+        split_pdf_into_chunks(file_content, chunk_size=params["chunk_size"])
+        if use_chunking
+        else [file_content]
+    ):
+        tasks.append(
+            process_chunk_with_retry(
+                chunk, prompt, response_schema, llm_client, input_doc_type
+            )
         )
 
-
+    results = await asyncio.gather(*tasks, return_exceptions=True)
 
-
-
+    if use_chunking:
+        return merge_llm_results(results, response_schema)
+    else:
+        return llm_prediction_to_tuples(results[0], number_of_pages=number_of_pages)
+
+
+async def process_chunk_with_retry(
+    chunk_content, prompt, response_schema, llm_client, input_doc_type, retries=2
+):
+    """Process a chunk with retries in case of failure."""
+    for attempt in range(1, retries + 1):
+        try:
+            return await process_chunk(
+                chunk_content=chunk_content,
+                prompt=prompt,
+                response_schema=response_schema,
+                llm_client=llm_client,
+                input_doc_type=input_doc_type,
+            )
+        except Exception as e:
+            logger.error(f"Chunk failed on attempt {attempt}: {e}")
+            if attempt == retries:
+                raise
+            await asyncio.sleep(1)  # small backoff
+
+
+async def process_chunk(
+    chunk_content, prompt, response_schema, llm_client, input_doc_type
+):
+    """Process a chunk with Gemini."""
+    document = llm_client.prepare_document_for_gemini(chunk_content)
+    return await llm_client.get_unified_json_genai(
+        prompt=prompt,
+        document=document,
+        response_schema=response_schema,
+        doc_type=input_doc_type,
+    )
+
+
+def merge_llm_results(results, response_schema):
+    """Merge LLM results from multiple chunks."""
+    merged = {}
+    for i, result in enumerate(results):
+        if not isinstance(result, dict):
+            continue
+        # Add page number to all values coming from this chunk
+        result = llm_prediction_to_tuples(result, number_of_pages=1, page_number=i)
+
+        # Merge the result into the final merged dictionary
+        for key, value in result.items():
+            field_type = (
+                response_schema["properties"].get(key, {}).get("type", "").upper()
+            )
+
+            if key not in merged:
+                if field_type == "ARRAY":
+                    # append the values as a list
+                    merged[key] = (
+                        value if isinstance(value, list) else ([value] if value else [])
+                    )
+                else:
+                    merged[key] = value
+                continue
+
+            if field_type == "ARRAY":
+                # append list contents across chunks
+                if isinstance(value, list):
+                    merged[key].extend(value)
+                else:
+                    merged[key].append(value)
+
+            # take first non-null value only
+            if merged[key] in (None, "", [], {}):
+                merged[key] = value
+
+    return merged
 
 
 async def extract_data_from_pdf_w_llm(params, input_doc_type, file_content, llm_client):
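The merge rules are easiest to see on toy data: ARRAY fields concatenate across chunks, scalar fields keep the first non-empty value, and each chunk's index doubles as the page number (hence the chunk_size=1 warning in src/constants.py). An invented two-chunk trace of the core rules:

    schema = {"properties": {"containers": {"type": "ARRAY"}, "blNumber": {"type": "STRING"}}}
    chunk_results = [
        {"blNumber": "", "containers": [{"containerNumber": "TCNU1234567"}]},            # chunk/page 0
        {"blNumber": "HLCUHAM123", "containers": [{"containerNumber": "MSKU7654321"}]},  # chunk/page 1
    ]

    merged = {}
    for result in chunk_results:
        for key, value in result.items():
            if schema["properties"].get(key, {}).get("type") == "ARRAY":
                merged.setdefault(key, []).extend(value)   # arrays grow across chunks
            elif merged.get(key) in (None, "", [], {}):
                merged[key] = value                        # first non-empty scalar wins

    print(len(merged["containers"]), merged["blNumber"])   # 2 HLCUHAM123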
{data_science_document_ai-1.43.6 → data_science_document_ai-1.51.0}/src/postprocessing/common.py
RENAMED

@@ -12,7 +12,7 @@ from src.constants import formatting_rules
 from src.io import logger
 from src.postprocessing.postprocess_partner_invoice import process_partner_invoice
 from src.prompts.prompt_library import prompt_library
-from src.utils import get_tms_mappings
+from src.utils import batch_fetch_all_mappings, get_tms_mappings
 
 tms_domain = os.environ["TMS_DOMAIN"]
 
@@ -134,8 +134,11 @@ def extract_number(data_field_value):
         formatted_value: string
 
     """
+    # Remove container size pattern like 20FT, 40HC, etc from 1 x 40HC
+    value = remove_unwanted_patterns(data_field_value)
+
     formatted_value = ""
-    for c in data_field_value:
+    for c in value:
         if c.isnumeric() or c in [",", ".", "-"]:
             formatted_value += c
 
@@ -320,9 +323,12 @@ def remove_unwanted_patterns(lineitem: str):
     lineitem = lineitem.replace("HIGH CUBE", "")
 
     # Remove container size e.g., 20FT, 40HC, etc.
-
-
-
+    pattern = [
+        f"{s}{t}"
+        for s in ("20|22|40|45".split("|"))
+        for t in ("FT|HC|DC|HD|GP|OT|RF|FR|TK|DV".split("|"))
+    ]
+    lineitem = re.sub(r"|".join(pattern), "", lineitem, flags=re.IGNORECASE).strip()
 
     return lineitem
 
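The comprehension expands to a 40-token alternation (20FT, 20HC, ..., 45DV); a quick check of the behavior, which extract_number above now relies on so that "1 x 40HC" yields 1 rather than 140:

    import re

    pattern = [
        f"{s}{t}"
        for s in "20|22|40|45".split("|")
        for t in "FT|HC|DC|HD|GP|OT|RF|FR|TK|DV".split("|")
    ]
    regex = r"|".join(pattern)  # "20FT|20HC|...|45DV"

    print(re.sub(regex, "", "1 x 40HC", flags=re.IGNORECASE).strip())  # "1 x"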
@@ -372,18 +378,45 @@ def clean_item_description(lineitem: str, remove_numbers: bool = True):
     return re.sub(r"\s{2,}", " ", lineitem).strip()
 
 
-async def format_label(entity_k, entity_value, document_type_code, params, mime_type):
+async def format_label(
+    entity_k,
+    entity_value,
+    document_type_code,
+    params,
+    mime_type,
+    container_map,
+    terminal_map,
+    depot_map,
+):
     llm_client = params["LlmClient"]
     if isinstance(entity_value, dict):  # if it's a nested entity
         format_tasks = [
-            format_label(sub_k, sub_v, document_type_code, params, mime_type)
+            format_label(
+                sub_k,
+                sub_v,
+                document_type_code,
+                params,
+                mime_type,
+                container_map,
+                terminal_map,
+                depot_map,
+            )
             for sub_k, sub_v in entity_value.items()
         ]
         return entity_k, {k: v for k, v in await asyncio.gather(*format_tasks)}
     if isinstance(entity_value, list):
         format_tasks = await asyncio.gather(
             *[
-                format_label(entity_k, sub_v, document_type_code, params, mime_type)
+                format_label(
+                    entity_k,
+                    sub_v,
+                    document_type_code,
+                    params,
+                    mime_type,
+                    container_map,
+                    terminal_map,
+                    depot_map,
+                )
                 for sub_v in entity_value
             ]
         )
@@ -405,13 +438,13 @@ async def format_label(entity_k, entity_value, document_type_code, params, mime_
     )
 
     elif (entity_key == "containertype") or (entity_key == "containersize"):
-        formatted_value =
+        formatted_value = container_map.get(entity_value)
 
     elif check_formatting_rule(entity_k, document_type_code, "terminal"):
-        formatted_value =
+        formatted_value = terminal_map.get(entity_value)
 
     elif check_formatting_rule(entity_k, document_type_code, "depot"):
-        formatted_value =
+        formatted_value = depot_map.get(entity_value)
 
     elif entity_key.startswith(("eta", "etd", "duedate", "issuedate", "servicedate")):
         try:
@@ -507,7 +540,8 @@ async def get_port_code_ai(port: str, llm_client, doc_type=None):
     """Get port code using AI model."""
     port_llm = await get_port_code_llm(port, llm_client, doc_type=doc_type)
 
-
+    result = await get_tms_mappings(port, "ports", port_llm)
+    return result.get(port, None)
 
 
 async def get_port_code_llm(port: str, llm_client, doc_type=None):
@@ -598,6 +632,74 @@ def decimal_convertor(value, quantity=False):
     return value
 
 
+async def collect_mapping_requests(entity_value, document_type_code):
+    """Collect all unique container types, terminals, and depots from the entity value."""
+    # Sets to store unique values
+    container_types = set()
+    terminals = set()
+    depots = set()
+
+    def walk(key, value):
+        key_lower = key.lower()
+
+        # nested dict
+        if isinstance(value, dict):
+            for k, v in value.items():
+                walk(k, v)
+
+        # list of values
+        elif isinstance(value, list):
+            for item in value:
+                walk(key, item)
+
+        # leaf node
+        else:
+            if key_lower in ("containertype", "containersize"):
+                # Take only "20DV" from ('20DV', 0) if it's a tuple
+                container_types.add(value[0]) if isinstance(
+                    value, tuple
+                ) else container_types.add(value)
+
+            elif check_formatting_rule(key, document_type_code, "terminal"):
+                terminals.add(value[0]) if isinstance(value, tuple) else terminals.add(
+                    value
+                )
+
+            elif check_formatting_rule(key, document_type_code, "depot"):
+                depots.add(value[0]) if isinstance(value, tuple) else depots.add(value)
+
+    walk("root", entity_value)
+
+    return container_types, terminals, depots
+
+
+async def format_all_labels(entity_data, document_type_code, params, mime_type):
+    """Format all labels in the entity data using cached mappings."""
+    # Collect all mapping values needed
+    container_req, terminal_req, depot_req = await collect_mapping_requests(
+        entity_data, document_type_code
+    )
+
+    # Batch fetch mappings
+    container_map, terminal_map, depot_map = await batch_fetch_all_mappings(
+        container_req, terminal_req, depot_req
+    )
+
+    # Format labels using cached mappings
+    _, result = await format_label(
+        "root",
+        entity_data,
+        document_type_code,
+        params,
+        mime_type,
+        container_map,
+        terminal_map,
+        depot_map,
+    )
+
+    return _, result
+
+
 async def format_all_entities(result, document_type_code, params, mime_type):
     """Format the entity values in the result dictionary."""
     # Since we treat `customsInvoice` same as `partnerInvoice`
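The new flow is collect-then-batch: one recursive pass gathers every value that needs a TMS lookup, one batched call resolves them, and format_label then reads from plain dicts instead of fetching per leaf. A toy illustration of the collection pass (data invented; the tuples are the (value, page_number) pairs produced upstream):

    entity_data = {
        "containers": [
            {"containerType": ("40HC", 0), "terminal": ("CTA", 1)},
            {"containerType": ("20DV", 2)},
        ]
    }

    def collect(value, found):
        # One pass over the nested result gathers every value needing a lookup.
        if isinstance(value, dict):
            for k, v in value.items():
                if k.lower() in ("containertype", "containersize"):
                    found.add(v[0] if isinstance(v, tuple) else v)
                else:
                    collect(v, found)
        elif isinstance(value, list):
            for item in value:
                collect(item, found)

    container_types = set()
    collect(entity_data, container_types)
    print(container_types)  # {'40HC', '20DV'} -> one batched lookup, not one per leaf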
@@ -613,13 +715,13 @@ async def format_all_entities(result, document_type_code, params, mime_type):
         return {}
 
     # Format all entities recursively
-    _, aggregated_data = await
-
+    _, aggregated_data = await format_all_labels(
+        result, document_type_code, params, mime_type
     )
 
     # Process partner invoice on lineitem mapping and reverse charge sentence
     if document_type_code in ["partnerInvoice", "bundeskasse"]:
-        process_partner_invoice(params, aggregated_data, document_type_code)
+        await process_partner_invoice(params, aggregated_data, document_type_code)
 
     logger.info("Data Extraction completed successfully")
     return aggregated_data
@@ -651,41 +753,46 @@ def remove_stop_words(lineitem: str):
     )
 
 
-def llm_prediction_to_tuples(llm_prediction, number_of_pages=-1):
+def llm_prediction_to_tuples(llm_prediction, number_of_pages=-1, page_number=None):
     """Convert LLM prediction dictionary to tuples of (value, page_number)."""
-
     # If only 1 page, simply pair each value with page number 0
     if number_of_pages == 1:
+        effective_page = 0 if page_number is None else page_number
         if isinstance(llm_prediction, dict):
             return {
-                k: llm_prediction_to_tuples(
+                k: llm_prediction_to_tuples(
+                    v, number_of_pages, page_number=effective_page
+                )
                 for k, v in llm_prediction.items()
             }
         elif isinstance(llm_prediction, list):
             return [
-                llm_prediction_to_tuples(v, number_of_pages)
+                llm_prediction_to_tuples(v, number_of_pages, page_number=effective_page)
+                for v in llm_prediction
             ]
         else:
-            return (llm_prediction,
+            return (llm_prediction, effective_page) if llm_prediction else None
 
     # logic for multi-page predictions
     if isinstance(llm_prediction, dict):
        if "page_number" in llm_prediction.keys() and "value" in llm_prediction.keys():
            if llm_prediction["value"]:
                try:
-
+                    _page_number = int(llm_prediction["page_number"])
                except:  # noqa: E722
-
-                    return (llm_prediction["value"],
+                    _page_number = -1
+                return (llm_prediction["value"], _page_number)
            return None
 
        for key, value in llm_prediction.items():
            llm_prediction[key] = llm_prediction_to_tuples(
-                llm_prediction.get(key, value), number_of_pages
+                llm_prediction.get(key, value), number_of_pages, page_number
            )
 
    elif isinstance(llm_prediction, list):
        for i, item in enumerate(llm_prediction):
-            llm_prediction[i] = llm_prediction_to_tuples(
+            llm_prediction[i] = llm_prediction_to_tuples(
+                item, number_of_pages, page_number
+            )
 
    return llm_prediction