data-science-document-ai 1.45.2__tar.gz → 1.56.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.56.1}/PKG-INFO +1 -1
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.56.1}/pyproject.toml +1 -1
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.56.1}/src/constants.py +4 -8
- data_science_document_ai-1.56.1/src/docai_processor_config.yaml +22 -0
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.56.1}/src/excel_processing.py +7 -17
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.56.1}/src/llm.py +0 -29
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.56.1}/src/pdf_processing.py +7 -16
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.56.1}/src/postprocessing/postprocess_partner_invoice.py +99 -30
- data_science_document_ai-1.56.1/src/prompts/library/arrivalNotice/other/placeholders.json +70 -0
- data_science_document_ai-1.56.1/src/prompts/library/arrivalNotice/other/prompt.txt +40 -0
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.56.1}/src/prompts/library/bundeskasse/other/placeholders.json +5 -5
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.56.1}/src/prompts/library/bundeskasse/other/prompt.txt +6 -4
- data_science_document_ai-1.56.1/src/prompts/library/customsAssessment/other/placeholders.json +70 -0
- data_science_document_ai-1.56.1/src/prompts/library/customsAssessment/other/prompt.txt +29 -0
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.56.1}/src/prompts/library/customsInvoice/other/prompt.txt +2 -1
- data_science_document_ai-1.56.1/src/prompts/library/deliveryOrder/other/placeholders.json +82 -0
- data_science_document_ai-1.56.1/src/prompts/library/deliveryOrder/other/prompt.txt +36 -0
- data_science_document_ai-1.56.1/src/prompts/library/draftMbl/other/placeholders.json +80 -0
- data_science_document_ai-1.56.1/src/prompts/library/draftMbl/other/prompt.txt +34 -0
- data_science_document_ai-1.56.1/src/prompts/library/finalMbL/other/placeholders.json +80 -0
- data_science_document_ai-1.56.1/src/prompts/library/finalMbL/other/prompt.txt +34 -0
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.56.1}/src/prompts/library/partnerInvoice/other/prompt.txt +3 -4
- data_science_document_ai-1.56.1/src/prompts/library/preprocessing/carrier/placeholders.json +14 -0
- data_science_document_ai-1.56.1/src/prompts/library/shippingInstruction/other/placeholders.json +115 -0
- data_science_document_ai-1.56.1/src/prompts/library/shippingInstruction/other/prompt.txt +28 -0
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.56.1}/src/setup.py +15 -16
- data_science_document_ai-1.45.2/src/docai_processor_config.yaml +0 -64
- data_science_document_ai-1.45.2/src/prompts/library/customsAssessment/other/prompt.txt +0 -42
- data_science_document_ai-1.45.2/src/prompts/library/deliveryOrder/other/placeholders.json +0 -29
- data_science_document_ai-1.45.2/src/prompts/library/deliveryOrder/other/prompt.txt +0 -50
- data_science_document_ai-1.45.2/src/prompts/library/draftMbl/hapag-lloyd/prompt.txt +0 -45
- data_science_document_ai-1.45.2/src/prompts/library/draftMbl/maersk/prompt.txt +0 -19
- data_science_document_ai-1.45.2/src/prompts/library/draftMbl/other/placeholders.json +0 -80
- data_science_document_ai-1.45.2/src/prompts/library/draftMbl/other/prompt.txt +0 -44
- data_science_document_ai-1.45.2/src/prompts/library/finalMbL/hapag-lloyd/prompt.txt +0 -44
- data_science_document_ai-1.45.2/src/prompts/library/finalMbL/maersk/prompt.txt +0 -19
- data_science_document_ai-1.45.2/src/prompts/library/finalMbL/other/prompt.txt +0 -44
- data_science_document_ai-1.45.2/src/prompts/library/preprocessing/carrier/placeholders.json +0 -30
- data_science_document_ai-1.45.2/src/prompts/library/shippingInstruction/other/prompt.txt +0 -16
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.56.1}/src/constants_sandbox.py +0 -0
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.56.1}/src/docai.py +0 -0
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.56.1}/src/io.py +0 -0
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.56.1}/src/log_setup.py +0 -0
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.56.1}/src/postprocessing/common.py +0 -0
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.56.1}/src/postprocessing/postprocess_booking_confirmation.py +0 -0
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.56.1}/src/postprocessing/postprocess_commercial_invoice.py +0 -0
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.56.1}/src/prompts/library/bookingConfirmation/evergreen/placeholders.json +0 -0
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.56.1}/src/prompts/library/bookingConfirmation/evergreen/prompt.txt +0 -0
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.56.1}/src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +0 -0
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.56.1}/src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +0 -0
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.56.1}/src/prompts/library/bookingConfirmation/maersk/placeholders.json +0 -0
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.56.1}/src/prompts/library/bookingConfirmation/maersk/prompt.txt +0 -0
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.56.1}/src/prompts/library/bookingConfirmation/msc/placeholders.json +0 -0
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.56.1}/src/prompts/library/bookingConfirmation/msc/prompt.txt +0 -0
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.56.1}/src/prompts/library/bookingConfirmation/oocl/placeholders.json +0 -0
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.56.1}/src/prompts/library/bookingConfirmation/oocl/prompt.txt +0 -0
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.56.1}/src/prompts/library/bookingConfirmation/other/placeholders.json +0 -0
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.56.1}/src/prompts/library/bookingConfirmation/other/prompt.txt +0 -0
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.56.1}/src/prompts/library/bookingConfirmation/yangming/placeholders.json +0 -0
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.56.1}/src/prompts/library/bookingConfirmation/yangming/prompt.txt +0 -0
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.56.1}/src/prompts/library/commercialInvoice/other/placeholders.json +0 -0
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.56.1}/src/prompts/library/commercialInvoice/other/prompt.txt +0 -0
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.56.1}/src/prompts/library/customsInvoice/other/placeholders.json +0 -0
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.56.1}/src/prompts/library/packingList/other/placeholders.json +0 -0
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.56.1}/src/prompts/library/packingList/other/prompt.txt +0 -0
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.56.1}/src/prompts/library/partnerInvoice/other/placeholders.json +0 -0
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.56.1}/src/prompts/library/postprocessing/port_code/placeholders.json +0 -0
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.56.1}/src/prompts/library/postprocessing/port_code/prompt_port_code.txt +0 -0
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.56.1}/src/prompts/library/preprocessing/carrier/prompt.txt +0 -0
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.56.1}/src/prompts/prompt_library.py +0 -0
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.56.1}/src/tms.py +0 -0
- {data_science_document_ai-1.45.2 → data_science_document_ai-1.56.1}/src/utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "data-science-document-ai"
|
|
3
|
-
version = "1.45.2"
|
|
3
|
+
version = "1.56.1"
|
|
4
4
|
description = "\"Document AI repo for data science\""
|
|
5
5
|
authors = ["Naomi Nguyen <naomi.nguyen@forto.com>", "Kumar Rajendrababu <kumar.rajendrababu@forto.com>", "Igor Tonko <igor.tonko@forto.com>", "Osman Demirel <osman.demirel@forto.com>"]
|
|
6
6
|
packages = [
|
|
@@ -20,10 +20,11 @@ project_parameters = {
|
|
|
20
20
|
# Fuzzy lookup
|
|
21
21
|
"g_model_fuzzy_lookup_folder": "fuzzy_lookup",
|
|
22
22
|
"item_code_lookup": "line_item_kvp_table.json",
|
|
23
|
+
"intermodal_partners": "intermodal_partners.json",
|
|
23
24
|
"invoice_classification_lookup": "invoice_classification.json",
|
|
24
25
|
"reverse_charge_sentence_lookup": "reverse_charge_sentences.json",
|
|
25
26
|
# Fuzzy logic params
|
|
26
|
-
"fuzzy_threshold_item_code":
|
|
27
|
+
"fuzzy_threshold_item_code": 92,
|
|
27
28
|
"fuzzy_threshold_reverse_charge": 80,
|
|
28
29
|
"fuzzy_threshold_invoice_classification": 70,
|
|
29
30
|
# Chunking params
|
|
@@ -53,13 +54,6 @@ project_parameters = {
|
|
|
53
54
|
"model_selector": {
|
|
54
55
|
"stable": {
|
|
55
56
|
"bookingConfirmation": 1,
|
|
56
|
-
"finalMbL": 0,
|
|
57
|
-
"draftMbl": 0,
|
|
58
|
-
"arrivalNotice": 0,
|
|
59
|
-
"shippingInstruction": 0,
|
|
60
|
-
"customsAssessment": 0,
|
|
61
|
-
"deliveryOrder": 0,
|
|
62
|
-
"partnerInvoice": 0,
|
|
63
57
|
},
|
|
64
58
|
"beta": {
|
|
65
59
|
"bookingConfirmation": 0,
|
|
@@ -87,8 +81,10 @@ project_parameters = {
|
|
|
87
81
|
# Key to combine the LLM results with the Doc Ai results
|
|
88
82
|
"key_to_combine": {
|
|
89
83
|
"bookingConfirmation": ["transportLegs"],
|
|
84
|
+
"arrivalNotice": ["containers"],
|
|
90
85
|
"finalMbL": ["containers"],
|
|
91
86
|
"draftMbl": ["containers"],
|
|
87
|
+
"deliveryOrder": ["Equipment", "TransportLeg"],
|
|
92
88
|
"customsAssessment": ["containers"],
|
|
93
89
|
"packingList": ["skuData"],
|
|
94
90
|
"commercialInvoice": ["skus"],
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
models_project_id: "738250249861"
|
|
2
|
+
model_config:
|
|
3
|
+
stable:
|
|
4
|
+
bookingConfirmation:
|
|
5
|
+
- id: "dc3e714cd168aeaa"
|
|
6
|
+
details:
|
|
7
|
+
display_name: "doc_cap_bookingConfirmation"
|
|
8
|
+
author: "reet.kanjilal@forto.com"
|
|
9
|
+
created_date: ""
|
|
10
|
+
- id: "3c280b11bdb3ed89"
|
|
11
|
+
details:
|
|
12
|
+
display_name: "doc_cap_BC_mlg"
|
|
13
|
+
author: "igor.tonko@forto.com"
|
|
14
|
+
created_date: ""
|
|
15
|
+
|
|
16
|
+
beta:
|
|
17
|
+
bookingConfirmation:
|
|
18
|
+
- id: "3c280b11bdb3ed89"
|
|
19
|
+
details:
|
|
20
|
+
display_name: "doc_cap_BC_mlg"
|
|
21
|
+
author: "igor.tonko@forto.com"
|
|
22
|
+
created_date: ""
|
|
@@ -11,9 +11,8 @@ import asyncio
|
|
|
11
11
|
import numpy as np
|
|
12
12
|
import pandas as pd
|
|
13
13
|
|
|
14
|
-
from src.llm import prompt_excel_extraction
|
|
15
14
|
from src.prompts.prompt_library import prompt_library
|
|
16
|
-
from src.utils import estimate_page_count,
|
|
15
|
+
from src.utils import estimate_page_count, get_excel_sheets
|
|
17
16
|
|
|
18
17
|
|
|
19
18
|
async def extract_data_from_sheet(
|
|
@@ -29,11 +28,14 @@ async def extract_data_from_sheet(
|
|
|
29
28
|
)
|
|
30
29
|
|
|
31
30
|
# Prompt for the LLM JSON
|
|
32
|
-
|
|
31
|
+
prompt = prompt_library.library[doc_type]["other"]["prompt"]
|
|
32
|
+
|
|
33
|
+
# Join the worksheet content with the prompt
|
|
34
|
+
prompt = worksheet + "\n" + prompt
|
|
33
35
|
|
|
34
36
|
try:
|
|
35
37
|
result = await llm_client.get_unified_json_genai(
|
|
36
|
-
|
|
38
|
+
prompt,
|
|
37
39
|
response_schema=response_schema,
|
|
38
40
|
doc_type=doc_type,
|
|
39
41
|
)
|
|
@@ -67,19 +69,7 @@ async def extract_data_from_excel(
|
|
|
67
69
|
|
|
68
70
|
"""
|
|
69
71
|
# Generate the response structure
|
|
70
|
-
response_schema =
|
|
71
|
-
prompt_library.library[input_doc_type]["other"]["placeholders"]
|
|
72
|
-
if input_doc_type
|
|
73
|
-
in [
|
|
74
|
-
"partnerInvoice",
|
|
75
|
-
"customsInvoice",
|
|
76
|
-
"bundeskasse",
|
|
77
|
-
"commercialInvoice",
|
|
78
|
-
"packingList",
|
|
79
|
-
"bookingConfirmation",
|
|
80
|
-
]
|
|
81
|
-
else generate_schema_structure(params, input_doc_type)
|
|
82
|
-
)
|
|
72
|
+
response_schema = prompt_library.library[input_doc_type]["other"]["placeholders"]
|
|
83
73
|
|
|
84
74
|
# Load the Excel file and get ONLY the "visible" sheet names
|
|
85
75
|
sheets, workbook = get_excel_sheets(file_content, mime_type)
|
|
@@ -201,33 +201,4 @@ class LlmClient:
|
|
|
201
201
|
return response
|
|
202
202
|
|
|
203
203
|
|
|
204
|
-
def prompt_excel_extraction(excel_structured_text):
|
|
205
|
-
"""Write a prompt to extract data from Excel files.
|
|
206
|
-
|
|
207
|
-
Args:
|
|
208
|
-
excel_structured_text (str): The structured text of the Excel file.
|
|
209
|
-
|
|
210
|
-
Returns:
|
|
211
|
-
prompt str: The prompt for common json.
|
|
212
|
-
"""
|
|
213
|
-
prompt = f"""{excel_structured_text}
|
|
214
|
-
|
|
215
|
-
Task: Fill in the following dictionary from the information in the given in the above excel data.
|
|
216
|
-
|
|
217
|
-
Instructions:
|
|
218
|
-
- Do not change the keys of the following dictionary.
|
|
219
|
-
- The values should be filled in as per the schema provided below.
|
|
220
|
-
- If an entity contains a 'display_name', consider its properties as child data points in the below format.
|
|
221
|
-
{{'data-field': {{
|
|
222
|
-
'child-data-field': 'type -occurrence_type- description',
|
|
223
|
-
}}
|
|
224
|
-
}}
|
|
225
|
-
- The entity with 'display_name' can be extracted multiple times. Please pay attention to the occurrence_type.
|
|
226
|
-
- Ensure the schema reflects the hierarchical relationship.
|
|
227
|
-
- Use the data field description to understand the context of the data.
|
|
228
|
-
|
|
229
|
-
"""
|
|
230
|
-
return prompt
|
|
231
|
-
|
|
232
|
-
|
|
233
204
|
# pylint: enable=all
|
|
@@ -32,7 +32,6 @@ from src.postprocessing.postprocess_partner_invoice import (
|
|
|
32
32
|
from src.prompts.prompt_library import prompt_library
|
|
33
33
|
from src.utils import (
|
|
34
34
|
extract_top_pages,
|
|
35
|
-
generate_schema_structure,
|
|
36
35
|
get_pdf_page_count,
|
|
37
36
|
get_processor_name,
|
|
38
37
|
run_background_tasks,
|
|
@@ -202,20 +201,8 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
|
|
|
202
201
|
number_of_pages = get_pdf_page_count(file_content)
|
|
203
202
|
logger.info(f"processing {input_doc_type} with {number_of_pages} pages...")
|
|
204
203
|
|
|
205
|
-
# get the schema placeholder
|
|
206
|
-
response_schema =
|
|
207
|
-
prompt_library.library[input_doc_type]["other"]["placeholders"]
|
|
208
|
-
if input_doc_type
|
|
209
|
-
in [
|
|
210
|
-
"partnerInvoice",
|
|
211
|
-
"customsInvoice",
|
|
212
|
-
"bundeskasse",
|
|
213
|
-
"commercialInvoice",
|
|
214
|
-
"packingList",
|
|
215
|
-
"bookingConfirmation",
|
|
216
|
-
]
|
|
217
|
-
else generate_schema_structure(params, input_doc_type)
|
|
218
|
-
)
|
|
204
|
+
# get the schema placeholder
|
|
205
|
+
response_schema = prompt_library.library[input_doc_type]["other"]["placeholders"]
|
|
219
206
|
|
|
220
207
|
carrier = "other"
|
|
221
208
|
carrier_schema = (
|
|
@@ -270,7 +257,11 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
|
|
|
270
257
|
):
|
|
271
258
|
tasks.append(
|
|
272
259
|
process_chunk_with_retry(
|
|
273
|
-
chunk,
|
|
260
|
+
chunk,
|
|
261
|
+
prompt,
|
|
262
|
+
response_schema,
|
|
263
|
+
llm_client,
|
|
264
|
+
input_doc_type,
|
|
274
265
|
)
|
|
275
266
|
)
|
|
276
267
|
|
|
@@ -1,4 +1,6 @@
|
|
|
1
1
|
"""This module contains the postprocessing functions for the partner invoice."""
|
|
2
|
+
from collections import defaultdict
|
|
3
|
+
|
|
2
4
|
from rapidfuzz import fuzz, process
|
|
3
5
|
|
|
4
6
|
from src.io import logger
|
|
@@ -103,9 +105,18 @@ def post_process_bundeskasse(aggregated_data):
|
|
|
103
105
|
)
|
|
104
106
|
|
|
105
107
|
# Check if the deferredDutyPayer is forto
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
108
|
+
KEYWORDS = {"de789147263644738", "forto", "009812"}
|
|
109
|
+
|
|
110
|
+
def is_forto_recipient(line_item: dict) -> bool:
|
|
111
|
+
values_to_check = [
|
|
112
|
+
line_item.get("deferredDutyPayer", {}).get("documentValue", ""),
|
|
113
|
+
line_item.get("vatId", {}).get("documentValue", ""),
|
|
114
|
+
]
|
|
115
|
+
|
|
116
|
+
combined = " ".join(values_to_check).lower()
|
|
117
|
+
return any(keyword in combined for keyword in KEYWORDS)
|
|
118
|
+
|
|
119
|
+
if is_forto_recipient(line_item):
|
|
109
120
|
is_recipient_forto = True
|
|
110
121
|
|
|
111
122
|
update_recipient_and_vendor(aggregated_data, is_recipient_forto)
|
|
@@ -134,6 +145,20 @@ def update_recipient_and_vendor(aggregated_data, is_recipient_forto):
|
|
|
134
145
|
] = "Dasbachstraße 15, 54292 Trier, Germany"
|
|
135
146
|
|
|
136
147
|
|
|
148
|
+
def select_unique_bank_account(bank_account):
    """Deduplicate extracted bank accounts, keeping one entry per distinct value.

    Args:
        bank_account: The ``bankAccount`` entry of the aggregated data. When it
            is a non-empty list, every element must be a dict providing the
            ``documentValue`` and ``page`` keys.

    Returns:
        For a non-empty list: a new list holding, for each distinct
        ``documentValue``, the item with the lowest ``page`` (the first one
        seen wins on ties), in first-seen order of the values.
        For anything else (a single dict, ``None``, an empty list): the input
        itself, unchanged, so the caller never overwrites a valid entry.

    Raises:
        KeyError: If a list element lacks ``documentValue`` or ``page``.
    """
    # Nothing to deduplicate: hand the value back untouched instead of
    # falling through without a result.
    if not isinstance(bank_account, list) or not bank_account:
        return bank_account

    best = {}
    for item in bank_account:
        value = item["documentValue"]
        current = best.get(value)
        # Strict "<" keeps the first occurrence when pages are equal.
        if current is None or item["page"] < current["page"]:
            best[value] = item

    return list(best.values())
|
|
160
|
+
|
|
161
|
+
|
|
137
162
|
async def process_partner_invoice(params, aggregated_data, document_type_code):
|
|
138
163
|
"""Process the partner invoice data."""
|
|
139
164
|
# Post process bundeskasse invoices
|
|
@@ -141,6 +166,11 @@ async def process_partner_invoice(params, aggregated_data, document_type_code):
|
|
|
141
166
|
post_process_bundeskasse(aggregated_data)
|
|
142
167
|
return
|
|
143
168
|
|
|
169
|
+
if "bankAccount" in aggregated_data:
|
|
170
|
+
aggregated_data["bankAccount"] = select_unique_bank_account(
|
|
171
|
+
aggregated_data["bankAccount"]
|
|
172
|
+
)
|
|
173
|
+
|
|
144
174
|
line_items = aggregated_data.get("lineItem", [])
|
|
145
175
|
# Add debug logging
|
|
146
176
|
logger.info(f"Processing partnerInvoice with {len(line_items)} line items")
|
|
@@ -158,15 +188,20 @@ async def process_partner_invoice(params, aggregated_data, document_type_code):
|
|
|
158
188
|
reverse_charge_info["formattedValue"] = reverse_charge_value
|
|
159
189
|
reverse_charge = aggregated_data.pop("reverseChargeSentence", None)
|
|
160
190
|
|
|
191
|
+
# Partner Name
|
|
192
|
+
partner_name = aggregated_data.get("vendorName", {}).get("documentValue", None)
|
|
193
|
+
|
|
161
194
|
# Process everything in one go
|
|
162
|
-
processed_items = await process_line_items_batch(
|
|
195
|
+
processed_items = await process_line_items_batch(
|
|
196
|
+
params, line_items, reverse_charge, partner_name
|
|
197
|
+
)
|
|
163
198
|
|
|
164
199
|
# Update your main data structure
|
|
165
200
|
aggregated_data["lineItem"] = processed_items
|
|
166
201
|
|
|
167
202
|
|
|
168
203
|
async def process_line_items_batch(
|
|
169
|
-
params: dict, line_items: list[dict], reverse_charge=None
|
|
204
|
+
params: dict, line_items: list[dict], reverse_charge=None, partner_name=None
|
|
170
205
|
):
|
|
171
206
|
"""
|
|
172
207
|
Processes all line items efficiently using a "Split-Apply-Combine" strategy.
|
|
@@ -204,23 +239,12 @@ async def process_line_items_batch(
|
|
|
204
239
|
|
|
205
240
|
# Batch API Call for Embedding lookups
|
|
206
241
|
if pending_line_items:
|
|
207
|
-
|
|
208
|
-
logger.info(f"Mapping {len(values_to_fetch)} line items from Embedding API...")
|
|
209
|
-
|
|
210
|
-
# Await the batch response {"desc1": "code1", "desc2": "code2"}
|
|
211
|
-
api_results = await get_tms_mappings(
|
|
212
|
-
input_list=values_to_fetch, embedding_type="line_items"
|
|
213
|
-
)
|
|
242
|
+
code_map = await fetch_line_item_codes(pending_line_items, partner_name, params)
|
|
214
243
|
|
|
215
|
-
# Merge API results back into original list
|
|
216
244
|
for index, desc in pending_line_items.items():
|
|
217
|
-
# Get result from API response, or None if API failed for that item
|
|
218
|
-
forto_code = api_results.get(desc)
|
|
219
|
-
|
|
220
|
-
# Update the original item
|
|
221
245
|
line_items[index]["itemCode"] = {
|
|
222
246
|
"documentValue": desc,
|
|
223
|
-
"formattedValue":
|
|
247
|
+
"formattedValue": code_map.get(desc),
|
|
224
248
|
"page": line_items[index]["lineItemDescription"].get("page"),
|
|
225
249
|
}
|
|
226
250
|
|
|
@@ -229,8 +253,12 @@ async def process_line_items_batch(
|
|
|
229
253
|
[
|
|
230
254
|
item.update({"reverseChargeSentence": reverse_charge})
|
|
231
255
|
for item in line_items
|
|
232
|
-
if
|
|
256
|
+
if (
|
|
257
|
+
(item.get("itemCode") and item["itemCode"]["formattedValue"] != "CDU")
|
|
258
|
+
or not item.get("itemCode")
|
|
259
|
+
)
|
|
233
260
|
]
|
|
261
|
+
|
|
234
262
|
return line_items
|
|
235
263
|
|
|
236
264
|
|
|
@@ -272,11 +300,14 @@ def if_reverse_charge_sentence(sentence: str, params):
|
|
|
272
300
|
return False
|
|
273
301
|
|
|
274
302
|
# Check if the sentence is similar to any of the reverse charge sentences
|
|
275
|
-
|
|
276
|
-
sentence, reverse_charge_sentences, threshold
|
|
303
|
+
match, _ = get_fuzzy_match_score(
|
|
304
|
+
sentence, list(reverse_charge_sentences.keys()), threshold
|
|
277
305
|
)
|
|
278
306
|
|
|
279
|
-
|
|
307
|
+
if match:
|
|
308
|
+
return reverse_charge_sentences[match]
|
|
309
|
+
|
|
310
|
+
return False
|
|
280
311
|
|
|
281
312
|
|
|
282
313
|
def find_matching_lineitem(new_lineitem: str, kvp_dict: dict, threshold=90):
|
|
@@ -307,12 +338,13 @@ def find_matching_lineitem(new_lineitem: str, kvp_dict: dict, threshold=90):
|
|
|
307
338
|
return None
|
|
308
339
|
|
|
309
340
|
|
|
310
|
-
async def associate_forto_item_code(line_item_data, params):
|
|
341
|
+
async def associate_forto_item_code(line_item_data, params, partner_name=None):
|
|
311
342
|
"""
|
|
312
343
|
Associates Forto item codes to a list of line item descriptions.
|
|
313
344
|
Args:
|
|
314
345
|
line_item_data (dict): A dictionary where keys are original descriptions and values are cleaned descriptions.
|
|
315
346
|
params (dict): Parameters containing lookup data and thresholds.
|
|
347
|
+
partner_name (str, optional): The name of the partner for context in matching. Defaults to None.
|
|
316
348
|
|
|
317
349
|
Returns:
|
|
318
350
|
list: A list of dictionaries with 'description' and 'itemCode' keys.
|
|
@@ -334,14 +366,51 @@ async def associate_forto_item_code(line_item_data, params):
|
|
|
334
366
|
|
|
335
367
|
# Batch API Call for Embedding lookups
|
|
336
368
|
if pending_line_items:
|
|
337
|
-
|
|
338
|
-
input_list=list(pending_line_items.values()),
|
|
339
|
-
embedding_type="line_items",
|
|
340
|
-
)
|
|
369
|
+
code_map = await fetch_line_item_codes(pending_line_items, partner_name, params)
|
|
341
370
|
|
|
342
|
-
# Merge API results back into original list
|
|
343
371
|
for desc, f_desc in pending_line_items.items():
|
|
344
|
-
|
|
345
|
-
|
|
372
|
+
result.append(
|
|
373
|
+
{
|
|
374
|
+
"description": desc,
|
|
375
|
+
"itemCode": code_map.get(f_desc),
|
|
376
|
+
}
|
|
377
|
+
)
|
|
378
|
+
|
|
379
|
+
return result
|
|
380
|
+
|
|
381
|
+
|
|
382
|
+
async def fetch_line_item_codes(
    pending_line_items: dict,
    partner_name: str | None,
    params: dict,
):
    """Resolve item codes for the pending descriptions through the embedding API.

    Args:
        pending_line_items: Mapping whose values are the descriptions to look up.
        partner_name: Optional partner name. When truthy, its upper-cased form
            is matched (threshold 87) against
            ``params["lookup_data"]["intermodal_partners"]``.
        params: Parameters dict providing ``lookup_data``.

    Returns:
        dict: ``{original_description: mapped_code_or_None}``
    """
    # Only attempt the partner lookup when a name was actually supplied.
    partner_match = None
    if partner_name:
        partner_match = find_matching_lineitem(
            partner_name.upper(),
            params["lookup_data"]["intermodal_partners"],
            threshold=87,
        )

    # Each distinct description is sent once.
    unique_descs = list(set(pending_line_items.values()))
    logger.info(f"Mapping {len(unique_descs)} line items from Embedding API...")

    # Description -> string sent to the API; a partner match is prepended as a prefix.
    api_input_map = {}
    for desc in unique_descs:
        if partner_match:
            api_input_map[desc] = f"{partner_match} - {desc}"
        else:
            api_input_map[desc] = desc

    api_results = await get_tms_mappings(
        input_list=list(api_input_map.values()),
        embedding_type="line_items",
    )

    # Key the response by the un-prefixed descriptions the callers use.
    codes_by_desc = {}
    for original_desc, api_desc in api_input_map.items():
        codes_by_desc[original_desc] = api_results.get(api_desc)
    return codes_by_desc
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
{
|
|
2
|
+
"type": "OBJECT",
|
|
3
|
+
"properties": {
|
|
4
|
+
"bookingNumber": {
|
|
5
|
+
"type": "STRING",
|
|
6
|
+
"nullable": true,
|
|
7
|
+
"description": "The booking number associated with the Arrival Notice document. They are often referred to as 'Booking Number', 'Booking No.', 'Booking Ref.', 'Booking Reference', 'Booking ID', 'carrier's reference' or 'Order Ref'."
|
|
8
|
+
},
|
|
9
|
+
"destinationTerminal": {
|
|
10
|
+
"type": "STRING",
|
|
11
|
+
"nullable": true,
|
|
12
|
+
"description": "The terminal at the destination port where the container will be delivered."
|
|
13
|
+
},
|
|
14
|
+
"eta": {
|
|
15
|
+
"type": "STRING",
|
|
16
|
+
"nullable": true,
|
|
17
|
+
"description": "Estimated Time of Arrival (ETA) is the expected date when the shipment will arrive at its destination."
|
|
18
|
+
},
|
|
19
|
+
"mblNumber": {
|
|
20
|
+
"type": "STRING",
|
|
21
|
+
"nullable": true,
|
|
22
|
+
"description": "Bill of Lading number (B/L NO.), a document issued by the carrier."
|
|
23
|
+
},
|
|
24
|
+
"portOfDischarge": {
|
|
25
|
+
"type": "STRING",
|
|
26
|
+
"nullable": true,
|
|
27
|
+
"description": "The port where the goods are discharged from the vessel. This is the destination port for the shipment."
|
|
28
|
+
},
|
|
29
|
+
"vesselName": {
|
|
30
|
+
"type": "STRING",
|
|
31
|
+
"nullable": true,
|
|
32
|
+
"description": "The name of the vessel carrying the shipment."
|
|
33
|
+
},
|
|
34
|
+
"containers": {
|
|
35
|
+
"type": "ARRAY",
|
|
36
|
+
"items": {
|
|
37
|
+
"type": "OBJECT",
|
|
38
|
+
"properties": {
|
|
39
|
+
"containerNumber": {
|
|
40
|
+
"type": "STRING",
|
|
41
|
+
"nullable": true,
|
|
42
|
+
"description": "The unique identifier for each container. It always starts with 4 capital letters and followed by 7 digits. Example: TEMU7972458."
|
|
43
|
+
},
|
|
44
|
+
"containerType": {
|
|
45
|
+
"type": "STRING",
|
|
46
|
+
"nullable": true,
|
|
47
|
+
"description": "The size of the container associated with the containerNumber, such as 20ft, 40ft, 40HC, 20DC etc."
|
|
48
|
+
},
|
|
49
|
+
"grossWeight": {
|
|
50
|
+
"type": "STRING",
|
|
51
|
+
"nullable": true,
|
|
52
|
+
"description": "The gross weight of the container. Usually mentioned as G.W or GW or Gross Weight, etc."
|
|
53
|
+
},
|
|
54
|
+
"measurements": {
|
|
55
|
+
"type": "STRING",
|
|
56
|
+
"nullable": true,
|
|
57
|
+
"description": "The volume of the container. Usually, it is measured in 'Cubic Meter (cbm)' or dimensions. But volume in 'cbm' is preferred."
|
|
58
|
+
},
|
|
59
|
+
"sealNumber": {
|
|
60
|
+
"type": "STRING",
|
|
61
|
+
"nullable": true,
|
|
62
|
+
"description": "The seal number associated with the container Number. But it is not same as the container number."
|
|
63
|
+
}
|
|
64
|
+
},
|
|
65
|
+
"required": ["containerNumber", "containerType", "grossWeight"]
|
|
66
|
+
}
|
|
67
|
+
}
|
|
68
|
+
},
|
|
69
|
+
"required": ["bookingNumber", "destinationTerminal", "eta", "portOfDischarge", "vesselName", "containers"]
|
|
70
|
+
}
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
<PERSONA> You are an efficient document entity data extraction specialist working for a Freight Forwarding company. </PERSONA>
|
|
2
|
+
|
|
3
|
+
<TASK> Your task is to extract data from Arrival Notice documents as per the given response schema structure. </TASK>
|
|
4
|
+
|
|
5
|
+
<CONTEXT>
|
|
6
|
+
The Freight Forwarding company receives Arrival Notice from shipping lines.
|
|
7
|
+
These documents contain various details related to arrival of a shipment to the port of destination such as container numbers, estimated time of arrival, vessel details and containers information.
|
|
8
|
+
They may be written in different languages such as English, German, Italian and can appear in a variety of formats and layouts.
|
|
9
|
+
Your role is to accurately extract specific entities from these Arrival Notices to support efficient processing and accurate record-keeping.
|
|
10
|
+
</CONTEXT>
|
|
11
|
+
|
|
12
|
+
<INSTRUCTIONS>
|
|
13
|
+
- Populate fields as defined in the response schema.
|
|
14
|
+
- Multiple Containers entries may exist, capture all instances under "containers".
|
|
15
|
+
- Use the data field description to understand the context of the data.
|
|
16
|
+
|
|
17
|
+
- bookingNumber:
|
|
18
|
+
- Booking numbers are unique identifiers for shipments. They are often referred to as "Booking Number", "Booking No.", "Booking Ref.", "Booking Reference", "Booking ID", "SACO-Pos.", "Order Ref", "Unsere Referenz", or "Unsere Position"
|
|
19
|
+
- If there is a unique_id that starts with "S" followed by 6 or 8 digits, it is a shipmentID, not a bookingNumber.
|
|
20
|
+
|
|
21
|
+
- destinationTerminal:
|
|
22
|
+
- Destination Terminal can also be referred to as "Destination Terminal", "Pickup Location", "Delivery Location", "Delivery Terminal", "Empfangsort", "Entladeort", or "Abladestelle".
|
|
23
|
+
|
|
24
|
+
- mblNumber:
|
|
25
|
+
- Commonly known as "Bill of Lading Number", "BILL OF LADING NO.", "BL Number", "BL No.", "B/L No.", "BL-Nr.", "B/L", "HBL No.", or "M-AWB Nummer".
|
|
26
|
+
- Bill of Lading Number is known as mblNumber. Not a shipmentID even if it starts with "S".
|
|
27
|
+
- mblNumber from Hapag-Lloyd always starts with HLC.... (e.g., "HLCUTS12303AWNT3") and is labelled "SEA WAYBILL" or "SWB-NR."
|
|
28
|
+
|
|
29
|
+
- eta:
|
|
30
|
+
- Estimated Time of Arrival (ETA) is the expected date and time when the shipment will arrive at the destination port.
|
|
31
|
+
- It can be referred to as "ETA", "Estimated Arrival", "Voraussichtliche Ankunft", "Ankunftszeit", "Arrivo", "Due to arrive at Terminal"
|
|
32
|
+
|
|
33
|
+
- vesselName:
|
|
34
|
+
- Vessel Name is the name of the ship carrying the cargo. It can be referred to as "Vessel", "Ship Name", "Schiff", "Schiffsname", "Nave", or "Vessel/Flight No.".
|
|
35
|
+
|
|
36
|
+
- containers: Details of each container on the arrival notice. Make sure to extract each container information separately.
|
|
37
|
+
- containerNumber: Container Number consists of 4 capital letters followed by 7 digits (e.g., TEMU7972458, CAIU 7222892).
|
|
38
|
+
- sealNumber: Seal numbers are unique identifiers for shipping seals. They are usually mentioned as seal numbers in the document but they are definitely not container numbers.
|
|
39
|
+
|
|
40
|
+
</INSTRUCTIONS>
|
|
@@ -1,15 +1,15 @@
|
|
|
1
1
|
{
|
|
2
2
|
"type": "OBJECT",
|
|
3
3
|
"properties": {
|
|
4
|
-
"
|
|
4
|
+
"grandTotal": {
|
|
5
5
|
"type": "STRING",
|
|
6
6
|
"nullable": true,
|
|
7
|
-
"description": "The
|
|
7
|
+
"description": "The overall total amount of the invoice. It can be found with the key words Gesamtabgabenbetrag, Gesamtbetrag, or Zu erstattender Abgabenbetrag"
|
|
8
8
|
},
|
|
9
|
-
"
|
|
9
|
+
"currencyCode": {
|
|
10
10
|
"type": "STRING",
|
|
11
11
|
"nullable": true,
|
|
12
|
-
"description": "The
|
|
12
|
+
"description": "The currency in which the invoice is issued. Extract the currency associated with the grand total (grandTotal) amount. It is majorly mentioned as EUR, Euro or €."
|
|
13
13
|
},
|
|
14
14
|
"issueDate": {
|
|
15
15
|
"type": "STRING",
|
|
@@ -54,7 +54,7 @@
|
|
|
54
54
|
"deferredDutyPayer": {
|
|
55
55
|
"type": "STRING",
|
|
56
56
|
"nullable": true,
|
|
57
|
-
"description": "It can be identified under
|
|
57
|
+
"description": "It can be identified under 'Aufschubnehmer' for each line item"
|
|
58
58
|
},
|
|
59
59
|
"name": {
|
|
60
60
|
"type": "STRING",
|
|
@@ -13,6 +13,7 @@ Your role is to accurately extract specific entities from these Customs invoices
|
|
|
13
13
|
- Populate fields as defined in the response schema.
|
|
14
14
|
- Multiple line item entries may exist, capture all instances under "lineItem".
|
|
15
15
|
- Use the data field description to understand the context of the data.
|
|
16
|
+
- The currency is always EUR, both for grandTotal and for the line item amounts.
|
|
16
17
|
|
|
17
18
|
- containerNumber:
|
|
18
19
|
- Container Number consists of 4 capital letters followed by 7 digits (e.g., TEMU7972458, CAIU7222892).
|
|
@@ -31,15 +32,16 @@ Your role is to accurately extract specific entities from these Customs invoices
|
|
|
31
32
|
- Credit Note Invoice Number is a unique identifier for the credit note, it starts with "ATS" only (e.g., ATS.....).
|
|
32
33
|
- NIZZA is not a credit note invoice number.
|
|
33
34
|
|
|
34
|
-
- grandTotal
|
|
35
|
+
- grandTotal:
|
|
36
|
+
- It can be found with the keywords Gesamtabgabenbetrag or Gesamtbetrag. In credit notes, it can be found under "Zu erstattender Abgabenbetrag".
|
|
37
|
+
- grandTotal value is always or mostly mentioned in EUR currency as it is issued by German Customs.
|
|
35
38
|
|
|
36
39
|
- serviceDate can also be referred to as "Zollanmeldung" or "Eingangdatum" in the invoice.
|
|
37
40
|
- issueDate can also be referred to as "Einfuhrabgabenbescheid" in the invoice. issueDate and serviceDate can be same in some cases.
|
|
38
41
|
- vendor details can be "Hauptzollamt" details in the top portion of the invoice.
|
|
39
42
|
|
|
40
|
-
- lineItem:
|
|
41
|
-
-
|
|
42
|
-
- totalAmount in the Credit Note is the Differenzbetrag in the line items.
|
|
43
|
+
- lineItem: Each line item should be extracted only once. Give priority to the first occurrence of the line item details in the document.
|
|
44
|
+
- totalAmount in the Credit Note is the Differenzbetrag in the line items. The totalAmount value is always or mostly mentioned in EUR currency.
|
|
43
45
|
- deferredDutyPayer can be identified under "Aufschubnehmer" for each line item. It is a combination of number code and entity.
|
|
44
46
|
|
|
45
47
|
You can usually find all the information in the top 2 pages of the invoice.
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
{
|
|
2
|
+
"type": "OBJECT",
|
|
3
|
+
"properties": {
|
|
4
|
+
"consignee": {
|
|
5
|
+
"type": "STRING",
|
|
6
|
+
"nullable": true,
|
|
7
|
+
"description": "The receiver or buyer of the goods. It can be found with keywords like Importeur, Anmelder, Empfanger, Consignee, Buyer, Receiver, etc."
|
|
8
|
+
},
|
|
9
|
+
"countryOfOrigin": {
|
|
10
|
+
"type": "STRING",
|
|
11
|
+
"nullable": true,
|
|
12
|
+
"description": "The country where the goods were manufactured or produced. It can be identified as Land van oorsprong, Ursprungsland in the document."
|
|
13
|
+
},
|
|
14
|
+
"MRN": {
|
|
15
|
+
"type": "STRING",
|
|
16
|
+
"nullable": true,
|
|
17
|
+
"description": "Movement Reference Number (MRN) is a unique identifier assigned to each customs declaration for goods being imported or exported within the European Union (EU). It is used to track and monitor the movement of goods across EU member states. It can be found with MRN, Reg. Nr., Reg. Kennzeichen, etc."
|
|
18
|
+
},
|
|
19
|
+
"shipper": {
|
|
20
|
+
"type": "STRING",
|
|
21
|
+
"nullable": true,
|
|
22
|
+
"description": "The seller or shipper of the goods. It is often indicated by the term Shipper, Speditore, Esportatore, Exporteur, Versender."
|
|
23
|
+
},
|
|
24
|
+
"totalValueOfGoods": {
|
|
25
|
+
"type": "STRING",
|
|
26
|
+
"nullable": true,
|
|
27
|
+
"description": "The total monetary value of the goods being shipped, usually declared for customs purposes. It can be found with Waarde, Warenwert, Factuurwaarde, Invoice Value, etc.."
|
|
28
|
+
},
|
|
29
|
+
"containers": {
|
|
30
|
+
"type": "ARRAY",
|
|
31
|
+
"items": {
|
|
32
|
+
"type": "OBJECT",
|
|
33
|
+
"properties": {
|
|
34
|
+
"containerNumber": {
|
|
35
|
+
"type": "STRING",
|
|
36
|
+
"nullable": true,
|
|
37
|
+
"description": "The unique identifier for each container. It always starts with 4 capital letters and followed by 7 digits. Example: TEMU7972458."
|
|
38
|
+
},
|
|
39
|
+
"goodsDescription": {
|
|
40
|
+
"type": "STRING",
|
|
41
|
+
"nullable": true,
|
|
42
|
+
"description": "A brief description of the goods contained within the container. It can be found with goods description, Bezeichnung, goederenomschrijving."
|
|
43
|
+
},
|
|
44
|
+
"grossWeight": {
|
|
45
|
+
"type": "STRING",
|
|
46
|
+
"nullable": true,
|
|
47
|
+
"description": "The gross weight of the container. Usually mentioned as G.W or GW, Bruto, or Gross Weight, etc.."
|
|
48
|
+
},
|
|
49
|
+
"nettWeight": {
|
|
50
|
+
"type": "STRING",
|
|
51
|
+
"nullable": true,
|
|
52
|
+
"description": "The net weight of the goods inside the container. Usually mentioned as N.W or NW, Net Weight, or Netto, Eigenmasse, etc.."
|
|
53
|
+
},
|
|
54
|
+
"packagingNumber": {
|
|
55
|
+
"type": "STRING",
|
|
56
|
+
"nullable": true,
|
|
57
|
+
"description": "The quantity of the goods. Usually, the quantity is in pallets, PLT, cartons, CTNS, pieces, PCS, packages, boxes, etc. Please prioritize the packaging types based on their size, as follows: Pallets (PLT) >> Cartons (CTNS) >> Pieces (PCS). Extract the Larger packaging types that will have a lower count."
|
|
58
|
+
},
|
|
59
|
+
"packagingType": {
|
|
60
|
+
"type": "STRING",
|
|
61
|
+
"nullable": true,
|
|
62
|
+
"description": "The packaging type is the unit of packagingNumber. Example: pallets, PLT, cartons, CTNS, pieces, PCS, packages, etc. Sometimes, the packaging type is available in the column name of the packagingNumber."
|
|
63
|
+
}
|
|
64
|
+
},
|
|
65
|
+
"required": ["containerNumber", "goodsDescription", "grossWeight", "nettWeight", "packagingNumber", "packagingType"]
|
|
66
|
+
}
|
|
67
|
+
}
|
|
68
|
+
},
|
|
69
|
+
"required": ["countryOfOrigin", "MRN", "totalValueOfGoods", "containers"]
|
|
70
|
+
}
|