data-science-document-ai 1.42.5__py3-none-any.whl → 1.56.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_science_document_ai-1.42.5.dist-info → data_science_document_ai-1.56.1.dist-info}/METADATA +2 -2
- {data_science_document_ai-1.42.5.dist-info → data_science_document_ai-1.56.1.dist-info}/RECORD +34 -31
- src/constants.py +7 -10
- src/docai_processor_config.yaml +0 -56
- src/excel_processing.py +24 -14
- src/io.py +23 -0
- src/llm.py +0 -29
- src/pdf_processing.py +156 -51
- src/postprocessing/common.py +172 -28
- src/postprocessing/postprocess_partner_invoice.py +194 -59
- src/prompts/library/arrivalNotice/other/placeholders.json +70 -0
- src/prompts/library/arrivalNotice/other/prompt.txt +40 -0
- src/prompts/library/bundeskasse/other/placeholders.json +5 -5
- src/prompts/library/bundeskasse/other/prompt.txt +7 -5
- src/prompts/library/commercialInvoice/other/placeholders.json +125 -0
- src/prompts/library/commercialInvoice/other/prompt.txt +1 -1
- src/prompts/library/customsAssessment/other/placeholders.json +70 -0
- src/prompts/library/customsAssessment/other/prompt.txt +24 -37
- src/prompts/library/customsInvoice/other/prompt.txt +4 -3
- src/prompts/library/deliveryOrder/other/placeholders.json +80 -27
- src/prompts/library/deliveryOrder/other/prompt.txt +26 -40
- src/prompts/library/draftMbl/other/placeholders.json +33 -33
- src/prompts/library/draftMbl/other/prompt.txt +34 -44
- src/prompts/library/finalMbL/other/placeholders.json +80 -0
- src/prompts/library/finalMbL/other/prompt.txt +34 -44
- src/prompts/library/packingList/other/placeholders.json +98 -0
- src/prompts/library/partnerInvoice/other/prompt.txt +8 -7
- src/prompts/library/preprocessing/carrier/placeholders.json +0 -16
- src/prompts/library/shippingInstruction/other/placeholders.json +115 -0
- src/prompts/library/shippingInstruction/other/prompt.txt +26 -14
- src/prompts/prompt_library.py +0 -4
- src/setup.py +15 -16
- src/utils.py +120 -68
- src/prompts/library/draftMbl/hapag-lloyd/prompt.txt +0 -45
- src/prompts/library/draftMbl/maersk/prompt.txt +0 -19
- src/prompts/library/finalMbL/hapag-lloyd/prompt.txt +0 -44
- src/prompts/library/finalMbL/maersk/prompt.txt +0 -19
- {data_science_document_ai-1.42.5.dist-info → data_science_document_ai-1.56.1.dist-info}/WHEEL +0 -0
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
"""This module contains the postprocessing functions for the partner invoice."""
|
|
2
|
-
from
|
|
2
|
+
from collections import defaultdict
|
|
3
3
|
|
|
4
|
-
from
|
|
4
|
+
from rapidfuzz import fuzz, process
|
|
5
5
|
|
|
6
6
|
from src.io import logger
|
|
7
7
|
from src.utils import get_tms_mappings
|
|
@@ -105,9 +105,18 @@ def post_process_bundeskasse(aggregated_data):
|
|
|
105
105
|
)
|
|
106
106
|
|
|
107
107
|
# Check if the deferredDutyPayer is forto
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
108
|
+
KEYWORDS = {"de789147263644738", "forto", "009812"}
|
|
109
|
+
|
|
110
|
+
def is_forto_recipient(line_item: dict) -> bool:
|
|
111
|
+
values_to_check = [
|
|
112
|
+
line_item.get("deferredDutyPayer", {}).get("documentValue", ""),
|
|
113
|
+
line_item.get("vatId", {}).get("documentValue", ""),
|
|
114
|
+
]
|
|
115
|
+
|
|
116
|
+
combined = " ".join(values_to_check).lower()
|
|
117
|
+
return any(keyword in combined for keyword in KEYWORDS)
|
|
118
|
+
|
|
119
|
+
if is_forto_recipient(line_item):
|
|
111
120
|
is_recipient_forto = True
|
|
112
121
|
|
|
113
122
|
update_recipient_and_vendor(aggregated_data, is_recipient_forto)
|
|
@@ -136,13 +145,32 @@ def update_recipient_and_vendor(aggregated_data, is_recipient_forto):
|
|
|
136
145
|
] = "Dasbachstraße 15, 54292 Trier, Germany"
|
|
137
146
|
|
|
138
147
|
|
|
139
|
-
def
|
|
148
|
+
def select_unique_bank_account(bank_account):
|
|
149
|
+
# Select the unique bank account if multiple are present
|
|
150
|
+
if isinstance(bank_account, list) and bank_account:
|
|
151
|
+
best = defaultdict(lambda: None)
|
|
152
|
+
|
|
153
|
+
for item in bank_account:
|
|
154
|
+
dv = item["documentValue"]
|
|
155
|
+
if best[dv] is None or item["page"] < best[dv]["page"]:
|
|
156
|
+
best[dv] = item
|
|
157
|
+
|
|
158
|
+
unique = list(best.values())
|
|
159
|
+
return unique
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
async def process_partner_invoice(params, aggregated_data, document_type_code):
|
|
140
163
|
"""Process the partner invoice data."""
|
|
141
164
|
# Post process bundeskasse invoices
|
|
142
165
|
if document_type_code == "bundeskasse":
|
|
143
166
|
post_process_bundeskasse(aggregated_data)
|
|
144
167
|
return
|
|
145
168
|
|
|
169
|
+
if "bankAccount" in aggregated_data:
|
|
170
|
+
aggregated_data["bankAccount"] = select_unique_bank_account(
|
|
171
|
+
aggregated_data["bankAccount"]
|
|
172
|
+
)
|
|
173
|
+
|
|
146
174
|
line_items = aggregated_data.get("lineItem", [])
|
|
147
175
|
# Add debug logging
|
|
148
176
|
logger.info(f"Processing partnerInvoice with {len(line_items)} line items")
|
|
@@ -160,27 +188,78 @@ def process_partner_invoice(params, aggregated_data, document_type_code):
|
|
|
160
188
|
reverse_charge_info["formattedValue"] = reverse_charge_value
|
|
161
189
|
reverse_charge = aggregated_data.pop("reverseChargeSentence", None)
|
|
162
190
|
|
|
163
|
-
#
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
191
|
+
# Partner Name
|
|
192
|
+
partner_name = aggregated_data.get("vendorName", {}).get("documentValue", None)
|
|
193
|
+
|
|
194
|
+
# Process everything in one go
|
|
195
|
+
processed_items = await process_line_items_batch(
|
|
196
|
+
params, line_items, reverse_charge, partner_name
|
|
197
|
+
)
|
|
170
198
|
|
|
171
|
-
|
|
172
|
-
|
|
199
|
+
# Update your main data structure
|
|
200
|
+
aggregated_data["lineItem"] = processed_items
|
|
173
201
|
|
|
174
|
-
if reverse_charge:
|
|
175
|
-
# Distribute reverseChargeSentence to all line items
|
|
176
|
-
line_item["reverseChargeSentence"] = reverse_charge
|
|
177
|
-
line_item["reverseChargeSentence"]["page"] = reverse_charge["page"]
|
|
178
202
|
|
|
203
|
+
async def process_line_items_batch(
|
|
204
|
+
params: dict, line_items: list[dict], reverse_charge=None, partner_name=None
|
|
205
|
+
):
|
|
206
|
+
"""
|
|
207
|
+
Processes all line items efficiently using a "Split-Apply-Combine" strategy.
|
|
208
|
+
"""
|
|
209
|
+
# To store items that need external API lookup
|
|
210
|
+
pending_line_items = {}
|
|
211
|
+
|
|
212
|
+
# Check Fuzzy Matching
|
|
213
|
+
logger.info(f"Mapping line item codes with Fuzzy matching....")
|
|
214
|
+
for i, item in enumerate(line_items):
|
|
215
|
+
description_obj = item.get("lineItemDescription")
|
|
216
|
+
|
|
217
|
+
if not description_obj or not description_obj.get("formattedValue"):
|
|
218
|
+
continue
|
|
219
|
+
# Get the formatted description text
|
|
220
|
+
desc = description_obj["formattedValue"]
|
|
221
|
+
|
|
222
|
+
# Find Fuzzy Match
|
|
223
|
+
matched_code = find_matching_lineitem(
|
|
224
|
+
desc,
|
|
225
|
+
params["lookup_data"]["item_code"],
|
|
226
|
+
params["fuzzy_threshold_item_code"],
|
|
227
|
+
)
|
|
228
|
+
|
|
229
|
+
if matched_code:
|
|
230
|
+
# Set the code to the line item
|
|
231
|
+
item["itemCode"] = {
|
|
232
|
+
"documentValue": desc,
|
|
233
|
+
"formattedValue": matched_code,
|
|
234
|
+
"page": description_obj.get("page"),
|
|
235
|
+
}
|
|
236
|
+
else:
|
|
237
|
+
# Store for batch API call
|
|
238
|
+
pending_line_items[i] = desc
|
|
239
|
+
|
|
240
|
+
# Batch API Call for Embedding lookups
|
|
241
|
+
if pending_line_items:
|
|
242
|
+
code_map = await fetch_line_item_codes(pending_line_items, partner_name, params)
|
|
243
|
+
|
|
244
|
+
for index, desc in pending_line_items.items():
|
|
245
|
+
line_items[index]["itemCode"] = {
|
|
246
|
+
"documentValue": desc,
|
|
247
|
+
"formattedValue": code_map.get(desc),
|
|
248
|
+
"page": line_items[index]["lineItemDescription"].get("page"),
|
|
249
|
+
}
|
|
250
|
+
|
|
251
|
+
# Add reverse charge here if exists
|
|
252
|
+
if reverse_charge:
|
|
253
|
+
[
|
|
254
|
+
item.update({"reverseChargeSentence": reverse_charge})
|
|
255
|
+
for item in line_items
|
|
256
|
+
if (
|
|
257
|
+
(item.get("itemCode") and item["itemCode"]["formattedValue"] != "CDU")
|
|
258
|
+
or not item.get("itemCode")
|
|
259
|
+
)
|
|
260
|
+
]
|
|
179
261
|
|
|
180
|
-
|
|
181
|
-
"""Compute the fuzzy matching score between a new line item and a key."""
|
|
182
|
-
new_lineitem, key = args
|
|
183
|
-
return key, fuzz.ratio(new_lineitem, key)
|
|
262
|
+
return line_items
|
|
184
263
|
|
|
185
264
|
|
|
186
265
|
def get_fuzzy_match_score(target: str, sentences: list, threshold: int):
|
|
@@ -195,16 +274,18 @@ def get_fuzzy_match_score(target: str, sentences: list, threshold: int):
|
|
|
195
274
|
tuple: (best_match, score) if above threshold, else (None, 0)
|
|
196
275
|
"""
|
|
197
276
|
# Use multiprocessing to find the best match
|
|
198
|
-
|
|
199
|
-
|
|
277
|
+
result = process.extractOne(
|
|
278
|
+
target, sentences, scorer=fuzz.WRatio, score_cutoff=threshold
|
|
279
|
+
)
|
|
280
|
+
|
|
281
|
+
if result is None:
|
|
282
|
+
return None, False
|
|
200
283
|
|
|
201
|
-
|
|
202
|
-
best_match, best_score = max(results, key=lambda x: x[1], default=(None, 0))
|
|
284
|
+
match, score, index = result
|
|
203
285
|
|
|
204
|
-
# return best_match
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
return best_match, True
|
|
286
|
+
# return best_match if the best match score is above a threshold (e.g., 80)
|
|
287
|
+
if match:
|
|
288
|
+
return match, True
|
|
208
289
|
|
|
209
290
|
return None, False
|
|
210
291
|
|
|
@@ -219,11 +300,14 @@ def if_reverse_charge_sentence(sentence: str, params):
|
|
|
219
300
|
return False
|
|
220
301
|
|
|
221
302
|
# Check if the sentence is similar to any of the reverse charge sentences
|
|
222
|
-
|
|
223
|
-
sentence, reverse_charge_sentences, threshold
|
|
303
|
+
match, _ = get_fuzzy_match_score(
|
|
304
|
+
sentence, list(reverse_charge_sentences.keys()), threshold
|
|
224
305
|
)
|
|
225
306
|
|
|
226
|
-
|
|
307
|
+
if match:
|
|
308
|
+
return reverse_charge_sentences[match]
|
|
309
|
+
|
|
310
|
+
return False
|
|
227
311
|
|
|
228
312
|
|
|
229
313
|
def find_matching_lineitem(new_lineitem: str, kvp_dict: dict, threshold=90):
|
|
@@ -236,46 +320,97 @@ def find_matching_lineitem(new_lineitem: str, kvp_dict: dict, threshold=90):
|
|
|
236
320
|
Returns:
|
|
237
321
|
str: The best matching 'Forto SLI' value from the dictionary.
|
|
238
322
|
"""
|
|
239
|
-
new_lineitem = new_lineitem.upper()
|
|
240
|
-
|
|
241
323
|
# Check if the new line item is already in the dictionary
|
|
242
324
|
if new_lineitem in kvp_dict:
|
|
243
325
|
return kvp_dict[new_lineitem]
|
|
244
326
|
|
|
245
327
|
# Get the best fuzzy match score for the extracted line item
|
|
246
|
-
|
|
247
|
-
new_lineitem,
|
|
328
|
+
match, _ = get_fuzzy_match_score(
|
|
329
|
+
new_lineitem,
|
|
330
|
+
list(kvp_dict.keys()),
|
|
331
|
+
threshold,
|
|
248
332
|
)
|
|
249
333
|
|
|
250
|
-
|
|
251
|
-
|
|
334
|
+
if match:
|
|
335
|
+
# find the code from the kvp_dict
|
|
336
|
+
return kvp_dict[match]
|
|
252
337
|
|
|
253
|
-
|
|
254
|
-
"""
|
|
255
|
-
Finds a match for the input string using fuzzy matching first, then embedding fallback.
|
|
338
|
+
return None
|
|
256
339
|
|
|
257
|
-
1. Tries to find a fuzzy match for input_string against the keys in
|
|
258
|
-
mapping_data using RapidFuzz, requiring a score >= fuzzy_threshold.
|
|
259
|
-
2. If found, returns the corresponding value from mapping_data.
|
|
260
|
-
3. If not found above threshold, calls the embedding_fallback function.
|
|
261
340
|
|
|
341
|
+
async def associate_forto_item_code(line_item_data, params, partner_name=None):
|
|
342
|
+
"""
|
|
343
|
+
Associates Forto item codes to a list of line item descriptions.
|
|
262
344
|
Args:
|
|
263
|
-
|
|
264
|
-
params: Parameters containing
|
|
345
|
+
line_item_data (dict): A dictionary where keys are original descriptions and values are cleaned descriptions.
|
|
346
|
+
params (dict): Parameters containing lookup data and thresholds.
|
|
347
|
+
partner_name (str, optional): The name of the partner for context in matching. Defaults to None.
|
|
265
348
|
|
|
266
349
|
Returns:
|
|
267
|
-
|
|
350
|
+
list: A list of dictionaries with 'description' and 'itemCode' keys.
|
|
268
351
|
"""
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
352
|
+
|
|
353
|
+
result = []
|
|
354
|
+
pending_line_items = {}
|
|
355
|
+
for desc, f_desc in line_item_data.items():
|
|
356
|
+
# Get the Forto item code using fuzzy matching
|
|
357
|
+
code = find_matching_lineitem(
|
|
358
|
+
new_lineitem=f_desc,
|
|
359
|
+
kvp_dict=params["lookup_data"]["item_code"],
|
|
360
|
+
threshold=params["fuzzy_threshold_item_code"],
|
|
361
|
+
)
|
|
362
|
+
if code:
|
|
363
|
+
result.append({"description": desc, "itemCode": code})
|
|
364
|
+
else:
|
|
365
|
+
pending_line_items[desc] = f_desc
|
|
366
|
+
|
|
367
|
+
# Batch API Call for Embedding lookups
|
|
368
|
+
if pending_line_items:
|
|
369
|
+
code_map = await fetch_line_item_codes(pending_line_items, partner_name, params)
|
|
370
|
+
|
|
371
|
+
for desc, f_desc in pending_line_items.items():
|
|
372
|
+
result.append(
|
|
373
|
+
{
|
|
374
|
+
"description": desc,
|
|
375
|
+
"itemCode": code_map.get(f_desc),
|
|
376
|
+
}
|
|
377
|
+
)
|
|
378
|
+
|
|
379
|
+
return result
|
|
380
|
+
|
|
381
|
+
|
|
382
|
+
async def fetch_line_item_codes(
|
|
383
|
+
pending_line_items: dict,
|
|
384
|
+
partner_name: str | None,
|
|
385
|
+
params: dict,
|
|
386
|
+
):
|
|
387
|
+
"""Returns: {original_description: mapped_code_or_None}"""
|
|
388
|
+
t_mode = (
|
|
389
|
+
find_matching_lineitem(
|
|
390
|
+
partner_name.upper(),
|
|
391
|
+
params["lookup_data"]["intermodal_partners"],
|
|
392
|
+
threshold=87,
|
|
393
|
+
)
|
|
394
|
+
if partner_name
|
|
395
|
+
else None
|
|
274
396
|
)
|
|
275
397
|
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
forto_item_code = get_tms_mappings(input_string, "line_items")
|
|
398
|
+
unique_descs = list(set(pending_line_items.values()))
|
|
399
|
+
logger.info(f"Mapping {len(unique_descs)} line items from Embedding API...")
|
|
279
400
|
|
|
280
|
-
|
|
401
|
+
# Build API input map
|
|
402
|
+
api_input_map = {
|
|
403
|
+
desc: f"{t_mode} - {desc}" if t_mode else desc for desc in unique_descs
|
|
404
|
+
}
|
|
405
|
+
|
|
406
|
+
api_results = await get_tms_mappings(
|
|
407
|
+
input_list=list(api_input_map.values()),
|
|
408
|
+
embedding_type="line_items",
|
|
409
|
+
)
|
|
410
|
+
|
|
411
|
+
# Normalize response back to original descriptions
|
|
412
|
+
result = {
|
|
413
|
+
original_desc: api_results.get(api_desc)
|
|
414
|
+
for original_desc, api_desc in api_input_map.items()
|
|
415
|
+
}
|
|
281
416
|
return result
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
{
|
|
2
|
+
"type": "OBJECT",
|
|
3
|
+
"properties": {
|
|
4
|
+
"bookingNumber": {
|
|
5
|
+
"type": "STRING",
|
|
6
|
+
"nullable": true,
|
|
7
|
+
"description": "The booking number associated with the Arrival Notice document. They are often referred to as 'Booking Number', 'Booking No.', 'Booking Ref.', 'Booking Reference', 'Booking ID', 'carrier's reference' or 'Order Ref'."
|
|
8
|
+
},
|
|
9
|
+
"destinationTerminal": {
|
|
10
|
+
"type": "STRING",
|
|
11
|
+
"nullable": true,
|
|
12
|
+
"description": "The terminal at the destination port where the container will be delivered."
|
|
13
|
+
},
|
|
14
|
+
"eta": {
|
|
15
|
+
"type": "STRING",
|
|
16
|
+
"nullable": true,
|
|
17
|
+
"description": "Estimated Time of Arrival (ETA) is the expected date when the shipment will arrive at its destination."
|
|
18
|
+
},
|
|
19
|
+
"mblNumber": {
|
|
20
|
+
"type": "STRING",
|
|
21
|
+
"nullable": true,
|
|
22
|
+
"description": "Bill of Lading number (B/L NO.), a document issued by the carrier."
|
|
23
|
+
},
|
|
24
|
+
"portOfDischarge": {
|
|
25
|
+
"type": "STRING",
|
|
26
|
+
"nullable": true,
|
|
27
|
+
"description": "The port where the goods are discharged from the vessel. This is the destination port for the shipment."
|
|
28
|
+
},
|
|
29
|
+
"vesselName": {
|
|
30
|
+
"type": "STRING",
|
|
31
|
+
"nullable": true,
|
|
32
|
+
"description": "The name of the vessel carrying the shipment."
|
|
33
|
+
},
|
|
34
|
+
"containers": {
|
|
35
|
+
"type": "ARRAY",
|
|
36
|
+
"items": {
|
|
37
|
+
"type": "OBJECT",
|
|
38
|
+
"properties": {
|
|
39
|
+
"containerNumber": {
|
|
40
|
+
"type": "STRING",
|
|
41
|
+
"nullable": true,
|
|
42
|
+
"description": "The unique identifier for each container. It always starts with 4 capital letters and followed by 7 digits. Example: TEMU7972458."
|
|
43
|
+
},
|
|
44
|
+
"containerType": {
|
|
45
|
+
"type": "STRING",
|
|
46
|
+
"nullable": true,
|
|
47
|
+
"description": "The size of the container associated with the containerNumber, such as 20ft, 40ft, 40HC, 20DC etc."
|
|
48
|
+
},
|
|
49
|
+
"grossWeight": {
|
|
50
|
+
"type": "STRING",
|
|
51
|
+
"nullable": true,
|
|
52
|
+
"description": "The gross weight of the container. Usually mentioned as G.W or GW or Gross Weight, etc.."
|
|
53
|
+
},
|
|
54
|
+
"measurements": {
|
|
55
|
+
"type": "STRING",
|
|
56
|
+
"nullable": true,
|
|
57
|
+
"description": "The volume of the container. Usually, it is measured in 'Cubic Meter (cbm)' or dimensions. But volume in 'cbm' is preferred."
|
|
58
|
+
},
|
|
59
|
+
"sealNumber": {
|
|
60
|
+
"type": "STRING",
|
|
61
|
+
"nullable": true,
|
|
62
|
+
"description": "The seal number associated with the container Number. But it is not same as the container number."
|
|
63
|
+
}
|
|
64
|
+
},
|
|
65
|
+
"required": ["containerNumber", "containerType", "grossWeight"]
|
|
66
|
+
}
|
|
67
|
+
}
|
|
68
|
+
},
|
|
69
|
+
"required": ["bookingNumber", "destinationTerminal", "eta", "portOfDischarge", "vesselName", "containers"]
|
|
70
|
+
}
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
<PERSONA> You are an efficient document entity data extraction specialist working for a Freight Forwarding company. <PERSONA>
|
|
2
|
+
|
|
3
|
+
<TASK> Your task is to extract data from Arrival Notice documents as per the given response schema structure. <TASK>
|
|
4
|
+
|
|
5
|
+
<CONTEXT>
|
|
6
|
+
The Freight Forwarding company receives Arrival Notice from shipping lines.
|
|
7
|
+
These documents contain various details related to arrival of a shipment to the port of destination such as container numbers, estimated time of arrival, vessel details and containers information.
|
|
8
|
+
They may be written in different languages such as English, German, Italian and can appear in a variety of formats and layouts.
|
|
9
|
+
Your role is to accurately extract specific entities from these Arrival Notices to support efficient processing and accurate record-keeping.
|
|
10
|
+
<CONTEXT>
|
|
11
|
+
|
|
12
|
+
<INSTRUCTIONS>
|
|
13
|
+
- Populate fields as defined in the response schema.
|
|
14
|
+
- Multiple Containers entries may exist, capture all instances under "containers".
|
|
15
|
+
- Use the data field description to understand the context of the data.
|
|
16
|
+
|
|
17
|
+
- bookingNumber:
|
|
18
|
+
- Booking numbers are unique identifiers for shipments. They are often referred to as "Booking Number", "Booking No.", "Booking Ref.", "Booking Reference", "Booking ID", "SACO-Pos.", "Order Ref", "Unsere Referenz", or "Unsere Position"
|
|
19
|
+
- If there is a unique_id that starts with "S" followed by 6 or 8 digits, it is a shipmentID, not a bookingNumber.
|
|
20
|
+
|
|
21
|
+
- destinationTerminal:
|
|
22
|
+
- Destination Terminal can also be referred to as "Destination Terminal", "Pickup Location", "Delivery Location", "Delivery Terminal", "Empfangsort", "Entladeort", or "Abladestelle".
|
|
23
|
+
|
|
24
|
+
- mblNumber:
|
|
25
|
+
- Commonly known as "Bill of Lading Number", "BILL OF LADING NO.", "BL Number", "BL No.", "B/L No.", "BL-Nr.", "B/L", "HBL No.", or "M-AWB Nummer".
|
|
26
|
+
- Bill of Lading Number is known as mblNumber. Not a shipmentID even if it starts with "S".
|
|
27
|
+
- mblNumber from Hapag-Lloyd always starts with HLC.... (e.g., "HLCUTS12303AWNT3") and is named as "SEA WAYBILL" or "SWB-NR.".
|
|
28
|
+
|
|
29
|
+
- eta:
|
|
30
|
+
- Estimated Time of Arrival (ETA) is the expected date and time when the shipment will arrive at the destination port.
|
|
31
|
+
- It can be referred to as "ETA", "Estimated Arrival", "Voraussichtliche Ankunft", "Ankunftszeit", "Arrivo", "Due to arrive at Terminal"
|
|
32
|
+
|
|
33
|
+
- vesselName:
|
|
34
|
+
- Vessel Name is the name of the ship carrying the cargo. It can be referred to as "Vessel", "Ship Name", "Schiff", "Schiffsname", "Nave", or "Vessel/Flight No.".
|
|
35
|
+
|
|
36
|
+
- containers: Details of each container on the arrival notice. Make sure to extract each container information separately.
|
|
37
|
+
- containerNumber: Container Number consists of 4 capital letters followed by 7 digits (e.g., TEMU7972458, CAIU 7222892).
|
|
38
|
+
- sealNumber: Seal numbers are unique identifiers for shipping seals. They are usually mentioned as seal numbers in the document but they are definitely not container numbers.
|
|
39
|
+
|
|
40
|
+
<INSTRUCTIONS>
|
|
@@ -1,15 +1,15 @@
|
|
|
1
1
|
{
|
|
2
2
|
"type": "OBJECT",
|
|
3
3
|
"properties": {
|
|
4
|
-
"
|
|
4
|
+
"grandTotal": {
|
|
5
5
|
"type": "STRING",
|
|
6
6
|
"nullable": true,
|
|
7
|
-
"description": "The
|
|
7
|
+
"description": "The overall total amount of the invoice. It can be found with the key words Gesamtabgabenbetrag, Gesamtbetrag, or Zu erstattender Abgabenbetrag"
|
|
8
8
|
},
|
|
9
|
-
"
|
|
9
|
+
"currencyCode": {
|
|
10
10
|
"type": "STRING",
|
|
11
11
|
"nullable": true,
|
|
12
|
-
"description": "The
|
|
12
|
+
"description": "The currency in which the invoice is issued. Extract the currency associated with the grand total (grandTotal) amount. It is majorly mentioned as EUR, Euro or €."
|
|
13
13
|
},
|
|
14
14
|
"issueDate": {
|
|
15
15
|
"type": "STRING",
|
|
@@ -54,7 +54,7 @@
|
|
|
54
54
|
"deferredDutyPayer": {
|
|
55
55
|
"type": "STRING",
|
|
56
56
|
"nullable": true,
|
|
57
|
-
"description": "It can be identified under
|
|
57
|
+
"description": "It can be identified under 'Aufschubenhmer' for each line item"
|
|
58
58
|
},
|
|
59
59
|
"name": {
|
|
60
60
|
"type": "STRING",
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
<PERSONA> You are an efficient document entity data extraction specialist working for a Freight Forwarding company. <PERSONA>
|
|
2
2
|
|
|
3
|
-
<TASK>Your task is to extract data
|
|
3
|
+
<TASK> Your task is to extract data from customs invoice documents as per the given response schema structure. <TASK>
|
|
4
4
|
|
|
5
5
|
<CONTEXT>
|
|
6
6
|
The Freight Forwarding company receives Customs invoices from Customs Brokers called Bundeskasse.
|
|
@@ -13,6 +13,7 @@ Your role is to accurately extract specific entities from these Customs invoices
|
|
|
13
13
|
- Populate fields as defined in the response schema.
|
|
14
14
|
- Multiple line item entries may exist, capture all instances under "lineItem".
|
|
15
15
|
- Use the data field description to understand the context of the data.
|
|
16
|
+
- The amounts are always in EUR, both for grandTotal and for line items.
|
|
16
17
|
|
|
17
18
|
- containerNumber:
|
|
18
19
|
- Container Number consists of 4 capital letters followed by 7 digits (e.g., TEMU7972458, CAIU7222892).
|
|
@@ -31,15 +32,16 @@ Your role is to accurately extract specific entities from these Customs invoices
|
|
|
31
32
|
- Credit Note Invoice Number is a unique identifier for the credit note, it starts with "ATS" only (e.g., ATS.....).
|
|
32
33
|
- NIZZA is not a credit note invoice number.
|
|
33
34
|
|
|
34
|
-
- grandTotal
|
|
35
|
+
- grandTotal:
|
|
36
|
+
- It can be found with the key words Gesamtabgabenbetrag or Gesamtbetrag. In credit notes, it can be found under "Zu erstattender Abgabenbetrag".
|
|
37
|
+
- grandTotal value is always or mostly mentioned in EUR currency as it is issued by German Customs.
|
|
35
38
|
|
|
36
39
|
- serviceDate can also be referred to as "Zollanmeldung" or "Eingangdatum" in the invoice.
|
|
37
40
|
- issueDate can also be referred to as "Einfuhrabgabenbescheid" in the invoice. issueDate and serviceDate can be same in some cases.
|
|
38
41
|
- vendor details can be "Hauptzollamt" details in the top portion of the invoice.
|
|
39
42
|
|
|
40
|
-
- lineItem:
|
|
41
|
-
-
|
|
42
|
-
- totalAmount in the Credit Note is the Differenzbetrag in the line items.
|
|
43
|
+
- lineItem: Each line item should be extracted only once. Give priority to the first occurrence of the line item details in the document.
|
|
44
|
+
- totalAmount in the Credit Note is the Differenzbetrag in the line items. The totalAmount value is always or mostly mentioned in EUR currency.
|
|
43
45
|
- deferredDutyPayer can be identified under "Aufschubnehmer" for each line item. It is a combination of number code and entity.
|
|
44
46
|
|
|
45
47
|
You can usually find all the information in the top 2 pages of the invoice.
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
{
|
|
2
|
+
"type": "OBJECT",
|
|
3
|
+
"properties": {
|
|
4
|
+
"consignee": {
|
|
5
|
+
"type": "string",
|
|
6
|
+
"nullable": true,
|
|
7
|
+
"description": "The receiver or buyer of the goods."
|
|
8
|
+
},
|
|
9
|
+
"currency": {
|
|
10
|
+
"type": "string",
|
|
11
|
+
"nullable": true,
|
|
12
|
+
"description": "The currency of the totalAmount."
|
|
13
|
+
},
|
|
14
|
+
"grossWeight": {
|
|
15
|
+
"type": "string",
|
|
16
|
+
"nullable": true,
|
|
17
|
+
"description": "The total gross weight of all the goods. Usually mentioned as G.W or GW or Gross Weight, etc.."
|
|
18
|
+
},
|
|
19
|
+
"incoterm": {
|
|
20
|
+
"type": "string",
|
|
21
|
+
"nullable": true,
|
|
22
|
+
"description": "An Incoterm is a 3 letter standardized trade term defining the responsibilities of buyers and sellers in international shipping and logistics. For example, FOB, CFR, DAP, CIF, etc..."
|
|
23
|
+
},
|
|
24
|
+
"invoiceDate": {
|
|
25
|
+
"type": "string",
|
|
26
|
+
"nullable": true,
|
|
27
|
+
"description": "A date that the invoice was created or issued."
|
|
28
|
+
},
|
|
29
|
+
"invoiceNumber": {
|
|
30
|
+
"type": "string",
|
|
31
|
+
"nullable": true,
|
|
32
|
+
"description": "The invoice number of the commercial invoice document."
|
|
33
|
+
},
|
|
34
|
+
"measurement": {
|
|
35
|
+
"type": "string",
|
|
36
|
+
"nullable": true,
|
|
37
|
+
"description": "The volume of the goods. Usually, it is measured in \"Cubic Meter (cbm)\" or dimensions. But volume in \"cbm\" is preferred."
|
|
38
|
+
},
|
|
39
|
+
"netWeight": {
|
|
40
|
+
"type": "string",
|
|
41
|
+
"nullable": true,
|
|
42
|
+
"description": "The total net weight of all the goods. Usually, mentioned as N.W or NW or Net Weight, etc.."
|
|
43
|
+
},
|
|
44
|
+
"shipper": {
|
|
45
|
+
"type": "string",
|
|
46
|
+
"nullable": true,
|
|
47
|
+
"description": "The seller or shipper of the goods."
|
|
48
|
+
},
|
|
49
|
+
"totalAmount": {
|
|
50
|
+
"type": "string",
|
|
51
|
+
"nullable": true,
|
|
52
|
+
"description": "The total amount of all the goods mentioned in the invoice."
|
|
53
|
+
},
|
|
54
|
+
"skus": {
|
|
55
|
+
"type": "ARRAY",
|
|
56
|
+
"items": {
|
|
57
|
+
"type": "OBJECT",
|
|
58
|
+
"properties": {
|
|
59
|
+
"amount": {
|
|
60
|
+
"type": "string",
|
|
61
|
+
"nullable": true,
|
|
62
|
+
"description": "Amount of the goods."
|
|
63
|
+
},
|
|
64
|
+
"containerNumber": {
|
|
65
|
+
"type": "string",
|
|
66
|
+
"nullable": true,
|
|
67
|
+
"description": "Container Number consists of 4 capital letters followed by 7 digits. Example: TEMU7972458. Usually mentioned as Container Number, CONTAINER NO. Containers, or Container / Truck No"
|
|
68
|
+
},
|
|
69
|
+
"currency": {
|
|
70
|
+
"type": "string",
|
|
71
|
+
"nullable": true,
|
|
72
|
+
"description": "The currency of the Amount. Usually mentioned in USD, EURO, CNY, $, or any other currency units and symbols."
|
|
73
|
+
},
|
|
74
|
+
"goodsDescription": {
|
|
75
|
+
"type": "string",
|
|
76
|
+
"nullable": true,
|
|
77
|
+
"description": "Description of the goods."
|
|
78
|
+
},
|
|
79
|
+
"grossWeight": {
|
|
80
|
+
"type": "string",
|
|
81
|
+
"nullable": true,
|
|
82
|
+
"description": "The gross weight of an individual product/goods. Usually, mentioned as G.W or GW or Gross Weight, etc.."
|
|
83
|
+
},
|
|
84
|
+
"hsCode": {
|
|
85
|
+
"type": "string",
|
|
86
|
+
"nullable": true,
|
|
87
|
+
"description": "The harmonized system code of a goods."
|
|
88
|
+
},
|
|
89
|
+
"materialNumber": {
|
|
90
|
+
"type": "string",
|
|
91
|
+
"nullable": true,
|
|
92
|
+
"description": "Material number of the product or goods."
|
|
93
|
+
},
|
|
94
|
+
"netWeight": {
|
|
95
|
+
"type": "string",
|
|
96
|
+
"nullable": true,
|
|
97
|
+
"description": "The net weight of an individual product/goods. Usually, mentioned as N.W or NW or Net Weight, etc.."
|
|
98
|
+
},
|
|
99
|
+
"packagingQuantity": {
|
|
100
|
+
"type": "string",
|
|
101
|
+
"nullable": true,
|
|
102
|
+
"description": "The quantity of the goods. Usually, the quantity is in pallets, PLT, cartons, CTNS, pieces, PCS, packages, boxes, etc. Please prioritize the packaging types based on their size, as follows: Pallets (PLT) >> Cartons (CTNS) >> Pieces (PCS). Extract the Larger packaging types that will have a lower count."
|
|
103
|
+
},
|
|
104
|
+
"packageType": {
|
|
105
|
+
"type": "string",
|
|
106
|
+
"nullable": true,
|
|
107
|
+
"description": "The packaging type is the unit of packagingQuantity. Example; pallets, PLT, cartons, CTNS, pieces, PCS, packages, etc. Sometimes, the packaging type is available in the column name of the packagingQuantity."
|
|
108
|
+
},
|
|
109
|
+
"poNumber": {
|
|
110
|
+
"type": "string",
|
|
111
|
+
"nullable": true,
|
|
112
|
+
"description": "Purchase order of the goods."
|
|
113
|
+
},
|
|
114
|
+
"skuNumber": {
|
|
115
|
+
"type": "string",
|
|
116
|
+
"nullable": true,
|
|
117
|
+
"description": "SKU number of the goods."
|
|
118
|
+
}
|
|
119
|
+
},
|
|
120
|
+
"required": []
|
|
121
|
+
}
|
|
122
|
+
}
|
|
123
|
+
},
|
|
124
|
+
"required": []
|
|
125
|
+
}
|
|
@@ -2,7 +2,7 @@ Task: You are a document entity extraction specialist. Given a document, your ta
|
|
|
2
2
|
|
|
3
3
|
Extract all the data points from the given document.
|
|
4
4
|
Each data point is part of a master field called skus. There may be multiple skus entries in a document.
|
|
5
|
-
Your task is to extract the text value of the entities and page numbers starting from 0
|
|
5
|
+
Your task is to extract the text value of the entities and page numbers starting from 0 where the value was found in the document.
|
|
6
6
|
|
|
7
7
|
|
|
8
8
|
Instructions:
|