data-science-document-ai 1.13.0__py3-none-any.whl → 1.56.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. {data_science_document_ai-1.13.0.dist-info → data_science_document_ai-1.56.1.dist-info}/METADATA +7 -2
  2. data_science_document_ai-1.56.1.dist-info/RECORD +60 -0
  3. {data_science_document_ai-1.13.0.dist-info → data_science_document_ai-1.56.1.dist-info}/WHEEL +1 -1
  4. src/constants.py +42 -12
  5. src/constants_sandbox.py +2 -22
  6. src/docai.py +18 -7
  7. src/docai_processor_config.yaml +0 -64
  8. src/excel_processing.py +34 -15
  9. src/io.py +74 -6
  10. src/llm.py +12 -34
  11. src/pdf_processing.py +228 -78
  12. src/postprocessing/common.py +495 -618
  13. src/postprocessing/postprocess_partner_invoice.py +383 -27
  14. src/prompts/library/arrivalNotice/other/placeholders.json +70 -0
  15. src/prompts/library/arrivalNotice/other/prompt.txt +40 -0
  16. src/prompts/library/bookingConfirmation/evergreen/placeholders.json +17 -17
  17. src/prompts/library/bookingConfirmation/evergreen/prompt.txt +1 -0
  18. src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +18 -18
  19. src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +1 -1
  20. src/prompts/library/bookingConfirmation/maersk/placeholders.json +17 -17
  21. src/prompts/library/bookingConfirmation/maersk/prompt.txt +1 -1
  22. src/prompts/library/bookingConfirmation/msc/placeholders.json +17 -17
  23. src/prompts/library/bookingConfirmation/msc/prompt.txt +1 -1
  24. src/prompts/library/bookingConfirmation/oocl/placeholders.json +17 -17
  25. src/prompts/library/bookingConfirmation/oocl/prompt.txt +3 -1
  26. src/prompts/library/bookingConfirmation/other/placeholders.json +17 -17
  27. src/prompts/library/bookingConfirmation/other/prompt.txt +1 -1
  28. src/prompts/library/bookingConfirmation/yangming/placeholders.json +17 -17
  29. src/prompts/library/bookingConfirmation/yangming/prompt.txt +1 -1
  30. src/prompts/library/bundeskasse/other/placeholders.json +113 -0
  31. src/prompts/library/bundeskasse/other/prompt.txt +48 -0
  32. src/prompts/library/commercialInvoice/other/placeholders.json +125 -0
  33. src/prompts/library/commercialInvoice/other/prompt.txt +2 -1
  34. src/prompts/library/customsAssessment/other/placeholders.json +67 -16
  35. src/prompts/library/customsAssessment/other/prompt.txt +24 -37
  36. src/prompts/library/customsInvoice/other/placeholders.json +205 -0
  37. src/prompts/library/customsInvoice/other/prompt.txt +105 -0
  38. src/prompts/library/deliveryOrder/other/placeholders.json +79 -28
  39. src/prompts/library/deliveryOrder/other/prompt.txt +26 -40
  40. src/prompts/library/draftMbl/other/placeholders.json +33 -33
  41. src/prompts/library/draftMbl/other/prompt.txt +34 -44
  42. src/prompts/library/finalMbL/other/placeholders.json +34 -34
  43. src/prompts/library/finalMbL/other/prompt.txt +34 -44
  44. src/prompts/library/packingList/other/placeholders.json +98 -0
  45. src/prompts/library/packingList/other/prompt.txt +1 -1
  46. src/prompts/library/partnerInvoice/other/placeholders.json +165 -45
  47. src/prompts/library/partnerInvoice/other/prompt.txt +82 -44
  48. src/prompts/library/preprocessing/carrier/placeholders.json +0 -16
  49. src/prompts/library/shippingInstruction/other/placeholders.json +115 -0
  50. src/prompts/library/shippingInstruction/other/prompt.txt +28 -15
  51. src/setup.py +73 -63
  52. src/utils.py +207 -30
  53. data_science_document_ai-1.13.0.dist-info/RECORD +0 -55
  54. src/prompts/library/draftMbl/hapag-lloyd/prompt.txt +0 -44
  55. src/prompts/library/draftMbl/maersk/prompt.txt +0 -17
  56. src/prompts/library/finalMbL/hapag-lloyd/prompt.txt +0 -44
  57. src/prompts/library/finalMbL/maersk/prompt.txt +0 -17
@@ -1,60 +1,416 @@
1
1
  """This module contains the postprocessing functions for the partner invoice."""
2
- from src.postprocessing.common import extract_string
2
+ from collections import defaultdict
3
+
4
+ from rapidfuzz import fuzz, process
5
+
6
+ from src.io import logger
7
+ from src.utils import get_tms_mappings
3
8
 
4
9
 
5
10
  def postprocessing_partner_invoice(partner_invoice):
6
11
  """Apply postprocessing to the partner invoice data."""
7
- # flatten the invoice amount
12
+ # Flatten the invoice amount
8
13
  for amount in partner_invoice.get("invoiceAmount", {}):
9
14
  if isinstance(amount, list):
10
15
  amount = amount[0]
11
16
  if isinstance(amount, dict):
12
- for amount_key, val in amount.items():
13
- partner_invoice[f"invoiceAmount_{amount_key}"] = val
17
+ partner_invoice.update(amount)
14
18
  break
15
- # remove invoiceAmount -comes from doc ai-
16
- partner_invoice.pop("invoiceAmount")
17
- # remove containers -comes from doc ai-
18
- partner_invoice.pop("containers")
19
19
 
20
- # some datapoints marked as optional multiple
21
- # these are optional multiple for the 'collective invoices' not for the single ones
22
- # make sure that we send only one item, collective invoices will handled by llms
23
- optional_multiple_list = ["dueDate", "eta", "etd", "fortoEntity", "hblNumber"]
20
+ # Remove invoiceAmount - comes from DocAI
21
+ if partner_invoice.get("invoiceAmount") is not None:
22
+ partner_invoice.pop("invoiceAmount")
23
+
24
+ # Remove containers - comes from DocAI
25
+ # TODO: we can distribute containers to line items based on location proximity
26
+ if partner_invoice.get("containers") is not None:
27
+ partner_invoice.pop("containers")
24
28
 
29
+ # Ensure only one item for optional multiple fields
30
+ optional_multiple_list = [
31
+ "dueDate",
32
+ "eta",
33
+ "etd",
34
+ "fortoEntity",
35
+ "hblNumber",
36
+ "reverseChargeSentence",
37
+ ]
25
38
  for k, v in partner_invoice.items():
26
39
  if (k in optional_multiple_list) and isinstance(v, list):
27
40
  partner_invoice[k] = v[0]
28
41
 
42
+ # Update keys
29
43
  key_updates = {
30
- 'pod': 'portOfDischarge',
31
- 'pol': 'portOfLoading',
32
- 'containerSize': 'containerType',
33
- 'invoiceAmount_currencyCode': 'currencyCode',
34
- 'invoiceAmount_grandTotal': 'grandTotal',
35
- 'invoiceAmount_vatAmount': 'vatAmount',
36
- 'invoiceAmount_vatApplicableAmount': 'totalAmountGross',
37
- 'invoiceAmount_vatPercentage': 'vatPercentage',
38
- 'name': 'lineItemDescription',
39
- 'unit': 'quantity'
40
- }
44
+ "pod": "portOfDischarge",
45
+ "pol": "portOfLoading",
46
+ "name": "lineItemDescription",
47
+ "unit": "quantity",
48
+ }
41
49
 
42
50
  def update_keys(d, key_updates):
43
51
  """
44
52
  Recursively updates keys in a dictionary according to a mapping provided in key_updates.
45
-
53
+
46
54
  d: The original dictionary
47
55
  key_updates: A dictionary mapping old key names to new key names
48
-
56
+
49
57
  return A new dictionary with updated key names
50
58
  """
51
59
  if isinstance(d, dict):
52
- return {key_updates.get(k, k): update_keys(v, key_updates) for k, v in d.items()}
60
+ return {
61
+ key_updates.get(k, k): update_keys(v, key_updates) for k, v in d.items()
62
+ }
53
63
  elif isinstance(d, list):
54
64
  return [update_keys(item, key_updates) for item in d]
55
65
  else:
56
66
  return d
57
67
 
58
68
  updated_data = update_keys(partner_invoice, key_updates)
59
-
60
69
  return updated_data
70
+
71
+
72
+ def post_process_bundeskasse(aggregated_data):
73
+ """Post-process the Bundeskasse invoice data."""
74
+ # Check if the Credit note number starts with ATS and classify it to Credit Note else Invoice
75
+ invoice_type = (
76
+ "bundeskasseCreditNote"
77
+ if aggregated_data.get("creditNoteInvoiceNumber", {})
78
+ .get("documentValue", "")
79
+ .startswith("ATS")
80
+ else "bundeskasseInvoice"
81
+ )
82
+
83
+ aggregated_data["documentType"] = {
84
+ "documentValue": invoice_type,
85
+ "formattedValue": invoice_type,
86
+ }
87
+
88
+ # Predefine mappings for tax codes
89
+ tax_type_mappings = {
90
+ "A0000": "Zölle (ohne EGKS-Zölle, Ausgleichs-, Antidumping- und Zusatzzölle, Zölle auf Agrarwaren) (ZOLLEU)",
91
+ "B0000": "Einfuhrumsatzsteuer(EUSt)",
92
+ "A3000": "Endgültige Antidumpingzölle(ANTIDUMPEU)",
93
+ }
94
+
95
+ line_items = aggregated_data.get("lineItem", [])
96
+ is_recipient_forto = False # Check if Forto account is in any line item
97
+
98
+ # Process each line item
99
+ for line_item in line_items:
100
+ tax_type = line_item.get("taxType")
101
+ if tax_type:
102
+ # Map the tax type to the corresponding value
103
+ line_item["name"]["formattedValue"] = tax_type_mappings.get(
104
+ tax_type.get("documentValue"), line_item["name"]["documentValue"]
105
+ )
106
+
107
+ # Check if the deferredDutyPayer is forto
108
+ KEYWORDS = {"de789147263644738", "forto", "009812"}
109
+
110
+ def is_forto_recipient(line_item: dict) -> bool:
111
+ values_to_check = [
112
+ line_item.get("deferredDutyPayer", {}).get("documentValue", ""),
113
+ line_item.get("vatId", {}).get("documentValue", ""),
114
+ ]
115
+
116
+ combined = " ".join(values_to_check).lower()
117
+ return any(keyword in combined for keyword in KEYWORDS)
118
+
119
+ if is_forto_recipient(line_item):
120
+ is_recipient_forto = True
121
+
122
+ update_recipient_and_vendor(aggregated_data, is_recipient_forto)
123
+
124
+
125
+ def update_recipient_and_vendor(aggregated_data, is_recipient_forto):
126
+ """Update the recipient and vendor information in the aggregated data."""
127
+ # Check if the "recipientName" and "recipientAddress" keys exist
128
+ keys_to_init = ["recipientName", "recipientAddress", "vendorName", "vendorAddress"]
129
+ for key in keys_to_init:
130
+ aggregated_data.setdefault(key, {"formattedValue": "", "documentValue": ""})
131
+
132
+ if is_recipient_forto:
133
+ # Update the aggregated data with the recipient information
134
+ aggregated_data["recipientName"][
135
+ "formattedValue"
136
+ ] = "Forto Logistics SE & Co KG"
137
+ aggregated_data["recipientAddress"][
138
+ "formattedValue"
139
+ ] = "Schönhauser Allee 9, 10119 Berlin, Germany"
140
+
141
+ # Update the vendor details always to Bundeskasse Trier
142
+ aggregated_data["vendorName"]["formattedValue"] = "Bundeskasse Trier"
143
+ aggregated_data["vendorAddress"][
144
+ "formattedValue"
145
+ ] = "Dasbachstraße 15, 54292 Trier, Germany"
146
+
147
+
148
+ def select_unique_bank_account(bank_account):
149
+ # Select the unique bank account if multiple are present
150
+ if isinstance(bank_account, list) and bank_account:
151
+ best = defaultdict(lambda: None)
152
+
153
+ for item in bank_account:
154
+ dv = item["documentValue"]
155
+ if best[dv] is None or item["page"] < best[dv]["page"]:
156
+ best[dv] = item
157
+
158
+ unique = list(best.values())
159
+ return unique
160
+
161
+
162
+ async def process_partner_invoice(params, aggregated_data, document_type_code):
163
+ """Process the partner invoice data."""
164
+ # Post process bundeskasse invoices
165
+ if document_type_code == "bundeskasse":
166
+ post_process_bundeskasse(aggregated_data)
167
+ return
168
+
169
+ if "bankAccount" in aggregated_data:
170
+ aggregated_data["bankAccount"] = select_unique_bank_account(
171
+ aggregated_data["bankAccount"]
172
+ )
173
+
174
+ line_items = aggregated_data.get("lineItem", [])
175
+ # Add debug logging
176
+ logger.info(f"Processing partnerInvoice with {len(line_items)} line items")
177
+
178
+ reverse_charge = None
179
+ reverse_charge_info = aggregated_data.get("reverseChargeSentence")
180
+
181
+ # Check if reverseChargeSentence exists and has the expected structure
182
+ if isinstance(reverse_charge_info, dict):
183
+ # Get the reverse charge sentence and Check if the reverse charge sentence is present
184
+ rev_charge_sentence = reverse_charge_info.get("formattedValue", "")
185
+ reverse_charge_value = if_reverse_charge_sentence(rev_charge_sentence, params)
186
+
187
+ # Assign the reverse charge value to the aggregated data
188
+ reverse_charge_info["formattedValue"] = reverse_charge_value
189
+ reverse_charge = aggregated_data.pop("reverseChargeSentence", None)
190
+
191
+ # Partner Name
192
+ partner_name = aggregated_data.get("vendorName", {}).get("documentValue", None)
193
+
194
+ # Process everything in one go
195
+ processed_items = await process_line_items_batch(
196
+ params, line_items, reverse_charge, partner_name
197
+ )
198
+
199
+ # Update your main data structure
200
+ aggregated_data["lineItem"] = processed_items
201
+
202
+
203
+ async def process_line_items_batch(
204
+ params: dict, line_items: list[dict], reverse_charge=None, partner_name=None
205
+ ):
206
+ """
207
+ Processes all line items efficiently using a "Split-Apply-Combine" strategy.
208
+ """
209
+ # To store items that need external API lookup
210
+ pending_line_items = {}
211
+
212
+ # Check Fuzzy Matching
213
+ logger.info(f"Mapping line item codes with Fuzzy matching....")
214
+ for i, item in enumerate(line_items):
215
+ description_obj = item.get("lineItemDescription")
216
+
217
+ if not description_obj or not description_obj.get("formattedValue"):
218
+ continue
219
+ # Get the formatted description text
220
+ desc = description_obj["formattedValue"]
221
+
222
+ # Find Fuzzy Match
223
+ matched_code = find_matching_lineitem(
224
+ desc,
225
+ params["lookup_data"]["item_code"],
226
+ params["fuzzy_threshold_item_code"],
227
+ )
228
+
229
+ if matched_code:
230
+ # Set the code to the line item
231
+ item["itemCode"] = {
232
+ "documentValue": desc,
233
+ "formattedValue": matched_code,
234
+ "page": description_obj.get("page"),
235
+ }
236
+ else:
237
+ # Store for batch API call
238
+ pending_line_items[i] = desc
239
+
240
+ # Batch API Call for Embedding lookups
241
+ if pending_line_items:
242
+ code_map = await fetch_line_item_codes(pending_line_items, partner_name, params)
243
+
244
+ for index, desc in pending_line_items.items():
245
+ line_items[index]["itemCode"] = {
246
+ "documentValue": desc,
247
+ "formattedValue": code_map.get(desc),
248
+ "page": line_items[index]["lineItemDescription"].get("page"),
249
+ }
250
+
251
+ # Add reverse charge here if exists
252
+ if reverse_charge:
253
+ [
254
+ item.update({"reverseChargeSentence": reverse_charge})
255
+ for item in line_items
256
+ if (
257
+ (item.get("itemCode") and item["itemCode"]["formattedValue"] != "CDU")
258
+ or not item.get("itemCode")
259
+ )
260
+ ]
261
+
262
+ return line_items
263
+
264
+
265
+ def get_fuzzy_match_score(target: str, sentences: list, threshold: int):
266
+ """Get the best fuzzy match for a target string from a list of candidates.
267
+
268
+ Args:
269
+ target (str): The string to match.
270
+ sentences (list): List of strings to match against.
271
+ threshold (int): Minimum score threshold to consider a match.
272
+
273
+ Returns:
274
+ tuple: (best_match, True) if a candidate scores at or above the threshold, else (None, False)
275
+ """
276
+ # Use rapidfuzz's process.extractOne to find the best match (not multiprocessing)
277
+ result = process.extractOne(
278
+ target, sentences, scorer=fuzz.WRatio, score_cutoff=threshold
279
+ )
280
+
281
+ if result is None:
282
+ return None, False
283
+
284
+ match, score, index = result
285
+
286
+ # return best_match if the best match score is above a threshold (e.g., 80)
287
+ if match:
288
+ return match, True
289
+
290
+ return None, False
291
+
292
+
293
+ def if_reverse_charge_sentence(sentence: str, params):
294
+ """Check if the reverse charge sentence is present in the line item."""
295
+ reverse_charge_sentences = params["lookup_data"]["reverse_charge_sentences"]
296
+ threshold = params["fuzzy_threshold_reverse_charge"]
297
+
298
+ # Check if "ARTICLE 144" or "ART 144" is in the sentence
299
+ if "ARTICLE 144" in sentence or "ART 144" in sentence:
300
+ return False
301
+
302
+ # Check if the sentence is similar to any of the reverse charge sentences
303
+ match, _ = get_fuzzy_match_score(
304
+ sentence, list(reverse_charge_sentences.keys()), threshold
305
+ )
306
+
307
+ if match:
308
+ return reverse_charge_sentences[match]
309
+
310
+ return False
311
+
312
+
313
+ def find_matching_lineitem(new_lineitem: str, kvp_dict: dict, threshold=90):
314
+ """Find the best matching line item from the key-value pair dictionary using fuzzy matching.
315
+
316
+ Args:
317
+ new_lineitem (str): The new line item to be matched.
318
+ kvp_dict (dict): The key-value pair dictionary with 'Processed Lineitem' as key and 'Forto SLI' as value.
319
+ threshold (int): Minimum score threshold to consider a match.
320
+ Returns:
321
+ str: The best matching 'Forto SLI' value from the dictionary.
322
+ """
323
+ # Check if the new line item is already in the dictionary
324
+ if new_lineitem in kvp_dict:
325
+ return kvp_dict[new_lineitem]
326
+
327
+ # Get the best fuzzy match score for the extracted line item
328
+ match, _ = get_fuzzy_match_score(
329
+ new_lineitem,
330
+ list(kvp_dict.keys()),
331
+ threshold,
332
+ )
333
+
334
+ if match:
335
+ # find the code from the kvp_dict
336
+ return kvp_dict[match]
337
+
338
+ return None
339
+
340
+
341
+ async def associate_forto_item_code(line_item_data, params, partner_name=None):
342
+ """
343
+ Associates Forto item codes to a list of line item descriptions.
344
+ Args:
345
+ line_item_data (dict): A dictionary where keys are original descriptions and values are cleaned descriptions.
346
+ params (dict): Parameters containing lookup data and thresholds.
347
+ partner_name (str, optional): The name of the partner for context in matching. Defaults to None.
348
+
349
+ Returns:
350
+ list: A list of dictionaries with 'description' and 'itemCode' keys.
351
+ """
352
+
353
+ result = []
354
+ pending_line_items = {}
355
+ for desc, f_desc in line_item_data.items():
356
+ # Get the Forto item code using fuzzy matching
357
+ code = find_matching_lineitem(
358
+ new_lineitem=f_desc,
359
+ kvp_dict=params["lookup_data"]["item_code"],
360
+ threshold=params["fuzzy_threshold_item_code"],
361
+ )
362
+ if code:
363
+ result.append({"description": desc, "itemCode": code})
364
+ else:
365
+ pending_line_items[desc] = f_desc
366
+
367
+ # Batch API Call for Embedding lookups
368
+ if pending_line_items:
369
+ code_map = await fetch_line_item_codes(pending_line_items, partner_name, params)
370
+
371
+ for desc, f_desc in pending_line_items.items():
372
+ result.append(
373
+ {
374
+ "description": desc,
375
+ "itemCode": code_map.get(f_desc),
376
+ }
377
+ )
378
+
379
+ return result
380
+
381
+
382
+ async def fetch_line_item_codes(
383
+ pending_line_items: dict,
384
+ partner_name: str | None,
385
+ params: dict,
386
+ ):
387
+ """Returns: {original_description: mapped_code_or_None}"""
388
+ t_mode = (
389
+ find_matching_lineitem(
390
+ partner_name.upper(),
391
+ params["lookup_data"]["intermodal_partners"],
392
+ threshold=87,
393
+ )
394
+ if partner_name
395
+ else None
396
+ )
397
+
398
+ unique_descs = list(set(pending_line_items.values()))
399
+ logger.info(f"Mapping {len(unique_descs)} line items from Embedding API...")
400
+
401
+ # Build API input map
402
+ api_input_map = {
403
+ desc: f"{t_mode} - {desc}" if t_mode else desc for desc in unique_descs
404
+ }
405
+
406
+ api_results = await get_tms_mappings(
407
+ input_list=list(api_input_map.values()),
408
+ embedding_type="line_items",
409
+ )
410
+
411
+ # Normalize response back to original descriptions
412
+ result = {
413
+ original_desc: api_results.get(api_desc)
414
+ for original_desc, api_desc in api_input_map.items()
415
+ }
416
+ return result
@@ -0,0 +1,70 @@
1
+ {
2
+ "type": "OBJECT",
3
+ "properties": {
4
+ "bookingNumber": {
5
+ "type": "STRING",
6
+ "nullable": true,
7
+ "description": "The booking number associated with the Arrival Notice document. They are often referred to as 'Booking Number', 'Booking No.', 'Booking Ref.', 'Booking Reference', 'Booking ID', 'carrier's reference' or 'Order Ref'."
8
+ },
9
+ "destinationTerminal": {
10
+ "type": "STRING",
11
+ "nullable": true,
12
+ "description": "The terminal at the destination port where the container will be delivered."
13
+ },
14
+ "eta": {
15
+ "type": "STRING",
16
+ "nullable": true,
17
+ "description": "Estimated Time of Arrival (ETA) is the expected date when the shipment will arrive at its destination."
18
+ },
19
+ "mblNumber": {
20
+ "type": "STRING",
21
+ "nullable": true,
22
+ "description": "Bill of Lading number (B/L NO.), a document issued by the carrier."
23
+ },
24
+ "portOfDischarge": {
25
+ "type": "STRING",
26
+ "nullable": true,
27
+ "description": "The port where the goods are discharged from the vessel. This is the destination port for the shipment."
28
+ },
29
+ "vesselName": {
30
+ "type": "STRING",
31
+ "nullable": true,
32
+ "description": "The name of the vessel carrying the shipment."
33
+ },
34
+ "containers": {
35
+ "type": "ARRAY",
36
+ "items": {
37
+ "type": "OBJECT",
38
+ "properties": {
39
+ "containerNumber": {
40
+ "type": "STRING",
41
+ "nullable": true,
42
+ "description": "The unique identifier for each container. It always starts with 4 capital letters and followed by 7 digits. Example: TEMU7972458."
43
+ },
44
+ "containerType": {
45
+ "type": "STRING",
46
+ "nullable": true,
47
+ "description": "The size of the container associated with the containerNumber, such as 20ft, 40ft, 40HC, 20DC etc."
48
+ },
49
+ "grossWeight": {
50
+ "type": "STRING",
51
+ "nullable": true,
52
+ "description": "The gross weight of the container. Usually mentioned as G.W or GW or Gross Weight, etc."
53
+ },
54
+ "measurements": {
55
+ "type": "STRING",
56
+ "nullable": true,
57
+ "description": "The volume of the container. Usually, it is measured in 'Cubic Meter (cbm)' or dimensions. But volume in 'cbm' is preferred."
58
+ },
59
+ "sealNumber": {
60
+ "type": "STRING",
61
+ "nullable": true,
62
+ "description": "The seal number associated with the container number. It is not the same as the container number."
63
+ }
64
+ },
65
+ "required": ["containerNumber", "containerType", "grossWeight"]
66
+ }
67
+ }
68
+ },
69
+ "required": ["bookingNumber", "destinationTerminal", "eta", "portOfDischarge", "vesselName", "containers"]
70
+ }
@@ -0,0 +1,40 @@
1
+ <PERSONA> You are an efficient document entity data extraction specialist working for a Freight Forwarding company. <PERSONA>
2
+
3
+ <TASK> Your task is to extract data from Arrival Notice documents as per the given response schema structure. <TASK>
4
+
5
+ <CONTEXT>
6
+ The Freight Forwarding company receives Arrival Notice from shipping lines.
7
+ These documents contain various details related to arrival of a shipment to the port of destination such as container numbers, estimated time of arrival, vessel details and containers information.
8
+ They may be written in different languages such as English, German, Italian and can appear in a variety of formats and layouts.
9
+ Your role is to accurately extract specific entities from these Arrival Notices to support efficient processing and accurate record-keeping.
10
+ <CONTEXT>
11
+
12
+ <INSTRUCTIONS>
13
+ - Populate fields as defined in the response schema.
14
+ - Multiple Containers entries may exist, capture all instances under "containers".
15
+ - Use the data field description to understand the context of the data.
16
+
17
+ - bookingNumber:
18
+ - Booking numbers are unique identifiers for shipments. They are often referred to as "Booking Number", "Booking No.", "Booking Ref.", "Booking Reference", "Booking ID", "SACO-Pos.", "Order Ref", "Unsere Referenz", or "Unsere Position"
19
+ - If there is a unique_id that starts with "S" followed by 6 or 8 digits, it is a shipmentID, not a bookingNumber.
20
+
21
+ - destinationTerminal:
22
+ - Destination Terminal can also be referred to as "Destination Terminal", "Pickup Location", "Delivery Location", "Delivery Terminal", "Empfangsort", "Entladeort", or "Abladestelle".
23
+
24
+ - mblNumber:
25
+ - Commonly known as "Bill of Lading Number", "BILL OF LADING NO.", "BL Number", "BL No.", "B/L No.", "BL-Nr.", "B/L", "HBL No.", or "M-AWB Nummer".
26
+ - Bill of Lading Number is known as mblNumber. Not a shipmentID even if it starts with "S".
27
+ - mblNumber from Hapag-Lloyd always starts with "HLC" (e.g., "HLCUTS12303AWNT3") and is labeled as "SEA WAYBILL" or "SWB-NR.".
28
+
29
+ - eta:
30
+ - Estimated Time of Arrival (ETA) is the expected date and time when the shipment will arrive at the destination port.
31
+ - It can be referred to as "ETA", "Estimated Arrival", "Voraussichtliche Ankunft", "Ankunftszeit", "Arrivo", "Due to arrive at Terminal"
32
+
33
+ - vesselName:
34
+ - Vessel Name is the name of the ship carrying the cargo. It can be referred to as "Vessel", "Ship Name", "Schiff", "Schiffsname", "Nave", or "Vessel/Flight No.".
35
+
36
+ - containers: Details of each container on the arrival notice. Make sure to extract each container information separately.
37
+ - containerNumber: Container Number consists of 4 capital letters followed by 7 digits (e.g., TEMU7972458, CAIU 7222892).
38
+ - sealNumber: Seal numbers are unique identifiers for shipping seals. They are usually mentioned as seal numbers in the document but they are definitely not container numbers.
39
+
40
+ <INSTRUCTIONS>
@@ -1,28 +1,28 @@
1
1
  {
2
2
  "type": "OBJECT",
3
3
  "properties": {
4
- "cfsCutOff": {"type": "string", "nullable": true, "description": "he date by which an LCL (Less than Container Load) shipment needs to be checked in to a CFS (Container Freight Station) to meet its scheduled sailing"},
5
- "bookingNumber": {"type": "string", "nullable": true},
6
- "cyCutOff": {"type": "string", "nullable": true},
7
- "gateInReference": {"type": "string", "nullable": true},
8
- "gateInTerminal": {"type": "string", "nullable": true},
9
- "mblNumber": {"type": "string", "nullable": true},
10
- "pickUpReference": {"type": "string", "nullable": true},
11
- "pickUpTerminal": {"type": "string", "nullable": true},
12
- "siCutOff": {"type": "string", "nullable": true},
13
- "vgmCutOff": {"type": "string", "nullable": true},
4
+ "cfsCutOff": {"type": "STRING", "nullable": true, "description": "the date by which an LCL (Less than Container Load) shipment needs to be checked in to a CFS (Container Freight Station) to meet its scheduled sailing"},
5
+ "bookingNumber": {"type": "STRING", "nullable": true},
6
+ "cyCutOff": {"type": "STRING", "nullable": true},
7
+ "gateInReference": {"type": "STRING", "nullable": true},
8
+ "gateInTerminal": {"type": "STRING", "nullable": true},
9
+ "mblNumber": {"type": "STRING", "nullable": true},
10
+ "pickUpReference": {"type": "STRING", "nullable": true},
11
+ "pickUpTerminal": {"type": "STRING", "nullable": true},
12
+ "siCutOff": {"type": "STRING", "nullable": true},
13
+ "vgmCutOff": {"type": "STRING", "nullable": true},
14
14
  "transportLegs": {
15
15
  "type": "ARRAY",
16
16
  "items": {
17
17
  "type": "OBJECT",
18
18
  "properties": {
19
- "eta": {"type": "string", "nullable": true},
20
- "etd": {"type": "string", "nullable": true},
21
- "imoNumber": {"type": "string", "nullable": true},
22
- "portOfDischarge": {"type": "string", "nullable": true},
23
- "portOfLoading": {"type": "string", "nullable": true},
24
- "vesselName": {"type": "string", "nullable": true},
25
- "voyage": {"type": "string", "nullable": true}
19
+ "eta": {"type": "STRING", "nullable": true},
20
+ "etd": {"type": "STRING", "nullable": true},
21
+ "imoNumber": {"type": "STRING", "nullable": true},
22
+ "portOfDischarge": {"type": "STRING", "nullable": true},
23
+ "portOfLoading": {"type": "STRING", "nullable": true},
24
+ "vesselName": {"type": "STRING", "nullable": true},
25
+ "voyage": {"type": "STRING", "nullable": true}
26
26
  },
27
27
  "required": []
28
28
  }
@@ -1,3 +1,4 @@
1
+ Your task is to extract the text value of the following entities and page numbers starting from 0 where the value was found in the document:
1
2
  ```json
2
3
  {
3
4
  "mblNumber": "Extract the value after the label 'BOOKING NO.'.",
@@ -1,32 +1,32 @@
1
1
  {
2
2
  "type": "OBJECT",
3
3
  "properties": {
4
- "cfsCutOff": {"type": "string", "nullable": true, "description": "he date by which an LCL (Less than Container Load) shipment needs to be checked in to a CFS (Container Freight Station) to meet its scheduled sailing"},
5
- "bookingNumber": {"type": "string", "nullable": true},
6
- "cyCutOff": {"type": "string", "nullable": true},
7
- "gateInReference": {"type": "string", "nullable": true},
8
- "gateInTerminal": {"type": "string", "nullable": true},
9
- "mblNumber": {"type": "string", "nullable": true},
10
- "pickUpReference": {"type": "string", "nullable": true},
11
- "pickUpTerminal": {"type": "string", "nullable": true},
12
- "siCutOff": {"type": "string", "nullable": true},
13
- "vgmCutOff": {"type": "string", "nullable": true},
4
+ "cfsCutOff": {"type": "STRING", "nullable": true, "description": "the date by which an LCL (Less than Container Load) shipment needs to be checked in to a CFS (Container Freight Station) to meet its scheduled sailing"},
5
+ "bookingNumber": {"type": "STRING", "nullable": true},
6
+ "cyCutOff": {"type": "STRING", "nullable": true},
7
+ "gateInReference": {"type": "STRING", "nullable": true},
8
+ "gateInTerminal": {"type": "STRING", "nullable": true},
9
+ "mblNumber": {"type": "STRING", "nullable": true},
10
+ "pickUpReference": {"type": "STRING", "nullable": true},
11
+ "pickUpTerminal": {"type": "STRING", "nullable": true},
12
+ "siCutOff": {"type": "STRING", "nullable": true},
13
+ "vgmCutOff": {"type": "STRING", "nullable": true},
14
14
  "transportLegs": {
15
15
  "type": "ARRAY",
16
16
  "items": {
17
17
  "type": "OBJECT",
18
18
  "properties": {
19
- "eta": {"type": "string", "nullable": true},
20
- "etd": {"type": "string", "nullable": true},
21
- "imoNumber": {"type": "string", "nullable": true},
22
- "portOfDischarge": {"type": "string", "nullable": true},
23
- "portOfLoading": {"type": "string", "nullable": true},
24
- "vesselName": {"type": "string", "nullable": true},
25
- "voyage": {"type": "string", "nullable": true}
19
+ "eta": {"type": "STRING", "nullable": true},
20
+ "etd": {"type": "STRING", "nullable": true},
21
+ "imoNumber": {"type": "STRING", "nullable": true},
22
+ "portOfDischarge": {"type": "STRING", "nullable": true},
23
+ "portOfLoading": {"type": "STRING", "nullable": true},
24
+ "vesselName": {"type": "STRING", "nullable": true},
25
+ "voyage": {"type": "STRING", "nullable": true}
26
26
  },
27
27
  "required": []
28
28
  }
29
29
  }
30
30
  },
31
31
  "required": []
32
- }
32
+ }