data-science-document-ai 1.40.3__tar.gz → 1.41.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. {data_science_document_ai-1.40.3 → data_science_document_ai-1.41.0}/PKG-INFO +1 -1
  2. {data_science_document_ai-1.40.3 → data_science_document_ai-1.41.0}/pyproject.toml +1 -1
  3. {data_science_document_ai-1.40.3 → data_science_document_ai-1.41.0}/src/excel_processing.py +4 -0
  4. {data_science_document_ai-1.40.3 → data_science_document_ai-1.41.0}/src/pdf_processing.py +14 -3
  5. {data_science_document_ai-1.40.3 → data_science_document_ai-1.41.0}/src/postprocessing/common.py +66 -17
  6. data_science_document_ai-1.41.0/src/prompts/library/bookingConfirmation/evergreen/placeholders.json +32 -0
  7. {data_science_document_ai-1.40.3 → data_science_document_ai-1.41.0}/src/prompts/library/bookingConfirmation/evergreen/prompt.txt +1 -0
  8. data_science_document_ai-1.41.0/src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +32 -0
  9. {data_science_document_ai-1.40.3 → data_science_document_ai-1.41.0}/src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +1 -1
  10. data_science_document_ai-1.41.0/src/prompts/library/bookingConfirmation/maersk/placeholders.json +32 -0
  11. {data_science_document_ai-1.40.3 → data_science_document_ai-1.41.0}/src/prompts/library/bookingConfirmation/maersk/prompt.txt +1 -1
  12. data_science_document_ai-1.41.0/src/prompts/library/bookingConfirmation/msc/placeholders.json +32 -0
  13. {data_science_document_ai-1.40.3 → data_science_document_ai-1.41.0}/src/prompts/library/bookingConfirmation/msc/prompt.txt +1 -1
  14. data_science_document_ai-1.41.0/src/prompts/library/bookingConfirmation/oocl/placeholders.json +32 -0
  15. {data_science_document_ai-1.40.3 → data_science_document_ai-1.41.0}/src/prompts/library/bookingConfirmation/oocl/prompt.txt +3 -1
  16. data_science_document_ai-1.41.0/src/prompts/library/bookingConfirmation/other/placeholders.json +32 -0
  17. {data_science_document_ai-1.40.3 → data_science_document_ai-1.41.0}/src/prompts/library/bookingConfirmation/other/prompt.txt +1 -1
  18. data_science_document_ai-1.41.0/src/prompts/library/bookingConfirmation/yangming/placeholders.json +32 -0
  19. {data_science_document_ai-1.40.3 → data_science_document_ai-1.41.0}/src/prompts/library/bookingConfirmation/yangming/prompt.txt +1 -1
  20. {data_science_document_ai-1.40.3 → data_science_document_ai-1.41.0}/src/prompts/library/bundeskasse/other/placeholders.json +19 -19
  21. {data_science_document_ai-1.40.3 → data_science_document_ai-1.41.0}/src/prompts/library/bundeskasse/other/prompt.txt +1 -1
  22. {data_science_document_ai-1.40.3 → data_science_document_ai-1.41.0}/src/prompts/library/commercialInvoice/other/prompt.txt +2 -1
  23. {data_science_document_ai-1.40.3 → data_science_document_ai-1.41.0}/src/prompts/library/customsAssessment/other/prompt.txt +1 -1
  24. {data_science_document_ai-1.40.3 → data_science_document_ai-1.41.0}/src/prompts/library/customsInvoice/other/placeholders.json +19 -19
  25. {data_science_document_ai-1.40.3 → data_science_document_ai-1.41.0}/src/prompts/library/customsInvoice/other/prompt.txt +1 -1
  26. data_science_document_ai-1.41.0/src/prompts/library/deliveryOrder/other/placeholders.json +29 -0
  27. {data_science_document_ai-1.40.3 → data_science_document_ai-1.41.0}/src/prompts/library/deliveryOrder/other/prompt.txt +1 -1
  28. {data_science_document_ai-1.40.3 → data_science_document_ai-1.41.0}/src/prompts/library/draftMbl/hapag-lloyd/prompt.txt +2 -1
  29. {data_science_document_ai-1.40.3/src/prompts/library/finalMbL → data_science_document_ai-1.41.0/src/prompts/library/draftMbl}/maersk/prompt.txt +2 -0
  30. {data_science_document_ai-1.40.3 → data_science_document_ai-1.41.0}/src/prompts/library/draftMbl/other/prompt.txt +1 -1
  31. {data_science_document_ai-1.40.3 → data_science_document_ai-1.41.0}/src/prompts/library/finalMbL/hapag-lloyd/prompt.txt +1 -1
  32. {data_science_document_ai-1.40.3/src/prompts/library/draftMbl → data_science_document_ai-1.41.0/src/prompts/library/finalMbL}/maersk/prompt.txt +2 -0
  33. {data_science_document_ai-1.40.3 → data_science_document_ai-1.41.0}/src/prompts/library/finalMbL/other/prompt.txt +1 -1
  34. {data_science_document_ai-1.40.3 → data_science_document_ai-1.41.0}/src/prompts/library/packingList/other/prompt.txt +1 -1
  35. {data_science_document_ai-1.40.3 → data_science_document_ai-1.41.0}/src/prompts/library/partnerInvoice/other/placeholders.json +12 -60
  36. {data_science_document_ai-1.40.3 → data_science_document_ai-1.41.0}/src/prompts/library/partnerInvoice/other/prompt.txt +1 -1
  37. {data_science_document_ai-1.40.3 → data_science_document_ai-1.41.0}/src/prompts/library/shippingInstruction/other/prompt.txt +1 -0
  38. {data_science_document_ai-1.40.3 → data_science_document_ai-1.41.0}/src/prompts/prompt_library.py +4 -0
  39. {data_science_document_ai-1.40.3 → data_science_document_ai-1.41.0}/src/utils.py +57 -0
  40. data_science_document_ai-1.40.3/src/prompts/library/bookingConfirmation/evergreen/placeholders.json +0 -32
  41. data_science_document_ai-1.40.3/src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +0 -32
  42. data_science_document_ai-1.40.3/src/prompts/library/bookingConfirmation/maersk/placeholders.json +0 -32
  43. data_science_document_ai-1.40.3/src/prompts/library/bookingConfirmation/msc/placeholders.json +0 -32
  44. data_science_document_ai-1.40.3/src/prompts/library/bookingConfirmation/oocl/placeholders.json +0 -32
  45. data_science_document_ai-1.40.3/src/prompts/library/bookingConfirmation/other/placeholders.json +0 -32
  46. data_science_document_ai-1.40.3/src/prompts/library/bookingConfirmation/yangming/placeholders.json +0 -32
  47. data_science_document_ai-1.40.3/src/prompts/library/customsAssessment/other/placeholders.json +0 -19
  48. data_science_document_ai-1.40.3/src/prompts/library/deliveryOrder/other/placeholders.json +0 -31
  49. data_science_document_ai-1.40.3/src/prompts/library/finalMbL/other/placeholders.json +0 -80
  50. {data_science_document_ai-1.40.3 → data_science_document_ai-1.41.0}/src/constants.py +0 -0
  51. {data_science_document_ai-1.40.3 → data_science_document_ai-1.41.0}/src/constants_sandbox.py +0 -0
  52. {data_science_document_ai-1.40.3 → data_science_document_ai-1.41.0}/src/docai.py +0 -0
  53. {data_science_document_ai-1.40.3 → data_science_document_ai-1.41.0}/src/docai_processor_config.yaml +0 -0
  54. {data_science_document_ai-1.40.3 → data_science_document_ai-1.41.0}/src/io.py +0 -0
  55. {data_science_document_ai-1.40.3 → data_science_document_ai-1.41.0}/src/llm.py +0 -0
  56. {data_science_document_ai-1.40.3 → data_science_document_ai-1.41.0}/src/log_setup.py +0 -0
  57. {data_science_document_ai-1.40.3 → data_science_document_ai-1.41.0}/src/postprocessing/postprocess_booking_confirmation.py +0 -0
  58. {data_science_document_ai-1.40.3 → data_science_document_ai-1.41.0}/src/postprocessing/postprocess_commercial_invoice.py +0 -0
  59. {data_science_document_ai-1.40.3 → data_science_document_ai-1.41.0}/src/postprocessing/postprocess_partner_invoice.py +0 -0
  60. {data_science_document_ai-1.40.3 → data_science_document_ai-1.41.0}/src/prompts/library/draftMbl/other/placeholders.json +0 -0
  61. {data_science_document_ai-1.40.3 → data_science_document_ai-1.41.0}/src/prompts/library/postprocessing/port_code/placeholders.json +0 -0
  62. {data_science_document_ai-1.40.3 → data_science_document_ai-1.41.0}/src/prompts/library/postprocessing/port_code/prompt_port_code.txt +0 -0
  63. {data_science_document_ai-1.40.3 → data_science_document_ai-1.41.0}/src/prompts/library/preprocessing/carrier/placeholders.json +0 -0
  64. {data_science_document_ai-1.40.3 → data_science_document_ai-1.41.0}/src/prompts/library/preprocessing/carrier/prompt.txt +0 -0
  65. {data_science_document_ai-1.40.3 → data_science_document_ai-1.41.0}/src/setup.py +0 -0
  66. {data_science_document_ai-1.40.3 → data_science_document_ai-1.41.0}/src/tms.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: data-science-document-ai
3
- Version: 1.40.3
3
+ Version: 1.41.0
4
4
  Summary: "Document AI repo for data science"
5
5
  Author: Naomi Nguyen
6
6
  Author-email: naomi.nguyen@forto.com
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "data-science-document-ai"
3
- version = "1.40.3"
3
+ version = "1.41.0"
4
4
  description = "\"Document AI repo for data science\""
5
5
  authors = ["Naomi Nguyen <naomi.nguyen@forto.com>", "Kumar Rajendrababu <kumar.rajendrababu@forto.com>", "Igor Tonko <igor.tonko@forto.com>", "Osman Demirel <osman.demirel@forto.com>"]
6
6
  packages = [
@@ -2,6 +2,8 @@
2
2
  # flake8: noqa: E402
3
3
  import logging
4
4
 
5
+ from src.postprocessing.common import llm_prediction_to_tuples
6
+
5
7
  logger = logging.getLogger(__name__)
6
8
 
7
9
  import asyncio
@@ -73,6 +75,8 @@ async def extract_data_from_excel(
73
75
  ]
74
76
  extracted_data = {k: v for k, v in await asyncio.gather(*sheet_extract_tasks)}
75
77
 
78
+ # Convert LLM prediction dictionary to tuples of (value, page_number).
79
+ extracted_data = llm_prediction_to_tuples(extracted_data)
76
80
  stored_data = json.dumps(extracted_data)
77
81
 
78
82
  return extracted_data, stored_data, params["gemini_params"]["model_id"]
@@ -14,7 +14,7 @@ from google.cloud.documentai_v1 import Document as docaiv1_document
14
14
 
15
15
  from src.docai import _batch_process_pdf_w_docai, _process_pdf_w_docai
16
16
  from src.excel_processing import extract_data_from_excel
17
- from src.postprocessing.common import format_all_entities, remove_none_values
17
+ from src.postprocessing.common import format_all_entities, remove_none_values, llm_prediction_to_tuples
18
18
  from src.postprocessing.postprocess_booking_confirmation import (
19
19
  postprocess_booking_confirmation,
20
20
  )
@@ -31,6 +31,7 @@ from src.utils import (
31
31
  get_processor_name,
32
32
  run_background_tasks,
33
33
  validate_based_on_schema,
34
+ transform_schema_strings
34
35
  )
35
36
 
36
37
 
@@ -104,9 +105,16 @@ async def extract_data_from_pdf_w_docai(
104
105
  # Extract entities from the result
105
106
  for entity in result.entities:
106
107
  value = (
107
- {child.type_: child.mention_text for child in entity.properties}
108
+ {child.type_: (child.mention_text,
109
+ child.page_anchor.page_refs[0].page
110
+ if hasattr(child.page_anchor.page_refs[0], "page")
111
+ else 0)
112
+ for child in entity.properties}
108
113
  if entity.properties
109
- else entity.mention_text
114
+ else (entity.mention_text,
115
+ entity.page_anchor.page_refs[0].page
116
+ if hasattr(entity.page_anchor.page_refs[0], "page")
117
+ else 0)
110
118
  )
111
119
  aggregated_data[entity.type_].append(value)
112
120
 
@@ -220,6 +228,9 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
220
228
  result = await llm_client.get_unified_json_genai(
221
229
  prompt=prompt, document=document, response_schema=response_schema
222
230
  )
231
+
232
+ result = llm_prediction_to_tuples(result)
233
+
223
234
  return result
224
235
  return {}
225
236
 
@@ -380,6 +380,11 @@ async def format_label(entity_k, entity_value, document_type_code, params):
380
380
  ]
381
381
  )
382
382
  return entity_k, [v for _, v in format_tasks]
383
+ if isinstance(entity_value, tuple):
384
+ page = entity_value[1]
385
+ entity_value = entity_value[0]
386
+ else:
387
+ page = -1
383
388
  entity_key = entity_k.lower()
384
389
  formatted_value = None
385
390
 
@@ -446,10 +451,18 @@ async def format_label(entity_k, entity_value, document_type_code, params):
446
451
  elif "reversechargesentence" in entity_key:
447
452
  formatted_value = clean_item_description(entity_value, remove_numbers=False)
448
453
 
454
+ elif "quantity" in entity_key:
455
+ if document_type_code in ["partnerInvoice", "customsInvoice", "bundeskasse"]:
456
+ # For partner invoice, quantity can be mentioned as whole number
457
+ formatted_value = decimal_convertor(
458
+ extract_number(entity_value), quantity=True
459
+ )
460
+ else:
461
+ formatted_value = extract_number(entity_value)
462
+
449
463
  elif any(
450
464
  numeric_indicator in entity_key
451
465
  for numeric_indicator in [
452
- "quantity",
453
466
  "value",
454
467
  "amount",
455
468
  "price",
@@ -466,6 +479,7 @@ async def format_label(entity_k, entity_value, document_type_code, params):
466
479
  result = {
467
480
  "documentValue": entity_value,
468
481
  "formattedValue": formatted_value,
482
+ "page": page,
469
483
  }
470
484
  return entity_k, result
471
485
 
@@ -514,7 +528,7 @@ async def get_port_code_llm(port: str, llm_client):
514
528
  return None
515
529
 
516
530
 
517
- def decimal_convertor(value):
531
+ def decimal_convertor(value, quantity=False):
518
532
  """Convert EU values to English values."""
519
533
  if value is None:
520
534
  return None
@@ -522,25 +536,39 @@ def decimal_convertor(value):
522
536
  # Remove spaces
523
537
  value = value.strip().replace(" ", "")
524
538
 
525
- # Convert comma to dot for decimal point (e.g., 4.123,45 -> 4123.45)
526
- if re.match(r"^\d{1,3}(\.\d{3})*,\d{1,2}$", value):
527
- value = value.replace(".", "").replace(",", ".")
539
+ if not quantity:
540
+ # Convert comma to dot for decimal point (e.g., 4.123,45 -> 4123.45)
541
+ if re.match(r"^\d{1,3}(\.\d{3})*,\d{1,2}$", value):
542
+ value = value.replace(".", "").replace(",", ".")
543
+
544
+ # European style integer with thousand separator: 2.500
545
+ elif re.match(r"^\d{1,3}(\.\d{3})+$", value):
546
+ value = value.replace(".", "")
547
+
548
+ # Format english values as well for consistency (e.g., 4,123.45 -> 4123.45)
549
+ elif re.match(r"^\d{1,3}(,\d{3})*\.\d{1,2}$", value):
550
+ value = value.replace(",", "")
528
551
 
529
- # European style integer with thousand separator: 2.500
530
- elif re.match(r"^\d{1,3}(\.\d{3})+$", value):
531
- value = value.replace(".", "")
552
+ # English style integer with thousand separator: 2,500
553
+ elif re.match(r"^\d{1,3}(,\d{3})+$", value):
554
+ value = value.replace(",", "")
532
555
 
533
- # Format english values as well for consistency (e.g., 4,123.45 -> 4123.45)
534
- elif re.match(r"^\d{1,3}(,\d{3})*\.\d{1,2}$", value):
535
- value = value.replace(",", "")
556
+ # Just replace comma decimals with dot (e.g., 65,45 -> 65.45)
557
+ if re.match(r"^\d+,\d{1,2}$", value):
558
+ value = value.replace(",", ".")
536
559
 
537
- # English style integer with thousand separator: 2,500
538
- elif re.match(r"^\d{1,3}(,\d{3})+$", value):
539
- value = value.replace(",", "")
560
+ # If there are more than 3 0s after decimal point, consider only 2 decimal points (e.g., 8.500000 -> 8.50)
561
+ elif re.match(r"^\d+\.\d{3,}$", value):
562
+ value = value[: value.index(".") + 3]
540
563
 
541
- # Just replace comma decimals with dot (e.g., 65,45 -> 65.45)
542
- elif re.match(r"^\d+,\d{1,2}$", value):
543
- value = value.replace(",", ".")
564
+ else: # quantity=True only last two
565
+ # Just replace comma decimals with dot (e.g., 65,45 -> 65.45)
566
+ if re.match(r"^\d+,\d{1,2}$", value):
567
+ value = value.replace(",", ".")
568
+
569
+ # If there are more than 3 0s after decimal point, consider only 2 decimal points (e.g., 8.500000 -> 8.50)
570
+ elif re.match(r"^\d+\.\d{3,}$", value):
571
+ value = value[: value.index(".") + 3]
544
572
 
545
573
  return value
546
574
 
@@ -594,3 +622,24 @@ def remove_stop_words(lineitem: str):
594
622
  .upper()
595
623
  .strip()
596
624
  )
625
+
626
+
627
+ def llm_prediction_to_tuples(llm_prediction):
628
+ """Convert LLM prediction dictionary to tuples of (value, page_number)."""
629
+ if isinstance(llm_prediction, dict):
630
+ if "page_number" in llm_prediction.keys() and "value" in llm_prediction.keys():
631
+ if llm_prediction["value"]:
632
+ try:
633
+ page_number = int(llm_prediction["page_number"])
634
+ except: # noqa: E722
635
+ page_number = -1
636
+ return (llm_prediction["value"], page_number)
637
+ return None
638
+ for key, value in llm_prediction.items():
639
+ llm_prediction[key] = llm_prediction_to_tuples(
640
+ llm_prediction.get(key, value)
641
+ )
642
+ elif isinstance(llm_prediction, list):
643
+ for i, item in enumerate(llm_prediction):
644
+ llm_prediction[i] = llm_prediction_to_tuples(item)
645
+ return llm_prediction
@@ -0,0 +1,32 @@
1
+ {
2
+ "type": "OBJECT",
3
+ "properties": {
4
+ "cfsCutOff": {"type": "STRING", "nullable": true, "description": "the date by which an LCL (Less than Container Load) shipment needs to be checked in to a CFS (Container Freight Station) to meet its scheduled sailing"},
5
+ "bookingNumber": {"type": "STRING", "nullable": true},
6
+ "cyCutOff": {"type": "STRING", "nullable": true},
7
+ "gateInReference": {"type": "STRING", "nullable": true},
8
+ "gateInTerminal": {"type": "STRING", "nullable": true},
9
+ "mblNumber": {"type": "STRING", "nullable": true},
10
+ "pickUpReference": {"type": "STRING", "nullable": true},
11
+ "pickUpTerminal": {"type": "STRING", "nullable": true},
12
+ "siCutOff": {"type": "STRING", "nullable": true},
13
+ "vgmCutOff": {"type": "STRING", "nullable": true},
14
+ "transportLegs": {
15
+ "type": "ARRAY",
16
+ "items": {
17
+ "type": "OBJECT",
18
+ "properties": {
19
+ "eta": {"type": "STRING", "nullable": true},
20
+ "etd": {"type": "STRING", "nullable": true},
21
+ "imoNumber": {"type": "STRING", "nullable": true},
22
+ "portOfDischarge": {"type": "STRING", "nullable": true},
23
+ "portOfLoading": {"type": "STRING", "nullable": true},
24
+ "vesselName": {"type": "STRING", "nullable": true},
25
+ "voyage": {"type": "STRING", "nullable": true}
26
+ },
27
+ "required": []
28
+ }
29
+ }
30
+ },
31
+ "required": []
32
+ }
@@ -1,3 +1,4 @@
1
+ your task is to extract the text value of the following entities and page numbers starting from 0 where the value was found in the document:
1
2
  ```json
2
3
  {
3
4
  "mblNumber": "Extract the value after the label 'BOOKING NO.'.",
@@ -0,0 +1,32 @@
1
+ {
2
+ "type": "OBJECT",
3
+ "properties": {
4
+ "cfsCutOff": {"type": "STRING", "nullable": true, "description": "the date by which an LCL (Less than Container Load) shipment needs to be checked in to a CFS (Container Freight Station) to meet its scheduled sailing"},
5
+ "bookingNumber": {"type": "STRING", "nullable": true},
6
+ "cyCutOff": {"type": "STRING", "nullable": true},
7
+ "gateInReference": {"type": "STRING", "nullable": true},
8
+ "gateInTerminal": {"type": "STRING", "nullable": true},
9
+ "mblNumber": {"type": "STRING", "nullable": true},
10
+ "pickUpReference": {"type": "STRING", "nullable": true},
11
+ "pickUpTerminal": {"type": "STRING", "nullable": true},
12
+ "siCutOff": {"type": "STRING", "nullable": true},
13
+ "vgmCutOff": {"type": "STRING", "nullable": true},
14
+ "transportLegs": {
15
+ "type": "ARRAY",
16
+ "items": {
17
+ "type": "OBJECT",
18
+ "properties": {
19
+ "eta": {"type": "STRING", "nullable": true},
20
+ "etd": {"type": "STRING", "nullable": true},
21
+ "imoNumber": {"type": "STRING", "nullable": true},
22
+ "portOfDischarge": {"type": "STRING", "nullable": true},
23
+ "portOfLoading": {"type": "STRING", "nullable": true},
24
+ "vesselName": {"type": "STRING", "nullable": true},
25
+ "voyage": {"type": "STRING", "nullable": true}
26
+ },
27
+ "required": []
28
+ }
29
+ }
30
+ },
31
+ "required": []
32
+ }
@@ -18,7 +18,7 @@ transportLegs:
18
18
  vesselName: The name of the vessel for a specific leg.
19
19
  voyage: The journey or route taken by the vessel for a specific leg.
20
20
 
21
- your task is to extract the text value of the following entities:
21
+ your task is to extract the text value of the following entities and page numbers starting from 0 where the value was found in the document:
22
22
  SCHEMA_PLACEHOLDER
23
23
 
24
24
  Keywords for datapoints:
@@ -0,0 +1,32 @@
1
+ {
2
+ "type": "OBJECT",
3
+ "properties": {
4
+ "bookingNumber": {"type": "STRING", "nullable": true},
5
+ "cfsCutOff": {"type": "STRING", "nullable": true, "description": "the date by which an LCL (Less than Container Load) shipment needs to be checked in to a CFS (Container Freight Station) to meet its scheduled sailing"},
6
+ "cyCutOff": {"type": "STRING", "nullable": true},
7
+ "gateInReference": {"type": "STRING", "nullable": true},
8
+ "gateInTerminal": {"type": "STRING", "nullable": true},
9
+ "mblNumber": {"type": "STRING", "nullable": true},
10
+ "pickUpReference": {"type": "STRING", "nullable": true},
11
+ "pickUpTerminal": {"type": "STRING", "nullable": true},
12
+ "siCutOff": {"type": "STRING", "nullable": true},
13
+ "vgmCutOff": {"type": "STRING", "nullable": true},
14
+ "transportLegs": {
15
+ "type": "ARRAY",
16
+ "items": {
17
+ "type": "OBJECT",
18
+ "properties": {
19
+ "eta": {"type": "STRING", "nullable": true},
20
+ "etd": {"type": "STRING", "nullable": true},
21
+ "imoNumber": {"type": "STRING", "nullable": true},
22
+ "portOfDischarge": {"type": "STRING", "nullable": true},
23
+ "portOfLoading": {"type": "STRING", "nullable": true},
24
+ "vesselName": {"type": "STRING", "nullable": true},
25
+ "voyage": {"type": "STRING", "nullable": true}
26
+ },
27
+ "required": []
28
+ }
29
+ }
30
+ },
31
+ "required": []
32
+ }
@@ -18,7 +18,7 @@ transportLegs:
18
18
  vesselName: The name of the vessel for a specific leg.
19
19
  voyage: The journey or route taken by the vessel for a specific leg.
20
20
 
21
- your task is to extract the text value of the following entities:
21
+ your task is to extract the text value of the following entities and page numbers starting from 0 where the value was found in the document:
22
22
  SCHEMA_PLACEHOLDER
23
23
 
24
24
  Keywords for datapoints:
@@ -0,0 +1,32 @@
1
+ {
2
+ "type": "OBJECT",
3
+ "properties": {
4
+ "cfsCutOff": {"type": "STRING", "nullable": true, "description": "the date by which an LCL (Less than Container Load) shipment needs to be checked in to a CFS (Container Freight Station) to meet its scheduled sailing"},
5
+ "bookingNumber": {"type": "STRING", "nullable": true},
6
+ "cyCutOff": {"type": "STRING", "nullable": true},
7
+ "gateInReference": {"type": "STRING", "nullable": true},
8
+ "gateInTerminal": {"type": "STRING", "nullable": true},
9
+ "mblNumber": {"type": "STRING", "nullable": true},
10
+ "pickUpReference": {"type": "STRING", "nullable": true},
11
+ "pickUpTerminal": {"type": "STRING", "nullable": true},
12
+ "siCutOff": {"type": "STRING", "nullable": true},
13
+ "vgmCutOff": {"type": "STRING", "nullable": true},
14
+ "transportLegs": {
15
+ "type": "ARRAY",
16
+ "items": {
17
+ "type": "OBJECT",
18
+ "properties": {
19
+ "eta": {"type": "STRING", "nullable": true},
20
+ "etd": {"type": "STRING", "nullable": true},
21
+ "imoNumber": {"type": "STRING", "nullable": true},
22
+ "portOfDischarge": {"type": "STRING", "nullable": true},
23
+ "portOfLoading": {"type": "STRING", "nullable": true},
24
+ "vesselName": {"type": "STRING", "nullable": true},
25
+ "voyage": {"type": "STRING", "nullable": true}
26
+ },
27
+ "required": []
28
+ }
29
+ }
30
+ },
31
+ "required": []
32
+ }
@@ -18,7 +18,7 @@ transportLegs:
18
18
  vesselName: The name of the vessel for a specific leg.
19
19
  voyage: The journey or route taken by the vessel for a specific leg.
20
20
 
21
- your task is to extract the text value of the following entities:
21
+ your task is to extract the text value of the following entities and page numbers starting from 0 where the value was found in the document:
22
22
  SCHEMA_PLACEHOLDER
23
23
 
24
24
  Further explanation and Keywords for the transportLegs part as follows. The below 2 conditions is crucial. Take attention here:
@@ -0,0 +1,32 @@
1
+ {
2
+ "type": "OBJECT",
3
+ "properties": {
4
+ "cfsCutOff": {"type": "STRING", "nullable": true, "description": "the date by which an LCL (Less than Container Load) shipment needs to be checked in to a CFS (Container Freight Station) to meet its scheduled sailing"},
5
+ "bookingNumber": {"type": "STRING", "nullable": true},
6
+ "cyCutOff": {"type": "STRING", "nullable": true},
7
+ "gateInReference": {"type": "STRING", "nullable": true},
8
+ "gateInTerminal": {"type": "STRING", "nullable": true},
9
+ "mblNumber": {"type": "STRING", "nullable": true},
10
+ "pickUpReference": {"type": "STRING", "nullable": true},
11
+ "pickUpTerminal": {"type": "STRING", "nullable": true},
12
+ "siCutOff": {"type": "STRING", "nullable": true},
13
+ "vgmCutOff": {"type": "STRING", "nullable": true},
14
+ "transportLegs": {
15
+ "type": "ARRAY",
16
+ "items": {
17
+ "type": "OBJECT",
18
+ "properties": {
19
+ "eta": {"type": "STRING", "nullable": true},
20
+ "etd": {"type": "STRING", "nullable": true},
21
+ "portOfDischarge": {"type": "STRING", "nullable": true},
22
+ "portOfLoading": {"type": "STRING", "nullable": true},
23
+ "vesselName": {"type": "STRING", "nullable": true},
24
+ "voyage": {"type": "STRING", "nullable": true},
25
+ "imoNumber": {"type": "STRING", "nullable": true}
26
+ },
27
+ "required": []
28
+ }
29
+ }
30
+ },
31
+ "required": []
32
+ }
@@ -1,4 +1,6 @@
1
- bookingNumber: Extract the booking number. This information can be found near the labels "BOOKING ACKNOWLEDGEMENT" or "BOOKING NUMBER".
1
+ your task is to extract the text value of the following entities and page numbers starting from 0 where the value was found in the document:
2
+
3
+ bookingNumber: Extract the booking number. This information can be found near the labels "BOOKING ACKNOWLEDGEMENT" or "BOOKING NUMBER".
2
4
  gateInReference: This field should have the same value as the bookingNumber.
3
5
  cyCutOff: Look for the "INTENDED FCL CY CUT-OFF" label and extract the date and time value.
4
6
  vgmCutOff: Look for the "INTENDED VGM CUT-OFF" label and extract the date and time value.
@@ -0,0 +1,32 @@
1
+ {
2
+ "type": "OBJECT",
3
+ "properties": {
4
+ "cfsCutOff": {"type": "STRING", "nullable": true, "description": "the date by which an LCL (Less than Container Load) shipment needs to be checked in to a CFS (Container Freight Station) to meet its scheduled sailing"},
5
+ "bookingNumber": {"type": "STRING", "nullable": true},
6
+ "cyCutOff": {"type": "STRING", "nullable": true},
7
+ "gateInReference": {"type": "STRING", "nullable": true},
8
+ "gateInTerminal": {"type": "STRING", "nullable": true},
9
+ "mblNumber": {"type": "STRING", "nullable": true},
10
+ "pickUpReference": {"type": "STRING", "nullable": true},
11
+ "pickUpTerminal": {"type": "STRING", "nullable": true},
12
+ "siCutOff": {"type": "STRING", "nullable": true},
13
+ "vgmCutOff": {"type": "STRING", "nullable": true},
14
+ "transportLegs": {
15
+ "type": "ARRAY",
16
+ "items": {
17
+ "type": "OBJECT",
18
+ "properties": {
19
+ "eta": {"type": "STRING", "nullable": true},
20
+ "etd": {"type": "STRING", "nullable": true},
21
+ "imoNumber": {"type": "STRING", "nullable": true},
22
+ "portOfDischarge": {"type": "STRING", "nullable": true},
23
+ "portOfLoading": {"type": "STRING", "nullable": true},
24
+ "vesselName": {"type": "STRING", "nullable": true},
25
+ "voyage": {"type": "STRING", "nullable": true}
26
+ },
27
+ "required": []
28
+ }
29
+ }
30
+ },
31
+ "required": []
32
+ }
@@ -18,7 +18,7 @@ transportLegs:
18
18
  vesselName: The name of the vessel for a specific leg.
19
19
  voyage: The journey or route taken by the vessel for a specific leg.
20
20
 
21
- your task is to extract the text value of the following entities:
21
+ your task is to extract the text value of the following entities and page numbers starting from 0 where the value was found in the document:
22
22
  SCHEMA_PLACEHOLDER
23
23
 
24
24
  Further explanation for the transportLegs part as follows:
@@ -0,0 +1,32 @@
1
+ {
2
+ "type": "OBJECT",
3
+ "properties": {
4
+ "cfsCutOff": {"type": "STRING", "nullable": true, "description": "the date by which an LCL (Less than Container Load) shipment needs to be checked in to a CFS (Container Freight Station) to meet its scheduled sailing"},
5
+ "bookingNumber": {"type": "STRING", "nullable": true},
6
+ "cyCutOff": {"type": "STRING", "nullable": true},
7
+ "gateInReference": {"type": "STRING", "nullable": true},
8
+ "gateInTerminal": {"type": "STRING", "nullable": true},
9
+ "mblNumber": {"type": "STRING", "nullable": true},
10
+ "pickUpReference": {"type": "STRING", "nullable": true},
11
+ "pickUpTerminal": {"type": "STRING", "nullable": true},
12
+ "siCutOff": {"type": "STRING", "nullable": true},
13
+ "vgmCutOff": {"type": "STRING", "nullable": true},
14
+ "transportLegs": {
15
+ "type": "ARRAY",
16
+ "items": {
17
+ "type": "OBJECT",
18
+ "properties": {
19
+ "eta": {"type": "STRING", "nullable": true},
20
+ "etd": {"type": "STRING", "nullable": true},
21
+ "imoNumber": {"type": "STRING", "nullable": true},
22
+ "portOfDischarge": {"type": "STRING", "nullable": true},
23
+ "portOfLoading": {"type": "STRING", "nullable": true},
24
+ "vesselName": {"type": "STRING", "nullable": true},
25
+ "voyage": {"type": "STRING", "nullable": true}
26
+ },
27
+ "required": []
28
+ }
29
+ }
30
+ },
31
+ "required": []
32
+ }
@@ -18,7 +18,7 @@ transportLegs:
18
18
  vesselName: The name of the vessel for a specific leg.
19
19
  voyage: The journey or route taken by the vessel for a specific leg.
20
20
 
21
- your task is to extract the text value of the following entities:
21
+ your task is to extract the text value of the following entities and page numbers starting from 0 where the value was found in the document:
22
22
  SCHEMA_PLACEHOLDER
23
23
 
24
24
  Keywords for datapoints:
@@ -2,47 +2,47 @@
2
2
  "type": "OBJECT",
3
3
  "properties": {
4
4
  "currencyCode": {
5
- "type": "string",
5
+ "type": "STRING",
6
6
  "nullable": true,
7
7
  "description": "The currency in which the invoice is issued."
8
8
  },
9
9
  "grandTotal": {
10
- "type": "string",
10
+ "type": "STRING",
11
11
  "nullable": true,
12
12
  "description": "The overall total amount of the invoice."
13
13
  },
14
14
  "issueDate": {
15
- "type": "string",
15
+ "type": "STRING",
16
16
  "nullable": true,
17
17
  "description": "The date the document was issued."
18
18
  },
19
19
  "recipientAddress": {
20
- "type": "string",
20
+ "type": "STRING",
21
21
  "nullable": true,
22
22
  "description": "The address of the recipient."
23
23
  },
24
24
  "recipientName": {
25
- "type": "string",
25
+ "type": "STRING",
26
26
  "nullable": true,
27
27
  "description": "The name of the recipient."
28
28
  },
29
29
  "serviceDate": {
30
- "type": "string",
30
+ "type": "STRING",
31
31
  "nullable": true,
32
32
  "description": "The date of service or transaction."
33
33
  },
34
34
  "shipmentId": {
35
- "type": "string",
35
+ "type": "STRING",
36
36
  "nullable": true,
37
37
  "description": "Starting with an \"S\" and followed by 6 or 7 digits. Example: S124321"
38
38
  },
39
39
  "vendorName": {
40
- "type": "string",
40
+ "type": "STRING",
41
41
  "nullable": true,
42
42
  "description": "The name of the vendor."
43
43
  },
44
44
  "vendorAddress": {
45
- "type": "string",
45
+ "type": "STRING",
46
46
  "nullable": true,
47
47
  "description": "The address of the vendor."
48
48
  },
@@ -52,37 +52,37 @@
52
52
  "type": "OBJECT",
53
53
  "properties": {
54
54
  "deferredDutyPayer": {
55
- "type": "string",
55
+ "type": "STRING",
56
56
  "nullable": true,
57
57
  "description": "It can be identified under \"Aufschubenhmer\" for each line item"
58
58
  },
59
59
  "name": {
60
- "type": "string",
60
+ "type": "STRING",
61
61
  "nullable": true,
62
62
  "description": "The name or description of the line item A0000 and B0000"
63
63
  },
64
64
  "taxType": {
65
- "type": "string",
65
+ "type": "STRING",
66
66
  "nullable": true,
67
67
  "description": "It's a line item mentioned in the invoice. For example; A0000 and B0000"
68
68
  },
69
69
  "totalAmount": {
70
- "type": "string",
70
+ "type": "STRING",
71
71
  "nullable": true,
72
72
  "description": "The total amount for the line item."
73
73
  },
74
74
  "totalAmountCurrency": {
75
- "type": "string",
75
+ "type": "STRING",
76
76
  "nullable": true,
77
77
  "description": "The currency of the total amount."
78
78
  },
79
79
  "vatId": {
80
- "type": "string",
80
+ "type": "STRING",
81
81
  "nullable": true,
82
82
  "description": "The VAT identification number. This is named a Konto-Nummer for each line item."
83
83
  },
84
84
  "dueDate": {
85
- "type": "string",
85
+ "type": "STRING",
86
86
  "nullable": true,
87
87
  "description": "It's a due date. Due date to pay the amount. It's usually mentioned either in a date or a number of days format"
88
88
  }
@@ -91,20 +91,20 @@
91
91
  }
92
92
  },
93
93
  "invoiceNumber": {
94
- "type": "string",
94
+ "type": "STRING",
95
95
  "nullable": true,
96
96
  "description": "Invoice Number is a unique identifier for the invoice, it starts with \"ATC\", \"AT-C\", or \"AT/C\" only (e.g., ATC40, AT-C-40-, AT/C/40/....). Do NOT extract \"NIZZA-Registrierkennzeichen number."
97
97
  },
98
98
  "containerNumber": {
99
99
  "type": "ARRAY",
100
100
  "items": {
101
- "type": "string",
101
+ "type": "STRING",
102
102
  "nullable": true,
103
103
  "description": "The unique identifier for each container. It always starts with 4 capital letters and followed by 7 digits. Example: TEMU7972458."
104
104
  }
105
105
  },
106
106
  "creditNoteInvoiceNumber": {
107
- "type": "string",
107
+ "type": "STRING",
108
108
  "nullable": true,
109
109
  "description": "The unique identifier for the associated Invoice. The number usually starts with ATS..."
110
110
  }