data-science-document-ai 1.40.4__tar.gz → 1.42.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/PKG-INFO +1 -1
  2. {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/pyproject.toml +1 -1
  3. {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/docai.py +14 -5
  4. {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/excel_processing.py +14 -4
  5. {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/io.py +26 -1
  6. {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/llm.py +10 -3
  7. {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/pdf_processing.py +46 -10
  8. {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/postprocessing/common.py +34 -5
  9. data_science_document_ai-1.42.0/src/prompts/library/bookingConfirmation/evergreen/placeholders.json +32 -0
  10. {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/prompts/library/bookingConfirmation/evergreen/prompt.txt +1 -0
  11. data_science_document_ai-1.42.0/src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +32 -0
  12. {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +1 -1
  13. data_science_document_ai-1.42.0/src/prompts/library/bookingConfirmation/maersk/placeholders.json +32 -0
  14. {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/prompts/library/bookingConfirmation/maersk/prompt.txt +1 -1
  15. data_science_document_ai-1.42.0/src/prompts/library/bookingConfirmation/msc/placeholders.json +32 -0
  16. {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/prompts/library/bookingConfirmation/msc/prompt.txt +1 -1
  17. data_science_document_ai-1.42.0/src/prompts/library/bookingConfirmation/oocl/placeholders.json +32 -0
  18. {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/prompts/library/bookingConfirmation/oocl/prompt.txt +3 -1
  19. data_science_document_ai-1.42.0/src/prompts/library/bookingConfirmation/other/placeholders.json +32 -0
  20. {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/prompts/library/bookingConfirmation/other/prompt.txt +1 -1
  21. data_science_document_ai-1.42.0/src/prompts/library/bookingConfirmation/yangming/placeholders.json +32 -0
  22. {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/prompts/library/bookingConfirmation/yangming/prompt.txt +1 -1
  23. {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/prompts/library/bundeskasse/other/placeholders.json +19 -19
  24. {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/prompts/library/bundeskasse/other/prompt.txt +1 -1
  25. {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/prompts/library/commercialInvoice/other/prompt.txt +2 -1
  26. {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/prompts/library/customsAssessment/other/prompt.txt +1 -1
  27. {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/prompts/library/customsInvoice/other/placeholders.json +19 -19
  28. {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/prompts/library/customsInvoice/other/prompt.txt +1 -1
  29. data_science_document_ai-1.42.0/src/prompts/library/deliveryOrder/other/placeholders.json +29 -0
  30. {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/prompts/library/deliveryOrder/other/prompt.txt +1 -1
  31. {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/prompts/library/draftMbl/hapag-lloyd/prompt.txt +2 -1
  32. {data_science_document_ai-1.40.4/src/prompts/library/finalMbL → data_science_document_ai-1.42.0/src/prompts/library/draftMbl}/maersk/prompt.txt +2 -0
  33. {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/prompts/library/draftMbl/other/prompt.txt +1 -1
  34. {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/prompts/library/finalMbL/hapag-lloyd/prompt.txt +1 -1
  35. {data_science_document_ai-1.40.4/src/prompts/library/draftMbl → data_science_document_ai-1.42.0/src/prompts/library/finalMbL}/maersk/prompt.txt +2 -0
  36. {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/prompts/library/finalMbL/other/prompt.txt +1 -1
  37. {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/prompts/library/packingList/other/prompt.txt +1 -1
  38. {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/prompts/library/partnerInvoice/other/placeholders.json +12 -60
  39. {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/prompts/library/partnerInvoice/other/prompt.txt +1 -1
  40. {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/prompts/library/shippingInstruction/other/prompt.txt +1 -0
  41. {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/prompts/prompt_library.py +4 -0
  42. {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/setup.py +5 -1
  43. {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/utils.py +64 -4
  44. data_science_document_ai-1.40.4/src/prompts/library/bookingConfirmation/evergreen/placeholders.json +0 -32
  45. data_science_document_ai-1.40.4/src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +0 -32
  46. data_science_document_ai-1.40.4/src/prompts/library/bookingConfirmation/maersk/placeholders.json +0 -32
  47. data_science_document_ai-1.40.4/src/prompts/library/bookingConfirmation/msc/placeholders.json +0 -32
  48. data_science_document_ai-1.40.4/src/prompts/library/bookingConfirmation/oocl/placeholders.json +0 -32
  49. data_science_document_ai-1.40.4/src/prompts/library/bookingConfirmation/other/placeholders.json +0 -32
  50. data_science_document_ai-1.40.4/src/prompts/library/bookingConfirmation/yangming/placeholders.json +0 -32
  51. data_science_document_ai-1.40.4/src/prompts/library/customsAssessment/other/placeholders.json +0 -19
  52. data_science_document_ai-1.40.4/src/prompts/library/deliveryOrder/other/placeholders.json +0 -31
  53. data_science_document_ai-1.40.4/src/prompts/library/finalMbL/other/placeholders.json +0 -80
  54. {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/constants.py +0 -0
  55. {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/constants_sandbox.py +0 -0
  56. {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/docai_processor_config.yaml +0 -0
  57. {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/log_setup.py +0 -0
  58. {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/postprocessing/postprocess_booking_confirmation.py +0 -0
  59. {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/postprocessing/postprocess_commercial_invoice.py +0 -0
  60. {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/postprocessing/postprocess_partner_invoice.py +0 -0
  61. {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/prompts/library/draftMbl/other/placeholders.json +0 -0
  62. {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/prompts/library/postprocessing/port_code/placeholders.json +0 -0
  63. {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/prompts/library/postprocessing/port_code/prompt_port_code.txt +0 -0
  64. {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/prompts/library/preprocessing/carrier/placeholders.json +0 -0
  65. {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/prompts/library/preprocessing/carrier/prompt.txt +0 -0
  66. {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/tms.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: data-science-document-ai
-Version: 1.40.4
+Version: 1.42.0
 Summary: "Document AI repo for data science"
 Author: Naomi Nguyen
 Author-email: naomi.nguyen@forto.com
pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "data-science-document-ai"
-version = "1.40.4"
+version = "1.42.0"
 description = "\"Document AI repo for data science\""
 authors = ["Naomi Nguyen <naomi.nguyen@forto.com>", "Kumar Rajendrababu <kumar.rajendrababu@forto.com>", "Igor Tonko <igor.tonko@forto.com>", "Osman Demirel <osman.demirel@forto.com>"]
 packages = [
src/docai.py
@@ -3,11 +3,16 @@ import re

 from google.cloud import documentai

-from src.io import delete_folder_from_bucket, logger, upload_pdf_to_bucket
+from src.io import (
+    delete_folder_from_bucket,
+    get_gcp_labels,
+    logger,
+    upload_pdf_to_bucket,
+)
 from src.utils import cache_on_disk


-async def _process_pdf_w_docai(image_content, client, processor_name):
+async def _process_pdf_w_docai(image_content, client, processor_name, doc_type=None):
     """Process the PDF using Document AI.

     Args:
@@ -15,6 +20,7 @@ async def _process_pdf_w_docai(image_content, client, processor_name):
         client: The Document AI client.
         processor_name (str): The name of the processor to be used.
             e.g.: projects/{project_id}/locations/{location}/processor/{processor_id}
+        doc_type (str, optional): Document type for cost tracking labels.

     Returns:
         The processed document.
@@ -24,10 +30,11 @@ async def _process_pdf_w_docai(image_content, client, processor_name):
         content=image_content, mime_type="application/pdf"
     )

-    # Configure the process request
+    # Configure the process request with labels for cost tracking
     request = documentai.ProcessRequest(
         name=processor_name,
         raw_document=raw_document,  # field_mask=field_mask
+        labels=get_gcp_labels(doc_type=doc_type),
     )
     result = await cache_on_disk(client.process_document, request=request)

@@ -35,7 +42,7 @@ async def _process_pdf_w_docai(image_content, client, processor_name):


 async def _batch_process_pdf_w_docai(
-    params, image_content, client, processor_name, timeout=1200
+    params, image_content, client, processor_name, timeout=1200, doc_type=None
 ):
     """Process the PDF using Document AI Batch Process API.

@@ -45,6 +52,7 @@ async def _batch_process_pdf_w_docai(
         processor_name (str): The name of the processor to be used.
             e.g.: projects/{project_id}/locations/{location}/processor/{processor_id}
         timeout (int, optional): The timeout in seconds. Defaults to 1200.
+        doc_type (str, optional): Document type for cost tracking labels.

     Returns:
         The processed document.
@@ -72,11 +80,12 @@ async def _batch_process_pdf_w_docai(
     # Where to write results
     output_config = documentai.DocumentOutputConfig(gcs_output_config=gcs_output_config)

-    # The full resource name of the processor
+    # The full resource name of the processor with labels for cost tracking
     request = documentai.BatchProcessRequest(
         name=processor_name,
         input_documents=input_config,
         document_output_config=output_config,
+        labels=get_gcp_labels(doc_type=doc_type),
     )

     # BatchProcess returns a Long Running Operation (LRO)
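Taken together, the docai.py changes just thread one extra keyword through to the request objects. A minimal sketch of the resulting request, assuming an async Document AI client and the `get_gcp_labels` helper added in src/io.py below; the processor path is a placeholder:

```python
from google.cloud import documentai

from src.io import get_gcp_labels  # helper introduced in this release


async def process_sample(pdf_bytes: bytes):
    client = documentai.DocumentProcessorServiceAsyncClient()
    # Placeholder resource name; see the docstring format above.
    processor_name = "projects/my-project/locations/eu/processors/abc123"
    request = documentai.ProcessRequest(
        name=processor_name,
        raw_document=documentai.RawDocument(
            content=pdf_bytes, mime_type="application/pdf"
        ),
        # The same labels dict is attached to both ProcessRequest and
        # BatchProcessRequest in the hunks above.
        labels=get_gcp_labels(doc_type="bookingConfirmation"),
    )
    result = await client.process_document(request=request)
    return result.document
```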
src/excel_processing.py
@@ -2,6 +2,8 @@
 # flake8: noqa: E402
 import logging

+from src.postprocessing.common import llm_prediction_to_tuples
+
 logger = logging.getLogger(__name__)

 import asyncio
@@ -14,7 +16,9 @@ from src.llm import prompt_excel_extraction
 from src.utils import generate_schema_structure, get_excel_sheets


-async def extract_data_from_sheet(params, sheet_name, sheet, response_schema):
+async def extract_data_from_sheet(
+    params, sheet_name, sheet, response_schema, doc_type=None
+):
     logger.info(f"Processing sheet: {sheet_name}")
     excel_content = pd.DataFrame(sheet.values)
     # Convert to Markdown format for the LLM model
@@ -30,6 +34,7 @@ async def extract_data_from_sheet(params, sheet_name, sheet, response_schema):
         result = await params["LlmClient"].get_unified_json_genai(
             prompt_docai,
             response_schema=response_schema,
+            doc_type=doc_type,
         )
     except Exception as e:
         result = {}
@@ -67,12 +72,17 @@ async def extract_data_from_excel(
     # Excel files may contain multiple sheets. Extract data from each sheet
     sheet_extract_tasks = [
         extract_data_from_sheet(
-            params, sheet_name, workbook[sheet_name], response_schema
+            params,
+            sheet_name,
+            workbook[sheet_name],
+            response_schema,
+            doc_type=input_doc_type,
         )
         for sheet_name in sheets
     ]
     extracted_data = {k: v for k, v in await asyncio.gather(*sheet_extract_tasks)}

-    stored_data = json.dumps(extracted_data)
+    # Convert LLM prediction dictionary to tuples of (value, page_number).
+    extracted_data = llm_prediction_to_tuples(extracted_data)

-    return extracted_data, stored_data, params["gemini_params"]["model_id"]
+    return extracted_data, extracted_data, params["gemini_params"]["model_id"]
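The conversion step referenced by the comment above is elided from this hunk; given `excel_content = pd.DataFrame(sheet.values)`, it is presumably something like `DataFrame.to_markdown()`. A toy illustration with invented rows:

```python
import pandas as pd

# Stand-in for sheet.values from an openpyxl worksheet (invented data).
rows = [("bookingNumber", "vesselName"), ("EBKG01916972", "EVER GIVEN")]
excel_content = pd.DataFrame(rows)

# Markdown table that would be embedded into the extraction prompt.
print(excel_content.to_markdown())
```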
src/io.py
@@ -11,6 +11,28 @@ from pathlib import Path

 from google.cloud import bigquery, storage

+from src.constants import project_parameters
+
+
+def get_gcp_labels(**extra_labels):
+    """Generate standardized GCP labels for cost tracking.
+
+    Args:
+        **extra_labels: Additional custom labels
+
+    Returns:
+        dict: Labels dictionary with keys normalized (lowercase, hyphens, max 63 chars)
+    """
+    labels = {
+        "ds-project-name": project_parameters["project_name"],
+        "ds-env": os.getenv("CLUSTER", "local").lower(),
+    }
+
+    # Add any extra labels passed in
+    labels.update({k.lower(): str(v).lower() for k, v in extra_labels.items()})
+
+    return labels
+

 def get_bq_client(params):
     """Get Google BigQuery client."""
@@ -18,7 +40,7 @@ def get_bq_client(params):
     job_config = bigquery.QueryJobConfig(
         allow_large_results=True,
         # flatten_results=True,
-        labels={"project-name": params["project_name"]},
+        labels=get_gcp_labels(),
     )
     return bq_client, job_config

@@ -112,3 +134,6 @@ def download_dir_from_bucket(bucket, directory_cloud, directory_local) -> bool:
         Path(directory).mkdir(parents=True, exist_ok=True)
         blob.download_to_filename(directory_local / Path(blob.name))
     return result
+
+
+# type: ignore
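Given the implementation above, the helper's output is easy to pin down (the project name below is illustrative). Note that only the extra labels are passed through `str(v).lower()`, so a `doc_type=None` would literally become the string `"none"`:

```python
from src.io import get_gcp_labels

# With CLUSTER unset, ds-env falls back to "local".
print(get_gcp_labels(doc_type="bookingConfirmation"))
# {'ds-project-name': 'document-ai',   # whatever project_parameters holds
#  'ds-env': 'local',
#  'doc_type': 'bookingconfirmation'}  # extra label values are lowercased
```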
src/llm.py
@@ -15,6 +15,7 @@ from vertexai.generative_models import (
     Part,
 )

+from src.io import get_gcp_labels
 from src.utils import cache_on_disk


@@ -69,6 +70,7 @@ class LlmClient:
         document: str = None,
         response_schema: dict = None,
         response_mime_type: str = "application/json",
+        doc_type: str = None,
     ):
         """Ask the Gemini model a question.

@@ -76,6 +78,7 @@
             prompt (str): The prompt to send to the model.
             document (str, optional): An optional document to provide context.
             response_schema (dict, optional): Defines a specific response schema for the model.
+            doc_type (str, optional): Document type for cost tracking labels.

         Returns:
             str: The response from the model.
@@ -96,12 +99,13 @@
         # Prepare inputs for the model
         inputs = [document, prompt] if document else prompt

-        # Generate the response
+        # Generate the response with labels for cost tracking
         model_response = await cache_on_disk(
             self.geminy_client.generate_content_async,
             contents=inputs,
             generation_config=config,
             safety_settings=self.safety_config,
+            labels=get_gcp_labels(doc_type=doc_type),
         )

         response_text = model_response.text
@@ -113,7 +117,7 @@
             return "{}"

     async def get_unified_json_genai(
-        self, prompt, document=None, response_schema=None, model="gemini"
+        self, prompt, document=None, response_schema=None, model="gemini", doc_type=None
     ):
         """Send a prompt to a Google Cloud AI Platform model and returns the generated json.

@@ -122,6 +126,7 @@
             document: Content of the PDF document
             response_schema: The schema to use for the response
             model (str): The model to use for the response ["gemini" or "chatGPT"]. Default is "gemini".
+            doc_type (str, optional): Document type for cost tracking labels.

         Returns:
             dict: The generated json from the model.
@@ -131,7 +136,9 @@
             response = await self.ask_chatgpt(prompt, document, response_schema)
         else:
             # Default to Gemini
-            response = await self.ask_gemini(prompt, document, response_schema)
+            response = await self.ask_gemini(
+                prompt, document, response_schema, doc_type=doc_type
+            )

         try:
             return json.loads(response)
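A hypothetical call site for the extended signature; `llm_client` and `pdf_part` construction are elided here, and the schema is trimmed to a single field:

```python
async def extract_booking_number(llm_client, pdf_part):
    # Trimmed, illustrative schema; the real ones live in placeholders.json.
    response_schema = {
        "type": "OBJECT",
        "properties": {"bookingNumber": {"type": "STRING", "nullable": True}},
    }
    return await llm_client.get_unified_json_genai(
        prompt="Extract the entities defined in the schema.",
        document=pdf_part,  # e.g. a vertexai Part wrapping the PDF bytes
        response_schema=response_schema,
        doc_type="bookingConfirmation",  # forwarded into the request labels
    )
```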
src/pdf_processing.py
@@ -14,7 +14,11 @@ from google.cloud.documentai_v1 import Document as docaiv1_document

 from src.docai import _batch_process_pdf_w_docai, _process_pdf_w_docai
 from src.excel_processing import extract_data_from_excel
-from src.postprocessing.common import format_all_entities, remove_none_values
+from src.postprocessing.common import (
+    format_all_entities,
+    llm_prediction_to_tuples,
+    remove_none_values,
+)
 from src.postprocessing.postprocess_booking_confirmation import (
     postprocess_booking_confirmation,
 )
@@ -30,11 +34,14 @@ from src.utils import (
     generate_schema_structure,
     get_processor_name,
     run_background_tasks,
+    transform_schema_strings,
     validate_based_on_schema,
 )


-async def process_file_w_docai(params, image_content, client, processor_name):
+async def process_file_w_docai(
+    params, image_content, client, processor_name, doc_type=None
+):
     """
     Process a file using Document AI.

@@ -43,6 +50,7 @@ async def process_file_w_docai(params, image_content, client, processor_name):
         image_content (bytes): The file to be processed. It can be bytes object.
         client: The Document AI client.
         processor_name (str): The name of the processor to be used.
+        doc_type (str, optional): Document type for cost tracking labels.

     Returns:
         The processed document.
@@ -54,7 +62,9 @@ async def process_file_w_docai(params, image_content, client, processor_name):

     try:
         logger.info("Processing document...")
-        result = await _process_pdf_w_docai(image_content, client, processor_name)
+        result = await _process_pdf_w_docai(
+            image_content, client, processor_name, doc_type=doc_type
+        )
     except Exception as e:
         if e.reason == "PAGE_LIMIT_EXCEEDED":
             logger.warning(
@@ -63,7 +73,7 @@ async def process_file_w_docai(params, image_content, client, processor_name):
             # Process the document in batch method (offline processing)
             try:
                 result = await _batch_process_pdf_w_docai(
-                    params, image_content, client, processor_name
+                    params, image_content, client, processor_name, doc_type=doc_type
                 )
             except Exception as batch_e:
                 logger.error(f"Error processing document {batch_e}.")
@@ -93,7 +103,7 @@ async def extract_data_from_pdf_w_docai(
     )

     result = await process_file_w_docai(
-        params, file_content, processor_client, processor_name
+        params, file_content, processor_client, processor_name, doc_type=input_doc_type
     )

     # Create an entity object to store the result in gcs
@@ -104,9 +114,22 @@
     # Extract entities from the result
     for entity in result.entities:
         value = (
-            {child.type_: child.mention_text for child in entity.properties}
+            {
+                child.type_: (
+                    child.mention_text,
+                    child.page_anchor.page_refs[0].page
+                    if hasattr(child.page_anchor.page_refs[0], "page")
+                    else 0,
+                )
+                for child in entity.properties
+            }
             if entity.properties
-            else entity.mention_text
+            else (
+                entity.mention_text,
+                entity.page_anchor.page_refs[0].page
+                if hasattr(entity.page_anchor.page_refs[0], "page")
+                else 0,
+            )
         )
         aggregated_data[entity.type_].append(value)
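After this change every leaf of `aggregated_data` carries the page index alongside the mention text. An illustrative shape, with invented values:

```python
# defaultdict(list) keyed by entity type; leaves are (mention_text, page) tuples.
aggregated_data = {
    "bookingNumber": [("EBKG01916972", 0)],
    "transportLegs": [
        {
            "portOfLoading": ("HAMBURG", 0),
            "portOfDischarge": ("SINGAPORE", 1),
        }
    ],
}
```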
@@ -137,7 +160,9 @@
     return aggregated_data, result_for_store, processor_version


-async def identify_carrier(document, llm_client, prompt, response_schema):
+async def identify_carrier(
+    document, llm_client, prompt, response_schema, doc_type=None
+):
     """Identify the carrier from the Booking Confirmation document."""

     result = await llm_client.ask_gemini(
@@ -145,6 +170,7 @@ async def identify_carrier(document, llm_client, prompt, response_schema):
         document=document,
         response_schema=response_schema,
         response_mime_type="text/x.enum",
+        doc_type=doc_type,
     )

     if result:
@@ -201,7 +227,11 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):

     # identify carrier for customized prompting
     carrier = await identify_carrier(
-        document, llm_client, carrier_prompt, carrier_schema
+        document,
+        llm_client,
+        carrier_prompt,
+        carrier_schema,
+        doc_type=input_doc_type,
     )

     if input_doc_type == "bookingConfirmation":
@@ -218,8 +248,14 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):

         # generate the result with LLM (gemini)
         result = await llm_client.get_unified_json_genai(
-            prompt=prompt, document=document, response_schema=response_schema
+            prompt=prompt,
+            document=document,
+            response_schema=response_schema,
+            doc_type=input_doc_type,
         )
+
+        result = llm_prediction_to_tuples(result)
+
         return result
     return {}
src/postprocessing/common.py
@@ -380,11 +380,18 @@
             ]
         )
         return entity_k, [v for _, v in format_tasks]
+    if isinstance(entity_value, tuple):
+        page = entity_value[1]
+        entity_value = entity_value[0]
+    else:
+        page = -1
     entity_key = entity_k.lower()
     formatted_value = None

     if entity_key.startswith("port"):
-        formatted_value = await get_port_code_ai(entity_value, llm_client)
+        formatted_value = await get_port_code_ai(
+            entity_value, llm_client, doc_type=document_type_code
+        )

     elif (entity_key == "containertype") or (entity_key == "containersize"):
         formatted_value = get_tms_mappings(entity_value, "container_types")
@@ -474,18 +481,19 @@
     result = {
         "documentValue": entity_value,
         "formattedValue": formatted_value,
+        "page": page,
     }
     return entity_k, result


-async def get_port_code_ai(port: str, llm_client):
+async def get_port_code_ai(port: str, llm_client, doc_type=None):
     """Get port code using AI model."""
-    port_llm = await get_port_code_llm(port, llm_client)
+    port_llm = await get_port_code_llm(port, llm_client, doc_type=doc_type)

     return get_tms_mappings(port, "ports", port_llm)


-async def get_port_code_llm(port: str, llm_client):
+async def get_port_code_llm(port: str, llm_client, doc_type=None):
     if (
         "postprocessing" in prompt_library.library.keys()
         and "port_code" in prompt_library.library["postprocessing"].keys()
@@ -512,7 +520,7 @@
     }

     response = await llm_client.get_unified_json_genai(
-        prompt, response_schema=response_schema, model="chatgpt"
+        prompt, response_schema=response_schema, model="chatgpt", doc_type=doc_type
     )
     try:
         mapped_port = response["port"]
@@ -616,3 +624,24 @@ def remove_stop_words(lineitem: str):
         .upper()
         .strip()
     )
+
+
+def llm_prediction_to_tuples(llm_prediction):
+    """Convert LLM prediction dictionary to tuples of (value, page_number)."""
+    if isinstance(llm_prediction, dict):
+        if "page_number" in llm_prediction.keys() and "value" in llm_prediction.keys():
+            if llm_prediction["value"]:
+                try:
+                    page_number = int(llm_prediction["page_number"])
+                except:  # noqa: E722
+                    page_number = -1
+                return (llm_prediction["value"], page_number)
+            return None
+        for key, value in llm_prediction.items():
+            llm_prediction[key] = llm_prediction_to_tuples(
+                llm_prediction.get(key, value)
+            )
+    elif isinstance(llm_prediction, list):
+        for i, item in enumerate(llm_prediction):
+            llm_prediction[i] = llm_prediction_to_tuples(item)
+    return llm_prediction
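A worked example of the new helper, using an invented prediction: the updated prompts (below) ask the model to return `{value, page_number}` objects, which this function collapses recursively into `(value, page)` tuples; empty values become None and unparsable page numbers fall back to -1:

```python
prediction = {
    "bookingNumber": {"value": "EBKG01916972", "page_number": "0"},
    "transportLegs": [
        {
            "portOfLoading": {"value": "HAMBURG", "page_number": 1},
            "vesselName": {"value": "", "page_number": "n/a"},
        }
    ],
}

print(llm_prediction_to_tuples(prediction))
# {'bookingNumber': ('EBKG01916972', 0),
#  'transportLegs': [{'portOfLoading': ('HAMBURG', 1), 'vesselName': None}]}
```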
src/prompts/library/bookingConfirmation/evergreen/placeholders.json (new file)
@@ -0,0 +1,32 @@
+{
+    "type": "OBJECT",
+    "properties": {
+        "cfsCutOff": {"type": "STRING", "nullable": true, "description": "the date by which an LCL (Less than Container Load) shipment needs to be checked in to a CFS (Container Freight Station) to meet its scheduled sailing"},
+        "bookingNumber": {"type": "STRING", "nullable": true},
+        "cyCutOff": {"type": "STRING", "nullable": true},
+        "gateInReference": {"type": "STRING", "nullable": true},
+        "gateInTerminal": {"type": "STRING", "nullable": true},
+        "mblNumber": {"type": "STRING", "nullable": true},
+        "pickUpReference": {"type": "STRING", "nullable": true},
+        "pickUpTerminal": {"type": "STRING", "nullable": true},
+        "siCutOff": {"type": "STRING", "nullable": true},
+        "vgmCutOff": {"type": "STRING", "nullable": true},
+        "transportLegs": {
+            "type": "ARRAY",
+            "items": {
+                "type": "OBJECT",
+                "properties": {
+                    "eta": {"type": "STRING", "nullable": true},
+                    "etd": {"type": "STRING", "nullable": true},
+                    "imoNumber": {"type": "STRING", "nullable": true},
+                    "portOfDischarge": {"type": "STRING", "nullable": true},
+                    "portOfLoading": {"type": "STRING", "nullable": true},
+                    "vesselName": {"type": "STRING", "nullable": true},
+                    "voyage": {"type": "STRING", "nullable": true}
+                },
+                "required": []
+            }
+        }
+    },
+    "required": []
+}
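Note an apparent mismatch: this schema (repeated per carrier below) declares every field as a plain STRING, while the updated prompts ask for page numbers and `llm_prediction_to_tuples` expects `{value, page_number}` objects. The bridge is plausibly `transform_schema_strings`, newly imported in src/pdf_processing.py; its body is not part of this diff, so the following is only an assumed sketch of the rewrite it would perform:

```python
# ASSUMPTION: transform_schema_strings (not shown in this diff) presumably
# rewrites each STRING leaf of the response schema into an OBJECT carrying
# the extracted value plus the page number the prompts now ask for.
before = {"bookingNumber": {"type": "STRING", "nullable": True}}
after = {
    "bookingNumber": {
        "type": "OBJECT",
        "properties": {
            "value": {"type": "STRING", "nullable": True},
            "page_number": {"type": "STRING", "nullable": True},
        },
    }
}
```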
src/prompts/library/bookingConfirmation/evergreen/prompt.txt
@@ -1,3 +1,4 @@
+your task is to extract the text value of the following entities and page numbers starting from 0 where the value was found in the document:
 ```json
 {
     "mblNumber": "Extract the value after the label 'BOOKING NO.'.",
src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json (new file)
@@ -0,0 +1,32 @@
+{
+    "type": "OBJECT",
+    "properties": {
+        "cfsCutOff": {"type": "STRING", "nullable": true, "description": "the date by which an LCL (Less than Container Load) shipment needs to be checked in to a CFS (Container Freight Station) to meet its scheduled sailing"},
+        "bookingNumber": {"type": "STRING", "nullable": true},
+        "cyCutOff": {"type": "STRING", "nullable": true},
+        "gateInReference": {"type": "STRING", "nullable": true},
+        "gateInTerminal": {"type": "STRING", "nullable": true},
+        "mblNumber": {"type": "STRING", "nullable": true},
+        "pickUpReference": {"type": "STRING", "nullable": true},
+        "pickUpTerminal": {"type": "STRING", "nullable": true},
+        "siCutOff": {"type": "STRING", "nullable": true},
+        "vgmCutOff": {"type": "STRING", "nullable": true},
+        "transportLegs": {
+            "type": "ARRAY",
+            "items": {
+                "type": "OBJECT",
+                "properties": {
+                    "eta": {"type": "STRING", "nullable": true},
+                    "etd": {"type": "STRING", "nullable": true},
+                    "imoNumber": {"type": "STRING", "nullable": true},
+                    "portOfDischarge": {"type": "STRING", "nullable": true},
+                    "portOfLoading": {"type": "STRING", "nullable": true},
+                    "vesselName": {"type": "STRING", "nullable": true},
+                    "voyage": {"type": "STRING", "nullable": true}
+                },
+                "required": []
+            }
+        }
+    },
+    "required": []
+}
src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt
@@ -18,7 +18,7 @@ transportLegs:
 vesselName: The name of the vessel for a specific leg.
 voyage: The journey or route taken by the vessel for a specific leg.

-your task is to extract the text value of the following entities:
+your task is to extract the text value of the following entities and page numbers starting from 0 where the value was found in the document:
 SCHEMA_PLACEHOLDER

 Keywords for datapoints:
src/prompts/library/bookingConfirmation/maersk/placeholders.json (new file)
@@ -0,0 +1,32 @@
+{
+    "type": "OBJECT",
+    "properties": {
+        "bookingNumber": {"type": "STRING", "nullable": true},
+        "cfsCutOff": {"type": "STRING", "nullable": true, "description": "the date by which an LCL (Less than Container Load) shipment needs to be checked in to a CFS (Container Freight Station) to meet its scheduled sailing"},
+        "cyCutOff": {"type": "STRING", "nullable": true},
+        "gateInReference": {"type": "STRING", "nullable": true},
+        "gateInTerminal": {"type": "STRING", "nullable": true},
+        "mblNumber": {"type": "STRING", "nullable": true},
+        "pickUpReference": {"type": "STRING", "nullable": true},
+        "pickUpTerminal": {"type": "STRING", "nullable": true},
+        "siCutOff": {"type": "STRING", "nullable": true},
+        "vgmCutOff": {"type": "STRING", "nullable": true},
+        "transportLegs": {
+            "type": "ARRAY",
+            "items": {
+                "type": "OBJECT",
+                "properties": {
+                    "eta": {"type": "STRING", "nullable": true},
+                    "etd": {"type": "STRING", "nullable": true},
+                    "imoNumber": {"type": "STRING", "nullable": true},
+                    "portOfDischarge": {"type": "STRING", "nullable": true},
+                    "portOfLoading": {"type": "STRING", "nullable": true},
+                    "vesselName": {"type": "STRING", "nullable": true},
+                    "voyage": {"type": "STRING", "nullable": true}
+                },
+                "required": []
+            }
+        }
+    },
+    "required": []
+}
src/prompts/library/bookingConfirmation/maersk/prompt.txt
@@ -18,7 +18,7 @@ transportLegs:
 vesselName: The name of the vessel for a specific leg.
 voyage: The journey or route taken by the vessel for a specific leg.

-your task is to extract the text value of the following entities:
+your task is to extract the text value of the following entities and page numbers starting from 0 where the value was found in the document:
 SCHEMA_PLACEHOLDER

 Keywords for datapoints:
src/prompts/library/bookingConfirmation/msc/placeholders.json (new file)
@@ -0,0 +1,32 @@
+{
+    "type": "OBJECT",
+    "properties": {
+        "cfsCutOff": {"type": "STRING", "nullable": true, "description": "the date by which an LCL (Less than Container Load) shipment needs to be checked in to a CFS (Container Freight Station) to meet its scheduled sailing"},
+        "bookingNumber": {"type": "STRING", "nullable": true},
+        "cyCutOff": {"type": "STRING", "nullable": true},
+        "gateInReference": {"type": "STRING", "nullable": true},
+        "gateInTerminal": {"type": "STRING", "nullable": true},
+        "mblNumber": {"type": "STRING", "nullable": true},
+        "pickUpReference": {"type": "STRING", "nullable": true},
+        "pickUpTerminal": {"type": "STRING", "nullable": true},
+        "siCutOff": {"type": "STRING", "nullable": true},
+        "vgmCutOff": {"type": "STRING", "nullable": true},
+        "transportLegs": {
+            "type": "ARRAY",
+            "items": {
+                "type": "OBJECT",
+                "properties": {
+                    "eta": {"type": "STRING", "nullable": true},
+                    "etd": {"type": "STRING", "nullable": true},
+                    "imoNumber": {"type": "STRING", "nullable": true},
+                    "portOfDischarge": {"type": "STRING", "nullable": true},
+                    "portOfLoading": {"type": "STRING", "nullable": true},
+                    "vesselName": {"type": "STRING", "nullable": true},
+                    "voyage": {"type": "STRING", "nullable": true}
+                },
+                "required": []
+            }
+        }
+    },
+    "required": []
+}
src/prompts/library/bookingConfirmation/msc/prompt.txt
@@ -18,7 +18,7 @@ transportLegs:
 vesselName: The name of the vessel for a specific leg.
 voyage: The journey or route taken by the vessel for a specific leg.

-your task is to extract the text value of the following entities:
+your task is to extract the text value of the following entities and page numbers starting from 0 where the value was found in the document:
 SCHEMA_PLACEHOLDER

 Further explanation and Keywords for the transportLegs part as follows. The below 2 conditions is crucial. Take attention here:
src/prompts/library/bookingConfirmation/oocl/placeholders.json (new file)
@@ -0,0 +1,32 @@
+{
+    "type": "OBJECT",
+    "properties": {
+        "cfsCutOff": {"type": "STRING", "nullable": true, "description": "the date by which an LCL (Less than Container Load) shipment needs to be checked in to a CFS (Container Freight Station) to meet its scheduled sailing"},
+        "bookingNumber": {"type": "STRING", "nullable": true},
+        "cyCutOff": {"type": "STRING", "nullable": true},
+        "gateInReference": {"type": "STRING", "nullable": true},
+        "gateInTerminal": {"type": "STRING", "nullable": true},
+        "mblNumber": {"type": "STRING", "nullable": true},
+        "pickUpReference": {"type": "STRING", "nullable": true},
+        "pickUpTerminal": {"type": "STRING", "nullable": true},
+        "siCutOff": {"type": "STRING", "nullable": true},
+        "vgmCutOff": {"type": "STRING", "nullable": true},
+        "transportLegs": {
+            "type": "ARRAY",
+            "items": {
+                "type": "OBJECT",
+                "properties": {
+                    "eta": {"type": "STRING", "nullable": true},
+                    "etd": {"type": "STRING", "nullable": true},
+                    "portOfDischarge": {"type": "STRING", "nullable": true},
+                    "portOfLoading": {"type": "STRING", "nullable": true},
+                    "vesselName": {"type": "STRING", "nullable": true},
+                    "voyage": {"type": "STRING", "nullable": true},
+                    "imoNumber": {"type": "STRING", "nullable": true}
+                },
+                "required": []
+            }
+        }
+    },
+    "required": []
+}