data-science-document-ai 1.13.0__py3-none-any.whl → 1.56.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_science_document_ai-1.13.0.dist-info → data_science_document_ai-1.56.1.dist-info}/METADATA +7 -2
- data_science_document_ai-1.56.1.dist-info/RECORD +60 -0
- {data_science_document_ai-1.13.0.dist-info → data_science_document_ai-1.56.1.dist-info}/WHEEL +1 -1
- src/constants.py +42 -12
- src/constants_sandbox.py +2 -22
- src/docai.py +18 -7
- src/docai_processor_config.yaml +0 -64
- src/excel_processing.py +34 -15
- src/io.py +74 -6
- src/llm.py +12 -34
- src/pdf_processing.py +228 -78
- src/postprocessing/common.py +495 -618
- src/postprocessing/postprocess_partner_invoice.py +383 -27
- src/prompts/library/arrivalNotice/other/placeholders.json +70 -0
- src/prompts/library/arrivalNotice/other/prompt.txt +40 -0
- src/prompts/library/bookingConfirmation/evergreen/placeholders.json +17 -17
- src/prompts/library/bookingConfirmation/evergreen/prompt.txt +1 -0
- src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +18 -18
- src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +1 -1
- src/prompts/library/bookingConfirmation/maersk/placeholders.json +17 -17
- src/prompts/library/bookingConfirmation/maersk/prompt.txt +1 -1
- src/prompts/library/bookingConfirmation/msc/placeholders.json +17 -17
- src/prompts/library/bookingConfirmation/msc/prompt.txt +1 -1
- src/prompts/library/bookingConfirmation/oocl/placeholders.json +17 -17
- src/prompts/library/bookingConfirmation/oocl/prompt.txt +3 -1
- src/prompts/library/bookingConfirmation/other/placeholders.json +17 -17
- src/prompts/library/bookingConfirmation/other/prompt.txt +1 -1
- src/prompts/library/bookingConfirmation/yangming/placeholders.json +17 -17
- src/prompts/library/bookingConfirmation/yangming/prompt.txt +1 -1
- src/prompts/library/bundeskasse/other/placeholders.json +113 -0
- src/prompts/library/bundeskasse/other/prompt.txt +48 -0
- src/prompts/library/commercialInvoice/other/placeholders.json +125 -0
- src/prompts/library/commercialInvoice/other/prompt.txt +2 -1
- src/prompts/library/customsAssessment/other/placeholders.json +67 -16
- src/prompts/library/customsAssessment/other/prompt.txt +24 -37
- src/prompts/library/customsInvoice/other/placeholders.json +205 -0
- src/prompts/library/customsInvoice/other/prompt.txt +105 -0
- src/prompts/library/deliveryOrder/other/placeholders.json +79 -28
- src/prompts/library/deliveryOrder/other/prompt.txt +26 -40
- src/prompts/library/draftMbl/other/placeholders.json +33 -33
- src/prompts/library/draftMbl/other/prompt.txt +34 -44
- src/prompts/library/finalMbL/other/placeholders.json +34 -34
- src/prompts/library/finalMbL/other/prompt.txt +34 -44
- src/prompts/library/packingList/other/placeholders.json +98 -0
- src/prompts/library/packingList/other/prompt.txt +1 -1
- src/prompts/library/partnerInvoice/other/placeholders.json +165 -45
- src/prompts/library/partnerInvoice/other/prompt.txt +82 -44
- src/prompts/library/preprocessing/carrier/placeholders.json +0 -16
- src/prompts/library/shippingInstruction/other/placeholders.json +115 -0
- src/prompts/library/shippingInstruction/other/prompt.txt +28 -15
- src/setup.py +73 -63
- src/utils.py +207 -30
- data_science_document_ai-1.13.0.dist-info/RECORD +0 -55
- src/prompts/library/draftMbl/hapag-lloyd/prompt.txt +0 -44
- src/prompts/library/draftMbl/maersk/prompt.txt +0 -17
- src/prompts/library/finalMbL/hapag-lloyd/prompt.txt +0 -44
- src/prompts/library/finalMbL/maersk/prompt.txt +0 -17
src/setup.py
CHANGED
|
@@ -1,11 +1,8 @@
|
|
|
1
1
|
"""Contains project setup parameters and initialization functions."""
|
|
2
|
-
import
|
|
3
|
-
|
|
4
|
-
# import streamlit as st
|
|
2
|
+
import json
|
|
5
3
|
import os
|
|
6
4
|
import random
|
|
7
5
|
import time
|
|
8
|
-
from pathlib import Path
|
|
9
6
|
|
|
10
7
|
import toml
|
|
11
8
|
import vertexai
|
|
@@ -18,7 +15,7 @@ from src.constants import project_parameters
|
|
|
18
15
|
from src.constants_sandbox import project_parameters_sandbox
|
|
19
16
|
|
|
20
17
|
# Parent repos are imported without .
|
|
21
|
-
from src.io import
|
|
18
|
+
from src.io import get_bq_client, get_storage_client, logger
|
|
22
19
|
from src.llm import LlmClient
|
|
23
20
|
|
|
24
21
|
|
|
@@ -68,52 +65,16 @@ def get_docai_schema_client(params, async_=True):
|
|
|
68
65
|
return client
|
|
69
66
|
|
|
70
67
|
|
|
71
|
-
def
|
|
72
|
-
"""
|
|
73
|
-
|
|
74
|
-
parser.add_argument(
|
|
75
|
-
"--scope",
|
|
76
|
-
type=str,
|
|
77
|
-
dest="scope",
|
|
78
|
-
required=False,
|
|
79
|
-
help="Whether the function should 'upload' or 'download' documents",
|
|
80
|
-
)
|
|
81
|
-
parser.add_argument(
|
|
82
|
-
"--document_name",
|
|
83
|
-
type=str,
|
|
84
|
-
dest="document_name",
|
|
85
|
-
required=False,
|
|
86
|
-
help="Category of the document (e.g., 'commercialInvoice', 'packingList')",
|
|
87
|
-
)
|
|
88
|
-
parser.add_argument(
|
|
89
|
-
"--for_combinations",
|
|
90
|
-
type=bool,
|
|
91
|
-
default=False,
|
|
92
|
-
dest="for_combinations",
|
|
93
|
-
required=False,
|
|
94
|
-
help="A flag to download documents into a special subfolder",
|
|
95
|
-
)
|
|
96
|
-
parser.add_argument(
|
|
97
|
-
"--n_samples",
|
|
98
|
-
type=int,
|
|
99
|
-
default=50,
|
|
100
|
-
dest="n_samples",
|
|
101
|
-
required=False,
|
|
102
|
-
help="A number of samples to download",
|
|
103
|
-
)
|
|
104
|
-
|
|
105
|
-
# Remove declared missing arguments (e.g. model_type)
|
|
106
|
-
args = vars(parser.parse_args())
|
|
107
|
-
args_no_null = {
|
|
108
|
-
k: v.split(",") if isinstance(v, str) else v
|
|
109
|
-
for k, v in args.items()
|
|
110
|
-
if v is not None
|
|
111
|
-
}
|
|
112
|
-
return args_no_null
|
|
68
|
+
def setup_params(args=None):
|
|
69
|
+
"""
|
|
70
|
+
Set up the application parameters.
|
|
113
71
|
|
|
72
|
+
Args:
|
|
73
|
+
args: Command-line arguments.
|
|
114
74
|
|
|
115
|
-
|
|
116
|
-
|
|
75
|
+
Returns:
|
|
76
|
+
params: Dictionary containing application parameters.
|
|
77
|
+
"""
|
|
117
78
|
if args is None:
|
|
118
79
|
args = {}
|
|
119
80
|
|
|
@@ -123,13 +84,22 @@ def setup_params(args=None):
|
|
|
123
84
|
# Update parameters with constants
|
|
124
85
|
params.update(project_parameters)
|
|
125
86
|
|
|
126
|
-
|
|
127
|
-
if
|
|
87
|
+
cluster = os.getenv("CLUSTER", "").lower()
|
|
88
|
+
# Update the parameters with the sandbox parameters if the cluster is not production and not ODE
|
|
89
|
+
if cluster not in ("production", "ode"):
|
|
128
90
|
params.update(project_parameters_sandbox)
|
|
129
91
|
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
92
|
+
# Set up the bucket constants for ODE environment
|
|
93
|
+
if cluster == "ode":
|
|
94
|
+
ode_env_vars = {
|
|
95
|
+
"doc_ai_bucket_project_name": "PROJECT_ID",
|
|
96
|
+
"doc_ai_bucket_name": "BUCKET_NAME",
|
|
97
|
+
"doc_ai_bucket_batch_input": "INPUT_BUCKET_NAME",
|
|
98
|
+
"doc_ai_bucket_batch_output": "OUTPUT_BUCKET_NAME",
|
|
99
|
+
}
|
|
100
|
+
params.update(
|
|
101
|
+
{key: os.getenv(env_var) for key, env_var in ode_env_vars.items()}
|
|
102
|
+
)
|
|
133
103
|
|
|
134
104
|
# print cluster info
|
|
135
105
|
logger.info(f"Cluster: {os.getenv('CLUSTER')}")
|
|
@@ -145,6 +115,10 @@ def setup_params(args=None):
|
|
|
145
115
|
|
|
146
116
|
params = setup_docai_client_and_path(params)
|
|
147
117
|
|
|
118
|
+
# Set up BigQuery client for logging
|
|
119
|
+
bq_client, _ = get_bq_client(params)
|
|
120
|
+
params["bq_client"] = bq_client
|
|
121
|
+
|
|
148
122
|
# Set up Vertex AI for text embeddings
|
|
149
123
|
setup_vertexai(params)
|
|
150
124
|
|
|
@@ -156,18 +130,16 @@ def setup_params(args=None):
|
|
|
156
130
|
assert params.keys() & yaml_content.keys() == set()
|
|
157
131
|
params.update(yaml_content)
|
|
158
132
|
|
|
159
|
-
#
|
|
160
|
-
client = get_storage_client(params)
|
|
161
|
-
bucket = client.bucket(params["doc_ai_bucket_name"])
|
|
162
|
-
downloaded_meta = download_dir_from_bucket(
|
|
163
|
-
bucket, params["g_model_data_folder"], Path(params["local_model_data_folder"])
|
|
164
|
-
)
|
|
165
|
-
if not downloaded_meta:
|
|
166
|
-
logger.info(f"Could not load models metadata from cloud.")
|
|
167
|
-
|
|
133
|
+
# Set up LLM clients
|
|
168
134
|
params["LlmClient"] = LlmClient(
|
|
169
135
|
openai_key=os.getenv("OPENAI_KEY"), parameters=params["gemini_params"]
|
|
170
136
|
)
|
|
137
|
+
params["LlmClient_Flash"] = LlmClient(
|
|
138
|
+
openai_key=os.getenv("OPENAI_KEY"), parameters=params["gemini_flash_params"]
|
|
139
|
+
)
|
|
140
|
+
|
|
141
|
+
# Load lookup data from GCS bucket
|
|
142
|
+
setup_lookup_data(params)
|
|
171
143
|
|
|
172
144
|
return params
|
|
173
145
|
|
|
@@ -198,3 +170,41 @@ def setup_vertexai(params):
|
|
|
198
170
|
project=params["g_ai_project_name"],
|
|
199
171
|
location=params["g_region"],
|
|
200
172
|
)
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def setup_lookup_data(params):
    """Load the JSON lookup tables from the GCS bucket into ``params``.

    Args:
        params (dict): Application parameters. Reads ``doc_ai_bucket_name``,
            ``g_model_fuzzy_lookup_folder`` and the blob-name entries
            ``item_code_lookup``, ``intermodal_partners``,
            ``invoice_classification_lookup`` and
            ``reverse_charge_sentence_lookup``.

    Returns:
        None. ``params["lookup_data"]`` is set to a dict with the keys
        ``item_code``, ``intermodal_partners``, ``invoice_classification``
        and ``reverse_charge_sentences``.
    """
    storage_client = get_storage_client(params)
    lookup_bucket = storage_client.bucket(params["doc_ai_bucket_name"])

    # lookup_data key -> params key that holds the blob file name
    lookup_sources = {
        "item_code": "item_code_lookup",
        "intermodal_partners": "intermodal_partners",
        "invoice_classification": "invoice_classification_lookup",
        "reverse_charge_sentences": "reverse_charge_sentence_lookup",
    }

    # Resolve every blob path up front so that a missing parameter raises
    # KeyError before the first download is attempted.
    blob_paths = {
        lookup_key: f'{params["g_model_fuzzy_lookup_folder"]}/{params[param_key]}'
        for lookup_key, param_key in lookup_sources.items()
    }

    def download_json_from_bucket(path):
        """Download JSON data from a specified path in a GCP bucket."""
        raw_text = lookup_bucket.blob(path).download_as_text(encoding="utf-8")
        return json.loads(raw_text)

    # Only attach the result once every download has succeeded.
    params["lookup_data"] = {
        lookup_key: download_json_from_bucket(blob_path)
        for lookup_key, blob_path in blob_paths.items()
    }
|
src/utils.py
CHANGED
|
@@ -6,36 +6,29 @@ import json
|
|
|
6
6
|
import os
|
|
7
7
|
import pickle
|
|
8
8
|
from datetime import datetime
|
|
9
|
-
from typing import Literal
|
|
9
|
+
from typing import Any, Dict, List, Literal, Optional
|
|
10
10
|
|
|
11
|
+
import httpx
|
|
12
|
+
import numpy as np
|
|
11
13
|
import openpyxl
|
|
12
14
|
import pandas as pd
|
|
13
15
|
from google.cloud import documentai_v1beta3 as docu_ai_beta
|
|
16
|
+
from pypdf import PdfReader, PdfWriter
|
|
14
17
|
|
|
15
|
-
from src.io import
|
|
18
|
+
from src.io import bq_logs, get_storage_client, logger
|
|
16
19
|
|
|
17
20
|
|
|
18
|
-
def
|
|
19
|
-
"""
|
|
21
|
+
def get_pdf_page_count(pdf_bytes):
|
|
22
|
+
"""Get the number of pages in a PDF document efficiently.
|
|
20
23
|
|
|
21
24
|
Args:
|
|
22
|
-
|
|
23
|
-
|
|
25
|
+
pdf_bytes (bytes): The PDF content as bytes.
|
|
26
|
+
|
|
27
|
+
Returns:
|
|
28
|
+
int: The number of pages in the PDF.
|
|
24
29
|
"""
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
# Get the table string
|
|
28
|
-
table_string = f"{params['g_ai_project_name']}.{params['g_ai_gbq_db_schema']}.{params['g_ai_gbq_db_table_out']}"
|
|
29
|
-
|
|
30
|
-
logger.info(f"Log table: {table_string}")
|
|
31
|
-
# Insert the rows into the table
|
|
32
|
-
insert_logs = bq_client.insert_rows_json(table_string, data_to_insert)
|
|
33
|
-
|
|
34
|
-
# Check if there were any errors inserting the rows
|
|
35
|
-
if not insert_logs:
|
|
36
|
-
logger.info("New rows have been added.")
|
|
37
|
-
else:
|
|
38
|
-
logger.info("Errors occurred while inserting rows: ", insert_logs)
|
|
30
|
+
reader = PdfReader(io.BytesIO(pdf_bytes))
|
|
31
|
+
return len(reader.pages)
|
|
39
32
|
|
|
40
33
|
|
|
41
34
|
async def get_data_set_schema_from_docai(
|
|
@@ -137,7 +130,12 @@ def store_json_in_gcs(
|
|
|
137
130
|
bucket = storage_client.bucket(params.get("doc_ai_bucket_name"))
|
|
138
131
|
full_object_name = folder_path + document_id
|
|
139
132
|
blob = bucket.blob(full_object_name)
|
|
140
|
-
|
|
133
|
+
|
|
134
|
+
# Convert dict to JSON string if needed
|
|
135
|
+
json_string = (
|
|
136
|
+
json.dumps(json_data) if isinstance(json_data, dict) else json_data
|
|
137
|
+
)
|
|
138
|
+
blob.upload_from_string(json_string, content_type="application/json")
|
|
141
139
|
|
|
142
140
|
logger.info(
|
|
143
141
|
f"JSON object stored successfully in gs://{params.get('doc_ai_bucket_name')}/{full_object_name}" # noqa
|
|
@@ -156,6 +154,8 @@ async def run_background_tasks(
|
|
|
156
154
|
store_data,
|
|
157
155
|
processor_version,
|
|
158
156
|
mime_type,
|
|
157
|
+
elapsed_time=None,
|
|
158
|
+
page_count=None,
|
|
159
159
|
):
|
|
160
160
|
"""
|
|
161
161
|
Run background tasks asynchronously.
|
|
@@ -168,6 +168,8 @@ async def run_background_tasks(
|
|
|
168
168
|
store_data: The data to store in GCS.
|
|
169
169
|
processor_version: The processor version used to extract the data.
|
|
170
170
|
mime_type: The MIME type of the document.
|
|
171
|
+
elapsed_time: The time taken to process the document.
|
|
172
|
+
page_count (int, optional): The number of pages in the document.
|
|
171
173
|
|
|
172
174
|
Returns:
|
|
173
175
|
None
|
|
@@ -176,13 +178,8 @@ async def run_background_tasks(
|
|
|
176
178
|
|
|
177
179
|
await loop.run_in_executor(None, store_json_in_gcs, params, doc_id, store_data)
|
|
178
180
|
|
|
179
|
-
#
|
|
180
|
-
page_count
|
|
181
|
-
# calculate the number of pages processed for PDFs
|
|
182
|
-
try:
|
|
183
|
-
if mime_type == "application/pdf":
|
|
184
|
-
page_count = len(json.loads(store_data.encode("utf-8"))["pages"])
|
|
185
|
-
except AttributeError:
|
|
181
|
+
# Use the passed page_count or default to 0 if not provided
|
|
182
|
+
if page_count is None:
|
|
186
183
|
page_count = 0
|
|
187
184
|
|
|
188
185
|
# Log the request in BigQuery
|
|
@@ -200,6 +197,7 @@ async def run_background_tasks(
|
|
|
200
197
|
"processor_version": processor_version,
|
|
201
198
|
"page_count": page_count,
|
|
202
199
|
"mime_type": mime_type,
|
|
200
|
+
"elapsed_time": elapsed_time,
|
|
203
201
|
}
|
|
204
202
|
],
|
|
205
203
|
params,
|
|
@@ -240,7 +238,6 @@ def generate_schema_structure(params, input_doc_type):
|
|
|
240
238
|
Args:
|
|
241
239
|
params (dict): Parameters dictionary.
|
|
242
240
|
input_doc_type (str): Document type to select the appropriate schema.
|
|
243
|
-
schema_client (documentai_v1beta3.DocumentServiceClient): Schema client.
|
|
244
241
|
|
|
245
242
|
Returns:
|
|
246
243
|
dict: The response schema structure.
|
|
@@ -254,7 +251,7 @@ def generate_schema_structure(params, input_doc_type):
|
|
|
254
251
|
"type": "OBJECT",
|
|
255
252
|
"properties": {
|
|
256
253
|
prop.name: {
|
|
257
|
-
"type":
|
|
254
|
+
"type": "string",
|
|
258
255
|
"nullable": True,
|
|
259
256
|
"description": prop.description,
|
|
260
257
|
}
|
|
@@ -347,3 +344,183 @@ async def update_response_schema_from_docai(params, schema_client):
|
|
|
347
344
|
|
|
348
345
|
def get_data_set_schema(params, processor_name):
    """Return the schema stored for ``processor_name``.

    Args:
        params (dict): Parameters dictionary holding ``docai_schema_dict``.
        processor_name: Key of the processor inside ``docai_schema_dict``.

    Returns:
        The entry of ``params["docai_schema_dict"]`` for ``processor_name``.

    Raises:
        KeyError: If ``docai_schema_dict`` or ``processor_name`` is missing.
    """
    schema_by_processor = params["docai_schema_dict"]
    return schema_by_processor[processor_name]
|
|
347
|
+
|
|
348
|
+
|
|
349
|
+
def extract_top_pages(pdf_bytes, num_pages=4):
    """Build a new PDF containing only the leading pages of a document.

    Args:
        pdf_bytes (bytes): The PDF content as bytes.
        num_pages (int): Maximum number of pages to keep, counted from the
            first page. Documents with fewer pages are copied in full.

    Returns:
        bytes: The serialized PDF holding the selected pages.
    """
    source_pdf = PdfReader(io.BytesIO(pdf_bytes))
    pages_to_copy = min(num_pages, len(source_pdf.pages))

    trimmed_pdf = PdfWriter()
    for index in range(pages_to_copy):
        trimmed_pdf.add_page(source_pdf.pages[index])

    # Serialize into memory and hand back the raw bytes.
    buffer = io.BytesIO()
    trimmed_pdf.write(buffer)
    return buffer.getvalue()
|
|
361
|
+
|
|
362
|
+
|
|
363
|
+
async def get_tms_mappings(
    input_list: List[str], embedding_type: str, llm_ports: Optional[List[str]] = None
) -> Dict[str, Any]:
    """Get TMS mappings for the given values.

    Args:
        input_list (list[str]): List of strings to get embeddings for. A single
            non-list value is wrapped into a one-element list.
        embedding_type (str): Type of embedding to use
            (e.g., "container_types", "ports", "depots", "lineitems", "terminals").
            It is used both as the endpoint path and as the payload key.
        llm_ports (list[str], optional): List of LLM ports to use. Defaults to None.

    Returns:
        dict: The ``response.data`` part of the API answer. An empty dict is
        returned when there is nothing to map, when the API answers with an
        error status, or when the request itself fails (timeout, connection
        error).
    """
    # Local service when no cluster is configured, in-cluster service otherwise.
    base_url = (
        "http://0.0.0.0:8080/"
        if os.getenv("CLUSTER") is None
        else "http://tms-mappings.api.svc.cluster.local./"
    )

    # Nothing to map: skip the network round-trip entirely.
    if not input_list:
        return {}

    # Ensure input_list is a list
    if not isinstance(input_list, list):
        input_list = [input_list]

    # Always send a dict with named keys
    payload = {embedding_type: input_list}

    if llm_ports:
        payload["llm_ports"] = llm_ports if isinstance(llm_ports, list) else [llm_ports]

    # Make the POST request to the TMS mappings API
    url = f"{base_url}{embedding_type}"

    # Use a timeout so the code doesn't hang forever
    timeout = httpx.Timeout(60.0, connect=10.0)

    async with httpx.AsyncClient(timeout=timeout) as client:
        try:
            response = await client.post(url, json=payload)
            response.raise_for_status()

            # Structure expected: {"response": {"data": {"desc1": "code1", "desc2": "code2"}}}
            return response.json().get("response", {}).get("data", {})

        except httpx.HTTPStatusError as exc:
            logger.error(
                f"Error from TMS mappings API: {exc.response.status_code} - {exc.response.text}"
            )
            return {}

        except httpx.RequestError as exc:
            # Timeouts and connection failures never produce a response, so
            # they are not HTTPStatusError; treat them like any other failed
            # lookup instead of letting them abort the caller.
            logger.error(f"Request to TMS mappings API failed: {exc!r}")
            return {}
|
|
416
|
+
|
|
417
|
+
|
|
418
|
+
async def batch_fetch_all_mappings(container_types, terminals, depots):
    """Fetch the container type, terminal and depot mappings concurrently.

    Args:
        container_types: Iterable of container type values to map.
        terminals: Iterable of terminal values to map.
        depots: Iterable of depot values to map.

    Returns:
        tuple: ``(container_map, terminal_map, depot_map)``, each a dict.
    """
    lookups = (
        (container_types, "container_types"),
        (terminals, "terminals"),
        (depots, "depots"),
    )

    # One request per embedding type, all awaited together.
    raw_maps = await asyncio.gather(
        *(get_tms_mappings(list(values), kind) for values, kind in lookups)
    )

    # Normalize every result to a dict; falsy results become an empty dict.
    container_map, terminal_map, depot_map = (dict(raw or {}) for raw in raw_maps)
    return container_map, terminal_map, depot_map
|
|
435
|
+
|
|
436
|
+
|
|
437
|
+
def transform_schema_strings(schema):
    """Wrap every string field of a schema in a value/page_number object.

    Recursively walks a schema dictionary and replaces each definition whose
    ``type`` is "STRING" (case-insensitive) with an OBJECT that has two
    properties: ``value`` and ``page_number``. The original ``nullable`` and
    ``description`` entries are moved onto the ``value`` property; any other
    keys of the string definition are not carried over.

    Objects that already expose both ``value`` and ``page_number`` are
    returned as-is, so applying the function twice gives the same result.
    The input is not mutated: changed levels are shallow copies.

    Args:
        schema (dict): The input schema dictionary.

    Returns:
        dict: The transformed schema dictionary. Anything that is not a dict,
        or a dict without a usable string ``type``, is returned unchanged.
    """
    if not isinstance(schema, dict):
        return schema

    raw_type = schema.get("type")
    # A missing, empty or non-string type cannot be classified; leave the
    # node untouched instead of failing on ``.upper()``.
    if not raw_type or not isinstance(raw_type, str):
        return schema

    schema_type = raw_type.upper()

    # Base case: STRING -> OBJECT
    if schema_type == "STRING":
        return {
            "type": "OBJECT",
            "properties": {
                "value": {
                    "type": "STRING",
                    "nullable": schema.get("nullable", False),
                    "description": schema.get("description", ""),
                },
                "page_number": {
                    "type": "STRING",
                    "description": "Number of a page where the value was found in the document starting from 0.",
                },
            },
            "required": [],
        }

    if schema_type == "OBJECT" and "properties" in schema:
        properties = schema["properties"]

        # Skip already transformed OBJECT (has both 'value' & 'page_number')
        if {"value", "page_number"}.issubset(properties.keys()):
            return schema

        # Recursive case for OBJECT
        new_schema = schema.copy()
        new_schema["properties"] = {
            name: transform_schema_strings(definition)
            for name, definition in properties.items()
        }
        return new_schema

    # Recursive case for ARRAY
    if schema_type == "ARRAY" and "items" in schema:
        new_schema = schema.copy()
        new_schema["items"] = transform_schema_strings(schema["items"])
        return new_schema

    return schema
|
|
498
|
+
|
|
499
|
+
|
|
500
|
+
def estimate_page_count(sheet, *, cells_per_page=500):
    """Estimate how many pages a spreadsheet-like object would fill.

    The estimate is the number of cells divided by ``cells_per_page``, rounded
    up. The default of 500 assumes a page is 10 columns x 50 rows.

    Args:
        sheet: Either an object with a two-element ``shape`` (rows, columns)
            or an object exposing ``max_row`` and ``max_column``.
            ``shape`` takes precedence when both are present.
        cells_per_page (int): Number of cells counted as one page.

    Returns:
        The rounded-up page estimate as returned by ``np.ceil`` (a NumPy
        float), or None when ``sheet`` offers neither interface.
    """
    if hasattr(sheet, "shape"):
        cell_count = sheet.shape[0] * sheet.shape[1]
    elif hasattr(sheet, "max_row"):
        cell_count = sheet.max_column * sheet.max_row
    else:
        # Unknown sheet type: no basis for an estimate.
        return None
    return np.ceil(cell_count / cells_per_page)
|
|
509
|
+
|
|
510
|
+
|
|
511
|
+
def split_pdf_into_chunks(file_content: bytes, chunk_size: int = 1):
    """Lazily split a PDF into consecutive chunks of pages.

    Args:
        file_content (bytes): The PDF content as bytes.
        chunk_size (int): Number of pages per chunk. The last chunk may hold
            fewer pages.

    Yields:
        bytes: A serialized PDF for each chunk, in page order.
    """
    source_pdf = PdfReader(io.BytesIO(file_content))
    page_total = len(source_pdf.pages)

    # TODO: update the chunk_size based on doc length. However, it breaks the page number extraction logic.
    for start in range(0, page_total, chunk_size):
        stop = min(start + chunk_size, page_total)

        chunk_writer = PdfWriter()
        for page_index in range(start, stop):
            chunk_writer.add_page(source_pdf.pages[page_index])

        # getvalue() returns the whole buffer regardless of stream position.
        chunk_stream = io.BytesIO()
        chunk_writer.write(chunk_stream)
        yield chunk_stream.getvalue()
|
|
@@ -1,55 +0,0 @@
|
|
|
1
|
-
src/constants.py,sha256=AP5ZfxMGU745IUcSRR0z7aTssbAuJuyqhnVNN9I0L1I,2524
|
|
2
|
-
src/constants_sandbox.py,sha256=vdEOaFzeUmsKK-K66BDgfw3R-_MeQ8XNo3bIfF4EOmA,1241
|
|
3
|
-
src/docai.py,sha256=1UxBRO0oC7WbFgscQAyxjmhsvh-Oc8g60m368WFZrOw,5234
|
|
4
|
-
src/docai_processor_config.yaml,sha256=_XN0g7t9EGU9-vVmK9_t_IQ6OUkXOvawYlOjSSHOUtQ,2295
|
|
5
|
-
src/excel_processing.py,sha256=HZGIinyYXFRRAekBj0yBcTaI0MhzdGuxsTTYpEnzRm8,2559
|
|
6
|
-
src/io.py,sha256=8DxtfvsNrx7QCVPQwttGX21o0NthxHfH6zBR6X4COvg,3511
|
|
7
|
-
src/llm.py,sha256=93naoL3wviBtrA5JaQxldW6hO_Cwpc61whNuz881fDQ,7828
|
|
8
|
-
src/log_setup.py,sha256=RhHnpXqcl-ii4EJzRt47CF2R-Q3YPF68tepg_Kg7tkw,2895
|
|
9
|
-
src/pdf_processing.py,sha256=cVLrd-gZmbr9p_Od2ihC2LXxfW7pPhMS6SzhdKS4snM,14962
|
|
10
|
-
src/postprocessing/common.py,sha256=zmTs97KYJhOvmURP_U7RlJUxzqV--Aw62qY78XA-Tl8,33760
|
|
11
|
-
src/postprocessing/postprocess_booking_confirmation.py,sha256=nK32eDiBNbauyQz0oCa9eraysku8aqzrcoRFoWVumDU,4827
|
|
12
|
-
src/postprocessing/postprocess_commercial_invoice.py,sha256=3I8ijluTZcOs_sMnFZxfkAPle0UFQ239EMuvZfDZVPg,1028
|
|
13
|
-
src/postprocessing/postprocess_partner_invoice.py,sha256=lwHr9pWRQ3LoclZbL1g4_3HCGCWk_C0C_UvbpxYIgKI,2374
|
|
14
|
-
src/prompts/library/bookingConfirmation/evergreen/placeholders.json,sha256=Re2wBgZoaJ5yImUUAwZOZxFcKXHxi83TCZwTuqd2v2k,1405
|
|
15
|
-
src/prompts/library/bookingConfirmation/evergreen/prompt.txt,sha256=qlBMFDHy-gwr2PVeuHrfMEg_8Ibdym243DnaCgINa7g,2614
|
|
16
|
-
src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json,sha256=Re2wBgZoaJ5yImUUAwZOZxFcKXHxi83TCZwTuqd2v2k,1405
|
|
17
|
-
src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt,sha256=sg11U3lIhhS36BsimX7IOzR7Pez_9gScdNmJna2pPuw,3355
|
|
18
|
-
src/prompts/library/bookingConfirmation/maersk/placeholders.json,sha256=PKWXySGAls6A8tujbSjokYp4ldc3c0DmSP2ITKYiUF8,1405
|
|
19
|
-
src/prompts/library/bookingConfirmation/maersk/prompt.txt,sha256=-00tzWzXtQnXX3EPtaCBM39leCoLa4FB52_t7Z3eoQk,3148
|
|
20
|
-
src/prompts/library/bookingConfirmation/msc/placeholders.json,sha256=Re2wBgZoaJ5yImUUAwZOZxFcKXHxi83TCZwTuqd2v2k,1405
|
|
21
|
-
src/prompts/library/bookingConfirmation/msc/prompt.txt,sha256=9wdbLofnp5s1acD19jCmQuw__HMcVq1yr4vIJNJlKVM,4630
|
|
22
|
-
src/prompts/library/bookingConfirmation/oocl/placeholders.json,sha256=NnXjMiEsTCzTDWs2WY7BIMo2p4_98-DL3v1r7x-FL3A,1403
|
|
23
|
-
src/prompts/library/bookingConfirmation/oocl/prompt.txt,sha256=aGowVvOgl4w6TjX5O2RtD4QOiWC1JnXiWgg0t0chThU,2060
|
|
24
|
-
src/prompts/library/bookingConfirmation/other/placeholders.json,sha256=Re2wBgZoaJ5yImUUAwZOZxFcKXHxi83TCZwTuqd2v2k,1405
|
|
25
|
-
src/prompts/library/bookingConfirmation/other/prompt.txt,sha256=XOrq5Ns0nl8lDI9VvoOEbIMbOQdv8mcM8HqP8-eIjc4,3304
|
|
26
|
-
src/prompts/library/bookingConfirmation/yangming/placeholders.json,sha256=Re2wBgZoaJ5yImUUAwZOZxFcKXHxi83TCZwTuqd2v2k,1405
|
|
27
|
-
src/prompts/library/bookingConfirmation/yangming/prompt.txt,sha256=BSFy-6zDlAmOH2uZjsp-zZkR_Uy3RS5sGtdv9wysiSI,3151
|
|
28
|
-
src/prompts/library/commercialInvoice/other/prompt.txt,sha256=fYUF7btc48Uqv4mJH5BgJdY4JVwj9I1xKX_HRBIrN7M,2590
|
|
29
|
-
src/prompts/library/customsAssessment/other/placeholders.json,sha256=5nSGsMbpfKrpKoYImcTto_RlOvPCHyld2RlwU0Zbbqw,361
|
|
30
|
-
src/prompts/library/customsAssessment/other/prompt.txt,sha256=wgJ8PYM0PKXiIKSljhFXEFBQ23GRs2E2DE9lVwHDvBU,2116
|
|
31
|
-
src/prompts/library/deliveryOrder/other/placeholders.json,sha256=6b_6OVsxT7bjFnV_v0OZkGEy-GN5K4AjL0ATzuoLdOU,1286
|
|
32
|
-
src/prompts/library/deliveryOrder/other/prompt.txt,sha256=MVSS5AhkiWT17G9X4xk_AgKzYElagvWjLPCMr_ZhmOs,2393
|
|
33
|
-
src/prompts/library/draftMbl/hapag-lloyd/prompt.txt,sha256=0k1xLW4zWaenCSNQJxXMXenIwI-eYmGgpxnAAcM3HOg,2251
|
|
34
|
-
src/prompts/library/draftMbl/maersk/prompt.txt,sha256=GxaIYlksORvD2uAbodRx_9JFJXD4XbDaVFYtpN9uzxc,2050
|
|
35
|
-
src/prompts/library/draftMbl/other/placeholders.json,sha256=wIN06_NWsESDyNEDfOLPi3F2Vq-XPa4O3U32A32s-_Q,1736
|
|
36
|
-
src/prompts/library/draftMbl/other/prompt.txt,sha256=gqbPm1joXKDUss0wU6vMc-269sx-fYWh90gWuNKOBQc,2166
|
|
37
|
-
src/prompts/library/finalMbL/hapag-lloyd/prompt.txt,sha256=0k1xLW4zWaenCSNQJxXMXenIwI-eYmGgpxnAAcM3HOg,2251
|
|
38
|
-
src/prompts/library/finalMbL/maersk/prompt.txt,sha256=GxaIYlksORvD2uAbodRx_9JFJXD4XbDaVFYtpN9uzxc,2050
|
|
39
|
-
src/prompts/library/finalMbL/other/placeholders.json,sha256=K_yJYhQo2DnZV_Rg6xXjo6sHkSGB-SMO4IQnY47V43w,1735
|
|
40
|
-
src/prompts/library/finalMbL/other/prompt.txt,sha256=gqbPm1joXKDUss0wU6vMc-269sx-fYWh90gWuNKOBQc,2166
|
|
41
|
-
src/prompts/library/packingList/other/prompt.txt,sha256=Qw16n7_48GGFYWz2vRepNowZCX1UPXKetEZ1UqFXPdY,2764
|
|
42
|
-
src/prompts/library/partnerInvoice/other/placeholders.json,sha256=tXkr1VVeb1qqAR0SSWYrKu1Np3LXB9o4_2Ponsu0e4k,2352
|
|
43
|
-
src/prompts/library/partnerInvoice/other/prompt.txt,sha256=Ih5VSfVBBYbo2_ufyYvp1DNYoXDYCScw8b1ylVbftsQ,2913
|
|
44
|
-
src/prompts/library/postprocessing/port_code/placeholders.json,sha256=2TiXf3zSzrglOMPtDOlCntIa5RSvyZQAKG2-IgrCY5A,22
|
|
45
|
-
src/prompts/library/postprocessing/port_code/prompt_port_code.txt,sha256=--1wunSqEr2ox958lEhjO-0JFBfOLzA3qfKYIzG_Iok,884
|
|
46
|
-
src/prompts/library/preprocessing/carrier/placeholders.json,sha256=1UmrQNqBEsjLIpOO-a39Az6bQ_g1lxDGlwqZFU3IEt0,408
|
|
47
|
-
src/prompts/library/preprocessing/carrier/prompt.txt,sha256=NLvRZQCZ6aWC1yTr7Q93jK5z7Vi_b4HBaiFYYnIsO-w,134
|
|
48
|
-
src/prompts/library/shippingInstruction/other/prompt.txt,sha256=fyC24ig4FyRNnLuQM69s4ZVajsK-LHIl2dvaaEXr-6Q,1327
|
|
49
|
-
src/prompts/prompt_library.py,sha256=VJWHeXN-s501C2GiidIIvQQuZdU6T1R27hE2dKBiI40,2555
|
|
50
|
-
src/setup.py,sha256=MiFIP8ZOD0-WhzStEme18pJ52N8YpVYNZKsaueacQd8,6531
|
|
51
|
-
src/tms.py,sha256=UXbIo1QE--hIX6NZi5Qyp2R_CP338syrY9pCTPrfgnE,1741
|
|
52
|
-
src/utils.py,sha256=30EgwvPGwmtBGkX_EWI0B-PdB1wgxqRW58JKsp6Nl3I,11740
|
|
53
|
-
data_science_document_ai-1.13.0.dist-info/METADATA,sha256=Pq4RD1k4Is-HrJ9mYZu0W7N3EzJSK5BPflg10_NBEeI,1951
|
|
54
|
-
data_science_document_ai-1.13.0.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
|
|
55
|
-
data_science_document_ai-1.13.0.dist-info/RECORD,,
|
|
@@ -1,44 +0,0 @@
|
|
|
1
|
-
You are a document entity extraction specialist. Given a document, the explained datapoint need to extract.
|
|
2
|
-
|
|
3
|
-
blNumber: Bill of Lading number.
|
|
4
|
-
voyage: The journey or route code taken by the vessel.
|
|
5
|
-
portOfLoading: The port where cargo is loaded.
|
|
6
|
-
portOfDischarge: The port where cargo is unloaded.
|
|
7
|
-
bookingNumber: A unique identifier for the booking.
|
|
8
|
-
containers:
|
|
9
|
-
containerType: Type of the shipping container, usually related to it's size.
|
|
10
|
-
grossWeight: Total weight of the cargo, including the tare weight of the container.
|
|
11
|
-
measurements: Dimensions of the cargo (length, width, height) for freight calculations.
|
|
12
|
-
packageQuantity: package quantity.
|
|
13
|
-
packageType: Type of packaging used (e.g., cartons, pallets, barrels).
|
|
14
|
-
containerNumber: Unique ID for tracking the shipping container.
|
|
15
|
-
sealNumber: Number of the container's seal.
|
|
16
|
-
vessel: The name of the vessel.
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
Your task is to extract the text value of the following entities:
|
|
20
|
-
|
|
21
|
-
Keywords for datapoints:
|
|
22
|
-
- blNumber: Bill of Lading number, bill of landing no., swb-no., b/l no.
|
|
23
|
-
- voyage: voyage, voy. no, voyage-no.
|
|
24
|
-
- portOfLoading: port of loading, pol, from.]
|
|
25
|
-
- portOfDischarge: port of discharge, pod, delivery, to
|
|
26
|
-
- bookingNumber: Our reference, booking no., carrier reference
|
|
27
|
-
- containers:
|
|
28
|
-
- containerType: x 40' container
|
|
29
|
-
- grossWeight: gross weight
|
|
30
|
-
- measurements: Dimensions of the cargo (length, width, height) for freight calculations
|
|
31
|
-
- packageQuantity: package quantity, number and kind of packages
|
|
32
|
-
- packageType: Type of packaging used (e.g., cartons, pallets, barrels), number and kind of packages, description of goods
|
|
33
|
-
- containerNumber: container number, cntr. nos., it is a combination of 4 letters and 7 digits separated by space right above 'SEAL'
|
|
34
|
-
- sealNumber: seal number, seal nos., shipper seal, seal.
|
|
35
|
-
- vessel: vessel
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
You must apply the following rules:
|
|
39
|
-
- The JSON schema must be followed during the extraction.
|
|
40
|
-
- The values must only include text found in the document
|
|
41
|
-
- Do not normalize any entity value.
|
|
42
|
-
- If 'sealNumber' is not found don't add it to the result.
|
|
43
|
-
- Validate the JSON make sure it is a valid JSON ! No extra text, no missing comma!
|
|
44
|
-
- Add an escape character (backwards slash) in from of all quotes in values
|
|
@@ -1,17 +0,0 @@
|
|
|
1
|
-
Extract the following information from the sea waybill document.
|
|
2
|
-
|
|
3
|
-
**blNumber:** Find the value labeled as "B/L No.".
|
|
4
|
-
**voyage:** Get the "Voyage No." value.
|
|
5
|
-
**portOfLoading:** Find the value in the "Port of Loading" field.
|
|
6
|
-
**portOfDischarge:** Extract the text from the "Port of Discharge" field.
|
|
7
|
-
**bookingNumber:** Look for the value associated with "Booking No.".
|
|
8
|
-
**containers:**
|
|
9
|
-
The document may contain multiple containers listed within the section "PARTICULARS FURNISHED BY SHIPPER" under the line starting with "Kind of Packages; Description of goods; Marks and Numbers; Container No./Seal No.". Look for container information that starts with a line that includes "Container Said to Contain" and continues until the next instance of "Container Said to Contain" or the end of the section. For each container, extract the following:
|
|
10
|
-
* **containerType:** Extract the container type information. It is usually a combination of numbers, the word "DRY", and may include additional characters. It is found on the same line as the container number.
|
|
11
|
-
* **grossWeight:** Find the value corresponding to the "gross weight" of the container. It is usually represented in KGS and is found on the same line as the container number.
|
|
12
|
-
* **measurements:** Find the value corresponding to the "measurement" of the container. It is usually represented in CBM and is found on the same line as the container number.
|
|
13
|
-
* **packageQuantity:** Extract the "package quantity" information. It is usually a whole number and precedes the text "PACKAGE". All container information will be on the same line as the "package quantity".
|
|
14
|
-
* **packageType:** Extract the value from the "Kind of Packages" field.
|
|
15
|
-
* **containerNumber:** Find the container number. It starts with "MRKU" and is followed by a sequence of digits. It is found on the same line as the text "Container Said to Contain".
|
|
16
|
-
* **sealNumber:** Get the "Shipper Seal" value. It follows after the text "Shipper Seal :".
|
|
17
|
-
**vessel:** Extract the text from the field "Vessel".
|
|
@@ -1,44 +0,0 @@
|
|
|
1
|
-
You are a document entity extraction specialist. Given a document, extract the datapoints explained below.
|
|
2
|
-
|
|
3
|
-
blNumber: Bill of Lading number.
|
|
4
|
-
voyage: The journey or route code taken by the vessel.
|
|
5
|
-
portOfLoading: The port where cargo is loaded.
|
|
6
|
-
portOfDischarge: The port where cargo is unloaded.
|
|
7
|
-
bookingNumber: A unique identifier for the booking.
|
|
8
|
-
containers:
|
|
9
|
-
containerType: Type of the shipping container, usually related to its size.
|
|
10
|
-
grossWeight: Total weight of the cargo, including the tare weight of the container.
|
|
11
|
-
measurements: Dimensions of the cargo (length, width, height) for freight calculations.
|
|
12
|
-
packageQuantity: package quantity.
|
|
13
|
-
packageType: Type of packaging used (e.g., cartons, pallets, barrels).
|
|
14
|
-
containerNumber: Unique ID for tracking the shipping container.
|
|
15
|
-
sealNumber: Number of the container's seal.
|
|
16
|
-
vessel: The name of the vessel.
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
Your task is to extract the text value of the following entities:
|
|
20
|
-
|
|
21
|
-
Keywords for datapoints:
|
|
22
|
-
- blNumber: Bill of Lading number, bill of landing no., swb-no., b/l no.
|
|
23
|
-
- voyage: voyage, voy. no, voyage-no.
|
|
24
|
-
- portOfLoading: port of loading, pol, from.
|
|
25
|
-
- portOfDischarge: port of discharge, pod, delivery, to
|
|
26
|
-
- bookingNumber: Our reference, booking no., carrier reference
|
|
27
|
-
- containers:
|
|
28
|
-
- containerType: x 40' container
|
|
29
|
-
- grossWeight: gross weight
|
|
30
|
-
- measurements: Dimensions of the cargo (length, width, height) for freight calculations
|
|
31
|
-
- packageQuantity: package quantity, number and kind of packages
|
|
32
|
-
- packageType: Type of packaging used (e.g., cartons, pallets, barrels), number and kind of packages, description of goods
|
|
33
|
-
- containerNumber: container number, cntr. nos., it is a combination of 4 letters and 7 digits separated by space right above 'SEAL'
|
|
34
|
-
- sealNumber: seal number, seal nos., shipper seal, seal.
|
|
35
|
-
- vessel: vessel
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
You must apply the following rules:
|
|
39
|
-
- The JSON schema must be followed during the extraction.
|
|
40
|
-
- The values must only include text found in the document
|
|
41
|
-
- Do not normalize any entity value.
|
|
42
|
-
- If 'sealNumber' is not found don't add it to the result.
|
|
43
|
-
- Validate the JSON and make sure it is valid: no extra text, no missing commas!
|
|
44
|
-
- Add an escape character (backslash) in front of all quotes in values
|