data-science-document-ai 1.40.4__tar.gz → 1.42.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/PKG-INFO +1 -1
- {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/pyproject.toml +1 -1
- {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/docai.py +14 -5
- {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/excel_processing.py +14 -4
- {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/io.py +26 -1
- {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/llm.py +10 -3
- {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/pdf_processing.py +46 -10
- {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/postprocessing/common.py +34 -5
- data_science_document_ai-1.42.0/src/prompts/library/bookingConfirmation/evergreen/placeholders.json +32 -0
- {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/prompts/library/bookingConfirmation/evergreen/prompt.txt +1 -0
- data_science_document_ai-1.42.0/src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +32 -0
- {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +1 -1
- data_science_document_ai-1.42.0/src/prompts/library/bookingConfirmation/maersk/placeholders.json +32 -0
- {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/prompts/library/bookingConfirmation/maersk/prompt.txt +1 -1
- data_science_document_ai-1.42.0/src/prompts/library/bookingConfirmation/msc/placeholders.json +32 -0
- {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/prompts/library/bookingConfirmation/msc/prompt.txt +1 -1
- data_science_document_ai-1.42.0/src/prompts/library/bookingConfirmation/oocl/placeholders.json +32 -0
- {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/prompts/library/bookingConfirmation/oocl/prompt.txt +3 -1
- data_science_document_ai-1.42.0/src/prompts/library/bookingConfirmation/other/placeholders.json +32 -0
- {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/prompts/library/bookingConfirmation/other/prompt.txt +1 -1
- data_science_document_ai-1.42.0/src/prompts/library/bookingConfirmation/yangming/placeholders.json +32 -0
- {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/prompts/library/bookingConfirmation/yangming/prompt.txt +1 -1
- {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/prompts/library/bundeskasse/other/placeholders.json +19 -19
- {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/prompts/library/bundeskasse/other/prompt.txt +1 -1
- {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/prompts/library/commercialInvoice/other/prompt.txt +2 -1
- {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/prompts/library/customsAssessment/other/prompt.txt +1 -1
- {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/prompts/library/customsInvoice/other/placeholders.json +19 -19
- {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/prompts/library/customsInvoice/other/prompt.txt +1 -1
- data_science_document_ai-1.42.0/src/prompts/library/deliveryOrder/other/placeholders.json +29 -0
- {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/prompts/library/deliveryOrder/other/prompt.txt +1 -1
- {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/prompts/library/draftMbl/hapag-lloyd/prompt.txt +2 -1
- {data_science_document_ai-1.40.4/src/prompts/library/finalMbL → data_science_document_ai-1.42.0/src/prompts/library/draftMbl}/maersk/prompt.txt +2 -0
- {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/prompts/library/draftMbl/other/prompt.txt +1 -1
- {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/prompts/library/finalMbL/hapag-lloyd/prompt.txt +1 -1
- {data_science_document_ai-1.40.4/src/prompts/library/draftMbl → data_science_document_ai-1.42.0/src/prompts/library/finalMbL}/maersk/prompt.txt +2 -0
- {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/prompts/library/finalMbL/other/prompt.txt +1 -1
- {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/prompts/library/packingList/other/prompt.txt +1 -1
- {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/prompts/library/partnerInvoice/other/placeholders.json +12 -60
- {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/prompts/library/partnerInvoice/other/prompt.txt +1 -1
- {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/prompts/library/shippingInstruction/other/prompt.txt +1 -0
- {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/prompts/prompt_library.py +4 -0
- {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/setup.py +5 -1
- {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/utils.py +64 -4
- data_science_document_ai-1.40.4/src/prompts/library/bookingConfirmation/evergreen/placeholders.json +0 -32
- data_science_document_ai-1.40.4/src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +0 -32
- data_science_document_ai-1.40.4/src/prompts/library/bookingConfirmation/maersk/placeholders.json +0 -32
- data_science_document_ai-1.40.4/src/prompts/library/bookingConfirmation/msc/placeholders.json +0 -32
- data_science_document_ai-1.40.4/src/prompts/library/bookingConfirmation/oocl/placeholders.json +0 -32
- data_science_document_ai-1.40.4/src/prompts/library/bookingConfirmation/other/placeholders.json +0 -32
- data_science_document_ai-1.40.4/src/prompts/library/bookingConfirmation/yangming/placeholders.json +0 -32
- data_science_document_ai-1.40.4/src/prompts/library/customsAssessment/other/placeholders.json +0 -19
- data_science_document_ai-1.40.4/src/prompts/library/deliveryOrder/other/placeholders.json +0 -31
- data_science_document_ai-1.40.4/src/prompts/library/finalMbL/other/placeholders.json +0 -80
- {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/constants.py +0 -0
- {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/constants_sandbox.py +0 -0
- {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/docai_processor_config.yaml +0 -0
- {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/log_setup.py +0 -0
- {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/postprocessing/postprocess_booking_confirmation.py +0 -0
- {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/postprocessing/postprocess_commercial_invoice.py +0 -0
- {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/postprocessing/postprocess_partner_invoice.py +0 -0
- {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/prompts/library/draftMbl/other/placeholders.json +0 -0
- {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/prompts/library/postprocessing/port_code/placeholders.json +0 -0
- {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/prompts/library/postprocessing/port_code/prompt_port_code.txt +0 -0
- {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/prompts/library/preprocessing/carrier/placeholders.json +0 -0
- {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/prompts/library/preprocessing/carrier/prompt.txt +0 -0
- {data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/tms.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "data-science-document-ai"
|
|
3
|
-
version = "1.40.4"
|
|
3
|
+
version = "1.42.0"
|
|
4
4
|
description = "\"Document AI repo for data science\""
|
|
5
5
|
authors = ["Naomi Nguyen <naomi.nguyen@forto.com>", "Kumar Rajendrababu <kumar.rajendrababu@forto.com>", "Igor Tonko <igor.tonko@forto.com>", "Osman Demirel <osman.demirel@forto.com>"]
|
|
6
6
|
packages = [
|
|
@@ -3,11 +3,16 @@ import re
|
|
|
3
3
|
|
|
4
4
|
from google.cloud import documentai
|
|
5
5
|
|
|
6
|
-
from src.io import delete_folder_from_bucket, logger, upload_pdf_to_bucket
|
|
6
|
+
from src.io import (
|
|
7
|
+
delete_folder_from_bucket,
|
|
8
|
+
get_gcp_labels,
|
|
9
|
+
logger,
|
|
10
|
+
upload_pdf_to_bucket,
|
|
11
|
+
)
|
|
7
12
|
from src.utils import cache_on_disk
|
|
8
13
|
|
|
9
14
|
|
|
10
|
-
async def _process_pdf_w_docai(image_content, client, processor_name):
|
|
15
|
+
async def _process_pdf_w_docai(image_content, client, processor_name, doc_type=None):
|
|
11
16
|
"""Process the PDF using Document AI.
|
|
12
17
|
|
|
13
18
|
Args:
|
|
@@ -15,6 +20,7 @@ async def _process_pdf_w_docai(image_content, client, processor_name):
|
|
|
15
20
|
client: The Document AI client.
|
|
16
21
|
processor_name (str): The name of the processor to be used.
|
|
17
22
|
e.g.: projects/{project_id}/locations/{location}/processor/{processor_id}
|
|
23
|
+
doc_type (str, optional): Document type for cost tracking labels.
|
|
18
24
|
|
|
19
25
|
Returns:
|
|
20
26
|
The processed document.
|
|
@@ -24,10 +30,11 @@ async def _process_pdf_w_docai(image_content, client, processor_name):
|
|
|
24
30
|
content=image_content, mime_type="application/pdf"
|
|
25
31
|
)
|
|
26
32
|
|
|
27
|
-
# Configure the process request
|
|
33
|
+
# Configure the process request with labels for cost tracking
|
|
28
34
|
request = documentai.ProcessRequest(
|
|
29
35
|
name=processor_name,
|
|
30
36
|
raw_document=raw_document, # field_mask=field_mask
|
|
37
|
+
labels=get_gcp_labels(doc_type=doc_type),
|
|
31
38
|
)
|
|
32
39
|
result = await cache_on_disk(client.process_document, request=request)
|
|
33
40
|
|
|
@@ -35,7 +42,7 @@ async def _process_pdf_w_docai(image_content, client, processor_name):
|
|
|
35
42
|
|
|
36
43
|
|
|
37
44
|
async def _batch_process_pdf_w_docai(
|
|
38
|
-
params, image_content, client, processor_name, timeout=1200
|
|
45
|
+
params, image_content, client, processor_name, timeout=1200, doc_type=None
|
|
39
46
|
):
|
|
40
47
|
"""Process the PDF using Document AI Batch Process API.
|
|
41
48
|
|
|
@@ -45,6 +52,7 @@ async def _batch_process_pdf_w_docai(
|
|
|
45
52
|
processor_name (str): The name of the processor to be used.
|
|
46
53
|
e.g.: projects/{project_id}/locations/{location}/processor/{processor_id}
|
|
47
54
|
timeout (int, optional): The timeout in seconds. Defaults to 1200.
|
|
55
|
+
doc_type (str, optional): Document type for cost tracking labels.
|
|
48
56
|
|
|
49
57
|
Returns:
|
|
50
58
|
The processed document.
|
|
@@ -72,11 +80,12 @@ async def _batch_process_pdf_w_docai(
|
|
|
72
80
|
# Where to write results
|
|
73
81
|
output_config = documentai.DocumentOutputConfig(gcs_output_config=gcs_output_config)
|
|
74
82
|
|
|
75
|
-
# The full resource name of the processor
|
|
83
|
+
# The full resource name of the processor with labels for cost tracking
|
|
76
84
|
request = documentai.BatchProcessRequest(
|
|
77
85
|
name=processor_name,
|
|
78
86
|
input_documents=input_config,
|
|
79
87
|
document_output_config=output_config,
|
|
88
|
+
labels=get_gcp_labels(doc_type=doc_type),
|
|
80
89
|
)
|
|
81
90
|
|
|
82
91
|
# BatchProcess returns a Long Running Operation (LRO)
|
|
@@ -2,6 +2,8 @@
|
|
|
2
2
|
# flake8: noqa: E402
|
|
3
3
|
import logging
|
|
4
4
|
|
|
5
|
+
from src.postprocessing.common import llm_prediction_to_tuples
|
|
6
|
+
|
|
5
7
|
logger = logging.getLogger(__name__)
|
|
6
8
|
|
|
7
9
|
import asyncio
|
|
@@ -14,7 +16,9 @@ from src.llm import prompt_excel_extraction
|
|
|
14
16
|
from src.utils import generate_schema_structure, get_excel_sheets
|
|
15
17
|
|
|
16
18
|
|
|
17
|
-
async def extract_data_from_sheet(params, sheet_name, sheet, response_schema):
|
|
19
|
+
async def extract_data_from_sheet(
|
|
20
|
+
params, sheet_name, sheet, response_schema, doc_type=None
|
|
21
|
+
):
|
|
18
22
|
logger.info(f"Processing sheet: {sheet_name}")
|
|
19
23
|
excel_content = pd.DataFrame(sheet.values)
|
|
20
24
|
# Convert to Markdown format for the LLM model
|
|
@@ -30,6 +34,7 @@ async def extract_data_from_sheet(params, sheet_name, sheet, response_schema):
|
|
|
30
34
|
result = await params["LlmClient"].get_unified_json_genai(
|
|
31
35
|
prompt_docai,
|
|
32
36
|
response_schema=response_schema,
|
|
37
|
+
doc_type=doc_type,
|
|
33
38
|
)
|
|
34
39
|
except Exception as e:
|
|
35
40
|
result = {}
|
|
@@ -67,12 +72,17 @@ async def extract_data_from_excel(
|
|
|
67
72
|
# Excel files may contain multiple sheets. Extract data from each sheet
|
|
68
73
|
sheet_extract_tasks = [
|
|
69
74
|
extract_data_from_sheet(
|
|
70
|
-
params, sheet_name, workbook[sheet_name], response_schema
|
|
75
|
+
params,
|
|
76
|
+
sheet_name,
|
|
77
|
+
workbook[sheet_name],
|
|
78
|
+
response_schema,
|
|
79
|
+
doc_type=input_doc_type,
|
|
71
80
|
)
|
|
72
81
|
for sheet_name in sheets
|
|
73
82
|
]
|
|
74
83
|
extracted_data = {k: v for k, v in await asyncio.gather(*sheet_extract_tasks)}
|
|
75
84
|
|
|
76
|
-
|
|
85
|
+
# Convert LLM prediction dictionary to tuples of (value, page_number).
|
|
86
|
+
extracted_data = llm_prediction_to_tuples(extracted_data)
|
|
77
87
|
|
|
78
|
-
return extracted_data,
|
|
88
|
+
return extracted_data, extracted_data, params["gemini_params"]["model_id"]
|
|
@@ -11,6 +11,28 @@ from pathlib import Path
|
|
|
11
11
|
|
|
12
12
|
from google.cloud import bigquery, storage
|
|
13
13
|
|
|
14
|
+
from src.constants import project_parameters
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def get_gcp_labels(**extra_labels):
|
|
18
|
+
"""Generate standardized GCP labels for cost tracking.
|
|
19
|
+
|
|
20
|
+
Args:
|
|
21
|
+
**extra_labels: Additional custom labels
|
|
22
|
+
|
|
23
|
+
Returns:
|
|
24
|
+
dict: Labels dictionary with keys normalized (lowercase, hyphens, max 63 chars)
|
|
25
|
+
"""
|
|
26
|
+
labels = {
|
|
27
|
+
"ds-project-name": project_parameters["project_name"],
|
|
28
|
+
"ds-env": os.getenv("CLUSTER", "local").lower(),
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
# Add any extra labels passed in
|
|
32
|
+
labels.update({k.lower(): str(v).lower() for k, v in extra_labels.items()})
|
|
33
|
+
|
|
34
|
+
return labels
|
|
35
|
+
|
|
14
36
|
|
|
15
37
|
def get_bq_client(params):
|
|
16
38
|
"""Get Google BigQuery client."""
|
|
@@ -18,7 +40,7 @@ def get_bq_client(params):
|
|
|
18
40
|
job_config = bigquery.QueryJobConfig(
|
|
19
41
|
allow_large_results=True,
|
|
20
42
|
# flatten_results=True,
|
|
21
|
-
labels=
|
|
43
|
+
labels=get_gcp_labels(),
|
|
22
44
|
)
|
|
23
45
|
return bq_client, job_config
|
|
24
46
|
|
|
@@ -112,3 +134,6 @@ def download_dir_from_bucket(bucket, directory_cloud, directory_local) -> bool:
|
|
|
112
134
|
Path(directory).mkdir(parents=True, exist_ok=True)
|
|
113
135
|
blob.download_to_filename(directory_local / Path(blob.name))
|
|
114
136
|
return result
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
# type: ignore
|
|
@@ -15,6 +15,7 @@ from vertexai.generative_models import (
|
|
|
15
15
|
Part,
|
|
16
16
|
)
|
|
17
17
|
|
|
18
|
+
from src.io import get_gcp_labels
|
|
18
19
|
from src.utils import cache_on_disk
|
|
19
20
|
|
|
20
21
|
|
|
@@ -69,6 +70,7 @@ class LlmClient:
|
|
|
69
70
|
document: str = None,
|
|
70
71
|
response_schema: dict = None,
|
|
71
72
|
response_mime_type: str = "application/json",
|
|
73
|
+
doc_type: str = None,
|
|
72
74
|
):
|
|
73
75
|
"""Ask the Gemini model a question.
|
|
74
76
|
|
|
@@ -76,6 +78,7 @@ class LlmClient:
|
|
|
76
78
|
prompt (str): The prompt to send to the model.
|
|
77
79
|
document (str, optional): An optional document to provide context.
|
|
78
80
|
response_schema (dict, optional): Defines a specific response schema for the model.
|
|
81
|
+
doc_type (str, optional): Document type for cost tracking labels.
|
|
79
82
|
|
|
80
83
|
Returns:
|
|
81
84
|
str: The response from the model.
|
|
@@ -96,12 +99,13 @@ class LlmClient:
|
|
|
96
99
|
# Prepare inputs for the model
|
|
97
100
|
inputs = [document, prompt] if document else prompt
|
|
98
101
|
|
|
99
|
-
# Generate the response
|
|
102
|
+
# Generate the response with labels for cost tracking
|
|
100
103
|
model_response = await cache_on_disk(
|
|
101
104
|
self.geminy_client.generate_content_async,
|
|
102
105
|
contents=inputs,
|
|
103
106
|
generation_config=config,
|
|
104
107
|
safety_settings=self.safety_config,
|
|
108
|
+
labels=get_gcp_labels(doc_type=doc_type),
|
|
105
109
|
)
|
|
106
110
|
|
|
107
111
|
response_text = model_response.text
|
|
@@ -113,7 +117,7 @@ class LlmClient:
|
|
|
113
117
|
return "{}"
|
|
114
118
|
|
|
115
119
|
async def get_unified_json_genai(
|
|
116
|
-
self, prompt, document=None, response_schema=None, model="gemini"
|
|
120
|
+
self, prompt, document=None, response_schema=None, model="gemini", doc_type=None
|
|
117
121
|
):
|
|
118
122
|
"""Send a prompt to a Google Cloud AI Platform model and returns the generated json.
|
|
119
123
|
|
|
@@ -122,6 +126,7 @@ class LlmClient:
|
|
|
122
126
|
document: Content of the PDF document
|
|
123
127
|
response_schema: The schema to use for the response
|
|
124
128
|
model (str): The model to use for the response ["gemini" or "chatGPT"]. Default is "gemini".
|
|
129
|
+
doc_type (str, optional): Document type for cost tracking labels.
|
|
125
130
|
|
|
126
131
|
Returns:
|
|
127
132
|
dict: The generated json from the model.
|
|
@@ -131,7 +136,9 @@ class LlmClient:
|
|
|
131
136
|
response = await self.ask_chatgpt(prompt, document, response_schema)
|
|
132
137
|
else:
|
|
133
138
|
# Default to Gemini
|
|
134
|
-
response = await self.ask_gemini(prompt, document, response_schema)
|
|
139
|
+
response = await self.ask_gemini(
|
|
140
|
+
prompt, document, response_schema, doc_type=doc_type
|
|
141
|
+
)
|
|
135
142
|
|
|
136
143
|
try:
|
|
137
144
|
return json.loads(response)
|
|
@@ -14,7 +14,11 @@ from google.cloud.documentai_v1 import Document as docaiv1_document
|
|
|
14
14
|
|
|
15
15
|
from src.docai import _batch_process_pdf_w_docai, _process_pdf_w_docai
|
|
16
16
|
from src.excel_processing import extract_data_from_excel
|
|
17
|
-
from src.postprocessing.common import format_all_entities, remove_none_values
|
|
17
|
+
from src.postprocessing.common import (
|
|
18
|
+
format_all_entities,
|
|
19
|
+
llm_prediction_to_tuples,
|
|
20
|
+
remove_none_values,
|
|
21
|
+
)
|
|
18
22
|
from src.postprocessing.postprocess_booking_confirmation import (
|
|
19
23
|
postprocess_booking_confirmation,
|
|
20
24
|
)
|
|
@@ -30,11 +34,14 @@ from src.utils import (
|
|
|
30
34
|
generate_schema_structure,
|
|
31
35
|
get_processor_name,
|
|
32
36
|
run_background_tasks,
|
|
37
|
+
transform_schema_strings,
|
|
33
38
|
validate_based_on_schema,
|
|
34
39
|
)
|
|
35
40
|
|
|
36
41
|
|
|
37
|
-
async def process_file_w_docai(params, image_content, client, processor_name):
|
|
42
|
+
async def process_file_w_docai(
|
|
43
|
+
params, image_content, client, processor_name, doc_type=None
|
|
44
|
+
):
|
|
38
45
|
"""
|
|
39
46
|
Process a file using Document AI.
|
|
40
47
|
|
|
@@ -43,6 +50,7 @@ async def process_file_w_docai(params, image_content, client, processor_name):
|
|
|
43
50
|
image_content (bytes): The file to be processed. It can be bytes object.
|
|
44
51
|
client: The Document AI client.
|
|
45
52
|
processor_name (str): The name of the processor to be used.
|
|
53
|
+
doc_type (str, optional): Document type for cost tracking labels.
|
|
46
54
|
|
|
47
55
|
Returns:
|
|
48
56
|
The processed document.
|
|
@@ -54,7 +62,9 @@ async def process_file_w_docai(params, image_content, client, processor_name):
|
|
|
54
62
|
|
|
55
63
|
try:
|
|
56
64
|
logger.info("Processing document...")
|
|
57
|
-
result = await _process_pdf_w_docai(image_content, client, processor_name)
|
|
65
|
+
result = await _process_pdf_w_docai(
|
|
66
|
+
image_content, client, processor_name, doc_type=doc_type
|
|
67
|
+
)
|
|
58
68
|
except Exception as e:
|
|
59
69
|
if e.reason == "PAGE_LIMIT_EXCEEDED":
|
|
60
70
|
logger.warning(
|
|
@@ -63,7 +73,7 @@ async def process_file_w_docai(params, image_content, client, processor_name):
|
|
|
63
73
|
# Process the document in batch method (offline processing)
|
|
64
74
|
try:
|
|
65
75
|
result = await _batch_process_pdf_w_docai(
|
|
66
|
-
params, image_content, client, processor_name
|
|
76
|
+
params, image_content, client, processor_name, doc_type=doc_type
|
|
67
77
|
)
|
|
68
78
|
except Exception as batch_e:
|
|
69
79
|
logger.error(f"Error processing document {batch_e}.")
|
|
@@ -93,7 +103,7 @@ async def extract_data_from_pdf_w_docai(
|
|
|
93
103
|
)
|
|
94
104
|
|
|
95
105
|
result = await process_file_w_docai(
|
|
96
|
-
params, file_content, processor_client, processor_name
|
|
106
|
+
params, file_content, processor_client, processor_name, doc_type=input_doc_type
|
|
97
107
|
)
|
|
98
108
|
|
|
99
109
|
# Create an entity object to store the result in gcs
|
|
@@ -104,9 +114,22 @@ async def extract_data_from_pdf_w_docai(
|
|
|
104
114
|
# Extract entities from the result
|
|
105
115
|
for entity in result.entities:
|
|
106
116
|
value = (
|
|
107
|
-
{
|
|
117
|
+
{
|
|
118
|
+
child.type_: (
|
|
119
|
+
child.mention_text,
|
|
120
|
+
child.page_anchor.page_refs[0].page
|
|
121
|
+
if hasattr(child.page_anchor.page_refs[0], "page")
|
|
122
|
+
else 0,
|
|
123
|
+
)
|
|
124
|
+
for child in entity.properties
|
|
125
|
+
}
|
|
108
126
|
if entity.properties
|
|
109
|
-
else
|
|
127
|
+
else (
|
|
128
|
+
entity.mention_text,
|
|
129
|
+
entity.page_anchor.page_refs[0].page
|
|
130
|
+
if hasattr(entity.page_anchor.page_refs[0], "page")
|
|
131
|
+
else 0,
|
|
132
|
+
)
|
|
110
133
|
)
|
|
111
134
|
aggregated_data[entity.type_].append(value)
|
|
112
135
|
|
|
@@ -137,7 +160,9 @@ async def extract_data_from_pdf_w_docai(
|
|
|
137
160
|
return aggregated_data, result_for_store, processor_version
|
|
138
161
|
|
|
139
162
|
|
|
140
|
-
async def identify_carrier(document, llm_client, prompt, response_schema):
|
|
163
|
+
async def identify_carrier(
|
|
164
|
+
document, llm_client, prompt, response_schema, doc_type=None
|
|
165
|
+
):
|
|
141
166
|
"""Identify the carrier from the Booking Confirmation document."""
|
|
142
167
|
|
|
143
168
|
result = await llm_client.ask_gemini(
|
|
@@ -145,6 +170,7 @@ async def identify_carrier(document, llm_client, prompt, response_schema):
|
|
|
145
170
|
document=document,
|
|
146
171
|
response_schema=response_schema,
|
|
147
172
|
response_mime_type="text/x.enum",
|
|
173
|
+
doc_type=doc_type,
|
|
148
174
|
)
|
|
149
175
|
|
|
150
176
|
if result:
|
|
@@ -201,7 +227,11 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
|
|
|
201
227
|
|
|
202
228
|
# identify carrier for customized prompting
|
|
203
229
|
carrier = await identify_carrier(
|
|
204
|
-
document, llm_client, carrier_prompt, carrier_schema
|
|
230
|
+
document,
|
|
231
|
+
llm_client,
|
|
232
|
+
carrier_prompt,
|
|
233
|
+
carrier_schema,
|
|
234
|
+
doc_type=input_doc_type,
|
|
205
235
|
)
|
|
206
236
|
|
|
207
237
|
if input_doc_type == "bookingConfirmation":
|
|
@@ -218,8 +248,14 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
|
|
|
218
248
|
|
|
219
249
|
# generate the result with LLM (gemini)
|
|
220
250
|
result = await llm_client.get_unified_json_genai(
|
|
221
|
-
prompt=prompt, document=document, response_schema=response_schema
|
|
251
|
+
prompt=prompt,
|
|
252
|
+
document=document,
|
|
253
|
+
response_schema=response_schema,
|
|
254
|
+
doc_type=input_doc_type,
|
|
222
255
|
)
|
|
256
|
+
|
|
257
|
+
result = llm_prediction_to_tuples(result)
|
|
258
|
+
|
|
223
259
|
return result
|
|
224
260
|
return {}
|
|
225
261
|
|
{data_science_document_ai-1.40.4 → data_science_document_ai-1.42.0}/src/postprocessing/common.py
RENAMED
|
@@ -380,11 +380,18 @@ async def format_label(entity_k, entity_value, document_type_code, params):
|
|
|
380
380
|
]
|
|
381
381
|
)
|
|
382
382
|
return entity_k, [v for _, v in format_tasks]
|
|
383
|
+
if isinstance(entity_value, tuple):
|
|
384
|
+
page = entity_value[1]
|
|
385
|
+
entity_value = entity_value[0]
|
|
386
|
+
else:
|
|
387
|
+
page = -1
|
|
383
388
|
entity_key = entity_k.lower()
|
|
384
389
|
formatted_value = None
|
|
385
390
|
|
|
386
391
|
if entity_key.startswith("port"):
|
|
387
|
-
formatted_value = await get_port_code_ai(entity_value, llm_client)
|
|
392
|
+
formatted_value = await get_port_code_ai(
|
|
393
|
+
entity_value, llm_client, doc_type=document_type_code
|
|
394
|
+
)
|
|
388
395
|
|
|
389
396
|
elif (entity_key == "containertype") or (entity_key == "containersize"):
|
|
390
397
|
formatted_value = get_tms_mappings(entity_value, "container_types")
|
|
@@ -474,18 +481,19 @@ async def format_label(entity_k, entity_value, document_type_code, params):
|
|
|
474
481
|
result = {
|
|
475
482
|
"documentValue": entity_value,
|
|
476
483
|
"formattedValue": formatted_value,
|
|
484
|
+
"page": page,
|
|
477
485
|
}
|
|
478
486
|
return entity_k, result
|
|
479
487
|
|
|
480
488
|
|
|
481
|
-
async def get_port_code_ai(port: str, llm_client):
|
|
489
|
+
async def get_port_code_ai(port: str, llm_client, doc_type=None):
|
|
482
490
|
"""Get port code using AI model."""
|
|
483
|
-
port_llm = await get_port_code_llm(port, llm_client)
|
|
491
|
+
port_llm = await get_port_code_llm(port, llm_client, doc_type=doc_type)
|
|
484
492
|
|
|
485
493
|
return get_tms_mappings(port, "ports", port_llm)
|
|
486
494
|
|
|
487
495
|
|
|
488
|
-
async def get_port_code_llm(port: str, llm_client):
|
|
496
|
+
async def get_port_code_llm(port: str, llm_client, doc_type=None):
|
|
489
497
|
if (
|
|
490
498
|
"postprocessing" in prompt_library.library.keys()
|
|
491
499
|
and "port_code" in prompt_library.library["postprocessing"].keys()
|
|
@@ -512,7 +520,7 @@ async def get_port_code_llm(port: str, llm_client):
|
|
|
512
520
|
}
|
|
513
521
|
|
|
514
522
|
response = await llm_client.get_unified_json_genai(
|
|
515
|
-
prompt, response_schema=response_schema, model="chatgpt"
|
|
523
|
+
prompt, response_schema=response_schema, model="chatgpt", doc_type=doc_type
|
|
516
524
|
)
|
|
517
525
|
try:
|
|
518
526
|
mapped_port = response["port"]
|
|
@@ -616,3 +624,24 @@ def remove_stop_words(lineitem: str):
|
|
|
616
624
|
.upper()
|
|
617
625
|
.strip()
|
|
618
626
|
)
|
|
627
|
+
|
|
628
|
+
|
|
629
|
+
def llm_prediction_to_tuples(llm_prediction):
|
|
630
|
+
"""Convert LLM prediction dictionary to tuples of (value, page_number)."""
|
|
631
|
+
if isinstance(llm_prediction, dict):
|
|
632
|
+
if "page_number" in llm_prediction.keys() and "value" in llm_prediction.keys():
|
|
633
|
+
if llm_prediction["value"]:
|
|
634
|
+
try:
|
|
635
|
+
page_number = int(llm_prediction["page_number"])
|
|
636
|
+
except: # noqa: E722
|
|
637
|
+
page_number = -1
|
|
638
|
+
return (llm_prediction["value"], page_number)
|
|
639
|
+
return None
|
|
640
|
+
for key, value in llm_prediction.items():
|
|
641
|
+
llm_prediction[key] = llm_prediction_to_tuples(
|
|
642
|
+
llm_prediction.get(key, value)
|
|
643
|
+
)
|
|
644
|
+
elif isinstance(llm_prediction, list):
|
|
645
|
+
for i, item in enumerate(llm_prediction):
|
|
646
|
+
llm_prediction[i] = llm_prediction_to_tuples(item)
|
|
647
|
+
return llm_prediction
|
data_science_document_ai-1.42.0/src/prompts/library/bookingConfirmation/evergreen/placeholders.json
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
{
|
|
2
|
+
"type": "OBJECT",
|
|
3
|
+
"properties": {
|
|
4
|
+
"cfsCutOff": {"type": "STRING", "nullable": true, "description": "the date by which an LCL (Less than Container Load) shipment needs to be checked in to a CFS (Container Freight Station) to meet its scheduled sailing"},
|
|
5
|
+
"bookingNumber": {"type": "STRING", "nullable": true},
|
|
6
|
+
"cyCutOff": {"type": "STRING", "nullable": true},
|
|
7
|
+
"gateInReference": {"type": "STRING", "nullable": true},
|
|
8
|
+
"gateInTerminal": {"type": "STRING", "nullable": true},
|
|
9
|
+
"mblNumber": {"type": "STRING", "nullable": true},
|
|
10
|
+
"pickUpReference": {"type": "STRING", "nullable": true},
|
|
11
|
+
"pickUpTerminal": {"type": "STRING", "nullable": true},
|
|
12
|
+
"siCutOff": {"type": "STRING", "nullable": true},
|
|
13
|
+
"vgmCutOff": {"type": "STRING", "nullable": true},
|
|
14
|
+
"transportLegs": {
|
|
15
|
+
"type": "ARRAY",
|
|
16
|
+
"items": {
|
|
17
|
+
"type": "OBJECT",
|
|
18
|
+
"properties": {
|
|
19
|
+
"eta": {"type": "STRING", "nullable": true},
|
|
20
|
+
"etd": {"type": "STRING", "nullable": true},
|
|
21
|
+
"imoNumber": {"type": "STRING", "nullable": true},
|
|
22
|
+
"portOfDischarge": {"type": "STRING", "nullable": true},
|
|
23
|
+
"portOfLoading": {"type": "STRING", "nullable": true},
|
|
24
|
+
"vesselName": {"type": "STRING", "nullable": true},
|
|
25
|
+
"voyage": {"type": "STRING", "nullable": true}
|
|
26
|
+
},
|
|
27
|
+
"required": []
|
|
28
|
+
}
|
|
29
|
+
}
|
|
30
|
+
},
|
|
31
|
+
"required": []
|
|
32
|
+
}
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
{
|
|
2
|
+
"type": "OBJECT",
|
|
3
|
+
"properties": {
|
|
4
|
+
"cfsCutOff": {"type": "STRING", "nullable": true, "description": "the date by which an LCL (Less than Container Load) shipment needs to be checked in to a CFS (Container Freight Station) to meet its scheduled sailing"},
|
|
5
|
+
"bookingNumber": {"type": "STRING", "nullable": true},
|
|
6
|
+
"cyCutOff": {"type": "STRING", "nullable": true},
|
|
7
|
+
"gateInReference": {"type": "STRING", "nullable": true},
|
|
8
|
+
"gateInTerminal": {"type": "STRING", "nullable": true},
|
|
9
|
+
"mblNumber": {"type": "STRING", "nullable": true},
|
|
10
|
+
"pickUpReference": {"type": "STRING", "nullable": true},
|
|
11
|
+
"pickUpTerminal": {"type": "STRING", "nullable": true},
|
|
12
|
+
"siCutOff": {"type": "STRING", "nullable": true},
|
|
13
|
+
"vgmCutOff": {"type": "STRING", "nullable": true},
|
|
14
|
+
"transportLegs": {
|
|
15
|
+
"type": "ARRAY",
|
|
16
|
+
"items": {
|
|
17
|
+
"type": "OBJECT",
|
|
18
|
+
"properties": {
|
|
19
|
+
"eta": {"type": "STRING", "nullable": true},
|
|
20
|
+
"etd": {"type": "STRING", "nullable": true},
|
|
21
|
+
"imoNumber": {"type": "STRING", "nullable": true},
|
|
22
|
+
"portOfDischarge": {"type": "STRING", "nullable": true},
|
|
23
|
+
"portOfLoading": {"type": "STRING", "nullable": true},
|
|
24
|
+
"vesselName": {"type": "STRING", "nullable": true},
|
|
25
|
+
"voyage": {"type": "STRING", "nullable": true}
|
|
26
|
+
},
|
|
27
|
+
"required": []
|
|
28
|
+
}
|
|
29
|
+
}
|
|
30
|
+
},
|
|
31
|
+
"required": []
|
|
32
|
+
}
|
|
@@ -18,7 +18,7 @@ transportLegs:
|
|
|
18
18
|
vesselName: The name of the vessel for a specific leg.
|
|
19
19
|
voyage: The journey or route taken by the vessel for a specific leg.
|
|
20
20
|
|
|
21
|
-
your task is to extract the text value of the following entities:
|
|
21
|
+
your task is to extract the text value of the following entities and page numbers starting from 0 where the value was found in the document:
|
|
22
22
|
SCHEMA_PLACEHOLDER
|
|
23
23
|
|
|
24
24
|
Keywords for datapoints:
|
data_science_document_ai-1.42.0/src/prompts/library/bookingConfirmation/maersk/placeholders.json
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
{
|
|
2
|
+
"type": "OBJECT",
|
|
3
|
+
"properties": {
|
|
4
|
+
"bookingNumber": {"type": "STRING", "nullable": true},
|
|
5
|
+
"cfsCutOff": {"type": "STRING", "nullable": true, "description": "the date by which an LCL (Less than Container Load) shipment needs to be checked in to a CFS (Container Freight Station) to meet its scheduled sailing"},
|
|
6
|
+
"cyCutOff": {"type": "STRING", "nullable": true},
|
|
7
|
+
"gateInReference": {"type": "STRING", "nullable": true},
|
|
8
|
+
"gateInTerminal": {"type": "STRING", "nullable": true},
|
|
9
|
+
"mblNumber": {"type": "STRING", "nullable": true},
|
|
10
|
+
"pickUpReference": {"type": "STRING", "nullable": true},
|
|
11
|
+
"pickUpTerminal": {"type": "STRING", "nullable": true},
|
|
12
|
+
"siCutOff": {"type": "STRING", "nullable": true},
|
|
13
|
+
"vgmCutOff": {"type": "STRING", "nullable": true},
|
|
14
|
+
"transportLegs": {
|
|
15
|
+
"type": "ARRAY",
|
|
16
|
+
"items": {
|
|
17
|
+
"type": "OBJECT",
|
|
18
|
+
"properties": {
|
|
19
|
+
"eta": {"type": "STRING", "nullable": true},
|
|
20
|
+
"etd": {"type": "STRING", "nullable": true},
|
|
21
|
+
"imoNumber": {"type": "STRING", "nullable": true},
|
|
22
|
+
"portOfDischarge": {"type": "STRING", "nullable": true},
|
|
23
|
+
"portOfLoading": {"type": "STRING", "nullable": true},
|
|
24
|
+
"vesselName": {"type": "STRING", "nullable": true},
|
|
25
|
+
"voyage": {"type": "STRING", "nullable": true}
|
|
26
|
+
},
|
|
27
|
+
"required": []
|
|
28
|
+
}
|
|
29
|
+
}
|
|
30
|
+
},
|
|
31
|
+
"required": []
|
|
32
|
+
}
|
|
@@ -18,7 +18,7 @@ transportLegs:
|
|
|
18
18
|
vesselName: The name of the vessel for a specific leg.
|
|
19
19
|
voyage: The journey or route taken by the vessel for a specific leg.
|
|
20
20
|
|
|
21
|
-
your task is to extract the text value of the following entities:
|
|
21
|
+
your task is to extract the text value of the following entities and page numbers starting from 0 where the value was found in the document:
|
|
22
22
|
SCHEMA_PLACEHOLDER
|
|
23
23
|
|
|
24
24
|
Keywords for datapoints:
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
{
|
|
2
|
+
"type": "OBJECT",
|
|
3
|
+
"properties": {
|
|
4
|
+
"cfsCutOff": {"type": "STRING", "nullable": true, "description": "the date by which an LCL (Less than Container Load) shipment needs to be checked in to a CFS (Container Freight Station) to meet its scheduled sailing"},
|
|
5
|
+
"bookingNumber": {"type": "STRING", "nullable": true},
|
|
6
|
+
"cyCutOff": {"type": "STRING", "nullable": true},
|
|
7
|
+
"gateInReference": {"type": "STRING", "nullable": true},
|
|
8
|
+
"gateInTerminal": {"type": "STRING", "nullable": true},
|
|
9
|
+
"mblNumber": {"type": "STRING", "nullable": true},
|
|
10
|
+
"pickUpReference": {"type": "STRING", "nullable": true},
|
|
11
|
+
"pickUpTerminal": {"type": "STRING", "nullable": true},
|
|
12
|
+
"siCutOff": {"type": "STRING", "nullable": true},
|
|
13
|
+
"vgmCutOff": {"type": "STRING", "nullable": true},
|
|
14
|
+
"transportLegs": {
|
|
15
|
+
"type": "ARRAY",
|
|
16
|
+
"items": {
|
|
17
|
+
"type": "OBJECT",
|
|
18
|
+
"properties": {
|
|
19
|
+
"eta": {"type": "STRING", "nullable": true},
|
|
20
|
+
"etd": {"type": "STRING", "nullable": true},
|
|
21
|
+
"imoNumber": {"type": "STRING", "nullable": true},
|
|
22
|
+
"portOfDischarge": {"type": "STRING", "nullable": true},
|
|
23
|
+
"portOfLoading": {"type": "STRING", "nullable": true},
|
|
24
|
+
"vesselName": {"type": "STRING", "nullable": true},
|
|
25
|
+
"voyage": {"type": "STRING", "nullable": true}
|
|
26
|
+
},
|
|
27
|
+
"required": []
|
|
28
|
+
}
|
|
29
|
+
}
|
|
30
|
+
},
|
|
31
|
+
"required": []
|
|
32
|
+
}
|
|
@@ -18,7 +18,7 @@ transportLegs:
|
|
|
18
18
|
vesselName: The name of the vessel for a specific leg.
|
|
19
19
|
voyage: The journey or route taken by the vessel for a specific leg.
|
|
20
20
|
|
|
21
|
-
your task is to extract the text value of the following entities:
|
|
21
|
+
your task is to extract the text value of the following entities and page numbers starting from 0 where the value was found in the document:
|
|
22
22
|
SCHEMA_PLACEHOLDER
|
|
23
23
|
|
|
24
24
|
Further explanation and Keywords for the transportLegs part are as follows. The 2 conditions below are crucial. Pay attention here:
|
data_science_document_ai-1.42.0/src/prompts/library/bookingConfirmation/oocl/placeholders.json
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
{
|
|
2
|
+
"type": "OBJECT",
|
|
3
|
+
"properties": {
|
|
4
|
+
"cfsCutOff": {"type": "STRING", "nullable": true, "description": "the date by which an LCL (Less than Container Load) shipment needs to be checked in to a CFS (Container Freight Station) to meet its scheduled sailing"},
|
|
5
|
+
"bookingNumber": {"type": "STRING", "nullable": true},
|
|
6
|
+
"cyCutOff": {"type": "STRING", "nullable": true},
|
|
7
|
+
"gateInReference": {"type": "STRING", "nullable": true},
|
|
8
|
+
"gateInTerminal": {"type": "STRING", "nullable": true},
|
|
9
|
+
"mblNumber": {"type": "STRING", "nullable": true},
|
|
10
|
+
"pickUpReference": {"type": "STRING", "nullable": true},
|
|
11
|
+
"pickUpTerminal": {"type": "STRING", "nullable": true},
|
|
12
|
+
"siCutOff": {"type": "STRING", "nullable": true},
|
|
13
|
+
"vgmCutOff": {"type": "STRING", "nullable": true},
|
|
14
|
+
"transportLegs": {
|
|
15
|
+
"type": "ARRAY",
|
|
16
|
+
"items": {
|
|
17
|
+
"type": "OBJECT",
|
|
18
|
+
"properties": {
|
|
19
|
+
"eta": {"type": "STRING", "nullable": true},
|
|
20
|
+
"etd": {"type": "STRING", "nullable": true},
|
|
21
|
+
"portOfDischarge": {"type": "STRING", "nullable": true},
|
|
22
|
+
"portOfLoading": {"type": "STRING", "nullable": true},
|
|
23
|
+
"vesselName": {"type": "STRING", "nullable": true},
|
|
24
|
+
"voyage": {"type": "STRING", "nullable": true},
|
|
25
|
+
"imoNumber": {"type": "STRING", "nullable": true}
|
|
26
|
+
},
|
|
27
|
+
"required": []
|
|
28
|
+
}
|
|
29
|
+
}
|
|
30
|
+
},
|
|
31
|
+
"required": []
|
|
32
|
+
}
|