data-science-document-ai 1.13.0__py3-none-any.whl → 1.56.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_science_document_ai-1.13.0.dist-info → data_science_document_ai-1.56.1.dist-info}/METADATA +7 -2
- data_science_document_ai-1.56.1.dist-info/RECORD +60 -0
- {data_science_document_ai-1.13.0.dist-info → data_science_document_ai-1.56.1.dist-info}/WHEEL +1 -1
- src/constants.py +42 -12
- src/constants_sandbox.py +2 -22
- src/docai.py +18 -7
- src/docai_processor_config.yaml +0 -64
- src/excel_processing.py +34 -15
- src/io.py +74 -6
- src/llm.py +12 -34
- src/pdf_processing.py +228 -78
- src/postprocessing/common.py +495 -618
- src/postprocessing/postprocess_partner_invoice.py +383 -27
- src/prompts/library/arrivalNotice/other/placeholders.json +70 -0
- src/prompts/library/arrivalNotice/other/prompt.txt +40 -0
- src/prompts/library/bookingConfirmation/evergreen/placeholders.json +17 -17
- src/prompts/library/bookingConfirmation/evergreen/prompt.txt +1 -0
- src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +18 -18
- src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +1 -1
- src/prompts/library/bookingConfirmation/maersk/placeholders.json +17 -17
- src/prompts/library/bookingConfirmation/maersk/prompt.txt +1 -1
- src/prompts/library/bookingConfirmation/msc/placeholders.json +17 -17
- src/prompts/library/bookingConfirmation/msc/prompt.txt +1 -1
- src/prompts/library/bookingConfirmation/oocl/placeholders.json +17 -17
- src/prompts/library/bookingConfirmation/oocl/prompt.txt +3 -1
- src/prompts/library/bookingConfirmation/other/placeholders.json +17 -17
- src/prompts/library/bookingConfirmation/other/prompt.txt +1 -1
- src/prompts/library/bookingConfirmation/yangming/placeholders.json +17 -17
- src/prompts/library/bookingConfirmation/yangming/prompt.txt +1 -1
- src/prompts/library/bundeskasse/other/placeholders.json +113 -0
- src/prompts/library/bundeskasse/other/prompt.txt +48 -0
- src/prompts/library/commercialInvoice/other/placeholders.json +125 -0
- src/prompts/library/commercialInvoice/other/prompt.txt +2 -1
- src/prompts/library/customsAssessment/other/placeholders.json +67 -16
- src/prompts/library/customsAssessment/other/prompt.txt +24 -37
- src/prompts/library/customsInvoice/other/placeholders.json +205 -0
- src/prompts/library/customsInvoice/other/prompt.txt +105 -0
- src/prompts/library/deliveryOrder/other/placeholders.json +79 -28
- src/prompts/library/deliveryOrder/other/prompt.txt +26 -40
- src/prompts/library/draftMbl/other/placeholders.json +33 -33
- src/prompts/library/draftMbl/other/prompt.txt +34 -44
- src/prompts/library/finalMbL/other/placeholders.json +34 -34
- src/prompts/library/finalMbL/other/prompt.txt +34 -44
- src/prompts/library/packingList/other/placeholders.json +98 -0
- src/prompts/library/packingList/other/prompt.txt +1 -1
- src/prompts/library/partnerInvoice/other/placeholders.json +165 -45
- src/prompts/library/partnerInvoice/other/prompt.txt +82 -44
- src/prompts/library/preprocessing/carrier/placeholders.json +0 -16
- src/prompts/library/shippingInstruction/other/placeholders.json +115 -0
- src/prompts/library/shippingInstruction/other/prompt.txt +28 -15
- src/setup.py +73 -63
- src/utils.py +207 -30
- data_science_document_ai-1.13.0.dist-info/RECORD +0 -55
- src/prompts/library/draftMbl/hapag-lloyd/prompt.txt +0 -44
- src/prompts/library/draftMbl/maersk/prompt.txt +0 -17
- src/prompts/library/finalMbL/hapag-lloyd/prompt.txt +0 -44
- src/prompts/library/finalMbL/maersk/prompt.txt +0 -17
src/llm.py
CHANGED
```diff
@@ -15,6 +15,7 @@ from vertexai.generative_models import (
     Part,
 )
 
+from src.io import get_gcp_labels
 from src.utils import cache_on_disk
 
 
@@ -28,12 +29,12 @@ class LlmClient:
         # Initialize the model parameters
         self.model_params = {
             "temperature": parameters.get("temperature", 0),
-            "max_output_tokens": parameters.get("maxOutputTokens",
+            "max_output_tokens": parameters.get("maxOutputTokens", 65536),
             "top_p": parameters.get("top_p", 0.8),
             "top_k": parameters.get("top_k", 40),
             "seed": parameters.get("seed", 42),
         }
-        self.model_id = parameters.get("model_id", "gemini-
+        self.model_id = parameters.get("model_id", "gemini-2.5-flash")
         # Initialize the safety configuration
         self.safety_config = {
             HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
@@ -69,6 +70,7 @@ class LlmClient:
         document: str = None,
         response_schema: dict = None,
         response_mime_type: str = "application/json",
+        doc_type: str = None,
     ):
         """Ask the Gemini model a question.
 
@@ -76,6 +78,7 @@ class LlmClient:
             prompt (str): The prompt to send to the model.
             document (str, optional): An optional document to provide context.
             response_schema (dict, optional): Defines a specific response schema for the model.
+            doc_type (str, optional): Document type for cost tracking labels.
 
         Returns:
             str: The response from the model.
@@ -96,12 +99,13 @@ class LlmClient:
         # Prepare inputs for the model
         inputs = [document, prompt] if document else prompt
 
-        # Generate the response
+        # Generate the response with labels for cost tracking
         model_response = await cache_on_disk(
             self.geminy_client.generate_content_async,
             contents=inputs,
             generation_config=config,
             safety_settings=self.safety_config,
+            labels=get_gcp_labels(doc_type=doc_type),
         )
 
         response_text = model_response.text
```
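The new `labels=get_gcp_labels(doc_type=doc_type)` argument attaches user labels to every Gemini request so spend can be attributed per document type in GCP billing exports. The helper lives in `src/io.py` and its body is not part of this diff; a minimal hypothetical sketch, assuming it only builds a label dict that satisfies GCP's label rules (lowercase letters, digits, `-` and `_`, at most 63 characters):

```python
import re


def get_gcp_labels(doc_type: str = None) -> dict:
    """Hypothetical sketch of the src/io.py helper (actual body not in this diff)."""

    def _normalize(value: str) -> str:
        # GCP label values: lowercase letters, digits, '-' and '_', max 63 chars
        return re.sub(r"[^a-z0-9_-]", "-", value.lower())[:63]

    labels = {"service": "data-science-document-ai"}  # assumed base label
    if doc_type:
        labels["doc_type"] = _normalize(doc_type)
    return labels
```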
```diff
@@ -113,7 +117,7 @@ class LlmClient:
             return "{}"
 
     async def get_unified_json_genai(
-        self, prompt, document=None, response_schema=None, model="gemini"
+        self, prompt, document=None, response_schema=None, model="gemini", doc_type=None
     ):
         """Send a prompt to a Google Cloud AI Platform model and returns the generated json.
 
@@ -122,6 +126,7 @@ class LlmClient:
             document: Content of the PDF document
             response_schema: The schema to use for the response
             model (str): The model to use for the response ["gemini" or "chatGPT"]. Default is "gemini".
+            doc_type (str, optional): Document type for cost tracking labels.
 
         Returns:
             dict: The generated json from the model.
@@ -131,7 +136,9 @@ class LlmClient:
             response = await self.ask_chatgpt(prompt, document, response_schema)
         else:
             # Default to Gemini
-            response = await self.ask_gemini(
+            response = await self.ask_gemini(
+                prompt, document, response_schema, doc_type=doc_type
+            )
 
         try:
             return json.loads(response)
```
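With these changes, `doc_type` threads from the public JSON entry point down to the Gemini call. A usage sketch (the `document` argument is assumed to be whatever `prepare_document_for_gemini` returns, e.g. a `Part` built from PDF bytes; the field name is illustrative):

```python
async def extract_invoice_total(llm_client, document):
    """Illustrative caller of get_unified_json_genai with cost-tracking labels."""
    response_schema = {
        "type": "OBJECT",
        "properties": {"totalAmount": {"type": "STRING"}},  # hypothetical field
    }
    return await llm_client.get_unified_json_genai(
        prompt="Extract the invoice total amount.",
        document=document,
        response_schema=response_schema,
        doc_type="partnerInvoice",  # ends up in the request's billing labels
    )
```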
```diff
@@ -194,33 +201,4 @@ class LlmClient:
         return response
 
 
-def prompt_excel_extraction(excel_structured_text):
-    """Write a prompt to extract data from Excel files.
-
-    Args:
-        excel_structured_text (str): The structured text of the Excel file.
-
-    Returns:
-        prompt str: The prompt for common json.
-    """
-    prompt = f"""{excel_structured_text}
-
-    Task: Fill in the following dictionary from the information in the given in the above excel data.
-
-    Instructions:
-    - Do not change the keys of the following dictionary.
-    - The values should be filled in as per the schema provided below.
-    - If an entity contains a 'display_name', consider its properties as child data points in the below format.
-    {{'data-field': {{
-        'child-data-field': 'type -occurrence_type- description',
-        }}
-    }}
-    - The entity with 'display_name' can be extracted multiple times. Please pay attention to the occurrence_type.
-    - Ensure the schema reflects the hierarchical relationship.
-    - Use the data field description to understand the context of the data.
-
-    """
-    return prompt
-
-
 # pylint: enable=all
```
src/pdf_processing.py
CHANGED
```diff
@@ -2,19 +2,24 @@
 # flake8: noqa: E402
 
 import logging
+import os
 
 logger = logging.getLogger(__name__)
 
 import asyncio
-import random
 from collections import defaultdict
 
+from ddtrace import tracer
 from fastapi import HTTPException
 from google.cloud.documentai_v1 import Document as docaiv1_document
 
 from src.docai import _batch_process_pdf_w_docai, _process_pdf_w_docai
 from src.excel_processing import extract_data_from_excel
-from src.postprocessing.common import
+from src.postprocessing.common import (
+    format_all_entities,
+    llm_prediction_to_tuples,
+    remove_none_values,
+)
 from src.postprocessing.postprocess_booking_confirmation import (
     postprocess_booking_confirmation,
 )
@@ -26,14 +31,19 @@ from src.postprocessing.postprocess_partner_invoice import (
 )
 from src.prompts.prompt_library import prompt_library
 from src.utils import (
-
+    extract_top_pages,
+    get_pdf_page_count,
     get_processor_name,
     run_background_tasks,
+    split_pdf_into_chunks,
+    transform_schema_strings,
     validate_based_on_schema,
 )
 
 
-async def process_file_w_docai(
+async def process_file_w_docai(
+    params, image_content, client, processor_name, doc_type=None
+):
     """
     Process a file using Document AI.
 
@@ -42,6 +52,7 @@ async def process_file_w_docai(params, image_content, client, processor_name):
         image_content (bytes): The file to be processed. It can be bytes object.
         client: The Document AI client.
         processor_name (str): The name of the processor to be used.
+        doc_type (str, optional): Document type for cost tracking labels.
 
     Returns:
         The processed document.
@@ -53,7 +64,9 @@ async def process_file_w_docai(params, image_content, client, processor_name):
 
     try:
         logger.info("Processing document...")
-        result = await _process_pdf_w_docai(
+        result = await _process_pdf_w_docai(
+            image_content, client, processor_name, doc_type=doc_type
+        )
     except Exception as e:
         if e.reason == "PAGE_LIMIT_EXCEEDED":
             logger.warning(
@@ -62,7 +75,7 @@ async def process_file_w_docai(params, image_content, client, processor_name):
         # Process the document in batch method (offline processing)
         try:
             result = await _batch_process_pdf_w_docai(
-                params, image_content, client, processor_name
+                params, image_content, client, processor_name, doc_type=doc_type
             )
         except Exception as batch_e:
             logger.error(f"Error processing document {batch_e}.")
@@ -92,7 +105,7 @@ async def extract_data_from_pdf_w_docai(
     )
 
     result = await process_file_w_docai(
-        params, file_content, processor_client, processor_name
+        params, file_content, processor_client, processor_name, doc_type=input_doc_type
    )
 
     # Create an entity object to store the result in gcs
@@ -103,9 +116,22 @@ async def extract_data_from_pdf_w_docai(
     # Extract entities from the result
     for entity in result.entities:
         value = (
-            {
+            {
+                child.type_: (
+                    child.mention_text,
+                    child.page_anchor.page_refs[0].page
+                    if hasattr(child.page_anchor.page_refs[0], "page")
+                    else 0,
+                )
+                for child in entity.properties
+            }
             if entity.properties
-            else
+            else (
+                entity.mention_text,
+                entity.page_anchor.page_refs[0].page
+                if hasattr(entity.page_anchor.page_refs[0], "page")
+                else 0,
+            )
         )
         aggregated_data[entity.type_].append(value)
```
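Each extracted value is now a `(mention_text, page_number)` tuple instead of a bare string, with page 0 as the fallback when no page anchor is present, so downstream formatting can report where a field was found. Illustratively (entity names hypothetical), `aggregated_data` ends up shaped like:

```python
# Hypothetical shape of aggregated_data after the loop above
aggregated_data = {
    # simple entity: list of (mention_text, page_number) tuples
    "invoiceNumber": [("INV-12345", 0)],
    # entity with child properties: list of {child_type: (text, page)} dicts
    "lineItem": [
        {"description": ("Ocean freight", 1), "amount": ("1,200.00", 1)},
        {"description": ("Terminal handling", 2), "amount": ("310.00", 2)},
    ],
}
```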
```diff
@@ -121,7 +147,7 @@ async def extract_data_from_pdf_w_docai(
     ):
         aggregated_data = postprocess_booking_confirmation(aggregated_data)
         logger.info("Transport Legs assembled successfully")
-    elif input_doc_type
+    elif input_doc_type in ["partnerInvoice", "customsInvoice"]:
         aggregated_data = postprocessing_partner_invoice(aggregated_data)
         logger.info("Partner Invoice naming changed successfully")
 
@@ -136,7 +162,9 @@ async def extract_data_from_pdf_w_docai(
     return aggregated_data, result_for_store, processor_version
 
 
-async def identify_carrier(
+async def identify_carrier(
+    document, llm_client, prompt, response_schema, doc_type=None
+):
     """Identify the carrier from the Booking Confirmation document."""
 
     result = await llm_client.ask_gemini(
```
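`identify_carrier` relies on Gemini's controlled generation: with `response_mime_type="text/x.enum"` the model must answer with exactly one value from an enum schema. The actual schemas live in `src/prompts/library/preprocessing/carrier/placeholders.json` (not shown in this diff); a plausible sketch, with the enum values inferred from the carrier folders in the prompt library:

```python
# Plausible carrier enum schema (actual contents live in
# src/prompts/library/preprocessing/carrier/placeholders.json)
carrier_schema = {
    "type": "STRING",
    "enum": ["evergreen", "hapag-lloyd", "maersk", "msc", "oocl", "yangming", "other"],
}
```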
```diff
@@ -144,92 +172,183 @@ async def identify_carrier(document, llm_client, prompt, response_schema):
         document=document,
         response_schema=response_schema,
         response_mime_type="text/x.enum",
+        doc_type=doc_type,
     )
 
     if result:
-        result = result.lower()
+        result = result.strip().lower()
     else:
         result = "other"
     return result
 
 
-async def process_file_w_llm(
-    params, file_content, input_doc_type, schema_client, llm_client
-):
+async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
     """Process a document using a language model (gemini) to extract structured data.
 
     Args:
         params (dict): The project parameters.
         file_content (str): The content of the file to be processed.
         input_doc_type (str): The type of document, used to select the appropriate prompt from the prompt library.
-        schema_client (object): The schema client object.
         llm_client: The LLM client object.
 
     Returns:
         result (dict): The structured data extracted from the document, formatted as JSON.
     """
-    #
-
+    # Bundeskasse invoices contains all the required information in the first 3 pages.
+    if input_doc_type == "bundeskasse":
+        file_content = extract_top_pages(file_content, num_pages=5)
+
+    number_of_pages = get_pdf_page_count(file_content)
+    logger.info(f"processing {input_doc_type} with {number_of_pages} pages...")
 
-    # get the schema placeholder
-    response_schema =
+    # get the schema placeholder
+    response_schema = prompt_library.library[input_doc_type]["other"]["placeholders"]
 
     carrier = "other"
-
-
-
-
-
-    )
-    carrier_schema = prompt_library.library["preprocessing"]["carrier"][
-        "placeholders"
-    ][input_doc_type]
+    carrier_schema = (
+        prompt_library.library.get("preprocessing", {})
+        .get("carrier", {})
+        .get("placeholders", {})
+        .get(input_doc_type)
+    )
 
+    if carrier_schema:
         carrier_prompt = prompt_library.library["preprocessing"]["carrier"]["prompt"]
         carrier_prompt = carrier_prompt.replace(
             "DOCUMENT_TYPE_PLACEHOLDER", input_doc_type
         )
 
+        # convert file_content to required document
+        document = llm_client.prepare_document_for_gemini(file_content)
+
         # identify carrier for customized prompting
         carrier = await identify_carrier(
-            document,
+            document,
+            llm_client,
+            carrier_prompt,
+            carrier_schema,
+            doc_type=input_doc_type,
         )
 
-    #
-    if input_doc_type == "bookingConfirmation":
-        response_schema = prompt_library.library[input_doc_type][carrier.lower()][
-            "placeholders"
-        ]
-    # There is one more additional field in partnerInvoice
-    # the reverseChargeSentence is added on later so its not available in Doc Ai schema.
-    elif input_doc_type == "partnerInvoice":
-        response_schema = prompt_library.library[input_doc_type][carrier.lower()][
-            "placeholders"
-        ]
-
+    # Select prompt
     if (
-        input_doc_type in prompt_library.library
-
+        input_doc_type not in prompt_library.library
+        or carrier not in prompt_library.library[input_doc_type]
     ):
-
-
+        return {}
+
+    # get the related prompt from predefined prompt library
+    prompt = prompt_library.library[input_doc_type][carrier]["prompt"]
+
+    # Add page-number extraction for moderately large docs
+    use_chunking = number_of_pages >= params["chunk_after"]
+
+    # Update schema and prompt to extract value-page_number pairs
+    if not use_chunking and number_of_pages > 1:
+        response_schema = transform_schema_strings(response_schema)
+        prompt += "\nFor each field, provide the page number where the information was found. The page numbering starts from 0."
 
-
-
-
+    tasks = []
+    # Process in chunks if number of pages exceeds threshold and Process all chunks concurrently
+    for chunk in (
+        split_pdf_into_chunks(file_content, chunk_size=params["chunk_size"])
+        if use_chunking
+        else [file_content]
+    ):
+        tasks.append(
+            process_chunk_with_retry(
+                chunk,
+                prompt,
+                response_schema,
+                llm_client,
+                input_doc_type,
+            )
         )
-
-
+
+    results = await asyncio.gather(*tasks, return_exceptions=True)
+
+    if use_chunking:
+        return merge_llm_results(results, response_schema)
+    else:
+        return llm_prediction_to_tuples(results[0], number_of_pages=number_of_pages)
+
+
+async def process_chunk_with_retry(
+    chunk_content, prompt, response_schema, llm_client, input_doc_type, retries=2
+):
+    """Process a chunk with retries in case of failure."""
+    for attempt in range(1, retries + 1):
+        try:
+            return await process_chunk(
+                chunk_content=chunk_content,
+                prompt=prompt,
+                response_schema=response_schema,
+                llm_client=llm_client,
+                input_doc_type=input_doc_type,
+            )
+        except Exception as e:
+            logger.error(f"Chunk failed on attempt {attempt}: {e}")
+            if attempt == retries:
+                raise
+            await asyncio.sleep(1)  # small backoff
 
 
-async def
-
+async def process_chunk(
+    chunk_content, prompt, response_schema, llm_client, input_doc_type
 ):
+    """Process a chunk with Gemini."""
+    document = llm_client.prepare_document_for_gemini(chunk_content)
+    return await llm_client.get_unified_json_genai(
+        prompt=prompt,
+        document=document,
+        response_schema=response_schema,
+        doc_type=input_doc_type,
+    )
+
+
+def merge_llm_results(results, response_schema):
+    """Merge LLM results from multiple chunks."""
+    merged = {}
+    for i, result in enumerate(results):
+        if not isinstance(result, dict):
+            continue
+        # Add page number to all values coming from this chunk
+        result = llm_prediction_to_tuples(result, number_of_pages=1, page_number=i)
+
+        # Merge the result into the final merged dictionary
+        for key, value in result.items():
+            field_type = (
+                response_schema["properties"].get(key, {}).get("type", "").upper()
+            )
+
+            if key not in merged:
+                if field_type == "ARRAY":
+                    # append the values as a list
+                    merged[key] = (
+                        value if isinstance(value, list) else ([value] if value else [])
+                    )
+                else:
+                    merged[key] = value
+                continue
+
+            if field_type == "ARRAY":
+                # append list contents across chunks
+                if isinstance(value, list):
+                    merged[key].extend(value)
+                else:
+                    merged[key].append(value)
+
+            # take first non-null value only
+            if merged[key] in (None, "", [], {}):
+                merged[key] = value
+
+    return merged
+
+
 async def extract_data_from_pdf_w_llm(params, input_doc_type, file_content, llm_client):
     """Extract data from the PDF file."""
     # Process the document using LLM
-    result = await process_file_w_llm(
-        params, file_content, input_doc_type, schema_client, llm_client
-    )
+    result = await process_file_w_llm(params, file_content, input_doc_type, llm_client)
 
     # Add currency from the amount field
     if input_doc_type in ["commercialInvoice"]:
```
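Large documents are now fanned out: `split_pdf_into_chunks` (imported from `src.utils`, whose implementation is not part of this diff) slices the PDF into fixed-size page ranges, and each chunk is processed concurrently through `asyncio.gather`. Because `return_exceptions=True` is used, a chunk that exhausts its retries shows up as an exception object in `results`, which `merge_llm_results` skips via its `isinstance(result, dict)` check. A minimal sketch of such a splitter, assuming `pypdf` and a bytes-in/bytes-out contract:

```python
from io import BytesIO

from pypdf import PdfReader, PdfWriter


def split_pdf_into_chunks(file_content: bytes, chunk_size: int) -> list:
    """Sketch: split a PDF into consecutive chunks of chunk_size pages each."""
    reader = PdfReader(BytesIO(file_content))
    chunks = []
    for start in range(0, len(reader.pages), chunk_size):
        writer = PdfWriter()
        for page in reader.pages[start:start + chunk_size]:
            writer.add_page(page)
        buffer = BytesIO()
        writer.write(buffer)
        chunks.append(buffer.getvalue())
    return chunks
```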
```diff
@@ -277,8 +396,8 @@ def combine_llm_results_w_doc_ai(
     for key in keys_to_combine:
         if key in llm.keys():
             # Merge the list of dictionaries
+            # If the length of the LLM list is less than the Doc AI result, replace with the LLM list
             if len(llm[key]) < len(result[key]):
-                # If the length of the LLM list is less than the Doc AI result, replace with the LLM list
                 result[key] = llm[key]
             else:
                 # If the length of the LLM list is greater than or equal to the Doc AI result,
@@ -298,14 +417,11 @@ async def extract_data_by_doctype(
     file_content,
     input_doc_type,
     processor_client,
-    schema_client,
     if_use_docai,
     if_use_llm,
+    llm_client,
     isBetaTest=False,
 ):
-    # Select LLM client
-    llm_client = params["LlmClient"]
-
     async def extract_w_docai():
         return await extract_data_from_pdf_w_docai(
             params=params,
@@ -320,7 +436,6 @@ async def extract_data_by_doctype(
             params=params,
             input_doc_type=input_doc_type,
             file_content=file_content,
-            schema_client=schema_client,
             llm_client=llm_client,
         )
 
@@ -355,7 +470,7 @@ async def data_extraction_manual_flow(
     meta,
     processor_client,
     schema_client,
-
+    use_default_logging=False,
 ):
     """
     Process a PDF file and extract data from it.
@@ -367,7 +482,6 @@ async def data_extraction_manual_flow(
         meta (DocumentMeta): Metadata associated with the document.
         processor_client (DocumentProcessorClient): Client for the Document AI processor.
         schema_client (DocumentSchemaClient): Client for the Document AI schema.
-        embed_manager (EmbeddingsManager): Manager for embeddings.
 
     Returns:
         dict: A dictionary containing the processed document information.
@@ -375,9 +489,23 @@ async def data_extraction_manual_flow(
     Raises:
         Refer to reasons in 400 error response examples.
     """
+    # Get the start time for processing
+    start_time = asyncio.get_event_loop().time()
+
+    # Select LLM client (Using 2.5 Pro model only for PI and customsInvoice)
+    llm_client = (
+        params["LlmClient_Flash"]
+        if meta.documentTypeCode not in ["customsInvoice", "partnerInvoice"]
+        else params["LlmClient"]
+    )
+
+    page_count = None
     # Validate the file type
     if mime_type == "application/pdf":
-
+        # Enable Doc Ai only for certain document types.
+        if_use_docai = (
+            True if meta.documentTypeCode in params["model_config"]["stable"] else False
+        )
         if_use_llm = (
             True if meta.documentTypeCode in params["key_to_combine"].keys() else False
         )
@@ -391,11 +519,12 @@ async def data_extraction_manual_flow(
             file_content,
             meta.documentTypeCode,
             processor_client,
-            schema_client,
             if_use_docai=if_use_docai,
             if_use_llm=if_use_llm,
+            llm_client=llm_client,
             isBetaTest=False,
         )
+        page_count = get_pdf_page_count(file_content)
 
     elif "excel" in mime_type or "spreadsheet" in mime_type:
         # Extract data from the Excel file
@@ -403,10 +532,20 @@ async def data_extraction_manual_flow(
             params=params,
             input_doc_type=meta.documentTypeCode,
             file_content=file_content,
-            schema_client=schema_client,
             mime_type=mime_type,
+            llm_client=llm_client,
         )
 
+        # Get sheet count from dd-trace span (set in extract_data_from_excel)
+        # Note: we use the span metric instead of len(extracted_data) because
+        # some sheets may fail extraction and not appear in extracted_data
+        span = tracer.current_span()
+        page_count = span.get_metric("est_page_count") if span else len(extracted_data)
+        if page_count > 100:
+            logger.warning(
+                f"Check logic. Count of sheets in excel file is weirdly large: {page_count}"
+            )
+
     else:
         raise HTTPException(
             status_code=400,
```
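The Excel branch reads its page (sheet) count back from the active Datadog span rather than from `extracted_data`, since sheets that fail extraction never make it into the result. For that to work, `extract_data_from_excel` must record the metric on the same trace; a sketch of the producer side (the metric name comes from this diff, the surrounding function is assumed):

```python
from ddtrace import tracer


def record_sheet_count(sheet_count: int) -> None:
    """Sketch: store a metric on the current span so a caller further up
    the trace can read it back with span.get_metric("est_page_count")."""
    span = tracer.current_span()
    if span is not None:
        span.set_metric("est_page_count", sheet_count)
```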
```diff
@@ -414,7 +553,7 @@ async def data_extraction_manual_flow(
     )
     # Create the result dictionary with the extracted data
     extracted_data = await format_all_entities(
-        extracted_data,
+        extracted_data, meta.documentTypeCode, params, mime_type
     )
     result = {
         "id": meta.id,
@@ -422,16 +561,27 @@ async def data_extraction_manual_flow(
         "data": extracted_data,
         "processor_version": processor_version,
     }
+
+    # Log the time taken for processing
+    end_time = asyncio.get_event_loop().time()
+    elapsed_time = end_time - start_time
+    logger.info(f"Time taken to process the document: {round(elapsed_time, 4)} seconds")
+
     # Schedule background tasks without using FastAPI's BackgroundTasks
-
-
-
-
-
-
-
-
-
+    if (
+        os.getenv("CLUSTER") != "ode"
+    ) & use_default_logging:  # skip data export to bigquery in ODE environment
+        asyncio.create_task(
+            run_background_tasks(
+                params,
+                meta.id,
+                meta.documentTypeCode,
+                extracted_data,
+                store_data,
+                processor_version,
+                mime_type,
+                elapsed_time,
+                page_count,
+            )
         )
-        )
     return result
```