data-science-document-ai 1.40.3__py3-none-any.whl → 1.51.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_science_document_ai-1.40.3.dist-info → data_science_document_ai-1.51.0.dist-info}/METADATA +2 -2
- data_science_document_ai-1.51.0.dist-info/RECORD +60 -0
- src/constants.py +6 -10
- src/docai.py +14 -5
- src/docai_processor_config.yaml +0 -56
- src/excel_processing.py +34 -13
- src/io.py +69 -1
- src/llm.py +10 -32
- src/pdf_processing.py +192 -54
- src/postprocessing/common.py +246 -44
- src/postprocessing/postprocess_partner_invoice.py +139 -85
- src/prompts/library/arrivalNotice/other/placeholders.json +70 -0
- src/prompts/library/arrivalNotice/other/prompt.txt +40 -0
- src/prompts/library/bookingConfirmation/evergreen/placeholders.json +17 -17
- src/prompts/library/bookingConfirmation/evergreen/prompt.txt +1 -0
- src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +18 -18
- src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +1 -1
- src/prompts/library/bookingConfirmation/maersk/placeholders.json +17 -17
- src/prompts/library/bookingConfirmation/maersk/prompt.txt +1 -1
- src/prompts/library/bookingConfirmation/msc/placeholders.json +17 -17
- src/prompts/library/bookingConfirmation/msc/prompt.txt +1 -1
- src/prompts/library/bookingConfirmation/oocl/placeholders.json +17 -17
- src/prompts/library/bookingConfirmation/oocl/prompt.txt +3 -1
- src/prompts/library/bookingConfirmation/other/placeholders.json +17 -17
- src/prompts/library/bookingConfirmation/other/prompt.txt +1 -1
- src/prompts/library/bookingConfirmation/yangming/placeholders.json +17 -17
- src/prompts/library/bookingConfirmation/yangming/prompt.txt +1 -1
- src/prompts/library/bundeskasse/other/placeholders.json +25 -25
- src/prompts/library/bundeskasse/other/prompt.txt +8 -6
- src/prompts/library/commercialInvoice/other/placeholders.json +125 -0
- src/prompts/library/commercialInvoice/other/prompt.txt +2 -1
- src/prompts/library/customsAssessment/other/placeholders.json +67 -16
- src/prompts/library/customsAssessment/other/prompt.txt +24 -37
- src/prompts/library/customsInvoice/other/placeholders.json +20 -20
- src/prompts/library/customsInvoice/other/prompt.txt +4 -4
- src/prompts/library/deliveryOrder/other/placeholders.json +79 -28
- src/prompts/library/deliveryOrder/other/prompt.txt +26 -40
- src/prompts/library/draftMbl/other/placeholders.json +33 -33
- src/prompts/library/draftMbl/other/prompt.txt +34 -44
- src/prompts/library/finalMbL/other/placeholders.json +34 -34
- src/prompts/library/finalMbL/other/prompt.txt +34 -44
- src/prompts/library/packingList/other/placeholders.json +98 -0
- src/prompts/library/packingList/other/prompt.txt +1 -1
- src/prompts/library/partnerInvoice/other/placeholders.json +2 -23
- src/prompts/library/partnerInvoice/other/prompt.txt +7 -18
- src/prompts/library/preprocessing/carrier/placeholders.json +0 -16
- src/prompts/library/shippingInstruction/other/placeholders.json +115 -0
- src/prompts/library/shippingInstruction/other/prompt.txt +28 -15
- src/setup.py +13 -16
- src/utils.py +157 -45
- data_science_document_ai-1.40.3.dist-info/RECORD +0 -59
- src/prompts/library/draftMbl/hapag-lloyd/prompt.txt +0 -44
- src/prompts/library/draftMbl/maersk/prompt.txt +0 -17
- src/prompts/library/finalMbL/hapag-lloyd/prompt.txt +0 -44
- src/prompts/library/finalMbL/maersk/prompt.txt +0 -17
- {data_science_document_ai-1.40.3.dist-info → data_science_document_ai-1.51.0.dist-info}/WHEEL +0 -0
src/pdf_processing.py
CHANGED
|
@@ -9,12 +9,17 @@ logger = logging.getLogger(__name__)
|
|
|
9
9
|
import asyncio
|
|
10
10
|
from collections import defaultdict
|
|
11
11
|
|
|
12
|
+
from ddtrace import tracer
|
|
12
13
|
from fastapi import HTTPException
|
|
13
14
|
from google.cloud.documentai_v1 import Document as docaiv1_document
|
|
14
15
|
|
|
15
16
|
from src.docai import _batch_process_pdf_w_docai, _process_pdf_w_docai
|
|
16
17
|
from src.excel_processing import extract_data_from_excel
|
|
17
|
-
from src.postprocessing.common import
|
|
18
|
+
from src.postprocessing.common import (
|
|
19
|
+
format_all_entities,
|
|
20
|
+
llm_prediction_to_tuples,
|
|
21
|
+
remove_none_values,
|
|
22
|
+
)
|
|
18
23
|
from src.postprocessing.postprocess_booking_confirmation import (
|
|
19
24
|
postprocess_booking_confirmation,
|
|
20
25
|
)
|
|
@@ -28,13 +33,18 @@ from src.prompts.prompt_library import prompt_library
|
|
|
28
33
|
from src.utils import (
|
|
29
34
|
extract_top_pages,
|
|
30
35
|
generate_schema_structure,
|
|
36
|
+
get_pdf_page_count,
|
|
31
37
|
get_processor_name,
|
|
32
38
|
run_background_tasks,
|
|
39
|
+
split_pdf_into_chunks,
|
|
40
|
+
transform_schema_strings,
|
|
33
41
|
validate_based_on_schema,
|
|
34
42
|
)
|
|
35
43
|
|
|
36
44
|
|
|
37
|
-
async def process_file_w_docai(
|
|
45
|
+
async def process_file_w_docai(
|
|
46
|
+
params, image_content, client, processor_name, doc_type=None
|
|
47
|
+
):
|
|
38
48
|
"""
|
|
39
49
|
Process a file using Document AI.
|
|
40
50
|
|
|
@@ -43,6 +53,7 @@ async def process_file_w_docai(params, image_content, client, processor_name):
|
|
|
43
53
|
image_content (bytes): The file to be processed. It can be bytes object.
|
|
44
54
|
client: The Document AI client.
|
|
45
55
|
processor_name (str): The name of the processor to be used.
|
|
56
|
+
doc_type (str, optional): Document type for cost tracking labels.
|
|
46
57
|
|
|
47
58
|
Returns:
|
|
48
59
|
The processed document.
|
|
@@ -54,7 +65,9 @@ async def process_file_w_docai(params, image_content, client, processor_name):
|
|
|
54
65
|
|
|
55
66
|
try:
|
|
56
67
|
logger.info("Processing document...")
|
|
57
|
-
result = await _process_pdf_w_docai(
|
|
68
|
+
result = await _process_pdf_w_docai(
|
|
69
|
+
image_content, client, processor_name, doc_type=doc_type
|
|
70
|
+
)
|
|
58
71
|
except Exception as e:
|
|
59
72
|
if e.reason == "PAGE_LIMIT_EXCEEDED":
|
|
60
73
|
logger.warning(
|
|
@@ -63,7 +76,7 @@ async def process_file_w_docai(params, image_content, client, processor_name):
|
|
|
63
76
|
# Process the document in batch method (offline processing)
|
|
64
77
|
try:
|
|
65
78
|
result = await _batch_process_pdf_w_docai(
|
|
66
|
-
params, image_content, client, processor_name
|
|
79
|
+
params, image_content, client, processor_name, doc_type=doc_type
|
|
67
80
|
)
|
|
68
81
|
except Exception as batch_e:
|
|
69
82
|
logger.error(f"Error processing document {batch_e}.")
|
|
@@ -93,7 +106,7 @@ async def extract_data_from_pdf_w_docai(
|
|
|
93
106
|
)
|
|
94
107
|
|
|
95
108
|
result = await process_file_w_docai(
|
|
96
|
-
params, file_content, processor_client, processor_name
|
|
109
|
+
params, file_content, processor_client, processor_name, doc_type=input_doc_type
|
|
97
110
|
)
|
|
98
111
|
|
|
99
112
|
# Create an entity object to store the result in gcs
|
|
@@ -104,9 +117,22 @@ async def extract_data_from_pdf_w_docai(
|
|
|
104
117
|
# Extract entities from the result
|
|
105
118
|
for entity in result.entities:
|
|
106
119
|
value = (
|
|
107
|
-
{
|
|
120
|
+
{
|
|
121
|
+
child.type_: (
|
|
122
|
+
child.mention_text,
|
|
123
|
+
child.page_anchor.page_refs[0].page
|
|
124
|
+
if hasattr(child.page_anchor.page_refs[0], "page")
|
|
125
|
+
else 0,
|
|
126
|
+
)
|
|
127
|
+
for child in entity.properties
|
|
128
|
+
}
|
|
108
129
|
if entity.properties
|
|
109
|
-
else
|
|
130
|
+
else (
|
|
131
|
+
entity.mention_text,
|
|
132
|
+
entity.page_anchor.page_refs[0].page
|
|
133
|
+
if hasattr(entity.page_anchor.page_refs[0], "page")
|
|
134
|
+
else 0,
|
|
135
|
+
)
|
|
110
136
|
)
|
|
111
137
|
aggregated_data[entity.type_].append(value)
|
|
112
138
|
|
|
@@ -137,7 +163,9 @@ async def extract_data_from_pdf_w_docai(
|
|
|
137
163
|
return aggregated_data, result_for_store, processor_version
|
|
138
164
|
|
|
139
165
|
|
|
140
|
-
async def identify_carrier(
|
|
166
|
+
async def identify_carrier(
|
|
167
|
+
document, llm_client, prompt, response_schema, doc_type=None
|
|
168
|
+
):
|
|
141
169
|
"""Identify the carrier from the Booking Confirmation document."""
|
|
142
170
|
|
|
143
171
|
result = await llm_client.ask_gemini(
|
|
@@ -145,6 +173,7 @@ async def identify_carrier(document, llm_client, prompt, response_schema):
|
|
|
145
173
|
document=document,
|
|
146
174
|
response_schema=response_schema,
|
|
147
175
|
response_mime_type="text/x.enum",
|
|
176
|
+
doc_type=doc_type,
|
|
148
177
|
)
|
|
149
178
|
|
|
150
179
|
if result:
|
|
@@ -167,61 +196,150 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
|
|
|
167
196
|
result (dict): The structured data extracted from the document, formatted as JSON.
|
|
168
197
|
"""
|
|
169
198
|
# Bundeskasse invoices contain all the required information in the first few pages, so only the top 5 pages are kept.
|
|
170
|
-
|
|
171
|
-
extract_top_pages(file_content, num_pages=5)
|
|
172
|
-
if input_doc_type == "bundeskasse"
|
|
173
|
-
else file_content
|
|
174
|
-
)
|
|
199
|
+
if input_doc_type == "bundeskasse":
|
|
200
|
+
file_content = extract_top_pages(file_content, num_pages=5)
|
|
175
201
|
|
|
176
|
-
|
|
177
|
-
|
|
202
|
+
number_of_pages = get_pdf_page_count(file_content)
|
|
203
|
+
logger.info(f"processing {input_doc_type} with {number_of_pages} pages...")
|
|
178
204
|
|
|
179
|
-
# get the schema placeholder
|
|
180
|
-
response_schema =
|
|
181
|
-
prompt_library.library[input_doc_type]["other"]["placeholders"]
|
|
182
|
-
if input_doc_type in ["partnerInvoice", "customsInvoice", "bundeskasse"]
|
|
183
|
-
else generate_schema_structure(params, input_doc_type)
|
|
184
|
-
)
|
|
205
|
+
# get the schema placeholder
|
|
206
|
+
response_schema = prompt_library.library[input_doc_type]["other"]["placeholders"]
|
|
185
207
|
|
|
186
208
|
carrier = "other"
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
)
|
|
193
|
-
carrier_schema = prompt_library.library["preprocessing"]["carrier"][
|
|
194
|
-
"placeholders"
|
|
195
|
-
][input_doc_type]
|
|
209
|
+
carrier_schema = (
|
|
210
|
+
prompt_library.library.get("preprocessing", {})
|
|
211
|
+
.get("carrier", {})
|
|
212
|
+
.get("placeholders", {})
|
|
213
|
+
.get(input_doc_type)
|
|
214
|
+
)
|
|
196
215
|
|
|
216
|
+
if carrier_schema:
|
|
197
217
|
carrier_prompt = prompt_library.library["preprocessing"]["carrier"]["prompt"]
|
|
198
218
|
carrier_prompt = carrier_prompt.replace(
|
|
199
219
|
"DOCUMENT_TYPE_PLACEHOLDER", input_doc_type
|
|
200
220
|
)
|
|
201
221
|
|
|
222
|
+
# convert file_content to required document
|
|
223
|
+
document = llm_client.prepare_document_for_gemini(file_content)
|
|
224
|
+
|
|
202
225
|
# identify carrier for customized prompting
|
|
203
226
|
carrier = await identify_carrier(
|
|
204
|
-
document,
|
|
227
|
+
document,
|
|
228
|
+
llm_client,
|
|
229
|
+
carrier_prompt,
|
|
230
|
+
carrier_schema,
|
|
231
|
+
doc_type=input_doc_type,
|
|
205
232
|
)
|
|
206
233
|
|
|
207
|
-
|
|
208
|
-
response_schema = prompt_library.library[input_doc_type][carrier][
|
|
209
|
-
"placeholders"
|
|
210
|
-
]
|
|
211
|
-
|
|
234
|
+
# Select prompt
|
|
212
235
|
if (
|
|
213
|
-
input_doc_type in prompt_library.library
|
|
214
|
-
|
|
236
|
+
input_doc_type not in prompt_library.library
|
|
237
|
+
or carrier not in prompt_library.library[input_doc_type]
|
|
215
238
|
):
|
|
216
|
-
|
|
217
|
-
|
|
239
|
+
return {}
|
|
240
|
+
|
|
241
|
+
# get the related prompt from predefined prompt library
|
|
242
|
+
prompt = prompt_library.library[input_doc_type][carrier]["prompt"]
|
|
243
|
+
|
|
244
|
+
# Add page-number extraction for moderately large docs
|
|
245
|
+
use_chunking = number_of_pages >= params["chunk_after"]
|
|
218
246
|
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
247
|
+
# Update schema and prompt to extract value-page_number pairs
|
|
248
|
+
if not use_chunking and number_of_pages > 1:
|
|
249
|
+
response_schema = transform_schema_strings(response_schema)
|
|
250
|
+
prompt += "\nFor each field, provide the page number where the information was found. The page numbering starts from 0."
|
|
251
|
+
|
|
252
|
+
tasks = []
|
|
253
|
+
# If the page count reaches the chunking threshold, split the PDF into chunks; all chunks are then processed concurrently.
|
|
254
|
+
for chunk in (
|
|
255
|
+
split_pdf_into_chunks(file_content, chunk_size=params["chunk_size"])
|
|
256
|
+
if use_chunking
|
|
257
|
+
else [file_content]
|
|
258
|
+
):
|
|
259
|
+
tasks.append(
|
|
260
|
+
process_chunk_with_retry(
|
|
261
|
+
chunk, prompt, response_schema, llm_client, input_doc_type
|
|
262
|
+
)
|
|
222
263
|
)
|
|
223
|
-
|
|
224
|
-
|
|
264
|
+
|
|
265
|
+
results = await asyncio.gather(*tasks, return_exceptions=True)
|
|
266
|
+
|
|
267
|
+
if use_chunking:
|
|
268
|
+
return merge_llm_results(results, response_schema)
|
|
269
|
+
else:
|
|
270
|
+
return llm_prediction_to_tuples(results[0], number_of_pages=number_of_pages)
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
async def process_chunk_with_retry(
    chunk_content, prompt, response_schema, llm_client, input_doc_type, retries=2
):
    """Process a chunk via ``process_chunk``, retrying when an attempt fails.

    Args:
        chunk_content: The chunk forwarded unchanged to ``process_chunk``.
        prompt: Prompt forwarded to ``process_chunk``.
        response_schema: Response schema forwarded to ``process_chunk``.
        llm_client: LLM client forwarded to ``process_chunk``.
        input_doc_type: Document type forwarded to ``process_chunk``.
        retries (int): Total number of attempts (not *additional* attempts).
            Values below 1 are treated as 1 so the chunk is always tried once.

    Returns:
        Whatever ``process_chunk`` returns on the first successful attempt.

    Raises:
        Exception: The error raised by the final attempt, once every attempt
            has failed.
    """
    # Guard against retries <= 0: the loop would otherwise never run and the
    # function would silently return None without processing the chunk.
    attempts = max(1, retries)

    for attempt in range(1, attempts + 1):
        try:
            return await process_chunk(
                chunk_content=chunk_content,
                prompt=prompt,
                response_schema=response_schema,
                llm_client=llm_client,
                input_doc_type=input_doc_type,
            )
        except Exception as e:
            logger.error(f"Chunk failed on attempt {attempt}: {e}")
            if attempt == attempts:
                # Out of attempts: propagate the last error to the caller.
                raise
            await asyncio.sleep(1)  # small fixed backoff before the next attempt
|
|
291
|
+
|
|
292
|
+
|
|
293
|
+
async def process_chunk(
    chunk_content, prompt, response_schema, llm_client, input_doc_type
):
    """Run a single chunk through the LLM client and return its response.

    The chunk is first converted with ``llm_client.prepare_document_for_gemini``
    and the converted document is then passed, together with the prompt, the
    response schema and the document type, to
    ``llm_client.get_unified_json_genai``.

    Args:
        chunk_content: Raw chunk handed to ``prepare_document_for_gemini``.
        prompt: Prompt text for the request.
        response_schema: Schema passed as ``response_schema``.
        llm_client: Client exposing ``prepare_document_for_gemini`` and the
            awaitable ``get_unified_json_genai``.
        input_doc_type: Document type passed as ``doc_type``.

    Returns:
        The awaited result of ``llm_client.get_unified_json_genai``.
    """
    request_kwargs = {
        "prompt": prompt,
        "document": llm_client.prepare_document_for_gemini(chunk_content),
        "response_schema": response_schema,
        "doc_type": input_doc_type,
    }
    response = await llm_client.get_unified_json_genai(**request_kwargs)
    return response
|
|
304
|
+
|
|
305
|
+
|
|
306
|
+
def merge_llm_results(results, response_schema):
    """Merge LLM results from multiple chunks into a single dictionary.

    Args:
        results (list): One entry per chunk, in chunk order. Entries that are
            not dicts (for example exceptions collected from failed chunks)
            are skipped.
        response_schema (dict): Schema whose ``"properties"`` mapping gives the
            ``"type"`` of each field; a missing ``"properties"`` key is treated
            as an empty mapping.

    Returns:
        dict: The merged result. Fields typed ``ARRAY`` hold the concatenation
        of the values from all chunks; every other field holds the first value
        that is not one of ``None``, ``""``, ``[]`` or ``{}``.
    """
    schema_properties = response_schema.get("properties", {})
    merged = {}

    for i, result in enumerate(results):
        if not isinstance(result, dict):
            continue

        # The chunk index is passed as page_number for every value of this chunk.
        # NOTE(review): the exact output shape depends on llm_prediction_to_tuples,
        # which is defined outside this block - confirm before relying on it.
        result = llm_prediction_to_tuples(result, number_of_pages=1, page_number=i)

        for key, value in result.items():
            field_type = schema_properties.get(key, {}).get("type", "").upper()

            if field_type == "ARRAY":
                # Normalise to a list, dropping empty non-list values for every
                # chunk alike (previously only the first chunk dropped them,
                # so a later chunk could contribute e.g. [None]).
                if isinstance(value, list):
                    items = value
                elif value:
                    items = [value]
                else:
                    items = []
                # Extend a list owned by `merged` so the chunk's own list is
                # never mutated.
                merged.setdefault(key, []).extend(items)
                continue

            # Non-array field: keep the first non-empty value seen.
            if key not in merged or merged[key] in (None, "", [], {}):
                merged[key] = value

    return merged
|
|
225
343
|
|
|
226
344
|
|
|
227
345
|
async def extract_data_from_pdf_w_llm(params, input_doc_type, file_content, llm_client):
|
|
@@ -298,15 +416,9 @@ async def extract_data_by_doctype(
|
|
|
298
416
|
processor_client,
|
|
299
417
|
if_use_docai,
|
|
300
418
|
if_use_llm,
|
|
419
|
+
llm_client,
|
|
301
420
|
isBetaTest=False,
|
|
302
421
|
):
|
|
303
|
-
# Select LLM client (Using 2.5 Flash model for Bundeskasse)
|
|
304
|
-
llm_client = (
|
|
305
|
-
params["LlmClient_Flash"]
|
|
306
|
-
if input_doc_type == "bundeskasse"
|
|
307
|
-
else params["LlmClient"]
|
|
308
|
-
)
|
|
309
|
-
|
|
310
422
|
async def extract_w_docai():
|
|
311
423
|
return await extract_data_from_pdf_w_docai(
|
|
312
424
|
params=params,
|
|
@@ -355,6 +467,7 @@ async def data_extraction_manual_flow(
|
|
|
355
467
|
meta,
|
|
356
468
|
processor_client,
|
|
357
469
|
schema_client,
|
|
470
|
+
use_default_logging=False,
|
|
358
471
|
):
|
|
359
472
|
"""
|
|
360
473
|
Process a PDF file and extract data from it.
|
|
@@ -375,6 +488,15 @@ async def data_extraction_manual_flow(
|
|
|
375
488
|
"""
|
|
376
489
|
# Get the start time for processing
|
|
377
490
|
start_time = asyncio.get_event_loop().time()
|
|
491
|
+
|
|
492
|
+
# Select LLM client (Using 2.5 Pro model only for PI and customsInvoice)
|
|
493
|
+
llm_client = (
|
|
494
|
+
params["LlmClient_Flash"]
|
|
495
|
+
if meta.documentTypeCode not in ["customsInvoice", "partnerInvoice"]
|
|
496
|
+
else params["LlmClient"]
|
|
497
|
+
)
|
|
498
|
+
|
|
499
|
+
page_count = None
|
|
378
500
|
# Validate the file type
|
|
379
501
|
if mime_type == "application/pdf":
|
|
380
502
|
# Enable Doc Ai only for certain document types.
|
|
@@ -396,8 +518,10 @@ async def data_extraction_manual_flow(
|
|
|
396
518
|
processor_client,
|
|
397
519
|
if_use_docai=if_use_docai,
|
|
398
520
|
if_use_llm=if_use_llm,
|
|
521
|
+
llm_client=llm_client,
|
|
399
522
|
isBetaTest=False,
|
|
400
523
|
)
|
|
524
|
+
page_count = get_pdf_page_count(file_content)
|
|
401
525
|
|
|
402
526
|
elif "excel" in mime_type or "spreadsheet" in mime_type:
|
|
403
527
|
# Extract data from the Excel file
|
|
@@ -406,8 +530,19 @@ async def data_extraction_manual_flow(
|
|
|
406
530
|
input_doc_type=meta.documentTypeCode,
|
|
407
531
|
file_content=file_content,
|
|
408
532
|
mime_type=mime_type,
|
|
533
|
+
llm_client=llm_client,
|
|
409
534
|
)
|
|
410
535
|
|
|
536
|
+
# Get sheet count from dd-trace span (set in extract_data_from_excel)
|
|
537
|
+
# Note: we use the span metric instead of len(extracted_data) because
|
|
538
|
+
# some sheets may fail extraction and not appear in extracted_data
|
|
539
|
+
span = tracer.current_span()
|
|
540
|
+
page_count = span.get_metric("est_page_count") if span else len(extracted_data)
|
|
541
|
+
if page_count > 100:
|
|
542
|
+
logger.warning(
|
|
543
|
+
f"Check logic. Count of sheets in excel file is weirdly large: {page_count}"
|
|
544
|
+
)
|
|
545
|
+
|
|
411
546
|
else:
|
|
412
547
|
raise HTTPException(
|
|
413
548
|
status_code=400,
|
|
@@ -415,7 +550,7 @@ async def data_extraction_manual_flow(
|
|
|
415
550
|
)
|
|
416
551
|
# Create the result dictionary with the extracted data
|
|
417
552
|
extracted_data = await format_all_entities(
|
|
418
|
-
extracted_data, meta.documentTypeCode, params
|
|
553
|
+
extracted_data, meta.documentTypeCode, params, mime_type
|
|
419
554
|
)
|
|
420
555
|
result = {
|
|
421
556
|
"id": meta.id,
|
|
@@ -430,7 +565,9 @@ async def data_extraction_manual_flow(
|
|
|
430
565
|
logger.info(f"Time taken to process the document: {round(elapsed_time, 4)} seconds")
|
|
431
566
|
|
|
432
567
|
# Schedule background tasks without using FastAPI's BackgroundTasks
|
|
433
|
-
if
|
|
568
|
+
if (
|
|
569
|
+
os.getenv("CLUSTER") != "ode"
|
|
570
|
+
) & use_default_logging: # skip data export to bigquery in ODE environment
|
|
434
571
|
asyncio.create_task(
|
|
435
572
|
run_background_tasks(
|
|
436
573
|
params,
|
|
@@ -441,6 +578,7 @@ async def data_extraction_manual_flow(
|
|
|
441
578
|
processor_version,
|
|
442
579
|
mime_type,
|
|
443
580
|
elapsed_time,
|
|
581
|
+
page_count,
|
|
444
582
|
)
|
|
445
583
|
)
|
|
446
584
|
return result
|