data-science-document-ai 1.42.5__py3-none-any.whl → 1.56.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_science_document_ai-1.42.5.dist-info → data_science_document_ai-1.56.1.dist-info}/METADATA +2 -2
- {data_science_document_ai-1.42.5.dist-info → data_science_document_ai-1.56.1.dist-info}/RECORD +34 -31
- src/constants.py +7 -10
- src/docai_processor_config.yaml +0 -56
- src/excel_processing.py +24 -14
- src/io.py +23 -0
- src/llm.py +0 -29
- src/pdf_processing.py +156 -51
- src/postprocessing/common.py +172 -28
- src/postprocessing/postprocess_partner_invoice.py +194 -59
- src/prompts/library/arrivalNotice/other/placeholders.json +70 -0
- src/prompts/library/arrivalNotice/other/prompt.txt +40 -0
- src/prompts/library/bundeskasse/other/placeholders.json +5 -5
- src/prompts/library/bundeskasse/other/prompt.txt +7 -5
- src/prompts/library/commercialInvoice/other/placeholders.json +125 -0
- src/prompts/library/commercialInvoice/other/prompt.txt +1 -1
- src/prompts/library/customsAssessment/other/placeholders.json +70 -0
- src/prompts/library/customsAssessment/other/prompt.txt +24 -37
- src/prompts/library/customsInvoice/other/prompt.txt +4 -3
- src/prompts/library/deliveryOrder/other/placeholders.json +80 -27
- src/prompts/library/deliveryOrder/other/prompt.txt +26 -40
- src/prompts/library/draftMbl/other/placeholders.json +33 -33
- src/prompts/library/draftMbl/other/prompt.txt +34 -44
- src/prompts/library/finalMbL/other/placeholders.json +80 -0
- src/prompts/library/finalMbL/other/prompt.txt +34 -44
- src/prompts/library/packingList/other/placeholders.json +98 -0
- src/prompts/library/partnerInvoice/other/prompt.txt +8 -7
- src/prompts/library/preprocessing/carrier/placeholders.json +0 -16
- src/prompts/library/shippingInstruction/other/placeholders.json +115 -0
- src/prompts/library/shippingInstruction/other/prompt.txt +26 -14
- src/prompts/prompt_library.py +0 -4
- src/setup.py +15 -16
- src/utils.py +120 -68
- src/prompts/library/draftMbl/hapag-lloyd/prompt.txt +0 -45
- src/prompts/library/draftMbl/maersk/prompt.txt +0 -19
- src/prompts/library/finalMbL/hapag-lloyd/prompt.txt +0 -44
- src/prompts/library/finalMbL/maersk/prompt.txt +0 -19
- {data_science_document_ai-1.42.5.dist-info → data_science_document_ai-1.56.1.dist-info}/WHEEL +0 -0
src/utils.py
CHANGED
@@ -6,38 +6,29 @@ import json
 import os
 import pickle
 from datetime import datetime
-from typing import Literal
+from typing import Any, Dict, List, Literal, Optional
 
+import httpx
+import numpy as np
 import openpyxl
 import pandas as pd
-import requests
 from google.cloud import documentai_v1beta3 as docu_ai_beta
-from
+from pypdf import PdfReader, PdfWriter
 
-from src.io import get_storage_client, logger
+from src.io import bq_logs, get_storage_client, logger
 
 
-def
-    """
+def get_pdf_page_count(pdf_bytes):
+    """Get the number of pages in a PDF document efficiently.
 
     Args:
-
-
+        pdf_bytes (bytes): The PDF content as bytes.
+
+    Returns:
+        int: The number of pages in the PDF.
     """
-
-
-    # Get the table string
-    table_string = f"{params['g_ai_project_name']}.{params['g_ai_gbq_db_schema']}.{params['g_ai_gbq_db_table_out']}"
-
-    logger.info(f"Log table: {table_string}")
-    # Insert the rows into the table
-    insert_logs = bq_client.insert_rows_json(table_string, data_to_insert)
-
-    # Check if there were any errors inserting the rows
-    if not insert_logs:
-        logger.info("New rows have been added.")
-    else:
-        logger.info("Errors occurred while inserting rows: ", insert_logs)
+    reader = PdfReader(io.BytesIO(pdf_bytes))
+    return len(reader.pages)
 
 
 async def get_data_set_schema_from_docai(
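For orientation only (not part of the package diff), a minimal sketch of how the new get_pdf_page_count helper could be exercised; the two-page PDF is fabricated in memory with pypdf purely for illustration, and the import path is assumed:

import io
from pypdf import PdfWriter

from src.utils import get_pdf_page_count  # assumed module path

# build a throwaway two-page PDF (A4-sized blank pages) just for the example
writer = PdfWriter()
writer.add_blank_page(width=595, height=842)
writer.add_blank_page(width=595, height=842)
buffer = io.BytesIO()
writer.write(buffer)

# the helper parses the bytes with pypdf and returns the page count
assert get_pdf_page_count(buffer.getvalue()) == 2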
@@ -164,6 +155,7 @@ async def run_background_tasks(
     processor_version,
     mime_type,
     elapsed_time=None,
+    page_count=None,
 ):
     """
     Run background tasks asynchronously.
@@ -177,6 +169,7 @@ async def run_background_tasks(
         processor_version: The processor version used to extract the data.
         mime_type: The MIME type of the document.
         elapsed_time: The time taken to process the document.
+        page_count (int, optional): The number of pages in the document.
 
     Returns:
         None
@@ -185,13 +178,8 @@ async def run_background_tasks(
 
     await loop.run_in_executor(None, store_json_in_gcs, params, doc_id, store_data)
 
-    #
-    page_count
-    # calculate the number of pages processed for PDFs
-    try:
-        if mime_type == "application/pdf":
-            page_count = len(json.loads(store_data.encode("utf-8"))["pages"])
-    except AttributeError:
+    # Use the passed page_count or default to 0 if not provided
+    if page_count is None:
        page_count = 0
 
     # Log the request in BigQuery
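With this change, run_background_tasks no longer re-parses the stored Document AI JSON to count pages; the caller is expected to pass page_count in. A hypothetical caller-side helper (names and call site are illustrative, not taken from the package) could look like this:

from typing import Optional

from src.utils import get_pdf_page_count  # assumed module path for the helper added above

def resolve_page_count(mime_type: str, file_content: bytes) -> Optional[int]:
    # Illustrative only: compute the page count once, up front, and hand it to
    # run_background_tasks(..., page_count=...); for non-PDF inputs the function
    # falls back to its default of 0.
    if mime_type == "application/pdf":
        return get_pdf_page_count(file_content)
    return None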
@@ -303,9 +291,6 @@ def generate_schema_structure(params, input_doc_type):
                "type": "string",
            }
 
-    # update schema to extract value-page_number pairs
-    response_schema = transform_schema_strings(response_schema)
-
     return response_schema
 
 
@@ -375,9 +360,9 @@ def extract_top_pages(pdf_bytes, num_pages=4):
     return output.getvalue()
 
 
-def get_tms_mappings(
-    input_list:
-):
+async def get_tms_mappings(
+    input_list: List[str], embedding_type: str, llm_ports: Optional[List[str]] = None
+) -> Dict[str, Any]:
     """Get TMS mappings for the given values.
 
     Args:
@@ -387,39 +372,66 @@ def get_tms_mappings(
         llm_ports (list[str], optional): List of LLM ports to use. Defaults to None.
 
     Returns:
-        dict: A dictionary with the mapping results.
+        dict or string: A dictionary or a string with the mapping results.
     """
-    # To test the API locally, port-forward the embedding service in the sandbox to 8080:80
-    # If you want to launch uvicorn from the tms-embedding repo, then use --port 8080 in the config file
     base_url = (
         "http://0.0.0.0:8080/"
        if os.getenv("CLUSTER") is None
        else "http://tms-mappings.api.svc.cluster.local./"
    )
 
+    # Ensure clean inputs
+    if not input_list:
+        return {}
+
     # Ensure input_list is a list
     if not isinstance(input_list, list):
         input_list = [input_list]
 
     # Always send a dict with named keys
     payload = {embedding_type: input_list}
+
     if llm_ports:
         payload["llm_ports"] = llm_ports if isinstance(llm_ports, list) else [llm_ports]
 
     # Make the POST request to the TMS mappings API
-    url = f"{base_url}
-    response = requests.post(url=url, json=payload)
+    url = f"{base_url}{embedding_type}"
 
-
-
-
-
+    # Use a timeout so the code doesn't hang forever
+    timeout = httpx.Timeout(60.0, connect=10.0)
+
+    async with httpx.AsyncClient(timeout=timeout) as client:
+        try:
+            response = await client.post(url, json=payload)
+            response.raise_for_status()
 
-
-
+            # Structure expected: {"response": {"data": {"desc1": "code1", "desc2": "code2"}}}
+            return response.json().get("response", {}).get("data", {})
+
+        except httpx.HTTPStatusError as exc:
+            logger.error(
+                f"Error from TMS mappings API: {exc.response.status_code} - {exc.response.text}"
+            )
+            return {}
+
+
+async def batch_fetch_all_mappings(container_types, terminals, depots):
+    """Batch fetch all mappings for container types, terminals, and depots."""
+    # run batch calls concurrently
+    results = await asyncio.gather(
+        get_tms_mappings(list(container_types), "container_types"),
+        get_tms_mappings(list(terminals), "terminals"),
+        get_tms_mappings(list(depots), "depots"),
    )
 
-
+    batch_container_map, batch_terminal_map, batch_depot_map = results
+
+    # Convert lists of tuples to dicts if necessary
+    return (
+        dict(batch_container_map or {}),
+        dict(batch_terminal_map or {}),
+        dict(batch_depot_map or {}),
+    )
 
 
 def transform_schema_strings(schema):
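A hedged sketch (not from the package) of driving the new async pair; the sample inputs are invented, the import path is assumed, and the call only succeeds if the TMS mappings service is reachable, for example port-forwarded to localhost:8080 as the comment removed from the old code described:

import asyncio

from src.utils import batch_fetch_all_mappings  # assumed module path

async def demo():
    # illustrative values; in the pipeline these come from extracted document entities
    container_map, terminal_map, depot_map = await batch_fetch_all_mappings(
        container_types={"40' HIGH CUBE", "20' DRY"},
        terminals={"CTA HAMBURG"},
        depots={"DEPOT BREMEN"},
    )
    print(container_map, terminal_map, depot_map)

asyncio.run(demo())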
@@ -435,12 +447,23 @@ def transform_schema_strings(schema):
     Returns:
         dict: The transformed schema dictionary.
     """
-
-
-
+    if not isinstance(schema, dict):
+        return schema
+
+    schema_type = schema.get("type")
+    if not schema_type:
+        return schema
+
+    # Base case: STRING → OBJECT (only if not already transformed)
+    if schema_type.upper() == "STRING":
+        return {
             "type": "OBJECT",
             "properties": {
-                "value": {
+                "value": {
+                    "type": "STRING",
+                    "nullable": schema.get("nullable", False),
+                    "description": schema.get("description", ""),
+                },
                 "page_number": {
                     "type": "STRING",
                     "description": "Number of a page where the value was found in the document starting from 0.",
@@ -449,26 +472,55 @@ def transform_schema_strings(schema):
             "required": [],
         }
 
-
-
-
-
-
+    # Skip already transformed OBJECT (has both 'value' & 'page_number')
+    if (
+        schema_type.upper() == "OBJECT"
+        and "properties" in schema
+        and {"value", "page_number"}.issubset(schema["properties"].keys())
+    ):
+        return schema
+
+    # Recursive case for OBJECT
+    if schema_type.upper() == "OBJECT" and "properties" in schema:
+        new_schema = schema.copy()
+        new_schema["properties"] = {
+            k: transform_schema_strings(v) for k, v in schema["properties"].items()
+        }
+        return new_schema
 
+    # Recursive case for ARRAY
+    if schema_type.upper() == "ARRAY" and "items" in schema:
+        new_schema = schema.copy()
+        new_schema["items"] = transform_schema_strings(schema["items"])
         return new_schema
 
-
-    elif isinstance(schema, dict) and schema.get("type").upper() == "OBJECT":
-        transformed_schema = schema.copy()
-        for key, value in schema.get("properties").items():
-            transformed_schema["properties"][key] = transform_schema_strings(value)
-        return transformed_schema
+    return schema
 
-    # Recursive case: if the schema is a list
-    elif isinstance(schema, dict) and schema.get("type").upper() == "ARRAY":
-        schema["items"] = transform_schema_strings(schema["items"])
-        return schema
 
-
+def estimate_page_count(sheet):
+    """Assuming a page is 10 columns x 50 rows."""
+    if hasattr(sheet, "shape"):
+        pg_cnt = sheet.shape[0] * sheet.shape[1]
+    elif hasattr(sheet, "max_row"):
+        pg_cnt = sheet.max_column * sheet.max_row
     else:
-        return
+        return None
+    return np.ceil(pg_cnt / 500)
+
+
+def split_pdf_into_chunks(file_content: bytes, chunk_size: int = 1):
+    """Split PDF into smaller page chunks."""
+    pdf = PdfReader(io.BytesIO(file_content))
+    total_pages = len(pdf.pages)
+
+    # TODO: update the chunk_size based on doc length. However, it breaks the page number extraction logic.
+    for i in range(0, total_pages, chunk_size):
+        writer = PdfWriter()
+        for j in range(i, min(i + chunk_size, total_pages)):
+            writer.add_page(pdf.pages[j])
+
+        buffer = io.BytesIO()
+        writer.write(buffer)
+        buffer.seek(0)
+
+        yield buffer.getvalue()
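To make the recursion concrete, a small self-contained illustration (the schema below is invented, not taken from the package, and the import path is assumed) of what transform_schema_strings does: every STRING leaf is wrapped into an OBJECT carrying the original value plus a page_number field, and already-wrapped leaves are left untouched on a second pass:

from src.utils import transform_schema_strings  # assumed module path

schema = {
    "type": "OBJECT",
    "properties": {
        "blNumber": {
            "type": "STRING",
            "nullable": True,
            "description": "Bill of Lading number",
        },
        "containers": {
            "type": "ARRAY",
            "items": {
                "type": "OBJECT",
                "properties": {"containerNumber": {"type": "STRING"}},
            },
        },
    },
}

transformed = transform_schema_strings(schema)

# every STRING leaf now exposes "value" and "page_number"
assert set(transformed["properties"]["blNumber"]["properties"]) == {"value", "page_number"}

# re-applying the transform is a no-op thanks to the idempotency guard
assert transform_schema_strings(transformed) == transformed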
src/prompts/library/draftMbl/hapag-lloyd/prompt.txt
DELETED
@@ -1,45 +0,0 @@
-You are a document entity extraction specialist. Given a document, the explained datapoint need to extract.
-
-blNumber: Bill of Lading number.
-voyage: The journey or route code taken by the vessel.
-portOfLoading: The port where cargo is loaded.
-portOfDischarge: The port where cargo is unloaded.
-bookingNumber: A unique identifier for the booking.
-containers:
-containerType: Type of the shipping container, usually related to it's size.
-grossWeight: Total weight of the cargo, including the tare weight of the container.
-measurements: Dimensions of the cargo (length, width, height) for freight calculations.
-packageQuantity: package quantity.
-packageType: Type of packaging used (e.g., cartons, pallets, barrels).
-containerNumber: Unique ID for tracking the shipping container.
-sealNumber: Number of the container's seal.
-vessel: The name of the vessel.
-
-
-Your task is to extract the text value of the following entities and page numbers starting from 0 where the value was found in the document:
-
-
-Keywords for datapoints:
-- blNumber: Bill of Lading number, bill of landing no., swb-no., b/l no.
-- voyage: voyage, voy. no, voyage-no.
-- portOfLoading: port of loading, pol, from.]
-- portOfDischarge: port of discharge, pod, delivery, to
-- bookingNumber: Our reference, booking no., carrier reference
-- containers:
-- containerType: x 40' container
-- grossWeight: gross weight
-- measurements: Dimensions of the cargo (length, width, height) for freight calculations
-- packageQuantity: package quantity, number and kind of packages
-- packageType: Type of packaging used (e.g., cartons, pallets, barrels), number and kind of packages, description of goods
-- containerNumber: container number, cntr. nos., it is a combination of 4 letters and 7 digits separated by space right above 'SEAL'
-- sealNumber: seal number, seal nos., shipper seal, seal.
-- vessel: vessel
-
-
-You must apply the following rules:
-- The JSON schema must be followed during the extraction.
-- The values must only include text found in the document
-- Do not normalize any entity value.
-- If 'sealNumber' is not found don't add it to the result.
-- Validate the JSON make sure it is a valid JSON ! No extra text, no missing comma!
-- Add an escape character (backwards slash) in from of all quotes in values
src/prompts/library/draftMbl/maersk/prompt.txt
DELETED
@@ -1,19 +0,0 @@
-Extract the following information from the sea waybill document.
-Your task is to extract the text value of the following entities and page numbers starting from 0 where the value was found in the document:
-
-
-**blNumber:** Find the value labeled as "B/L No.".
-**voyage:** Get the "Voyage No." value.
-**portOfLoading:** Find the value in the "Port of Loading" field.
-**portOfDischarge:** Extract the text from the "Port of Discharge" field.
-**bookingNumber:** Look for the value associated with "Booking No.".
-**containers:**
-The document may contain multiple containers listed within the section "PARTICULARS FURNISHED BY SHIPPER" under the line starting with "Kind of Packages; Description of goods; Marks and Numbers; Container No./Seal No.". Look for container information that starts with a line that includes "Container Said to Contain" and continues until the next instance of "Container Said to Contain" or the end of the section. For each container, extract the following:
-* **containerType:** Extract the container type information. It is usually a combination of numbers, the word "DRY", and may include additional characters. It is found on the same line as the container number.
-* **grossWeight:** Find the value corresponding to the "gross weight" of the container. It is usually represented in KGS and is found on the same line as the container number.
-* **measurements:** Find the value corresponding to the "measurement" of the container. It is usually represented in CBM and is found on the same line as the container number.
-* **packageQuantity:** Extract the "package quantity" information. It is usually a whole number and precedes the text "PACKAGE". All container information will be on the same line as the "package quantity".
-* **packageType:** Extract the value from the "Kind of Packages" field.
-* **containerNumber:** Find the container number. It starts with "MRKU" and is followed by a sequence of digits. It is found on the same line as the text "Container Said to Contain".
-* **sealNumber:** Get the "Shipper Seal" value. It follows after the text "Shipper Seal :".
-**vessel:** Extract the text from the field "Vessel".
src/prompts/library/finalMbL/hapag-lloyd/prompt.txt
DELETED
@@ -1,44 +0,0 @@
-You are a document entity extraction specialist. Given a document, the explained datapoint need to extract.
-
-blNumber: Bill of Lading number.
-voyage: The journey or route code taken by the vessel.
-portOfLoading: The port where cargo is loaded.
-portOfDischarge: The port where cargo is unloaded.
-bookingNumber: A unique identifier for the booking.
-containers:
-containerType: Type of the shipping container, usually related to it's size.
-grossWeight: Total weight of the cargo, including the tare weight of the container.
-measurements: Dimensions of the cargo (length, width, height) for freight calculations.
-packageQuantity: package quantity.
-packageType: Type of packaging used (e.g., cartons, pallets, barrels).
-containerNumber: Unique ID for tracking the shipping container.
-sealNumber: Number of the container's seal.
-vessel: The name of the vessel.
-
-
-Your task is to extract the text value of the following entities and page numbers starting from 0 where the value was found in the document:
-
-Keywords for datapoints:
-- blNumber: Bill of Lading number, bill of landing no., swb-no., b/l no.
-- voyage: voyage, voy. no, voyage-no.
-- portOfLoading: port of loading, pol, from.]
-- portOfDischarge: port of discharge, pod, delivery, to
-- bookingNumber: Our reference, booking no., carrier reference
-- containers:
-- containerType: x 40' container
-- grossWeight: gross weight
-- measurements: Dimensions of the cargo (length, width, height) for freight calculations
-- packageQuantity: package quantity, number and kind of packages
-- packageType: Type of packaging used (e.g., cartons, pallets, barrels), number and kind of packages, description of goods
-- containerNumber: container number, cntr. nos., it is a combination of 4 letters and 7 digits separated by space right above 'SEAL'
-- sealNumber: seal number, seal nos., shipper seal, seal.
-- vessel: vessel
-
-
-You must apply the following rules:
-- The JSON schema must be followed during the extraction.
-- The values must only include text found in the document
-- Do not normalize any entity value.
-- If 'sealNumber' is not found don't add it to the result.
-- Validate the JSON make sure it is a valid JSON ! No extra text, no missing comma!
-- Add an escape character (backwards slash) in from of all quotes in values
src/prompts/library/finalMbL/maersk/prompt.txt
DELETED
@@ -1,19 +0,0 @@
-Extract the following information from the sea waybill document.
-Your task is to extract the text value of the following entities and page numbers starting from 0 where the value was found in the document:
-
-
-**blNumber:** Find the value labeled as "B/L No.".
-**voyage:** Get the "Voyage No." value.
-**portOfLoading:** Find the value in the "Port of Loading" field.
-**portOfDischarge:** Extract the text from the "Port of Discharge" field.
-**bookingNumber:** Look for the value associated with "Booking No.".
-**containers:**
-The document may contain multiple containers listed within the section "PARTICULARS FURNISHED BY SHIPPER" under the line starting with "Kind of Packages; Description of goods; Marks and Numbers; Container No./Seal No.". Look for container information that starts with a line that includes "Container Said to Contain" and continues until the next instance of "Container Said to Contain" or the end of the section. For each container, extract the following:
-* **containerType:** Extract the container type information. It is usually a combination of numbers, the word "DRY", and may include additional characters. It is found on the same line as the container number.
-* **grossWeight:** Find the value corresponding to the "gross weight" of the container. It is usually represented in KGS and is found on the same line as the container number.
-* **measurements:** Find the value corresponding to the "measurement" of the container. It is usually represented in CBM and is found on the same line as the container number.
-* **packageQuantity:** Extract the "package quantity" information. It is usually a whole number and precedes the text "PACKAGE". All container information will be on the same line as the "package quantity".
-* **packageType:** Extract the value from the "Kind of Packages" field.
-* **containerNumber:** Find the container number. It starts with "MRKU" and is followed by a sequence of digits. It is found on the same line as the text "Container Said to Contain".
-* **sealNumber:** Get the "Shipper Seal" value. It follows after the text "Shipper Seal :".
-**vessel:** Extract the text from the field "Vessel".
{data_science_document_ai-1.42.5.dist-info → data_science_document_ai-1.56.1.dist-info}/WHEEL
RENAMED
File without changes