data-science-document-ai 1.37.0__py3-none-any.whl → 1.51.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. {data_science_document_ai-1.37.0.dist-info → data_science_document_ai-1.51.0.dist-info}/METADATA +3 -3
  2. data_science_document_ai-1.51.0.dist-info/RECORD +60 -0
  3. {data_science_document_ai-1.37.0.dist-info → data_science_document_ai-1.51.0.dist-info}/WHEEL +1 -1
  4. src/constants.py +6 -10
  5. src/docai.py +14 -5
  6. src/docai_processor_config.yaml +0 -56
  7. src/excel_processing.py +34 -13
  8. src/io.py +69 -1
  9. src/llm.py +10 -32
  10. src/pdf_processing.py +192 -57
  11. src/postprocessing/common.py +252 -590
  12. src/postprocessing/postprocess_partner_invoice.py +139 -89
  13. src/prompts/library/arrivalNotice/other/placeholders.json +70 -0
  14. src/prompts/library/arrivalNotice/other/prompt.txt +40 -0
  15. src/prompts/library/bookingConfirmation/evergreen/placeholders.json +17 -17
  16. src/prompts/library/bookingConfirmation/evergreen/prompt.txt +1 -0
  17. src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +18 -18
  18. src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +1 -1
  19. src/prompts/library/bookingConfirmation/maersk/placeholders.json +17 -17
  20. src/prompts/library/bookingConfirmation/maersk/prompt.txt +1 -1
  21. src/prompts/library/bookingConfirmation/msc/placeholders.json +17 -17
  22. src/prompts/library/bookingConfirmation/msc/prompt.txt +1 -1
  23. src/prompts/library/bookingConfirmation/oocl/placeholders.json +17 -17
  24. src/prompts/library/bookingConfirmation/oocl/prompt.txt +3 -1
  25. src/prompts/library/bookingConfirmation/other/placeholders.json +17 -17
  26. src/prompts/library/bookingConfirmation/other/prompt.txt +1 -1
  27. src/prompts/library/bookingConfirmation/yangming/placeholders.json +17 -17
  28. src/prompts/library/bookingConfirmation/yangming/prompt.txt +1 -1
  29. src/prompts/library/bundeskasse/other/placeholders.json +25 -25
  30. src/prompts/library/bundeskasse/other/prompt.txt +8 -6
  31. src/prompts/library/commercialInvoice/other/placeholders.json +125 -0
  32. src/prompts/library/commercialInvoice/other/prompt.txt +2 -1
  33. src/prompts/library/customsAssessment/other/placeholders.json +67 -16
  34. src/prompts/library/customsAssessment/other/prompt.txt +24 -37
  35. src/prompts/library/customsInvoice/other/placeholders.json +29 -20
  36. src/prompts/library/customsInvoice/other/prompt.txt +9 -4
  37. src/prompts/library/deliveryOrder/other/placeholders.json +79 -28
  38. src/prompts/library/deliveryOrder/other/prompt.txt +26 -40
  39. src/prompts/library/draftMbl/other/placeholders.json +33 -33
  40. src/prompts/library/draftMbl/other/prompt.txt +34 -44
  41. src/prompts/library/finalMbL/other/placeholders.json +34 -34
  42. src/prompts/library/finalMbL/other/prompt.txt +34 -44
  43. src/prompts/library/packingList/other/placeholders.json +98 -0
  44. src/prompts/library/packingList/other/prompt.txt +1 -1
  45. src/prompts/library/partnerInvoice/other/placeholders.json +2 -23
  46. src/prompts/library/partnerInvoice/other/prompt.txt +7 -18
  47. src/prompts/library/preprocessing/carrier/placeholders.json +0 -16
  48. src/prompts/library/shippingInstruction/other/placeholders.json +115 -0
  49. src/prompts/library/shippingInstruction/other/prompt.txt +28 -15
  50. src/setup.py +13 -61
  51. src/utils.py +189 -29
  52. data_science_document_ai-1.37.0.dist-info/RECORD +0 -59
  53. src/prompts/library/draftMbl/hapag-lloyd/prompt.txt +0 -44
  54. src/prompts/library/draftMbl/maersk/prompt.txt +0 -17
  55. src/prompts/library/finalMbL/hapag-lloyd/prompt.txt +0 -44
  56. src/prompts/library/finalMbL/maersk/prompt.txt +0 -17
src/setup.py CHANGED
@@ -1,12 +1,8 @@
 """Contains project setup parameters and initialization functions."""
-import argparse
 import json
-
-# import streamlit as st
 import os
 import random
 import time
-from pathlib import Path
 
 import toml
 import vertexai
@@ -19,7 +15,7 @@ from src.constants import project_parameters
 from src.constants_sandbox import project_parameters_sandbox
 
 # Parent repos are imported without .
-from src.io import download_dir_from_bucket, get_storage_client, logger
+from src.io import get_bq_client, get_storage_client, logger
 from src.llm import LlmClient
 
 
@@ -69,50 +65,6 @@ def get_docai_schema_client(params, async_=True):
     return client
 
 
-def parse_input():
-    """Manage input parameters."""
-    parser = argparse.ArgumentParser(description="", add_help=False)
-    parser.add_argument(
-        "--scope",
-        type=str,
-        dest="scope",
-        required=False,
-        help="Whether the function should 'upload' or 'download' documents",
-    )
-    parser.add_argument(
-        "--document_name",
-        type=str,
-        dest="document_name",
-        required=False,
-        help="Category of the document (e.g., 'commercialInvoice', 'packingList')",
-    )
-    parser.add_argument(
-        "--for_combinations",
-        type=bool,
-        default=False,
-        dest="for_combinations",
-        required=False,
-        help="A flag to download documents into a special subfolder",
-    )
-    parser.add_argument(
-        "--n_samples",
-        type=int,
-        default=50,
-        dest="n_samples",
-        required=False,
-        help="A number of samples to download",
-    )
-
-    # Remove declared missing arguments (e.g. model_type)
-    args = vars(parser.parse_args())
-    args_no_null = {
-        k: v.split(",") if isinstance(v, str) else v
-        for k, v in args.items()
-        if v is not None
-    }
-    return args_no_null
-
-
 def setup_params(args=None):
     """
     Set up the application parameters.
@@ -163,6 +115,10 @@ def setup_params(args=None):
 
     params = setup_docai_client_and_path(params)
 
+    # Set up BigQuery client for logging
+    bq_client, _ = get_bq_client(params)
+    params["bq_client"] = bq_client
+
     # Set up Vertex AI for text embeddings
     setup_vertexai(params)
 
@@ -174,15 +130,7 @@ def setup_params(args=None):
     assert params.keys() & yaml_content.keys() == set()
     params.update(yaml_content)
 
-    # Get models meta data from cloud
-    client = get_storage_client(params)
-    bucket = client.bucket(params["doc_ai_bucket_name"])
-    downloaded_meta = download_dir_from_bucket(
-        bucket, params["g_model_data_folder"], Path(params["local_model_data_folder"])
-    )
-    if not downloaded_meta:
-        logger.info(f"Could not load models metadata from cloud.")
-
+    # Set up LLM clients
    params["LlmClient"] = LlmClient(
         openai_key=os.getenv("OPENAI_KEY"), parameters=params["gemini_params"]
     )
@@ -190,7 +138,8 @@ def setup_params(args=None):
         openai_key=os.getenv("OPENAI_KEY"), parameters=params["gemini_flash_params"]
     )
 
-    params["lookup_data"] = setup_lookup_data(params, bucket)
+    # Load lookup data from GCS bucket
+    setup_lookup_data(params)
 
     return params
 
@@ -223,10 +172,13 @@ def setup_vertexai(params):
     )
 
 
-def setup_lookup_data(params, bucket):
+def setup_lookup_data(params):
     """
     Loads JSON mapping data from given GCP Bucket.
     """
+    client = get_storage_client(params)
+    bucket = client.bucket(params["doc_ai_bucket_name"])
+
     data = dict()
 
     input_path_item_code = (
@@ -249,4 +201,4 @@ def setup_lookup_data(params):
         input_path_reverse_charge
     )
 
-    return data
+    params["lookup_data"] = data
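For reference, the net effect of these setup.py changes: setup_params no longer pulls model metadata from the bucket, creates the BigQuery client once, and setup_lookup_data now attaches its result to params instead of returning it. A minimal consumer sketch under those assumptions (the call site itself is illustrative, not part of the package):

    # Illustrative call site for the 1.51.0 setup flow.
    from src.setup import setup_params

    params = setup_params()

    lookup = params["lookup_data"]    # populated in place by setup_lookup_data(params)
    bq_client = params["bq_client"]   # created once via get_bq_client(params)
    llm = params["LlmClient"]         # LlmClient configured from gemini_params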
src/utils.py CHANGED
@@ -6,37 +6,29 @@ import json
 import os
 import pickle
 from datetime import datetime
-from typing import Literal
+from typing import Any, Dict, List, Literal, Optional
 
+import httpx
+import numpy as np
 import openpyxl
 import pandas as pd
 from google.cloud import documentai_v1beta3 as docu_ai_beta
-from PyPDF2 import PdfReader, PdfWriter
+from pypdf import PdfReader, PdfWriter
 
-from src.io import get_bq_client, get_storage_client, logger
+from src.io import bq_logs, get_storage_client, logger
 
 
-def bq_logs(data_to_insert, params):
-    """Insert logs into Google BigQuery.
+def get_pdf_page_count(pdf_bytes):
+    """Get the number of pages in a PDF document efficiently.
 
     Args:
-        data_to_insert (list): The data to insert into BigQuery.
-        params (dict): The parameters dictionary.
+        pdf_bytes (bytes): The PDF content as bytes.
+
+    Returns:
+        int: The number of pages in the PDF.
     """
-    # Get the BigQuery client
-    bq_client, config = get_bq_client(params)
-    # Get the table string
-    table_string = f"{params['g_ai_project_name']}.{params['g_ai_gbq_db_schema']}.{params['g_ai_gbq_db_table_out']}"
-
-    logger.info(f"Log table: {table_string}")
-    # Insert the rows into the table
-    insert_logs = bq_client.insert_rows_json(table_string, data_to_insert)
-
-    # Check if there were any errors inserting the rows
-    if not insert_logs:
-        logger.info("New rows have been added.")
-    else:
-        logger.info("Errors occurred while inserting rows: ", insert_logs)
+    reader = PdfReader(io.BytesIO(pdf_bytes))
+    return len(reader.pages)
 
 
 async def get_data_set_schema_from_docai(
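bq_logs has moved to src.io, and ad-hoc page counting is replaced by the new get_pdf_page_count helper. A short usage sketch (the file path is hypothetical):

    # Illustrative: count pages directly from raw PDF bytes.
    from src.utils import get_pdf_page_count

    with open("sample.pdf", "rb") as f:  # hypothetical input file
        pdf_bytes = f.read()

    n_pages = get_pdf_page_count(pdf_bytes)  # wraps pypdf's PdfReader(...).pages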
@@ -138,7 +130,12 @@ def store_json_in_gcs(
     bucket = storage_client.bucket(params.get("doc_ai_bucket_name"))
     full_object_name = folder_path + document_id
     blob = bucket.blob(full_object_name)
-    blob.upload_from_string(json_data, content_type="application/json")
+
+    # Convert dict to JSON string if needed
+    json_string = (
+        json.dumps(json_data) if isinstance(json_data, dict) else json_data
+    )
+    blob.upload_from_string(json_string, content_type="application/json")
 
     logger.info(
         f"JSON object stored successfully in gs://{params.get('doc_ai_bucket_name')}/{full_object_name}"  # noqa
@@ -158,6 +155,7 @@ async def run_background_tasks(
     processor_version,
     mime_type,
     elapsed_time=None,
+    page_count=None,
 ):
     """
     Run background tasks asynchronously.
@@ -171,6 +169,7 @@ async def run_background_tasks(
         processor_version: The processor version used to extract the data.
         mime_type: The MIME type of the document.
         elapsed_time: The time taken to process the document.
+        page_count (int, optional): The number of pages in the document.
 
     Returns:
         None
@@ -179,13 +178,8 @@ async def run_background_tasks(
 
     await loop.run_in_executor(None, store_json_in_gcs, params, doc_id, store_data)
 
-    # Keep the page count as 1 for Excel files.
-    page_count = 1
-    # calculate the number of pages processed for PDFs
-    try:
-        if mime_type == "application/pdf":
-            page_count = len(json.loads(store_data.encode("utf-8"))["pages"])
-    except AttributeError:
+    # Use the passed page_count or default to 0 if not provided
+    if page_count is None:
         page_count = 0
 
     # Log the request in BigQuery
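The page count now arrives as an explicit argument instead of being re-parsed from the stored JSON. A sketch of the new calling pattern (the leading parameters and surrounding handler are assumed, since they are not shown in this diff):

    # Illustrative caller: compute page_count up front, then thread it through.
    if mime_type == "application/pdf":
        page_count = get_pdf_page_count(file_bytes)  # helper introduced above
    else:
        page_count = estimate_page_count(sheet)      # Excel path; helper added below

    await run_background_tasks(
        params, doc_id, store_data, processor_version, mime_type,
        elapsed_time=elapsed, page_count=page_count,
    )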
@@ -364,3 +358,169 @@ def extract_top_pages(pdf_bytes, num_pages=4):
     writer.write(output)
 
     return output.getvalue()
+
+
+async def get_tms_mappings(
+    input_list: List[str], embedding_type: str, llm_ports: Optional[List[str]] = None
+) -> Dict[str, Any]:
+    """Get TMS mappings for the given values.
+
+    Args:
+        input_list (list[str]): List of strings to get embeddings for.
+        embedding_type (str): Type of embedding to use
+            (e.g., "container_types", "ports", "depots", "lineitems", "terminals").
+        llm_ports (list[str], optional): List of LLM ports to use. Defaults to None.
+
+    Returns:
+        dict or string: A dictionary or a string with the mapping results.
+    """
+    base_url = (
+        "http://0.0.0.0:8080/"
+        if os.getenv("CLUSTER") is None
+        else "http://tms-mappings.api.svc.cluster.local./"
+    )
+
+    # Ensure clean inputs
+    if not input_list:
+        return {}
+
+    # Ensure input_list is a list
+    if not isinstance(input_list, list):
+        input_list = [input_list]
+
+    # Always send a dict with named keys
+    payload = {embedding_type: input_list}
+
+    if llm_ports:
+        payload["llm_ports"] = llm_ports if isinstance(llm_ports, list) else [llm_ports]
+
+    # Make the POST request to the TMS mappings API
+    url = f"{base_url}{embedding_type}"
+
+    # Use a timeout so the code doesn't hang forever
+    timeout = httpx.Timeout(60.0, connect=10.0)
+
+    async with httpx.AsyncClient(timeout=timeout) as client:
+        try:
+            response = await client.post(url, json=payload)
+            response.raise_for_status()
+
+            # Structure expected: {"response": {"data": {"desc1": "code1", "desc2": "code2"}}}
+            return response.json().get("response", {}).get("data", {})
+
+        except httpx.HTTPStatusError as exc:
+            logger.error(
+                f"Error from TMS mappings API: {exc.response.status_code} - {exc.response.text}"
+            )
+            return {}
+
+
+async def batch_fetch_all_mappings(container_types, terminals, depots):
+    """Batch fetch all mappings for container types, terminals, and depots."""
+    # run batch calls concurrently
+    results = await asyncio.gather(
+        get_tms_mappings(list(container_types), "container_types"),
+        get_tms_mappings(list(terminals), "terminals"),
+        get_tms_mappings(list(depots), "depots"),
+    )
+
+    batch_container_map, batch_terminal_map, batch_depot_map = results
+
+    # Convert lists of tuples to dicts if necessary
+    return (
+        dict(batch_container_map or {}),
+        dict(batch_terminal_map or {}),
+        dict(batch_depot_map or {}),
+    )
+
+
+def transform_schema_strings(schema):
+    """
+    Recursively transforms a schema dictionary, replacing all "type": "STRING"
+    definitions with a new object containing "value" and "page_number" fields.
+    It preserves 'nullable' and 'description' fields by moving them to the
+    new 'value' property.
+
+    Args:
+        schema (dict): The input schema dictionary.
+
+    Returns:
+        dict: The transformed schema dictionary.
+    """
+    if not isinstance(schema, dict):
+        return schema
+
+    schema_type = schema.get("type")
+    if not schema_type:
+        return schema
+
+    # Base case: STRING → OBJECT (only if not already transformed)
+    if schema_type.upper() == "STRING":
+        return {
+            "type": "OBJECT",
+            "properties": {
+                "value": {
+                    "type": "STRING",
+                    "nullable": schema.get("nullable", False),
+                    "description": schema.get("description", ""),
+                },
+                "page_number": {
+                    "type": "STRING",
+                    "description": "Number of a page where the value was found in the document starting from 0.",
+                },
+            },
+            "required": [],
+        }
+
+    # Skip already transformed OBJECT (has both 'value' & 'page_number')
+    if (
+        schema_type.upper() == "OBJECT"
+        and "properties" in schema
+        and {"value", "page_number"}.issubset(schema["properties"].keys())
+    ):
+        return schema
+
+    # Recursive case for OBJECT
+    if schema_type.upper() == "OBJECT" and "properties" in schema:
+        new_schema = schema.copy()
+        new_schema["properties"] = {
+            k: transform_schema_strings(v) for k, v in schema["properties"].items()
+        }
+        return new_schema
+
+    # Recursive case for ARRAY
+    if schema_type.upper() == "ARRAY" and "items" in schema:
+        new_schema = schema.copy()
+        new_schema["items"] = transform_schema_strings(schema["items"])
+        return new_schema
+
+    return schema
+
+
+def estimate_page_count(sheet):
+    """Assuming a page is 10 columns x 50 rows."""
+    if hasattr(sheet, "shape"):
+        pg_cnt = sheet.shape[0] * sheet.shape[1]
+    elif hasattr(sheet, "max_row"):
+        pg_cnt = sheet.max_column * sheet.max_row
+    else:
+        return None
+    return np.ceil(pg_cnt / 500)
+
+
+def split_pdf_into_chunks(file_content: bytes, chunk_size: int = 1):
+    """Split PDF into smaller page chunks."""
+    pdf = PdfReader(io.BytesIO(file_content))
+    total_pages = len(pdf.pages)
+
+    # TODO: update the chunk_size based on doc length. However, it breaks the page number extraction logic.
+    for i in range(0, total_pages, chunk_size):
+        writer = PdfWriter()
+        for j in range(i, min(i + chunk_size, total_pages)):
+            writer.add_page(pdf.pages[j])
+
+        buffer = io.BytesIO()
+        writer.write(buffer)
+        buffer.seek(0)
+
+        yield buffer.getvalue()
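A minimal sketch of how the new utils helpers compose, assuming the package is importable as src.utils; the schema, lookup values, and input file are made up, and the TMS calls require the tms-mappings service to be reachable:

    import asyncio

    from src.utils import (
        batch_fetch_all_mappings,
        split_pdf_into_chunks,
        transform_schema_strings,
    )

    # STRING leaves become OBJECTs carrying the value plus the page it was found on.
    schema = {
        "type": "OBJECT",
        "properties": {
            "blNumber": {"type": "STRING", "nullable": True, "description": "B/L number"}
        },
    }
    wrapped = transform_schema_strings(schema)
    # wrapped["properties"]["blNumber"]["properties"] now has "value" and "page_number".

    # The three TMS lookups run concurrently via asyncio.gather.
    containers, terminals, depots = asyncio.run(
        batch_fetch_all_mappings({"40' DRY"}, {"CTA Hamburg"}, {"Some Depot"})
    )

    # Feed single-page chunks downstream; chunk_size=1 keeps page numbering intact.
    pdf_bytes = open("sample.pdf", "rb").read()  # hypothetical input file
    for chunk in split_pdf_into_chunks(pdf_bytes, chunk_size=1):
        ...  # process one single-page PDF at a time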
data_science_document_ai-1.37.0.dist-info/RECORD DELETED
@@ -1,59 +0,0 @@
-src/constants.py,sha256=TF_UblovdXZnKIb1lnyJwUqQncJCbzBVihoelI6foSU,3579
-src/constants_sandbox.py,sha256=Iu6HdjCoNSmOX0AwoL9qUQkhq_ZnIN5U9e-Q2UfNuGc,547
-src/docai.py,sha256=AepGdF3ZuSGkujLpewX393FgOBMy-e4sEudiGKho5EA,5280
-src/docai_processor_config.yaml,sha256=qOMmCIORpLQ_D-ytvejXxFvER0e0uGYuzPVdZBGv4Pc,2105
-src/excel_processing.py,sha256=ZUlZ5zgWObmQfAWHoSrEEITKwr-xXxuOiPC3qDnGjtQ,2459
-src/io.py,sha256=IXz4wWqiHa9mnHNgtrC6X9M2lItYp9eu6rHCThUIh5c,3585
-src/llm.py,sha256=aEK3rL8XvY7CakvkOJQmcHpEKwZRd8PPrLrzHiO-GFk,7827
-src/log_setup.py,sha256=RhHnpXqcl-ii4EJzRt47CF2R-Q3YPF68tepg_Kg7tkw,2895
-src/pdf_processing.py,sha256=g-WVrI6J2lbVR3eOoDJnuy4buWh7bTmO-3aezoTN3i4,15527
-src/postprocessing/common.py,sha256=UxwmnXH7saggxDMs9Ssx_Bp3-O9NeUrcKFWRI_QYuZ0,39583
-src/postprocessing/postprocess_booking_confirmation.py,sha256=nK32eDiBNbauyQz0oCa9eraysku8aqzrcoRFoWVumDU,4827
-src/postprocessing/postprocess_commercial_invoice.py,sha256=3I8ijluTZcOs_sMnFZxfkAPle0UFQ239EMuvZfDZVPg,1028
-src/postprocessing/postprocess_partner_invoice.py,sha256=oCT-l31DTosUf0cz0d5IWOF6erw6rD3rQfR58koSUeM,11760
-src/prompts/library/bookingConfirmation/evergreen/placeholders.json,sha256=Re2wBgZoaJ5yImUUAwZOZxFcKXHxi83TCZwTuqd2v2k,1405
-src/prompts/library/bookingConfirmation/evergreen/prompt.txt,sha256=qlBMFDHy-gwr2PVeuHrfMEg_8Ibdym243DnaCgINa7g,2614
-src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json,sha256=Re2wBgZoaJ5yImUUAwZOZxFcKXHxi83TCZwTuqd2v2k,1405
-src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt,sha256=sg11U3lIhhS36BsimX7IOzR7Pez_9gScdNmJna2pPuw,3355
-src/prompts/library/bookingConfirmation/maersk/placeholders.json,sha256=PKWXySGAls6A8tujbSjokYp4ldc3c0DmSP2ITKYiUF8,1405
-src/prompts/library/bookingConfirmation/maersk/prompt.txt,sha256=-00tzWzXtQnXX3EPtaCBM39leCoLa4FB52_t7Z3eoQk,3148
-src/prompts/library/bookingConfirmation/msc/placeholders.json,sha256=Re2wBgZoaJ5yImUUAwZOZxFcKXHxi83TCZwTuqd2v2k,1405
-src/prompts/library/bookingConfirmation/msc/prompt.txt,sha256=9wdbLofnp5s1acD19jCmQuw__HMcVq1yr4vIJNJlKVM,4630
-src/prompts/library/bookingConfirmation/oocl/placeholders.json,sha256=NnXjMiEsTCzTDWs2WY7BIMo2p4_98-DL3v1r7x-FL3A,1403
-src/prompts/library/bookingConfirmation/oocl/prompt.txt,sha256=aGowVvOgl4w6TjX5O2RtD4QOiWC1JnXiWgg0t0chThU,2060
-src/prompts/library/bookingConfirmation/other/placeholders.json,sha256=Re2wBgZoaJ5yImUUAwZOZxFcKXHxi83TCZwTuqd2v2k,1405
-src/prompts/library/bookingConfirmation/other/prompt.txt,sha256=XOrq5Ns0nl8lDI9VvoOEbIMbOQdv8mcM8HqP8-eIjc4,3304
-src/prompts/library/bookingConfirmation/yangming/placeholders.json,sha256=Re2wBgZoaJ5yImUUAwZOZxFcKXHxi83TCZwTuqd2v2k,1405
-src/prompts/library/bookingConfirmation/yangming/prompt.txt,sha256=BSFy-6zDlAmOH2uZjsp-zZkR_Uy3RS5sGtdv9wysiSI,3151
-src/prompts/library/bundeskasse/other/placeholders.json,sha256=vc_m1yZP__ZMsdvnIvkgKmwF1q_ZMnKuWeIRNWour3w,4054
-src/prompts/library/bundeskasse/other/prompt.txt,sha256=GrrLH8lSuTcWmKUPUgBob0-bS_a61jTFSf09xxcE-nU,2890
-src/prompts/library/commercialInvoice/other/prompt.txt,sha256=fYUF7btc48Uqv4mJH5BgJdY4JVwj9I1xKX_HRBIrN7M,2590
-src/prompts/library/customsAssessment/other/placeholders.json,sha256=5nSGsMbpfKrpKoYImcTto_RlOvPCHyld2RlwU0Zbbqw,361
-src/prompts/library/customsAssessment/other/prompt.txt,sha256=wgJ8PYM0PKXiIKSljhFXEFBQ23GRs2E2DE9lVwHDvBU,2116
-src/prompts/library/customsInvoice/other/placeholders.json,sha256=8wkBbDwAgRjh5AW7kE7ORfVE2LGcOE5Ko-6Uo3leVJI,11651
-src/prompts/library/customsInvoice/other/prompt.txt,sha256=0YXAaCpB6tr_ed-7MCldjxha3HXZbI-m0yHwR-0sClg,8880
-src/prompts/library/deliveryOrder/other/placeholders.json,sha256=6b_6OVsxT7bjFnV_v0OZkGEy-GN5K4AjL0ATzuoLdOU,1286
-src/prompts/library/deliveryOrder/other/prompt.txt,sha256=MVSS5AhkiWT17G9X4xk_AgKzYElagvWjLPCMr_ZhmOs,2393
-src/prompts/library/draftMbl/hapag-lloyd/prompt.txt,sha256=0k1xLW4zWaenCSNQJxXMXenIwI-eYmGgpxnAAcM3HOg,2251
-src/prompts/library/draftMbl/maersk/prompt.txt,sha256=GxaIYlksORvD2uAbodRx_9JFJXD4XbDaVFYtpN9uzxc,2050
-src/prompts/library/draftMbl/other/placeholders.json,sha256=wIN06_NWsESDyNEDfOLPi3F2Vq-XPa4O3U32A32s-_Q,1736
-src/prompts/library/draftMbl/other/prompt.txt,sha256=gqbPm1joXKDUss0wU6vMc-269sx-fYWh90gWuNKOBQc,2166
-src/prompts/library/finalMbL/hapag-lloyd/prompt.txt,sha256=0k1xLW4zWaenCSNQJxXMXenIwI-eYmGgpxnAAcM3HOg,2251
-src/prompts/library/finalMbL/maersk/prompt.txt,sha256=GxaIYlksORvD2uAbodRx_9JFJXD4XbDaVFYtpN9uzxc,2050
-src/prompts/library/finalMbL/other/placeholders.json,sha256=K_yJYhQo2DnZV_Rg6xXjo6sHkSGB-SMO4IQnY47V43w,1735
-src/prompts/library/finalMbL/other/prompt.txt,sha256=gqbPm1joXKDUss0wU6vMc-269sx-fYWh90gWuNKOBQc,2166
-src/prompts/library/packingList/other/prompt.txt,sha256=Qw16n7_48GGFYWz2vRepNowZCX1UPXKetEZ1UqFXPdY,2764
-src/prompts/library/partnerInvoice/other/placeholders.json,sha256=y_S_iAr-hHFzrPJgquZEtR66ETMvdWp1rtwCLrLPQUU,12139
-src/prompts/library/partnerInvoice/other/prompt.txt,sha256=hHzI5Vq3BABBWPMsv72nlahfYjgq08OSpflDDhV0NX8,9360
-src/prompts/library/postprocessing/port_code/placeholders.json,sha256=2TiXf3zSzrglOMPtDOlCntIa5RSvyZQAKG2-IgrCY5A,22
-src/prompts/library/postprocessing/port_code/prompt_port_code.txt,sha256=--1wunSqEr2ox958lEhjO-0JFBfOLzA3qfKYIzG_Iok,884
-src/prompts/library/preprocessing/carrier/placeholders.json,sha256=1UmrQNqBEsjLIpOO-a39Az6bQ_g1lxDGlwqZFU3IEt0,408
-src/prompts/library/preprocessing/carrier/prompt.txt,sha256=NLvRZQCZ6aWC1yTr7Q93jK5z7Vi_b4HBaiFYYnIsO-w,134
-src/prompts/library/shippingInstruction/other/prompt.txt,sha256=fyC24ig4FyRNnLuQM69s4ZVajsK-LHIl2dvaaEXr-6Q,1327
-src/prompts/prompt_library.py,sha256=VJWHeXN-s501C2GiidIIvQQuZdU6T1R27hE2dKBiI40,2555
-src/setup.py,sha256=TJu68mXS6Dx90Il8A_pHDnrIOiLD3q9f7FWgW0c1HOM,8352
-src/tms.py,sha256=UXbIo1QE--hIX6NZi5Qyp2R_CP338syrY9pCTPrfgnE,1741
-src/utils.py,sha256=68x3hakQ8aDfq7967XoTRe_vsneWnLbWp_jz8q_FrBA,12189
-data_science_document_ai-1.37.0.dist-info/METADATA,sha256=xutWfD4IDhKnvPWZ2d1wczu9Q70n11N4N4ZX8rcOBRU,2153
-data_science_document_ai-1.37.0.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
-data_science_document_ai-1.37.0.dist-info/RECORD,,
src/prompts/library/draftMbl/hapag-lloyd/prompt.txt DELETED
@@ -1,44 +0,0 @@
-You are a document entity extraction specialist. Given a document, the explained datapoint need to extract.
-
-blNumber: Bill of Lading number.
-voyage: The journey or route code taken by the vessel.
-portOfLoading: The port where cargo is loaded.
-portOfDischarge: The port where cargo is unloaded.
-bookingNumber: A unique identifier for the booking.
-containers:
-containerType: Type of the shipping container, usually related to it's size.
-grossWeight: Total weight of the cargo, including the tare weight of the container.
-measurements: Dimensions of the cargo (length, width, height) for freight calculations.
-packageQuantity: package quantity.
-packageType: Type of packaging used (e.g., cartons, pallets, barrels).
-containerNumber: Unique ID for tracking the shipping container.
-sealNumber: Number of the container's seal.
-vessel: The name of the vessel.
-
-
-Your task is to extract the text value of the following entities:
-
-Keywords for datapoints:
-- blNumber: Bill of Lading number, bill of landing no., swb-no., b/l no.
-- voyage: voyage, voy. no, voyage-no.
-- portOfLoading: port of loading, pol, from.]
-- portOfDischarge: port of discharge, pod, delivery, to
-- bookingNumber: Our reference, booking no., carrier reference
-- containers:
-- containerType: x 40' container
-- grossWeight: gross weight
-- measurements: Dimensions of the cargo (length, width, height) for freight calculations
-- packageQuantity: package quantity, number and kind of packages
-- packageType: Type of packaging used (e.g., cartons, pallets, barrels), number and kind of packages, description of goods
-- containerNumber: container number, cntr. nos., it is a combination of 4 letters and 7 digits separated by space right above 'SEAL'
-- sealNumber: seal number, seal nos., shipper seal, seal.
-- vessel: vessel
-
-
-You must apply the following rules:
-- The JSON schema must be followed during the extraction.
-- The values must only include text found in the document
-- Do not normalize any entity value.
-- If 'sealNumber' is not found don't add it to the result.
-- Validate the JSON make sure it is a valid JSON ! No extra text, no missing comma!
-- Add an escape character (backwards slash) in from of all quotes in values
src/prompts/library/draftMbl/maersk/prompt.txt DELETED
@@ -1,17 +0,0 @@
-Extract the following information from the sea waybill document.
-
-**blNumber:** Find the value labeled as "B/L No.".
-**voyage:** Get the "Voyage No." value.
-**portOfLoading:** Find the value in the "Port of Loading" field.
-**portOfDischarge:** Extract the text from the "Port of Discharge" field.
-**bookingNumber:** Look for the value associated with "Booking No.".
-**containers:**
-The document may contain multiple containers listed within the section "PARTICULARS FURNISHED BY SHIPPER" under the line starting with "Kind of Packages; Description of goods; Marks and Numbers; Container No./Seal No.". Look for container information that starts with a line that includes "Container Said to Contain" and continues until the next instance of "Container Said to Contain" or the end of the section. For each container, extract the following:
-* **containerType:** Extract the container type information. It is usually a combination of numbers, the word "DRY", and may include additional characters. It is found on the same line as the container number.
-* **grossWeight:** Find the value corresponding to the "gross weight" of the container. It is usually represented in KGS and is found on the same line as the container number.
-* **measurements:** Find the value corresponding to the "measurement" of the container. It is usually represented in CBM and is found on the same line as the container number.
-* **packageQuantity:** Extract the "package quantity" information. It is usually a whole number and precedes the text "PACKAGE". All container information will be on the same line as the "package quantity".
-* **packageType:** Extract the value from the "Kind of Packages" field.
-* **containerNumber:** Find the container number. It starts with "MRKU" and is followed by a sequence of digits. It is found on the same line as the text "Container Said to Contain".
-* **sealNumber:** Get the "Shipper Seal" value. It follows after the text "Shipper Seal :".
-**vessel:** Extract the text from the field "Vessel".
src/prompts/library/finalMbL/hapag-lloyd/prompt.txt DELETED
@@ -1,44 +0,0 @@
-You are a document entity extraction specialist. Given a document, the explained datapoint need to extract.
-
-blNumber: Bill of Lading number.
-voyage: The journey or route code taken by the vessel.
-portOfLoading: The port where cargo is loaded.
-portOfDischarge: The port where cargo is unloaded.
-bookingNumber: A unique identifier for the booking.
-containers:
-containerType: Type of the shipping container, usually related to it's size.
-grossWeight: Total weight of the cargo, including the tare weight of the container.
-measurements: Dimensions of the cargo (length, width, height) for freight calculations.
-packageQuantity: package quantity.
-packageType: Type of packaging used (e.g., cartons, pallets, barrels).
-containerNumber: Unique ID for tracking the shipping container.
-sealNumber: Number of the container's seal.
-vessel: The name of the vessel.
-
-
-Your task is to extract the text value of the following entities:
-
-Keywords for datapoints:
-- blNumber: Bill of Lading number, bill of landing no., swb-no., b/l no.
-- voyage: voyage, voy. no, voyage-no.
-- portOfLoading: port of loading, pol, from.]
-- portOfDischarge: port of discharge, pod, delivery, to
-- bookingNumber: Our reference, booking no., carrier reference
-- containers:
-- containerType: x 40' container
-- grossWeight: gross weight
-- measurements: Dimensions of the cargo (length, width, height) for freight calculations
-- packageQuantity: package quantity, number and kind of packages
-- packageType: Type of packaging used (e.g., cartons, pallets, barrels), number and kind of packages, description of goods
-- containerNumber: container number, cntr. nos., it is a combination of 4 letters and 7 digits separated by space right above 'SEAL'
-- sealNumber: seal number, seal nos., shipper seal, seal.
-- vessel: vessel
-
-
-You must apply the following rules:
-- The JSON schema must be followed during the extraction.
-- The values must only include text found in the document
-- Do not normalize any entity value.
-- If 'sealNumber' is not found don't add it to the result.
-- Validate the JSON make sure it is a valid JSON ! No extra text, no missing comma!
-- Add an escape character (backwards slash) in from of all quotes in values
src/prompts/library/finalMbL/maersk/prompt.txt DELETED
@@ -1,17 +0,0 @@
-Extract the following information from the sea waybill document.
-
-**blNumber:** Find the value labeled as "B/L No.".
-**voyage:** Get the "Voyage No." value.
-**portOfLoading:** Find the value in the "Port of Loading" field.
-**portOfDischarge:** Extract the text from the "Port of Discharge" field.
-**bookingNumber:** Look for the value associated with "Booking No.".
-**containers:**
-The document may contain multiple containers listed within the section "PARTICULARS FURNISHED BY SHIPPER" under the line starting with "Kind of Packages; Description of goods; Marks and Numbers; Container No./Seal No.". Look for container information that starts with a line that includes "Container Said to Contain" and continues until the next instance of "Container Said to Contain" or the end of the section. For each container, extract the following:
-* **containerType:** Extract the container type information. It is usually a combination of numbers, the word "DRY", and may include additional characters. It is found on the same line as the container number.
-* **grossWeight:** Find the value corresponding to the "gross weight" of the container. It is usually represented in KGS and is found on the same line as the container number.
-* **measurements:** Find the value corresponding to the "measurement" of the container. It is usually represented in CBM and is found on the same line as the container number.
-* **packageQuantity:** Extract the "package quantity" information. It is usually a whole number and precedes the text "PACKAGE". All container information will be on the same line as the "package quantity".
-* **packageType:** Extract the value from the "Kind of Packages" field.
-* **containerNumber:** Find the container number. It starts with "MRKU" and is followed by a sequence of digits. It is found on the same line as the text "Container Said to Contain".
-* **sealNumber:** Get the "Shipper Seal" value. It follows after the text "Shipper Seal :".
-**vessel:** Extract the text from the field "Vessel".