data-science-document-ai 1.42.5__py3-none-any.whl → 1.57.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. {data_science_document_ai-1.42.5.dist-info → data_science_document_ai-1.57.0.dist-info}/METADATA +2 -2
  2. data_science_document_ai-1.57.0.dist-info/RECORD +60 -0
  3. src/constants.py +13 -34
  4. src/docai_processor_config.yaml +0 -69
  5. src/excel_processing.py +24 -14
  6. src/io.py +23 -0
  7. src/llm.py +0 -29
  8. src/pdf_processing.py +183 -76
  9. src/postprocessing/common.py +172 -28
  10. src/postprocessing/postprocess_partner_invoice.py +194 -59
  11. src/prompts/library/arrivalNotice/other/placeholders.json +70 -0
  12. src/prompts/library/arrivalNotice/other/prompt.txt +40 -0
  13. src/prompts/library/bookingConfirmation/evergreen/placeholders.json +135 -21
  14. src/prompts/library/bookingConfirmation/evergreen/prompt.txt +21 -17
  15. src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +136 -22
  16. src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +52 -58
  17. src/prompts/library/bookingConfirmation/maersk/placeholders.json +135 -21
  18. src/prompts/library/bookingConfirmation/maersk/prompt.txt +10 -1
  19. src/prompts/library/bookingConfirmation/msc/placeholders.json +135 -21
  20. src/prompts/library/bookingConfirmation/msc/prompt.txt +10 -1
  21. src/prompts/library/bookingConfirmation/oocl/placeholders.json +149 -21
  22. src/prompts/library/bookingConfirmation/oocl/prompt.txt +11 -3
  23. src/prompts/library/bookingConfirmation/other/placeholders.json +149 -21
  24. src/prompts/library/bookingConfirmation/other/prompt.txt +56 -57
  25. src/prompts/library/bookingConfirmation/yangming/placeholders.json +149 -21
  26. src/prompts/library/bookingConfirmation/yangming/prompt.txt +11 -1
  27. src/prompts/library/bundeskasse/other/placeholders.json +5 -5
  28. src/prompts/library/bundeskasse/other/prompt.txt +7 -5
  29. src/prompts/library/commercialInvoice/other/placeholders.json +125 -0
  30. src/prompts/library/commercialInvoice/other/prompt.txt +1 -1
  31. src/prompts/library/customsAssessment/other/placeholders.json +70 -0
  32. src/prompts/library/customsAssessment/other/prompt.txt +24 -37
  33. src/prompts/library/customsInvoice/other/prompt.txt +4 -3
  34. src/prompts/library/deliveryOrder/other/placeholders.json +80 -27
  35. src/prompts/library/deliveryOrder/other/prompt.txt +26 -40
  36. src/prompts/library/draftMbl/other/placeholders.json +33 -33
  37. src/prompts/library/draftMbl/other/prompt.txt +34 -44
  38. src/prompts/library/finalMbL/other/placeholders.json +80 -0
  39. src/prompts/library/finalMbL/other/prompt.txt +34 -44
  40. src/prompts/library/packingList/other/placeholders.json +98 -0
  41. src/prompts/library/partnerInvoice/other/prompt.txt +8 -7
  42. src/prompts/library/preprocessing/carrier/placeholders.json +0 -16
  43. src/prompts/library/shippingInstruction/other/placeholders.json +115 -0
  44. src/prompts/library/shippingInstruction/other/prompt.txt +26 -14
  45. src/prompts/prompt_library.py +0 -4
  46. src/setup.py +25 -24
  47. src/utils.py +120 -68
  48. data_science_document_ai-1.42.5.dist-info/RECORD +0 -57
  49. src/prompts/library/draftMbl/hapag-lloyd/prompt.txt +0 -45
  50. src/prompts/library/draftMbl/maersk/prompt.txt +0 -19
  51. src/prompts/library/finalMbL/hapag-lloyd/prompt.txt +0 -44
  52. src/prompts/library/finalMbL/maersk/prompt.txt +0 -19
  53. {data_science_document_ai-1.42.5.dist-info → data_science_document_ai-1.57.0.dist-info}/WHEEL +0 -0
src/setup.py CHANGED
@@ -1,11 +1,8 @@
1
1
  """Contains project setup parameters and initialization functions."""
2
2
  import json
3
-
4
- # import streamlit as st
5
3
  import os
6
4
  import random
7
5
  import time
8
- from pathlib import Path
9
6
 
10
7
  import toml
11
8
  import vertexai
@@ -18,7 +15,7 @@ from src.constants import project_parameters
18
15
  from src.constants_sandbox import project_parameters_sandbox
19
16
 
20
17
  # Parent repos are imported without .
21
- from src.io import download_dir_from_bucket, get_bq_client, get_storage_client, logger
18
+ from src.io import get_bq_client, get_storage_client, logger
22
19
  from src.llm import LlmClient
23
20
 
24
21
 
@@ -116,8 +113,6 @@ def setup_params(args=None):
116
113
  # Directories and paths
117
114
  os.makedirs(params["folder_data"], exist_ok=True)
118
115
 
119
- params = setup_docai_client_and_path(params)
120
-
121
116
  # Set up BigQuery client for logging
122
117
  bq_client, _ = get_bq_client(params)
123
118
  params["bq_client"] = bq_client
@@ -125,23 +120,19 @@ def setup_params(args=None):
125
120
  # Set up Vertex AI for text embeddings
126
121
  setup_vertexai(params)
127
122
 
128
- # Load models from YAML file
129
- current_dir = os.path.dirname(__file__)
130
- file_path = os.path.join(current_dir, "docai_processor_config.yaml")
131
- with open(file_path) as file:
132
- yaml_content = yaml.safe_load(file)
133
- assert params.keys() & yaml_content.keys() == set()
134
- params.update(yaml_content)
123
+ if params.get("if_use_docai"):
124
+ # Set up Document AI client and processor paths
125
+ params = setup_docai_client_and_path(params)
135
126
 
136
- # Get models meta data from cloud
137
- client = get_storage_client(params)
138
- bucket = client.bucket(params["doc_ai_bucket_name"])
139
- downloaded_meta = download_dir_from_bucket(
140
- bucket, params["g_model_data_folder"], Path(params["local_model_data_folder"])
141
- )
142
- if not downloaded_meta:
143
- logger.info(f"Could not load models metadata from cloud.")
127
+ # Load models from YAML file
128
+ current_dir = os.path.dirname(__file__)
129
+ file_path = os.path.join(current_dir, "docai_processor_config.yaml")
130
+ with open(file_path) as file:
131
+ yaml_content = yaml.safe_load(file)
132
+ assert params.keys() & yaml_content.keys() == set()
133
+ params.update(yaml_content)
144
134
 
135
+ # Set up LLM clients
145
136
  params["LlmClient"] = LlmClient(
146
137
  openai_key=os.getenv("OPENAI_KEY"), parameters=params["gemini_params"]
147
138
  )
@@ -149,7 +140,8 @@ def setup_params(args=None):
149
140
  openai_key=os.getenv("OPENAI_KEY"), parameters=params["gemini_flash_params"]
150
141
  )
151
142
 
152
- params["lookup_data"] = setup_lookup_data(params, bucket)
143
+ # Load lookup data from GCS bucket
144
+ setup_lookup_data(params)
153
145
 
154
146
  return params
155
147
 
@@ -182,15 +174,21 @@ def setup_vertexai(params):
182
174
  )
183
175
 
184
176
 
185
- def setup_lookup_data(params, bucket):
177
+ def setup_lookup_data(params):
186
178
  """
187
179
  Loads JSON mapping data from given GCP Bucket.
188
180
  """
181
+ client = get_storage_client(params)
182
+ bucket = client.bucket(params["doc_ai_bucket_name"])
183
+
189
184
  data = dict()
190
185
 
191
186
  input_path_item_code = (
192
187
  f'{params["g_model_fuzzy_lookup_folder"]}/{params["item_code_lookup"]}'
193
188
  )
189
+ input_path_intermodal_partners = (
190
+ f'{params["g_model_fuzzy_lookup_folder"]}/{params["intermodal_partners"]}'
191
+ )
194
192
  input_path_invoice_classification = f'{params["g_model_fuzzy_lookup_folder"]}/{params["invoice_classification_lookup"]}' # noqa: E501
195
193
  input_path_reverse_charge = f'{params["g_model_fuzzy_lookup_folder"]}/{params["reverse_charge_sentence_lookup"]}'
196
194
 
@@ -201,6 +199,9 @@ def setup_lookup_data(params, bucket):
201
199
  return json.loads(downloaded_data)
202
200
 
203
201
  data["item_code"] = download_json_from_bucket(input_path_item_code)
202
+ data["intermodal_partners"] = download_json_from_bucket(
203
+ input_path_intermodal_partners
204
+ )
204
205
  data["invoice_classification"] = download_json_from_bucket(
205
206
  input_path_invoice_classification
206
207
  )
@@ -208,4 +209,4 @@ def setup_lookup_data(params, bucket):
208
209
  input_path_reverse_charge
209
210
  )
210
211
 
211
- return data
212
+ params["lookup_data"] = data
src/utils.py CHANGED
@@ -6,38 +6,29 @@ import json
6
6
  import os
7
7
  import pickle
8
8
  from datetime import datetime
9
- from typing import Literal
9
+ from typing import Any, Dict, List, Literal, Optional
10
10
 
11
+ import httpx
12
+ import numpy as np
11
13
  import openpyxl
12
14
  import pandas as pd
13
- import requests
14
15
  from google.cloud import documentai_v1beta3 as docu_ai_beta
15
- from PyPDF2 import PdfReader, PdfWriter
16
+ from pypdf import PdfReader, PdfWriter
16
17
 
17
- from src.io import get_storage_client, logger
18
+ from src.io import bq_logs, get_storage_client, logger
18
19
 
19
20
 
20
- def bq_logs(data_to_insert, params):
21
- """Insert logs into Google BigQuery.
21
+ def get_pdf_page_count(pdf_bytes):
22
+ """Get the number of pages in a PDF document efficiently.
22
23
 
23
24
  Args:
24
- data_to_insert (list): The data to insert into BigQuery.
25
- params (dict): The parameters dictionary.
25
+ pdf_bytes (bytes): The PDF content as bytes.
26
+
27
+ Returns:
28
+ int: The number of pages in the PDF.
26
29
  """
27
- # Use the pre-initialized BigQuery client
28
- bq_client = params["bq_client"]
29
- # Get the table string
30
- table_string = f"{params['g_ai_project_name']}.{params['g_ai_gbq_db_schema']}.{params['g_ai_gbq_db_table_out']}"
31
-
32
- logger.info(f"Log table: {table_string}")
33
- # Insert the rows into the table
34
- insert_logs = bq_client.insert_rows_json(table_string, data_to_insert)
35
-
36
- # Check if there were any errors inserting the rows
37
- if not insert_logs:
38
- logger.info("New rows have been added.")
39
- else:
40
- logger.info("Errors occurred while inserting rows: ", insert_logs)
30
+ reader = PdfReader(io.BytesIO(pdf_bytes))
31
+ return len(reader.pages)
41
32
 
42
33
 
43
34
  async def get_data_set_schema_from_docai(
@@ -164,6 +155,7 @@ async def run_background_tasks(
164
155
  processor_version,
165
156
  mime_type,
166
157
  elapsed_time=None,
158
+ page_count=None,
167
159
  ):
168
160
  """
169
161
  Run background tasks asynchronously.
@@ -177,6 +169,7 @@ async def run_background_tasks(
177
169
  processor_version: The processor version used to extract the data.
178
170
  mime_type: The MIME type of the document.
179
171
  elapsed_time: The time taken to process the document.
172
+ page_count (int, optional): The number of pages in the document.
180
173
 
181
174
  Returns:
182
175
  None
@@ -185,13 +178,8 @@ async def run_background_tasks(
185
178
 
186
179
  await loop.run_in_executor(None, store_json_in_gcs, params, doc_id, store_data)
187
180
 
188
- # Keep the page count as 1 for Excel files.
189
- page_count = 1
190
- # calculate the number of pages processed for PDFs
191
- try:
192
- if mime_type == "application/pdf":
193
- page_count = len(json.loads(store_data.encode("utf-8"))["pages"])
194
- except AttributeError:
181
+ # Use the passed page_count or default to 0 if not provided
182
+ if page_count is None:
195
183
  page_count = 0
196
184
 
197
185
  # Log the request in BigQuery
@@ -303,9 +291,6 @@ def generate_schema_structure(params, input_doc_type):
303
291
  "type": "string",
304
292
  }
305
293
 
306
- # update schema to extract value-page_number pairs
307
- response_schema = transform_schema_strings(response_schema)
308
-
309
294
  return response_schema
310
295
 
311
296
 
@@ -375,9 +360,9 @@ def extract_top_pages(pdf_bytes, num_pages=4):
375
360
  return output.getvalue()
376
361
 
377
362
 
378
- def get_tms_mappings(
379
- input_list: list[str], embedding_type: str, llm_ports: list[str] = None
380
- ):
363
+ async def get_tms_mappings(
364
+ input_list: List[str], embedding_type: str, llm_ports: Optional[List[str]] = None
365
+ ) -> Dict[str, Any]:
381
366
  """Get TMS mappings for the given values.
382
367
 
383
368
  Args:
@@ -387,39 +372,66 @@ def get_tms_mappings(
387
372
  llm_ports (list[str], optional): List of LLM ports to use. Defaults to None.
388
373
 
389
374
  Returns:
390
- dict: A dictionary with the mapping results.
375
+ dict: A dictionary with the mapping results (empty dict on empty input or API error).
391
376
  """
392
- # To test the API locally, port-forward the embedding service in the sandbox to 8080:80
393
- # If you want to launch uvicorn from the tms-embedding repo, then use --port 8080 in the config file
394
377
  base_url = (
395
378
  "http://0.0.0.0:8080/"
396
379
  if os.getenv("CLUSTER") is None
397
380
  else "http://tms-mappings.api.svc.cluster.local./"
398
381
  )
399
382
 
383
+ # Ensure clean inputs
384
+ if not input_list:
385
+ return {}
386
+
400
387
  # Ensure input_list is a list
401
388
  if not isinstance(input_list, list):
402
389
  input_list = [input_list]
403
390
 
404
391
  # Always send a dict with named keys
405
392
  payload = {embedding_type: input_list}
393
+
406
394
  if llm_ports:
407
395
  payload["llm_ports"] = llm_ports if isinstance(llm_ports, list) else [llm_ports]
408
396
 
409
397
  # Make the POST request to the TMS mappings API
410
- url = f"{base_url}/{embedding_type}"
411
- response = requests.post(url=url, json=payload)
398
+ url = f"{base_url}{embedding_type}"
412
399
 
413
- if response.status_code != 200:
414
- logger.error(
415
- f"Error from TMS mappings API: {response.status_code} - {response.text}"
416
- )
400
+ # Use a timeout so the code doesn't hang forever
401
+ timeout = httpx.Timeout(60.0, connect=10.0)
402
+
403
+ async with httpx.AsyncClient(timeout=timeout) as client:
404
+ try:
405
+ response = await client.post(url, json=payload)
406
+ response.raise_for_status()
417
407
 
418
- formatted_values = (
419
- response.json().get("response", {}).get("data", {}).get(input_list[0], None)
408
+ # Structure expected: {"response": {"data": {"desc1": "code1", "desc2": "code2"}}}
409
+ return response.json().get("response", {}).get("data", {})
410
+
411
+ except httpx.HTTPStatusError as exc:
412
+ logger.error(
413
+ f"Error from TMS mappings API: {exc.response.status_code} - {exc.response.text}"
414
+ )
415
+ return {}
416
+
417
+
418
+ async def batch_fetch_all_mappings(container_types, terminals, depots):
419
+ """Batch fetch all mappings for container types, terminals, and depots."""
420
+ # run batch calls concurrently
421
+ results = await asyncio.gather(
422
+ get_tms_mappings(list(container_types), "container_types"),
423
+ get_tms_mappings(list(terminals), "terminals"),
424
+ get_tms_mappings(list(depots), "depots"),
420
425
  )
421
426
 
422
- return formatted_values
427
+ batch_container_map, batch_terminal_map, batch_depot_map = results
428
+
429
+ # Convert lists of tuples to dicts if necessary
430
+ return (
431
+ dict(batch_container_map or {}),
432
+ dict(batch_terminal_map or {}),
433
+ dict(batch_depot_map or {}),
434
+ )
423
435
 
424
436
 
425
437
  def transform_schema_strings(schema):
@@ -435,12 +447,23 @@ def transform_schema_strings(schema):
435
447
  Returns:
436
448
  dict: The transformed schema dictionary.
437
449
  """
438
- # Base case: if the current schema definition is for a string
439
- if isinstance(schema, dict) and schema.get("type").upper() == "STRING":
440
- new_schema = {
450
+ if not isinstance(schema, dict):
451
+ return schema
452
+
453
+ schema_type = schema.get("type")
454
+ if not schema_type:
455
+ return schema
456
+
457
+ # Base case: STRING → OBJECT (only if not already transformed)
458
+ if schema_type.upper() == "STRING":
459
+ return {
441
460
  "type": "OBJECT",
442
461
  "properties": {
443
- "value": {"type": "STRING"},
462
+ "value": {
463
+ "type": "STRING",
464
+ "nullable": schema.get("nullable", False),
465
+ "description": schema.get("description", ""),
466
+ },
444
467
  "page_number": {
445
468
  "type": "STRING",
446
469
  "description": "Number of a page where the value was found in the document starting from 0.",
@@ -449,26 +472,55 @@ def transform_schema_strings(schema):
449
472
  "required": [],
450
473
  }
451
474
 
452
- # Preserve original properties like nullable and description on the new 'value' key
453
- if "nullable" in schema:
454
- new_schema["properties"]["value"]["nullable"] = schema["nullable"]
455
- if "description" in schema:
456
- new_schema["properties"]["value"]["description"] = schema["description"]
475
+ # Skip already transformed OBJECT (has both 'value' & 'page_number')
476
+ if (
477
+ schema_type.upper() == "OBJECT"
478
+ and "properties" in schema
479
+ and {"value", "page_number"}.issubset(schema["properties"].keys())
480
+ ):
481
+ return schema
482
+
483
+ # Recursive case for OBJECT
484
+ if schema_type.upper() == "OBJECT" and "properties" in schema:
485
+ new_schema = schema.copy()
486
+ new_schema["properties"] = {
487
+ k: transform_schema_strings(v) for k, v in schema["properties"].items()
488
+ }
489
+ return new_schema
457
490
 
491
+ # Recursive case for ARRAY
492
+ if schema_type.upper() == "ARRAY" and "items" in schema:
493
+ new_schema = schema.copy()
494
+ new_schema["items"] = transform_schema_strings(schema["items"])
458
495
  return new_schema
459
496
 
460
- # Recursive case: if the schema is a dictionary
461
- elif isinstance(schema, dict) and schema.get("type").upper() == "OBJECT":
462
- transformed_schema = schema.copy()
463
- for key, value in schema.get("properties").items():
464
- transformed_schema["properties"][key] = transform_schema_strings(value)
465
- return transformed_schema
497
+ return schema
466
498
 
467
- # Recursive case: if the schema is a list
468
- elif isinstance(schema, dict) and schema.get("type").upper() == "ARRAY":
469
- schema["items"] = transform_schema_strings(schema["items"])
470
- return schema
471
499
 
472
- # Base case: for non-dict/list values (e.g., None, bool, str)
500
+ def estimate_page_count(sheet):
501
+ """Assuming a page is 10 columns x 50 rows."""
502
+ if hasattr(sheet, "shape"):
503
+ pg_cnt = sheet.shape[0] * sheet.shape[1]
504
+ elif hasattr(sheet, "max_row"):
505
+ pg_cnt = sheet.max_column * sheet.max_row
473
506
  else:
474
- return schema
507
+ return None
508
+ return np.ceil(pg_cnt / 500)
509
+
510
+
511
+ def split_pdf_into_chunks(file_content: bytes, chunk_size: int = 1):
512
+ """Split PDF into smaller page chunks."""
513
+ pdf = PdfReader(io.BytesIO(file_content))
514
+ total_pages = len(pdf.pages)
515
+
516
+ # TODO: update the chunk_size based on doc length. However, it breaks the page number extraction logic.
517
+ for i in range(0, total_pages, chunk_size):
518
+ writer = PdfWriter()
519
+ for j in range(i, min(i + chunk_size, total_pages)):
520
+ writer.add_page(pdf.pages[j])
521
+
522
+ buffer = io.BytesIO()
523
+ writer.write(buffer)
524
+ buffer.seek(0)
525
+
526
+ yield buffer.getvalue()
@@ -1,57 +0,0 @@
1
- src/constants.py,sha256=TF_UblovdXZnKIb1lnyJwUqQncJCbzBVihoelI6foSU,3579
2
- src/constants_sandbox.py,sha256=Iu6HdjCoNSmOX0AwoL9qUQkhq_ZnIN5U9e-Q2UfNuGc,547
3
- src/docai.py,sha256=dHuR0ehVjUi1CnoNvdp_yxJtpU_HFXqAZ61ywdz7BEo,5655
4
- src/docai_processor_config.yaml,sha256=qOMmCIORpLQ_D-ytvejXxFvER0e0uGYuzPVdZBGv4Pc,2105
5
- src/excel_processing.py,sha256=8toKsafUvwE5QN3TOQO3zfLo0Wv2sGxZHKPsL7n5LkA,2771
6
- src/io.py,sha256=tOJpMyI-mP1AaXKG4UFudH47MHWzjWBgVahFJUcjGfs,4749
7
- src/llm.py,sha256=OE4IEIqcM-hYK9U7e0x1rAfcqdpeo4iXPHBp64L5Qz0,8199
8
- src/log_setup.py,sha256=RhHnpXqcl-ii4EJzRt47CF2R-Q3YPF68tepg_Kg7tkw,2895
9
- src/pdf_processing.py,sha256=dxsYvNnONAjzS-T7K5aSo89rz7QcdW3ZDfeuFyeCeII,16294
10
- src/postprocessing/common.py,sha256=5W-u3lKbnPQRKT4h5EfegegMjSXOKik73X7kUx9ik0Y,21888
11
- src/postprocessing/postprocess_booking_confirmation.py,sha256=nK32eDiBNbauyQz0oCa9eraysku8aqzrcoRFoWVumDU,4827
12
- src/postprocessing/postprocess_commercial_invoice.py,sha256=3I8ijluTZcOs_sMnFZxfkAPle0UFQ239EMuvZfDZVPg,1028
13
- src/postprocessing/postprocess_partner_invoice.py,sha256=koGR7dN37FqJcepdzkrzNBHuBBUuCp_3CrteScASqyE,10590
14
- src/prompts/library/bookingConfirmation/evergreen/placeholders.json,sha256=IpM9nmSPdyroliZfXB1-NDCjiHZX_Ff5BH7-scNhGqE,1406
15
- src/prompts/library/bookingConfirmation/evergreen/prompt.txt,sha256=5ivskCG831M2scW3oqQaoltXIyHV-n6DYUygWycXxjw,2755
16
- src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json,sha256=hMPNt9s3LuxR85AxYy7bPcCDleug6gSwVjefm3ismWY,1405
17
- src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt,sha256=XgfhrFTXLJ467L4Cer77K0KTPtWTg_-QJXCsltvLlpI,3430
18
- src/prompts/library/bookingConfirmation/maersk/placeholders.json,sha256=6p_IQMA1PUgGZqjf_by4ja9jK27ba4loYhEpIa7Oxx4,1406
19
- src/prompts/library/bookingConfirmation/maersk/prompt.txt,sha256=t-yh1dOrcRa0fm0VPFC1xCRBf0R0Zjp9j_Hb31aZS1w,3223
20
- src/prompts/library/bookingConfirmation/msc/placeholders.json,sha256=IpM9nmSPdyroliZfXB1-NDCjiHZX_Ff5BH7-scNhGqE,1406
21
- src/prompts/library/bookingConfirmation/msc/prompt.txt,sha256=_Jfioislp7SNs2BEXoklvnTPVXe6Z0M6myD1IWnBFYQ,4705
22
- src/prompts/library/bookingConfirmation/oocl/placeholders.json,sha256=JTtWvLSsoxN7huXY8ZNqqPkODM-DOs5wu3YvNHOna3k,1404
23
- src/prompts/library/bookingConfirmation/oocl/prompt.txt,sha256=xNTrJdUtDalcP3AKkfRiOnHjAdRCbcTvehcBQKurRj0,2201
24
- src/prompts/library/bookingConfirmation/other/placeholders.json,sha256=IpM9nmSPdyroliZfXB1-NDCjiHZX_Ff5BH7-scNhGqE,1406
25
- src/prompts/library/bookingConfirmation/other/prompt.txt,sha256=kUK7NgVNDYFMnqOcIblCwWSw2SC0YQEtHsYrspiVUMo,3379
26
- src/prompts/library/bookingConfirmation/yangming/placeholders.json,sha256=IpM9nmSPdyroliZfXB1-NDCjiHZX_Ff5BH7-scNhGqE,1406
27
- src/prompts/library/bookingConfirmation/yangming/prompt.txt,sha256=fYKfusDajDFw0v54-nv2iAqUSp2yCeOzc6G7AFe-h2w,3226
28
- src/prompts/library/bundeskasse/other/placeholders.json,sha256=1ll8AI58F2zRDSwQq_r0gxQdxlQB521l5CuiJ-8G6us,4068
29
- src/prompts/library/bundeskasse/other/prompt.txt,sha256=WV4D3ellIcB2cVmsZXCpbbHOShYY8VN_iZrYOuyoqzw,2937
30
- src/prompts/library/commercialInvoice/other/prompt.txt,sha256=6sowYMzrKvgmTDpDnAzkeG4OqA44e6-8aUKWRKNziBY,2699
31
- src/prompts/library/customsAssessment/other/prompt.txt,sha256=XSqWa3k9LM7dTiJtX8AKTp_0x5Z0pCNRKNUWaywwBlY,2191
32
- src/prompts/library/customsInvoice/other/placeholders.json,sha256=BnWYtl4sPooTHb_EHRIlrPawBrfHI8_QVas8zytbqyY,12172
33
- src/prompts/library/customsInvoice/other/prompt.txt,sha256=Q5ihAVaZFToZ75D01ICEdCRB8nY_FD5DL3yuFvJ4418,9632
34
- src/prompts/library/deliveryOrder/other/placeholders.json,sha256=7fjqag3kCVMV4mJ52dTjAcLtaBX0paXrDrW48vQVZSk,1250
35
- src/prompts/library/deliveryOrder/other/prompt.txt,sha256=y3QjN54e8PplEJngNlxoykbdrToBefS3r8gWixCbjfE,2468
36
- src/prompts/library/draftMbl/hapag-lloyd/prompt.txt,sha256=4FxiO1eHkimZVQZXU6gGNikuDVAWNniYvY8FUdVhpvk,2327
37
- src/prompts/library/draftMbl/maersk/prompt.txt,sha256=4neW6buJirgoS84iDsy9ZcfQTaMeOFt92Emba01mzJA,2192
38
- src/prompts/library/draftMbl/other/placeholders.json,sha256=wIN06_NWsESDyNEDfOLPi3F2Vq-XPa4O3U32A32s-_Q,1736
39
- src/prompts/library/draftMbl/other/prompt.txt,sha256=pj-kgPV51upLhDppSKfhc2s5ylgr06l4IxrkFYjE9uM,2241
40
- src/prompts/library/finalMbL/hapag-lloyd/prompt.txt,sha256=RhxEJ4eWikAQiE40cuPsssnzizge6AJYFTSJLGUmz_U,2326
41
- src/prompts/library/finalMbL/maersk/prompt.txt,sha256=4neW6buJirgoS84iDsy9ZcfQTaMeOFt92Emba01mzJA,2192
42
- src/prompts/library/finalMbL/other/prompt.txt,sha256=pj-kgPV51upLhDppSKfhc2s5ylgr06l4IxrkFYjE9uM,2241
43
- src/prompts/library/packingList/other/prompt.txt,sha256=6Q9d0KBG6YWmNtzFivvmtQmitaUE2jytfwwc5YwsUgQ,2872
44
- src/prompts/library/partnerInvoice/other/placeholders.json,sha256=NX6ADT4gxLpP90uoNCYDbmfBvROxxVWRKK0lRFy1n9s,10897
45
- src/prompts/library/partnerInvoice/other/prompt.txt,sha256=fGUtMYWvhedmSiv9xShRv0cHXmEws1D9pQmZP1E2gl0,7806
46
- src/prompts/library/postprocessing/port_code/placeholders.json,sha256=2TiXf3zSzrglOMPtDOlCntIa5RSvyZQAKG2-IgrCY5A,22
47
- src/prompts/library/postprocessing/port_code/prompt_port_code.txt,sha256=--1wunSqEr2ox958lEhjO-0JFBfOLzA3qfKYIzG_Iok,884
48
- src/prompts/library/preprocessing/carrier/placeholders.json,sha256=1UmrQNqBEsjLIpOO-a39Az6bQ_g1lxDGlwqZFU3IEt0,408
49
- src/prompts/library/preprocessing/carrier/prompt.txt,sha256=NLvRZQCZ6aWC1yTr7Q93jK5z7Vi_b4HBaiFYYnIsO-w,134
50
- src/prompts/library/shippingInstruction/other/prompt.txt,sha256=dT2e-dPuvuz0rVYpwmok_1dWQ2Oa8Qy9NGZ6CCLOUI4,1468
51
- src/prompts/prompt_library.py,sha256=jPxybNPPGH7mzonqtAOqmw5WcT-RtbGP0pvMqqP22hg,2760
52
- src/setup.py,sha256=M-p5c8M9ejKcSZ9N86VtmtPc4TYLxe1_4_dxf6jpfVc,7262
53
- src/tms.py,sha256=UXbIo1QE--hIX6NZi5Qyp2R_CP338syrY9pCTPrfgnE,1741
54
- src/utils.py,sha256=nU69zR3TB7IZmCc19DD8H27Riek8GJAldmhJjCSwNEE,16090
55
- data_science_document_ai-1.42.5.dist-info/METADATA,sha256=FauluZfyiueEsYJsiMdiXv7yko2N3Xp5UTe8K0U1Toc,2153
56
- data_science_document_ai-1.42.5.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
57
- data_science_document_ai-1.42.5.dist-info/RECORD,,
@@ -1,45 +0,0 @@
1
- You are a document entity extraction specialist. Given a document, the explained datapoint need to extract.
2
-
3
- blNumber: Bill of Lading number.
4
- voyage: The journey or route code taken by the vessel.
5
- portOfLoading: The port where cargo is loaded.
6
- portOfDischarge: The port where cargo is unloaded.
7
- bookingNumber: A unique identifier for the booking.
8
- containers:
9
- containerType: Type of the shipping container, usually related to it's size.
10
- grossWeight: Total weight of the cargo, including the tare weight of the container.
11
- measurements: Dimensions of the cargo (length, width, height) for freight calculations.
12
- packageQuantity: package quantity.
13
- packageType: Type of packaging used (e.g., cartons, pallets, barrels).
14
- containerNumber: Unique ID for tracking the shipping container.
15
- sealNumber: Number of the container's seal.
16
- vessel: The name of the vessel.
17
-
18
-
19
- Your task is to extract the text value of the following entities and page numbers starting from 0 where the value was found in the document:
20
-
21
-
22
- Keywords for datapoints:
23
- - blNumber: Bill of Lading number, bill of landing no., swb-no., b/l no.
24
- - voyage: voyage, voy. no, voyage-no.
25
- - portOfLoading: port of loading, pol, from.]
26
- - portOfDischarge: port of discharge, pod, delivery, to
27
- - bookingNumber: Our reference, booking no., carrier reference
28
- - containers:
29
- - containerType: x 40' container
30
- - grossWeight: gross weight
31
- - measurements: Dimensions of the cargo (length, width, height) for freight calculations
32
- - packageQuantity: package quantity, number and kind of packages
33
- - packageType: Type of packaging used (e.g., cartons, pallets, barrels), number and kind of packages, description of goods
34
- - containerNumber: container number, cntr. nos., it is a combination of 4 letters and 7 digits separated by space right above 'SEAL'
35
- - sealNumber: seal number, seal nos., shipper seal, seal.
36
- - vessel: vessel
37
-
38
-
39
- You must apply the following rules:
40
- - The JSON schema must be followed during the extraction.
41
- - The values must only include text found in the document
42
- - Do not normalize any entity value.
43
- - If 'sealNumber' is not found don't add it to the result.
44
- - Validate the JSON make sure it is a valid JSON ! No extra text, no missing comma!
45
- - Add an escape character (backwards slash) in from of all quotes in values
@@ -1,19 +0,0 @@
1
- Extract the following information from the sea waybill document.
2
- Your task is to extract the text value of the following entities and page numbers starting from 0 where the value was found in the document:
3
-
4
-
5
- **blNumber:** Find the value labeled as "B/L No.".
6
- **voyage:** Get the "Voyage No." value.
7
- **portOfLoading:** Find the value in the "Port of Loading" field.
8
- **portOfDischarge:** Extract the text from the "Port of Discharge" field.
9
- **bookingNumber:** Look for the value associated with "Booking No.".
10
- **containers:**
11
- The document may contain multiple containers listed within the section "PARTICULARS FURNISHED BY SHIPPER" under the line starting with "Kind of Packages; Description of goods; Marks and Numbers; Container No./Seal No.". Look for container information that starts with a line that includes "Container Said to Contain" and continues until the next instance of "Container Said to Contain" or the end of the section. For each container, extract the following:
12
- * **containerType:** Extract the container type information. It is usually a combination of numbers, the word "DRY", and may include additional characters. It is found on the same line as the container number.
13
- * **grossWeight:** Find the value corresponding to the "gross weight" of the container. It is usually represented in KGS and is found on the same line as the container number.
14
- * **measurements:** Find the value corresponding to the "measurement" of the container. It is usually represented in CBM and is found on the same line as the container number.
15
- * **packageQuantity:** Extract the "package quantity" information. It is usually a whole number and precedes the text "PACKAGE". All container information will be on the same line as the "package quantity".
16
- * **packageType:** Extract the value from the "Kind of Packages" field.
17
- * **containerNumber:** Find the container number. It starts with "MRKU" and is followed by a sequence of digits. It is found on the same line as the text "Container Said to Contain".
18
- * **sealNumber:** Get the "Shipper Seal" value. It follows after the text "Shipper Seal :".
19
- **vessel:** Extract the text from the field "Vessel".
@@ -1,44 +0,0 @@
1
- You are a document entity extraction specialist. Given a document, the explained datapoint need to extract.
2
-
3
- blNumber: Bill of Lading number.
4
- voyage: The journey or route code taken by the vessel.
5
- portOfLoading: The port where cargo is loaded.
6
- portOfDischarge: The port where cargo is unloaded.
7
- bookingNumber: A unique identifier for the booking.
8
- containers:
9
- containerType: Type of the shipping container, usually related to it's size.
10
- grossWeight: Total weight of the cargo, including the tare weight of the container.
11
- measurements: Dimensions of the cargo (length, width, height) for freight calculations.
12
- packageQuantity: package quantity.
13
- packageType: Type of packaging used (e.g., cartons, pallets, barrels).
14
- containerNumber: Unique ID for tracking the shipping container.
15
- sealNumber: Number of the container's seal.
16
- vessel: The name of the vessel.
17
-
18
-
19
- Your task is to extract the text value of the following entities and page numbers starting from 0 where the value was found in the document:
20
-
21
- Keywords for datapoints:
22
- - blNumber: Bill of Lading number, bill of landing no., swb-no., b/l no.
23
- - voyage: voyage, voy. no, voyage-no.
24
- - portOfLoading: port of loading, pol, from.]
25
- - portOfDischarge: port of discharge, pod, delivery, to
26
- - bookingNumber: Our reference, booking no., carrier reference
27
- - containers:
28
- - containerType: x 40' container
29
- - grossWeight: gross weight
30
- - measurements: Dimensions of the cargo (length, width, height) for freight calculations
31
- - packageQuantity: package quantity, number and kind of packages
32
- - packageType: Type of packaging used (e.g., cartons, pallets, barrels), number and kind of packages, description of goods
33
- - containerNumber: container number, cntr. nos., it is a combination of 4 letters and 7 digits separated by space right above 'SEAL'
34
- - sealNumber: seal number, seal nos., shipper seal, seal.
35
- - vessel: vessel
36
-
37
-
38
- You must apply the following rules:
39
- - The JSON schema must be followed during the extraction.
40
- - The values must only include text found in the document
41
- - Do not normalize any entity value.
42
- - If 'sealNumber' is not found don't add it to the result.
43
- - Validate the JSON make sure it is a valid JSON ! No extra text, no missing comma!
44
- - Add an escape character (backwards slash) in from of all quotes in values
@@ -1,19 +0,0 @@
1
- Extract the following information from the sea waybill document.
2
- Your task is to extract the text value of the following entities and page numbers starting from 0 where the value was found in the document:
3
-
4
-
5
- **blNumber:** Find the value labeled as "B/L No.".
6
- **voyage:** Get the "Voyage No." value.
7
- **portOfLoading:** Find the value in the "Port of Loading" field.
8
- **portOfDischarge:** Extract the text from the "Port of Discharge" field.
9
- **bookingNumber:** Look for the value associated with "Booking No.".
10
- **containers:**
11
- The document may contain multiple containers listed within the section "PARTICULARS FURNISHED BY SHIPPER" under the line starting with "Kind of Packages; Description of goods; Marks and Numbers; Container No./Seal No.". Look for container information that starts with a line that includes "Container Said to Contain" and continues until the next instance of "Container Said to Contain" or the end of the section. For each container, extract the following:
12
- * **containerType:** Extract the container type information. It is usually a combination of numbers, the word "DRY", and may include additional characters. It is found on the same line as the container number.
13
- * **grossWeight:** Find the value corresponding to the "gross weight" of the container. It is usually represented in KGS and is found on the same line as the container number.
14
- * **measurements:** Find the value corresponding to the "measurement" of the container. It is usually represented in CBM and is found on the same line as the container number.
15
- * **packageQuantity:** Extract the "package quantity" information. It is usually a whole number and precedes the text "PACKAGE". All container information will be on the same line as the "package quantity".
16
- * **packageType:** Extract the value from the "Kind of Packages" field.
17
- * **containerNumber:** Find the container number. It starts with "MRKU" and is followed by a sequence of digits. It is found on the same line as the text "Container Said to Contain".
18
- * **sealNumber:** Get the "Shipper Seal" value. It follows after the text "Shipper Seal :".
19
- **vessel:** Extract the text from the field "Vessel".