data-science-document-ai 1.57.0__py3-none-any.whl → 1.58.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_science_document_ai-1.57.0.dist-info → data_science_document_ai-1.58.0.dist-info}/METADATA +1 -1
- {data_science_document_ai-1.57.0.dist-info → data_science_document_ai-1.58.0.dist-info}/RECORD +7 -7
- src/postprocessing/common.py +35 -0
- src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +1 -1
- src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +3 -1
- src/utils.py +6 -2
- {data_science_document_ai-1.57.0.dist-info → data_science_document_ai-1.58.0.dist-info}/WHEEL +0 -0
{data_science_document_ai-1.57.0.dist-info → data_science_document_ai-1.58.0.dist-info}/RECORD
RENAMED
|
@@ -7,7 +7,7 @@ src/io.py,sha256=rYjXVLlriEacw1uNuPIYhg12bXNu48Qs9GYMY2YcVTE,5563
|
|
|
7
7
|
src/llm.py,sha256=a7UYA4ITUNjzct_2fHgM-bma_XWc28VC0FV71g9tnUI,7137
|
|
8
8
|
src/log_setup.py,sha256=RhHnpXqcl-ii4EJzRt47CF2R-Q3YPF68tepg_Kg7tkw,2895
|
|
9
9
|
src/pdf_processing.py,sha256=Fx-Glb9niEUU3WUCrBZ02ZYV-E2vWoUM0ifN7-0A1Q4,19961
|
|
10
|
-
src/postprocessing/common.py,sha256=
|
|
10
|
+
src/postprocessing/common.py,sha256=tyy97UBfcnSs8Oh5vVDp4D1qDRit32ri9IGqRlNZcaY,27254
|
|
11
11
|
src/postprocessing/postprocess_booking_confirmation.py,sha256=nK32eDiBNbauyQz0oCa9eraysku8aqzrcoRFoWVumDU,4827
|
|
12
12
|
src/postprocessing/postprocess_commercial_invoice.py,sha256=3I8ijluTZcOs_sMnFZxfkAPle0UFQ239EMuvZfDZVPg,1028
|
|
13
13
|
src/postprocessing/postprocess_partner_invoice.py,sha256=WuaTQK5D09dV_QNrh29ZoKX9IvQn2Ub-WnAMyRjCsvI,14240
|
|
@@ -15,8 +15,8 @@ src/prompts/library/arrivalNotice/other/placeholders.json,sha256=1vzly1amgyKt3jr
|
|
|
15
15
|
src/prompts/library/arrivalNotice/other/prompt.txt,sha256=QNuU-BvMA8VbdupVNapad4O3WmCotH5cKNxImRMbKDk,2906
|
|
16
16
|
src/prompts/library/bookingConfirmation/evergreen/placeholders.json,sha256=5efq6b--KGWeqGbvASZFTqXJgUEAvsC-0ljo-q0Lhew,5855
|
|
17
17
|
src/prompts/library/bookingConfirmation/evergreen/prompt.txt,sha256=OxNfXZaWppwsFMprthzJpOOr8ApQL4KYEmlu9fSUvxk,3485
|
|
18
|
-
src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json,sha256=
|
|
19
|
-
src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt,sha256=
|
|
18
|
+
src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json,sha256=en83Em25e5PF2OAgFJC8w-MONVnketPZ3J_3zCjIVfE,5915
|
|
19
|
+
src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt,sha256=bLHQgGR9e8X4UvFpiyd1OasD00XGvUMG6HSLQy4IgQ4,5157
|
|
20
20
|
src/prompts/library/bookingConfirmation/maersk/placeholders.json,sha256=5efq6b--KGWeqGbvASZFTqXJgUEAvsC-0ljo-q0Lhew,5855
|
|
21
21
|
src/prompts/library/bookingConfirmation/maersk/prompt.txt,sha256=S-C5cq8AkEoGKilCO0XiXLZXgZPwz9udQOTm557GG64,3984
|
|
22
22
|
src/prompts/library/bookingConfirmation/msc/placeholders.json,sha256=5efq6b--KGWeqGbvASZFTqXJgUEAvsC-0ljo-q0Lhew,5855
|
|
@@ -54,7 +54,7 @@ src/prompts/library/shippingInstruction/other/prompt.txt,sha256=CbrqlKMtB-sVY-8E
|
|
|
54
54
|
src/prompts/prompt_library.py,sha256=VJWHeXN-s501C2GiidIIvQQuZdU6T1R27hE2dKBiI40,2555
|
|
55
55
|
src/setup.py,sha256=8-vZWjC8Iwa3xxdk3iR4412VCjtNtgzVqkXcFon7UBE,7309
|
|
56
56
|
src/tms.py,sha256=UXbIo1QE--hIX6NZi5Qyp2R_CP338syrY9pCTPrfgnE,1741
|
|
57
|
-
src/utils.py,sha256=
|
|
58
|
-
data_science_document_ai-1.
|
|
59
|
-
data_science_document_ai-1.
|
|
60
|
-
data_science_document_ai-1.
|
|
57
|
+
src/utils.py,sha256=8BpuJJLiJZntZAI86cQMNa-FGjl9jbOjlCWIG27mjJo,17418
|
|
58
|
+
data_science_document_ai-1.58.0.dist-info/METADATA,sha256=8MWt4KlixrpV8lQhKmFo5i1UZn02o16vMaR3uEe94Js,2152
|
|
59
|
+
data_science_document_ai-1.58.0.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
|
|
60
|
+
data_science_document_ai-1.58.0.dist-info/RECORD,,
|
src/postprocessing/common.py
CHANGED
|
@@ -723,10 +723,45 @@ async def format_all_entities(result, document_type_code, params, mime_type):
|
|
|
723
723
|
if document_type_code in ["partnerInvoice", "bundeskasse"]:
|
|
724
724
|
await process_partner_invoice(params, aggregated_data, document_type_code)
|
|
725
725
|
|
|
726
|
+
if document_type_code in ["bookingConfirmation"]:
|
|
727
|
+
aggregated_data["legalEntity"] = await get_legal_entity(
|
|
728
|
+
aggregated_data.get("carrierName", {}).get("documentValue", None),
|
|
729
|
+
aggregated_data.get("carrierAddress", {}).get("documentValue", None),
|
|
730
|
+
)
|
|
731
|
+
|
|
726
732
|
logger.info("Data Extraction completed successfully")
|
|
727
733
|
return aggregated_data
|
|
728
734
|
|
|
729
735
|
|
|
736
|
+
async def get_legal_entity(name, address):
|
|
737
|
+
"""Get legal entity mapping from TMS mappings.
|
|
738
|
+
|
|
739
|
+
Args:
|
|
740
|
+
name (str): The name of the legal entity. Mandatory.
|
|
741
|
+
address (str): The address of the legal entity. Optional for better matching.
|
|
742
|
+
|
|
743
|
+
Returns:
|
|
744
|
+
dict: A dictionary with "documentValue" set to None and the TMS mapping result for the input (None if not found).
|
|
745
|
+
"""
|
|
746
|
+
# Name is mandatory for legal entity mapping
|
|
747
|
+
if not name:
|
|
748
|
+
return {"documentValue": None, "mappedValue": None}
|
|
749
|
+
|
|
750
|
+
# Build input safely
|
|
751
|
+
input_text = name if not address else f"{name} | {address}"
|
|
752
|
+
|
|
753
|
+
api_results = await get_tms_mappings(
|
|
754
|
+
input_list=[input_text],
|
|
755
|
+
embedding_type="legal_entities",
|
|
756
|
+
input_key="partnerNameAddress",
|
|
757
|
+
)
|
|
758
|
+
|
|
759
|
+
return {
|
|
760
|
+
"documentValue": None,
|
|
761
|
+
"formattedValue": api_results.get(input_text),
|
|
762
|
+
}
|
|
763
|
+
|
|
764
|
+
|
|
730
765
|
def add_text_without_space(text):
|
|
731
766
|
"""If the cleaned text is different from the original text, append it.
|
|
732
767
|
Useful for port names like QUINHON - Quinhon"""
|
|
@@ -134,7 +134,7 @@
|
|
|
134
134
|
"carrierAddress": {
|
|
135
135
|
"type": "STRING",
|
|
136
136
|
"nullable": true,
|
|
137
|
-
"description": "The address of the carrier who provides service and issued the document."
|
|
137
|
+
"description": "The address of the carrier who provides service and issued the document. It can be from Germany, Poland, Italy, Vietnam, China, etc."
|
|
138
138
|
},
|
|
139
139
|
"carrierName": {
|
|
140
140
|
"type": "STRING",
|
|
@@ -29,8 +29,10 @@ For Import Shipment: The loaded container / cargo arrives at a port of discharge
|
|
|
29
29
|
|
|
30
30
|
- carrierName and carrierAddress:
|
|
31
31
|
- Extract the name and address of the carrier who is the main parent company in the document.
|
|
32
|
+
- It can be found in the top section of the document, often near the logo or header.
|
|
32
33
|
- Example:
|
|
33
|
-
- "Hapag-Lloyd
|
|
34
|
+
- "Hapag-Lloyd" for carrierName
|
|
35
|
+
- Hamburg, Germany, Poland, Italy, Vietnam, China, etc. for carrierAddress
|
|
34
36
|
|
|
35
37
|
- transportLegs: Multiple Transport Legs entries may exist; capture all instances under "transportLegs". The order of the legs is important, so preserve it.
|
|
36
38
|
- eta: The estimated time of arrival for a specific leg.
|
src/utils.py
CHANGED
|
@@ -361,7 +361,10 @@ def extract_top_pages(pdf_bytes, num_pages=4):
|
|
|
361
361
|
|
|
362
362
|
|
|
363
363
|
async def get_tms_mappings(
|
|
364
|
-
input_list: List[str],
|
|
364
|
+
input_list: List[str],
|
|
365
|
+
embedding_type: str,
|
|
366
|
+
llm_ports: Optional[List[str]] = None,
|
|
367
|
+
input_key: str = None,
|
|
365
368
|
) -> Dict[str, Any]:
|
|
366
369
|
"""Get TMS mappings for the given values.
|
|
367
370
|
|
|
@@ -370,6 +373,7 @@ async def get_tms_mappings(
|
|
|
370
373
|
embedding_type (str): Type of embedding to use
|
|
371
374
|
(e.g., "container_types", "ports", "depots", "lineitems", "terminals", "legal_entities").
|
|
372
375
|
llm_ports (list[str], optional): List of LLM ports to use. Defaults to None.
|
|
376
|
+
input_key (str, optional): Key to use for input list in payload. Defaults to None, in which case embedding_type is used as the key.
|
|
373
377
|
|
|
374
378
|
Returns:
|
|
375
379
|
dict or string: A dictionary or a string with the mapping results.
|
|
@@ -389,7 +393,7 @@ async def get_tms_mappings(
|
|
|
389
393
|
input_list = [input_list]
|
|
390
394
|
|
|
391
395
|
# Always send a dict with named keys
|
|
392
|
-
payload = {embedding_type: input_list}
|
|
396
|
+
payload = {input_key or embedding_type: input_list}
|
|
393
397
|
|
|
394
398
|
if llm_ports:
|
|
395
399
|
payload["llm_ports"] = llm_ports if isinstance(llm_ports, list) else [llm_ports]
|
{data_science_document_ai-1.57.0.dist-info → data_science_document_ai-1.58.0.dist-info}/WHEEL
RENAMED
|
File without changes
|