data-science-document-ai 1.43.2__py3-none-any.whl → 1.43.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: data-science-document-ai
3
- Version: 1.43.2
3
+ Version: 1.43.4
4
4
  Summary: "Document AI repo for data science"
5
5
  Author: Naomi Nguyen
6
6
  Author-email: naomi.nguyen@forto.com
@@ -1,13 +1,13 @@
1
- src/constants.py,sha256=TF_UblovdXZnKIb1lnyJwUqQncJCbzBVihoelI6foSU,3579
1
+ src/constants.py,sha256=rpYIecVLIBLh98YrJ8e5gdvM0bqrXJZWIKgFkUSn69g,3513
2
2
  src/constants_sandbox.py,sha256=Iu6HdjCoNSmOX0AwoL9qUQkhq_ZnIN5U9e-Q2UfNuGc,547
3
3
  src/docai.py,sha256=dHuR0ehVjUi1CnoNvdp_yxJtpU_HFXqAZ61ywdz7BEo,5655
4
- src/docai_processor_config.yaml,sha256=qOMmCIORpLQ_D-ytvejXxFvER0e0uGYuzPVdZBGv4Pc,2105
5
- src/excel_processing.py,sha256=gzP7QFCp4-n0FTevhWmXm-2UoDF0w0y5v39gsby0IV8,3135
4
+ src/docai_processor_config.yaml,sha256=81NUGs-u8UFJm6mc0ZOeeNQlhe9h0f35GhjTcwErvTA,1717
5
+ src/excel_processing.py,sha256=AppxrliVj7cLv1I_X7xC5bq4OPFAeiVNMNwcp-TZZDs,3466
6
6
  src/io.py,sha256=tOJpMyI-mP1AaXKG4UFudH47MHWzjWBgVahFJUcjGfs,4749
7
7
  src/llm.py,sha256=OE4IEIqcM-hYK9U7e0x1rAfcqdpeo4iXPHBp64L5Qz0,8199
8
8
  src/log_setup.py,sha256=RhHnpXqcl-ii4EJzRt47CF2R-Q3YPF68tepg_Kg7tkw,2895
9
- src/pdf_processing.py,sha256=0lmeaKwruAxqhk7NeCC4GU6Zlp0rQAmi0lbjlNTNCDc,17039
10
- src/postprocessing/common.py,sha256=wvlYI1S75r0q5xp9Yll89nOVWtwDd7hV4Sf0MIButA0,22150
9
+ src/pdf_processing.py,sha256=sr41bSMbH-WwWQ9uF5WKnsrkXJzCDirziv3TaS8hoPQ,17164
10
+ src/postprocessing/common.py,sha256=b0VpxM-levZi_8H5a9gDNjx-67W6F7xRFUDa7CJJGgQ,22214
11
11
  src/postprocessing/postprocess_booking_confirmation.py,sha256=nK32eDiBNbauyQz0oCa9eraysku8aqzrcoRFoWVumDU,4827
12
12
  src/postprocessing/postprocess_commercial_invoice.py,sha256=3I8ijluTZcOs_sMnFZxfkAPle0UFQ239EMuvZfDZVPg,1028
13
13
  src/postprocessing/postprocess_partner_invoice.py,sha256=koGR7dN37FqJcepdzkrzNBHuBBUuCp_3CrteScASqyE,10590
@@ -27,6 +27,7 @@ src/prompts/library/bookingConfirmation/yangming/placeholders.json,sha256=IpM9nm
27
27
  src/prompts/library/bookingConfirmation/yangming/prompt.txt,sha256=fYKfusDajDFw0v54-nv2iAqUSp2yCeOzc6G7AFe-h2w,3226
28
28
  src/prompts/library/bundeskasse/other/placeholders.json,sha256=1ll8AI58F2zRDSwQq_r0gxQdxlQB521l5CuiJ-8G6us,4068
29
29
  src/prompts/library/bundeskasse/other/prompt.txt,sha256=WV4D3ellIcB2cVmsZXCpbbHOShYY8VN_iZrYOuyoqzw,2937
30
+ src/prompts/library/commercialInvoice/other/placeholders.json,sha256=zUK2mg9MnHiEQRYF6VgTiUiq68WGy5f7_4qL63CWyR0,4700
30
31
  src/prompts/library/commercialInvoice/other/prompt.txt,sha256=6sowYMzrKvgmTDpDnAzkeG4OqA44e6-8aUKWRKNziBY,2699
31
32
  src/prompts/library/customsAssessment/other/prompt.txt,sha256=XSqWa3k9LM7dTiJtX8AKTp_0x5Z0pCNRKNUWaywwBlY,2191
32
33
  src/prompts/library/customsInvoice/other/placeholders.json,sha256=BnWYtl4sPooTHb_EHRIlrPawBrfHI8_QVas8zytbqyY,12172
@@ -40,6 +41,7 @@ src/prompts/library/draftMbl/other/prompt.txt,sha256=pj-kgPV51upLhDppSKfhc2s5ylg
40
41
  src/prompts/library/finalMbL/hapag-lloyd/prompt.txt,sha256=RhxEJ4eWikAQiE40cuPsssnzizge6AJYFTSJLGUmz_U,2326
41
42
  src/prompts/library/finalMbL/maersk/prompt.txt,sha256=4neW6buJirgoS84iDsy9ZcfQTaMeOFt92Emba01mzJA,2192
42
43
  src/prompts/library/finalMbL/other/prompt.txt,sha256=pj-kgPV51upLhDppSKfhc2s5ylgr06l4IxrkFYjE9uM,2241
44
+ src/prompts/library/packingList/other/placeholders.json,sha256=cGUUvEFoi4Lm0BAiyD29KbNFbUgzO1s7eit_qK3F0ig,4478
43
45
  src/prompts/library/packingList/other/prompt.txt,sha256=6Q9d0KBG6YWmNtzFivvmtQmitaUE2jytfwwc5YwsUgQ,2872
44
46
  src/prompts/library/partnerInvoice/other/placeholders.json,sha256=NX6ADT4gxLpP90uoNCYDbmfBvROxxVWRKK0lRFy1n9s,10897
45
47
  src/prompts/library/partnerInvoice/other/prompt.txt,sha256=fGUtMYWvhedmSiv9xShRv0cHXmEws1D9pQmZP1E2gl0,7806
@@ -52,6 +54,6 @@ src/prompts/prompt_library.py,sha256=jPxybNPPGH7mzonqtAOqmw5WcT-RtbGP0pvMqqP22hg
52
54
  src/setup.py,sha256=M-p5c8M9ejKcSZ9N86VtmtPc4TYLxe1_4_dxf6jpfVc,7262
53
55
  src/tms.py,sha256=UXbIo1QE--hIX6NZi5Qyp2R_CP338syrY9pCTPrfgnE,1741
54
56
  src/utils.py,sha256=cTF2A12jugKjXxGlNXEZQtfgcsIoaTtaU7zhVOOvXXA,16634
55
- data_science_document_ai-1.43.2.dist-info/METADATA,sha256=4FTsGLX2lW2bIDgXV0wRwUcKKvkMl3ZfbQokcRdTFY0,2152
56
- data_science_document_ai-1.43.2.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
57
- data_science_document_ai-1.43.2.dist-info/RECORD,,
57
+ data_science_document_ai-1.43.4.dist-info/METADATA,sha256=bcmTXEnl4r0z7IqelSFuCyfxNJjnPvEY2snX1WViH9s,2152
58
+ data_science_document_ai-1.43.4.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
59
+ data_science_document_ai-1.43.4.dist-info/RECORD,,
src/constants.py CHANGED
@@ -50,8 +50,6 @@ project_parameters = {
50
50
  "model_selector": {
51
51
  "stable": {
52
52
  "bookingConfirmation": 1,
53
- "packingList": 0,
54
- "commercialInvoice": 0,
55
53
  "finalMbL": 0,
56
54
  "draftMbl": 0,
57
55
  "arrivalNotice": 0,
@@ -13,20 +13,6 @@ model_config:
13
13
  author: "igor.tonko@forto.com"
14
14
  created_date: ""
15
15
 
16
- packingList:
17
- - id: "d967005bd9d45aeb"
18
- details:
19
- display_name: "doc_cap_packingList"
20
- author: "kumar.rajendrababu@forto.com"
21
- created_date: ""
22
-
23
- commercialInvoice:
24
- - id: "7d37236207f75758"
25
- details:
26
- display_name: "doc_cap_commercialInvoice"
27
- author: "kumar.rajendrababu@forto.com"
28
- created_date: ""
29
-
30
16
  finalMbL:
31
17
  - id: "1eda2f22d64b1b89"
32
18
  details:
src/excel_processing.py CHANGED
@@ -9,12 +9,12 @@ from src.postprocessing.common import llm_prediction_to_tuples
9
9
  logger = logging.getLogger(__name__)
10
10
 
11
11
  import asyncio
12
- import json
13
12
 
14
13
  import numpy as np
15
14
  import pandas as pd
16
15
 
17
16
  from src.llm import prompt_excel_extraction
17
+ from src.prompts.prompt_library import prompt_library
18
18
  from src.utils import estimate_page_count, generate_schema_structure, get_excel_sheets
19
19
 
20
20
 
@@ -67,7 +67,18 @@ async def extract_data_from_excel(
67
67
 
68
68
  """
69
69
  # Generate the response structure
70
- response_schema = generate_schema_structure(params, input_doc_type)
70
+ response_schema = (
71
+ prompt_library.library[input_doc_type]["other"]["placeholders"]
72
+ if input_doc_type
73
+ in [
74
+ "partnerInvoice",
75
+ "customsInvoice",
76
+ "bundeskasse",
77
+ "commercialInvoice",
78
+ "packingList",
79
+ ]
80
+ else generate_schema_structure(params, input_doc_type)
81
+ )
71
82
 
72
83
  # Load the Excel file and get ONLY the "visible" sheet names
73
84
  sheets, workbook = get_excel_sheets(file_content, mime_type)
src/pdf_processing.py CHANGED
@@ -36,7 +36,6 @@ from src.utils import (
36
36
  get_pdf_page_count,
37
37
  get_processor_name,
38
38
  run_background_tasks,
39
- transform_schema_strings,
40
39
  validate_based_on_schema,
41
40
  )
42
41
 
@@ -207,7 +206,14 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
207
206
  # get the schema placeholder from the Doc AI and generate the response structure
208
207
  response_schema = (
209
208
  prompt_library.library[input_doc_type]["other"]["placeholders"]
210
- if input_doc_type in ["partnerInvoice", "customsInvoice", "bundeskasse"]
209
+ if input_doc_type
210
+ in [
211
+ "partnerInvoice",
212
+ "customsInvoice",
213
+ "bundeskasse",
214
+ "commercialInvoice",
215
+ "packingList",
216
+ ]
211
217
  else generate_schema_structure(params, input_doc_type)
212
218
  )
213
219
 
@@ -338,10 +344,10 @@ async def extract_data_by_doctype(
338
344
  if_use_llm,
339
345
  isBetaTest=False,
340
346
  ):
341
- # Select LLM client (Using 2.5 Flash model for Bundeskasse)
347
+ # Select LLM client (Using 2.5 Pro model only for PI and customsInvoice)
342
348
  llm_client = (
343
349
  params["LlmClient_Flash"]
344
- if input_doc_type == "bundeskasse"
350
+ if input_doc_type not in ["customsInvoice", "partnerInvoice"]
345
351
  else params["LlmClient"]
346
352
  )
347
353
 
@@ -429,7 +429,10 @@ async def format_label(entity_k, entity_value, document_type_code, params):
429
429
  except ValueError as e:
430
430
  logger.info(f"ParserError: {e}")
431
431
 
432
- elif entity_key in ["invoicenumber", "creditnoteinvoicenumber"]:
432
+ elif (
433
+ entity_key in ["invoicenumber", "creditnoteinvoicenumber"]
434
+ and document_type_code == "bundeskasse"
435
+ ):
433
436
  formatted_value = clean_invoice_number(entity_value)
434
437
 
435
438
  elif entity_key in ("shipmentid", "partnerreference"):
@@ -0,0 +1,125 @@
1
+ {
2
+ "type": "OBJECT",
3
+ "properties": {
4
+ "consignee": {
5
+ "type": "string",
6
+ "nullable": true,
7
+ "description": "The receiver or buyer of the goods."
8
+ },
9
+ "currency": {
10
+ "type": "string",
11
+ "nullable": true,
12
+ "description": "The currency of the totalAmount."
13
+ },
14
+ "grossWeight": {
15
+ "type": "string",
16
+ "nullable": true,
17
+ "description": "The total gross weight of all the goods. Usually mentioned as G.W or GW or Gross Weight, etc.."
18
+ },
19
+ "incoterm": {
20
+ "type": "string",
21
+ "nullable": true,
22
+ "description": "An Incoterm is a 3 letter standardized trade term defining the responsibilities of buyers and sellers in international shipping and logistics. For example, FOB, CFR, DAP, CIF, etc..."
23
+ },
24
+ "invoiceDate": {
25
+ "type": "string",
26
+ "nullable": true,
27
+ "description": "A date that the invoice was created or issued."
28
+ },
29
+ "invoiceNumber": {
30
+ "type": "string",
31
+ "nullable": true,
32
+ "description": "The invoice number of the commercial invoice document."
33
+ },
34
+ "measurement": {
35
+ "type": "string",
36
+ "nullable": true,
37
+ "description": "The volume of the goods. Usually, it is measured in \"Cubic Meter (cbm)\" or dimensions. But volume in \"cbm\" is preferred."
38
+ },
39
+ "netWeight": {
40
+ "type": "string",
41
+ "nullable": true,
42
+ "description": "The total net weight of all the goods. Usually, mentioned as N.W or NW or Net Weight, etc.."
43
+ },
44
+ "shipper": {
45
+ "type": "string",
46
+ "nullable": true,
47
+ "description": "The seller or shipper of the goods."
48
+ },
49
+ "totalAmount": {
50
+ "type": "string",
51
+ "nullable": true,
52
+ "description": "The total amount of all the goods mentioned in the invoice."
53
+ },
54
+ "skus": {
55
+ "type": "ARRAY",
56
+ "items": {
57
+ "type": "OBJECT",
58
+ "properties": {
59
+ "amount": {
60
+ "type": "string",
61
+ "nullable": true,
62
+ "description": "Amount of the goods."
63
+ },
64
+ "containerNumber": {
65
+ "type": "string",
66
+ "nullable": true,
67
+ "description": "Container Number consists of 4 capital letters followed by 7 digits. Example: TEMU7972458. Usually mentioned as Container Number, CONTAINER NO. Containers, or Container / Truck No"
68
+ },
69
+ "currency": {
70
+ "type": "string",
71
+ "nullable": true,
72
+ "description": "The currency of the Amount. Usually mentioned in USD, EURO, CNY, $, or any other currency units and symbols."
73
+ },
74
+ "goodsDescription": {
75
+ "type": "string",
76
+ "nullable": true,
77
+ "description": "Description of the goods."
78
+ },
79
+ "grossWeight": {
80
+ "type": "string",
81
+ "nullable": true,
82
+ "description": "The gross weight of an individual product/goods. Usually, mentioned as G.W or GW or Gross Weight, etc.."
83
+ },
84
+ "hsCode": {
85
+ "type": "string",
86
+ "nullable": true,
87
+ "description": "The harmonized system code of a goods."
88
+ },
89
+ "materialNumber": {
90
+ "type": "string",
91
+ "nullable": true,
92
+ "description": "Material number of the product or goods."
93
+ },
94
+ "netWeight": {
95
+ "type": "string",
96
+ "nullable": true,
97
+ "description": "The net weight of an individual product/goods. Usually, mentioned as N.W or NW or Net Weight, etc.."
98
+ },
99
+ "packagingQuantity": {
100
+ "type": "string",
101
+ "nullable": true,
102
+ "description": "The quantity of the goods. Usually, the quantity is in pallets, PLT, cartons, CTNS, pieces, PCS, packages, boxes, etc. Please prioritize the packaging types based on their size, as follows: Pallets (PLT) >> Cartons (CTNS) >> Pieces (PCS). Extract the Larger packaging types that will have a lower count."
103
+ },
104
+ "packageType": {
105
+ "type": "string",
106
+ "nullable": true,
107
+ "description": "The packaging type is the unit of packagingQuantity. Example; pallets, PLT, cartons, CTNS, pieces, PCS, packages, etc. Sometimes, the packaging type is available in the column name of the packagingQuantity."
108
+ },
109
+ "poNumber": {
110
+ "type": "string",
111
+ "nullable": true,
112
+ "description": "Purchase order of the goods."
113
+ },
114
+ "skuNumber": {
115
+ "type": "string",
116
+ "nullable": true,
117
+ "description": "SKU number of the goods."
118
+ }
119
+ },
120
+ "required": []
121
+ }
122
+ }
123
+ },
124
+ "required": []
125
+ }
@@ -0,0 +1,98 @@
1
+ {
2
+ "type": "OBJECT",
3
+ "properties": {
4
+ "buyer": {
5
+ "type": "string",
6
+ "nullable": true,
7
+ "description": "The receiver or buyer of the goods."},
8
+ "grossWeight": {
9
+ "type": "string",
10
+ "nullable": true,
11
+ "description": "The total gross weight of all the goods. Usually mentioned as G.W or GW or Gross Weight, etc.."},
12
+ "invoiceNumber": {
13
+ "type": "string",
14
+ "nullable": true,
15
+ "description": "The invoice number"},
16
+ "netWeight": {
17
+ "type": "string",
18
+ "nullable": true,
19
+ "description": "The total net weight of all the goods. Usually mentioned as N.W or NW or Net Weight, etc.."},
20
+ "seller": {
21
+ "type": "string",
22
+ "nullable": true,
23
+ "description": "The seller or shipper of the goods."},
24
+ "skuData": {
25
+ "type": "ARRAY",
26
+ "items": {
27
+ "type": "OBJECT",
28
+ "properties": {
29
+ "containerNumber": {
30
+ "type": "string",
31
+ "nullable": true,
32
+ "description": "Container Number consists of 4 capital letters followed by 7 digits. Example: TEMU7972458. Usually mentioned as Container Number, CONTAINER NO. or Containers"},
33
+ "grossWeight": {
34
+ "type": "string",
35
+ "nullable": true,
36
+ "description": "The gross weight of the goods. Usually mentioned as G.W or GW or Gross Weight, etc.."},
37
+ "hsCode": {
38
+ "type": "string",
39
+ "nullable": true,
40
+ "description": "The harmonized system code of a goods."},
41
+ "measurements": {
42
+ "type": "string",
43
+ "nullable": true,
44
+ "description": "The volume of the goods. Usually, it is measured in 'Cubic Meter (cbm)' or dimensions. But volume in 'cbm' is preferred."},
45
+ "netWeight": {
46
+ "type": "string",
47
+ "nullable": true,
48
+ "description": "The net weight of the goods. Usually mentioned as N.W or NW or Net Weight, etc.."},
49
+ "packagingType": {
50
+ "type": "string",
51
+ "nullable": true,
52
+ "description": "The packaging type is the unit of quantityShipped. Example; pallets, PLT, cartons, CTNS, pieces, PCS, packages, etc. Sometimes, the packaging type is available in the column name of the quantityShipped."},
53
+ "poNumber": {
54
+ "type": "string",
55
+ "nullable": true,
56
+ "description": "Purchase order of the goods."},
57
+ "poPosition": {
58
+ "type": "string",
59
+ "nullable": true,
60
+ "description": "PO position refers to the specific item or line associated with a Purchase Order (PO). It represents the position or line number in the PO that corresponds to the items being shipped."},
61
+ "quantityShipped": {
62
+ "type": "string",
63
+ "nullable": true,
64
+ "description": "The quantity of the goods. Usually quantity is in pallets, PLT, cartons, CTNS, pieces, PCS, packages, boxes, etc. Please prioritize the packaging types based on their size, as follows: Pallets (PLT) >> Cartons (CTNS) >> Pieces (PCS). Extract the Larger packaging types that will have a lower count."},
65
+ "sealNumber": {
66
+ "type": "string",
67
+ "nullable": true,
68
+ "description": "A unique number associated with the container number"},
69
+ "skuDescription": {
70
+ "type": "string",
71
+ "nullable": true,
72
+ "description": "Description of the goods."},
73
+ "skuNumbers": {
74
+ "type": "string",
75
+ "nullable": true,
76
+ "description": "SKU number of the goods."}
77
+ },
78
+ "required": [
79
+ "skuNumbers",
80
+ "quantityShipped",
81
+ "skuDescription",
82
+ "grossWeight",
83
+ "netWeight",
84
+ "packagingType"
85
+ ]
86
+ }
87
+ },
88
+ "totalPackagingType": {
89
+ "type": "string",
90
+ "nullable": true,
91
+ "description": "The packaging type of all the goods associated with the totalQuantityShipped. It is the unit of totalQuantityShipped. Usually pallets, PLT, cartons, CTNS, pieces, PCS, packages, etc."},
92
+ "totalQuantityShipped": {
93
+ "type": "string",
94
+ "nullable": true,
95
+ "description": "The total quantity of the goods. Usually quantity is in pallets, cartons, pieces, packages, boxes, etc. Please prioritize the packaging types based on their size, as follows: Pallets >> Cartons >> Pieces. Larger packaging types will have a lower count."}
96
+ },
97
+ "required": []
98
+ }