data-science-document-ai 1.13.0__py3-none-any.whl → 1.56.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_science_document_ai-1.13.0.dist-info → data_science_document_ai-1.56.1.dist-info}/METADATA +7 -2
- data_science_document_ai-1.56.1.dist-info/RECORD +60 -0
- {data_science_document_ai-1.13.0.dist-info → data_science_document_ai-1.56.1.dist-info}/WHEEL +1 -1
- src/constants.py +42 -12
- src/constants_sandbox.py +2 -22
- src/docai.py +18 -7
- src/docai_processor_config.yaml +0 -64
- src/excel_processing.py +34 -15
- src/io.py +74 -6
- src/llm.py +12 -34
- src/pdf_processing.py +228 -78
- src/postprocessing/common.py +495 -618
- src/postprocessing/postprocess_partner_invoice.py +383 -27
- src/prompts/library/arrivalNotice/other/placeholders.json +70 -0
- src/prompts/library/arrivalNotice/other/prompt.txt +40 -0
- src/prompts/library/bookingConfirmation/evergreen/placeholders.json +17 -17
- src/prompts/library/bookingConfirmation/evergreen/prompt.txt +1 -0
- src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +18 -18
- src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +1 -1
- src/prompts/library/bookingConfirmation/maersk/placeholders.json +17 -17
- src/prompts/library/bookingConfirmation/maersk/prompt.txt +1 -1
- src/prompts/library/bookingConfirmation/msc/placeholders.json +17 -17
- src/prompts/library/bookingConfirmation/msc/prompt.txt +1 -1
- src/prompts/library/bookingConfirmation/oocl/placeholders.json +17 -17
- src/prompts/library/bookingConfirmation/oocl/prompt.txt +3 -1
- src/prompts/library/bookingConfirmation/other/placeholders.json +17 -17
- src/prompts/library/bookingConfirmation/other/prompt.txt +1 -1
- src/prompts/library/bookingConfirmation/yangming/placeholders.json +17 -17
- src/prompts/library/bookingConfirmation/yangming/prompt.txt +1 -1
- src/prompts/library/bundeskasse/other/placeholders.json +113 -0
- src/prompts/library/bundeskasse/other/prompt.txt +48 -0
- src/prompts/library/commercialInvoice/other/placeholders.json +125 -0
- src/prompts/library/commercialInvoice/other/prompt.txt +2 -1
- src/prompts/library/customsAssessment/other/placeholders.json +67 -16
- src/prompts/library/customsAssessment/other/prompt.txt +24 -37
- src/prompts/library/customsInvoice/other/placeholders.json +205 -0
- src/prompts/library/customsInvoice/other/prompt.txt +105 -0
- src/prompts/library/deliveryOrder/other/placeholders.json +79 -28
- src/prompts/library/deliveryOrder/other/prompt.txt +26 -40
- src/prompts/library/draftMbl/other/placeholders.json +33 -33
- src/prompts/library/draftMbl/other/prompt.txt +34 -44
- src/prompts/library/finalMbL/other/placeholders.json +34 -34
- src/prompts/library/finalMbL/other/prompt.txt +34 -44
- src/prompts/library/packingList/other/placeholders.json +98 -0
- src/prompts/library/packingList/other/prompt.txt +1 -1
- src/prompts/library/partnerInvoice/other/placeholders.json +165 -45
- src/prompts/library/partnerInvoice/other/prompt.txt +82 -44
- src/prompts/library/preprocessing/carrier/placeholders.json +0 -16
- src/prompts/library/shippingInstruction/other/placeholders.json +115 -0
- src/prompts/library/shippingInstruction/other/prompt.txt +28 -15
- src/setup.py +73 -63
- src/utils.py +207 -30
- data_science_document_ai-1.13.0.dist-info/RECORD +0 -55
- src/prompts/library/draftMbl/hapag-lloyd/prompt.txt +0 -44
- src/prompts/library/draftMbl/maersk/prompt.txt +0 -17
- src/prompts/library/finalMbL/hapag-lloyd/prompt.txt +0 -44
- src/prompts/library/finalMbL/maersk/prompt.txt +0 -17
{data_science_document_ai-1.13.0.dist-info → data_science_document_ai-1.56.1.dist-info}/METADATA
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: data-science-document-ai
|
|
3
|
-
Version: 1.
|
|
3
|
+
Version: 1.56.1
|
|
4
4
|
Summary: "Document AI repo for data science"
|
|
5
5
|
Author: Naomi Nguyen
|
|
6
6
|
Author-email: naomi.nguyen@forto.com
|
|
@@ -14,6 +14,7 @@ Requires-Dist: db-dtypes (>=1.2.0,<2.0.0)
|
|
|
14
14
|
Requires-Dist: ddtrace (>=2.20.0,<3.0.0)
|
|
15
15
|
Requires-Dist: fastapi (>=0.109.2,<0.110.0)
|
|
16
16
|
Requires-Dist: fitz (>=0.0.1.dev2,<0.0.2)
|
|
17
|
+
Requires-Dist: fuzzywuzzy (>=0.18.0,<0.19.0)
|
|
17
18
|
Requires-Dist: google (>=3.0.0,<4.0.0)
|
|
18
19
|
Requires-Dist: google-api-python-client (>=2.89.0,<3.0.0)
|
|
19
20
|
Requires-Dist: google-auth-oauthlib (>=1.0.0,<2.0.0)
|
|
@@ -27,16 +28,20 @@ Requires-Dist: gspread (>=6.1.0,<7.0.0)
|
|
|
27
28
|
Requires-Dist: httpx (>=0.26.0,<0.27.0)
|
|
28
29
|
Requires-Dist: jupyter (>=1.0.0,<2.0.0)
|
|
29
30
|
Requires-Dist: kubernetes (>=30.1.0,<31.0.0)
|
|
31
|
+
Requires-Dist: nltk (>=3.9.1,<4.0.0)
|
|
30
32
|
Requires-Dist: numpy (>=1.25.1,<2.0.0)
|
|
31
33
|
Requires-Dist: openai (>=1.53.0,<2.0.0)
|
|
32
34
|
Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
|
|
33
35
|
Requires-Dist: pandas (>=2.0.3,<3.0.0)
|
|
34
36
|
Requires-Dist: parameterized (>=0.9.0,<0.10.0)
|
|
35
37
|
Requires-Dist: pdf2image (>=1.17.0,<2.0.0)
|
|
38
|
+
Requires-Dist: pgzip (>=0.3.5,<0.4.0)
|
|
36
39
|
Requires-Dist: pyarrow (==16.1.0)
|
|
37
40
|
Requires-Dist: pymupdf (>=1.23.26,<2.0.0)
|
|
41
|
+
Requires-Dist: pypdf (>=6.1.2,<7.0.0)
|
|
38
42
|
Requires-Dist: python-dotenv (>=1.0.0,<2.0.0)
|
|
39
43
|
Requires-Dist: python-multipart (>=0.0.7,<0.0.8)
|
|
44
|
+
Requires-Dist: rapidfuzz (>=3.12.2,<4.0.0)
|
|
40
45
|
Requires-Dist: requests-toolbelt (>=1.0.0,<2.0.0)
|
|
41
46
|
Requires-Dist: tabulate (>=0.9.0,<0.10.0)
|
|
42
47
|
Requires-Dist: toml (>=0.10.2,<0.11.0)
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
src/constants.py,sha256=H43Az9AtoBKfcq9yY4TQQJY8DfdILV5kXy8EMtRaWYA,3583
|
|
2
|
+
src/constants_sandbox.py,sha256=Iu6HdjCoNSmOX0AwoL9qUQkhq_ZnIN5U9e-Q2UfNuGc,547
|
|
3
|
+
src/docai.py,sha256=dHuR0ehVjUi1CnoNvdp_yxJtpU_HFXqAZ61ywdz7BEo,5655
|
|
4
|
+
src/docai_processor_config.yaml,sha256=4yKKZPvFCA-3S56jDYSqMGKXGFND-768OiU2seRiAzE,604
|
|
5
|
+
src/excel_processing.py,sha256=TRgAzSHvL1WKbUgjHtpXL701bPhiWGH7kk3S6e1UPaA,3074
|
|
6
|
+
src/io.py,sha256=rYjXVLlriEacw1uNuPIYhg12bXNu48Qs9GYMY2YcVTE,5563
|
|
7
|
+
src/llm.py,sha256=a7UYA4ITUNjzct_2fHgM-bma_XWc28VC0FV71g9tnUI,7137
|
|
8
|
+
src/log_setup.py,sha256=RhHnpXqcl-ii4EJzRt47CF2R-Q3YPF68tepg_Kg7tkw,2895
|
|
9
|
+
src/pdf_processing.py,sha256=81fS2xL36n9QgB7DpXe7SCS-Lyz11cFDgccYMK3ZVkk,20026
|
|
10
|
+
src/postprocessing/common.py,sha256=dagAg0hZGuZc03bXdfOolxekewMEVUfz917IGCiAtWI,26118
|
|
11
|
+
src/postprocessing/postprocess_booking_confirmation.py,sha256=nK32eDiBNbauyQz0oCa9eraysku8aqzrcoRFoWVumDU,4827
|
|
12
|
+
src/postprocessing/postprocess_commercial_invoice.py,sha256=3I8ijluTZcOs_sMnFZxfkAPle0UFQ239EMuvZfDZVPg,1028
|
|
13
|
+
src/postprocessing/postprocess_partner_invoice.py,sha256=WuaTQK5D09dV_QNrh29ZoKX9IvQn2Ub-WnAMyRjCsvI,14240
|
|
14
|
+
src/prompts/library/arrivalNotice/other/placeholders.json,sha256=1vzly1amgyKt3jr2JJQbb24kNZsnI289iduvoUo5dJU,3061
|
|
15
|
+
src/prompts/library/arrivalNotice/other/prompt.txt,sha256=QNuU-BvMA8VbdupVNapad4O3WmCotH5cKNxImRMbKDk,2906
|
|
16
|
+
src/prompts/library/bookingConfirmation/evergreen/placeholders.json,sha256=IpM9nmSPdyroliZfXB1-NDCjiHZX_Ff5BH7-scNhGqE,1406
|
|
17
|
+
src/prompts/library/bookingConfirmation/evergreen/prompt.txt,sha256=5ivskCG831M2scW3oqQaoltXIyHV-n6DYUygWycXxjw,2755
|
|
18
|
+
src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json,sha256=hMPNt9s3LuxR85AxYy7bPcCDleug6gSwVjefm3ismWY,1405
|
|
19
|
+
src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt,sha256=XgfhrFTXLJ467L4Cer77K0KTPtWTg_-QJXCsltvLlpI,3430
|
|
20
|
+
src/prompts/library/bookingConfirmation/maersk/placeholders.json,sha256=6p_IQMA1PUgGZqjf_by4ja9jK27ba4loYhEpIa7Oxx4,1406
|
|
21
|
+
src/prompts/library/bookingConfirmation/maersk/prompt.txt,sha256=t-yh1dOrcRa0fm0VPFC1xCRBf0R0Zjp9j_Hb31aZS1w,3223
|
|
22
|
+
src/prompts/library/bookingConfirmation/msc/placeholders.json,sha256=IpM9nmSPdyroliZfXB1-NDCjiHZX_Ff5BH7-scNhGqE,1406
|
|
23
|
+
src/prompts/library/bookingConfirmation/msc/prompt.txt,sha256=_Jfioislp7SNs2BEXoklvnTPVXe6Z0M6myD1IWnBFYQ,4705
|
|
24
|
+
src/prompts/library/bookingConfirmation/oocl/placeholders.json,sha256=JTtWvLSsoxN7huXY8ZNqqPkODM-DOs5wu3YvNHOna3k,1404
|
|
25
|
+
src/prompts/library/bookingConfirmation/oocl/prompt.txt,sha256=xNTrJdUtDalcP3AKkfRiOnHjAdRCbcTvehcBQKurRj0,2201
|
|
26
|
+
src/prompts/library/bookingConfirmation/other/placeholders.json,sha256=IpM9nmSPdyroliZfXB1-NDCjiHZX_Ff5BH7-scNhGqE,1406
|
|
27
|
+
src/prompts/library/bookingConfirmation/other/prompt.txt,sha256=kUK7NgVNDYFMnqOcIblCwWSw2SC0YQEtHsYrspiVUMo,3379
|
|
28
|
+
src/prompts/library/bookingConfirmation/yangming/placeholders.json,sha256=IpM9nmSPdyroliZfXB1-NDCjiHZX_Ff5BH7-scNhGqE,1406
|
|
29
|
+
src/prompts/library/bookingConfirmation/yangming/prompt.txt,sha256=fYKfusDajDFw0v54-nv2iAqUSp2yCeOzc6G7AFe-h2w,3226
|
|
30
|
+
src/prompts/library/bundeskasse/other/placeholders.json,sha256=7xKzi_ypkIICO9nrEl45W9G7-h33uWVRVWnpg2b5lUg,4288
|
|
31
|
+
src/prompts/library/bundeskasse/other/prompt.txt,sha256=miNYoqRZEd6Z1LNisTahX1-tenzr5kEpRA6gvPH7NCw,3316
|
|
32
|
+
src/prompts/library/commercialInvoice/other/placeholders.json,sha256=zUK2mg9MnHiEQRYF6VgTiUiq68WGy5f7_4qL63CWyR0,4700
|
|
33
|
+
src/prompts/library/commercialInvoice/other/prompt.txt,sha256=CJapcVrmcvynJUanETDklkzU-0N9hHdhq5wL4MK7OIY,2683
|
|
34
|
+
src/prompts/library/customsAssessment/other/placeholders.json,sha256=scIV--C9HNWAQbU9zEz3GT_FoAvJqbfuY85YUtt7t-Q,3850
|
|
35
|
+
src/prompts/library/customsAssessment/other/prompt.txt,sha256=z3FuoHZ588Pz1WBJDW7ISAC3J6n7hPJCcS92CdHDTFw,2494
|
|
36
|
+
src/prompts/library/customsInvoice/other/placeholders.json,sha256=BnWYtl4sPooTHb_EHRIlrPawBrfHI8_QVas8zytbqyY,12172
|
|
37
|
+
src/prompts/library/customsInvoice/other/prompt.txt,sha256=hUBDhocFdHTiWdEPgEE8yKHqpIYOfOj-j9CvZd-3YZc,9941
|
|
38
|
+
src/prompts/library/deliveryOrder/other/placeholders.json,sha256=j-9F4V3yDg4610PPsOwU3oOj_S9vAvAB9Ix155WGIwc,3827
|
|
39
|
+
src/prompts/library/deliveryOrder/other/prompt.txt,sha256=RD076vq0x0IjoEVQfh-G0u4nxITCpgKZGrwMlR9YAvk,2695
|
|
40
|
+
src/prompts/library/draftMbl/other/placeholders.json,sha256=Gn8kQ8cMmrzRGLSFH7_8wO1_j2jxhqHd4zeivZP2SjU,4304
|
|
41
|
+
src/prompts/library/draftMbl/other/prompt.txt,sha256=4RjlGT2OFmcBCUJhuCnO9GtmCn3vVesXHi_ml2g3dK8,2386
|
|
42
|
+
src/prompts/library/finalMbL/other/placeholders.json,sha256=Gn8kQ8cMmrzRGLSFH7_8wO1_j2jxhqHd4zeivZP2SjU,4304
|
|
43
|
+
src/prompts/library/finalMbL/other/prompt.txt,sha256=cyeKjK94sepqXiLEeZKB4VpmT0-nqXALP4dih-B67M8,2386
|
|
44
|
+
src/prompts/library/packingList/other/placeholders.json,sha256=cGUUvEFoi4Lm0BAiyD29KbNFbUgzO1s7eit_qK3F0ig,4478
|
|
45
|
+
src/prompts/library/packingList/other/prompt.txt,sha256=6Q9d0KBG6YWmNtzFivvmtQmitaUE2jytfwwc5YwsUgQ,2872
|
|
46
|
+
src/prompts/library/partnerInvoice/other/placeholders.json,sha256=NX6ADT4gxLpP90uoNCYDbmfBvROxxVWRKK0lRFy1n9s,10897
|
|
47
|
+
src/prompts/library/partnerInvoice/other/prompt.txt,sha256=A3nw6QfraU1N6Aui4TC7eFofG3rUyo9cz8Ha1iQbMpU,8141
|
|
48
|
+
src/prompts/library/postprocessing/port_code/placeholders.json,sha256=2TiXf3zSzrglOMPtDOlCntIa5RSvyZQAKG2-IgrCY5A,22
|
|
49
|
+
src/prompts/library/postprocessing/port_code/prompt_port_code.txt,sha256=--1wunSqEr2ox958lEhjO-0JFBfOLzA3qfKYIzG_Iok,884
|
|
50
|
+
src/prompts/library/preprocessing/carrier/placeholders.json,sha256=tQeVDtvembhVqvel9vGoy4qcKp1hOvg-bLCgZRdQj0g,192
|
|
51
|
+
src/prompts/library/preprocessing/carrier/prompt.txt,sha256=NLvRZQCZ6aWC1yTr7Q93jK5z7Vi_b4HBaiFYYnIsO-w,134
|
|
52
|
+
src/prompts/library/shippingInstruction/other/placeholders.json,sha256=eK4AeMfORkGMWVYcqH7NjB56Zb4swHTvcQD5UQbTryg,6374
|
|
53
|
+
src/prompts/library/shippingInstruction/other/prompt.txt,sha256=CbrqlKMtB-sVY-8E460KP1KNmz169YVPMrH3-uEldPg,2135
|
|
54
|
+
src/prompts/prompt_library.py,sha256=VJWHeXN-s501C2GiidIIvQQuZdU6T1R27hE2dKBiI40,2555
|
|
55
|
+
src/setup.py,sha256=yb0Pz1RI-uId5lEjgQrj1Pqo9FvwG9vs0HXRVbyST2M,7186
|
|
56
|
+
src/tms.py,sha256=UXbIo1QE--hIX6NZi5Qyp2R_CP338syrY9pCTPrfgnE,1741
|
|
57
|
+
src/utils.py,sha256=Ow5_Jals88o8mbZ1BoHfZpHZoCfig_UQb5aalH-mpWE,17278
|
|
58
|
+
data_science_document_ai-1.56.1.dist-info/METADATA,sha256=4rIhyVd5XG02M7f9l2UYjH6r-pjzpNiobuZ-v-trvtE,2152
|
|
59
|
+
data_science_document_ai-1.56.1.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
|
|
60
|
+
data_science_document_ai-1.56.1.dist-info/RECORD,,
|
src/constants.py
CHANGED
|
@@ -5,18 +5,31 @@ project_parameters = {
|
|
|
5
5
|
"project_name": "document-ai",
|
|
6
6
|
"project_hash": "ceb0ac54",
|
|
7
7
|
# Google related parameters
|
|
8
|
-
"bq_project_id": "data-pipeline-276214",
|
|
9
8
|
"g_ai_project_name": "forto-data-science-production",
|
|
10
9
|
"g_ai_project_id": "738250249861",
|
|
11
10
|
"g_api_endpoint": "eu-documentai.googleapis.com",
|
|
12
11
|
"g_location": "eu",
|
|
13
12
|
"g_region": "europe-west1",
|
|
14
13
|
# Google Cloud Storage
|
|
14
|
+
"doc_ai_bucket_project_name": "forto-data-science-production",
|
|
15
15
|
"doc_ai_bucket_name": "ds-document-capture",
|
|
16
16
|
"doc_ai_bucket_batch_input": "ds-batch-process-docs",
|
|
17
17
|
"doc_ai_bucket_batch_output": "ds-batch-process-output",
|
|
18
18
|
# Paths
|
|
19
19
|
"folder_data": "data",
|
|
20
|
+
# Fuzzy lookup
|
|
21
|
+
"g_model_fuzzy_lookup_folder": "fuzzy_lookup",
|
|
22
|
+
"item_code_lookup": "line_item_kvp_table.json",
|
|
23
|
+
"intermodal_partners": "intermodal_partners.json",
|
|
24
|
+
"invoice_classification_lookup": "invoice_classification.json",
|
|
25
|
+
"reverse_charge_sentence_lookup": "reverse_charge_sentences.json",
|
|
26
|
+
# Fuzzy logic params
|
|
27
|
+
"fuzzy_threshold_item_code": 92,
|
|
28
|
+
"fuzzy_threshold_reverse_charge": 80,
|
|
29
|
+
"fuzzy_threshold_invoice_classification": 70,
|
|
30
|
+
# Chunking params
|
|
31
|
+
"chunk_size": 1, # page (do not change this without changing the page number logic)
|
|
32
|
+
"chunk_after": 10, # pages
|
|
20
33
|
# Big Query
|
|
21
34
|
"g_ai_gbq_db_schema": "document_ai",
|
|
22
35
|
"g_ai_gbq_db_table_out": "document_ai_api_calls_v1",
|
|
@@ -24,18 +37,23 @@ project_parameters = {
|
|
|
24
37
|
# models metadata (confidence),
|
|
25
38
|
"g_model_data_folder": "models",
|
|
26
39
|
"local_model_data_folder": "data",
|
|
40
|
+
"released_doc_types": {
|
|
41
|
+
"bookingConfirmation",
|
|
42
|
+
"packingList",
|
|
43
|
+
"commercialInvoice",
|
|
44
|
+
"finalMbL",
|
|
45
|
+
"draftMbl",
|
|
46
|
+
"arrivalNotice",
|
|
47
|
+
"shippingInstruction",
|
|
48
|
+
"customsAssessment",
|
|
49
|
+
"deliveryOrder",
|
|
50
|
+
"partnerInvoice",
|
|
51
|
+
"customsInvoice",
|
|
52
|
+
"bundeskasse",
|
|
53
|
+
},
|
|
27
54
|
"model_selector": {
|
|
28
55
|
"stable": {
|
|
29
56
|
"bookingConfirmation": 1,
|
|
30
|
-
"packingList": 0,
|
|
31
|
-
"commercialInvoice": 0,
|
|
32
|
-
"finalMbL": 0,
|
|
33
|
-
"draftMbl": 0,
|
|
34
|
-
"arrivalNotice": 0,
|
|
35
|
-
"shippingInstruction": 0,
|
|
36
|
-
"customsAssessment": 0,
|
|
37
|
-
"deliveryOrder": 0,
|
|
38
|
-
"partnerInvoice": 0,
|
|
39
57
|
},
|
|
40
58
|
"beta": {
|
|
41
59
|
"bookingConfirmation": 0,
|
|
@@ -46,22 +64,34 @@ project_parameters = {
|
|
|
46
64
|
# LLM model parameters
|
|
47
65
|
"gemini_params": {
|
|
48
66
|
"temperature": 0,
|
|
49
|
-
"maxOutputTokens":
|
|
67
|
+
"maxOutputTokens": 65536,
|
|
68
|
+
"top_p": 0.8,
|
|
69
|
+
"top_k": 40,
|
|
70
|
+
"seed": 42,
|
|
71
|
+
"model_id": "gemini-2.5-pro",
|
|
72
|
+
},
|
|
73
|
+
"gemini_flash_params": {
|
|
74
|
+
"temperature": 0,
|
|
75
|
+
"maxOutputTokens": 65536,
|
|
50
76
|
"top_p": 0.8,
|
|
51
77
|
"top_k": 40,
|
|
52
78
|
"seed": 42,
|
|
53
|
-
"model_id": "gemini-2.
|
|
79
|
+
"model_id": "gemini-2.5-flash",
|
|
54
80
|
},
|
|
55
81
|
# Key to combine the LLM results with the Doc Ai results
|
|
56
82
|
"key_to_combine": {
|
|
57
83
|
"bookingConfirmation": ["transportLegs"],
|
|
84
|
+
"arrivalNotice": ["containers"],
|
|
58
85
|
"finalMbL": ["containers"],
|
|
59
86
|
"draftMbl": ["containers"],
|
|
87
|
+
"deliveryOrder": ["Equipment", "TransportLeg"],
|
|
60
88
|
"customsAssessment": ["containers"],
|
|
61
89
|
"packingList": ["skuData"],
|
|
62
90
|
"commercialInvoice": ["skus"],
|
|
63
91
|
"shippingInstruction": ["containers"],
|
|
64
92
|
"partnerInvoice": ["lineItem"],
|
|
93
|
+
"customsInvoice": ["lineItem"],
|
|
94
|
+
"bundeskasse": ["lineItem"],
|
|
65
95
|
},
|
|
66
96
|
}
|
|
67
97
|
|
src/constants_sandbox.py
CHANGED
|
@@ -5,28 +5,8 @@ project_parameters_sandbox = {
|
|
|
5
5
|
"g_ai_project_name": "forto-data-science-sandbox",
|
|
6
6
|
"g_ai_project_id": "882852108312",
|
|
7
7
|
# Google Cloud Storage
|
|
8
|
-
"
|
|
8
|
+
"doc_ai_bucket_project_name": "forto-data-science-sandbox",
|
|
9
|
+
"doc_ai_bucket_name": "ds_document_capture",
|
|
9
10
|
"doc_ai_bucket_batch_input": "ds_batch_process_docs",
|
|
10
11
|
"doc_ai_bucket_batch_output": "ds_batch_process_output",
|
|
11
|
-
"excluded_endpoints": ["/healthz", "/", "/metrics", "/healthz/"],
|
|
12
|
-
"model_selector": {
|
|
13
|
-
"stable": {
|
|
14
|
-
"bookingConfirmation": 1,
|
|
15
|
-
"packingList": 0,
|
|
16
|
-
"commercialInvoice": 0,
|
|
17
|
-
"finalMbL": 0,
|
|
18
|
-
"draftMbl": 0,
|
|
19
|
-
"arrivalNotice": 0,
|
|
20
|
-
"shippingInstruction": 0,
|
|
21
|
-
"customsAssessment": 0,
|
|
22
|
-
"deliveryOrder": 0,
|
|
23
|
-
"partnerInvoice": 0,
|
|
24
|
-
},
|
|
25
|
-
"beta": {
|
|
26
|
-
"bookingConfirmation": 0,
|
|
27
|
-
},
|
|
28
|
-
},
|
|
29
|
-
# this is the model selector for the model
|
|
30
|
-
# to be used from the model_config.yaml file based on the environment,
|
|
31
|
-
# 0 mean the first model in the list
|
|
32
12
|
}
|
src/docai.py
CHANGED
|
@@ -3,11 +3,16 @@ import re
|
|
|
3
3
|
|
|
4
4
|
from google.cloud import documentai
|
|
5
5
|
|
|
6
|
-
from src.io import
|
|
6
|
+
from src.io import (
|
|
7
|
+
delete_folder_from_bucket,
|
|
8
|
+
get_gcp_labels,
|
|
9
|
+
logger,
|
|
10
|
+
upload_pdf_to_bucket,
|
|
11
|
+
)
|
|
7
12
|
from src.utils import cache_on_disk
|
|
8
13
|
|
|
9
14
|
|
|
10
|
-
async def _process_pdf_w_docai(image_content, client, processor_name):
|
|
15
|
+
async def _process_pdf_w_docai(image_content, client, processor_name, doc_type=None):
|
|
11
16
|
"""Process the PDF using Document AI.
|
|
12
17
|
|
|
13
18
|
Args:
|
|
@@ -15,6 +20,7 @@ async def _process_pdf_w_docai(image_content, client, processor_name):
|
|
|
15
20
|
client: The Document AI client.
|
|
16
21
|
processor_name (str): The name of the processor to be used.
|
|
17
22
|
e.g.: projects/{project_id}/locations/{location}/processor/{processor_id}
|
|
23
|
+
doc_type (str, optional): Document type for cost tracking labels.
|
|
18
24
|
|
|
19
25
|
Returns:
|
|
20
26
|
The processed document.
|
|
@@ -24,10 +30,11 @@ async def _process_pdf_w_docai(image_content, client, processor_name):
|
|
|
24
30
|
content=image_content, mime_type="application/pdf"
|
|
25
31
|
)
|
|
26
32
|
|
|
27
|
-
# Configure the process request
|
|
33
|
+
# Configure the process request with labels for cost tracking
|
|
28
34
|
request = documentai.ProcessRequest(
|
|
29
35
|
name=processor_name,
|
|
30
36
|
raw_document=raw_document, # field_mask=field_mask
|
|
37
|
+
labels=get_gcp_labels(doc_type=doc_type),
|
|
31
38
|
)
|
|
32
39
|
result = await cache_on_disk(client.process_document, request=request)
|
|
33
40
|
|
|
@@ -35,7 +42,7 @@ async def _process_pdf_w_docai(image_content, client, processor_name):
|
|
|
35
42
|
|
|
36
43
|
|
|
37
44
|
async def _batch_process_pdf_w_docai(
|
|
38
|
-
params, image_content, client, processor_name, timeout=1200
|
|
45
|
+
params, image_content, client, processor_name, timeout=1200, doc_type=None
|
|
39
46
|
):
|
|
40
47
|
"""Process the PDF using Document AI Batch Process API.
|
|
41
48
|
|
|
@@ -45,6 +52,7 @@ async def _batch_process_pdf_w_docai(
|
|
|
45
52
|
processor_name (str): The name of the processor to be used.
|
|
46
53
|
e.g.: projects/{project_id}/locations/{location}/processor/{processor_id}
|
|
47
54
|
timeout (int, optional): The timeout in seconds. Defaults to 1200.
|
|
55
|
+
doc_type (str, optional): Document type for cost tracking labels.
|
|
48
56
|
|
|
49
57
|
Returns:
|
|
50
58
|
The processed document.
|
|
@@ -72,11 +80,12 @@ async def _batch_process_pdf_w_docai(
|
|
|
72
80
|
# Where to write results
|
|
73
81
|
output_config = documentai.DocumentOutputConfig(gcs_output_config=gcs_output_config)
|
|
74
82
|
|
|
75
|
-
# The full resource name of the processor
|
|
83
|
+
# The full resource name of the processor with labels for cost tracking
|
|
76
84
|
request = documentai.BatchProcessRequest(
|
|
77
85
|
name=processor_name,
|
|
78
86
|
input_documents=input_config,
|
|
79
87
|
document_output_config=output_config,
|
|
88
|
+
labels=get_gcp_labels(doc_type=doc_type),
|
|
80
89
|
)
|
|
81
90
|
|
|
82
91
|
# BatchProcess returns a Long Running Operation (LRO)
|
|
@@ -130,8 +139,10 @@ async def _batch_process_pdf_w_docai(
|
|
|
130
139
|
)
|
|
131
140
|
|
|
132
141
|
# Delete the temporary file and the output file from the bucket
|
|
133
|
-
delete_folder_from_bucket(
|
|
134
|
-
|
|
142
|
+
delete_folder_from_bucket(
|
|
143
|
+
params, params["doc_ai_bucket_batch_input"], "temp.pdf"
|
|
144
|
+
)
|
|
145
|
+
delete_folder_from_bucket(params, output_bucket, output_prefix)
|
|
135
146
|
logger.info("Batch Process Completed!")
|
|
136
147
|
|
|
137
148
|
return result_document
|
src/docai_processor_config.yaml
CHANGED
|
@@ -13,70 +13,6 @@ model_config:
|
|
|
13
13
|
author: "igor.tonko@forto.com"
|
|
14
14
|
created_date: ""
|
|
15
15
|
|
|
16
|
-
packingList:
|
|
17
|
-
- id: "d967005bd9d45aeb"
|
|
18
|
-
details:
|
|
19
|
-
display_name: "doc_cap_packingList"
|
|
20
|
-
author: "kumar.rajendrababu@forto.com"
|
|
21
|
-
created_date: ""
|
|
22
|
-
|
|
23
|
-
commercialInvoice:
|
|
24
|
-
- id: "7d37236207f75758"
|
|
25
|
-
details:
|
|
26
|
-
display_name: "doc_cap_commercialInvoice"
|
|
27
|
-
author: "kumar.rajendrababu@forto.com"
|
|
28
|
-
created_date: ""
|
|
29
|
-
|
|
30
|
-
finalMbL:
|
|
31
|
-
- id: "1eda2f22d64b1b89"
|
|
32
|
-
details:
|
|
33
|
-
display_name: "doc_cap_finalMbL"
|
|
34
|
-
author: "igor.tonko@forto.com"
|
|
35
|
-
created_date: ""
|
|
36
|
-
|
|
37
|
-
draftMbl:
|
|
38
|
-
- id: "1eda2f22d64b1b89"
|
|
39
|
-
details:
|
|
40
|
-
display_name: "doc_cap_finalMbL"
|
|
41
|
-
author: "igor.tonko@forto.com"
|
|
42
|
-
created_date: ""
|
|
43
|
-
|
|
44
|
-
shippingInstruction:
|
|
45
|
-
- id: "c77a0a515d99a8ba"
|
|
46
|
-
details:
|
|
47
|
-
display_name: "doc_cap_shippingInstruction"
|
|
48
|
-
author: "kumar.rajendrababu@forto.com"
|
|
49
|
-
created_date: ""
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
arrivalNotice:
|
|
53
|
-
- id: "748b2e2b9161dcf3"
|
|
54
|
-
details:
|
|
55
|
-
display_name: "doc_cap_arrivalNotice"
|
|
56
|
-
author: "osman.demirel@forto.com"
|
|
57
|
-
created_date: ""
|
|
58
|
-
|
|
59
|
-
customsAssessment:
|
|
60
|
-
- id: "c464a18d82fad9be"
|
|
61
|
-
details:
|
|
62
|
-
display_name: "doc_cap_customsAssessment"
|
|
63
|
-
author: "igor.tonko@forto.com"
|
|
64
|
-
created_date: ""
|
|
65
|
-
|
|
66
|
-
deliveryOrder:
|
|
67
|
-
- id: "2245a72c7a5dbf5f"
|
|
68
|
-
details:
|
|
69
|
-
display_name: "doc_cap_releaseNote"
|
|
70
|
-
author: "igor.tonko@forto.com"
|
|
71
|
-
created_date: ""
|
|
72
|
-
|
|
73
|
-
partnerInvoice:
|
|
74
|
-
- id: "17d103181e745a05"
|
|
75
|
-
details:
|
|
76
|
-
display_name: "doc_cap_partnerInvoice"
|
|
77
|
-
author: "osman.demirel@forto.com"
|
|
78
|
-
created_date: ""
|
|
79
|
-
|
|
80
16
|
beta:
|
|
81
17
|
bookingConfirmation:
|
|
82
18
|
- id: "3c280b11bdb3ed89"
|
src/excel_processing.py
CHANGED
|
@@ -2,21 +2,25 @@
|
|
|
2
2
|
# flake8: noqa: E402
|
|
3
3
|
import logging
|
|
4
4
|
|
|
5
|
+
from ddtrace import tracer
|
|
6
|
+
|
|
5
7
|
logger = logging.getLogger(__name__)
|
|
6
8
|
|
|
7
9
|
import asyncio
|
|
8
|
-
import json
|
|
9
10
|
|
|
10
11
|
import numpy as np
|
|
11
12
|
import pandas as pd
|
|
12
13
|
|
|
13
|
-
from src.
|
|
14
|
-
from src.utils import
|
|
14
|
+
from src.prompts.prompt_library import prompt_library
|
|
15
|
+
from src.utils import estimate_page_count, get_excel_sheets
|
|
15
16
|
|
|
16
17
|
|
|
17
|
-
async def extract_data_from_sheet(
|
|
18
|
+
async def extract_data_from_sheet(
|
|
19
|
+
llm_client, sheet_name, sheet, response_schema, doc_type=None
|
|
20
|
+
):
|
|
18
21
|
logger.info(f"Processing sheet: {sheet_name}")
|
|
19
|
-
excel_content = pd.DataFrame(sheet.values)
|
|
22
|
+
excel_content = pd.DataFrame(sheet.values).dropna(how="all", axis=1)
|
|
23
|
+
|
|
20
24
|
# Convert to Markdown format for the LLM model
|
|
21
25
|
worksheet = (
|
|
22
26
|
"This is from a excel. Pay attention to the cell position:\n"
|
|
@@ -24,12 +28,16 @@ async def extract_data_from_sheet(params, sheet_name, sheet, response_schema):
|
|
|
24
28
|
)
|
|
25
29
|
|
|
26
30
|
# Prompt for the LLM JSON
|
|
27
|
-
|
|
31
|
+
prompt = prompt_library.library[doc_type]["other"]["prompt"]
|
|
32
|
+
|
|
33
|
+
# Join the worksheet content with the prompt
|
|
34
|
+
prompt = worksheet + "\n" + prompt
|
|
28
35
|
|
|
29
36
|
try:
|
|
30
|
-
result = await
|
|
31
|
-
|
|
37
|
+
result = await llm_client.get_unified_json_genai(
|
|
38
|
+
prompt,
|
|
32
39
|
response_schema=response_schema,
|
|
40
|
+
doc_type=doc_type,
|
|
33
41
|
)
|
|
34
42
|
except Exception as e:
|
|
35
43
|
result = {}
|
|
@@ -42,8 +50,8 @@ async def extract_data_from_excel(
|
|
|
42
50
|
params,
|
|
43
51
|
input_doc_type,
|
|
44
52
|
file_content,
|
|
45
|
-
schema_client,
|
|
46
53
|
mime_type,
|
|
54
|
+
llm_client,
|
|
47
55
|
):
|
|
48
56
|
"""Extract data from the Excel file.
|
|
49
57
|
|
|
@@ -51,8 +59,8 @@ async def extract_data_from_excel(
|
|
|
51
59
|
params (dict): Parameters for the data extraction process.
|
|
52
60
|
input_doc_type (str): The type of the document.
|
|
53
61
|
file_content (bytes): The content of the Excel file to process.
|
|
54
|
-
schema_client (DocumentSchemaClient): Client for the Document AI schema.
|
|
55
62
|
mime_type (str): The MIME type of the file.
|
|
63
|
+
llm_client: The LLM client to use for data extraction.
|
|
56
64
|
|
|
57
65
|
Returns:
|
|
58
66
|
formatted_data (list): A list of dictionaries containing the extracted data.
|
|
@@ -61,20 +69,31 @@ async def extract_data_from_excel(
|
|
|
61
69
|
|
|
62
70
|
"""
|
|
63
71
|
# Generate the response structure
|
|
64
|
-
response_schema =
|
|
72
|
+
response_schema = prompt_library.library[input_doc_type]["other"]["placeholders"]
|
|
65
73
|
|
|
66
74
|
# Load the Excel file and get ONLY the "visible" sheet names
|
|
67
75
|
sheets, workbook = get_excel_sheets(file_content, mime_type)
|
|
68
76
|
|
|
77
|
+
# Track the number of sheets in dd-trace
|
|
78
|
+
span = tracer.current_span()
|
|
79
|
+
if span:
|
|
80
|
+
estimated_page_counts = [
|
|
81
|
+
estimate_page_count(workbook[sheet]) for sheet in sheets
|
|
82
|
+
]
|
|
83
|
+
est_page_count = sum(estimated_page_counts)
|
|
84
|
+
span.set_metric("est_page_count", est_page_count)
|
|
85
|
+
|
|
69
86
|
# Excel files may contain multiple sheets. Extract data from each sheet
|
|
70
87
|
sheet_extract_tasks = [
|
|
71
88
|
extract_data_from_sheet(
|
|
72
|
-
|
|
89
|
+
llm_client,
|
|
90
|
+
sheet_name,
|
|
91
|
+
workbook[sheet_name],
|
|
92
|
+
response_schema,
|
|
93
|
+
doc_type=input_doc_type,
|
|
73
94
|
)
|
|
74
95
|
for sheet_name in sheets
|
|
75
96
|
]
|
|
76
97
|
extracted_data = {k: v for k, v in await asyncio.gather(*sheet_extract_tasks)}
|
|
77
98
|
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
return extracted_data, stored_data, params["gemini_params"]["model_id"]
|
|
99
|
+
return extracted_data, extracted_data, llm_client.model_id
|
src/io.py
CHANGED
|
@@ -12,13 +12,55 @@ from pathlib import Path
|
|
|
12
12
|
from google.cloud import bigquery, storage
|
|
13
13
|
|
|
14
14
|
|
|
15
|
+
def get_gcp_labels(**extra_labels):
|
|
16
|
+
"""Generate standardized GCP labels for cost tracking.
|
|
17
|
+
|
|
18
|
+
Args:
|
|
19
|
+
**extra_labels: Additional custom labels
|
|
20
|
+
|
|
21
|
+
Returns:
|
|
22
|
+
dict: Labels dictionary with keys normalized (lowercase, hyphens, max 63 chars)
|
|
23
|
+
"""
|
|
24
|
+
project_name = os.getenv("PROJECT_NAME")
|
|
25
|
+
|
|
26
|
+
# If not set, detect once and cache it
|
|
27
|
+
if not project_name:
|
|
28
|
+
# Try pyproject.toml first
|
|
29
|
+
try:
|
|
30
|
+
import toml
|
|
31
|
+
|
|
32
|
+
pyproject_path = Path(__file__).parent.parent / "pyproject.toml"
|
|
33
|
+
if pyproject_path.exists():
|
|
34
|
+
config = toml.load(pyproject_path)
|
|
35
|
+
project_name = config.get("tool", {}).get("poetry", {}).get("name")
|
|
36
|
+
except Exception:
|
|
37
|
+
pass
|
|
38
|
+
|
|
39
|
+
# Fallback to unknown
|
|
40
|
+
if not project_name:
|
|
41
|
+
project_name = "unknown"
|
|
42
|
+
|
|
43
|
+
# Cache it
|
|
44
|
+
os.environ["PROJECT_NAME"] = project_name
|
|
45
|
+
|
|
46
|
+
labels = {
|
|
47
|
+
"ds-project-name": project_name.lower(),
|
|
48
|
+
"ds-env": os.getenv("CLUSTER", "local").lower(),
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
# Add any extra labels
|
|
52
|
+
labels.update({k.lower(): str(v).lower() for k, v in extra_labels.items()})
|
|
53
|
+
|
|
54
|
+
return labels
|
|
55
|
+
|
|
56
|
+
|
|
15
57
|
def get_bq_client(params):
|
|
16
58
|
"""Get Google BigQuery client."""
|
|
17
|
-
bq_client = bigquery.Client(project=params["
|
|
59
|
+
bq_client = bigquery.Client(project=params["g_ai_project_name"])
|
|
18
60
|
job_config = bigquery.QueryJobConfig(
|
|
19
61
|
allow_large_results=True,
|
|
20
62
|
# flatten_results=True,
|
|
21
|
-
labels=
|
|
63
|
+
labels=get_gcp_labels(),
|
|
22
64
|
)
|
|
23
65
|
return bq_client, job_config
|
|
24
66
|
|
|
@@ -41,7 +83,7 @@ def upload_pdf_to_bucket(params, content, file_name):
|
|
|
41
83
|
temp_file.write(content)
|
|
42
84
|
|
|
43
85
|
# Upload the temporary file to the bucket
|
|
44
|
-
client = storage.Client(project=params["
|
|
86
|
+
client = storage.Client(project=params["doc_ai_bucket_project_name"])
|
|
45
87
|
bucket = client.bucket(params["doc_ai_bucket_batch_input"])
|
|
46
88
|
|
|
47
89
|
blob = bucket.blob(file_name)
|
|
@@ -60,7 +102,7 @@ def upload_pdf_to_bucket(params, content, file_name):
|
|
|
60
102
|
return None, None
|
|
61
103
|
|
|
62
104
|
|
|
63
|
-
def delete_folder_from_bucket(bucket_name, folder_name):
|
|
105
|
+
def delete_folder_from_bucket(params, bucket_name, folder_name):
|
|
64
106
|
"""Delete a folder (prefix) and its contents from a GCS bucket.
|
|
65
107
|
|
|
66
108
|
Args:
|
|
@@ -68,7 +110,7 @@ def delete_folder_from_bucket(bucket_name, folder_name):
|
|
|
68
110
|
folder_name (str): Name of the folder (prefix) to delete.
|
|
69
111
|
"""
|
|
70
112
|
try:
|
|
71
|
-
client = storage.Client()
|
|
113
|
+
client = storage.Client(project=params["doc_ai_bucket_project_name"])
|
|
72
114
|
bucket = client.get_bucket(bucket_name)
|
|
73
115
|
|
|
74
116
|
# List all objects with the given prefix (folder name)
|
|
@@ -86,7 +128,7 @@ def delete_folder_from_bucket(bucket_name, folder_name):
|
|
|
86
128
|
|
|
87
129
|
def get_storage_client(params) -> storage.Client:
|
|
88
130
|
"""Get Google Storage client."""
|
|
89
|
-
return storage.Client(project=params["
|
|
131
|
+
return storage.Client(project=params["doc_ai_bucket_project_name"])
|
|
90
132
|
|
|
91
133
|
|
|
92
134
|
def download_dir_from_bucket(bucket, directory_cloud, directory_local) -> bool:
|
|
@@ -112,3 +154,29 @@ def download_dir_from_bucket(bucket, directory_cloud, directory_local) -> bool:
|
|
|
112
154
|
Path(directory).mkdir(parents=True, exist_ok=True)
|
|
113
155
|
blob.download_to_filename(directory_local / Path(blob.name))
|
|
114
156
|
return result
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def bq_logs(data_to_insert, params):
|
|
160
|
+
"""Insert logs into Google BigQuery.
|
|
161
|
+
|
|
162
|
+
Args:
|
|
163
|
+
data_to_insert (list): The data to insert into BigQuery.
|
|
164
|
+
params (dict): The parameters dictionary.
|
|
165
|
+
"""
|
|
166
|
+
# Use the pre-initialized BigQuery client
|
|
167
|
+
bq_client = params["bq_client"]
|
|
168
|
+
# Get the table string
|
|
169
|
+
table_string = f"{params['g_ai_project_name']}.{params['g_ai_gbq_db_schema']}.{params['g_ai_gbq_db_table_out']}"
|
|
170
|
+
|
|
171
|
+
logger.info(f"Log table: {table_string}")
|
|
172
|
+
# Insert the rows into the table
|
|
173
|
+
insert_logs = bq_client.insert_rows_json(table_string, data_to_insert)
|
|
174
|
+
|
|
175
|
+
# Check if there were any errors inserting the rows
|
|
176
|
+
if not insert_logs:
|
|
177
|
+
logger.info("New rows have been added.")
|
|
178
|
+
else:
|
|
179
|
+
logger.info("Errors occurred while inserting rows: ", insert_logs)
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
# type: ignore
|