data-science-document-ai 1.13.0__py3-none-any.whl → 1.56.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. {data_science_document_ai-1.13.0.dist-info → data_science_document_ai-1.56.1.dist-info}/METADATA +7 -2
  2. data_science_document_ai-1.56.1.dist-info/RECORD +60 -0
  3. {data_science_document_ai-1.13.0.dist-info → data_science_document_ai-1.56.1.dist-info}/WHEEL +1 -1
  4. src/constants.py +42 -12
  5. src/constants_sandbox.py +2 -22
  6. src/docai.py +18 -7
  7. src/docai_processor_config.yaml +0 -64
  8. src/excel_processing.py +34 -15
  9. src/io.py +74 -6
  10. src/llm.py +12 -34
  11. src/pdf_processing.py +228 -78
  12. src/postprocessing/common.py +495 -618
  13. src/postprocessing/postprocess_partner_invoice.py +383 -27
  14. src/prompts/library/arrivalNotice/other/placeholders.json +70 -0
  15. src/prompts/library/arrivalNotice/other/prompt.txt +40 -0
  16. src/prompts/library/bookingConfirmation/evergreen/placeholders.json +17 -17
  17. src/prompts/library/bookingConfirmation/evergreen/prompt.txt +1 -0
  18. src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +18 -18
  19. src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +1 -1
  20. src/prompts/library/bookingConfirmation/maersk/placeholders.json +17 -17
  21. src/prompts/library/bookingConfirmation/maersk/prompt.txt +1 -1
  22. src/prompts/library/bookingConfirmation/msc/placeholders.json +17 -17
  23. src/prompts/library/bookingConfirmation/msc/prompt.txt +1 -1
  24. src/prompts/library/bookingConfirmation/oocl/placeholders.json +17 -17
  25. src/prompts/library/bookingConfirmation/oocl/prompt.txt +3 -1
  26. src/prompts/library/bookingConfirmation/other/placeholders.json +17 -17
  27. src/prompts/library/bookingConfirmation/other/prompt.txt +1 -1
  28. src/prompts/library/bookingConfirmation/yangming/placeholders.json +17 -17
  29. src/prompts/library/bookingConfirmation/yangming/prompt.txt +1 -1
  30. src/prompts/library/bundeskasse/other/placeholders.json +113 -0
  31. src/prompts/library/bundeskasse/other/prompt.txt +48 -0
  32. src/prompts/library/commercialInvoice/other/placeholders.json +125 -0
  33. src/prompts/library/commercialInvoice/other/prompt.txt +2 -1
  34. src/prompts/library/customsAssessment/other/placeholders.json +67 -16
  35. src/prompts/library/customsAssessment/other/prompt.txt +24 -37
  36. src/prompts/library/customsInvoice/other/placeholders.json +205 -0
  37. src/prompts/library/customsInvoice/other/prompt.txt +105 -0
  38. src/prompts/library/deliveryOrder/other/placeholders.json +79 -28
  39. src/prompts/library/deliveryOrder/other/prompt.txt +26 -40
  40. src/prompts/library/draftMbl/other/placeholders.json +33 -33
  41. src/prompts/library/draftMbl/other/prompt.txt +34 -44
  42. src/prompts/library/finalMbL/other/placeholders.json +34 -34
  43. src/prompts/library/finalMbL/other/prompt.txt +34 -44
  44. src/prompts/library/packingList/other/placeholders.json +98 -0
  45. src/prompts/library/packingList/other/prompt.txt +1 -1
  46. src/prompts/library/partnerInvoice/other/placeholders.json +165 -45
  47. src/prompts/library/partnerInvoice/other/prompt.txt +82 -44
  48. src/prompts/library/preprocessing/carrier/placeholders.json +0 -16
  49. src/prompts/library/shippingInstruction/other/placeholders.json +115 -0
  50. src/prompts/library/shippingInstruction/other/prompt.txt +28 -15
  51. src/setup.py +73 -63
  52. src/utils.py +207 -30
  53. data_science_document_ai-1.13.0.dist-info/RECORD +0 -55
  54. src/prompts/library/draftMbl/hapag-lloyd/prompt.txt +0 -44
  55. src/prompts/library/draftMbl/maersk/prompt.txt +0 -17
  56. src/prompts/library/finalMbL/hapag-lloyd/prompt.txt +0 -44
  57. src/prompts/library/finalMbL/maersk/prompt.txt +0 -17
{data_science_document_ai-1.13.0.dist-info → data_science_document_ai-1.56.1.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
- Metadata-Version: 2.3
+ Metadata-Version: 2.4
  Name: data-science-document-ai
- Version: 1.13.0
+ Version: 1.56.1
  Summary: "Document AI repo for data science"
  Author: Naomi Nguyen
  Author-email: naomi.nguyen@forto.com
@@ -14,6 +14,7 @@ Requires-Dist: db-dtypes (>=1.2.0,<2.0.0)
  Requires-Dist: ddtrace (>=2.20.0,<3.0.0)
  Requires-Dist: fastapi (>=0.109.2,<0.110.0)
  Requires-Dist: fitz (>=0.0.1.dev2,<0.0.2)
+ Requires-Dist: fuzzywuzzy (>=0.18.0,<0.19.0)
  Requires-Dist: google (>=3.0.0,<4.0.0)
  Requires-Dist: google-api-python-client (>=2.89.0,<3.0.0)
  Requires-Dist: google-auth-oauthlib (>=1.0.0,<2.0.0)
@@ -27,16 +28,20 @@ Requires-Dist: gspread (>=6.1.0,<7.0.0)
  Requires-Dist: httpx (>=0.26.0,<0.27.0)
  Requires-Dist: jupyter (>=1.0.0,<2.0.0)
  Requires-Dist: kubernetes (>=30.1.0,<31.0.0)
+ Requires-Dist: nltk (>=3.9.1,<4.0.0)
  Requires-Dist: numpy (>=1.25.1,<2.0.0)
  Requires-Dist: openai (>=1.53.0,<2.0.0)
  Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
  Requires-Dist: pandas (>=2.0.3,<3.0.0)
  Requires-Dist: parameterized (>=0.9.0,<0.10.0)
  Requires-Dist: pdf2image (>=1.17.0,<2.0.0)
+ Requires-Dist: pgzip (>=0.3.5,<0.4.0)
  Requires-Dist: pyarrow (==16.1.0)
  Requires-Dist: pymupdf (>=1.23.26,<2.0.0)
+ Requires-Dist: pypdf (>=6.1.2,<7.0.0)
  Requires-Dist: python-dotenv (>=1.0.0,<2.0.0)
  Requires-Dist: python-multipart (>=0.0.7,<0.0.8)
+ Requires-Dist: rapidfuzz (>=3.12.2,<4.0.0)
  Requires-Dist: requests-toolbelt (>=1.0.0,<2.0.0)
  Requires-Dist: tabulate (>=0.9.0,<0.10.0)
  Requires-Dist: toml (>=0.10.2,<0.11.0)
data_science_document_ai-1.56.1.dist-info/RECORD ADDED
@@ -0,0 +1,60 @@
+ src/constants.py,sha256=H43Az9AtoBKfcq9yY4TQQJY8DfdILV5kXy8EMtRaWYA,3583
+ src/constants_sandbox.py,sha256=Iu6HdjCoNSmOX0AwoL9qUQkhq_ZnIN5U9e-Q2UfNuGc,547
+ src/docai.py,sha256=dHuR0ehVjUi1CnoNvdp_yxJtpU_HFXqAZ61ywdz7BEo,5655
+ src/docai_processor_config.yaml,sha256=4yKKZPvFCA-3S56jDYSqMGKXGFND-768OiU2seRiAzE,604
+ src/excel_processing.py,sha256=TRgAzSHvL1WKbUgjHtpXL701bPhiWGH7kk3S6e1UPaA,3074
+ src/io.py,sha256=rYjXVLlriEacw1uNuPIYhg12bXNu48Qs9GYMY2YcVTE,5563
+ src/llm.py,sha256=a7UYA4ITUNjzct_2fHgM-bma_XWc28VC0FV71g9tnUI,7137
+ src/log_setup.py,sha256=RhHnpXqcl-ii4EJzRt47CF2R-Q3YPF68tepg_Kg7tkw,2895
+ src/pdf_processing.py,sha256=81fS2xL36n9QgB7DpXe7SCS-Lyz11cFDgccYMK3ZVkk,20026
+ src/postprocessing/common.py,sha256=dagAg0hZGuZc03bXdfOolxekewMEVUfz917IGCiAtWI,26118
+ src/postprocessing/postprocess_booking_confirmation.py,sha256=nK32eDiBNbauyQz0oCa9eraysku8aqzrcoRFoWVumDU,4827
+ src/postprocessing/postprocess_commercial_invoice.py,sha256=3I8ijluTZcOs_sMnFZxfkAPle0UFQ239EMuvZfDZVPg,1028
+ src/postprocessing/postprocess_partner_invoice.py,sha256=WuaTQK5D09dV_QNrh29ZoKX9IvQn2Ub-WnAMyRjCsvI,14240
+ src/prompts/library/arrivalNotice/other/placeholders.json,sha256=1vzly1amgyKt3jr2JJQbb24kNZsnI289iduvoUo5dJU,3061
+ src/prompts/library/arrivalNotice/other/prompt.txt,sha256=QNuU-BvMA8VbdupVNapad4O3WmCotH5cKNxImRMbKDk,2906
+ src/prompts/library/bookingConfirmation/evergreen/placeholders.json,sha256=IpM9nmSPdyroliZfXB1-NDCjiHZX_Ff5BH7-scNhGqE,1406
+ src/prompts/library/bookingConfirmation/evergreen/prompt.txt,sha256=5ivskCG831M2scW3oqQaoltXIyHV-n6DYUygWycXxjw,2755
+ src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json,sha256=hMPNt9s3LuxR85AxYy7bPcCDleug6gSwVjefm3ismWY,1405
+ src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt,sha256=XgfhrFTXLJ467L4Cer77K0KTPtWTg_-QJXCsltvLlpI,3430
+ src/prompts/library/bookingConfirmation/maersk/placeholders.json,sha256=6p_IQMA1PUgGZqjf_by4ja9jK27ba4loYhEpIa7Oxx4,1406
+ src/prompts/library/bookingConfirmation/maersk/prompt.txt,sha256=t-yh1dOrcRa0fm0VPFC1xCRBf0R0Zjp9j_Hb31aZS1w,3223
+ src/prompts/library/bookingConfirmation/msc/placeholders.json,sha256=IpM9nmSPdyroliZfXB1-NDCjiHZX_Ff5BH7-scNhGqE,1406
+ src/prompts/library/bookingConfirmation/msc/prompt.txt,sha256=_Jfioislp7SNs2BEXoklvnTPVXe6Z0M6myD1IWnBFYQ,4705
+ src/prompts/library/bookingConfirmation/oocl/placeholders.json,sha256=JTtWvLSsoxN7huXY8ZNqqPkODM-DOs5wu3YvNHOna3k,1404
+ src/prompts/library/bookingConfirmation/oocl/prompt.txt,sha256=xNTrJdUtDalcP3AKkfRiOnHjAdRCbcTvehcBQKurRj0,2201
+ src/prompts/library/bookingConfirmation/other/placeholders.json,sha256=IpM9nmSPdyroliZfXB1-NDCjiHZX_Ff5BH7-scNhGqE,1406
+ src/prompts/library/bookingConfirmation/other/prompt.txt,sha256=kUK7NgVNDYFMnqOcIblCwWSw2SC0YQEtHsYrspiVUMo,3379
+ src/prompts/library/bookingConfirmation/yangming/placeholders.json,sha256=IpM9nmSPdyroliZfXB1-NDCjiHZX_Ff5BH7-scNhGqE,1406
+ src/prompts/library/bookingConfirmation/yangming/prompt.txt,sha256=fYKfusDajDFw0v54-nv2iAqUSp2yCeOzc6G7AFe-h2w,3226
+ src/prompts/library/bundeskasse/other/placeholders.json,sha256=7xKzi_ypkIICO9nrEl45W9G7-h33uWVRVWnpg2b5lUg,4288
+ src/prompts/library/bundeskasse/other/prompt.txt,sha256=miNYoqRZEd6Z1LNisTahX1-tenzr5kEpRA6gvPH7NCw,3316
+ src/prompts/library/commercialInvoice/other/placeholders.json,sha256=zUK2mg9MnHiEQRYF6VgTiUiq68WGy5f7_4qL63CWyR0,4700
+ src/prompts/library/commercialInvoice/other/prompt.txt,sha256=CJapcVrmcvynJUanETDklkzU-0N9hHdhq5wL4MK7OIY,2683
+ src/prompts/library/customsAssessment/other/placeholders.json,sha256=scIV--C9HNWAQbU9zEz3GT_FoAvJqbfuY85YUtt7t-Q,3850
+ src/prompts/library/customsAssessment/other/prompt.txt,sha256=z3FuoHZ588Pz1WBJDW7ISAC3J6n7hPJCcS92CdHDTFw,2494
+ src/prompts/library/customsInvoice/other/placeholders.json,sha256=BnWYtl4sPooTHb_EHRIlrPawBrfHI8_QVas8zytbqyY,12172
+ src/prompts/library/customsInvoice/other/prompt.txt,sha256=hUBDhocFdHTiWdEPgEE8yKHqpIYOfOj-j9CvZd-3YZc,9941
+ src/prompts/library/deliveryOrder/other/placeholders.json,sha256=j-9F4V3yDg4610PPsOwU3oOj_S9vAvAB9Ix155WGIwc,3827
+ src/prompts/library/deliveryOrder/other/prompt.txt,sha256=RD076vq0x0IjoEVQfh-G0u4nxITCpgKZGrwMlR9YAvk,2695
+ src/prompts/library/draftMbl/other/placeholders.json,sha256=Gn8kQ8cMmrzRGLSFH7_8wO1_j2jxhqHd4zeivZP2SjU,4304
+ src/prompts/library/draftMbl/other/prompt.txt,sha256=4RjlGT2OFmcBCUJhuCnO9GtmCn3vVesXHi_ml2g3dK8,2386
+ src/prompts/library/finalMbL/other/placeholders.json,sha256=Gn8kQ8cMmrzRGLSFH7_8wO1_j2jxhqHd4zeivZP2SjU,4304
+ src/prompts/library/finalMbL/other/prompt.txt,sha256=cyeKjK94sepqXiLEeZKB4VpmT0-nqXALP4dih-B67M8,2386
+ src/prompts/library/packingList/other/placeholders.json,sha256=cGUUvEFoi4Lm0BAiyD29KbNFbUgzO1s7eit_qK3F0ig,4478
+ src/prompts/library/packingList/other/prompt.txt,sha256=6Q9d0KBG6YWmNtzFivvmtQmitaUE2jytfwwc5YwsUgQ,2872
+ src/prompts/library/partnerInvoice/other/placeholders.json,sha256=NX6ADT4gxLpP90uoNCYDbmfBvROxxVWRKK0lRFy1n9s,10897
+ src/prompts/library/partnerInvoice/other/prompt.txt,sha256=A3nw6QfraU1N6Aui4TC7eFofG3rUyo9cz8Ha1iQbMpU,8141
+ src/prompts/library/postprocessing/port_code/placeholders.json,sha256=2TiXf3zSzrglOMPtDOlCntIa5RSvyZQAKG2-IgrCY5A,22
+ src/prompts/library/postprocessing/port_code/prompt_port_code.txt,sha256=--1wunSqEr2ox958lEhjO-0JFBfOLzA3qfKYIzG_Iok,884
+ src/prompts/library/preprocessing/carrier/placeholders.json,sha256=tQeVDtvembhVqvel9vGoy4qcKp1hOvg-bLCgZRdQj0g,192
+ src/prompts/library/preprocessing/carrier/prompt.txt,sha256=NLvRZQCZ6aWC1yTr7Q93jK5z7Vi_b4HBaiFYYnIsO-w,134
+ src/prompts/library/shippingInstruction/other/placeholders.json,sha256=eK4AeMfORkGMWVYcqH7NjB56Zb4swHTvcQD5UQbTryg,6374
+ src/prompts/library/shippingInstruction/other/prompt.txt,sha256=CbrqlKMtB-sVY-8E460KP1KNmz169YVPMrH3-uEldPg,2135
+ src/prompts/prompt_library.py,sha256=VJWHeXN-s501C2GiidIIvQQuZdU6T1R27hE2dKBiI40,2555
+ src/setup.py,sha256=yb0Pz1RI-uId5lEjgQrj1Pqo9FvwG9vs0HXRVbyST2M,7186
+ src/tms.py,sha256=UXbIo1QE--hIX6NZi5Qyp2R_CP338syrY9pCTPrfgnE,1741
+ src/utils.py,sha256=Ow5_Jals88o8mbZ1BoHfZpHZoCfig_UQb5aalH-mpWE,17278
+ data_science_document_ai-1.56.1.dist-info/METADATA,sha256=4rIhyVd5XG02M7f9l2UYjH6r-pjzpNiobuZ-v-trvtE,2152
+ data_science_document_ai-1.56.1.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
+ data_science_document_ai-1.56.1.dist-info/RECORD,,
{data_science_document_ai-1.13.0.dist-info → data_science_document_ai-1.56.1.dist-info}/WHEEL CHANGED
@@ -1,4 +1,4 @@
  Wheel-Version: 1.0
- Generator: poetry-core 2.1.1
+ Generator: poetry-core 2.2.1
  Root-Is-Purelib: true
  Tag: py3-none-any
src/constants.py CHANGED
@@ -5,18 +5,31 @@ project_parameters = {
  "project_name": "document-ai",
  "project_hash": "ceb0ac54",
  # Google related parameters
- "bq_project_id": "data-pipeline-276214",
  "g_ai_project_name": "forto-data-science-production",
  "g_ai_project_id": "738250249861",
  "g_api_endpoint": "eu-documentai.googleapis.com",
  "g_location": "eu",
  "g_region": "europe-west1",
  # Google Cloud Storage
+ "doc_ai_bucket_project_name": "forto-data-science-production",
  "doc_ai_bucket_name": "ds-document-capture",
  "doc_ai_bucket_batch_input": "ds-batch-process-docs",
  "doc_ai_bucket_batch_output": "ds-batch-process-output",
  # Paths
  "folder_data": "data",
+ # Fuzzy lookup
+ "g_model_fuzzy_lookup_folder": "fuzzy_lookup",
+ "item_code_lookup": "line_item_kvp_table.json",
+ "intermodal_partners": "intermodal_partners.json",
+ "invoice_classification_lookup": "invoice_classification.json",
+ "reverse_charge_sentence_lookup": "reverse_charge_sentences.json",
+ # Fuzzy logic params
+ "fuzzy_threshold_item_code": 92,
+ "fuzzy_threshold_reverse_charge": 80,
+ "fuzzy_threshold_invoice_classification": 70,
+ # Chunking params
+ "chunk_size": 1, # page (do not change this without changing the page number logic)
+ "chunk_after": 10, # pages
  # Big Query
  "g_ai_gbq_db_schema": "document_ai",
  "g_ai_gbq_db_table_out": "document_ai_api_calls_v1",
@@ -24,18 +37,23 @@ project_parameters = {
  # models metadata (confidence),
  "g_model_data_folder": "models",
  "local_model_data_folder": "data",
+ "released_doc_types": {
+ "bookingConfirmation",
+ "packingList",
+ "commercialInvoice",
+ "finalMbL",
+ "draftMbl",
+ "arrivalNotice",
+ "shippingInstruction",
+ "customsAssessment",
+ "deliveryOrder",
+ "partnerInvoice",
+ "customsInvoice",
+ "bundeskasse",
+ },
  "model_selector": {
  "stable": {
  "bookingConfirmation": 1,
- "packingList": 0,
- "commercialInvoice": 0,
- "finalMbL": 0,
- "draftMbl": 0,
- "arrivalNotice": 0,
- "shippingInstruction": 0,
- "customsAssessment": 0,
- "deliveryOrder": 0,
- "partnerInvoice": 0,
  },
  "beta": {
  "bookingConfirmation": 0,
@@ -46,22 +64,34 @@ project_parameters = {
  # LLM model parameters
  "gemini_params": {
  "temperature": 0,
- "maxOutputTokens": 8000,
+ "maxOutputTokens": 65536,
+ "top_p": 0.8,
+ "top_k": 40,
+ "seed": 42,
+ "model_id": "gemini-2.5-pro",
+ },
+ "gemini_flash_params": {
+ "temperature": 0,
+ "maxOutputTokens": 65536,
  "top_p": 0.8,
  "top_k": 40,
  "seed": 42,
- "model_id": "gemini-2.0-flash",
+ "model_id": "gemini-2.5-flash",
  },
  # Key to combine the LLM results with the Doc Ai results
  "key_to_combine": {
  "bookingConfirmation": ["transportLegs"],
+ "arrivalNotice": ["containers"],
  "finalMbL": ["containers"],
  "draftMbl": ["containers"],
+ "deliveryOrder": ["Equipment", "TransportLeg"],
  "customsAssessment": ["containers"],
  "packingList": ["skuData"],
  "commercialInvoice": ["skus"],
  "shippingInstruction": ["containers"],
  "partnerInvoice": ["lineItem"],
+ "customsInvoice": ["lineItem"],
+ "bundeskasse": ["lineItem"],
  },
  }
 
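The new fuzzy-lookup tables and thresholds line up with the fuzzywuzzy and rapidfuzz dependencies added in METADATA. A minimal sketch of how a cutoff such as "fuzzy_threshold_item_code" might gate a rapidfuzz lookup (the helper below is hypothetical; the package's actual call sites live in files this diff only summarizes):

    from rapidfuzz import fuzz, process

    def match_item_code(description, lookup_table, threshold=92):
        """Return the mapped value for the closest line-item key, or None.

        lookup_table stands in for the contents of line_item_kvp_table.json;
        threshold mirrors params["fuzzy_threshold_item_code"].
        """
        best = process.extractOne(
            description,
            list(lookup_table),  # candidate keys
            scorer=fuzz.WRatio,
            score_cutoff=threshold,  # reject matches scoring below the cutoff
        )
        return lookup_table[best[0]] if best else None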
src/constants_sandbox.py CHANGED
@@ -5,28 +5,8 @@ project_parameters_sandbox = {
  "g_ai_project_name": "forto-data-science-sandbox",
  "g_ai_project_id": "882852108312",
  # Google Cloud Storage
- "doc_ai_bucket_name": "ds_document_capture", # Creating a new bucket bcz the prod bucket name is already taken
+ "doc_ai_bucket_project_name": "forto-data-science-sandbox",
+ "doc_ai_bucket_name": "ds_document_capture",
  "doc_ai_bucket_batch_input": "ds_batch_process_docs",
  "doc_ai_bucket_batch_output": "ds_batch_process_output",
- "excluded_endpoints": ["/healthz", "/", "/metrics", "/healthz/"],
- "model_selector": {
- "stable": {
- "bookingConfirmation": 1,
- "packingList": 0,
- "commercialInvoice": 0,
- "finalMbL": 0,
- "draftMbl": 0,
- "arrivalNotice": 0,
- "shippingInstruction": 0,
- "customsAssessment": 0,
- "deliveryOrder": 0,
- "partnerInvoice": 0,
- },
- "beta": {
- "bookingConfirmation": 0,
- },
- },
- # this is the model selector for the model
- # to be used from the model_config.yaml file based on the environment,
- # 0 mean the first model in the list
  }
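With its private copy of model_selector gone, the sandbox file now only overrides the project and bucket keys. Assuming the usual pattern of layering the sandbox dict over the production one (the merge site itself is not shown in this diff), everything else falls through to src/constants.py:

    from src.constants import project_parameters
    from src.constants_sandbox import project_parameters_sandbox

    # Hypothetical merge: sandbox keys win; model_selector, gemini_params,
    # and the rest are inherited from the production constants.
    params = {**project_parameters, **project_parameters_sandbox}
    assert params["g_ai_project_name"] == "forto-data-science-sandbox"
    assert params["model_selector"]["stable"]["bookingConfirmation"] == 1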
src/docai.py CHANGED
@@ -3,11 +3,16 @@ import re
 
  from google.cloud import documentai
 
- from src.io import delete_folder_from_bucket, logger, upload_pdf_to_bucket
+ from src.io import (
+ delete_folder_from_bucket,
+ get_gcp_labels,
+ logger,
+ upload_pdf_to_bucket,
+ )
  from src.utils import cache_on_disk
 
 
- async def _process_pdf_w_docai(image_content, client, processor_name):
+ async def _process_pdf_w_docai(image_content, client, processor_name, doc_type=None):
  """Process the PDF using Document AI.
 
  Args:
@@ -15,6 +20,7 @@ async def _process_pdf_w_docai(image_content, client, processor_name):
  client: The Document AI client.
  processor_name (str): The name of the processor to be used.
  e.g.: projects/{project_id}/locations/{location}/processor/{processor_id}
+ doc_type (str, optional): Document type for cost tracking labels.
 
  Returns:
  The processed document.
@@ -24,10 +30,11 @@ async def _process_pdf_w_docai(image_content, client, processor_name):
  content=image_content, mime_type="application/pdf"
  )
 
- # Configure the process request
+ # Configure the process request with labels for cost tracking
  request = documentai.ProcessRequest(
  name=processor_name,
  raw_document=raw_document, # field_mask=field_mask
+ labels=get_gcp_labels(doc_type=doc_type),
  )
  result = await cache_on_disk(client.process_document, request=request)
 
@@ -35,7 +42,7 @@
 
 
  async def _batch_process_pdf_w_docai(
- params, image_content, client, processor_name, timeout=1200
+ params, image_content, client, processor_name, timeout=1200, doc_type=None
  ):
  """Process the PDF using Document AI Batch Process API.
 
@@ -45,6 +52,7 @@ async def _batch_process_pdf_w_docai(
  processor_name (str): The name of the processor to be used.
  e.g.: projects/{project_id}/locations/{location}/processor/{processor_id}
  timeout (int, optional): The timeout in seconds. Defaults to 1200.
+ doc_type (str, optional): Document type for cost tracking labels.
 
  Returns:
  The processed document.
@@ -72,11 +80,12 @@
  # Where to write results
  output_config = documentai.DocumentOutputConfig(gcs_output_config=gcs_output_config)
 
- # The full resource name of the processor
+ # The full resource name of the processor with labels for cost tracking
  request = documentai.BatchProcessRequest(
  name=processor_name,
  input_documents=input_config,
  document_output_config=output_config,
+ labels=get_gcp_labels(doc_type=doc_type),
  )
 
  # BatchProcess returns a Long Running Operation (LRO)
@@ -130,8 +139,10 @@
  )
 
  # Delete the temporary file and the output file from the bucket
- delete_folder_from_bucket(params["doc_ai_bucket_batch_input"], "temp.pdf")
- delete_folder_from_bucket(output_bucket, output_prefix)
+ delete_folder_from_bucket(
+ params, params["doc_ai_bucket_batch_input"], "temp.pdf"
+ )
+ delete_folder_from_bucket(params, output_bucket, output_prefix)
  logger.info("Batch Process Completed!")
 
  return result_document
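Both entry points now accept doc_type and attach it to the request via get_gcp_labels, so Document AI spend can be attributed per document type. A sketch of a call under the new signature (the processor path is illustrative; real IDs come from docai_processor_config.yaml):

    import asyncio

    from google.cloud import documentai

    from src.docai import _process_pdf_w_docai

    async def process(pdf_bytes):
        client = documentai.DocumentProcessorServiceAsyncClient()
        processor_name = (
            "projects/738250249861/locations/eu/processors/3c280b11bdb3ed89"
        )
        # The resulting ProcessRequest carries labels such as
        # {"ds-project-name": ..., "ds-env": ..., "doc_type": "bookingconfirmation"}
        return await _process_pdf_w_docai(
            pdf_bytes, client, processor_name, doc_type="bookingConfirmation"
        )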
src/docai_processor_config.yaml CHANGED
@@ -13,70 +13,6 @@ model_config:
  author: "igor.tonko@forto.com"
  created_date: ""
 
- packingList:
- - id: "d967005bd9d45aeb"
- details:
- display_name: "doc_cap_packingList"
- author: "kumar.rajendrababu@forto.com"
- created_date: ""
-
- commercialInvoice:
- - id: "7d37236207f75758"
- details:
- display_name: "doc_cap_commercialInvoice"
- author: "kumar.rajendrababu@forto.com"
- created_date: ""
-
- finalMbL:
- - id: "1eda2f22d64b1b89"
- details:
- display_name: "doc_cap_finalMbL"
- author: "igor.tonko@forto.com"
- created_date: ""
-
- draftMbl:
- - id: "1eda2f22d64b1b89"
- details:
- display_name: "doc_cap_finalMbL"
- author: "igor.tonko@forto.com"
- created_date: ""
-
- shippingInstruction:
- - id: "c77a0a515d99a8ba"
- details:
- display_name: "doc_cap_shippingInstruction"
- author: "kumar.rajendrababu@forto.com"
- created_date: ""
-
-
- arrivalNotice:
- - id: "748b2e2b9161dcf3"
- details:
- display_name: "doc_cap_arrivalNotice"
- author: "osman.demirel@forto.com"
- created_date: ""
-
- customsAssessment:
- - id: "c464a18d82fad9be"
- details:
- display_name: "doc_cap_customsAssessment"
- author: "igor.tonko@forto.com"
- created_date: ""
-
- deliveryOrder:
- - id: "2245a72c7a5dbf5f"
- details:
- display_name: "doc_cap_releaseNote"
- author: "igor.tonko@forto.com"
- created_date: ""
-
- partnerInvoice:
- - id: "17d103181e745a05"
- details:
- display_name: "doc_cap_partnerInvoice"
- author: "osman.demirel@forto.com"
- created_date: ""
-
  beta:
  bookingConfirmation:
  - id: "3c280b11bdb3ed89"
src/excel_processing.py CHANGED
@@ -2,21 +2,25 @@
  # flake8: noqa: E402
  import logging
 
+ from ddtrace import tracer
+
  logger = logging.getLogger(__name__)
 
  import asyncio
- import json
 
  import numpy as np
  import pandas as pd
 
- from src.llm import prompt_excel_extraction
- from src.utils import generate_schema_structure, get_excel_sheets
+ from src.prompts.prompt_library import prompt_library
+ from src.utils import estimate_page_count, get_excel_sheets
 
 
- async def extract_data_from_sheet(params, sheet_name, sheet, response_schema):
+ async def extract_data_from_sheet(
+ llm_client, sheet_name, sheet, response_schema, doc_type=None
+ ):
  logger.info(f"Processing sheet: {sheet_name}")
- excel_content = pd.DataFrame(sheet.values)
+ excel_content = pd.DataFrame(sheet.values).dropna(how="all", axis=1)
+
  # Convert to Markdown format for the LLM model
  worksheet = (
  "This is from a excel. Pay attention to the cell position:\n"
@@ -24,12 +28,16 @@ async def extract_data_from_sheet(params, sheet_name, sheet, response_schema):
  )
 
  # Prompt for the LLM JSON
- prompt_docai = prompt_excel_extraction(worksheet)
+ prompt = prompt_library.library[doc_type]["other"]["prompt"]
+
+ # Join the worksheet content with the prompt
+ prompt = worksheet + "\n" + prompt
 
  try:
- result = await params["LlmClient"].get_unified_json_genai(
- prompt_docai,
+ result = await llm_client.get_unified_json_genai(
+ prompt,
  response_schema=response_schema,
+ doc_type=doc_type,
  )
  except Exception as e:
  result = {}
@@ -42,8 +50,8 @@ async def extract_data_from_excel(
  params,
  input_doc_type,
  file_content,
- schema_client,
  mime_type,
+ llm_client,
  ):
  """Extract data from the Excel file.
 
@@ -51,8 +59,8 @@
  params (dict): Parameters for the data extraction process.
  input_doc_type (str): The type of the document.
  file_content (bytes): The content of the Excel file to process.
- schema_client (DocumentSchemaClient): Client for the Document AI schema.
  mime_type (str): The MIME type of the file.
+ llm_client: The LLM client to use for data extraction.
 
  Returns:
  formatted_data (list): A list of dictionaries containing the extracted data.
@@ -61,20 +69,31 @@
  """
  # Generate the response structure
- response_schema = generate_schema_structure(params, input_doc_type)
+ response_schema = prompt_library.library[input_doc_type]["other"]["placeholders"]
 
  # Load the Excel file and get ONLY the "visible" sheet names
  sheets, workbook = get_excel_sheets(file_content, mime_type)
 
+ # Track the number of sheets in dd-trace
+ span = tracer.current_span()
+ if span:
+ estimated_page_counts = [
+ estimate_page_count(workbook[sheet]) for sheet in sheets
+ ]
+ est_page_count = sum(estimated_page_counts)
+ span.set_metric("est_page_count", est_page_count)
+
  # Excel files may contain multiple sheets. Extract data from each sheet
  sheet_extract_tasks = [
  extract_data_from_sheet(
- params, sheet_name, workbook[sheet_name], response_schema
+ llm_client,
+ sheet_name,
+ workbook[sheet_name],
+ response_schema,
+ doc_type=input_doc_type,
  )
  for sheet_name in sheets
  ]
  extracted_data = {k: v for k, v in await asyncio.gather(*sheet_extract_tasks)}
 
- stored_data = json.dumps(extracted_data)
-
- return extracted_data, stored_data, params["gemini_params"]["model_id"]
+ return extracted_data, extracted_data, llm_client.model_id
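extract_data_from_sheet evidently returns a (sheet_name, result) pair, which is what lets the gathered results be collected directly into a dict keyed by sheet. The fan-out pattern in isolation (a stub coroutine stands in for the real extractor):

    import asyncio

    async def extract(sheet_name):
        # stand-in for extract_data_from_sheet: returns a (key, value) pair
        return sheet_name, {"rows": []}

    async def extract_all(sheets):
        tasks = [extract(name) for name in sheets]
        # asyncio.gather preserves order; the comprehension keys results by name
        return {k: v for k, v in await asyncio.gather(*tasks)}

    print(asyncio.run(extract_all(["Sheet1", "Summary"])))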
src/io.py CHANGED
@@ -12,13 +12,55 @@ from pathlib import Path
  from google.cloud import bigquery, storage
 
 
+ def get_gcp_labels(**extra_labels):
+ """Generate standardized GCP labels for cost tracking.
+
+ Args:
+ **extra_labels: Additional custom labels
+
+ Returns:
+ dict: Labels dictionary with keys normalized (lowercase, hyphens, max 63 chars)
+ """
+ project_name = os.getenv("PROJECT_NAME")
+
+ # If not set, detect once and cache it
+ if not project_name:
+ # Try pyproject.toml first
+ try:
+ import toml
+
+ pyproject_path = Path(__file__).parent.parent / "pyproject.toml"
+ if pyproject_path.exists():
+ config = toml.load(pyproject_path)
+ project_name = config.get("tool", {}).get("poetry", {}).get("name")
+ except Exception:
+ pass
+
+ # Fallback to unknown
+ if not project_name:
+ project_name = "unknown"
+
+ # Cache it
+ os.environ["PROJECT_NAME"] = project_name
+
+ labels = {
+ "ds-project-name": project_name.lower(),
+ "ds-env": os.getenv("CLUSTER", "local").lower(),
+ }
+
+ # Add any extra labels
+ labels.update({k.lower(): str(v).lower() for k, v in extra_labels.items()})
+
+ return labels
+
+
  def get_bq_client(params):
  """Get Google BigQuery client."""
- bq_client = bigquery.Client(project=params["bq_project_id"])
+ bq_client = bigquery.Client(project=params["g_ai_project_name"])
  job_config = bigquery.QueryJobConfig(
  allow_large_results=True,
  # flatten_results=True,
- labels={"project-name": params["project_name"]},
+ labels=get_gcp_labels(),
  )
  return bq_client, job_config
 
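get_gcp_labels lowercases every key and stringifies every value, so arbitrary keyword arguments come out as valid GCP label pairs. With no PROJECT_NAME or CLUSTER environment variables set and the name resolved from pyproject.toml, a call would yield roughly:

    labels = get_gcp_labels(doc_type="partnerInvoice")
    # {'ds-project-name': 'data-science-document-ai',
    #  'ds-env': 'local',
    #  'doc_type': 'partnerinvoice'}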
@@ -41,7 +83,7 @@ def upload_pdf_to_bucket(params, content, file_name):
  temp_file.write(content)
 
  # Upload the temporary file to the bucket
- client = storage.Client(project=params["g_ai_project_name"])
+ client = storage.Client(project=params["doc_ai_bucket_project_name"])
  bucket = client.bucket(params["doc_ai_bucket_batch_input"])
 
  blob = bucket.blob(file_name)
@@ -60,7 +102,7 @@
  return None, None
 
 
- def delete_folder_from_bucket(bucket_name, folder_name):
+ def delete_folder_from_bucket(params, bucket_name, folder_name):
  """Delete a folder (prefix) and its contents from a GCS bucket.
 
  Args:
@@ -68,7 +110,7 @@
  folder_name (str): Name of the folder (prefix) to delete.
  """
  try:
- client = storage.Client()
+ client = storage.Client(project=params["doc_ai_bucket_project_name"])
  bucket = client.get_bucket(bucket_name)
 
  # List all objects with the given prefix (folder name)
@@ -86,7 +128,7 @@
 
  def get_storage_client(params) -> storage.Client:
  """Get Google Storage client."""
- return storage.Client(project=params["g_ai_project_name"])
+ return storage.Client(project=params["doc_ai_bucket_project_name"])
 
 
  def download_dir_from_bucket(bucket, directory_cloud, directory_local) -> bool:
@@ -112,3 +154,29 @@
  Path(directory).mkdir(parents=True, exist_ok=True)
  blob.download_to_filename(directory_local / Path(blob.name))
  return result
+
+
+ def bq_logs(data_to_insert, params):
+ """Insert logs into Google BigQuery.
+
+ Args:
+ data_to_insert (list): The data to insert into BigQuery.
+ params (dict): The parameters dictionary.
+ """
+ # Use the pre-initialized BigQuery client
+ bq_client = params["bq_client"]
+ # Get the table string
+ table_string = f"{params['g_ai_project_name']}.{params['g_ai_gbq_db_schema']}.{params['g_ai_gbq_db_table_out']}"
+
+ logger.info(f"Log table: {table_string}")
+ # Insert the rows into the table
+ insert_logs = bq_client.insert_rows_json(table_string, data_to_insert)
+
+ # Check if there were any errors inserting the rows
+ if not insert_logs:
+ logger.info("New rows have been added.")
+ else:
+ logger.info("Errors occurred while inserting rows: ", insert_logs)
+
+
+ # type: ignore
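bq_logs assumes a pre-initialized client under params["bq_client"]; a sketch of wiring it up with get_bq_client (the row fields are illustrative, not the actual document_ai_api_calls_v1 schema):

    params["bq_client"], _ = get_bq_client(params)
    bq_logs(
        [{"doc_type": "partnerInvoice", "model_id": "gemini-2.5-pro"}],
        params,
    )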