data-science-document-ai 1.40.3__py3-none-any.whl → 1.51.0__py3-none-any.whl

This diff shows the differences between two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
Files changed (56)
  1. {data_science_document_ai-1.40.3.dist-info → data_science_document_ai-1.51.0.dist-info}/METADATA +2 -2
  2. data_science_document_ai-1.51.0.dist-info/RECORD +60 -0
  3. src/constants.py +6 -10
  4. src/docai.py +14 -5
  5. src/docai_processor_config.yaml +0 -56
  6. src/excel_processing.py +34 -13
  7. src/io.py +69 -1
  8. src/llm.py +10 -32
  9. src/pdf_processing.py +192 -54
  10. src/postprocessing/common.py +246 -44
  11. src/postprocessing/postprocess_partner_invoice.py +139 -85
  12. src/prompts/library/arrivalNotice/other/placeholders.json +70 -0
  13. src/prompts/library/arrivalNotice/other/prompt.txt +40 -0
  14. src/prompts/library/bookingConfirmation/evergreen/placeholders.json +17 -17
  15. src/prompts/library/bookingConfirmation/evergreen/prompt.txt +1 -0
  16. src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +18 -18
  17. src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +1 -1
  18. src/prompts/library/bookingConfirmation/maersk/placeholders.json +17 -17
  19. src/prompts/library/bookingConfirmation/maersk/prompt.txt +1 -1
  20. src/prompts/library/bookingConfirmation/msc/placeholders.json +17 -17
  21. src/prompts/library/bookingConfirmation/msc/prompt.txt +1 -1
  22. src/prompts/library/bookingConfirmation/oocl/placeholders.json +17 -17
  23. src/prompts/library/bookingConfirmation/oocl/prompt.txt +3 -1
  24. src/prompts/library/bookingConfirmation/other/placeholders.json +17 -17
  25. src/prompts/library/bookingConfirmation/other/prompt.txt +1 -1
  26. src/prompts/library/bookingConfirmation/yangming/placeholders.json +17 -17
  27. src/prompts/library/bookingConfirmation/yangming/prompt.txt +1 -1
  28. src/prompts/library/bundeskasse/other/placeholders.json +25 -25
  29. src/prompts/library/bundeskasse/other/prompt.txt +8 -6
  30. src/prompts/library/commercialInvoice/other/placeholders.json +125 -0
  31. src/prompts/library/commercialInvoice/other/prompt.txt +2 -1
  32. src/prompts/library/customsAssessment/other/placeholders.json +67 -16
  33. src/prompts/library/customsAssessment/other/prompt.txt +24 -37
  34. src/prompts/library/customsInvoice/other/placeholders.json +20 -20
  35. src/prompts/library/customsInvoice/other/prompt.txt +4 -4
  36. src/prompts/library/deliveryOrder/other/placeholders.json +79 -28
  37. src/prompts/library/deliveryOrder/other/prompt.txt +26 -40
  38. src/prompts/library/draftMbl/other/placeholders.json +33 -33
  39. src/prompts/library/draftMbl/other/prompt.txt +34 -44
  40. src/prompts/library/finalMbL/other/placeholders.json +34 -34
  41. src/prompts/library/finalMbL/other/prompt.txt +34 -44
  42. src/prompts/library/packingList/other/placeholders.json +98 -0
  43. src/prompts/library/packingList/other/prompt.txt +1 -1
  44. src/prompts/library/partnerInvoice/other/placeholders.json +2 -23
  45. src/prompts/library/partnerInvoice/other/prompt.txt +7 -18
  46. src/prompts/library/preprocessing/carrier/placeholders.json +0 -16
  47. src/prompts/library/shippingInstruction/other/placeholders.json +115 -0
  48. src/prompts/library/shippingInstruction/other/prompt.txt +28 -15
  49. src/setup.py +13 -16
  50. src/utils.py +157 -45
  51. data_science_document_ai-1.40.3.dist-info/RECORD +0 -59
  52. src/prompts/library/draftMbl/hapag-lloyd/prompt.txt +0 -44
  53. src/prompts/library/draftMbl/maersk/prompt.txt +0 -17
  54. src/prompts/library/finalMbL/hapag-lloyd/prompt.txt +0 -44
  55. src/prompts/library/finalMbL/maersk/prompt.txt +0 -17
  56. {data_science_document_ai-1.40.3.dist-info → data_science_document_ai-1.51.0.dist-info}/WHEEL +0 -0
{data_science_document_ai-1.40.3.dist-info → data_science_document_ai-1.51.0.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: data-science-document-ai
- Version: 1.40.3
+ Version: 1.51.0
  Summary: "Document AI repo for data science"
  Author: Naomi Nguyen
  Author-email: naomi.nguyen@forto.com
@@ -38,7 +38,7 @@ Requires-Dist: pdf2image (>=1.17.0,<2.0.0)
  Requires-Dist: pgzip (>=0.3.5,<0.4.0)
  Requires-Dist: pyarrow (==16.1.0)
  Requires-Dist: pymupdf (>=1.23.26,<2.0.0)
- Requires-Dist: pypdf2 (>=3.0.1,<4.0.0)
+ Requires-Dist: pypdf (>=6.1.2,<7.0.0)
  Requires-Dist: python-dotenv (>=1.0.0,<2.0.0)
  Requires-Dist: python-multipart (>=0.0.7,<0.0.8)
  Requires-Dist: rapidfuzz (>=3.12.2,<4.0.0)
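The most consequential dependency change here is the swap of the long-unmaintained `pypdf2` for its successor `pypdf`, which renames the import downstream (the package's call sites live in `src/pdf_processing.py`, changed in this release but not shown in this section). A minimal sketch of what the migration looks like, assuming straightforward reader usage:

```python
# Before, with PyPDF2 (now unmaintained):
# from PyPDF2 import PdfReader

# After, with pypdf (pinned here to >=6.1.2,<7.0.0):
from io import BytesIO

from pypdf import PdfReader

def page_count(pdf_bytes: bytes) -> int:
    """Return the number of pages in an in-memory PDF."""
    return len(PdfReader(BytesIO(pdf_bytes)).pages)
```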
data_science_document_ai-1.51.0.dist-info/RECORD ADDED
@@ -0,0 +1,60 @@
+ src/constants.py,sha256=k5bBnJN-kmXiAtIAlz6Kg6fDyR9n0DuIudCZ9ZHO_Jw,3528
+ src/constants_sandbox.py,sha256=Iu6HdjCoNSmOX0AwoL9qUQkhq_ZnIN5U9e-Q2UfNuGc,547
+ src/docai.py,sha256=dHuR0ehVjUi1CnoNvdp_yxJtpU_HFXqAZ61ywdz7BEo,5655
+ src/docai_processor_config.yaml,sha256=4yKKZPvFCA-3S56jDYSqMGKXGFND-768OiU2seRiAzE,604
+ src/excel_processing.py,sha256=TRgAzSHvL1WKbUgjHtpXL701bPhiWGH7kk3S6e1UPaA,3074
+ src/io.py,sha256=rYjXVLlriEacw1uNuPIYhg12bXNu48Qs9GYMY2YcVTE,5563
+ src/llm.py,sha256=a7UYA4ITUNjzct_2fHgM-bma_XWc28VC0FV71g9tnUI,7137
+ src/log_setup.py,sha256=RhHnpXqcl-ii4EJzRt47CF2R-Q3YPF68tepg_Kg7tkw,2895
+ src/pdf_processing.py,sha256=oKVPnIu_keiN17XLOGImeyJ4iMT2H51x4OD1Tp9yw1s,19992
+ src/postprocessing/common.py,sha256=dagAg0hZGuZc03bXdfOolxekewMEVUfz917IGCiAtWI,26118
+ src/postprocessing/postprocess_booking_confirmation.py,sha256=nK32eDiBNbauyQz0oCa9eraysku8aqzrcoRFoWVumDU,4827
+ src/postprocessing/postprocess_commercial_invoice.py,sha256=3I8ijluTZcOs_sMnFZxfkAPle0UFQ239EMuvZfDZVPg,1028
+ src/postprocessing/postprocess_partner_invoice.py,sha256=Fv4Y6Lc8e6aFFcwX0kLOal2y4TrR-XfAzjtuQnBwo0o,12815
+ src/prompts/library/arrivalNotice/other/placeholders.json,sha256=1vzly1amgyKt3jr2JJQbb24kNZsnI289iduvoUo5dJU,3061
+ src/prompts/library/arrivalNotice/other/prompt.txt,sha256=QNuU-BvMA8VbdupVNapad4O3WmCotH5cKNxImRMbKDk,2906
+ src/prompts/library/bookingConfirmation/evergreen/placeholders.json,sha256=IpM9nmSPdyroliZfXB1-NDCjiHZX_Ff5BH7-scNhGqE,1406
+ src/prompts/library/bookingConfirmation/evergreen/prompt.txt,sha256=5ivskCG831M2scW3oqQaoltXIyHV-n6DYUygWycXxjw,2755
+ src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json,sha256=hMPNt9s3LuxR85AxYy7bPcCDleug6gSwVjefm3ismWY,1405
+ src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt,sha256=XgfhrFTXLJ467L4Cer77K0KTPtWTg_-QJXCsltvLlpI,3430
+ src/prompts/library/bookingConfirmation/maersk/placeholders.json,sha256=6p_IQMA1PUgGZqjf_by4ja9jK27ba4loYhEpIa7Oxx4,1406
+ src/prompts/library/bookingConfirmation/maersk/prompt.txt,sha256=t-yh1dOrcRa0fm0VPFC1xCRBf0R0Zjp9j_Hb31aZS1w,3223
+ src/prompts/library/bookingConfirmation/msc/placeholders.json,sha256=IpM9nmSPdyroliZfXB1-NDCjiHZX_Ff5BH7-scNhGqE,1406
+ src/prompts/library/bookingConfirmation/msc/prompt.txt,sha256=_Jfioislp7SNs2BEXoklvnTPVXe6Z0M6myD1IWnBFYQ,4705
+ src/prompts/library/bookingConfirmation/oocl/placeholders.json,sha256=JTtWvLSsoxN7huXY8ZNqqPkODM-DOs5wu3YvNHOna3k,1404
+ src/prompts/library/bookingConfirmation/oocl/prompt.txt,sha256=xNTrJdUtDalcP3AKkfRiOnHjAdRCbcTvehcBQKurRj0,2201
+ src/prompts/library/bookingConfirmation/other/placeholders.json,sha256=IpM9nmSPdyroliZfXB1-NDCjiHZX_Ff5BH7-scNhGqE,1406
+ src/prompts/library/bookingConfirmation/other/prompt.txt,sha256=kUK7NgVNDYFMnqOcIblCwWSw2SC0YQEtHsYrspiVUMo,3379
+ src/prompts/library/bookingConfirmation/yangming/placeholders.json,sha256=IpM9nmSPdyroliZfXB1-NDCjiHZX_Ff5BH7-scNhGqE,1406
+ src/prompts/library/bookingConfirmation/yangming/prompt.txt,sha256=fYKfusDajDFw0v54-nv2iAqUSp2yCeOzc6G7AFe-h2w,3226
+ src/prompts/library/bundeskasse/other/placeholders.json,sha256=7xKzi_ypkIICO9nrEl45W9G7-h33uWVRVWnpg2b5lUg,4288
+ src/prompts/library/bundeskasse/other/prompt.txt,sha256=miNYoqRZEd6Z1LNisTahX1-tenzr5kEpRA6gvPH7NCw,3316
+ src/prompts/library/commercialInvoice/other/placeholders.json,sha256=zUK2mg9MnHiEQRYF6VgTiUiq68WGy5f7_4qL63CWyR0,4700
+ src/prompts/library/commercialInvoice/other/prompt.txt,sha256=CJapcVrmcvynJUanETDklkzU-0N9hHdhq5wL4MK7OIY,2683
+ src/prompts/library/customsAssessment/other/placeholders.json,sha256=scIV--C9HNWAQbU9zEz3GT_FoAvJqbfuY85YUtt7t-Q,3850
+ src/prompts/library/customsAssessment/other/prompt.txt,sha256=z3FuoHZ588Pz1WBJDW7ISAC3J6n7hPJCcS92CdHDTFw,2494
+ src/prompts/library/customsInvoice/other/placeholders.json,sha256=BnWYtl4sPooTHb_EHRIlrPawBrfHI8_QVas8zytbqyY,12172
+ src/prompts/library/customsInvoice/other/prompt.txt,sha256=1dR73TQZJAfO9dKl-h7VhiJkdli498IV4e5JgBlOoYw,9695
+ src/prompts/library/deliveryOrder/other/placeholders.json,sha256=j-9F4V3yDg4610PPsOwU3oOj_S9vAvAB9Ix155WGIwc,3827
+ src/prompts/library/deliveryOrder/other/prompt.txt,sha256=RD076vq0x0IjoEVQfh-G0u4nxITCpgKZGrwMlR9YAvk,2695
+ src/prompts/library/draftMbl/other/placeholders.json,sha256=Gn8kQ8cMmrzRGLSFH7_8wO1_j2jxhqHd4zeivZP2SjU,4304
+ src/prompts/library/draftMbl/other/prompt.txt,sha256=4RjlGT2OFmcBCUJhuCnO9GtmCn3vVesXHi_ml2g3dK8,2386
+ src/prompts/library/finalMbL/other/placeholders.json,sha256=Gn8kQ8cMmrzRGLSFH7_8wO1_j2jxhqHd4zeivZP2SjU,4304
+ src/prompts/library/finalMbL/other/prompt.txt,sha256=cyeKjK94sepqXiLEeZKB4VpmT0-nqXALP4dih-B67M8,2386
+ src/prompts/library/packingList/other/placeholders.json,sha256=cGUUvEFoi4Lm0BAiyD29KbNFbUgzO1s7eit_qK3F0ig,4478
+ src/prompts/library/packingList/other/prompt.txt,sha256=6Q9d0KBG6YWmNtzFivvmtQmitaUE2jytfwwc5YwsUgQ,2872
+ src/prompts/library/partnerInvoice/other/placeholders.json,sha256=NX6ADT4gxLpP90uoNCYDbmfBvROxxVWRKK0lRFy1n9s,10897
+ src/prompts/library/partnerInvoice/other/prompt.txt,sha256=vMk-FBq9XkWiFiCf36t43DcIKNYh7IcGAsnfXq8vqio,8052
+ src/prompts/library/postprocessing/port_code/placeholders.json,sha256=2TiXf3zSzrglOMPtDOlCntIa5RSvyZQAKG2-IgrCY5A,22
+ src/prompts/library/postprocessing/port_code/prompt_port_code.txt,sha256=--1wunSqEr2ox958lEhjO-0JFBfOLzA3qfKYIzG_Iok,884
+ src/prompts/library/preprocessing/carrier/placeholders.json,sha256=tQeVDtvembhVqvel9vGoy4qcKp1hOvg-bLCgZRdQj0g,192
+ src/prompts/library/preprocessing/carrier/prompt.txt,sha256=NLvRZQCZ6aWC1yTr7Q93jK5z7Vi_b4HBaiFYYnIsO-w,134
+ src/prompts/library/shippingInstruction/other/placeholders.json,sha256=eK4AeMfORkGMWVYcqH7NjB56Zb4swHTvcQD5UQbTryg,6374
+ src/prompts/library/shippingInstruction/other/prompt.txt,sha256=CbrqlKMtB-sVY-8E460KP1KNmz169YVPMrH3-uEldPg,2135
+ src/prompts/prompt_library.py,sha256=VJWHeXN-s501C2GiidIIvQQuZdU6T1R27hE2dKBiI40,2555
+ src/setup.py,sha256=EHfAl3Pvb082dl_s6Tk93IjtE3vBmrW_fp2GW4955HQ,6952
+ src/tms.py,sha256=UXbIo1QE--hIX6NZi5Qyp2R_CP338syrY9pCTPrfgnE,1741
+ src/utils.py,sha256=Ow5_Jals88o8mbZ1BoHfZpHZoCfig_UQb5aalH-mpWE,17278
+ data_science_document_ai-1.51.0.dist-info/METADATA,sha256=mQBXhk_NZlceIozn434C-du7ESoNGyhgGJxQRZYYLNs,2152
+ data_science_document_ai-1.51.0.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
+ data_science_document_ai-1.51.0.dist-info/RECORD,,
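Each RECORD entry has the form `path,sha256=<digest>,<size>`, where the digest is the unpadded URL-safe base64 encoding of the file's SHA-256 hash, per the wheel spec. A small helper to verify an installed file against its entry:

```python
import base64
import hashlib

def record_hash(path: str) -> str:
    """RECORD-style digest: unpadded urlsafe-base64 of the file's SHA-256."""
    with open(path, "rb") as fh:
        digest = hashlib.sha256(fh.read()).digest()
    return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode()

# record_hash("src/constants.py") should reproduce the first entry above:
# "sha256=k5bBnJN-kmXiAtIAlz6Kg6fDyR9n0DuIudCZ9ZHO_Jw"
```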
src/constants.py CHANGED
@@ -23,9 +23,12 @@ project_parameters = {
      "invoice_classification_lookup": "invoice_classification.json",
      "reverse_charge_sentence_lookup": "reverse_charge_sentences.json",
      # Fuzzy logic params
-     "fuzzy_threshold_item_code": 70,
+     "fuzzy_threshold_item_code": 90,
      "fuzzy_threshold_reverse_charge": 80,
      "fuzzy_threshold_invoice_classification": 70,
+     # Chunking params
+     "chunk_size": 1,  # page (do not change this without changing the page number logic)
+     "chunk_after": 10,  # pages
      # Big Query
      "g_ai_gbq_db_schema": "document_ai",
      "g_ai_gbq_db_table_out": "document_ai_api_calls_v1",
@@ -50,15 +53,6 @@ project_parameters = {
      "model_selector": {
          "stable": {
              "bookingConfirmation": 1,
-             "packingList": 0,
-             "commercialInvoice": 0,
-             "finalMbL": 0,
-             "draftMbl": 0,
-             "arrivalNotice": 0,
-             "shippingInstruction": 0,
-             "customsAssessment": 0,
-             "deliveryOrder": 0,
-             "partnerInvoice": 0,
          },
          "beta": {
              "bookingConfirmation": 0,
@@ -86,8 +80,10 @@ project_parameters = {
      # Key to combine the LLM results with the Doc Ai results
      "key_to_combine": {
          "bookingConfirmation": ["transportLegs"],
+         "arrivalNotice": ["containers"],
          "finalMbL": ["containers"],
          "draftMbl": ["containers"],
+         "deliveryOrder": ["Equipment", "TransportLeg"],
          "customsAssessment": ["containers"],
          "packingList": ["skuData"],
          "commercialInvoice": ["skus"],
src/docai.py CHANGED
@@ -3,11 +3,16 @@ import re
 
  from google.cloud import documentai
 
- from src.io import delete_folder_from_bucket, logger, upload_pdf_to_bucket
+ from src.io import (
+     delete_folder_from_bucket,
+     get_gcp_labels,
+     logger,
+     upload_pdf_to_bucket,
+ )
  from src.utils import cache_on_disk
 
 
- async def _process_pdf_w_docai(image_content, client, processor_name):
+ async def _process_pdf_w_docai(image_content, client, processor_name, doc_type=None):
      """Process the PDF using Document AI.
 
      Args:
@@ -15,6 +20,7 @@ async def _process_pdf_w_docai(image_content, client, processor_name):
          client: The Document AI client.
          processor_name (str): The name of the processor to be used.
              e.g.: projects/{project_id}/locations/{location}/processor/{processor_id}
+         doc_type (str, optional): Document type for cost tracking labels.
 
      Returns:
          The processed document.
@@ -24,10 +30,11 @@ async def _process_pdf_w_docai(image_content, client, processor_name):
          content=image_content, mime_type="application/pdf"
      )
 
-     # Configure the process request
+     # Configure the process request with labels for cost tracking
      request = documentai.ProcessRequest(
          name=processor_name,
          raw_document=raw_document,  # field_mask=field_mask
+         labels=get_gcp_labels(doc_type=doc_type),
      )
      result = await cache_on_disk(client.process_document, request=request)
 
@@ -35,7 +42,7 @@ async def _process_pdf_w_docai(image_content, client, processor_name):
 
 
  async def _batch_process_pdf_w_docai(
-     params, image_content, client, processor_name, timeout=1200
+     params, image_content, client, processor_name, timeout=1200, doc_type=None
  ):
      """Process the PDF using Document AI Batch Process API.
 
@@ -45,6 +52,7 @@ async def _batch_process_pdf_w_docai(
          processor_name (str): The name of the processor to be used.
              e.g.: projects/{project_id}/locations/{location}/processor/{processor_id}
          timeout (int, optional): The timeout in seconds. Defaults to 1200.
+         doc_type (str, optional): Document type for cost tracking labels.
 
      Returns:
          The processed document.
@@ -72,11 +80,12 @@ async def _batch_process_pdf_w_docai(
      # Where to write results
      output_config = documentai.DocumentOutputConfig(gcs_output_config=gcs_output_config)
 
-     # The full resource name of the processor
+     # The full resource name of the processor with labels for cost tracking
      request = documentai.BatchProcessRequest(
          name=processor_name,
          input_documents=input_config,
          document_output_config=output_config,
+         labels=get_gcp_labels(doc_type=doc_type),
      )
 
      # BatchProcess returns a Long Running Operation (LRO)
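With these two changes, both the synchronous and batch paths attach the same cost-tracking labels to every Document AI request, so spend can be broken down by document type in GCP billing. A hedged usage sketch (the client construction, processor name, and input file are placeholders, and the helper is assumed to return the parsed document):

```python
import asyncio

from google.cloud import documentai

from src.docai import _process_pdf_w_docai
from src.io import get_gcp_labels

async def main():
    client = documentai.DocumentProcessorServiceAsyncClient()
    processor_name = "projects/my-project/locations/eu/processors/abc123"  # placeholder

    with open("invoice.pdf", "rb") as fh:
        pdf_bytes = fh.read()

    # The labels attached to the request look roughly like this
    # (see get_gcp_labels in src/io.py below):
    print(get_gcp_labels(doc_type="partnerInvoice"))
    # {'ds-project-name': ..., 'ds-env': 'local', 'doc_type': 'partnerinvoice'}

    document = await _process_pdf_w_docai(
        pdf_bytes, client, processor_name, doc_type="partnerInvoice"
    )
    print(document)

asyncio.run(main())
```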
src/docai_processor_config.yaml CHANGED
@@ -13,62 +13,6 @@ model_config:
            author: "igor.tonko@forto.com"
            created_date: ""
 
-     packingList:
-       - id: "d967005bd9d45aeb"
-         details:
-           display_name: "doc_cap_packingList"
-           author: "kumar.rajendrababu@forto.com"
-           created_date: ""
-
-     commercialInvoice:
-       - id: "7d37236207f75758"
-         details:
-           display_name: "doc_cap_commercialInvoice"
-           author: "kumar.rajendrababu@forto.com"
-           created_date: ""
-
-     finalMbL:
-       - id: "1eda2f22d64b1b89"
-         details:
-           display_name: "doc_cap_finalMbL"
-           author: "igor.tonko@forto.com"
-           created_date: ""
-
-     draftMbl:
-       - id: "1eda2f22d64b1b89"
-         details:
-           display_name: "doc_cap_finalMbL"
-           author: "igor.tonko@forto.com"
-           created_date: ""
-
-     shippingInstruction:
-       - id: "c77a0a515d99a8ba"
-         details:
-           display_name: "doc_cap_shippingInstruction"
-           author: "kumar.rajendrababu@forto.com"
-           created_date: ""
-
-     arrivalNotice:
-       - id: "748b2e2b9161dcf3"
-         details:
-           display_name: "doc_cap_arrivalNotice"
-           author: "osman.demirel@forto.com"
-           created_date: ""
-
-     customsAssessment:
-       - id: "c464a18d82fad9be"
-         details:
-           display_name: "doc_cap_customsAssessment"
-           author: "igor.tonko@forto.com"
-           created_date: ""
-
-     deliveryOrder:
-       - id: "2245a72c7a5dbf5f"
-         details:
-           display_name: "doc_cap_releaseNote"
-           author: "igor.tonko@forto.com"
-           created_date: ""
-
    beta:
      bookingConfirmation:
        - id: "3c280b11bdb3ed89"
src/excel_processing.py CHANGED
@@ -2,21 +2,25 @@
  # flake8: noqa: E402
  import logging
 
+ from ddtrace import tracer
+
  logger = logging.getLogger(__name__)
 
  import asyncio
- import json
 
  import numpy as np
  import pandas as pd
 
- from src.llm import prompt_excel_extraction
- from src.utils import generate_schema_structure, get_excel_sheets
+ from src.prompts.prompt_library import prompt_library
+ from src.utils import estimate_page_count, get_excel_sheets
 
 
- async def extract_data_from_sheet(params, sheet_name, sheet, response_schema):
+ async def extract_data_from_sheet(
+     llm_client, sheet_name, sheet, response_schema, doc_type=None
+ ):
      logger.info(f"Processing sheet: {sheet_name}")
-     excel_content = pd.DataFrame(sheet.values)
+     excel_content = pd.DataFrame(sheet.values).dropna(how="all", axis=1)
+
      # Convert to Markdown format for the LLM model
      worksheet = (
          "This is from a excel. Pay attention to the cell position:\n"
@@ -24,12 +28,16 @@ async def extract_data_from_sheet(params, sheet_name, sheet, response_schema):
      )
 
      # Prompt for the LLM JSON
-     prompt_docai = prompt_excel_extraction(worksheet)
+     prompt = prompt_library.library[doc_type]["other"]["prompt"]
+
+     # Join the worksheet content with the prompt
+     prompt = worksheet + "\n" + prompt
 
      try:
-         result = await params["LlmClient"].get_unified_json_genai(
-             prompt_docai,
+         result = await llm_client.get_unified_json_genai(
+             prompt,
              response_schema=response_schema,
+             doc_type=doc_type,
          )
      except Exception as e:
          result = {}
@@ -43,6 +51,7 @@ async def extract_data_from_excel(
      input_doc_type,
      file_content,
      mime_type,
+     llm_client,
  ):
      """Extract data from the Excel file.
 
@@ -51,6 +60,7 @@ async def extract_data_from_excel(
          input_doc_type (str): The type of the document.
          file_content (bytes): The content of the Excel file to process.
          mime_type (str): The MIME type of the file.
+         llm_client: The LLM client to use for data extraction.
 
      Returns:
          formatted_data (list): A list of dictionaries containing the extracted data.
@@ -59,20 +69,31 @@
 
      """
      # Generate the response structure
-     response_schema = generate_schema_structure(params, input_doc_type)
+     response_schema = prompt_library.library[input_doc_type]["other"]["placeholders"]
 
      # Load the Excel file and get ONLY the "visible" sheet names
      sheets, workbook = get_excel_sheets(file_content, mime_type)
 
+     # Track the number of sheets in dd-trace
+     span = tracer.current_span()
+     if span:
+         estimated_page_counts = [
+             estimate_page_count(workbook[sheet]) for sheet in sheets
+         ]
+         est_page_count = sum(estimated_page_counts)
+         span.set_metric("est_page_count", est_page_count)
+
      # Excel files may contain multiple sheets. Extract data from each sheet
      sheet_extract_tasks = [
          extract_data_from_sheet(
-             params, sheet_name, workbook[sheet_name], response_schema
+             llm_client,
+             sheet_name,
+             workbook[sheet_name],
+             response_schema,
+             doc_type=input_doc_type,
          )
          for sheet_name in sheets
      ]
      extracted_data = {k: v for k, v in await asyncio.gather(*sheet_extract_tasks)}
 
-     stored_data = json.dumps(extracted_data)
-
-     return extracted_data, stored_data, params["gemini_params"]["model_id"]
+     return extracted_data, extracted_data, llm_client.model_id
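Worth noting how a sheet now reaches the model: the worksheet is rendered as a Markdown-style table and prepended to the doc-type prompt pulled from the prompt library, replacing the old `prompt_excel_extraction` helper. A rough illustration of the payload shape; the exact table-rendering call is not visible in this diff, so `to_markdown()` (which requires `tabulate`) is an assumption:

```python
import pandas as pd

# Toy stand-in for openpyxl's sheet.values; the third column is entirely
# empty and is dropped by .dropna(how="all", axis=1).
rows = [["Item", "Qty", None], ["Widget", 3, None]]
excel_content = pd.DataFrame(rows).dropna(how="all", axis=1)

worksheet = (
    "This is from a excel. Pay attention to the cell position:\n"
    + excel_content.to_markdown()  # assumed rendering; requires `tabulate`
)

# The doc-type prompt is appended after the table, mirroring
# extract_data_from_sheet above (placeholder prompt text here).
prompt = worksheet + "\n" + "<prompt_library.library[doc_type]['other']['prompt']>"
print(prompt)
```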
src/io.py CHANGED
@@ -12,13 +12,55 @@ from pathlib import Path
  from google.cloud import bigquery, storage
 
 
+ def get_gcp_labels(**extra_labels):
+     """Generate standardized GCP labels for cost tracking.
+
+     Args:
+         **extra_labels: Additional custom labels
+
+     Returns:
+         dict: Labels dictionary with keys normalized (lowercase, hyphens, max 63 chars)
+     """
+     project_name = os.getenv("PROJECT_NAME")
+
+     # If not set, detect once and cache it
+     if not project_name:
+         # Try pyproject.toml first
+         try:
+             import toml
+
+             pyproject_path = Path(__file__).parent.parent / "pyproject.toml"
+             if pyproject_path.exists():
+                 config = toml.load(pyproject_path)
+                 project_name = config.get("tool", {}).get("poetry", {}).get("name")
+         except Exception:
+             pass
+
+         # Fallback to unknown
+         if not project_name:
+             project_name = "unknown"
+
+         # Cache it
+         os.environ["PROJECT_NAME"] = project_name
+
+     labels = {
+         "ds-project-name": project_name.lower(),
+         "ds-env": os.getenv("CLUSTER", "local").lower(),
+     }
+
+     # Add any extra labels
+     labels.update({k.lower(): str(v).lower() for k, v in extra_labels.items()})
+
+     return labels
+
+
  def get_bq_client(params):
      """Get Google BigQuery client."""
      bq_client = bigquery.Client(project=params["g_ai_project_name"])
      job_config = bigquery.QueryJobConfig(
          allow_large_results=True,
          # flatten_results=True,
-         labels={"project-name": params["project_name"]},
+         labels=get_gcp_labels(),
      )
      return bq_client, job_config
 
@@ -112,3 +154,29 @@ def download_dir_from_bucket(bucket, directory_cloud, directory_local) -> bool:
          Path(directory).mkdir(parents=True, exist_ok=True)
          blob.download_to_filename(directory_local / Path(blob.name))
      return result
+
+
+ def bq_logs(data_to_insert, params):
+     """Insert logs into Google BigQuery.
+
+     Args:
+         data_to_insert (list): The data to insert into BigQuery.
+         params (dict): The parameters dictionary.
+     """
+     # Use the pre-initialized BigQuery client
+     bq_client = params["bq_client"]
+     # Get the table string
+     table_string = f"{params['g_ai_project_name']}.{params['g_ai_gbq_db_schema']}.{params['g_ai_gbq_db_table_out']}"
+
+     logger.info(f"Log table: {table_string}")
+     # Insert the rows into the table
+     insert_logs = bq_client.insert_rows_json(table_string, data_to_insert)
+
+     # Check if there were any errors inserting the rows
+     if not insert_logs:
+         logger.info("New rows have been added.")
+     else:
+         logger.info("Errors occurred while inserting rows: ", insert_logs)
+
+
+ # type: ignore
src/llm.py CHANGED
@@ -15,6 +15,7 @@ from vertexai.generative_models import (
      Part,
  )
 
+ from src.io import get_gcp_labels
  from src.utils import cache_on_disk
 
 
@@ -69,6 +70,7 @@
          document: str = None,
          response_schema: dict = None,
          response_mime_type: str = "application/json",
+         doc_type: str = None,
      ):
          """Ask the Gemini model a question.
 
@@ -76,6 +78,7 @@
              prompt (str): The prompt to send to the model.
              document (str, optional): An optional document to provide context.
              response_schema (dict, optional): Defines a specific response schema for the model.
+             doc_type (str, optional): Document type for cost tracking labels.
 
          Returns:
              str: The response from the model.
@@ -96,12 +99,13 @@
          # Prepare inputs for the model
          inputs = [document, prompt] if document else prompt
 
-         # Generate the response
+         # Generate the response with labels for cost tracking
          model_response = await cache_on_disk(
              self.geminy_client.generate_content_async,
              contents=inputs,
              generation_config=config,
              safety_settings=self.safety_config,
+             labels=get_gcp_labels(doc_type=doc_type),
          )
 
          response_text = model_response.text
@@ -113,7 +117,7 @@
              return "{}"
 
      async def get_unified_json_genai(
-         self, prompt, document=None, response_schema=None, model="gemini"
+         self, prompt, document=None, response_schema=None, model="gemini", doc_type=None
      ):
          """Send a prompt to a Google Cloud AI Platform model and returns the generated json.
 
@@ -122,6 +126,7 @@
              document: Content of the PDF document
              response_schema: The schema to use for the response
              model (str): The model to use for the response ["gemini" or "chatGPT"]. Default is "gemini".
+             doc_type (str, optional): Document type for cost tracking labels.
 
          Returns:
              dict: The generated json from the model.
@@ -131,7 +136,9 @@
              response = await self.ask_chatgpt(prompt, document, response_schema)
          else:
              # Default to Gemini
-             response = await self.ask_gemini(prompt, document, response_schema)
+             response = await self.ask_gemini(
+                 prompt, document, response_schema, doc_type=doc_type
+             )
 
          try:
              return json.loads(response)
@@ -194,33 +201,4 @@
          return response
 
 
- def prompt_excel_extraction(excel_structured_text):
-     """Write a prompt to extract data from Excel files.
-
-     Args:
-         excel_structured_text (str): The structured text of the Excel file.
-
-     Returns:
-         prompt str: The prompt for common json.
-     """
-     prompt = f"""{excel_structured_text}
-
-     Task: Fill in the following dictionary from the information in the given in the above excel data.
-
-     Instructions:
-     - Do not change the keys of the following dictionary.
-     - The values should be filled in as per the schema provided below.
-     - If an entity contains a 'display_name', consider its properties as child data points in the below format.
-     {{'data-field': {{
-         'child-data-field': 'type -occurrence_type- description',
-     }}
-     }}
-     - The entity with 'display_name' can be extracted multiple times. Please pay attention to the occurrence_type.
-     - Ensure the schema reflects the hierarchical relationship.
-     - Use the data field description to understand the context of the data.
-
-     """
-     return prompt
-
-
  # pylint: enable=all
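Taken together, these changes thread a single `doc_type` argument from the public entry points down to the Vertex AI call, where it becomes a billing label via `get_gcp_labels`. A hedged end-to-end sketch; the `LlmClient` constructor is not shown in this diff, so its arguments are omitted:

```python
import asyncio

from src.llm import LlmClient

async def main():
    llm_client = LlmClient()  # constructor arguments not shown in this diff

    schema = {"type": "object", "properties": {"blNumber": {"type": "string"}}}

    # doc_type flows: get_unified_json_genai -> ask_gemini -> get_gcp_labels
    result = await llm_client.get_unified_json_genai(
        "Extract the bill of lading number from the document.",
        response_schema=schema,
        doc_type="draftMbl",
    )
    print(result)  # parsed dict; {} when the model reply is empty or unparsable

asyncio.run(main())
```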