data-science-document-ai 1.42.5__py3-none-any.whl → 1.56.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. {data_science_document_ai-1.42.5.dist-info → data_science_document_ai-1.56.1.dist-info}/METADATA +2 -2
  2. {data_science_document_ai-1.42.5.dist-info → data_science_document_ai-1.56.1.dist-info}/RECORD +34 -31
  3. src/constants.py +7 -10
  4. src/docai_processor_config.yaml +0 -56
  5. src/excel_processing.py +24 -14
  6. src/io.py +23 -0
  7. src/llm.py +0 -29
  8. src/pdf_processing.py +156 -51
  9. src/postprocessing/common.py +172 -28
  10. src/postprocessing/postprocess_partner_invoice.py +194 -59
  11. src/prompts/library/arrivalNotice/other/placeholders.json +70 -0
  12. src/prompts/library/arrivalNotice/other/prompt.txt +40 -0
  13. src/prompts/library/bundeskasse/other/placeholders.json +5 -5
  14. src/prompts/library/bundeskasse/other/prompt.txt +7 -5
  15. src/prompts/library/commercialInvoice/other/placeholders.json +125 -0
  16. src/prompts/library/commercialInvoice/other/prompt.txt +1 -1
  17. src/prompts/library/customsAssessment/other/placeholders.json +70 -0
  18. src/prompts/library/customsAssessment/other/prompt.txt +24 -37
  19. src/prompts/library/customsInvoice/other/prompt.txt +4 -3
  20. src/prompts/library/deliveryOrder/other/placeholders.json +80 -27
  21. src/prompts/library/deliveryOrder/other/prompt.txt +26 -40
  22. src/prompts/library/draftMbl/other/placeholders.json +33 -33
  23. src/prompts/library/draftMbl/other/prompt.txt +34 -44
  24. src/prompts/library/finalMbL/other/placeholders.json +80 -0
  25. src/prompts/library/finalMbL/other/prompt.txt +34 -44
  26. src/prompts/library/packingList/other/placeholders.json +98 -0
  27. src/prompts/library/partnerInvoice/other/prompt.txt +8 -7
  28. src/prompts/library/preprocessing/carrier/placeholders.json +0 -16
  29. src/prompts/library/shippingInstruction/other/placeholders.json +115 -0
  30. src/prompts/library/shippingInstruction/other/prompt.txt +26 -14
  31. src/prompts/prompt_library.py +0 -4
  32. src/setup.py +15 -16
  33. src/utils.py +120 -68
  34. src/prompts/library/draftMbl/hapag-lloyd/prompt.txt +0 -45
  35. src/prompts/library/draftMbl/maersk/prompt.txt +0 -19
  36. src/prompts/library/finalMbL/hapag-lloyd/prompt.txt +0 -44
  37. src/prompts/library/finalMbL/maersk/prompt.txt +0 -19
  38. {data_science_document_ai-1.42.5.dist-info → data_science_document_ai-1.56.1.dist-info}/WHEEL +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: data-science-document-ai
3
- Version: 1.42.5
3
+ Version: 1.56.1
4
4
  Summary: "Document AI repo for data science"
5
5
  Author: Naomi Nguyen
6
6
  Author-email: naomi.nguyen@forto.com
@@ -38,7 +38,7 @@ Requires-Dist: pdf2image (>=1.17.0,<2.0.0)
38
38
  Requires-Dist: pgzip (>=0.3.5,<0.4.0)
39
39
  Requires-Dist: pyarrow (==16.1.0)
40
40
  Requires-Dist: pymupdf (>=1.23.26,<2.0.0)
41
- Requires-Dist: pypdf2 (>=3.0.1,<4.0.0)
41
+ Requires-Dist: pypdf (>=6.1.2,<7.0.0)
42
42
  Requires-Dist: python-dotenv (>=1.0.0,<2.0.0)
43
43
  Requires-Dist: python-multipart (>=0.0.7,<0.0.8)
44
44
  Requires-Dist: rapidfuzz (>=3.12.2,<4.0.0)
@@ -1,16 +1,18 @@
1
- src/constants.py,sha256=TF_UblovdXZnKIb1lnyJwUqQncJCbzBVihoelI6foSU,3579
1
+ src/constants.py,sha256=H43Az9AtoBKfcq9yY4TQQJY8DfdILV5kXy8EMtRaWYA,3583
2
2
  src/constants_sandbox.py,sha256=Iu6HdjCoNSmOX0AwoL9qUQkhq_ZnIN5U9e-Q2UfNuGc,547
3
3
  src/docai.py,sha256=dHuR0ehVjUi1CnoNvdp_yxJtpU_HFXqAZ61ywdz7BEo,5655
4
- src/docai_processor_config.yaml,sha256=qOMmCIORpLQ_D-ytvejXxFvER0e0uGYuzPVdZBGv4Pc,2105
5
- src/excel_processing.py,sha256=8toKsafUvwE5QN3TOQO3zfLo0Wv2sGxZHKPsL7n5LkA,2771
6
- src/io.py,sha256=tOJpMyI-mP1AaXKG4UFudH47MHWzjWBgVahFJUcjGfs,4749
7
- src/llm.py,sha256=OE4IEIqcM-hYK9U7e0x1rAfcqdpeo4iXPHBp64L5Qz0,8199
4
+ src/docai_processor_config.yaml,sha256=4yKKZPvFCA-3S56jDYSqMGKXGFND-768OiU2seRiAzE,604
5
+ src/excel_processing.py,sha256=TRgAzSHvL1WKbUgjHtpXL701bPhiWGH7kk3S6e1UPaA,3074
6
+ src/io.py,sha256=rYjXVLlriEacw1uNuPIYhg12bXNu48Qs9GYMY2YcVTE,5563
7
+ src/llm.py,sha256=a7UYA4ITUNjzct_2fHgM-bma_XWc28VC0FV71g9tnUI,7137
8
8
  src/log_setup.py,sha256=RhHnpXqcl-ii4EJzRt47CF2R-Q3YPF68tepg_Kg7tkw,2895
9
- src/pdf_processing.py,sha256=dxsYvNnONAjzS-T7K5aSo89rz7QcdW3ZDfeuFyeCeII,16294
10
- src/postprocessing/common.py,sha256=5W-u3lKbnPQRKT4h5EfegegMjSXOKik73X7kUx9ik0Y,21888
9
+ src/pdf_processing.py,sha256=81fS2xL36n9QgB7DpXe7SCS-Lyz11cFDgccYMK3ZVkk,20026
10
+ src/postprocessing/common.py,sha256=dagAg0hZGuZc03bXdfOolxekewMEVUfz917IGCiAtWI,26118
11
11
  src/postprocessing/postprocess_booking_confirmation.py,sha256=nK32eDiBNbauyQz0oCa9eraysku8aqzrcoRFoWVumDU,4827
12
12
  src/postprocessing/postprocess_commercial_invoice.py,sha256=3I8ijluTZcOs_sMnFZxfkAPle0UFQ239EMuvZfDZVPg,1028
13
- src/postprocessing/postprocess_partner_invoice.py,sha256=koGR7dN37FqJcepdzkrzNBHuBBUuCp_3CrteScASqyE,10590
13
+ src/postprocessing/postprocess_partner_invoice.py,sha256=WuaTQK5D09dV_QNrh29ZoKX9IvQn2Ub-WnAMyRjCsvI,14240
14
+ src/prompts/library/arrivalNotice/other/placeholders.json,sha256=1vzly1amgyKt3jr2JJQbb24kNZsnI289iduvoUo5dJU,3061
15
+ src/prompts/library/arrivalNotice/other/prompt.txt,sha256=QNuU-BvMA8VbdupVNapad4O3WmCotH5cKNxImRMbKDk,2906
14
16
  src/prompts/library/bookingConfirmation/evergreen/placeholders.json,sha256=IpM9nmSPdyroliZfXB1-NDCjiHZX_Ff5BH7-scNhGqE,1406
15
17
  src/prompts/library/bookingConfirmation/evergreen/prompt.txt,sha256=5ivskCG831M2scW3oqQaoltXIyHV-n6DYUygWycXxjw,2755
16
18
  src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json,sha256=hMPNt9s3LuxR85AxYy7bPcCDleug6gSwVjefm3ismWY,1405
@@ -25,33 +27,34 @@ src/prompts/library/bookingConfirmation/other/placeholders.json,sha256=IpM9nmSPd
25
27
  src/prompts/library/bookingConfirmation/other/prompt.txt,sha256=kUK7NgVNDYFMnqOcIblCwWSw2SC0YQEtHsYrspiVUMo,3379
26
28
  src/prompts/library/bookingConfirmation/yangming/placeholders.json,sha256=IpM9nmSPdyroliZfXB1-NDCjiHZX_Ff5BH7-scNhGqE,1406
27
29
  src/prompts/library/bookingConfirmation/yangming/prompt.txt,sha256=fYKfusDajDFw0v54-nv2iAqUSp2yCeOzc6G7AFe-h2w,3226
28
- src/prompts/library/bundeskasse/other/placeholders.json,sha256=1ll8AI58F2zRDSwQq_r0gxQdxlQB521l5CuiJ-8G6us,4068
29
- src/prompts/library/bundeskasse/other/prompt.txt,sha256=WV4D3ellIcB2cVmsZXCpbbHOShYY8VN_iZrYOuyoqzw,2937
30
- src/prompts/library/commercialInvoice/other/prompt.txt,sha256=6sowYMzrKvgmTDpDnAzkeG4OqA44e6-8aUKWRKNziBY,2699
31
- src/prompts/library/customsAssessment/other/prompt.txt,sha256=XSqWa3k9LM7dTiJtX8AKTp_0x5Z0pCNRKNUWaywwBlY,2191
30
+ src/prompts/library/bundeskasse/other/placeholders.json,sha256=7xKzi_ypkIICO9nrEl45W9G7-h33uWVRVWnpg2b5lUg,4288
31
+ src/prompts/library/bundeskasse/other/prompt.txt,sha256=miNYoqRZEd6Z1LNisTahX1-tenzr5kEpRA6gvPH7NCw,3316
32
+ src/prompts/library/commercialInvoice/other/placeholders.json,sha256=zUK2mg9MnHiEQRYF6VgTiUiq68WGy5f7_4qL63CWyR0,4700
33
+ src/prompts/library/commercialInvoice/other/prompt.txt,sha256=CJapcVrmcvynJUanETDklkzU-0N9hHdhq5wL4MK7OIY,2683
34
+ src/prompts/library/customsAssessment/other/placeholders.json,sha256=scIV--C9HNWAQbU9zEz3GT_FoAvJqbfuY85YUtt7t-Q,3850
35
+ src/prompts/library/customsAssessment/other/prompt.txt,sha256=z3FuoHZ588Pz1WBJDW7ISAC3J6n7hPJCcS92CdHDTFw,2494
32
36
  src/prompts/library/customsInvoice/other/placeholders.json,sha256=BnWYtl4sPooTHb_EHRIlrPawBrfHI8_QVas8zytbqyY,12172
33
- src/prompts/library/customsInvoice/other/prompt.txt,sha256=Q5ihAVaZFToZ75D01ICEdCRB8nY_FD5DL3yuFvJ4418,9632
34
- src/prompts/library/deliveryOrder/other/placeholders.json,sha256=7fjqag3kCVMV4mJ52dTjAcLtaBX0paXrDrW48vQVZSk,1250
35
- src/prompts/library/deliveryOrder/other/prompt.txt,sha256=y3QjN54e8PplEJngNlxoykbdrToBefS3r8gWixCbjfE,2468
36
- src/prompts/library/draftMbl/hapag-lloyd/prompt.txt,sha256=4FxiO1eHkimZVQZXU6gGNikuDVAWNniYvY8FUdVhpvk,2327
37
- src/prompts/library/draftMbl/maersk/prompt.txt,sha256=4neW6buJirgoS84iDsy9ZcfQTaMeOFt92Emba01mzJA,2192
38
- src/prompts/library/draftMbl/other/placeholders.json,sha256=wIN06_NWsESDyNEDfOLPi3F2Vq-XPa4O3U32A32s-_Q,1736
39
- src/prompts/library/draftMbl/other/prompt.txt,sha256=pj-kgPV51upLhDppSKfhc2s5ylgr06l4IxrkFYjE9uM,2241
40
- src/prompts/library/finalMbL/hapag-lloyd/prompt.txt,sha256=RhxEJ4eWikAQiE40cuPsssnzizge6AJYFTSJLGUmz_U,2326
41
- src/prompts/library/finalMbL/maersk/prompt.txt,sha256=4neW6buJirgoS84iDsy9ZcfQTaMeOFt92Emba01mzJA,2192
42
- src/prompts/library/finalMbL/other/prompt.txt,sha256=pj-kgPV51upLhDppSKfhc2s5ylgr06l4IxrkFYjE9uM,2241
37
+ src/prompts/library/customsInvoice/other/prompt.txt,sha256=hUBDhocFdHTiWdEPgEE8yKHqpIYOfOj-j9CvZd-3YZc,9941
38
+ src/prompts/library/deliveryOrder/other/placeholders.json,sha256=j-9F4V3yDg4610PPsOwU3oOj_S9vAvAB9Ix155WGIwc,3827
39
+ src/prompts/library/deliveryOrder/other/prompt.txt,sha256=RD076vq0x0IjoEVQfh-G0u4nxITCpgKZGrwMlR9YAvk,2695
40
+ src/prompts/library/draftMbl/other/placeholders.json,sha256=Gn8kQ8cMmrzRGLSFH7_8wO1_j2jxhqHd4zeivZP2SjU,4304
41
+ src/prompts/library/draftMbl/other/prompt.txt,sha256=4RjlGT2OFmcBCUJhuCnO9GtmCn3vVesXHi_ml2g3dK8,2386
42
+ src/prompts/library/finalMbL/other/placeholders.json,sha256=Gn8kQ8cMmrzRGLSFH7_8wO1_j2jxhqHd4zeivZP2SjU,4304
43
+ src/prompts/library/finalMbL/other/prompt.txt,sha256=cyeKjK94sepqXiLEeZKB4VpmT0-nqXALP4dih-B67M8,2386
44
+ src/prompts/library/packingList/other/placeholders.json,sha256=cGUUvEFoi4Lm0BAiyD29KbNFbUgzO1s7eit_qK3F0ig,4478
43
45
  src/prompts/library/packingList/other/prompt.txt,sha256=6Q9d0KBG6YWmNtzFivvmtQmitaUE2jytfwwc5YwsUgQ,2872
44
46
  src/prompts/library/partnerInvoice/other/placeholders.json,sha256=NX6ADT4gxLpP90uoNCYDbmfBvROxxVWRKK0lRFy1n9s,10897
45
- src/prompts/library/partnerInvoice/other/prompt.txt,sha256=fGUtMYWvhedmSiv9xShRv0cHXmEws1D9pQmZP1E2gl0,7806
47
+ src/prompts/library/partnerInvoice/other/prompt.txt,sha256=A3nw6QfraU1N6Aui4TC7eFofG3rUyo9cz8Ha1iQbMpU,8141
46
48
  src/prompts/library/postprocessing/port_code/placeholders.json,sha256=2TiXf3zSzrglOMPtDOlCntIa5RSvyZQAKG2-IgrCY5A,22
47
49
  src/prompts/library/postprocessing/port_code/prompt_port_code.txt,sha256=--1wunSqEr2ox958lEhjO-0JFBfOLzA3qfKYIzG_Iok,884
48
- src/prompts/library/preprocessing/carrier/placeholders.json,sha256=1UmrQNqBEsjLIpOO-a39Az6bQ_g1lxDGlwqZFU3IEt0,408
50
+ src/prompts/library/preprocessing/carrier/placeholders.json,sha256=tQeVDtvembhVqvel9vGoy4qcKp1hOvg-bLCgZRdQj0g,192
49
51
  src/prompts/library/preprocessing/carrier/prompt.txt,sha256=NLvRZQCZ6aWC1yTr7Q93jK5z7Vi_b4HBaiFYYnIsO-w,134
50
- src/prompts/library/shippingInstruction/other/prompt.txt,sha256=dT2e-dPuvuz0rVYpwmok_1dWQ2Oa8Qy9NGZ6CCLOUI4,1468
51
- src/prompts/prompt_library.py,sha256=jPxybNPPGH7mzonqtAOqmw5WcT-RtbGP0pvMqqP22hg,2760
52
- src/setup.py,sha256=M-p5c8M9ejKcSZ9N86VtmtPc4TYLxe1_4_dxf6jpfVc,7262
52
+ src/prompts/library/shippingInstruction/other/placeholders.json,sha256=eK4AeMfORkGMWVYcqH7NjB56Zb4swHTvcQD5UQbTryg,6374
53
+ src/prompts/library/shippingInstruction/other/prompt.txt,sha256=CbrqlKMtB-sVY-8E460KP1KNmz169YVPMrH3-uEldPg,2135
54
+ src/prompts/prompt_library.py,sha256=VJWHeXN-s501C2GiidIIvQQuZdU6T1R27hE2dKBiI40,2555
55
+ src/setup.py,sha256=yb0Pz1RI-uId5lEjgQrj1Pqo9FvwG9vs0HXRVbyST2M,7186
53
56
  src/tms.py,sha256=UXbIo1QE--hIX6NZi5Qyp2R_CP338syrY9pCTPrfgnE,1741
54
- src/utils.py,sha256=nU69zR3TB7IZmCc19DD8H27Riek8GJAldmhJjCSwNEE,16090
55
- data_science_document_ai-1.42.5.dist-info/METADATA,sha256=FauluZfyiueEsYJsiMdiXv7yko2N3Xp5UTe8K0U1Toc,2153
56
- data_science_document_ai-1.42.5.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
57
- data_science_document_ai-1.42.5.dist-info/RECORD,,
57
+ src/utils.py,sha256=Ow5_Jals88o8mbZ1BoHfZpHZoCfig_UQb5aalH-mpWE,17278
58
+ data_science_document_ai-1.56.1.dist-info/METADATA,sha256=4rIhyVd5XG02M7f9l2UYjH6r-pjzpNiobuZ-v-trvtE,2152
59
+ data_science_document_ai-1.56.1.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
60
+ data_science_document_ai-1.56.1.dist-info/RECORD,,
src/constants.py CHANGED
@@ -20,12 +20,16 @@ project_parameters = {
20
20
  # Fuzzy lookup
21
21
  "g_model_fuzzy_lookup_folder": "fuzzy_lookup",
22
22
  "item_code_lookup": "line_item_kvp_table.json",
23
+ "intermodal_partners": "intermodal_partners.json",
23
24
  "invoice_classification_lookup": "invoice_classification.json",
24
25
  "reverse_charge_sentence_lookup": "reverse_charge_sentences.json",
25
26
  # Fuzzy logic params
26
- "fuzzy_threshold_item_code": 70,
27
+ "fuzzy_threshold_item_code": 92,
27
28
  "fuzzy_threshold_reverse_charge": 80,
28
29
  "fuzzy_threshold_invoice_classification": 70,
30
+ # Chunking params
31
+ "chunk_size": 1, # page (do not change this without changing the page number logic)
32
+ "chunk_after": 10, # pages
29
33
  # Big Query
30
34
  "g_ai_gbq_db_schema": "document_ai",
31
35
  "g_ai_gbq_db_table_out": "document_ai_api_calls_v1",
@@ -50,15 +54,6 @@ project_parameters = {
50
54
  "model_selector": {
51
55
  "stable": {
52
56
  "bookingConfirmation": 1,
53
- "packingList": 0,
54
- "commercialInvoice": 0,
55
- "finalMbL": 0,
56
- "draftMbl": 0,
57
- "arrivalNotice": 0,
58
- "shippingInstruction": 0,
59
- "customsAssessment": 0,
60
- "deliveryOrder": 0,
61
- "partnerInvoice": 0,
62
57
  },
63
58
  "beta": {
64
59
  "bookingConfirmation": 0,
@@ -86,8 +81,10 @@ project_parameters = {
86
81
  # Key to combine the LLM results with the Doc Ai results
87
82
  "key_to_combine": {
88
83
  "bookingConfirmation": ["transportLegs"],
84
+ "arrivalNotice": ["containers"],
89
85
  "finalMbL": ["containers"],
90
86
  "draftMbl": ["containers"],
87
+ "deliveryOrder": ["Equipment", "TransportLeg"],
91
88
  "customsAssessment": ["containers"],
92
89
  "packingList": ["skuData"],
93
90
  "commercialInvoice": ["skus"],
@@ -13,62 +13,6 @@ model_config:
13
13
  author: "igor.tonko@forto.com"
14
14
  created_date: ""
15
15
 
16
- packingList:
17
- - id: "d967005bd9d45aeb"
18
- details:
19
- display_name: "doc_cap_packingList"
20
- author: "kumar.rajendrababu@forto.com"
21
- created_date: ""
22
-
23
- commercialInvoice:
24
- - id: "7d37236207f75758"
25
- details:
26
- display_name: "doc_cap_commercialInvoice"
27
- author: "kumar.rajendrababu@forto.com"
28
- created_date: ""
29
-
30
- finalMbL:
31
- - id: "1eda2f22d64b1b89"
32
- details:
33
- display_name: "doc_cap_finalMbL"
34
- author: "igor.tonko@forto.com"
35
- created_date: ""
36
-
37
- draftMbl:
38
- - id: "1eda2f22d64b1b89"
39
- details:
40
- display_name: "doc_cap_finalMbL"
41
- author: "igor.tonko@forto.com"
42
- created_date: ""
43
-
44
- shippingInstruction:
45
- - id: "c77a0a515d99a8ba"
46
- details:
47
- display_name: "doc_cap_shippingInstruction"
48
- author: "kumar.rajendrababu@forto.com"
49
- created_date: ""
50
-
51
- arrivalNotice:
52
- - id: "748b2e2b9161dcf3"
53
- details:
54
- display_name: "doc_cap_arrivalNotice"
55
- author: "osman.demirel@forto.com"
56
- created_date: ""
57
-
58
- customsAssessment:
59
- - id: "c464a18d82fad9be"
60
- details:
61
- display_name: "doc_cap_customsAssessment"
62
- author: "igor.tonko@forto.com"
63
- created_date: ""
64
-
65
- deliveryOrder:
66
- - id: "2245a72c7a5dbf5f"
67
- details:
68
- display_name: "doc_cap_releaseNote"
69
- author: "igor.tonko@forto.com"
70
- created_date: ""
71
-
72
16
  beta:
73
17
  bookingConfirmation:
74
18
  - id: "3c280b11bdb3ed89"
src/excel_processing.py CHANGED
@@ -2,22 +2,21 @@
2
2
  # flake8: noqa: E402
3
3
  import logging
4
4
 
5
- from src.postprocessing.common import llm_prediction_to_tuples
5
+ from ddtrace import tracer
6
6
 
7
7
  logger = logging.getLogger(__name__)
8
8
 
9
9
  import asyncio
10
- import json
11
10
 
12
11
  import numpy as np
13
12
  import pandas as pd
14
13
 
15
- from src.llm import prompt_excel_extraction
16
- from src.utils import generate_schema_structure, get_excel_sheets
14
+ from src.prompts.prompt_library import prompt_library
15
+ from src.utils import estimate_page_count, get_excel_sheets
17
16
 
18
17
 
19
18
  async def extract_data_from_sheet(
20
- params, sheet_name, sheet, response_schema, doc_type=None
19
+ llm_client, sheet_name, sheet, response_schema, doc_type=None
21
20
  ):
22
21
  logger.info(f"Processing sheet: {sheet_name}")
23
22
  excel_content = pd.DataFrame(sheet.values).dropna(how="all", axis=1)
@@ -29,11 +28,14 @@ async def extract_data_from_sheet(
29
28
  )
30
29
 
31
30
  # Prompt for the LLM JSON
32
- prompt_docai = prompt_excel_extraction(worksheet)
31
+ prompt = prompt_library.library[doc_type]["other"]["prompt"]
32
+
33
+ # Join the worksheet content with the prompt
34
+ prompt = worksheet + "\n" + prompt
33
35
 
34
36
  try:
35
- result = await params["LlmClient"].get_unified_json_genai(
36
- prompt_docai,
37
+ result = await llm_client.get_unified_json_genai(
38
+ prompt,
37
39
  response_schema=response_schema,
38
40
  doc_type=doc_type,
39
41
  )
@@ -49,6 +51,7 @@ async def extract_data_from_excel(
49
51
  input_doc_type,
50
52
  file_content,
51
53
  mime_type,
54
+ llm_client,
52
55
  ):
53
56
  """Extract data from the Excel file.
54
57
 
@@ -57,6 +60,7 @@ async def extract_data_from_excel(
57
60
  input_doc_type (str): The type of the document.
58
61
  file_content (bytes): The content of the Excel file to process.
59
62
  mime_type (str): The MIME type of the file.
63
+ llm_client: The LLM client to use for data extraction.
60
64
 
61
65
  Returns:
62
66
  formatted_data (list): A list of dictionaries containing the extracted data.
@@ -65,15 +69,24 @@ async def extract_data_from_excel(
65
69
 
66
70
  """
67
71
  # Generate the response structure
68
- response_schema = generate_schema_structure(params, input_doc_type)
72
+ response_schema = prompt_library.library[input_doc_type]["other"]["placeholders"]
69
73
 
70
74
  # Load the Excel file and get ONLY the "visible" sheet names
71
75
  sheets, workbook = get_excel_sheets(file_content, mime_type)
72
76
 
77
+ # Track the number of sheets in dd-trace
78
+ span = tracer.current_span()
79
+ if span:
80
+ estimated_page_counts = [
81
+ estimate_page_count(workbook[sheet]) for sheet in sheets
82
+ ]
83
+ est_page_count = sum(estimated_page_counts)
84
+ span.set_metric("est_page_count", est_page_count)
85
+
73
86
  # Excel files may contain multiple sheets. Extract data from each sheet
74
87
  sheet_extract_tasks = [
75
88
  extract_data_from_sheet(
76
- params,
89
+ llm_client,
77
90
  sheet_name,
78
91
  workbook[sheet_name],
79
92
  response_schema,
@@ -83,7 +96,4 @@ async def extract_data_from_excel(
83
96
  ]
84
97
  extracted_data = {k: v for k, v in await asyncio.gather(*sheet_extract_tasks)}
85
98
 
86
- # Convert LLM prediction dictionary to tuples of (value, page_number).
87
- extracted_data = llm_prediction_to_tuples(extracted_data)
88
-
89
- return extracted_data, extracted_data, params["gemini_params"]["model_id"]
99
+ return extracted_data, extracted_data, llm_client.model_id
src/io.py CHANGED
@@ -156,4 +156,27 @@ def download_dir_from_bucket(bucket, directory_cloud, directory_local) -> bool:
156
156
  return result
157
157
 
158
158
 
159
+ def bq_logs(data_to_insert, params):
160
+ """Insert logs into Google BigQuery.
161
+
162
+ Args:
163
+ data_to_insert (list): The data to insert into BigQuery.
164
+ params (dict): The parameters dictionary.
165
+ """
166
+ # Use the pre-initialized BigQuery client
167
+ bq_client = params["bq_client"]
168
+ # Get the table string
169
+ table_string = f"{params['g_ai_project_name']}.{params['g_ai_gbq_db_schema']}.{params['g_ai_gbq_db_table_out']}"
170
+
171
+ logger.info(f"Log table: {table_string}")
172
+ # Insert the rows into the table
173
+ insert_logs = bq_client.insert_rows_json(table_string, data_to_insert)
174
+
175
+ # Check if there were any errors inserting the rows
176
+ if not insert_logs:
177
+ logger.info("New rows have been added.")
178
+ else:
179
+ logger.info("Errors occurred while inserting rows: ", insert_logs)
180
+
181
+
159
182
  # type: ignore
src/llm.py CHANGED
@@ -201,33 +201,4 @@ class LlmClient:
201
201
  return response
202
202
 
203
203
 
204
- def prompt_excel_extraction(excel_structured_text):
205
- """Write a prompt to extract data from Excel files.
206
-
207
- Args:
208
- excel_structured_text (str): The structured text of the Excel file.
209
-
210
- Returns:
211
- prompt str: The prompt for common json.
212
- """
213
- prompt = f"""{excel_structured_text}
214
-
215
- Task: Fill in the following dictionary from the information in the given in the above excel data.
216
-
217
- Instructions:
218
- - Do not change the keys of the following dictionary.
219
- - The values should be filled in as per the schema provided below.
220
- - If an entity contains a 'display_name', consider its properties as child data points in the below format.
221
- {{'data-field': {{
222
- 'child-data-field': 'type -occurrence_type- description',
223
- }}
224
- }}
225
- - The entity with 'display_name' can be extracted multiple times. Please pay attention to the occurrence_type.
226
- - Ensure the schema reflects the hierarchical relationship.
227
- - Use the data field description to understand the context of the data.
228
-
229
- """
230
- return prompt
231
-
232
-
233
204
  # pylint: enable=all