data-science-document-ai 1.42.5__py3-none-any.whl → 1.56.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_science_document_ai-1.42.5.dist-info → data_science_document_ai-1.56.1.dist-info}/METADATA +2 -2
- {data_science_document_ai-1.42.5.dist-info → data_science_document_ai-1.56.1.dist-info}/RECORD +34 -31
- src/constants.py +7 -10
- src/docai_processor_config.yaml +0 -56
- src/excel_processing.py +24 -14
- src/io.py +23 -0
- src/llm.py +0 -29
- src/pdf_processing.py +156 -51
- src/postprocessing/common.py +172 -28
- src/postprocessing/postprocess_partner_invoice.py +194 -59
- src/prompts/library/arrivalNotice/other/placeholders.json +70 -0
- src/prompts/library/arrivalNotice/other/prompt.txt +40 -0
- src/prompts/library/bundeskasse/other/placeholders.json +5 -5
- src/prompts/library/bundeskasse/other/prompt.txt +7 -5
- src/prompts/library/commercialInvoice/other/placeholders.json +125 -0
- src/prompts/library/commercialInvoice/other/prompt.txt +1 -1
- src/prompts/library/customsAssessment/other/placeholders.json +70 -0
- src/prompts/library/customsAssessment/other/prompt.txt +24 -37
- src/prompts/library/customsInvoice/other/prompt.txt +4 -3
- src/prompts/library/deliveryOrder/other/placeholders.json +80 -27
- src/prompts/library/deliveryOrder/other/prompt.txt +26 -40
- src/prompts/library/draftMbl/other/placeholders.json +33 -33
- src/prompts/library/draftMbl/other/prompt.txt +34 -44
- src/prompts/library/finalMbL/other/placeholders.json +80 -0
- src/prompts/library/finalMbL/other/prompt.txt +34 -44
- src/prompts/library/packingList/other/placeholders.json +98 -0
- src/prompts/library/partnerInvoice/other/prompt.txt +8 -7
- src/prompts/library/preprocessing/carrier/placeholders.json +0 -16
- src/prompts/library/shippingInstruction/other/placeholders.json +115 -0
- src/prompts/library/shippingInstruction/other/prompt.txt +26 -14
- src/prompts/prompt_library.py +0 -4
- src/setup.py +15 -16
- src/utils.py +120 -68
- src/prompts/library/draftMbl/hapag-lloyd/prompt.txt +0 -45
- src/prompts/library/draftMbl/maersk/prompt.txt +0 -19
- src/prompts/library/finalMbL/hapag-lloyd/prompt.txt +0 -44
- src/prompts/library/finalMbL/maersk/prompt.txt +0 -19
- {data_science_document_ai-1.42.5.dist-info → data_science_document_ai-1.56.1.dist-info}/WHEEL +0 -0
{data_science_document_ai-1.42.5.dist-info → data_science_document_ai-1.56.1.dist-info}/METADATA
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: data-science-document-ai
|
|
3
|
-
Version: 1.
|
|
3
|
+
Version: 1.56.1
|
|
4
4
|
Summary: "Document AI repo for data science"
|
|
5
5
|
Author: Naomi Nguyen
|
|
6
6
|
Author-email: naomi.nguyen@forto.com
|
|
@@ -38,7 +38,7 @@ Requires-Dist: pdf2image (>=1.17.0,<2.0.0)
|
|
|
38
38
|
Requires-Dist: pgzip (>=0.3.5,<0.4.0)
|
|
39
39
|
Requires-Dist: pyarrow (==16.1.0)
|
|
40
40
|
Requires-Dist: pymupdf (>=1.23.26,<2.0.0)
|
|
41
|
-
Requires-Dist:
|
|
41
|
+
Requires-Dist: pypdf (>=6.1.2,<7.0.0)
|
|
42
42
|
Requires-Dist: python-dotenv (>=1.0.0,<2.0.0)
|
|
43
43
|
Requires-Dist: python-multipart (>=0.0.7,<0.0.8)
|
|
44
44
|
Requires-Dist: rapidfuzz (>=3.12.2,<4.0.0)
|
{data_science_document_ai-1.42.5.dist-info → data_science_document_ai-1.56.1.dist-info}/RECORD
RENAMED
|
@@ -1,16 +1,18 @@
|
|
|
1
|
-
src/constants.py,sha256=
|
|
1
|
+
src/constants.py,sha256=H43Az9AtoBKfcq9yY4TQQJY8DfdILV5kXy8EMtRaWYA,3583
|
|
2
2
|
src/constants_sandbox.py,sha256=Iu6HdjCoNSmOX0AwoL9qUQkhq_ZnIN5U9e-Q2UfNuGc,547
|
|
3
3
|
src/docai.py,sha256=dHuR0ehVjUi1CnoNvdp_yxJtpU_HFXqAZ61ywdz7BEo,5655
|
|
4
|
-
src/docai_processor_config.yaml,sha256=
|
|
5
|
-
src/excel_processing.py,sha256=
|
|
6
|
-
src/io.py,sha256=
|
|
7
|
-
src/llm.py,sha256=
|
|
4
|
+
src/docai_processor_config.yaml,sha256=4yKKZPvFCA-3S56jDYSqMGKXGFND-768OiU2seRiAzE,604
|
|
5
|
+
src/excel_processing.py,sha256=TRgAzSHvL1WKbUgjHtpXL701bPhiWGH7kk3S6e1UPaA,3074
|
|
6
|
+
src/io.py,sha256=rYjXVLlriEacw1uNuPIYhg12bXNu48Qs9GYMY2YcVTE,5563
|
|
7
|
+
src/llm.py,sha256=a7UYA4ITUNjzct_2fHgM-bma_XWc28VC0FV71g9tnUI,7137
|
|
8
8
|
src/log_setup.py,sha256=RhHnpXqcl-ii4EJzRt47CF2R-Q3YPF68tepg_Kg7tkw,2895
|
|
9
|
-
src/pdf_processing.py,sha256=
|
|
10
|
-
src/postprocessing/common.py,sha256=
|
|
9
|
+
src/pdf_processing.py,sha256=81fS2xL36n9QgB7DpXe7SCS-Lyz11cFDgccYMK3ZVkk,20026
|
|
10
|
+
src/postprocessing/common.py,sha256=dagAg0hZGuZc03bXdfOolxekewMEVUfz917IGCiAtWI,26118
|
|
11
11
|
src/postprocessing/postprocess_booking_confirmation.py,sha256=nK32eDiBNbauyQz0oCa9eraysku8aqzrcoRFoWVumDU,4827
|
|
12
12
|
src/postprocessing/postprocess_commercial_invoice.py,sha256=3I8ijluTZcOs_sMnFZxfkAPle0UFQ239EMuvZfDZVPg,1028
|
|
13
|
-
src/postprocessing/postprocess_partner_invoice.py,sha256=
|
|
13
|
+
src/postprocessing/postprocess_partner_invoice.py,sha256=WuaTQK5D09dV_QNrh29ZoKX9IvQn2Ub-WnAMyRjCsvI,14240
|
|
14
|
+
src/prompts/library/arrivalNotice/other/placeholders.json,sha256=1vzly1amgyKt3jr2JJQbb24kNZsnI289iduvoUo5dJU,3061
|
|
15
|
+
src/prompts/library/arrivalNotice/other/prompt.txt,sha256=QNuU-BvMA8VbdupVNapad4O3WmCotH5cKNxImRMbKDk,2906
|
|
14
16
|
src/prompts/library/bookingConfirmation/evergreen/placeholders.json,sha256=IpM9nmSPdyroliZfXB1-NDCjiHZX_Ff5BH7-scNhGqE,1406
|
|
15
17
|
src/prompts/library/bookingConfirmation/evergreen/prompt.txt,sha256=5ivskCG831M2scW3oqQaoltXIyHV-n6DYUygWycXxjw,2755
|
|
16
18
|
src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json,sha256=hMPNt9s3LuxR85AxYy7bPcCDleug6gSwVjefm3ismWY,1405
|
|
@@ -25,33 +27,34 @@ src/prompts/library/bookingConfirmation/other/placeholders.json,sha256=IpM9nmSPd
|
|
|
25
27
|
src/prompts/library/bookingConfirmation/other/prompt.txt,sha256=kUK7NgVNDYFMnqOcIblCwWSw2SC0YQEtHsYrspiVUMo,3379
|
|
26
28
|
src/prompts/library/bookingConfirmation/yangming/placeholders.json,sha256=IpM9nmSPdyroliZfXB1-NDCjiHZX_Ff5BH7-scNhGqE,1406
|
|
27
29
|
src/prompts/library/bookingConfirmation/yangming/prompt.txt,sha256=fYKfusDajDFw0v54-nv2iAqUSp2yCeOzc6G7AFe-h2w,3226
|
|
28
|
-
src/prompts/library/bundeskasse/other/placeholders.json,sha256=
|
|
29
|
-
src/prompts/library/bundeskasse/other/prompt.txt,sha256=
|
|
30
|
-
src/prompts/library/commercialInvoice/other/
|
|
31
|
-
src/prompts/library/
|
|
30
|
+
src/prompts/library/bundeskasse/other/placeholders.json,sha256=7xKzi_ypkIICO9nrEl45W9G7-h33uWVRVWnpg2b5lUg,4288
|
|
31
|
+
src/prompts/library/bundeskasse/other/prompt.txt,sha256=miNYoqRZEd6Z1LNisTahX1-tenzr5kEpRA6gvPH7NCw,3316
|
|
32
|
+
src/prompts/library/commercialInvoice/other/placeholders.json,sha256=zUK2mg9MnHiEQRYF6VgTiUiq68WGy5f7_4qL63CWyR0,4700
|
|
33
|
+
src/prompts/library/commercialInvoice/other/prompt.txt,sha256=CJapcVrmcvynJUanETDklkzU-0N9hHdhq5wL4MK7OIY,2683
|
|
34
|
+
src/prompts/library/customsAssessment/other/placeholders.json,sha256=scIV--C9HNWAQbU9zEz3GT_FoAvJqbfuY85YUtt7t-Q,3850
|
|
35
|
+
src/prompts/library/customsAssessment/other/prompt.txt,sha256=z3FuoHZ588Pz1WBJDW7ISAC3J6n7hPJCcS92CdHDTFw,2494
|
|
32
36
|
src/prompts/library/customsInvoice/other/placeholders.json,sha256=BnWYtl4sPooTHb_EHRIlrPawBrfHI8_QVas8zytbqyY,12172
|
|
33
|
-
src/prompts/library/customsInvoice/other/prompt.txt,sha256=
|
|
34
|
-
src/prompts/library/deliveryOrder/other/placeholders.json,sha256=
|
|
35
|
-
src/prompts/library/deliveryOrder/other/prompt.txt,sha256=
|
|
36
|
-
src/prompts/library/draftMbl/
|
|
37
|
-
src/prompts/library/draftMbl/
|
|
38
|
-
src/prompts/library/
|
|
39
|
-
src/prompts/library/
|
|
40
|
-
src/prompts/library/
|
|
41
|
-
src/prompts/library/finalMbL/maersk/prompt.txt,sha256=4neW6buJirgoS84iDsy9ZcfQTaMeOFt92Emba01mzJA,2192
|
|
42
|
-
src/prompts/library/finalMbL/other/prompt.txt,sha256=pj-kgPV51upLhDppSKfhc2s5ylgr06l4IxrkFYjE9uM,2241
|
|
37
|
+
src/prompts/library/customsInvoice/other/prompt.txt,sha256=hUBDhocFdHTiWdEPgEE8yKHqpIYOfOj-j9CvZd-3YZc,9941
|
|
38
|
+
src/prompts/library/deliveryOrder/other/placeholders.json,sha256=j-9F4V3yDg4610PPsOwU3oOj_S9vAvAB9Ix155WGIwc,3827
|
|
39
|
+
src/prompts/library/deliveryOrder/other/prompt.txt,sha256=RD076vq0x0IjoEVQfh-G0u4nxITCpgKZGrwMlR9YAvk,2695
|
|
40
|
+
src/prompts/library/draftMbl/other/placeholders.json,sha256=Gn8kQ8cMmrzRGLSFH7_8wO1_j2jxhqHd4zeivZP2SjU,4304
|
|
41
|
+
src/prompts/library/draftMbl/other/prompt.txt,sha256=4RjlGT2OFmcBCUJhuCnO9GtmCn3vVesXHi_ml2g3dK8,2386
|
|
42
|
+
src/prompts/library/finalMbL/other/placeholders.json,sha256=Gn8kQ8cMmrzRGLSFH7_8wO1_j2jxhqHd4zeivZP2SjU,4304
|
|
43
|
+
src/prompts/library/finalMbL/other/prompt.txt,sha256=cyeKjK94sepqXiLEeZKB4VpmT0-nqXALP4dih-B67M8,2386
|
|
44
|
+
src/prompts/library/packingList/other/placeholders.json,sha256=cGUUvEFoi4Lm0BAiyD29KbNFbUgzO1s7eit_qK3F0ig,4478
|
|
43
45
|
src/prompts/library/packingList/other/prompt.txt,sha256=6Q9d0KBG6YWmNtzFivvmtQmitaUE2jytfwwc5YwsUgQ,2872
|
|
44
46
|
src/prompts/library/partnerInvoice/other/placeholders.json,sha256=NX6ADT4gxLpP90uoNCYDbmfBvROxxVWRKK0lRFy1n9s,10897
|
|
45
|
-
src/prompts/library/partnerInvoice/other/prompt.txt,sha256=
|
|
47
|
+
src/prompts/library/partnerInvoice/other/prompt.txt,sha256=A3nw6QfraU1N6Aui4TC7eFofG3rUyo9cz8Ha1iQbMpU,8141
|
|
46
48
|
src/prompts/library/postprocessing/port_code/placeholders.json,sha256=2TiXf3zSzrglOMPtDOlCntIa5RSvyZQAKG2-IgrCY5A,22
|
|
47
49
|
src/prompts/library/postprocessing/port_code/prompt_port_code.txt,sha256=--1wunSqEr2ox958lEhjO-0JFBfOLzA3qfKYIzG_Iok,884
|
|
48
|
-
src/prompts/library/preprocessing/carrier/placeholders.json,sha256=
|
|
50
|
+
src/prompts/library/preprocessing/carrier/placeholders.json,sha256=tQeVDtvembhVqvel9vGoy4qcKp1hOvg-bLCgZRdQj0g,192
|
|
49
51
|
src/prompts/library/preprocessing/carrier/prompt.txt,sha256=NLvRZQCZ6aWC1yTr7Q93jK5z7Vi_b4HBaiFYYnIsO-w,134
|
|
50
|
-
src/prompts/library/shippingInstruction/other/
|
|
51
|
-
src/prompts/
|
|
52
|
-
src/
|
|
52
|
+
src/prompts/library/shippingInstruction/other/placeholders.json,sha256=eK4AeMfORkGMWVYcqH7NjB56Zb4swHTvcQD5UQbTryg,6374
|
|
53
|
+
src/prompts/library/shippingInstruction/other/prompt.txt,sha256=CbrqlKMtB-sVY-8E460KP1KNmz169YVPMrH3-uEldPg,2135
|
|
54
|
+
src/prompts/prompt_library.py,sha256=VJWHeXN-s501C2GiidIIvQQuZdU6T1R27hE2dKBiI40,2555
|
|
55
|
+
src/setup.py,sha256=yb0Pz1RI-uId5lEjgQrj1Pqo9FvwG9vs0HXRVbyST2M,7186
|
|
53
56
|
src/tms.py,sha256=UXbIo1QE--hIX6NZi5Qyp2R_CP338syrY9pCTPrfgnE,1741
|
|
54
|
-
src/utils.py,sha256=
|
|
55
|
-
data_science_document_ai-1.
|
|
56
|
-
data_science_document_ai-1.
|
|
57
|
-
data_science_document_ai-1.
|
|
57
|
+
src/utils.py,sha256=Ow5_Jals88o8mbZ1BoHfZpHZoCfig_UQb5aalH-mpWE,17278
|
|
58
|
+
data_science_document_ai-1.56.1.dist-info/METADATA,sha256=4rIhyVd5XG02M7f9l2UYjH6r-pjzpNiobuZ-v-trvtE,2152
|
|
59
|
+
data_science_document_ai-1.56.1.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
|
|
60
|
+
data_science_document_ai-1.56.1.dist-info/RECORD,,
|
src/constants.py
CHANGED
|
@@ -20,12 +20,16 @@ project_parameters = {
|
|
|
20
20
|
# Fuzzy lookup
|
|
21
21
|
"g_model_fuzzy_lookup_folder": "fuzzy_lookup",
|
|
22
22
|
"item_code_lookup": "line_item_kvp_table.json",
|
|
23
|
+
"intermodal_partners": "intermodal_partners.json",
|
|
23
24
|
"invoice_classification_lookup": "invoice_classification.json",
|
|
24
25
|
"reverse_charge_sentence_lookup": "reverse_charge_sentences.json",
|
|
25
26
|
# Fuzzy logic params
|
|
26
|
-
"fuzzy_threshold_item_code":
|
|
27
|
+
"fuzzy_threshold_item_code": 92,
|
|
27
28
|
"fuzzy_threshold_reverse_charge": 80,
|
|
28
29
|
"fuzzy_threshold_invoice_classification": 70,
|
|
30
|
+
# Chunking params
|
|
31
|
+
"chunk_size": 1, # page (do not change this without changing the page number logic)
|
|
32
|
+
"chunk_after": 10, # pages
|
|
29
33
|
# Big Query
|
|
30
34
|
"g_ai_gbq_db_schema": "document_ai",
|
|
31
35
|
"g_ai_gbq_db_table_out": "document_ai_api_calls_v1",
|
|
@@ -50,15 +54,6 @@ project_parameters = {
|
|
|
50
54
|
"model_selector": {
|
|
51
55
|
"stable": {
|
|
52
56
|
"bookingConfirmation": 1,
|
|
53
|
-
"packingList": 0,
|
|
54
|
-
"commercialInvoice": 0,
|
|
55
|
-
"finalMbL": 0,
|
|
56
|
-
"draftMbl": 0,
|
|
57
|
-
"arrivalNotice": 0,
|
|
58
|
-
"shippingInstruction": 0,
|
|
59
|
-
"customsAssessment": 0,
|
|
60
|
-
"deliveryOrder": 0,
|
|
61
|
-
"partnerInvoice": 0,
|
|
62
57
|
},
|
|
63
58
|
"beta": {
|
|
64
59
|
"bookingConfirmation": 0,
|
|
@@ -86,8 +81,10 @@ project_parameters = {
|
|
|
86
81
|
# Key to combine the LLM results with the Doc Ai results
|
|
87
82
|
"key_to_combine": {
|
|
88
83
|
"bookingConfirmation": ["transportLegs"],
|
|
84
|
+
"arrivalNotice": ["containers"],
|
|
89
85
|
"finalMbL": ["containers"],
|
|
90
86
|
"draftMbl": ["containers"],
|
|
87
|
+
"deliveryOrder": ["Equipment", "TransportLeg"],
|
|
91
88
|
"customsAssessment": ["containers"],
|
|
92
89
|
"packingList": ["skuData"],
|
|
93
90
|
"commercialInvoice": ["skus"],
|
src/docai_processor_config.yaml
CHANGED
|
@@ -13,62 +13,6 @@ model_config:
|
|
|
13
13
|
author: "igor.tonko@forto.com"
|
|
14
14
|
created_date: ""
|
|
15
15
|
|
|
16
|
-
packingList:
|
|
17
|
-
- id: "d967005bd9d45aeb"
|
|
18
|
-
details:
|
|
19
|
-
display_name: "doc_cap_packingList"
|
|
20
|
-
author: "kumar.rajendrababu@forto.com"
|
|
21
|
-
created_date: ""
|
|
22
|
-
|
|
23
|
-
commercialInvoice:
|
|
24
|
-
- id: "7d37236207f75758"
|
|
25
|
-
details:
|
|
26
|
-
display_name: "doc_cap_commercialInvoice"
|
|
27
|
-
author: "kumar.rajendrababu@forto.com"
|
|
28
|
-
created_date: ""
|
|
29
|
-
|
|
30
|
-
finalMbL:
|
|
31
|
-
- id: "1eda2f22d64b1b89"
|
|
32
|
-
details:
|
|
33
|
-
display_name: "doc_cap_finalMbL"
|
|
34
|
-
author: "igor.tonko@forto.com"
|
|
35
|
-
created_date: ""
|
|
36
|
-
|
|
37
|
-
draftMbl:
|
|
38
|
-
- id: "1eda2f22d64b1b89"
|
|
39
|
-
details:
|
|
40
|
-
display_name: "doc_cap_finalMbL"
|
|
41
|
-
author: "igor.tonko@forto.com"
|
|
42
|
-
created_date: ""
|
|
43
|
-
|
|
44
|
-
shippingInstruction:
|
|
45
|
-
- id: "c77a0a515d99a8ba"
|
|
46
|
-
details:
|
|
47
|
-
display_name: "doc_cap_shippingInstruction"
|
|
48
|
-
author: "kumar.rajendrababu@forto.com"
|
|
49
|
-
created_date: ""
|
|
50
|
-
|
|
51
|
-
arrivalNotice:
|
|
52
|
-
- id: "748b2e2b9161dcf3"
|
|
53
|
-
details:
|
|
54
|
-
display_name: "doc_cap_arrivalNotice"
|
|
55
|
-
author: "osman.demirel@forto.com"
|
|
56
|
-
created_date: ""
|
|
57
|
-
|
|
58
|
-
customsAssessment:
|
|
59
|
-
- id: "c464a18d82fad9be"
|
|
60
|
-
details:
|
|
61
|
-
display_name: "doc_cap_customsAssessment"
|
|
62
|
-
author: "igor.tonko@forto.com"
|
|
63
|
-
created_date: ""
|
|
64
|
-
|
|
65
|
-
deliveryOrder:
|
|
66
|
-
- id: "2245a72c7a5dbf5f"
|
|
67
|
-
details:
|
|
68
|
-
display_name: "doc_cap_releaseNote"
|
|
69
|
-
author: "igor.tonko@forto.com"
|
|
70
|
-
created_date: ""
|
|
71
|
-
|
|
72
16
|
beta:
|
|
73
17
|
bookingConfirmation:
|
|
74
18
|
- id: "3c280b11bdb3ed89"
|
src/excel_processing.py
CHANGED
|
@@ -2,22 +2,21 @@
|
|
|
2
2
|
# flake8: noqa: E402
|
|
3
3
|
import logging
|
|
4
4
|
|
|
5
|
-
from
|
|
5
|
+
from ddtrace import tracer
|
|
6
6
|
|
|
7
7
|
logger = logging.getLogger(__name__)
|
|
8
8
|
|
|
9
9
|
import asyncio
|
|
10
|
-
import json
|
|
11
10
|
|
|
12
11
|
import numpy as np
|
|
13
12
|
import pandas as pd
|
|
14
13
|
|
|
15
|
-
from src.
|
|
16
|
-
from src.utils import
|
|
14
|
+
from src.prompts.prompt_library import prompt_library
|
|
15
|
+
from src.utils import estimate_page_count, get_excel_sheets
|
|
17
16
|
|
|
18
17
|
|
|
19
18
|
async def extract_data_from_sheet(
|
|
20
|
-
|
|
19
|
+
llm_client, sheet_name, sheet, response_schema, doc_type=None
|
|
21
20
|
):
|
|
22
21
|
logger.info(f"Processing sheet: {sheet_name}")
|
|
23
22
|
excel_content = pd.DataFrame(sheet.values).dropna(how="all", axis=1)
|
|
@@ -29,11 +28,14 @@ async def extract_data_from_sheet(
|
|
|
29
28
|
)
|
|
30
29
|
|
|
31
30
|
# Prompt for the LLM JSON
|
|
32
|
-
|
|
31
|
+
prompt = prompt_library.library[doc_type]["other"]["prompt"]
|
|
32
|
+
|
|
33
|
+
# Join the worksheet content with the prompt
|
|
34
|
+
prompt = worksheet + "\n" + prompt
|
|
33
35
|
|
|
34
36
|
try:
|
|
35
|
-
result = await
|
|
36
|
-
|
|
37
|
+
result = await llm_client.get_unified_json_genai(
|
|
38
|
+
prompt,
|
|
37
39
|
response_schema=response_schema,
|
|
38
40
|
doc_type=doc_type,
|
|
39
41
|
)
|
|
@@ -49,6 +51,7 @@ async def extract_data_from_excel(
|
|
|
49
51
|
input_doc_type,
|
|
50
52
|
file_content,
|
|
51
53
|
mime_type,
|
|
54
|
+
llm_client,
|
|
52
55
|
):
|
|
53
56
|
"""Extract data from the Excel file.
|
|
54
57
|
|
|
@@ -57,6 +60,7 @@ async def extract_data_from_excel(
|
|
|
57
60
|
input_doc_type (str): The type of the document.
|
|
58
61
|
file_content (bytes): The content of the Excel file to process.
|
|
59
62
|
mime_type (str): The MIME type of the file.
|
|
63
|
+
llm_client: The LLM client to use for data extraction.
|
|
60
64
|
|
|
61
65
|
Returns:
|
|
62
66
|
formatted_data (list): A list of dictionaries containing the extracted data.
|
|
@@ -65,15 +69,24 @@ async def extract_data_from_excel(
|
|
|
65
69
|
|
|
66
70
|
"""
|
|
67
71
|
# Generate the response structure
|
|
68
|
-
response_schema =
|
|
72
|
+
response_schema = prompt_library.library[input_doc_type]["other"]["placeholders"]
|
|
69
73
|
|
|
70
74
|
# Load the Excel file and get ONLY the "visible" sheet names
|
|
71
75
|
sheets, workbook = get_excel_sheets(file_content, mime_type)
|
|
72
76
|
|
|
77
|
+
# Track the number of sheets in dd-trace
|
|
78
|
+
span = tracer.current_span()
|
|
79
|
+
if span:
|
|
80
|
+
estimated_page_counts = [
|
|
81
|
+
estimate_page_count(workbook[sheet]) for sheet in sheets
|
|
82
|
+
]
|
|
83
|
+
est_page_count = sum(estimated_page_counts)
|
|
84
|
+
span.set_metric("est_page_count", est_page_count)
|
|
85
|
+
|
|
73
86
|
# Excel files may contain multiple sheets. Extract data from each sheet
|
|
74
87
|
sheet_extract_tasks = [
|
|
75
88
|
extract_data_from_sheet(
|
|
76
|
-
|
|
89
|
+
llm_client,
|
|
77
90
|
sheet_name,
|
|
78
91
|
workbook[sheet_name],
|
|
79
92
|
response_schema,
|
|
@@ -83,7 +96,4 @@ async def extract_data_from_excel(
|
|
|
83
96
|
]
|
|
84
97
|
extracted_data = {k: v for k, v in await asyncio.gather(*sheet_extract_tasks)}
|
|
85
98
|
|
|
86
|
-
|
|
87
|
-
extracted_data = llm_prediction_to_tuples(extracted_data)
|
|
88
|
-
|
|
89
|
-
return extracted_data, extracted_data, params["gemini_params"]["model_id"]
|
|
99
|
+
return extracted_data, extracted_data, llm_client.model_id
|
src/io.py
CHANGED
|
@@ -156,4 +156,27 @@ def download_dir_from_bucket(bucket, directory_cloud, directory_local) -> bool:
|
|
|
156
156
|
return result
|
|
157
157
|
|
|
158
158
|
|
|
159
|
+
def bq_logs(data_to_insert, params):
    """Insert log rows into the configured Google BigQuery table.

    Args:
        data_to_insert (list): The rows to insert; passed unchanged to the
            client's ``insert_rows_json`` call.
        params (dict): The parameters dictionary. It must provide
            ``bq_client`` (the pre-initialized BigQuery client) as well as
            ``g_ai_project_name``, ``g_ai_gbq_db_schema`` and
            ``g_ai_gbq_db_table_out``, which are joined into the table name.

    Returns:
        None. The outcome of the insert is only reported through the logger.
    """
    # Use the pre-initialized BigQuery client
    bq_client = params["bq_client"]

    # Build the "<project>.<schema>.<table>" identifier of the log table
    table_string = (
        f"{params['g_ai_project_name']}"
        f".{params['g_ai_gbq_db_schema']}"
        f".{params['g_ai_gbq_db_table_out']}"
    )
    logger.info(f"Log table: {table_string}")

    # Insert the rows; a falsy result is treated as "no errors reported"
    insert_errors = bq_client.insert_rows_json(table_string, data_to_insert)

    if not insert_errors:
        logger.info("New rows have been added.")
    else:
        # The errors must go through a %s placeholder: passing them as a bare
        # extra argument makes the logging module fail while formatting the
        # record, so the error details would never reach the log output.
        logger.error("Errors occurred while inserting rows: %s", insert_errors)
|
|
180
|
+
|
|
181
|
+
|
|
159
182
|
# type: ignore
|
src/llm.py
CHANGED
|
@@ -201,33 +201,4 @@ class LlmClient:
|
|
|
201
201
|
return response
|
|
202
202
|
|
|
203
203
|
|
|
204
|
-
def prompt_excel_extraction(excel_structured_text):
|
|
205
|
-
"""Write a prompt to extract data from Excel files.
|
|
206
|
-
|
|
207
|
-
Args:
|
|
208
|
-
excel_structured_text (str): The structured text of the Excel file.
|
|
209
|
-
|
|
210
|
-
Returns:
|
|
211
|
-
prompt str: The prompt for common json.
|
|
212
|
-
"""
|
|
213
|
-
prompt = f"""{excel_structured_text}
|
|
214
|
-
|
|
215
|
-
Task: Fill in the following dictionary from the information in the given in the above excel data.
|
|
216
|
-
|
|
217
|
-
Instructions:
|
|
218
|
-
- Do not change the keys of the following dictionary.
|
|
219
|
-
- The values should be filled in as per the schema provided below.
|
|
220
|
-
- If an entity contains a 'display_name', consider its properties as child data points in the below format.
|
|
221
|
-
{{'data-field': {{
|
|
222
|
-
'child-data-field': 'type -occurrence_type- description',
|
|
223
|
-
}}
|
|
224
|
-
}}
|
|
225
|
-
- The entity with 'display_name' can be extracted multiple times. Please pay attention to the occurrence_type.
|
|
226
|
-
- Ensure the schema reflects the hierarchical relationship.
|
|
227
|
-
- Use the data field description to understand the context of the data.
|
|
228
|
-
|
|
229
|
-
"""
|
|
230
|
-
return prompt
|
|
231
|
-
|
|
232
|
-
|
|
233
204
|
# pylint: enable=all
|