data-science-document-ai 1.40.4__py3-none-any.whl → 1.41.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. {data_science_document_ai-1.40.4.dist-info → data_science_document_ai-1.41.0.dist-info}/METADATA +1 -1
  2. data_science_document_ai-1.41.0.dist-info/RECORD +57 -0
  3. src/excel_processing.py +4 -0
  4. src/pdf_processing.py +14 -3
  5. src/postprocessing/common.py +27 -0
  6. src/prompts/library/bookingConfirmation/evergreen/placeholders.json +17 -17
  7. src/prompts/library/bookingConfirmation/evergreen/prompt.txt +1 -0
  8. src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json +18 -18
  9. src/prompts/library/bookingConfirmation/hapag-lloyd/prompt.txt +1 -1
  10. src/prompts/library/bookingConfirmation/maersk/placeholders.json +17 -17
  11. src/prompts/library/bookingConfirmation/maersk/prompt.txt +1 -1
  12. src/prompts/library/bookingConfirmation/msc/placeholders.json +17 -17
  13. src/prompts/library/bookingConfirmation/msc/prompt.txt +1 -1
  14. src/prompts/library/bookingConfirmation/oocl/placeholders.json +17 -17
  15. src/prompts/library/bookingConfirmation/oocl/prompt.txt +3 -1
  16. src/prompts/library/bookingConfirmation/other/placeholders.json +17 -17
  17. src/prompts/library/bookingConfirmation/other/prompt.txt +1 -1
  18. src/prompts/library/bookingConfirmation/yangming/placeholders.json +17 -17
  19. src/prompts/library/bookingConfirmation/yangming/prompt.txt +1 -1
  20. src/prompts/library/bundeskasse/other/placeholders.json +19 -19
  21. src/prompts/library/bundeskasse/other/prompt.txt +1 -1
  22. src/prompts/library/commercialInvoice/other/prompt.txt +2 -1
  23. src/prompts/library/customsAssessment/other/prompt.txt +1 -1
  24. src/prompts/library/customsInvoice/other/placeholders.json +19 -19
  25. src/prompts/library/customsInvoice/other/prompt.txt +1 -1
  26. src/prompts/library/deliveryOrder/other/placeholders.json +15 -17
  27. src/prompts/library/deliveryOrder/other/prompt.txt +1 -1
  28. src/prompts/library/draftMbl/hapag-lloyd/prompt.txt +2 -1
  29. src/prompts/library/draftMbl/maersk/prompt.txt +2 -0
  30. src/prompts/library/draftMbl/other/prompt.txt +1 -1
  31. src/prompts/library/finalMbL/hapag-lloyd/prompt.txt +1 -1
  32. src/prompts/library/finalMbL/maersk/prompt.txt +2 -0
  33. src/prompts/library/finalMbL/other/prompt.txt +1 -1
  34. src/prompts/library/packingList/other/prompt.txt +1 -1
  35. src/prompts/library/partnerInvoice/other/placeholders.json +12 -60
  36. src/prompts/library/partnerInvoice/other/prompt.txt +1 -1
  37. src/prompts/library/shippingInstruction/other/prompt.txt +1 -0
  38. src/prompts/prompt_library.py +4 -0
  39. src/utils.py +57 -0
  40. data_science_document_ai-1.40.4.dist-info/RECORD +0 -59
  41. src/prompts/library/customsAssessment/other/placeholders.json +0 -19
  42. src/prompts/library/finalMbL/other/placeholders.json +0 -80
  43. {data_science_document_ai-1.40.4.dist-info → data_science_document_ai-1.41.0.dist-info}/WHEEL +0 -0
@@ -2,47 +2,47 @@
2
2
  "type": "OBJECT",
3
3
  "properties": {
4
4
  "currencyCode": {
5
- "type": "string",
5
+ "type": "STRING",
6
6
  "nullable": true,
7
7
  "description": "The currency in which the invoice is issued."
8
8
  },
9
9
  "grandTotal": {
10
- "type": "string",
10
+ "type": "STRING",
11
11
  "nullable": true,
12
12
  "description": "The overall total amount of the invoice."
13
13
  },
14
14
  "issueDate": {
15
- "type": "string",
15
+ "type": "STRING",
16
16
  "nullable": true,
17
17
  "description": "The date the document was issued."
18
18
  },
19
19
  "recipientAddress": {
20
- "type": "string",
20
+ "type": "STRING",
21
21
  "nullable": true,
22
22
  "description": "The address of the recipient."
23
23
  },
24
24
  "recipientName": {
25
- "type": "string",
25
+ "type": "STRING",
26
26
  "nullable": true,
27
27
  "description": "The name of the recipient."
28
28
  },
29
29
  "serviceDate": {
30
- "type": "string",
30
+ "type": "STRING",
31
31
  "nullable": true,
32
32
  "description": "The date of service or transaction."
33
33
  },
34
34
  "shipmentId": {
35
- "type": "string",
35
+ "type": "STRING",
36
36
  "nullable": true,
37
37
  "description": "Starting with an \"S\" and followed by 6 or 7 digits. Example: S124321"
38
38
  },
39
39
  "vendorName": {
40
- "type": "string",
40
+ "type": "STRING",
41
41
  "nullable": true,
42
42
  "description": "The name of the vendor."
43
43
  },
44
44
  "vendorAddress": {
45
- "type": "string",
45
+ "type": "STRING",
46
46
  "nullable": true,
47
47
  "description": "The address of the vendor."
48
48
  },
@@ -52,37 +52,37 @@
52
52
  "type": "OBJECT",
53
53
  "properties": {
54
54
  "deferredDutyPayer": {
55
- "type": "string",
55
+ "type": "STRING",
56
56
  "nullable": true,
57
57
  "description": "It can be identified under \"Aufschubnehmer\" for each line item"
58
58
  },
59
59
  "name": {
60
- "type": "string",
60
+ "type": "STRING",
61
61
  "nullable": true,
62
62
  "description": "The name or description of the line item A0000 and B0000"
63
63
  },
64
64
  "taxType": {
65
- "type": "string",
65
+ "type": "STRING",
66
66
  "nullable": true,
67
67
  "description": "It's a line item mentioned in the invoice. For example; A0000 and B0000"
68
68
  },
69
69
  "totalAmount": {
70
- "type": "string",
70
+ "type": "STRING",
71
71
  "nullable": true,
72
72
  "description": "The total amount for the line item."
73
73
  },
74
74
  "totalAmountCurrency": {
75
- "type": "string",
75
+ "type": "STRING",
76
76
  "nullable": true,
77
77
  "description": "The currency of the total amount."
78
78
  },
79
79
  "vatId": {
80
- "type": "string",
80
+ "type": "STRING",
81
81
  "nullable": true,
82
82
  "description": "The VAT identification number. This is named a Konto-Nummer for each line item."
83
83
  },
84
84
  "dueDate": {
85
- "type": "string",
85
+ "type": "STRING",
86
86
  "nullable": true,
87
87
  "description": "It's a due date. Due date to pay the amount. It's usually mentioned either in a date or a number of days format"
88
88
  }
@@ -91,20 +91,20 @@
91
91
  }
92
92
  },
93
93
  "invoiceNumber": {
94
- "type": "string",
94
+ "type": "STRING",
95
95
  "nullable": true,
96
96
  "description": "Invoice Number is a unique identifier for the invoice, it starts with \"ATC\", \"AT-C\", or \"AT/C\" only (e.g., ATC40, AT-C-40-, AT/C/40/....). Do NOT extract the \"NIZZA-Registrierkennzeichen\" number."
97
97
  },
98
98
  "containerNumber": {
99
99
  "type": "ARRAY",
100
100
  "items": {
101
- "type": "string",
101
+ "type": "STRING",
102
102
  "nullable": true,
103
103
  "description": "The unique identifier for each container. It always starts with 4 capital letters and followed by 7 digits. Example: TEMU7972458."
104
104
  }
105
105
  },
106
106
  "creditNoteInvoiceNumber": {
107
- "type": "string",
107
+ "type": "STRING",
108
108
  "nullable": true,
109
109
  "description": "The unique identifier for the associated Invoice. The number usually starts with ATS..."
110
110
  }
@@ -1,6 +1,6 @@
1
1
  <PERSONA> You are an efficient document entity data extraction specialist working for a Freight Forwarding company. <PERSONA>
2
2
 
3
- <TASK>Your task is to extract data from customs invoice documents as per the given response schema structure.<TASK>
3
+ <TASK>Your task is to extract data and page numbers starting from 0 from customs invoice documents as per the given response schema structure.<TASK>
4
4
 
5
5
  <CONTEXT>
6
6
  The Freight Forwarding company receives Customs invoices from Customs Brokers called Bundeskasse.
@@ -2,7 +2,8 @@ Task: You are a document entity extraction specialist. Given a document, your ta
2
2
 
3
3
  Extract all the data points from the given document.
4
4
  Each data point is part of a master field called skus. There may be multiple skus entries in a document.
5
- Your goal is to extract all instances.
5
+ Your task is to extract the text value of the entities and page numbers starting from 0 where the value was found in the document.
6
+
6
7
 
7
8
  Instructions:
8
9
  - Populate fields as defined in the response schema.
@@ -15,7 +15,7 @@ containers:
15
15
  goodsDescription: Goods description.
16
16
 
17
17
 
18
- Your task is to extract the text value of the following entities:
18
+ Your task is to extract the text value of the following entities and page numbers starting from 0 where the value was found in the document:
19
19
  SCHEMA_PLACEHOLDER
20
20
 
21
21
  Keywords for datapoints:
@@ -4,63 +4,63 @@
4
4
  "bankAccount": {
5
5
  "type": "ARRAY",
6
6
  "items": {
7
- "type": "STRING",
7
+ "type": "STRING",
8
8
  "nullable": true,
9
9
  "description": "The bank account(s) number(s) of the vendor. This is the account to which the payment should be made. Extract all the relevant bank account numbers mentioned in the invoice."
10
10
  }
11
11
  },
12
- "contractNumber": {"type": "STRING",
12
+ "contractNumber": {"type": "STRING",
13
13
  "nullable": true,
14
14
  "description": "It's a contract number between the carrier and Forto Logistics SE & Co KG."
15
15
  },
16
16
  "currencyExchange": {
17
17
  "type": "OBJECT",
18
18
  "properties": {
19
- "from": {"type": "STRING",
19
+ "from": {"type": "STRING",
20
20
  "nullable": true,
21
21
  "description": "The currency code from which the exchange rate is applied."
22
22
  },
23
- "fxRate": {"type": "STRING",
23
+ "fxRate": {"type": "STRING",
24
24
  "nullable": true,
25
25
  "description": "The exchange rate applied to convert the amount from the 'from' currency to the 'to' currency."
26
26
  },
27
- "to": {"type": "STRING",
27
+ "to": {"type": "STRING",
28
28
  "nullable": true,
29
29
  "description": "The currency code to which the exchange rate is applied."}
30
30
  }
31
31
  },
32
- "documentType": {"type": "STRING", "nullable": true},
33
- "dueDate": {"type": "STRING", "nullable": true,
32
+ "documentType": {"type": "STRING", "nullable": true},
33
+ "dueDate": {"type": "STRING", "nullable": true,
34
34
  "description": "The date by which the payment should be made by Forto Logistics SE & Co KG. Do Not calculate dueDate based on issueDate or any other date. Extract it directly from the invoice."},
35
- "eta": {"type": "STRING", "nullable": true,
35
+ "eta": {"type": "STRING", "nullable": true,
36
36
  "description": "Estimated Time of Arrival (ETA) is the expected date when the shipment will arrive at its destination."},
37
- "etd": {"type": "STRING", "nullable": true,
37
+ "etd": {"type": "STRING", "nullable": true,
38
38
  "description": "Estimated Time of Departure (ETD) is the expected date when the shipment will leave the origin port."},
39
- "fortoEntity": {"type": "STRING", "nullable": true,
39
+ "fortoEntity": {"type": "STRING", "nullable": true,
40
40
  "description": "The entity of 'Forto Logistics SE & Co KG' that is responsible for the invoice. The Forto organization or branch managing the shipment."
41
41
  },
42
- "hblNumber": {"type": "STRING", "nullable": true,
42
+ "hblNumber": {"type": "STRING", "nullable": true,
43
43
  "description": "House Bill of Lading number, a document issued by a freight forwarder."
44
44
  },
45
- "currencyCode": {"type": "STRING", "nullable": true,
45
+ "currencyCode": {"type": "STRING", "nullable": true,
46
46
  "description": "The currency code in which the invoice is issued, such as EUR, USD, etc."
47
47
  },
48
- "grandTotal": {"type": "STRING", "nullable": true,
48
+ "grandTotal": {"type": "STRING", "nullable": true,
49
49
  "description": "The total amount of the invoice, including all line items and taxes."
50
50
  },
51
- "vatAmount": {"type": "STRING", "nullable": true,
51
+ "vatAmount": {"type": "STRING", "nullable": true,
52
52
  "description": "The total VAT amount applied to the invoice. This is the tax charged on the vatApplicableAmount of the invoice. Bitte Zahlen is not the vatAmount."
53
53
  },
54
- "vatApplicableAmount": {"type": "STRING", "nullable": true,
54
+ "vatApplicableAmount": {"type": "STRING", "nullable": true,
55
55
  "description": "The amount on which VAT is applicable. This is the net amount before VAT is applied (without VAT)."
56
56
  },
57
- "vatPercentage": {"type": "STRING", "nullable": true,
57
+ "vatPercentage": {"type": "STRING", "nullable": true,
58
58
  "description": "The percentage rate of VAT applied to the vatApplicableAmount. This is used to calculate the vatAmount."
59
59
  },
60
- "invoiceNumber": {"type": "STRING", "nullable": true,
60
+ "invoiceNumber": {"type": "STRING", "nullable": true,
61
61
  "description": "The unique identifier for the invoice. This is used to track and reference the invoice in financial records."
62
62
  },
63
- "issueDate": {"type": "STRING", "nullable": true,
63
+ "issueDate": {"type": "STRING", "nullable": true,
64
64
  "description": "The date when the invoice was issued."
65
65
  },
66
66
  "lineItem": {
@@ -69,7 +69,7 @@
69
69
  "type": "OBJECT",
70
70
  "properties": {
71
71
  "uniqueId": {
72
- "type": "STRING",
72
+ "type": "STRING",
73
73
  "nullable": true,
74
74
  "description": "A line item can belong to a different shipments. Hence, the unique IDs of a line item need to be extracted that you see only on the line item level. UniqueIds are containerNumber, shipmentId, or sealNumber."
75
75
  },
@@ -1,6 +1,6 @@
1
1
  <PERSONA> You are an efficient document entity data extraction specialist working for a Freight Forwarding company. <PERSONA>
2
2
 
3
- <TASK>Your task is to extract data from invoice documents as per the given response schema structure.<TASK>
3
+ <TASK>Your task is to extract data and their page numbers starting from 0 from invoice documents as per the given response schema structure.<TASK>
4
4
 
5
5
  <CONTEXT>
6
6
  The Freight Forwarding company receives invoices from Carrier (Shipping Lines) partners and Customs Brokers. These include Partner Invoices (COGS Invoices) and COGS Customs Invoices.
@@ -1,31 +1,29 @@
1
1
  {
2
- "SCHEMA_PLACEHOLDER": {
3
2
  "type": "OBJECT",
4
3
  "properties": {
5
- "EmptyContainerDepot": {"type": "string", "nullable": true},
4
+ "EmptyContainerDepot": {"type": "STRING", "nullable": true},
6
5
  "Equipment": {"type": "ARRAY",
7
6
  "items": {
8
7
  "type": "OBJECT", "properties": {
9
- "CargoGrossWeight": {"type": "string", "nullable": true},
10
- "ContainerNumber": {"type": "string", "nullable": true},
11
- "ContainerType": {"type": "string", "nullable": true},
12
- "EmptyReturnReference": {"type": "string", "nullable": true},
13
- "Pin": {"type": "string", "nullable": true},
14
- "TareWeight": {"type": "string", "nullable": true}
8
+ "CargoGrossWeight": {"type": "STRING", "nullable": true},
9
+ "ContainerNumber": {"type": "STRING", "nullable": true},
10
+ "ContainerType": {"type": "STRING", "nullable": true},
11
+ "EmptyReturnReference": {"type": "STRING", "nullable": true},
12
+ "Pin": {"type": "STRING", "nullable": true},
13
+ "TareWeight": {"type": "STRING", "nullable": true}
15
14
  }, "required": []}
16
15
  },
17
- "pickUpTerminal": {"type": "string", "nullable": true},
16
+ "pickUpTerminal": {"type": "STRING", "nullable": true},
18
17
  "TransportLeg": {"type": "ARRAY",
19
18
  "items": {
20
19
  "type": "OBJECT", "properties": {
21
- "eta": {"type": "string", "nullable": true},
22
- "etd": {"type": "string", "nullable": true},
23
- "portOfDischarge": {"type": "string", "nullable": true},
24
- "portOfLoading": {"type": "string", "nullable": true},
25
- "vesselName": {"type": "string", "nullable": true},
26
- "voyage": {"type": "string", "nullable": true}
20
+ "eta": {"type": "STRING", "nullable": true},
21
+ "etd": {"type": "STRING", "nullable": true},
22
+ "portOfDischarge": {"type": "STRING", "nullable": true},
23
+ "portOfLoading": {"type": "STRING", "nullable": true},
24
+ "vesselName": {"type": "STRING", "nullable": true},
25
+ "voyage": {"type": "STRING", "nullable": true}
27
26
  }, "required": []}
28
- }
29
- },
27
+ },
30
28
  "required": []}
31
29
  }
@@ -18,7 +18,7 @@ TransportLeg:
18
18
  vesselName: The name of the vessel.
19
19
  voyage: The journey or route code taken by the vessel.
20
20
 
21
- Your task is to extract the text value of the following entities:
21
+ Your task is to extract the text value of the following entities and page numbers starting from 0 where the value was found in the document:
22
22
  SCHEMA_PLACEHOLDER
23
23
 
24
24
  Keywords for datapoints:
@@ -16,7 +16,8 @@ containers:
16
16
  vessel: The name of the vessel.
17
17
 
18
18
 
19
- Your task is to extract the text value of the following entities:
19
+ Your task is to extract the text value of the following entities and page numbers starting from 0 where the value was found in the document:
20
+
20
21
 
21
22
  Keywords for datapoints:
22
23
  - blNumber: Bill of Lading number, bill of landing no., swb-no., b/l no.
@@ -1,4 +1,6 @@
1
1
  Extract the following information from the sea waybill document.
2
+ Your task is to extract the text value of the following entities and page numbers starting from 0 where the value was found in the document:
3
+
2
4
 
3
5
  **blNumber:** Find the value labeled as "B/L No.".
4
6
  **voyage:** Get the "Voyage No." value.
@@ -16,7 +16,7 @@ containers:
16
16
  vessel: The name of the vessel.
17
17
 
18
18
 
19
- Your task is to extract the text value of the following entities:
19
+ Your task is to extract the text value of the following entities and page numbers starting from 0 where the value was found in the document:
20
20
 
21
21
  Keywords for datapoints:
22
22
  - blNumber: Bill of Lading number, bill of landing no., swb-no., b/l no.
@@ -16,7 +16,7 @@ containers:
16
16
  vessel: The name of the vessel.
17
17
 
18
18
 
19
- Your task is to extract the text value of the following entities:
19
+ Your task is to extract the text value of the following entities and page numbers starting from 0 where the value was found in the document:
20
20
 
21
21
  Keywords for datapoints:
22
22
  - blNumber: Bill of Lading number, bill of landing no., swb-no., b/l no.
@@ -1,4 +1,6 @@
1
1
  Extract the following information from the sea waybill document.
2
+ Your task is to extract the text value of the following entities and page numbers starting from 0 where the value was found in the document:
3
+
2
4
 
3
5
  **blNumber:** Find the value labeled as "B/L No.".
4
6
  **voyage:** Get the "Voyage No." value.
@@ -16,7 +16,7 @@ containers:
16
16
  vessel: The name of the vessel.
17
17
 
18
18
 
19
- Your task is to extract the text value of the following entities:
19
+ Your task is to extract the text value of the following entities and page numbers starting from 0 where the value was found in the document:
20
20
 
21
21
  Keywords for datapoints:
22
22
  - blNumber: Bill of Lading number, bill of landing no., swb-no., b/l no.
@@ -2,7 +2,7 @@ Task: You are a document entity extraction specialist. Given a document, your ta
2
2
 
3
3
  Extract all the data points from the given document.
4
4
  Each data point is part of a master field called "skuData". There may be multiple sku entries in a document.
5
- Your goal is to extract all instances.
5
+ Your task is to extract the text value of the following entities and their page numbers starting from 0 where the value was found in the document:
6
6
 
7
7
  Instructions:
8
8
  - Populate fields as defined in the response schema.
@@ -110,66 +110,18 @@
110
110
  "containerSize"
111
111
  ]
112
112
  },
113
- "mblNumber": {"type": "STRING", "nullable": true,
114
- "description": "Bill of Lading number (B/L NO.), a document issued by the carrier."
115
- },
116
- "partnerReference": {"type": "STRING", "nullable": true,
117
- "description": "A partnerReference can be a shipment ID. It starts with 'S' followed by 6 or 7 digits (e.g., 'S1234567')."
118
- },
119
- "paymentTerm": {"type": "STRING", "nullable": true,
120
- "description": "The payment term indicates the conditions under which the payment should be made. E.g., 'In 10 TAGEN', '14 TAGEN', '14 days', etc."},
121
- "portOfDischarge": {"type": "STRING", "nullable": true,
122
- "description": "The port where the goods are discharged from the vessel. This is the destination port for the shipment."},
123
- "portOfLoading": {"type": "STRING", "nullable": true,
124
- "description": "The origin port where the goods are loaded onto the vessel. Find information like 'Ladehafen' or 'Port of Loading' in the invoice."},
125
- "recipientAddress": {"type": "STRING", "nullable": true,
126
- "description": "Majority of the times, it is 'Forto Logistics SE & Co KG' Address depends on the entity."},
127
- "recipientName": {"type": "STRING", "nullable": true,
128
- "description": "The name of the recipient who is responsible for making the payment. This is often the 'Forto Logistics SE & Co KG' entity or partner."},
129
- "serviceDate": {"type": "STRING", "nullable": true,
130
- "description": "The date when the service was provided. If Service date is not available in the invoice, Estimated Time of Arrival (ETA) can be used."},
131
- "vatId": {"type": "STRING", "nullable": true,
132
- "description": "The VAT ID of the vendor. This is used for tax purposes and to identify the vendor in financial transactions."},
133
- "vendorAddress": {"type": "STRING", "nullable": true,
134
- "description": "The address of the vendor to whom the payment should be made."},
135
- "vendorName": {"type": "STRING", "nullable": true,
136
- "description": "The name of the vendor to whom the payment should be made. Extract the main vendor details incase the invoice contains 'As Agent For'."},
137
- "agentName": {
138
- "type": "STRING",
139
- "nullable": true,
140
- "description": "The name of the agent or intermediary involved in the transaction, if applicable."},
141
- "agentKeyWord": {
142
- "type": "STRING",
143
- "nullable": true,
144
- "description": "A keyword or phrase that indicates the presence of an agent or intermediary in the transaction, such as 'As Agent For', 'Acting Agent', 'Issuing agent', 'Contact Agent', or similar words."},
145
- "paymentInformation": {
146
- "type": "OBJECT",
147
- "properties": {
148
- "paidAmount": {
149
- "type": "STRING",
150
- "nullable": true,
151
- "description": "The amount that has been paid so far. You can identify this in the invoice by looking for terms like 'Vorschuss'."
152
- },
153
- "remainingAmountToPay": {
154
- "type": "STRING",
155
- "nullable": true,
156
- "description": "The amount that is still due for payment (e.g., 'Bitte zahlen', 'Zu zahlen' only). This can be negative & ensure the negative sign is captured if applicable."
157
- },
158
- "currency": {
159
- "type": "STRING",
160
- "nullable": true,
161
- "description": "Currency code associated with the paidAmount and remainngAmountToPay"
162
- },
163
- "sentence": {"type": "STRING", "nullable": true,
164
- "description": "A sentence that indicates the payment status, such as 'Vorschuss', 'Vorauszahlung', 'Paid', 'Partially Paid', or 'Unpaid'. This is used to summarize the payment status of the invoice."}
165
- }
166
-
167
- },
168
- "reverseChargeSentence": {
169
- "type": "STRING",
170
- "nullable": true,
171
- "description": "A sentence which indicate that the reverse charge applies. Mostly found as VAT/Tax Clause."
172
- }
113
+ "mblNumber": {"type": "STRING", "nullable": true},
114
+ "partnerReference": {"type": "STRING", "nullable": true},
115
+ "paymentTerm": {"type": "STRING", "nullable": true},
116
+ "portOfDischarge": {"type": "STRING", "nullable": true},
117
+ "portOfLoading": {"type": "STRING", "nullable": true},
118
+ "recipientAddress": {"type": "STRING", "nullable": true},
119
+ "recipientName": {"type": "STRING", "nullable": true},
120
+ "serviceDate": {"type": "STRING", "nullable": true},
121
+ "vatId": {"type": "STRING", "nullable": true},
122
+ "vendorAddress": {"type": "STRING", "nullable": true},
123
+ "vendorName": {"type": "STRING", "nullable": true},
124
+ "reverseChargeSentence": {"type": "STRING", "nullable": true}
173
125
  },
174
126
  "required": [
175
127
  "bankAccount",
@@ -1,6 +1,6 @@
1
1
  <PERSONA> You are an efficient document entity data extraction specialist working for a Freight Forwarding company. <PERSONA>
2
2
 
3
- <TASK>Your task is to extract data from invoice documents as per the given response schema structure.<TASK>
3
+ <TASK>Your task is to extract data and page numbers starting from 0 from invoice documents as per the given response schema structure.<TASK>
4
4
 
5
5
  <CONTEXT>
6
6
  The Freight Forwarding company receives invoices from Carrier (Shipping Lines) partners and Customs Brokers. These include Partner Invoices (COGS Invoices) and COGS Customs Invoices.
@@ -1,4 +1,5 @@
1
1
  Task: Extract data from the provided shipping instruction PDF document and populate the following dictionary based on the given schema.
2
+ Your task is to extract the text value of the following entities and page numbers starting from 0 where the value was found in the document:
2
3
 
3
4
  ### Instructions:
4
5
  1. Extract all data points from the shipping instruction document.
@@ -4,6 +4,8 @@ import os
4
4
  from pathlib import Path
5
5
  from typing import Dict
6
6
 
7
+ from src.utils import transform_schema_strings
8
+
7
9
 
8
10
  class PromptLibrary:
9
11
  """
@@ -41,6 +43,8 @@ class PromptLibrary:
41
43
  if file == "placeholders.json":
42
44
  with open(path_to_library / prompt_type / prompt_subtype / file) as f:
43
45
  placeholders = json.load(f)
46
+ if prompt_type not in ["postprocessing", "preprocessing"]:
47
+ placeholders = transform_schema_strings(placeholders)
44
48
  self.library[prompt_type][prompt_subtype][
45
49
  "placeholders"
46
50
  ] = placeholders
src/utils.py CHANGED
@@ -298,6 +298,9 @@ def generate_schema_structure(params, input_doc_type):
298
298
  "type": "string",
299
299
  }
300
300
 
301
+ # update schema to extract value-page_number pairs
302
+ response_schema = transform_schema_strings(response_schema)
303
+
301
304
  return response_schema
302
305
 
303
306
 
@@ -412,3 +415,57 @@ def get_tms_mappings(
412
415
  )
413
416
 
414
417
  return formatted_values
418
+
419
+
420
def transform_schema_strings(schema):
    """
    Recursively replace every string-typed schema node with a value/page pair.

    A node whose "type" is "STRING" (compared case-insensitively) becomes an
    OBJECT with two STRING properties, "value" and "page_number". The
    original node's "nullable" and "description" entries, when present, are
    moved onto the new "value" property; any other keys of the original
    string node are not carried over.

    OBJECT nodes are traversed through "properties" and ARRAY nodes through
    "items". The input is never mutated: every rewritten container is a new
    dict, so the same source schema can be transformed repeatedly without
    the results compounding.

    Args:
        schema: A schema node. Anything that is not a dict, or a dict without
            a recognised string "type", is returned unchanged.

    Returns:
        The transformed schema node.
    """
    if not isinstance(schema, dict):
        return schema

    # "type" may be absent or not a string; such nodes must pass through
    # untouched instead of failing on `.upper()`.
    raw_type = schema.get("type")
    node_type = raw_type.upper() if isinstance(raw_type, str) else None

    if node_type == "STRING":
        value_schema = {"type": "STRING"}
        # Keep the original constraints/documentation on the extracted value.
        for key in ("nullable", "description"):
            if key in schema:
                value_schema[key] = schema[key]
        return {
            "type": "OBJECT",
            "properties": {
                "value": value_schema,
                "page_number": {
                    "type": "STRING",
                    "description": "Number of a page where the value was found in the document starting from 0.",
                },
            },
            "required": [],
        }

    if node_type == "OBJECT":
        transformed = dict(schema)
        properties = schema.get("properties")
        # Build a fresh "properties" mapping so the caller's dict is untouched.
        if isinstance(properties, dict):
            transformed["properties"] = {
                name: transform_schema_strings(sub_schema)
                for name, sub_schema in properties.items()
            }
        return transformed

    if node_type == "ARRAY":
        transformed = dict(schema)
        if "items" in schema:
            transformed["items"] = transform_schema_strings(schema["items"])
        return transformed

    # Any other type (or an untyped dict) needs no rewriting.
    return schema