data-science-document-ai 1.42.1__py3-none-any.whl → 1.42.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: data-science-document-ai
3
- Version: 1.42.1
3
+ Version: 1.42.3
4
4
  Summary: "Document AI repo for data science"
5
5
  Author: Naomi Nguyen
6
6
  Author-email: naomi.nguyen@forto.com
@@ -2,15 +2,15 @@ src/constants.py,sha256=TF_UblovdXZnKIb1lnyJwUqQncJCbzBVihoelI6foSU,3579
2
2
  src/constants_sandbox.py,sha256=Iu6HdjCoNSmOX0AwoL9qUQkhq_ZnIN5U9e-Q2UfNuGc,547
3
3
  src/docai.py,sha256=dHuR0ehVjUi1CnoNvdp_yxJtpU_HFXqAZ61ywdz7BEo,5655
4
4
  src/docai_processor_config.yaml,sha256=qOMmCIORpLQ_D-ytvejXxFvER0e0uGYuzPVdZBGv4Pc,2105
5
- src/excel_processing.py,sha256=jBL6h5T3fJ4uM_rFiV8c0yWAy8Tt3V3RFtBBqb8ztfo,2744
5
+ src/excel_processing.py,sha256=8toKsafUvwE5QN3TOQO3zfLo0Wv2sGxZHKPsL7n5LkA,2771
6
6
  src/io.py,sha256=tOJpMyI-mP1AaXKG4UFudH47MHWzjWBgVahFJUcjGfs,4749
7
7
  src/llm.py,sha256=OE4IEIqcM-hYK9U7e0x1rAfcqdpeo4iXPHBp64L5Qz0,8199
8
8
  src/log_setup.py,sha256=RhHnpXqcl-ii4EJzRt47CF2R-Q3YPF68tepg_Kg7tkw,2895
9
9
  src/pdf_processing.py,sha256=dxsYvNnONAjzS-T7K5aSo89rz7QcdW3ZDfeuFyeCeII,16294
10
- src/postprocessing/common.py,sha256=lc95nGvy-KrFFQyX2X3ABMjrx1xVYDjuTBgeAXQTcuU,21570
10
+ src/postprocessing/common.py,sha256=xw_BpX8kDaL2SVPgSmSGmzHLdwMGe39YbbvEJJovsBI,21655
11
11
  src/postprocessing/postprocess_booking_confirmation.py,sha256=nK32eDiBNbauyQz0oCa9eraysku8aqzrcoRFoWVumDU,4827
12
12
  src/postprocessing/postprocess_commercial_invoice.py,sha256=3I8ijluTZcOs_sMnFZxfkAPle0UFQ239EMuvZfDZVPg,1028
13
- src/postprocessing/postprocess_partner_invoice.py,sha256=cM4te4qjOI_bXyrF8Zhb6X7eNf5aMKoRaPCFfqFv-98,11538
13
+ src/postprocessing/postprocess_partner_invoice.py,sha256=koGR7dN37FqJcepdzkrzNBHuBBUuCp_3CrteScASqyE,10590
14
14
  src/prompts/library/bookingConfirmation/evergreen/placeholders.json,sha256=IpM9nmSPdyroliZfXB1-NDCjiHZX_Ff5BH7-scNhGqE,1406
15
15
  src/prompts/library/bookingConfirmation/evergreen/prompt.txt,sha256=5ivskCG831M2scW3oqQaoltXIyHV-n6DYUygWycXxjw,2755
16
16
  src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json,sha256=hMPNt9s3LuxR85AxYy7bPcCDleug6gSwVjefm3ismWY,1405
@@ -42,7 +42,7 @@ src/prompts/library/finalMbL/maersk/prompt.txt,sha256=4neW6buJirgoS84iDsy9ZcfQTa
42
42
  src/prompts/library/finalMbL/other/prompt.txt,sha256=pj-kgPV51upLhDppSKfhc2s5ylgr06l4IxrkFYjE9uM,2241
43
43
  src/prompts/library/packingList/other/prompt.txt,sha256=6Q9d0KBG6YWmNtzFivvmtQmitaUE2jytfwwc5YwsUgQ,2872
44
44
  src/prompts/library/partnerInvoice/other/placeholders.json,sha256=AJNBVKwDGebyNAuyWEwEuaUTL9hbLK0Rjr2H0lNfOBY,8686
45
- src/prompts/library/partnerInvoice/other/prompt.txt,sha256=HuTUlCpUgDQKUKF5QYYoUoHZ0pkBIqX0g5NzciF_fps,9393
45
+ src/prompts/library/partnerInvoice/other/prompt.txt,sha256=XcZGsJsRIT5yQCtLU1O6dkxNZTazbX4FLPGTNs2yb9Q,9481
46
46
  src/prompts/library/postprocessing/port_code/placeholders.json,sha256=2TiXf3zSzrglOMPtDOlCntIa5RSvyZQAKG2-IgrCY5A,22
47
47
  src/prompts/library/postprocessing/port_code/prompt_port_code.txt,sha256=--1wunSqEr2ox958lEhjO-0JFBfOLzA3qfKYIzG_Iok,884
48
48
  src/prompts/library/preprocessing/carrier/placeholders.json,sha256=1UmrQNqBEsjLIpOO-a39Az6bQ_g1lxDGlwqZFU3IEt0,408
@@ -52,6 +52,6 @@ src/prompts/prompt_library.py,sha256=jPxybNPPGH7mzonqtAOqmw5WcT-RtbGP0pvMqqP22hg
52
52
  src/setup.py,sha256=M-p5c8M9ejKcSZ9N86VtmtPc4TYLxe1_4_dxf6jpfVc,7262
53
53
  src/tms.py,sha256=UXbIo1QE--hIX6NZi5Qyp2R_CP338syrY9pCTPrfgnE,1741
54
54
  src/utils.py,sha256=nU69zR3TB7IZmCc19DD8H27Riek8GJAldmhJjCSwNEE,16090
55
- data_science_document_ai-1.42.1.dist-info/METADATA,sha256=nsGhuml2YNlNF7s7aRUJPpY8psKss8wiLcIavpVInjs,2153
56
- data_science_document_ai-1.42.1.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
57
- data_science_document_ai-1.42.1.dist-info/RECORD,,
55
+ data_science_document_ai-1.42.3.dist-info/METADATA,sha256=p9Vii_gokX3EsFWtgDSjeY5fZ59gGLDVD3rFPXXHfOk,2153
56
+ data_science_document_ai-1.42.3.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
57
+ data_science_document_ai-1.42.3.dist-info/RECORD,,
src/excel_processing.py CHANGED
@@ -20,7 +20,8 @@ async def extract_data_from_sheet(
20
20
  params, sheet_name, sheet, response_schema, doc_type=None
21
21
  ):
22
22
  logger.info(f"Processing sheet: {sheet_name}")
23
- excel_content = pd.DataFrame(sheet.values)
23
+ excel_content = pd.DataFrame(sheet.values).dropna(how="all", axis=1)
24
+
24
25
  # Convert to Markdown format for the LLM model
25
26
  worksheet = (
26
27
  "This is from a excel. Pay attention to the cell position:\n"
@@ -456,6 +456,7 @@ async def format_label(entity_k, entity_value, document_type_code, params):
456
456
  elif "quantity" in entity_key:
457
457
  if document_type_code in ["partnerInvoice", "customsInvoice", "bundeskasse"]:
458
458
  # For partner invoice, quantity can be mentioned as whole number
459
+ # Apply decimal convertor for 46,45 --> 46.45 but not for 1.000 --> 1000
459
460
  formatted_value = decimal_convertor(
460
461
  extract_number(entity_value), quantity=True
461
462
  )
@@ -138,36 +138,7 @@ def update_recipient_and_vendor(aggregated_data, is_recipient_forto):
138
138
 
139
139
  def process_partner_invoice(params, aggregated_data, document_type_code):
140
140
  """Process the partner invoice data."""
141
- # Post process containerNumber.
142
- # TODO: Remove this block of code after migrating to LLM completely and update the placeholder in the prompt library
143
- if "containerNumber" in aggregated_data and isinstance(
144
- aggregated_data["containerNumber"], dict
145
- ):
146
- container_number = aggregated_data.get("containerNumber", {}).get(
147
- "formattedValue", None
148
- )
149
- if container_number:
150
- aggregated_data["containerNumber"] = (
151
- [
152
- {
153
- "documentValue": aggregated_data.get("containerNumber", {}).get(
154
- "documentValue", ""
155
- ),
156
- "formattedValue": ctr_number,
157
- }
158
- for ctr_number in container_number
159
- ]
160
- if isinstance(container_number, list)
161
- else [
162
- {
163
- "documentValue": aggregated_data.get("containerNumber", {}).get(
164
- "documentValue", ""
165
- ),
166
- "formattedValue": container_number,
167
- }
168
- ]
169
- )
170
-
141
+ # Post process bundeskasse invoices
171
142
  if document_type_code == "bundeskasse":
172
143
  post_process_bundeskasse(aggregated_data)
173
144
  return
@@ -197,9 +168,13 @@ def process_partner_invoice(params, aggregated_data, document_type_code):
197
168
  params,
198
169
  )
199
170
 
171
+ # Copy the page number from lineItemDescription to itemCode for consistency
172
+ line_item["itemCode"]["page"] = line_item["lineItemDescription"]["page"]
173
+
200
174
  if reverse_charge:
201
175
  # Distribute reverseChargeSentence to all line items
202
176
  line_item["reverseChargeSentence"] = reverse_charge
177
+ line_item["reverseChargeSentence"]["page"] = reverse_charge["page"]
203
178
 
204
179
 
205
180
  def compute_score(args):
@@ -52,7 +52,7 @@ Your role is to accurately extract specific entities from these invoices to supp
52
52
  - unitPrice: Even if the quantity is not mentioned, you can still extract the unit price. Check the naming of the columns in different languages; it can be "Unit Price", "Prezzo unitario", "Prix Unitaire", "Unitario", etc. Refer to the "Prezzo unitario" field in the Italian invoice example.
53
53
  - totalAmount: The total amount for the item. It can be in different currencies, so ensure to capture the currency as well for the totalAmountCurrency.
54
54
  - totalAmountEuro: A few line items contain a total amount in Euro. You can find it by looking for the term "Total EUR" or "Amount in Euro" in the line item, but it's always in the EURO / € currency. Sometimes, it can be the same as totalAmount if the line item is already in Euro.
55
- - quantity: The quantity of the item or service provided in the line item.
55
+ - quantity: The quantity of the item or service provided in the line item. Pay attention to notations such as 2x40HC: this means the quantity is 2 and the containerSize is 40HC; the quantity is not 240.
56
56
  - containerNumber: Container Number always starts with 4 letters and is followed by 7 digits (e.g., ABCD1234567).
57
57
 
58
58
  - hblNumber and mblNumber: