data-science-document-ai 1.44.0__py3-none-any.whl → 1.45.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: data-science-document-ai
3
- Version: 1.44.0
3
+ Version: 1.45.1
4
4
  Summary: "Document AI repo for data science"
5
5
  Author: Naomi Nguyen
6
6
  Author-email: naomi.nguyen@forto.com
@@ -6,11 +6,11 @@ src/excel_processing.py,sha256=_vP2q1xEIeyjO8TvZlSTeEM-M1PMceyDSuYGfyZeceY,3361
6
6
  src/io.py,sha256=rYjXVLlriEacw1uNuPIYhg12bXNu48Qs9GYMY2YcVTE,5563
7
7
  src/llm.py,sha256=OE4IEIqcM-hYK9U7e0x1rAfcqdpeo4iXPHBp64L5Qz0,8199
8
8
  src/log_setup.py,sha256=RhHnpXqcl-ii4EJzRt47CF2R-Q3YPF68tepg_Kg7tkw,2895
9
- src/pdf_processing.py,sha256=7ZNC-OCf3OlvmfzCqrY4Simv_Pofac-mgFyCi7WYUB0,20274
10
- src/postprocessing/common.py,sha256=KhXDxJ2AKfBrvYovA5ZyvW9IX76EFoTD4L6wnVCzxQ4,23322
9
+ src/pdf_processing.py,sha256=lzvoza9itpEyl-rcBQbIcWuFxUAvF_Qyc-OpuPQWWMk,20354
10
+ src/postprocessing/common.py,sha256=dagAg0hZGuZc03bXdfOolxekewMEVUfz917IGCiAtWI,26118
11
11
  src/postprocessing/postprocess_booking_confirmation.py,sha256=nK32eDiBNbauyQz0oCa9eraysku8aqzrcoRFoWVumDU,4827
12
12
  src/postprocessing/postprocess_commercial_invoice.py,sha256=3I8ijluTZcOs_sMnFZxfkAPle0UFQ239EMuvZfDZVPg,1028
13
- src/postprocessing/postprocess_partner_invoice.py,sha256=Hm9frILlIOvCWVcFNpyh0jLi6QEN9eBbHseZShYiISQ,12562
13
+ src/postprocessing/postprocess_partner_invoice.py,sha256=LZcMZfJeLdcbYqPemO8gn9SmJxv-NPmb4uVCT3lKg18,12341
14
14
  src/prompts/library/bookingConfirmation/evergreen/placeholders.json,sha256=IpM9nmSPdyroliZfXB1-NDCjiHZX_Ff5BH7-scNhGqE,1406
15
15
  src/prompts/library/bookingConfirmation/evergreen/prompt.txt,sha256=5ivskCG831M2scW3oqQaoltXIyHV-n6DYUygWycXxjw,2755
16
16
  src/prompts/library/bookingConfirmation/hapag-lloyd/placeholders.json,sha256=hMPNt9s3LuxR85AxYy7bPcCDleug6gSwVjefm3ismWY,1405
@@ -31,7 +31,7 @@ src/prompts/library/commercialInvoice/other/placeholders.json,sha256=zUK2mg9MnHi
31
31
  src/prompts/library/commercialInvoice/other/prompt.txt,sha256=CJapcVrmcvynJUanETDklkzU-0N9hHdhq5wL4MK7OIY,2683
32
32
  src/prompts/library/customsAssessment/other/prompt.txt,sha256=XSqWa3k9LM7dTiJtX8AKTp_0x5Z0pCNRKNUWaywwBlY,2191
33
33
  src/prompts/library/customsInvoice/other/placeholders.json,sha256=BnWYtl4sPooTHb_EHRIlrPawBrfHI8_QVas8zytbqyY,12172
34
- src/prompts/library/customsInvoice/other/prompt.txt,sha256=daSRssY8zcboCJCuqbLqehGR5dJs_wp4hOZHRol3KqU,9595
34
+ src/prompts/library/customsInvoice/other/prompt.txt,sha256=1dR73TQZJAfO9dKl-h7VhiJkdli498IV4e5JgBlOoYw,9695
35
35
  src/prompts/library/deliveryOrder/other/placeholders.json,sha256=7fjqag3kCVMV4mJ52dTjAcLtaBX0paXrDrW48vQVZSk,1250
36
36
  src/prompts/library/deliveryOrder/other/prompt.txt,sha256=y3QjN54e8PplEJngNlxoykbdrToBefS3r8gWixCbjfE,2468
37
37
  src/prompts/library/draftMbl/hapag-lloyd/prompt.txt,sha256=4FxiO1eHkimZVQZXU6gGNikuDVAWNniYvY8FUdVhpvk,2327
@@ -44,7 +44,7 @@ src/prompts/library/finalMbL/other/prompt.txt,sha256=pj-kgPV51upLhDppSKfhc2s5ylg
44
44
  src/prompts/library/packingList/other/placeholders.json,sha256=cGUUvEFoi4Lm0BAiyD29KbNFbUgzO1s7eit_qK3F0ig,4478
45
45
  src/prompts/library/packingList/other/prompt.txt,sha256=6Q9d0KBG6YWmNtzFivvmtQmitaUE2jytfwwc5YwsUgQ,2872
46
46
  src/prompts/library/partnerInvoice/other/placeholders.json,sha256=NX6ADT4gxLpP90uoNCYDbmfBvROxxVWRKK0lRFy1n9s,10897
47
- src/prompts/library/partnerInvoice/other/prompt.txt,sha256=bn1_CXrQy38DI7MXl6r40Cp-70w5cfXY6CQyBntvaX8,7944
47
+ src/prompts/library/partnerInvoice/other/prompt.txt,sha256=vMk-FBq9XkWiFiCf36t43DcIKNYh7IcGAsnfXq8vqio,8052
48
48
  src/prompts/library/postprocessing/port_code/placeholders.json,sha256=2TiXf3zSzrglOMPtDOlCntIa5RSvyZQAKG2-IgrCY5A,22
49
49
  src/prompts/library/postprocessing/port_code/prompt_port_code.txt,sha256=--1wunSqEr2ox958lEhjO-0JFBfOLzA3qfKYIzG_Iok,884
50
50
  src/prompts/library/preprocessing/carrier/placeholders.json,sha256=1UmrQNqBEsjLIpOO-a39Az6bQ_g1lxDGlwqZFU3IEt0,408
@@ -53,7 +53,7 @@ src/prompts/library/shippingInstruction/other/prompt.txt,sha256=dT2e-dPuvuz0rVYp
53
53
  src/prompts/prompt_library.py,sha256=VJWHeXN-s501C2GiidIIvQQuZdU6T1R27hE2dKBiI40,2555
54
54
  src/setup.py,sha256=M-p5c8M9ejKcSZ9N86VtmtPc4TYLxe1_4_dxf6jpfVc,7262
55
55
  src/tms.py,sha256=UXbIo1QE--hIX6NZi5Qyp2R_CP338syrY9pCTPrfgnE,1741
56
- src/utils.py,sha256=OqEu1apmN428_RgzqjRU5sZdEbECgBH0YiMpyys4Q5E,16947
57
- data_science_document_ai-1.44.0.dist-info/METADATA,sha256=jLyTuN383EQ-WdVsShEIsoj-t_ubnQ9VTSfSTKV3g9o,2152
58
- data_science_document_ai-1.44.0.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
59
- data_science_document_ai-1.44.0.dist-info/RECORD,,
56
+ src/utils.py,sha256=Ow5_Jals88o8mbZ1BoHfZpHZoCfig_UQb5aalH-mpWE,17278
57
+ data_science_document_ai-1.45.1.dist-info/METADATA,sha256=U2ASt9xmLqXeWIDx7cr0LBJFV9yJC4yh398R25jkWvs,2152
58
+ data_science_document_ai-1.45.1.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
59
+ data_science_document_ai-1.45.1.dist-info/RECORD,,
src/pdf_processing.py CHANGED
@@ -200,6 +200,7 @@ async def process_file_w_llm(params, file_content, input_doc_type, llm_client):
200
200
  file_content = extract_top_pages(file_content, num_pages=5)
201
201
 
202
202
  number_of_pages = get_pdf_page_count(file_content)
203
+ logger.info(f"processing {input_doc_type} with {number_of_pages} pages...")
203
204
 
204
205
  # get the schema placeholder from the Doc AI and generate the response structure
205
206
  response_schema = (
@@ -12,7 +12,7 @@ from src.constants import formatting_rules
12
12
  from src.io import logger
13
13
  from src.postprocessing.postprocess_partner_invoice import process_partner_invoice
14
14
  from src.prompts.prompt_library import prompt_library
15
- from src.utils import get_tms_mappings
15
+ from src.utils import batch_fetch_all_mappings, get_tms_mappings
16
16
 
17
17
  tms_domain = os.environ["TMS_DOMAIN"]
18
18
 
@@ -134,8 +134,11 @@ def extract_number(data_field_value):
134
134
  formatted_value: string
135
135
 
136
136
  """
137
+ # Remove container size pattern like 20FT, 40HC, etc from 1 x 40HC
138
+ value = remove_unwanted_patterns(data_field_value)
139
+
137
140
  formatted_value = ""
138
- for c in data_field_value:
141
+ for c in value:
139
142
  if c.isnumeric() or c in [",", ".", "-"]:
140
143
  formatted_value += c
141
144
 
@@ -320,9 +323,12 @@ def remove_unwanted_patterns(lineitem: str):
320
323
  lineitem = lineitem.replace("HIGH CUBE", "")
321
324
 
322
325
  # Remove container size e.g., 20FT, 40HC, etc.
323
- lineitem = re.sub(
324
- r"\b(20|22|40|45)(FT|HC|DC|HD|GP|OT|RF|FR|TK|DV)?\b", "", lineitem
325
- ).strip()
326
+ pattern = [
327
+ f"{s}{t}"
328
+ for s in ("20|22|40|45".split("|"))
329
+ for t in ("FT|HC|DC|HD|GP|OT|RF|FR|TK|DV".split("|"))
330
+ ]
331
+ lineitem = re.sub(r"|".join(pattern), "", lineitem, flags=re.IGNORECASE).strip()
326
332
 
327
333
  return lineitem
328
334
 
@@ -372,18 +378,45 @@ def clean_item_description(lineitem: str, remove_numbers: bool = True):
372
378
  return re.sub(r"\s{2,}", " ", lineitem).strip()
373
379
 
374
380
 
375
- async def format_label(entity_k, entity_value, document_type_code, params, mime_type):
381
+ async def format_label(
382
+ entity_k,
383
+ entity_value,
384
+ document_type_code,
385
+ params,
386
+ mime_type,
387
+ container_map,
388
+ terminal_map,
389
+ depot_map,
390
+ ):
376
391
  llm_client = params["LlmClient"]
377
392
  if isinstance(entity_value, dict): # if it's a nested entity
378
393
  format_tasks = [
379
- format_label(sub_k, sub_v, document_type_code, params, mime_type)
394
+ format_label(
395
+ sub_k,
396
+ sub_v,
397
+ document_type_code,
398
+ params,
399
+ mime_type,
400
+ container_map,
401
+ terminal_map,
402
+ depot_map,
403
+ )
380
404
  for sub_k, sub_v in entity_value.items()
381
405
  ]
382
406
  return entity_k, {k: v for k, v in await asyncio.gather(*format_tasks)}
383
407
  if isinstance(entity_value, list):
384
408
  format_tasks = await asyncio.gather(
385
409
  *[
386
- format_label(entity_k, sub_v, document_type_code, params, mime_type)
410
+ format_label(
411
+ entity_k,
412
+ sub_v,
413
+ document_type_code,
414
+ params,
415
+ mime_type,
416
+ container_map,
417
+ terminal_map,
418
+ depot_map,
419
+ )
387
420
  for sub_v in entity_value
388
421
  ]
389
422
  )
@@ -405,13 +438,13 @@ async def format_label(entity_k, entity_value, document_type_code, params, mime_
405
438
  )
406
439
 
407
440
  elif (entity_key == "containertype") or (entity_key == "containersize"):
408
- formatted_value = await get_tms_mappings(entity_value, "container_types")
441
+ formatted_value = container_map.get(entity_value)
409
442
 
410
443
  elif check_formatting_rule(entity_k, document_type_code, "terminal"):
411
- formatted_value = await get_tms_mappings(entity_value, "terminals")
444
+ formatted_value = terminal_map.get(entity_value)
412
445
 
413
446
  elif check_formatting_rule(entity_k, document_type_code, "depot"):
414
- formatted_value = await get_tms_mappings(entity_value, "depots")
447
+ formatted_value = depot_map.get(entity_value)
415
448
 
416
449
  elif entity_key.startswith(("eta", "etd", "duedate", "issuedate", "servicedate")):
417
450
  try:
@@ -507,7 +540,8 @@ async def get_port_code_ai(port: str, llm_client, doc_type=None):
507
540
  """Get port code using AI model."""
508
541
  port_llm = await get_port_code_llm(port, llm_client, doc_type=doc_type)
509
542
 
510
- return await get_tms_mappings(port, "ports", port_llm)
543
+ result = await get_tms_mappings(port, "ports", port_llm)
544
+ return result.get(port, None)
511
545
 
512
546
 
513
547
  async def get_port_code_llm(port: str, llm_client, doc_type=None):
@@ -598,6 +632,74 @@ def decimal_convertor(value, quantity=False):
598
632
  return value
599
633
 
600
634
 
635
+ async def collect_mapping_requests(entity_value, document_type_code):
636
+ """Collect all unique container types, terminals, and depots from the entity value."""
637
+ # Sets to store unique values
638
+ container_types = set()
639
+ terminals = set()
640
+ depots = set()
641
+
642
+ def walk(key, value):
643
+ key_lower = key.lower()
644
+
645
+ # nested dict
646
+ if isinstance(value, dict):
647
+ for k, v in value.items():
648
+ walk(k, v)
649
+
650
+ # list of values
651
+ elif isinstance(value, list):
652
+ for item in value:
653
+ walk(key, item)
654
+
655
+ # leaf node
656
+ else:
657
+ if key_lower in ("containertype", "containersize"):
658
+ # Take only "20DV" from ('20DV', 0) if it's a tuple
659
+ container_types.add(value[0]) if isinstance(
660
+ value, tuple
661
+ ) else container_types.add(value)
662
+
663
+ elif check_formatting_rule(key, document_type_code, "terminal"):
664
+ terminals.add(value[0]) if isinstance(value, tuple) else terminals.add(
665
+ value
666
+ )
667
+
668
+ elif check_formatting_rule(key, document_type_code, "depot"):
669
+ depots.add(value[0]) if isinstance(value, tuple) else depots.add(value)
670
+
671
+ walk("root", entity_value)
672
+
673
+ return container_types, terminals, depots
674
+
675
+
676
+ async def format_all_labels(entity_data, document_type_code, params, mime_type):
677
+ """Format all labels in the entity data using cached mappings."""
678
+ # Collect all mapping values needed
679
+ container_req, terminal_req, depot_req = await collect_mapping_requests(
680
+ entity_data, document_type_code
681
+ )
682
+
683
+ # Batch fetch mappings
684
+ container_map, terminal_map, depot_map = await batch_fetch_all_mappings(
685
+ container_req, terminal_req, depot_req
686
+ )
687
+
688
+ # Format labels using cached mappings
689
+ _, result = await format_label(
690
+ "root",
691
+ entity_data,
692
+ document_type_code,
693
+ params,
694
+ mime_type,
695
+ container_map,
696
+ terminal_map,
697
+ depot_map,
698
+ )
699
+
700
+ return _, result
701
+
702
+
601
703
  async def format_all_entities(result, document_type_code, params, mime_type):
602
704
  """Format the entity values in the result dictionary."""
603
705
  # Since we treat `customsInvoice` same as `partnerInvoice`
@@ -613,8 +715,8 @@ async def format_all_entities(result, document_type_code, params, mime_type):
613
715
  return {}
614
716
 
615
717
  # Format all entities recursively
616
- _, aggregated_data = await format_label(
617
- None, result, document_type_code, params, mime_type
718
+ _, aggregated_data = await format_all_labels(
719
+ result, document_type_code, params, mime_type
618
720
  )
619
721
 
620
722
  # Process partner invoice on lineitem mapping and reverse charge sentence
@@ -1,7 +1,5 @@
1
1
  """This module contains the postprocessing functions for the partner invoice."""
2
- from concurrent.futures import ThreadPoolExecutor
3
-
4
- from fuzzywuzzy import fuzz
2
+ from rapidfuzz import fuzz, process
5
3
 
6
4
  from src.io import logger
7
5
  from src.utils import get_tms_mappings
@@ -177,6 +175,7 @@ async def process_line_items_batch(
177
175
  pending_line_items = {}
178
176
 
179
177
  # Check Fuzzy Matching
178
+ logger.info(f"Mapping line item codes with Fuzzy matching....")
180
179
  for i, item in enumerate(line_items):
181
180
  description_obj = item.get("lineItemDescription")
182
181
 
@@ -231,12 +230,6 @@ async def process_line_items_batch(
231
230
  return line_items
232
231
 
233
232
 
234
- def compute_score(args):
235
- """Compute the fuzzy matching score between a new line item and a key."""
236
- new_lineitem, key = args
237
- return key, fuzz.ratio(new_lineitem, key)
238
-
239
-
240
233
  def get_fuzzy_match_score(target: str, sentences: list, threshold: int):
241
234
  """Get the best fuzzy match for a target string from a list of candidates.
242
235
 
@@ -249,16 +242,18 @@ def get_fuzzy_match_score(target: str, sentences: list, threshold: int):
249
242
  tuple: (best_match, score) if above threshold, else (None, 0)
250
243
  """
251
244
  # Use multiprocessing to find the best match
252
- with ThreadPoolExecutor() as executor:
253
- results = executor.map(compute_score, [(target, s) for s in sentences])
245
+ result = process.extractOne(
246
+ target, sentences, scorer=fuzz.WRatio, score_cutoff=threshold
247
+ )
254
248
 
255
- # Find the best match and score
256
- best_match, best_score = max(results, key=lambda x: x[1], default=(None, 0))
249
+ if result is None:
250
+ return None, False
257
251
 
258
- # return best_match, best_score
259
- # If the best match score is above a threshold (e.g., 80), return it
260
- if best_score >= threshold:
261
- return best_match, True
252
+ match, score, index = result
253
+
254
+ # return best_match if the best match score is above a threshold (e.g., 80)
255
+ if match:
256
+ return match, True
262
257
 
263
258
  return None, False
264
259
 
@@ -290,18 +285,22 @@ def find_matching_lineitem(new_lineitem: str, kvp_dict: dict, threshold=90):
290
285
  Returns:
291
286
  str: The best matching 'Forto SLI' value from the dictionary.
292
287
  """
293
- new_lineitem = new_lineitem.upper()
294
-
295
288
  # Check if the new line item is already in the dictionary
296
289
  if new_lineitem in kvp_dict:
297
290
  return kvp_dict[new_lineitem]
298
291
 
299
292
  # Get the best fuzzy match score for the extracted line item
300
- best_match, _ = get_fuzzy_match_score(
301
- new_lineitem, list(kvp_dict.keys()), threshold
293
+ match, _ = get_fuzzy_match_score(
294
+ new_lineitem,
295
+ list(kvp_dict.keys()),
296
+ threshold,
302
297
  )
303
298
 
304
- return kvp_dict.get(best_match, None)
299
+ if match:
300
+ # find the code from the kvp_dict
301
+ return kvp_dict[match]
302
+
303
+ return None
305
304
 
306
305
 
307
306
  async def associate_forto_item_code(line_item_data, params):
@@ -54,7 +54,7 @@ Your role is to accurately extract specific entities from these invoices to supp
54
54
  - unitPrice: Even if the quantity is not mentioned, you can still extract the unit price. Check the naming of the columns in a different languages, it can be "Unit Price", "Prezzo unitario", "Prix Unitaire", "Unitario", etc. Refer to "Prezzo unitario" field in the italian invoice example.
55
55
  - totalAmount: The total amount for the item. It can be in different currencies, so ensure to capture the currency as well for the totalAmountCurrency.
56
56
  - totalAmountEuro: Few line items contains a total amount in Euro. You can find it by looking for the term "Total EUR" or "Amount in Euro" in the line item but it's always in the EURO / € currency. Sometimes, it can be same as totalAmount if the line item is already in Euro.
57
- - quantity: The quantity of the item or service provided in the line item.
57
+ - quantity: The quantity of the item or service provided in the line item. Pay attention to 2 x 40HC or 2x40HC. It means, quantity is 2 and 40HC is containerSize but not 240.
58
58
  - containerNumber: Container Number always starts with 4 letters and is followed by 7 digits (e.g., ABCD1234567).
59
59
 
60
60
  - hblNumber and mblNumber:
@@ -52,7 +52,7 @@ Your role is to accurately extract specific entities from these invoices to supp
52
52
  - unitPrice: Even if the quantity is not mentioned, you can still extract the unit price. Check the naming of the columns in a different languages, it can be "Unit Price", "Prezzo unitario", "Prix Unitaire", "Unitario", etc. Refer to "Prezzo unitario" field in the italian invoice example.
53
53
  - totalAmount: The total amount for the item. It can be in different currencies, so ensure to capture the currency as well for the totalAmountCurrency.
54
54
  - totalAmountEuro: Few line items contains a total amount in Euro. You can find it by looking for the term "Total EUR" or "Amount in Euro" in the line item but it's always in the EURO / € currency. Sometimes, it can be same as totalAmount if the line item is already in Euro.
55
- - quantity: The quantity of the item or service provided in the line item. Pay attention to 2x40HC. It means, quantity is 2 and containerSize is 40HC but not 240.
55
+ - quantity: The quantity of the item or service provided in the line item. Pay attention to 2 x 40HC or 2x40HC. It means, quantity is 2 and 40HC is containerSize but not 240.
56
56
  - containerNumber: Container Number always starts with 4 letters and is followed by 7 digits (e.g., ABCD1234567, XALU 8593678).
57
57
 
58
58
  - hblNumber and mblNumber:
@@ -68,6 +68,7 @@ Your role is to accurately extract specific entities from these invoices to supp
68
68
  - Example:
69
69
  - "COSCO SHIPPING Lines Italy, Poland, or France S.R.L. – Genova Office – As Agent For COSCO SHIPPING Lines Co.,Ltd."
70
70
  - vendorName: COSCO SHIPPING Lines Co.,Ltd.
71
+ - From Hapag-Lloyd invoices, look for "Ballindamm 25" address to extract the vendorAddress.
71
72
 
72
73
  - agentName: Name of the agent. Agencies are offices authorized to act on behalf of a company. This details usually available including the branch name of the parent company name in the invoice.
73
74
  - agentKeyWord:
src/utils.py CHANGED
@@ -406,16 +406,7 @@ async def get_tms_mappings(
406
406
  response.raise_for_status()
407
407
 
408
408
  # Structure expected: {"response": {"data": {"desc1": "code1", "desc2": "code2"}}}
409
- if embedding_type == "line_items":
410
- # For line_items, return the full data mapping
411
- return response.json().get("response", {}).get("data", {})
412
- else:
413
- return (
414
- response.json()
415
- .get("response", {})
416
- .get("data", {})
417
- .get(input_list[0], None)
418
- )
409
+ return response.json().get("response", {}).get("data", {})
419
410
 
420
411
  except httpx.HTTPStatusError as exc:
421
412
  logger.error(
@@ -424,6 +415,25 @@ async def get_tms_mappings(
424
415
  return {}
425
416
 
426
417
 
418
+ async def batch_fetch_all_mappings(container_types, terminals, depots):
419
+ """Batch fetch all mappings for container types, terminals, and depots."""
420
+ # run batch calls concurrently
421
+ results = await asyncio.gather(
422
+ get_tms_mappings(list(container_types), "container_types"),
423
+ get_tms_mappings(list(terminals), "terminals"),
424
+ get_tms_mappings(list(depots), "depots"),
425
+ )
426
+
427
+ batch_container_map, batch_terminal_map, batch_depot_map = results
428
+
429
+ # Convert lists of tuples to dicts if necessary
430
+ return (
431
+ dict(batch_container_map or {}),
432
+ dict(batch_terminal_map or {}),
433
+ dict(batch_depot_map or {}),
434
+ )
435
+
436
+
427
437
  def transform_schema_strings(schema):
428
438
  """
429
439
  Recursively transforms a schema dictionary, replacing all "type": "STRING"