streamlit-octostar-utils 0.5.5.dev1__tar.gz → 0.5.6.dev2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. {streamlit_octostar_utils-0.5.5.dev1 → streamlit_octostar_utils-0.5.6.dev2}/PKG-INFO +1 -1
  2. {streamlit_octostar_utils-0.5.5.dev1 → streamlit_octostar_utils-0.5.6.dev2}/pyproject.toml +1 -1
  3. {streamlit_octostar_utils-0.5.5.dev1 → streamlit_octostar_utils-0.5.6.dev2}/streamlit_octostar_utils/api_crafter/nifi.py +16 -218
  4. streamlit_octostar_utils-0.5.6.dev2/streamlit_octostar_utils/core/opensearch_conversion.py +354 -0
  5. {streamlit_octostar_utils-0.5.5.dev1 → streamlit_octostar_utils-0.5.6.dev2}/LICENSE +0 -0
  6. {streamlit_octostar_utils-0.5.5.dev1 → streamlit_octostar_utils-0.5.6.dev2}/README.md +0 -0
  7. {streamlit_octostar_utils-0.5.5.dev1 → streamlit_octostar_utils-0.5.6.dev2}/streamlit_octostar_utils/__init__.py +0 -0
  8. {streamlit_octostar_utils-0.5.5.dev1 → streamlit_octostar_utils-0.5.6.dev2}/streamlit_octostar_utils/api_crafter/__init__.py +0 -0
  9. {streamlit_octostar_utils-0.5.5.dev1 → streamlit_octostar_utils-0.5.6.dev2}/streamlit_octostar_utils/api_crafter/celery.py +0 -0
  10. {streamlit_octostar_utils-0.5.5.dev1 → streamlit_octostar_utils-0.5.6.dev2}/streamlit_octostar_utils/api_crafter/contents.py +0 -0
  11. {streamlit_octostar_utils-0.5.5.dev1 → streamlit_octostar_utils-0.5.6.dev2}/streamlit_octostar_utils/api_crafter/fastapi.py +0 -0
  12. {streamlit_octostar_utils-0.5.5.dev1 → streamlit_octostar_utils-0.5.6.dev2}/streamlit_octostar_utils/api_crafter/parallelism.py +0 -0
  13. {streamlit_octostar_utils-0.5.5.dev1 → streamlit_octostar_utils-0.5.6.dev2}/streamlit_octostar_utils/api_crafter/parser/__init__.py +0 -0
  14. {streamlit_octostar_utils-0.5.5.dev1 → streamlit_octostar_utils-0.5.6.dev2}/streamlit_octostar_utils/api_crafter/parser/combine_fields.py +0 -0
  15. {streamlit_octostar_utils-0.5.5.dev1 → streamlit_octostar_utils-0.5.6.dev2}/streamlit_octostar_utils/api_crafter/parser/entities_parser.py +0 -0
  16. {streamlit_octostar_utils-0.5.5.dev1 → streamlit_octostar_utils-0.5.6.dev2}/streamlit_octostar_utils/api_crafter/parser/generics.py +0 -0
  17. {streamlit_octostar_utils-0.5.5.dev1 → streamlit_octostar_utils-0.5.6.dev2}/streamlit_octostar_utils/api_crafter/parser/info.py +0 -0
  18. {streamlit_octostar_utils-0.5.5.dev1 → streamlit_octostar_utils-0.5.6.dev2}/streamlit_octostar_utils/api_crafter/parser/linkchart_functions.py +0 -0
  19. {streamlit_octostar_utils-0.5.5.dev1 → streamlit_octostar_utils-0.5.6.dev2}/streamlit_octostar_utils/api_crafter/parser/matches.py +0 -0
  20. {streamlit_octostar_utils-0.5.5.dev1 → streamlit_octostar_utils-0.5.6.dev2}/streamlit_octostar_utils/api_crafter/parser/parameters.py +0 -0
  21. {streamlit_octostar_utils-0.5.5.dev1 → streamlit_octostar_utils-0.5.6.dev2}/streamlit_octostar_utils/api_crafter/parser/rules.py +0 -0
  22. {streamlit_octostar_utils-0.5.5.dev1 → streamlit_octostar_utils-0.5.6.dev2}/streamlit_octostar_utils/api_crafter/parser/signals.py +0 -0
  23. {streamlit_octostar_utils-0.5.5.dev1 → streamlit_octostar_utils-0.5.6.dev2}/streamlit_octostar_utils/core/__init__.py +0 -0
  24. {streamlit_octostar_utils-0.5.5.dev1 → streamlit_octostar_utils-0.5.6.dev2}/streamlit_octostar_utils/core/dict.py +0 -0
  25. {streamlit_octostar_utils-0.5.5.dev1 → streamlit_octostar_utils-0.5.6.dev2}/streamlit_octostar_utils/core/filetypes.py +0 -0
  26. {streamlit_octostar_utils-0.5.5.dev1 → streamlit_octostar_utils-0.5.6.dev2}/streamlit_octostar_utils/core/threading/__init__.py +0 -0
  27. {streamlit_octostar_utils-0.5.5.dev1 → streamlit_octostar_utils-0.5.6.dev2}/streamlit_octostar_utils/core/threading/key_queue.py +0 -0
  28. {streamlit_octostar_utils-0.5.5.dev1 → streamlit_octostar_utils-0.5.6.dev2}/streamlit_octostar_utils/core/timestamp.py +0 -0
  29. {streamlit_octostar_utils-0.5.5.dev1 → streamlit_octostar_utils-0.5.6.dev2}/streamlit_octostar_utils/nlp/__init__.py +0 -0
  30. {streamlit_octostar_utils-0.5.5.dev1 → streamlit_octostar_utils-0.5.6.dev2}/streamlit_octostar_utils/nlp/custom_recognizers.py +0 -0
  31. {streamlit_octostar_utils-0.5.5.dev1 → streamlit_octostar_utils-0.5.6.dev2}/streamlit_octostar_utils/nlp/language.py +0 -0
  32. {streamlit_octostar_utils-0.5.5.dev1 → streamlit_octostar_utils-0.5.6.dev2}/streamlit_octostar_utils/nlp/ner.py +0 -0
  33. {streamlit_octostar_utils-0.5.5.dev1 → streamlit_octostar_utils-0.5.6.dev2}/streamlit_octostar_utils/octostar/__init__.py +0 -0
  34. {streamlit_octostar_utils-0.5.5.dev1 → streamlit_octostar_utils-0.5.6.dev2}/streamlit_octostar_utils/octostar/client.py +0 -0
  35. {streamlit_octostar_utils-0.5.5.dev1 → streamlit_octostar_utils-0.5.6.dev2}/streamlit_octostar_utils/octostar/context.py +0 -0
  36. {streamlit_octostar_utils-0.5.5.dev1 → streamlit_octostar_utils-0.5.6.dev2}/streamlit_octostar_utils/octostar/permissions.py +0 -0
  37. {streamlit_octostar_utils-0.5.5.dev1 → streamlit_octostar_utils-0.5.6.dev2}/streamlit_octostar_utils/ontology/__init__.py +0 -0
  38. {streamlit_octostar_utils-0.5.5.dev1 → streamlit_octostar_utils-0.5.6.dev2}/streamlit_octostar_utils/ontology/inheritance.py +0 -0
  39. {streamlit_octostar_utils-0.5.5.dev1 → streamlit_octostar_utils-0.5.6.dev2}/streamlit_octostar_utils/ontology/relationships.py +0 -0
  40. {streamlit_octostar_utils-0.5.5.dev1 → streamlit_octostar_utils-0.5.6.dev2}/streamlit_octostar_utils/ontology/validation.py +0 -0
  41. {streamlit_octostar_utils-0.5.5.dev1 → streamlit_octostar_utils-0.5.6.dev2}/streamlit_octostar_utils/style/__init__.py +0 -0
  42. {streamlit_octostar_utils-0.5.5.dev1 → streamlit_octostar_utils-0.5.6.dev2}/streamlit_octostar_utils/style/common.py +0 -0
  43. {streamlit_octostar_utils-0.5.5.dev1 → streamlit_octostar_utils-0.5.6.dev2}/streamlit_octostar_utils/threading/__init__.py +0 -0
  44. {streamlit_octostar_utils-0.5.5.dev1 → streamlit_octostar_utils-0.5.6.dev2}/streamlit_octostar_utils/threading/async_task_manager.py +0 -0
  45. {streamlit_octostar_utils-0.5.5.dev1 → streamlit_octostar_utils-0.5.6.dev2}/streamlit_octostar_utils/threading/session_callback_manager.py +0 -0
  46. {streamlit_octostar_utils-0.5.5.dev1 → streamlit_octostar_utils-0.5.6.dev2}/streamlit_octostar_utils/threading/session_state_hot_swapper.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: streamlit-octostar-utils
3
- Version: 0.5.5.dev1
3
+ Version: 0.5.6.dev2
4
4
  Summary:
5
5
  License: MIT
6
6
  License-File: LICENSE
@@ -5,7 +5,7 @@ include = '\.pyi?$'
5
5
 
6
6
  [tool.poetry]
7
7
  name = "streamlit-octostar-utils"
8
- version = "0.5.5-dev.1"
8
+ version = "0.5.6-dev.2"
9
9
  description = ""
10
10
  license = "MIT"
11
11
  authors = ["Octostar"]
@@ -17,7 +17,6 @@ from starlette.exceptions import HTTPException as StarletteHTTPException
17
17
 
18
18
  from octostar.utils.workspace import upsert_entities
19
19
  from octostar.utils.ontology import fetch_ontology_data
20
- from octostar.utils.workspace.permissions import get_permissions, PermissionLevel
21
20
  from octostar.utils.pipeline import update_processing_status
22
21
 
23
22
  from octostar.client import make_client
@@ -54,144 +53,6 @@ OS_RESERVED_FIELDS = [
54
53
  MAX_IN_MEMORY_SIZE_BYTES = 5_242_880
55
54
 
56
55
 
57
- class NifiPriority:
58
- COMPONENT_WIDTHS = (10, 10, 1, 20)
59
- SEPARATOR = "."
60
- MAX_FRAGMENT_DEPTH = 9
61
- BASE62_CHARS = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
62
- BASE = len(BASE62_CHARS)
63
- _BASE62_SET = frozenset(BASE62_CHARS)
64
-
65
- @staticmethod
66
- def _normalize_component(value, width):
67
- if value is None:
68
- return "z" * width
69
- s = str(value)
70
- invalid = set(s) - NifiPriority._BASE62_SET
71
- if invalid:
72
- raise ValueError(f"Invalid base62 character(s): {sorted(invalid)!r}")
73
- if len(s) > width:
74
- return "z" * width
75
- return s.rjust(width, "0")
76
-
77
- def __init__(self, op_reserved=None, user_prio=None, fragment_prio=0, entity_timestamp=0):
78
- w = self.COMPONENT_WIDTHS
79
- self.op_reserved = self._normalize_component(op_reserved, w[0])
80
- self.user_prio = self._normalize_component(user_prio, w[1])
81
- self.fragment_prio = int(fragment_prio)
82
- self.entity_timestamp = int(entity_timestamp)
83
-
84
- @staticmethod
85
- def _encode_base62(value, width):
86
- chars = NifiPriority.BASE62_CHARS
87
- base = NifiPriority.BASE
88
- if value < 0:
89
- value = 0
90
- if value == 0:
91
- return "0" * width
92
- result = []
93
- v = value
94
- while v > 0:
95
- result.append(chars[v % base])
96
- v //= base
97
- result.reverse()
98
- s = "".join(result)
99
- if len(s) > width:
100
- return chars[-1] * width
101
- return s.rjust(width, "0")
102
-
103
- @staticmethod
104
- def _decode_base62(s):
105
- chars = NifiPriority.BASE62_CHARS
106
- result = 0
107
- for c in s:
108
- idx = chars.find(c)
109
- if idx < 0:
110
- raise ValueError(f"Invalid base62 character: {c!r}")
111
- result = result * NifiPriority.BASE + idx
112
- return result
113
-
114
- @classmethod
115
- def from_string(cls, s):
116
- if not s or cls.SEPARATOR not in s:
117
- return cls()
118
- parts = s.split(cls.SEPARATOR)
119
- if len(parts) != 4:
120
- return cls()
121
- try:
122
- for part in parts:
123
- if set(part) - cls._BASE62_SET:
124
- return cls()
125
- return cls(
126
- op_reserved=parts[0],
127
- user_prio=parts[1],
128
- fragment_prio=cls._decode_base62(parts[2]),
129
- entity_timestamp=cls._decode_base62(parts[3]),
130
- )
131
- except (ValueError, TypeError):
132
- return cls()
133
-
134
- @classmethod
135
- def from_dict(cls, d):
136
- if not d:
137
- return cls()
138
- return cls(
139
- op_reserved=d.get("op_reserved"),
140
- user_prio=d.get("user_prio"),
141
- fragment_prio=d.get("fragment_prio", 0),
142
- entity_timestamp=d.get("entity_timestamp", 0),
143
- )
144
-
145
- @classmethod
146
- def from_entity(cls, entity):
147
- existing = entity.request.get("priority", {})
148
- prio = cls.from_dict(existing)
149
- prio.fragment_prio = cls.compute_fragment_depth(entity)
150
- ts_str = entity.request.get("entity_timestamp")
151
- if ts_str:
152
- try:
153
- dt = string_to_datetime(ts_str)
154
- prio.entity_timestamp = int(dt.timestamp() * 1000)
155
- except Exception:
156
- prio.entity_timestamp = 0
157
- else:
158
- prio.entity_timestamp = 0
159
- return prio
160
-
161
- @staticmethod
162
- def compute_fragment_depth(entity):
163
- stack = entity.request.get("config", {}).get("fragment", {}).get("fragments_stack", [])
164
- depth = 0
165
- for key in stack:
166
- try:
167
- info = NifiFragmenter.get_fragment_info(entity, key)
168
- if info.get("index", 0) != 0:
169
- depth += 1
170
- else:
171
- break
172
- except (KeyError, RuntimeError):
173
- break
174
- return min(depth, 9)
175
-
176
- def to_string(self):
177
- w = self.COMPONENT_WIDTHS
178
- return self.SEPARATOR.join([
179
- self.op_reserved,
180
- self.user_prio,
181
- self._encode_base62(self.fragment_prio, w[2]),
182
- self._encode_base62(self.entity_timestamp, w[3]),
183
- ])
184
-
185
- def to_dict(self):
186
- return {
187
- "op_reserved": self.op_reserved,
188
- "user_prio": self.user_prio,
189
- }
190
-
191
- def apply_to_entity(self, entity):
192
- entity.request["priority"] = self.to_dict()
193
-
194
-
195
56
  class NifiProxyEntityModel(BaseModel):
196
57
  entity_id: str
197
58
  entity_type: str
@@ -223,8 +84,6 @@ class NifiEntityModel(BaseModel):
223
84
  is_temporary: bool = False
224
85
  exception: dict = Field(default_factory=dict)
225
86
  last_processor_name: Optional[str] = None
226
- fallback_os_workspace: Optional[str] = None
227
- priority: dict = Field(default_factory=dict)
228
87
 
229
88
  class RecordModel(BaseModel):
230
89
  model_config = ConfigDict(extra="allow")
@@ -614,7 +473,6 @@ class NifiContextManager(object):
614
473
  def __init__(self, json_data, lazy_sync=True):
615
474
  if not json_data:
616
475
  raise ValueError("Nifi context manager received list of 0 entities")
617
- self.permissions = {}
618
476
  self.in_batches = None
619
477
  self.out_entities = None
620
478
  self.nonlazy_sync_ids = set()
@@ -667,8 +525,6 @@ class NifiContextManager(object):
667
525
  key=lambda x: string_to_datetime(x.request.get("entity_timestamp")),
668
526
  )
669
527
  entities = list({e.record["entity_id"]: e for e in entities}.values())
670
- for entity in entities:
671
- NifiPriority.from_entity(entity).apply_to_entity(entity)
672
528
  entities = [
673
529
  (
674
530
  jsondict_hash(NifiContextManager._config_get(entity, processor_name)),
@@ -693,16 +549,6 @@ class NifiContextManager(object):
693
549
  def __enter__(self):
694
550
  return self
695
551
 
696
- def get_workspaces_permissions(self, workspace_ids):
697
- permissions_to_fetch = list(set(workspace_ids).difference(set(list(self.permissions.keys()))))
698
- if permissions_to_fetch:
699
- permissions = get_permissions.sync(permissions_to_fetch, client=self.client)
700
- self.permissions.update(permissions)
701
- permissions = {}
702
- for k in workspace_ids:
703
- permissions[k] = self.permissions.get(k, PermissionLevel.NONE)
704
- return permissions
705
-
706
552
  def request_entity_sync(
707
553
  self,
708
554
  entity,
@@ -743,9 +589,6 @@ class NifiContextManager(object):
743
589
  )
744
590
  self.out_entities = list({e.record["entity_id"]: e for e in all_entities}.values())
745
591
  self.sync_entities()
746
- for entity in self.out_entities:
747
- prio = NifiPriority.from_entity(entity)
748
- entity.request["nifi_attributes"]["priority"] = prio.to_string()
749
592
  return [entity for entity in self.jsonify(self.out_entities)["content"]]
750
593
 
751
594
  def raise_exception(self, entity, exc):
@@ -772,16 +615,18 @@ class NifiContextManager(object):
772
615
  import logging
773
616
  _lock_logger = logging.getLogger(__name__)
774
617
 
775
- records = []
776
- for e in entities:
777
- if isinstance(e, dict):
778
- records.append(
779
- (e, e.get("entity_timestamp"))
780
- )
781
- else:
782
- records.append(
783
- (e.record, e.request.get("entity_timestamp") if e.request else None)
784
- )
618
+ def _read_entries():
619
+ entries = []
620
+ for e in entities:
621
+ if isinstance(e, dict):
622
+ entries.append((e, e.get("entity_timestamp")))
623
+ else:
624
+ entries.append(
625
+ (e.record, e.request.get("entity_timestamp") if e.request else None)
626
+ )
627
+ return entries
628
+
629
+ records = _read_entries()
785
630
 
786
631
  long_expiry = (datetime.now(timezone.utc) + timedelta(seconds=timeout)).strftime("%Y-%m-%dT%H:%M:%SZ")
787
632
  statuses = [
@@ -815,6 +660,7 @@ class NifiContextManager(object):
815
660
  try:
816
661
  yield True
817
662
  finally:
663
+ records = _read_entries()
818
664
  short_expiry = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
819
665
  statuses = [
820
666
  {
@@ -849,10 +695,8 @@ class NifiContextManager(object):
849
695
  self._sync_upsert_entities(entities_to_upsert)
850
696
  self._sync_fetch_relationships(entities, fetch_rel_entities, fetch_concept_rels)
851
697
 
852
- now_ts = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
853
698
  for entity in all_entities_to_modify:
854
699
  entity.request["is_temporary"] = False
855
- entity.request["entity_timestamp"] = now_ts
856
700
 
857
701
  for entity in entities:
858
702
  entity.sync_params = {}
@@ -1114,29 +958,7 @@ class NifiEntity(object):
1114
958
 
1115
959
  @property
1116
960
  def write_os_workspace(self):
1117
- permissions = self.context.get_workspaces_permissions(
1118
- [
1119
- e
1120
- for e in [
1121
- self.record.get("os_workspace"),
1122
- self.request.get("fallback_os_workspace"),
1123
- ]
1124
- if e
1125
- ]
1126
- )
1127
- if (
1128
- self.record.get("os_workspace")
1129
- and (permissions.get(self.record.get("os_workspace")) or PermissionLevel.NONE) >= PermissionLevel.WRITE
1130
- ):
1131
- return self.record["os_workspace"]
1132
- elif (
1133
- self.request.get("fallback_os_workspace")
1134
- and (permissions.get(self.request.get("fallback_os_workspace")) or PermissionLevel.NONE)
1135
- >= PermissionLevel.WRITE
1136
- ):
1137
- return self.request["fallback_os_workspace"]
1138
- else:
1139
- return None
961
+ return self.record.get("os_workspace")
1140
962
 
1141
963
  @property
1142
964
  def label(self):
@@ -1164,29 +986,6 @@ class NifiEntity(object):
1164
986
  options={"verify_signature": False},
1165
987
  )
1166
988
 
1167
- @property
1168
- def priority(self):
1169
- return NifiPriority.from_entity(self)
1170
-
1171
- @priority.setter
1172
- def priority(self, value):
1173
- if isinstance(value, NifiPriority):
1174
- self.request["priority"] = value.to_dict()
1175
- elif isinstance(value, dict):
1176
- self.request["priority"] = value
1177
- else:
1178
- raise TypeError("priority must be a NifiPriority or dict")
1179
-
1180
- def set_user_priority(self, value):
1181
- prio = self.priority
1182
- prio.user_prio = NifiPriority._normalize_component(value, NifiPriority.COMPONENT_WIDTHS[1])
1183
- self.priority = prio
1184
-
1185
- def set_op_priority(self, value):
1186
- prio = self.priority
1187
- prio.op_reserved = NifiPriority._normalize_component(value, NifiPriority.COMPONENT_WIDTHS[0])
1188
- self.priority = prio
1189
-
1190
989
  def update_last_timestamp(self):
1191
990
  self.record["os_last_updated_at"] = now()
1192
991
 
@@ -1377,8 +1176,6 @@ class NifiEntity(object):
1377
1176
  "is_temporary": True,
1378
1177
  "exception": {},
1379
1178
  "last_processor_name": None,
1380
- "fallback_os_workspace": self.request["fallback_os_workspace"],
1381
- "priority": deepcopy(self.request.get("priority", {})),
1382
1179
  }
1383
1180
  child_entity = NifiEntity(
1384
1181
  self.context,
@@ -1604,6 +1401,7 @@ class NifiEntity(object):
1604
1401
  os_entity_uid=None,
1605
1402
  os_relationship_uid=None,
1606
1403
  os_entity_type=FRAGMENT_ENTITY_NAME,
1404
+ os_parent_uid=None,
1607
1405
  previous_fragment_uid=None,
1608
1406
  previous_fragment_relationship_uid=None,
1609
1407
  previous_fragment_relationship=PREVIOUS_FRAGMENT_RELATIONSHIP,
@@ -1616,7 +1414,7 @@ class NifiEntity(object):
1616
1414
  fields = {
1617
1415
  **{k: v for k, v in self.record.items() if k.startswith("fragment") and v is not None},
1618
1416
  **fields,
1619
- "os_parent_uid": self.record["os_entity_uid"],
1417
+ "os_parent_uid": os_parent_uid or self.record["os_entity_uid"],
1620
1418
  "source_entity_uid": source_entity_uid,
1621
1419
  "previous_entity_uid": previous_fragment_uid,
1622
1420
  "next_entity_uid": next_fragment_uid,
@@ -0,0 +1,354 @@
1
+ """Mapping-aware client-side conversion of Python values to OpenSearch types.
2
+
3
+ Recursively walks a data dict alongside an OpenSearch index mapping and
4
+ coerces Python values so they match the expected field types (text, keyword,
5
+ integer, date, binary, knn_vector, nested, etc.).
6
+
7
+ Primary entry point:
8
+ ``convert_clientside(data, curr_mapping)``
9
+
10
+ The *curr_mapping* can be either the raw ``{"properties": {...}}`` tree or
11
+ just the inner ``properties`` dict -- the function handles both.
12
+ """
13
+
14
+ from .dict import travel_dict
15
+ from .timestamp import string_to_datetime
16
+ import json
17
+ import base64
18
+ import logging
19
+ import datetime as dt
20
+ from typing import Dict, Any
21
+
22
logger = logging.getLogger(__name__)

# Metadata properties appended, in this order, when building a vector field
# name (consumed by ``get_vector_name``).
vector_nomenclature_order = [
    "model_name",
    "dim",
    "model_version",
]

# Timestamp layout emitted by every ``date`` conversion below.
_DATE_FORMAT = "%Y-%m-%dT%H:%M:%SZ"

# Spellings accepted as boolean True for textual input (case-insensitive).
_TRUE_STRINGS = frozenset({"true", "1", "yes", "y", "on"})


def _text_to_bool(text):
    """Return True only when *text* spells a truthy value such as "true" or "1".

    ``bool("false")`` is True in Python, so textual input has to be parsed
    rather than truth-tested.
    """
    return text.lower() in _TRUE_STRINGS


def _epoch_to_date(seconds):
    """Format a Unix timestamp (in seconds) as a UTC date string.

    The output format carries a literal ``Z`` suffix, so the timestamp is
    interpreted in UTC instead of the machine's local timezone.
    """
    moment = dt.datetime.fromtimestamp(seconds, tz=dt.timezone.utc)
    return moment.strftime(_DATE_FORMAT)


def _b64(raw):
    """Base64-encode *raw* bytes and return the result as text."""
    return base64.b64encode(raw).decode("utf-8")


# Maps ``(python type name, OpenSearch field type)`` to a converter.  Every
# converter returns ``(converted_value, recurse)``; ``recurse`` is True when
# the result is a container whose children still need converting.
conversion_matrix = {
    # --- str ---
    ("str", "text"): lambda d: (d, False),
    ("str", "keyword"): lambda d: (d, False),
    ("str", "boolean"): lambda d: (_text_to_bool(d), False),
    ("str", "integer"): lambda d: (int(d), False),
    ("str", "long"): lambda d: (int(d), False),
    ("str", "float"): lambda d: (float(d), False),
    ("str", "double"): lambda d: (float(d), False),
    # NOTE(review): assumes string_to_datetime yields a UTC-equivalent value,
    # since the format appends "Z" unconditionally -- confirm in .timestamp.
    ("str", "date"): lambda d: (
        string_to_datetime(d).strftime(_DATE_FORMAT),
        False,
    ),
    ("str", "binary"): lambda d: (_b64(d.encode("utf-8")), False),
    ("str", "object"): lambda d: (json.loads(d), True),
    ("str", "nested"): lambda d: (json.loads(d), True),
    # --- bool ---
    ("bool", "text"): lambda d: (str(d), False),
    ("bool", "keyword"): lambda d: (str(d), False),
    ("bool", "boolean"): lambda d: (d, False),
    ("bool", "integer"): lambda d: (int(d), False),
    ("bool", "long"): lambda d: (int(d), False),
    ("bool", "float"): lambda d: (float(int(d)), False),
    ("bool", "double"): lambda d: (float(int(d)), False),
    ("bool", "binary"): lambda d: (_b64(str(int(d)).encode("utf-8")), False),
    # --- int ---
    ("int", "text"): lambda d: (str(d), False),
    ("int", "keyword"): lambda d: (str(d), False),
    ("int", "boolean"): lambda d: (bool(d), False),
    ("int", "integer"): lambda d: (d, False),
    ("int", "long"): lambda d: (d, False),
    ("int", "float"): lambda d: (float(d), False),
    ("int", "double"): lambda d: (float(d), False),
    ("int", "date"): lambda d: (_epoch_to_date(d), False),
    ("int", "binary"): lambda d: (_b64(str(d).encode("utf-8")), False),
    # --- float ---
    ("float", "text"): lambda d: (str(d), False),
    ("float", "keyword"): lambda d: (str(d), False),
    ("float", "boolean"): lambda d: (bool(d), False),
    ("float", "integer"): lambda d: (int(d), False),
    ("float", "long"): lambda d: (int(d), False),
    ("float", "float"): lambda d: (d, False),
    ("float", "double"): lambda d: (d, False),
    # Fractional seconds are truncated before formatting.
    ("float", "date"): lambda d: (_epoch_to_date(int(d)), False),
    ("float", "binary"): lambda d: (_b64(str(d).encode("utf-8")), False),
    # --- bytes (decoded as UTF-8 unless the target is binary) ---
    ("bytes", "text"): lambda d: (d.decode("utf-8"), False),
    ("bytes", "keyword"): lambda d: (d.decode("utf-8"), False),
    ("bytes", "boolean"): lambda d: (_text_to_bool(d.decode("utf-8")), False),
    ("bytes", "integer"): lambda d: (int(d.decode("utf-8")), False),
    ("bytes", "long"): lambda d: (int(d.decode("utf-8")), False),
    ("bytes", "float"): lambda d: (float(d.decode("utf-8")), False),
    ("bytes", "double"): lambda d: (float(d.decode("utf-8")), False),
    ("bytes", "date"): lambda d: (
        string_to_datetime(d.decode("utf-8")).strftime(_DATE_FORMAT),
        False,
    ),
    ("bytes", "binary"): lambda d: (_b64(d), False),
    ("bytes", "object"): lambda d: (json.loads(d.decode("utf-8")), True),
    ("bytes", "nested"): lambda d: (json.loads(d.decode("utf-8")), True),
    # --- dict ---
    ("dict", "text"): lambda d: (json.dumps(d), False),
    ("dict", "keyword"): lambda d: (json.dumps(d), False),
    ("dict", "boolean"): lambda d: (bool(d), False),
    ("dict", "object"): lambda d: (d, True),
    ("dict", "nested"): lambda d: (d, True),
}
112
+
113
+
114
def has_opensearch_type(mapping):
    """Tell whether *mapping* declares a field type.

    A mapping counts as typed only when its ``"type"`` entry is a string; a
    ``"type"`` entry holding anything else (for example the sub-mapping of a
    field that happens to be named ``type``) does not.
    """
    if "type" not in mapping:
        return False
    return isinstance(mapping["type"], str)
116
+
117
+
118
def has_opensearch_properties(mapping):
    """Tell whether *mapping* is a wrapper around a ``"properties"`` dict.

    The ``"properties"`` entry must be a dict, and a ``"type"`` entry, if
    present, must be a string (otherwise ``"type"`` and ``"properties"`` are
    treated as ordinary field names rather than mapping keywords).
    """
    if "properties" not in mapping:
        return False
    if not isinstance(mapping["properties"], dict):
        return False
    if "type" not in mapping:
        return True
    # Same test as has_opensearch_type(): the declared type must be a string.
    return isinstance(mapping["type"], str)
124
+
125
+
126
def mapping_from_data(data, mapping):
    """Infer an OpenSearch-style mapping from a Python value.

    Args:
        data: The value to inspect. Supported types are str, bool, int,
            float, bytes, dict and list.
        mapping: Mapping to start from; a falsy value is treated as ``{}``.
            When *data* is a dict, a non-empty *mapping* is updated in place.

    Returns:
        * *mapping* unchanged when *data* is None;
        * ``{}`` when *data* is an empty dict or list;
        * ``{key: sub_mapping}`` for a dict (one entry per key, recursively);
        * the mapping of the first element for a list;
        * ``{"type": ...}`` for scalars: ``boolean``, ``long``, ``double``,
          and ``date`` for ISO-format strings or ``text`` otherwise (bytes
          are always ``text``).

    Raises:
        TypeError: If *data* is of an unsupported type.
    """
    python_to_os_types = {
        "str": "text",
        "bool": "boolean",
        "int": "long",
        "float": "double",
    }
    if not mapping:
        mapping = {}
    if data is None:
        return mapping
    if data == {} or data == []:
        return {}

    datatype = type(data).__name__
    if datatype not in ("str", "bool", "int", "float", "bytes", "dict", "list"):
        raise TypeError(f"Cannot infer an OpenSearch mapping for type {datatype!r}")

    if datatype == "dict":
        for key, value in data.items():
            mapping[key] = mapping_from_data(value, {})
        return mapping
    if datatype == "list":
        # Lists are assumed homogeneous: the first element decides the mapping.
        return mapping_from_data(data[0], {})

    os_type = python_to_os_types["str" if datatype == "bytes" else datatype]
    # Only real strings are probed for a date; bytes stay "text".
    if os_type == "text" and isinstance(data, str):
        try:
            dt.datetime.fromisoformat(data)
        except ValueError:
            pass  # not an ISO date, keep "text"
        else:
            os_type = "date"
    return {"type": os_type}
159
+
160
+
161
def convert_data_type(data, curr_mapping, key):
    """Coerce the value stored at *key* in *data* to its mapped OpenSearch type.

    The value is read through ``travel_dict``, converted with the entry of
    ``conversion_matrix`` selected by ``(python type name, mapping type)`` and
    written back in place. When *curr_mapping* is empty a mapping is inferred
    from the value itself; a mapping without a string ``"type"`` is treated
    as ``"object"``.

    A value that is ``None`` or ``[]`` is left untouched. If no converter
    exists for the type pair, or the converter raises, a warning is logged
    and ``None`` is written in place of the value.

    Args:
        data: Root data structure, modified in place.
        curr_mapping: Mapping of the field being converted (may be empty).
        key: Key path of the field inside *data*.
    """
    # NOTE(review): the 4th positional argument of travel_dict is passed as
    # True for both the read and the write here -- its meaning is defined in
    # .dict and is not visible from this module.
    data_elem = travel_dict(data, key, "r", True)
    if data_elem is None or data_elem == []:
        return
    data_type = type(data_elem).__name__
    if not curr_mapping:
        curr_mapping = mapping_from_data(data_elem, {})
    if has_opensearch_type(curr_mapping):
        mapping_type = curr_mapping["type"]
    else:
        mapping_type = "object"

    recurse = False
    try:
        converter = conversion_matrix[(data_type, mapping_type)]
        converted_elem, recurse = converter(data_elem)
    except Exception as e:
        # Best effort: an unconvertible value becomes None instead of failing
        # the whole document. Only ``Exception`` is caught so that
        # KeyboardInterrupt / SystemExit still propagate.
        logger.warning("%s: %s", type(e).__name__, e)
        converted_elem = None

    if converted_elem and recurse:
        # Containers (object / nested) still need their children converted.
        converted_elem = convert_clientside(converted_elem, curr_mapping)
    travel_dict(data, key, "w", True)(converted_elem)
183
+
184
+
185
def handle_incompatible_data_type(data, key):
    """Replace an unsupported value at *key* with its ``str()`` form.

    Values that are ``None`` or already one of the supported types (str,
    bool, int, float, bytes, dict, list) are left as they are.

    Returns:
        The same *data* object, possibly modified in place.
    """
    current = travel_dict(data, key, "r", True)
    type_name = type(current).__name__
    untouched = ("NoneType", "str", "bool", "int", "float", "bytes", "dict", "list")
    if type_name not in untouched:
        travel_dict(data, key, "w")(str(current))
    return data
193
+
194
+
195
def handle_data_dict(data, curr_mapping, key):
    """Convert the dict found at *key* in *data* according to *curr_mapping*.

    When the mapping declares a scalar type (anything other than ``object``
    or ``nested``) the dict is converted as a single value. Otherwise every
    entry is converted against the sub-mapping of the same name, and the
    internal ``"#type"`` marker is stripped from entries that are dicts.
    """
    node = travel_dict(data, key, "r")

    is_scalar_field = (
        bool(curr_mapping)
        and has_opensearch_type(curr_mapping)
        and curr_mapping.get("type") not in ("object", "nested")
    )
    if is_scalar_field:
        convert_data_type(data, curr_mapping, key)
        return

    # Snapshot the keys first: conversion may rewrite entries while we loop.
    for name in list(node.keys()):
        convert_clientside(data, curr_mapping.get(name), key + [name])
        child = node.get(name)
        if isinstance(child, dict):
            child.pop("#type", None)
209
+
210
+
211
def get_vector_name(metadata: Dict[str, Any]) -> str:
    """Build the field name of a vector from its *metadata*.

    The name starts with ``"vector"`` and gains one ``_<value>`` suffix for
    each property of ``vector_nomenclature_order`` present in *metadata*, in
    that order.
    """
    suffixes = [
        f"_{metadata[prop]}"
        for prop in vector_nomenclature_order
        if prop in metadata
    ]
    return "vector" + "".join(suffixes)
217
+
218
+
219
def validate_vector_data(vector_data, curr_mapping):
    """Check that *vector_data* is a well-formed vector payload.

    A valid payload has a non-empty list under ``"data"`` and a dict under
    ``"metadata"`` whose ``"dim"`` is a positive integer equal to the length
    of ``"data"``.

    The checks are explicit ``if`` tests rather than ``assert`` statements so
    that they still run when Python is started with ``-O``.

    Args:
        vector_data: The candidate payload.
        curr_mapping: Unused; kept for signature compatibility.

    Returns:
        True when the payload is valid; otherwise False, after logging a
        warning that names the first failed check.
    """

    def _reject(reason):
        logger.warning("Validation failed: %s", reason)
        return False

    if "data" not in vector_data:
        return _reject("'data' field missing in vector_data")
    values = vector_data["data"]
    if not (isinstance(values, list) and values):
        return _reject("'data' must be a non-empty list")

    if "metadata" not in vector_data:
        return _reject("'metadata' field missing in vector_data")
    metadata = vector_data["metadata"]
    if not isinstance(metadata, dict):
        return _reject("'metadata' must be a dictionary")

    if "dim" not in metadata:
        return _reject("'dim' field missing in 'metadata'")
    dim = metadata["dim"]
    if not (isinstance(dim, int) and dim > 0):
        return _reject("'dim' must be a positive integer")
    if dim != len(values):
        return _reject("'dim' must match the length of 'data'")
    return True
241
+
242
+
243
def handle_data_vector(data, curr_mapping, key, vector_data=None):
    """Rewrite a vector payload at *key* into its named-vector form.

    A valid payload whose computed vector name is a key of *curr_mapping* is
    replaced in place by ``{<vector name>: {"value": <data>}, "#type":
    "VECTOR"}``. An invalid payload, or one with no matching field in the
    mapping, is handed back to ``convert_clientside`` for generic conversion.

    Args:
        data: Root data structure, modified in place.
        curr_mapping: Mapping in which the vector field is looked up.
        key: Key path of the payload inside *data*.
        vector_data: The payload itself; read from *data* at *key* when None.
    """
    payload = vector_data
    if payload is None:
        payload = travel_dict(data, key, "r")

    if not validate_vector_data(payload, curr_mapping):
        use_generic_conversion = True
    else:
        vector_name = get_vector_name(payload["metadata"])
        use_generic_conversion = vector_name not in curr_mapping
        if use_generic_conversion:
            logger.warning(
                f"Found valid vector in input data but no field '{vector_name}' allocated in OpenSearch for it"
            )

    if use_generic_conversion:
        convert_clientside(data, curr_mapping, key)
        return

    write = travel_dict(data, key, "w")
    write(
        {
            vector_name: {"value": payload["data"]},
            "#type": "VECTOR",
        }
    )
268
+
269
+
270
def handle_data_list(data, curr_mapping, key):
    """Convert every element of the list found at *key* in *data*.

    Each element is converted against *curr_mapping*. If the first element is
    a converted vector (a dict tagged ``"#type": "VECTOR"``), the whole list
    is then replaced by a dict that groups the elements' values per vector
    name: ``{vector_name: [{"value": ...}, ...]}``. Finally the structure
    inferred from the elements is compared with the mapping and differing
    fields are re-converted.

    Args:
        data: Root data structure, modified in place.
        curr_mapping: Mapping that applies to the list's elements.
        key: Key path of the list inside *data*.
    """

    def _align_elems_to_mapping(data, super_path, i, path, elem_structure, curr_mapping):
        # ``path`` is the key path, relative to element ``i``, of the level
        # being compared. It has to be extended on recursion; it used to be
        # reset to [] on entry, which dropped the parent keys of nested fields.
        for field in elem_structure.keys() | curr_mapping.keys():
            if field not in curr_mapping:
                continue
            sub_path = path + [field]
            if field not in elem_structure:
                convert_clientside(data, elem_structure, super_path + [i] + sub_path)
            elif isinstance(elem_structure[field], dict) and isinstance(
                curr_mapping[field], dict
            ):
                _align_elems_to_mapping(
                    data,
                    super_path,
                    i,
                    sub_path,
                    elem_structure[field],
                    curr_mapping[field],
                )
            elif elem_structure[field] != curr_mapping[field]:
                convert_clientside(data, elem_structure, super_path + [i] + sub_path)

    curr_data = travel_dict(data, key, "r")
    if len(curr_data) == 0:
        return

    # Merge the structure inferred from every converted element; entries from
    # earlier elements win, and entries from the mapping win over all of them.
    elem_structure = {}
    for i, elem in enumerate(curr_data):
        data = convert_clientside(data, curr_mapping, key + [i])
        elem = travel_dict(data, key + [i], "r")
        elem_structure = {**mapping_from_data(elem, {}), **elem_structure}
    elem_structure = {**elem_structure, **curr_mapping}
    # NOTE(review): because the mapping's entries overwrite the inferred ones
    # above, every mapped field compares equal below, so the alignment pass
    # appears to have nothing to convert -- confirm the intended precedence.

    if isinstance(curr_data[0], dict) and curr_data[0].get("#type") == "VECTOR":
        vectors_data = {}
        for elem in curr_data:
            elem.pop("#type", None)
            for vector_name in elem:
                vectors_data.setdefault(vector_name, []).append(
                    {"value": elem[vector_name]["value"]}
                )
        travel_dict(data, key, "w")(vectors_data)
        elem_structure.pop("#type", None)

    if elem_structure != curr_mapping:
        # ``i`` is the index of the last element (the list is non-empty here).
        _align_elems_to_mapping(data, key, i, [], elem_structure, curr_mapping)
322
+
323
+
324
def convert_clientside(data, curr_mapping, parent_keylist=None):
    """Recursively convert *data* so values match the OpenSearch *curr_mapping*.

    *curr_mapping* may be a full mapping dict (with a ``"properties"`` key) or
    the inner properties dict directly. *parent_keylist* is used internally
    for recursive traversal and should normally be left unset.

    Args:
        data: Root data structure; converted in place.
        curr_mapping: Mapping for the value at *parent_keylist*; a falsy
            value is treated as an empty mapping.
        parent_keylist: Key path of the value to convert inside *data*.
            ``None`` or an empty list means *data* itself.

    Returns:
        The same *data* object.
    """
    # A fresh list per call avoids sharing one mutable default between calls.
    if parent_keylist is None:
        parent_keylist = []
    if not curr_mapping:
        curr_mapping = {}
    if has_opensearch_properties(curr_mapping):
        curr_mapping = curr_mapping["properties"]

    if parent_keylist:
        curr_data = travel_dict(data, parent_keylist, "r")
    else:
        curr_data = data

    datatype = type(curr_data).__name__
    if datatype not in ["str", "bool", "int", "float", "bytes", "dict", "list"]:
        # Unsupported objects are stringified first, then converted as usual.
        handle_incompatible_data_type(data, parent_keylist)
        convert_data_type(data, curr_mapping, parent_keylist)
    elif datatype == "dict":
        # The "#type" marker is internal and is removed before conversion.
        is_vector = curr_data.get("#type") == "VECTOR"
        curr_data.pop("#type", None)
        if is_vector:
            handle_data_vector(data, curr_mapping, parent_keylist)
        else:
            handle_data_dict(data, curr_mapping, parent_keylist)
    elif datatype == "list":
        handle_data_list(data, curr_mapping, parent_keylist)
    else:
        convert_data_type(data, curr_mapping, parent_keylist)
    return data