streamlit-octostar-utils 0.4.2.dev25__tar.gz → 0.5.0.dev2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/PKG-INFO +1 -1
- {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/pyproject.toml +1 -1
- {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/api_crafter/celery.py +180 -24
- {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/api_crafter/fastapi.py +1 -97
- {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/api_crafter/nifi.py +215 -20
- {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/ontology/inheritance.py +5 -5
- {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/LICENSE +0 -0
- {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/README.md +0 -0
- {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/__init__.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/api_crafter/__init__.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/api_crafter/contents.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/api_crafter/parallelism.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/api_crafter/parser/__init__.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/api_crafter/parser/combine_fields.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/api_crafter/parser/entities_parser.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/api_crafter/parser/generics.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/api_crafter/parser/info.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/api_crafter/parser/linkchart_functions.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/api_crafter/parser/matches.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/api_crafter/parser/parameters.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/api_crafter/parser/rules.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/api_crafter/parser/signals.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/core/__init__.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/core/dict.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/core/filetypes.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/core/threading/__init__.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/core/threading/key_queue.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/core/timestamp.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/nlp/__init__.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/nlp/custom_recognizers.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/nlp/language.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/nlp/ner.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/octostar/__init__.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/octostar/client.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/octostar/context.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/octostar/permissions.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/ontology/__init__.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/ontology/relationships.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/ontology/validation.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/style/__init__.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/style/common.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/threading/__init__.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/threading/async_task_manager.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/threading/session_callback_manager.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/threading/session_state_hot_swapper.py +0 -0
|
@@ -9,7 +9,6 @@ import subprocess
|
|
|
9
9
|
from fastapi import Query
|
|
10
10
|
import time
|
|
11
11
|
import os
|
|
12
|
-
import pickle
|
|
13
12
|
import atexit
|
|
14
13
|
import redis
|
|
15
14
|
import uuid
|
|
@@ -68,7 +67,31 @@ class CeleryQueueConfig:
|
|
|
68
67
|
self.options = options
|
|
69
68
|
|
|
70
69
|
|
|
70
|
+
class TaskResult:
    """Wrapper for task results that include binary parts alongside JSON data.

    Tasks returning binary data (e.g. images) should return a TaskResult
    so that serialized_io writes them as multipart parts instead of attempting
    JSON serialization on bytes.

    Attributes:
        data: The JSON-serializable portion of the result.
        part: The binary payload: a single bytes object, a list of bytes
            objects, or None when the result has no binary part.
    """

    def __init__(self, data, part=None):
        self.data = data
        self.part = part

    def __repr__(self):
        # Summarize the payload by size: dumping raw bytes into logs or
        # tracebacks is useless and potentially huge.
        part = self.part
        if part is None:
            summary = "None"
        elif isinstance(part, list):
            summary = f"<{len(part)} part(s)>"
        else:
            try:
                summary = f"<{len(part)} byte(s)>"
            except TypeError:
                summary = f"<{type(part).__name__}>"
        return f"{type(self).__name__}(data={self.data!r}, part={summary})"
|
|
79
|
+
|
|
80
|
+
|
|
71
81
|
class CelerySerialized:
|
|
82
|
+
"""Serializes task data to a boundary-delimited multipart file.
|
|
83
|
+
|
|
84
|
+
Format: metadata JSON part followed by optional binary/streamed parts,
|
|
85
|
+
separated by boundary markers (the task_id). Replaces pickle entirely.
|
|
86
|
+
"""
|
|
87
|
+
|
|
88
|
+
BOUNDARY_PREFIX = b"--"
|
|
89
|
+
BOUNDARY_SUFFIX = b"\r\n"
|
|
90
|
+
BOUNDARY_END = b"--\r\n"
|
|
91
|
+
CONTENT_TYPE_JSON = b"Content-Type: application/json\r\n"
|
|
92
|
+
CONTENT_TYPE_BYTES = b"Content-Type: application/octet-stream\r\n"
|
|
93
|
+
HEADER_END = b"\r\n"
|
|
94
|
+
|
|
72
95
|
def __init__(self, folder, redis_client, data=None):
|
|
73
96
|
self.folder = folder
|
|
74
97
|
self.data = data
|
|
@@ -77,18 +100,70 @@ class CelerySerialized:
|
|
|
77
100
|
def set_task_id(self, task_id):
    """Store the task id; it names the file inside ``self.folder`` and is also used as the multipart boundary."""
    self.task_id = task_id
|
|
79
102
|
|
|
80
|
-
def
|
|
103
|
+
def _boundary(self):
    """Return the multipart boundary token: this task's id encoded to bytes."""
    return self.task_id.encode()
|
|
105
|
+
|
|
106
|
+
def _write_boundary(self, f):
    """Write a part-opening boundary line (prefix + boundary token + CRLF) to the binary file ``f``."""
    f.write(self.BOUNDARY_PREFIX + self._boundary() + self.BOUNDARY_SUFFIX)
|
|
108
|
+
|
|
109
|
+
def _write_end_boundary(self, f):
    """Write the closing boundary line (prefix + boundary token + ``--`` + CRLF) to the binary file ``f``."""
    f.write(self.BOUNDARY_PREFIX + self._boundary() + self.BOUNDARY_END)
|
|
111
|
+
|
|
112
|
+
def dump(self, parts=None, part_is_list=False):
    """Write metadata + optional parts in multipart format.

    The file is written to ``os.path.join(self.folder, self.task_id)`` while
    holding the RedisFileLock for that path.

    Args:
        parts: optional list of bytes objects to write as additional parts.
        part_is_list: whether the original part was a list (preserves type on read).

    Raises:
        AssertionError: if no task id has been set.
    """
    assert self.task_id
    # Work on a copy of dict payloads: the bookkeeping keys added below must
    # not leak into the object the caller (e.g. the task's return value) owns.
    # NOTE(review): a dict payload that already carries its own "part_count" /
    # "part_is_list" keys still has them overwritten in the written file.
    if isinstance(self.data, dict):
        metadata = dict(self.data)
    else:
        metadata = {"data": self.data}
    metadata["part_count"] = len(parts) if parts else 0
    metadata["part_is_list"] = part_is_list
    path = os.path.join(self.folder, self.task_id)
    with RedisFileLock(self.redis_client, path):
        with open(path, "wb") as f:
            # First section: the JSON metadata.
            self._write_boundary(f)
            f.write(self.CONTENT_TYPE_JSON)
            f.write(self.HEADER_END)
            f.write(json.dumps(metadata).encode())
            f.write(b"\r\n")
            # Then one octet-stream section per binary part.
            for part in parts or ():
                self._write_boundary(f)
                f.write(self.CONTENT_TYPE_BYTES)
                f.write(self.HEADER_END)
                f.write(part)
                f.write(b"\r\n")
            self._write_end_boundary(f)
|
|
85
139
|
|
|
86
140
|
def load(self):
    """Read multipart file. Returns (metadata_dict, list_of_bytes_parts).

    Each section is expected in the layout produced by ``dump``:
    boundary line, one Content-Type header line, a blank line, the body and a
    single trailing CRLF. Exactly one framing CRLF is removed on each side of
    a section so that binary bodies which themselves start or end with CR/LF
    bytes (or are empty) are returned unchanged.

    Returns:
        A tuple ``(metadata, parts)``; ``metadata`` is ``{}`` when the file
        holds no JSON section, ``parts`` lists the non-JSON bodies in file order.

    Raises:
        AssertionError: if no task id has been set.
    """
    assert self.task_id
    boundary = self.BOUNDARY_PREFIX + self._boundary()
    path = os.path.join(self.folder, self.task_id)
    with RedisFileLock(self.redis_client, path):
        with open(path, "rb") as f:
            raw = f.read()
    # NOTE(review): splitting on the boundary assumes no binary part contains
    # the bytes b"--<task_id>"; the format carries no length fields.
    metadata = None
    parts = []
    for section in raw.split(boundary):
        # Empty text precedes the first boundary; the text following the
        # closing boundary starts with b"--". Real sections start with CRLF.
        if not section or section.startswith(b"--"):
            continue
        # Drop only the CRLF that terminates the boundary line itself.
        if section.startswith(self.BOUNDARY_SUFFIX):
            section = section[len(self.BOUNDARY_SUFFIX):]
        header_end = section.find(b"\r\n\r\n")
        if header_end == -1:
            continue
        header = section[:header_end]
        body = section[header_end + 4:]
        # Drop only the single CRLF the writer appends after each body.
        if body.endswith(b"\r\n"):
            body = body[:-2]
        if b"application/json" in header:
            metadata = json.loads(body)
        else:
            parts.append(body)
    return metadata or {}, parts
|
|
92
167
|
|
|
93
168
|
|
|
94
169
|
class CeleryExecutor(object):
|
|
@@ -523,9 +598,16 @@ class CeleryExecutor(object):
|
|
|
523
598
|
task_id = task.request.id
|
|
524
599
|
serialized_data = CelerySerialized(folder=self.in_folder, redis_client=self.redis_client)
|
|
525
600
|
serialized_data.set_task_id(task_id)
|
|
526
|
-
|
|
601
|
+
metadata, parts = serialized_data.load()
|
|
527
602
|
del serialized_data
|
|
528
|
-
args, kwargs =
|
|
603
|
+
args, kwargs = metadata.get("args", []), metadata.get("kwargs", {})
|
|
604
|
+
|
|
605
|
+
part_count = metadata.get("part_count", 0)
|
|
606
|
+
if part_count > 0:
|
|
607
|
+
if metadata.get("part_is_list", part_count > 1):
|
|
608
|
+
args = [parts] + args
|
|
609
|
+
else:
|
|
610
|
+
args = [parts[0]] + args
|
|
529
611
|
|
|
530
612
|
if self.app.conf.task_always_eager:
|
|
531
613
|
queue = task.request.delivery_info.get("routing_key", self.app.conf.task_default_routing_key)
|
|
@@ -536,9 +618,25 @@ class CeleryExecutor(object):
|
|
|
536
618
|
queue = task.request.delivery_info.get("routing_key", self.app.conf.task_default_routing_key)
|
|
537
619
|
task.request.resources = (self.resource_registry or {}).get(queue, {})
|
|
538
620
|
out_data = task_fn(task, *args, **kwargs)
|
|
539
|
-
|
|
621
|
+
if isinstance(out_data, TaskResult):
|
|
622
|
+
out_part_is_list = isinstance(out_data.part, list)
|
|
623
|
+
if out_data.part is None:
|
|
624
|
+
out_parts = None
|
|
625
|
+
elif out_part_is_list:
|
|
626
|
+
out_parts = out_data.part
|
|
627
|
+
else:
|
|
628
|
+
out_parts = [out_data.part]
|
|
629
|
+
serialized_data = CelerySerialized(
|
|
630
|
+
folder=self.out_folder, data=out_data.data, redis_client=self.redis_client
|
|
631
|
+
)
|
|
632
|
+
else:
|
|
633
|
+
out_parts = None
|
|
634
|
+
out_part_is_list = False
|
|
635
|
+
serialized_data = CelerySerialized(
|
|
636
|
+
folder=self.out_folder, data=out_data, redis_client=self.redis_client
|
|
637
|
+
)
|
|
540
638
|
serialized_data.set_task_id(task_id)
|
|
541
|
-
serialized_data.dump()
|
|
639
|
+
serialized_data.dump(parts=out_parts, part_is_list=out_part_is_list)
|
|
542
640
|
del serialized_data
|
|
543
641
|
if os.path.isfile(os.path.join(self.in_folder, task_id)):
|
|
544
642
|
with RedisFileLock(self.redis_client, os.path.join(self.in_folder, task_id)):
|
|
@@ -579,7 +677,7 @@ class CeleryExecutor(object):
|
|
|
579
677
|
|
|
580
678
|
return decorator
|
|
581
679
|
|
|
582
|
-
async def send_task(self, task_fn, args=None, kwargs=None, **options) -> str:
|
|
680
|
+
async def send_task(self, task_fn, args=None, kwargs=None, part=None, **options) -> str:
|
|
583
681
|
args = args if args is not None else []
|
|
584
682
|
kwargs = kwargs if kwargs is not None else {}
|
|
585
683
|
if self.app.conf.task_always_eager and "dev_preload" not in self.app.conf:
|
|
@@ -635,14 +733,21 @@ class CeleryExecutor(object):
|
|
|
635
733
|
await asyncio.get_running_loop().run_in_executor(
|
|
636
734
|
self.set_thread_pool, _check_queue_llen, queue_name
|
|
637
735
|
)
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
736
|
+
|
|
737
|
+
if part is not None:
|
|
738
|
+
await self._write_task_data_with_part(
|
|
739
|
+
task_id, args, kwargs, part
|
|
740
|
+
)
|
|
741
|
+
else:
|
|
742
|
+
await asyncio.get_running_loop().run_in_executor(
|
|
743
|
+
self.io_thread_pool,
|
|
744
|
+
_write_task_data,
|
|
745
|
+
self.in_folder,
|
|
746
|
+
args,
|
|
747
|
+
kwargs,
|
|
748
|
+
task_id,
|
|
749
|
+
)
|
|
750
|
+
|
|
646
751
|
await asyncio.get_running_loop().run_in_executor(
|
|
647
752
|
self.set_thread_pool, _send_task, task_fn, task_id, options
|
|
648
753
|
)
|
|
@@ -664,6 +769,50 @@ class CeleryExecutor(object):
|
|
|
664
769
|
sem.release()
|
|
665
770
|
return task_id
|
|
666
771
|
|
|
772
|
+
async def _write_task_data_with_part(self, task_id, args, kwargs, part):
    """Write task data with a streamed part to the multipart file.

    The part becomes the first arg on the worker side.

    Args:
        task_id: Task id; names the file in ``self.in_folder`` and is the boundary.
        args: Positional task arguments (must be JSON-serializable).
        kwargs: Keyword task arguments (must be JSON-serializable).
        part: A single item or a list of items. Each item may be an async
            iterable of chunks, an object with a sync or async ``read(size)``
            method, or a bytes-like object (bytes, bytearray, memoryview).

    Raises:
        TypeError: if an item is of none of the supported kinds.
    """
    import inspect  # local import: only needed to detect awaitable read() results

    def _as_bytes(chunk):
        # Text chunks are encoded; bytes-like chunks are written unchanged.
        return chunk.encode() if isinstance(chunk, str) else chunk

    boundary = CelerySerialized.BOUNDARY_PREFIX + task_id.encode()
    boundary_line = boundary + CelerySerialized.BOUNDARY_SUFFIX
    end_boundary_line = boundary + CelerySerialized.BOUNDARY_END

    is_list = isinstance(part, list)
    items = part if is_list else [part]
    part_count = len(items)

    metadata = {"args": args, "kwargs": kwargs, "part_count": part_count, "part_is_list": is_list}
    metadata_bytes = json.dumps(metadata).encode()

    file_path = os.path.join(self.in_folder, task_id)
    # NOTE(review): file writes here are blocking calls made on the event loop.
    try:
        with open(file_path, "wb") as f:
            f.write(boundary_line)
            f.write(CelerySerialized.CONTENT_TYPE_JSON)
            f.write(CelerySerialized.HEADER_END)
            f.write(metadata_bytes)
            f.write(b"\r\n")

            for item in items:
                f.write(boundary_line)
                f.write(CelerySerialized.CONTENT_TYPE_BYTES)
                f.write(CelerySerialized.HEADER_END)
                if hasattr(item, "__aiter__"):
                    async for chunk in item:
                        f.write(_as_bytes(chunk))
                elif hasattr(item, "read"):
                    while True:
                        chunk = item.read(65536)
                        # Async readers return an awaitable, plain file
                        # objects return the data directly.
                        if inspect.isawaitable(chunk):
                            chunk = await chunk
                        if not chunk:
                            break
                        f.write(_as_bytes(chunk))
                elif isinstance(item, (bytes, bytearray, memoryview)):
                    f.write(item)
                else:
                    raise TypeError(f"Unsupported part item type: {type(item)}")
                f.write(b"\r\n")

            f.write(end_boundary_line)
    except BaseException:
        # Do not leave a truncated multipart file behind when the stream
        # fails or the coroutine is cancelled; removal is best-effort.
        try:
            os.remove(file_path)
        except OSError:
            pass
        raise
|
|
815
|
+
|
|
667
816
|
async def terminate_task(self, task_id):
|
|
668
817
|
def _terminate_task(celery_app, task_id):
|
|
669
818
|
celery_app.control.revoke(task_id, terminate=True)
|
|
@@ -717,8 +866,15 @@ class CeleryExecutor(object):
|
|
|
717
866
|
def _read_task_data(out_folder, task_id):
    """Load a task's output file and rebuild the value to hand back.

    Returns a TaskResult when the file holds binary parts (``part`` is a list
    or a single bytes object, as flagged by ``part_is_list``), otherwise the
    plain data.
    """
    serialized_data = CelerySerialized(folder=out_folder, redis_client=self.redis_client)
    serialized_data.set_task_id(task_id)
    metadata, parts = serialized_data.load()
    part_count = metadata.get("part_count", 0)
    if "data" in metadata:
        # NOTE(review): a dict result that itself has a "data" key is
        # indistinguishable from a wrapped non-dict result here.
        data = metadata["data"]
    else:
        # Dict results are stored flattened next to the serializer's
        # bookkeeping keys; strip those so they are not returned as result data.
        data = {k: v for k, v in metadata.items() if k not in ("part_count", "part_is_list")}
    if part_count > 0:
        # Files without the flag: more than one part implies a list.
        if metadata.get("part_is_list", part_count > 1):
            return TaskResult(data=data, part=parts)
        return TaskResult(data=data, part=parts[0])
    return data
|
|
722
878
|
|
|
723
879
|
def _remove_task_data(celery_app, in_folder, out_folder, task_id):
|
|
724
880
|
celery_app.AsyncResult(task_id).forget()
|
|
@@ -744,10 +900,10 @@ class CeleryExecutor(object):
|
|
|
744
900
|
)
|
|
745
901
|
return result
|
|
746
902
|
|
|
747
|
-
async def send_and_wait_task(self, task_fn, args=None, kwargs=None, timeout=60, **options):
|
|
903
|
+
async def send_and_wait_task(self, task_fn, args=None, kwargs=None, part=None, timeout=60, **options):
|
|
748
904
|
args = args if args is not None else []
|
|
749
905
|
kwargs = kwargs if kwargs is not None else {}
|
|
750
|
-
task_id = await self.send_task(task_fn, args, kwargs, **options)
|
|
906
|
+
task_id = await self.send_task(task_fn, args, kwargs, part=part, **options)
|
|
751
907
|
ready = False
|
|
752
908
|
state = None
|
|
753
909
|
start_time = time.time()
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import asyncio
|
|
2
|
-
from fastapi import Request
|
|
2
|
+
from fastapi import Request
|
|
3
3
|
from fastapi.responses import JSONResponse, StreamingResponse
|
|
4
4
|
from pydantic import BaseModel
|
|
5
5
|
from typing import List, Optional, Literal, Any
|
|
@@ -17,8 +17,6 @@ import traceback
|
|
|
17
17
|
from copy import copy
|
|
18
18
|
import logging
|
|
19
19
|
|
|
20
|
-
from octostar.client import make_client
|
|
21
|
-
|
|
22
20
|
MAX_ERROR_MESSAGE_BYTES = 256
|
|
23
21
|
MAX_ERROR_TRACEBACK_BYTES = 10240
|
|
24
22
|
|
|
@@ -99,100 +97,6 @@ class Route(ABC):
|
|
|
99
97
|
return self
|
|
100
98
|
|
|
101
99
|
|
|
102
|
-
class OctostarRoute(Route):
|
|
103
|
-
def __init__(self, app, tasks_routes, celery_executor=None, router=None):
|
|
104
|
-
self.app = app
|
|
105
|
-
self._router = router
|
|
106
|
-
self.routed_funcs = []
|
|
107
|
-
self.tasks_routes = tasks_routes
|
|
108
|
-
self.celery_executor = celery_executor
|
|
109
|
-
self.endpoints = {}
|
|
110
|
-
self.define_routes()
|
|
111
|
-
|
|
112
|
-
def register_route(self, op, octostar_task):
|
|
113
|
-
self.endpoints[op.strip("/")] = octostar_task
|
|
114
|
-
|
|
115
|
-
def define_routes(self):
|
|
116
|
-
if self.celery_executor:
|
|
117
|
-
|
|
118
|
-
@Route.route(self, path="/task-state/{task_id}")
|
|
119
|
-
async def get_task_status(task_id: str) -> JSONResponse:
|
|
120
|
-
task_status = await self.tasks_routes.get_task(task_id, pop=False)
|
|
121
|
-
task_status = task_status.model_dump(mode="json")["data"]["task_state"]
|
|
122
|
-
return JSONResponse(task_status)
|
|
123
|
-
|
|
124
|
-
@Route.route(self, path="/task-result/{task_id}")
|
|
125
|
-
async def get_task_result(task_id: str) -> JSONResponse:
|
|
126
|
-
return_data = await self.tasks_routes.get_task(task_id, pop=True)
|
|
127
|
-
return_data = return_data.model_dump(mode="json")["data"]["data"]
|
|
128
|
-
return JSONResponse(return_data)
|
|
129
|
-
|
|
130
|
-
@Route.route(self, path="/{op}", methods=["POST"])
|
|
131
|
-
async def send_task(
|
|
132
|
-
op: str,
|
|
133
|
-
os_context: dict = Body(...),
|
|
134
|
-
jwt: str = Body(...),
|
|
135
|
-
params: dict = Body(dict()),
|
|
136
|
-
) -> str:
|
|
137
|
-
"""
|
|
138
|
-
Any request coming from Octostar (e.g. manifest) should enter from here.
|
|
139
|
-
"""
|
|
140
|
-
path_params = []
|
|
141
|
-
op = op.split("/")
|
|
142
|
-
if len(op) > 1:
|
|
143
|
-
path_params = op[1:]
|
|
144
|
-
op = op[0]
|
|
145
|
-
query_params = params
|
|
146
|
-
client = make_client(jwt)
|
|
147
|
-
if op not in self.endpoints.keys():
|
|
148
|
-
raise StarletteHTTPException(401, f"Route {op} is forbidden for NiFi.")
|
|
149
|
-
task_id = await self.celery_executor.send_task(
|
|
150
|
-
self.endpoints[op], args=[os_context, client, query_params]
|
|
151
|
-
)
|
|
152
|
-
return task_id
|
|
153
|
-
|
|
154
|
-
else:
|
|
155
|
-
|
|
156
|
-
@Route.route(self, path="/{op}", methods=["POST"])
|
|
157
|
-
async def call_task(
|
|
158
|
-
op: str,
|
|
159
|
-
os_context: dict = Body(...),
|
|
160
|
-
jwt: str = Body(...),
|
|
161
|
-
params: dict = Body(dict()),
|
|
162
|
-
) -> str:
|
|
163
|
-
"""
|
|
164
|
-
Any request coming from Octostar (e.g. manifest) should enter from here.
|
|
165
|
-
"""
|
|
166
|
-
path_params = []
|
|
167
|
-
op = op.split("/")
|
|
168
|
-
if len(op) > 1:
|
|
169
|
-
path_params = op[1:]
|
|
170
|
-
op = op[0]
|
|
171
|
-
query_params = params
|
|
172
|
-
client = make_client(jwt)
|
|
173
|
-
if op not in self.endpoints.keys():
|
|
174
|
-
raise StarletteHTTPException(401, f"Route {op} is forbidden for NiFi.")
|
|
175
|
-
result = await self.endpoints[op](os_context, client, query_params)
|
|
176
|
-
return result
|
|
177
|
-
|
|
178
|
-
@staticmethod
|
|
179
|
-
def octostar_task(celery_executor, *args, **opts):
|
|
180
|
-
def decorator(func):
|
|
181
|
-
if celery_executor:
|
|
182
|
-
serialized_func = celery_executor.serialized_io(func)
|
|
183
|
-
task_func = celery_executor.app.task(*args, **opts)(serialized_func)
|
|
184
|
-
else:
|
|
185
|
-
|
|
186
|
-
@wraps(func)
|
|
187
|
-
def octostar_func(*args, **kwargs):
|
|
188
|
-
return func(None, *args, **kwargs)
|
|
189
|
-
|
|
190
|
-
task_func = octostar_func
|
|
191
|
-
return task_func
|
|
192
|
-
|
|
193
|
-
return decorator
|
|
194
|
-
|
|
195
|
-
|
|
196
100
|
class CommonModels(object):
|
|
197
101
|
class OKResponseModel(BaseModel):
|
|
198
102
|
message: str = "OK"
|
|
@@ -238,6 +238,156 @@ class NifiFragmenter(object):
|
|
|
238
238
|
pointer.get("merge_params") or {}, defragmenter_config, lambda _, v2: v2
|
|
239
239
|
)
|
|
240
240
|
|
|
241
|
+
_REQUIRED_FRAGMENT_FIELDS = ("index", "count", "identifier")
|
|
242
|
+
|
|
243
|
+
@staticmethod
|
|
244
|
+
def get_fragment_info(entity, fragmenter_keylist):
    """Read fragment metadata (identifier, count, index, root_uid, merge_params)
    for a given fragmenter level. Read-only -- does not mutate the entity.

    Args:
        entity: A NifiEntity or NifiEntityProxy.
        fragmenter_keylist: Dot-separated key path into the fragment config
            (e.g. "document_pages" or "audio_split").

    Returns:
        dict with keys like identifier, count, index, root_uid, merge_params.
        Empty dict if fragmenter_keylist is empty or intermediate keys are
        missing (entity not fragmented at this level).

    Raises:
        KeyError: If the final fragment key is missing from the config.
        RuntimeError: If the fragment info exists but lacks required fields
            (index, count, identifier).
    """
    if not fragmenter_keylist:
        return {}
    pointer = entity.request["config"]["fragment"]
    # Split the path once: every key but the last is an optional intermediate
    # level, the last one must exist.
    *parent_keys, leaf_key = fragmenter_keylist.split(".")
    for k in parent_keys:
        if not pointer.get(k):
            return {}
        pointer = pointer[k]
    info = pointer[leaf_key]
    missing = [f for f in NifiFragmenter._REQUIRED_FRAGMENT_FIELDS if f not in info]
    if missing:
        raise RuntimeError(
            f"Fragment info for '{fragmenter_keylist}' is missing required "
            f"field(s): {', '.join(missing)}"
        )
    return info
|
|
278
|
+
|
|
279
|
+
@staticmethod
|
|
280
|
+
def identify_fragment_groups(nifi_batches):
    """Group every fragmented entity of the given batches by fragmenter level.

    An entity counts as fragmented when its request config has a non-empty
    ``fragment.fragments_stack``; it is filed under the first entry of that stack.

    Args:
        nifi_batches: List of NifiEntityBatch objects.

    Returns:
        dict mapping fragmenter_keylist to the list of entities at that level
        (in batch order). Empty dict if no fragments are found.
    """
    by_level = {}
    for entity in itertools.chain.from_iterable(batch.entities for batch in nifi_batches):
        fragment_cfg = entity.request["config"].get("fragment", {})
        levels = fragment_cfg.get("fragments_stack", [])
        if not levels:
            continue
        by_level.setdefault(levels[0], []).append(entity)
    return by_level
|
|
298
|
+
|
|
299
|
+
@staticmethod
|
|
300
|
+
def build_fragment_tree_from_children_entities(root_entity, fragmenter_keylist):
    """Build the fragment tree rooted at ``root_entity`` by descending through
    ``children_entities``.

    Only children that report non-empty fragment info for ``fragmenter_keylist``
    become child nodes; children whose lookup raises AttributeError or KeyError
    are skipped.

    Args:
        root_entity: The root entity (index 0) to start from.
        fragmenter_keylist: The fragmenter level to build for.

    Returns:
        Nested dict with keys:
            "entity": NifiEntity/NifiEntityProxy
            "index": int
            "merge_params": dict or None
            "children": list of child trees
    """

    def _has_fragment_info(candidate):
        try:
            return bool(NifiFragmenter.get_fragment_info(candidate, fragmenter_keylist))
        except (AttributeError, KeyError):
            return False

    root_info = NifiFragmenter.get_fragment_info(root_entity, fragmenter_keylist)
    # Select the fragment children first, then recurse into each of them.
    fragment_children = [
        candidate for candidate in root_entity.children_entities if _has_fragment_info(candidate)
    ]
    node = {
        "entity": root_entity,
        "index": root_info.get("index"),
        "merge_params": root_info.get("merge_params"),
    }
    node["children"] = [
        NifiFragmenter.build_fragment_tree_from_children_entities(fragment, fragmenter_keylist)
        for fragment in fragment_children
    ]
    return node
|
|
333
|
+
|
|
334
|
+
@staticmethod
|
|
335
|
+
def extract_tree_entities(tree):
    """Collect the entities of a fragment tree in pre-order (parent before children).

    Args:
        tree: Fragment tree node (from build_fragment_tree_from_children_entities).

    Returns:
        List of entities in pre-order traversal.
    """
    collected = []
    pending = [tree]
    while pending:
        node = pending.pop()
        collected.append(node["entity"])
        # Push children reversed so the first child is popped (visited) first.
        pending.extend(reversed(list(node.get("children", []))))
    return collected
|
|
348
|
+
|
|
349
|
+
@staticmethod
|
|
350
|
+
def iterate_fragments_tree(tree, order="post"):
    """Yield tree nodes in traversal order.

    Children of every node are visited sorted by their "index" value.

    Args:
        tree: Fragment tree node (from build_fragment_tree_from_children_entities).
        order: "post" (children first, default) or "pre" (parent first).

    Yields:
        dict nodes with "entity", "index", "merge_params", "children" keys.

    Raises:
        ValueError: if ``order`` is neither "pre" nor "post".
    """
    # An unrecognised order would otherwise produce an empty traversal
    # without any hint that the argument was wrong.
    if order not in ("pre", "post"):
        raise ValueError(f"Unknown traversal order: {order!r} (expected 'pre' or 'post')")

    def _walk(node):
        ordered = sorted(node.get("children", []), key=lambda child: child["index"])
        if order == "pre":
            yield node
        for child in ordered:
            yield from _walk(child)
        if order == "post":
            yield node

    return _walk(tree)
|
|
367
|
+
|
|
368
|
+
@staticmethod
|
|
369
|
+
def reduce_fragments_tree(tree, leaf_fn, parent_fn):
    """Fold a fragment tree bottom-up into a single result.

    Args:
        tree: Fragment tree node (from build_fragment_tree_from_children_entities).
        leaf_fn: Callable(node) -> result, applied to nodes without children.
        parent_fn: Callable(node, child_results) -> result, applied to nodes
            with children; child_results holds the children's results in
            ascending "index" order.

    Returns:
        The result computed for the root node.
    """

    def _fold(node):
        ordered = sorted(node.get("children", []), key=lambda child: child["index"])
        if ordered:
            return parent_fn(node, [_fold(child) for child in ordered])
        return leaf_fn(node)

    return _fold(tree)
|
|
390
|
+
|
|
241
391
|
|
|
242
392
|
class NifiEntityBatch(object):
|
|
243
393
|
def __init__(self, entities, config, config_key):
|
|
@@ -777,25 +927,68 @@ class NifiEntity(object):
|
|
|
777
927
|
entity_type = self.record["entity_type"]
|
|
778
928
|
return entity_type == type or type in self.request["ontology_info"]["parents"]
|
|
779
929
|
|
|
780
|
-
def is_fragmented(self) -> bool:
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
|
|
784
|
-
|
|
785
|
-
|
|
786
|
-
|
|
787
|
-
|
|
788
|
-
|
|
789
|
-
|
|
790
|
-
|
|
791
|
-
|
|
792
|
-
|
|
793
|
-
|
|
794
|
-
|
|
930
|
+
def is_fragmented(self, fragment_name_or_idx=None) -> bool:
    """Check whether this entity is part of a fragmentation.

    Args:
        fragment_name_or_idx: If None (default), returns True if the entity
            belongs to any fragmentation level. If an int, checks whether the
            fragments_stack has an entry at that index (negative indices
            count from the end, as with list indexing). If a string, checks
            whether that fragmenter keylist is present in the stack.

    Returns:
        True if the entity is fragmented (at the specified level, if given).
    """
    stack = self.request["config"].get("fragment", {}).get("fragments_stack", [])
    if fragment_name_or_idx is None:
        return bool(stack)
    if isinstance(fragment_name_or_idx, int):
        # Same validity range as stack[idx]: -len <= idx < len. An empty
        # stack therefore has no valid index at all.
        return -len(stack) <= fragment_name_or_idx < len(stack)
    return fragment_name_or_idx in stack
|
|
948
|
+
|
|
949
|
+
def is_root_fragment(self, fragment_name_or_idx=-1, recurse=True) -> bool:
    """Check whether this entity is a root fragment (index == 0).

    Args:
        fragment_name_or_idx: Which fragmentation level to check. An int
            indexes into fragments_stack (default -1 = oldest level),
            a string matches by fragmenter keylist name.
        recurse: If True (default), check from the starting level towards
            index 0 (most recent) and return True only if the entity is
            root at every checked level. If False, only check the single
            specified level.

    Returns:
        True if the entity is a root fragment at the specified level(s),
        or if the entity is not fragmented at all. Also True when the
        requested level does not exist in the stack (index out of range or
        unknown name).
    """
    if not self.is_fragmented():
        return True
    fragments_stack = self.request["config"]["fragment"]["fragments_stack"]

    if isinstance(fragment_name_or_idx, int):
        depth = len(fragments_stack)
        # Validate the range explicitly: a negative index beyond the stack
        # depth must not wrap around to a different, existing level.
        if not -depth <= fragment_name_or_idx < depth:
            return True
        resolved = (
            fragment_name_or_idx
            if fragment_name_or_idx >= 0
            else depth + fragment_name_or_idx
        )
        key = fragments_stack[resolved]
    else:
        key = fragment_name_or_idx
        if key not in fragments_stack:
            return True
        resolved = fragments_stack.index(key)

    # With recurse, every level from stack position 0 up to the resolved one
    # must report index 0; otherwise only the resolved level is checked.
    keys_to_check = fragments_stack[:resolved + 1] if recurse else [key]
    return all(
        NifiFragmenter.get_fragment_info(self, k).get("index", 0) == 0
        for k in keys_to_check
    )
|
|
799
992
|
|
|
800
993
|
def get_fragment_root_uid(self, fragment_name_or_idx) -> str:
|
|
801
994
|
fragment_config = self.request.get("config", {}).get("fragment", {})
|
|
@@ -1216,18 +1409,20 @@ class NifiRoute(Route):
|
|
|
1216
1409
|
op = op[0]
|
|
1217
1410
|
query_params = request.query_params
|
|
1218
1411
|
processor_suffix = query_params["processor_suffix"]
|
|
1219
|
-
body = await request.json()
|
|
1220
1412
|
processor_name = "processor." + self.processor_name + "." + op + "." + processor_suffix
|
|
1221
1413
|
if op not in self.endpoints.keys():
|
|
1222
1414
|
raise StarletteHTTPException(403, f"Route {op} is forbidden for NiFi.")
|
|
1223
|
-
task_id = await self.celery_executor.send_task(
|
|
1415
|
+
task_id = await self.celery_executor.send_task(
|
|
1416
|
+
self.endpoints[op], args=[processor_name], part=request.stream()
|
|
1417
|
+
)
|
|
1224
1418
|
return task_id
|
|
1225
1419
|
|
|
1226
1420
|
@staticmethod
|
|
1227
1421
|
def nifi_task(celery_executor, *args, **opts):
|
|
1228
1422
|
def decorator(func):
|
|
1229
1423
|
@wraps(func)
|
|
1230
|
-
def nifi_func(task,
|
|
1424
|
+
def nifi_func(task, body_bytes, processor_name, *args, **kwargs):
|
|
1425
|
+
body = json.loads(body_bytes)
|
|
1231
1426
|
with NifiContextManager(body) as nifi_context:
|
|
1232
1427
|
entity_batches = nifi_context.receive_input(body, processor_name)
|
|
1233
1428
|
entity_batches = func(
|
|
@@ -15,13 +15,13 @@ def get_label_keys(type, ontology):
|
|
|
15
15
|
return list(label_keys.keys())
|
|
16
16
|
|
|
17
17
|
def get_label(record, ontology):
|
|
18
|
-
if record.get("
|
|
19
|
-
return str(record.get("
|
|
20
|
-
if record.get("
|
|
21
|
-
return str(record.get("
|
|
18
|
+
if record.get("os_entity_label_materialized") not in (None, ""):
|
|
19
|
+
return str(record.get("os_entity_label_materialized")).strip()
|
|
20
|
+
if record.get("os_entity_label") not in (None, ""):
|
|
21
|
+
return str(record.get("os_entity_label")).strip()
|
|
22
22
|
label_keys = get_label_keys(record["entity_type"], ontology)
|
|
23
23
|
fields = [record.get(field) for field in label_keys]
|
|
24
24
|
fields = [f for f in fields if f is not None]
|
|
25
25
|
fields = [str(f) for f in fields if f]
|
|
26
26
|
label = " ".join(fields)
|
|
27
|
-
return label or None
|
|
27
|
+
return label.strip() or None
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|