streamlit-octostar-utils 0.4.2.dev25__tar.gz → 0.5.0.dev2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/PKG-INFO +1 -1
  2. {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/pyproject.toml +1 -1
  3. {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/api_crafter/celery.py +180 -24
  4. {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/api_crafter/fastapi.py +1 -97
  5. {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/api_crafter/nifi.py +215 -20
  6. {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/ontology/inheritance.py +5 -5
  7. {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/LICENSE +0 -0
  8. {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/README.md +0 -0
  9. {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/__init__.py +0 -0
  10. {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/api_crafter/__init__.py +0 -0
  11. {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/api_crafter/contents.py +0 -0
  12. {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/api_crafter/parallelism.py +0 -0
  13. {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/api_crafter/parser/__init__.py +0 -0
  14. {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/api_crafter/parser/combine_fields.py +0 -0
  15. {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/api_crafter/parser/entities_parser.py +0 -0
  16. {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/api_crafter/parser/generics.py +0 -0
  17. {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/api_crafter/parser/info.py +0 -0
  18. {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/api_crafter/parser/linkchart_functions.py +0 -0
  19. {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/api_crafter/parser/matches.py +0 -0
  20. {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/api_crafter/parser/parameters.py +0 -0
  21. {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/api_crafter/parser/rules.py +0 -0
  22. {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/api_crafter/parser/signals.py +0 -0
  23. {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/core/__init__.py +0 -0
  24. {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/core/dict.py +0 -0
  25. {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/core/filetypes.py +0 -0
  26. {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/core/threading/__init__.py +0 -0
  27. {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/core/threading/key_queue.py +0 -0
  28. {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/core/timestamp.py +0 -0
  29. {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/nlp/__init__.py +0 -0
  30. {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/nlp/custom_recognizers.py +0 -0
  31. {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/nlp/language.py +0 -0
  32. {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/nlp/ner.py +0 -0
  33. {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/octostar/__init__.py +0 -0
  34. {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/octostar/client.py +0 -0
  35. {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/octostar/context.py +0 -0
  36. {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/octostar/permissions.py +0 -0
  37. {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/ontology/__init__.py +0 -0
  38. {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/ontology/relationships.py +0 -0
  39. {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/ontology/validation.py +0 -0
  40. {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/style/__init__.py +0 -0
  41. {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/style/common.py +0 -0
  42. {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/threading/__init__.py +0 -0
  43. {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/threading/async_task_manager.py +0 -0
  44. {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/threading/session_callback_manager.py +0 -0
  45. {streamlit_octostar_utils-0.4.2.dev25 → streamlit_octostar_utils-0.5.0.dev2}/streamlit_octostar_utils/threading/session_state_hot_swapper.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: streamlit-octostar-utils
3
- Version: 0.4.2.dev25
3
+ Version: 0.5.0.dev2
4
4
  Summary:
5
5
  License: MIT
6
6
  License-File: LICENSE
@@ -5,7 +5,7 @@ include = '\.pyi?$'
5
5
 
6
6
  [tool.poetry]
7
7
  name = "streamlit-octostar-utils"
8
- version = "0.4.2-dev.25"
8
+ version = "0.5.0-dev.2"
9
9
  description = ""
10
10
  license = "MIT"
11
11
  authors = ["Octostar"]
@@ -9,7 +9,6 @@ import subprocess
9
9
  from fastapi import Query
10
10
  import time
11
11
  import os
12
- import pickle
13
12
  import atexit
14
13
  import redis
15
14
  import uuid
@@ -68,7 +67,31 @@ class CeleryQueueConfig:
68
67
  self.options = options
69
68
 
70
69
 
70
+ class TaskResult:
71
+ """Wrapper for task results that include binary parts alongside JSON data.
72
+ Tasks returning binary data (e.g. images) should return a TaskResult
73
+ so that serialized_io writes them as multipart parts instead of attempting
74
+ JSON serialization on bytes."""
75
+
76
+ def __init__(self, data, part=None):
77
+ self.data = data
78
+ self.part = part
79
+
80
+
71
81
  class CelerySerialized:
82
+ """Serializes task data to a boundary-delimited multipart file.
83
+
84
+ Format: metadata JSON part followed by optional binary/streamed parts,
85
+ separated by boundary markers (the task_id). Replaces pickle entirely.
86
+ """
87
+
88
+ BOUNDARY_PREFIX = b"--"
89
+ BOUNDARY_SUFFIX = b"\r\n"
90
+ BOUNDARY_END = b"--\r\n"
91
+ CONTENT_TYPE_JSON = b"Content-Type: application/json\r\n"
92
+ CONTENT_TYPE_BYTES = b"Content-Type: application/octet-stream\r\n"
93
+ HEADER_END = b"\r\n"
94
+
72
95
  def __init__(self, folder, redis_client, data=None):
73
96
  self.folder = folder
74
97
  self.data = data
@@ -77,18 +100,70 @@ class CelerySerialized:
77
100
  def set_task_id(self, task_id):
78
101
  self.task_id = task_id
79
102
 
80
- def dump(self):
103
+ def _boundary(self):
104
+ return self.task_id.encode()
105
+
106
+ def _write_boundary(self, f):
107
+ f.write(self.BOUNDARY_PREFIX + self._boundary() + self.BOUNDARY_SUFFIX)
108
+
109
+ def _write_end_boundary(self, f):
110
+ f.write(self.BOUNDARY_PREFIX + self._boundary() + self.BOUNDARY_END)
111
+
112
+ def dump(self, parts=None, part_is_list=False):
113
+ """Write metadata + optional parts in multipart format.
114
+ parts: optional list of bytes objects to write as additional parts.
115
+ part_is_list: whether the original part was a list (preserves type on read).
116
+ """
81
117
  assert self.task_id
118
+ if isinstance(self.data, dict):
119
+ metadata = self.data
120
+ else:
121
+ metadata = {"data": self.data}
122
+ metadata["part_count"] = len(parts) if parts else 0
123
+ metadata["part_is_list"] = part_is_list
82
124
  with RedisFileLock(self.redis_client, os.path.join(self.folder, self.task_id)):
83
- with open(os.path.join(self.folder, self.task_id), "wb") as target_file:
84
- pickle.dump(self.data, file=target_file, protocol=pickle.HIGHEST_PROTOCOL)
125
+ with open(os.path.join(self.folder, self.task_id), "wb") as f:
126
+ self._write_boundary(f)
127
+ f.write(self.CONTENT_TYPE_JSON)
128
+ f.write(self.HEADER_END)
129
+ f.write(json.dumps(metadata).encode())
130
+ f.write(b"\r\n")
131
+ if parts:
132
+ for part in parts:
133
+ self._write_boundary(f)
134
+ f.write(self.CONTENT_TYPE_BYTES)
135
+ f.write(self.HEADER_END)
136
+ f.write(part)
137
+ f.write(b"\r\n")
138
+ self._write_end_boundary(f)
85
139
 
86
140
  def load(self):
141
+ """Read multipart file. Returns (metadata_dict, list_of_bytes_parts)."""
87
142
  assert self.task_id
143
+ boundary = self.BOUNDARY_PREFIX + self._boundary()
144
+ end_boundary = self.BOUNDARY_PREFIX + self._boundary() + b"--"
88
145
  with RedisFileLock(self.redis_client, os.path.join(self.folder, self.task_id)):
89
- with open(os.path.join(self.folder, self.task_id), "rb") as source_file:
90
- data = pickle.load(source_file)
91
- return data
146
+ with open(os.path.join(self.folder, self.task_id), "rb") as f:
147
+ raw = f.read()
148
+ sections = raw.split(boundary)
149
+ metadata = None
150
+ parts = []
151
+ for section in sections:
152
+ section = section.strip(b"\r\n")
153
+ if not section or section == b"--":
154
+ continue
155
+ header_end = section.find(b"\r\n\r\n")
156
+ if header_end == -1:
157
+ continue
158
+ header = section[:header_end]
159
+ body = section[header_end + 4:]
160
+ if body.endswith(b"\r\n"):
161
+ body = body[:-2]
162
+ if b"application/json" in header:
163
+ metadata = json.loads(body)
164
+ else:
165
+ parts.append(body)
166
+ return metadata or {}, parts
92
167
 
93
168
 
94
169
  class CeleryExecutor(object):
@@ -523,9 +598,16 @@ class CeleryExecutor(object):
523
598
  task_id = task.request.id
524
599
  serialized_data = CelerySerialized(folder=self.in_folder, redis_client=self.redis_client)
525
600
  serialized_data.set_task_id(task_id)
526
- data = serialized_data.load()
601
+ metadata, parts = serialized_data.load()
527
602
  del serialized_data
528
- args, kwargs = data.get("args", []), data.get("kwargs", {})
603
+ args, kwargs = metadata.get("args", []), metadata.get("kwargs", {})
604
+
605
+ part_count = metadata.get("part_count", 0)
606
+ if part_count > 0:
607
+ if metadata.get("part_is_list", part_count > 1):
608
+ args = [parts] + args
609
+ else:
610
+ args = [parts[0]] + args
529
611
 
530
612
  if self.app.conf.task_always_eager:
531
613
  queue = task.request.delivery_info.get("routing_key", self.app.conf.task_default_routing_key)
@@ -536,9 +618,25 @@ class CeleryExecutor(object):
536
618
  queue = task.request.delivery_info.get("routing_key", self.app.conf.task_default_routing_key)
537
619
  task.request.resources = (self.resource_registry or {}).get(queue, {})
538
620
  out_data = task_fn(task, *args, **kwargs)
539
- serialized_data = CelerySerialized(folder=self.out_folder, data=out_data, redis_client=self.redis_client)
621
+ if isinstance(out_data, TaskResult):
622
+ out_part_is_list = isinstance(out_data.part, list)
623
+ if out_data.part is None:
624
+ out_parts = None
625
+ elif out_part_is_list:
626
+ out_parts = out_data.part
627
+ else:
628
+ out_parts = [out_data.part]
629
+ serialized_data = CelerySerialized(
630
+ folder=self.out_folder, data=out_data.data, redis_client=self.redis_client
631
+ )
632
+ else:
633
+ out_parts = None
634
+ out_part_is_list = False
635
+ serialized_data = CelerySerialized(
636
+ folder=self.out_folder, data=out_data, redis_client=self.redis_client
637
+ )
540
638
  serialized_data.set_task_id(task_id)
541
- serialized_data.dump()
639
+ serialized_data.dump(parts=out_parts, part_is_list=out_part_is_list)
542
640
  del serialized_data
543
641
  if os.path.isfile(os.path.join(self.in_folder, task_id)):
544
642
  with RedisFileLock(self.redis_client, os.path.join(self.in_folder, task_id)):
@@ -579,7 +677,7 @@ class CeleryExecutor(object):
579
677
 
580
678
  return decorator
581
679
 
582
- async def send_task(self, task_fn, args=None, kwargs=None, **options) -> str:
680
+ async def send_task(self, task_fn, args=None, kwargs=None, part=None, **options) -> str:
583
681
  args = args if args is not None else []
584
682
  kwargs = kwargs if kwargs is not None else {}
585
683
  if self.app.conf.task_always_eager and "dev_preload" not in self.app.conf:
@@ -635,14 +733,21 @@ class CeleryExecutor(object):
635
733
  await asyncio.get_running_loop().run_in_executor(
636
734
  self.set_thread_pool, _check_queue_llen, queue_name
637
735
  )
638
- await asyncio.get_running_loop().run_in_executor(
639
- self.io_thread_pool,
640
- _write_task_data,
641
- self.in_folder,
642
- args,
643
- kwargs,
644
- task_id,
645
- )
736
+
737
+ if part is not None:
738
+ await self._write_task_data_with_part(
739
+ task_id, args, kwargs, part
740
+ )
741
+ else:
742
+ await asyncio.get_running_loop().run_in_executor(
743
+ self.io_thread_pool,
744
+ _write_task_data,
745
+ self.in_folder,
746
+ args,
747
+ kwargs,
748
+ task_id,
749
+ )
750
+
646
751
  await asyncio.get_running_loop().run_in_executor(
647
752
  self.set_thread_pool, _send_task, task_fn, task_id, options
648
753
  )
@@ -664,6 +769,50 @@ class CeleryExecutor(object):
664
769
  sem.release()
665
770
  return task_id
666
771
 
772
+ async def _write_task_data_with_part(self, task_id, args, kwargs, part):
773
+ """Write task data with a streamed part to the multipart file.
774
+ The part becomes the first arg on the worker side.
775
+ """
776
+ boundary = CelerySerialized.BOUNDARY_PREFIX + task_id.encode()
777
+ boundary_line = boundary + CelerySerialized.BOUNDARY_SUFFIX
778
+ end_boundary_line = boundary + CelerySerialized.BOUNDARY_END
779
+
780
+ is_list = isinstance(part, list)
781
+ items = part if is_list else [part]
782
+ part_count = len(items)
783
+
784
+ metadata = {"args": args, "kwargs": kwargs, "part_count": part_count, "part_is_list": is_list}
785
+ metadata_bytes = json.dumps(metadata).encode()
786
+
787
+ file_path = os.path.join(self.in_folder, task_id)
788
+ with open(file_path, "wb") as f:
789
+ f.write(boundary_line)
790
+ f.write(CelerySerialized.CONTENT_TYPE_JSON)
791
+ f.write(CelerySerialized.HEADER_END)
792
+ f.write(metadata_bytes)
793
+ f.write(b"\r\n")
794
+
795
+ for item in items:
796
+ f.write(boundary_line)
797
+ f.write(CelerySerialized.CONTENT_TYPE_BYTES)
798
+ f.write(CelerySerialized.HEADER_END)
799
+ if hasattr(item, "__aiter__"):
800
+ async for chunk in item:
801
+ f.write(chunk if isinstance(chunk, bytes) else chunk.encode())
802
+ elif hasattr(item, "read"):
803
+ while True:
804
+ chunk = await item.read(65536)
805
+ if not chunk:
806
+ break
807
+ f.write(chunk if isinstance(chunk, bytes) else chunk.encode())
808
+ elif isinstance(item, bytes):
809
+ f.write(item)
810
+ else:
811
+ raise TypeError(f"Unsupported part item type: {type(item)}")
812
+ f.write(b"\r\n")
813
+
814
+ f.write(end_boundary_line)
815
+
667
816
  async def terminate_task(self, task_id):
668
817
  def _terminate_task(celery_app, task_id):
669
818
  celery_app.control.revoke(task_id, terminate=True)
@@ -717,8 +866,15 @@ class CeleryExecutor(object):
717
866
  def _read_task_data(out_folder, task_id):
718
867
  serialized_data = CelerySerialized(folder=out_folder, redis_client=self.redis_client)
719
868
  serialized_data.set_task_id(task_id)
720
- result = serialized_data.load()
721
- return result
869
+ metadata, parts = serialized_data.load()
870
+ data = metadata.get("data", metadata)
871
+ part_count = metadata.get("part_count", 0)
872
+ if part_count > 0:
873
+ if metadata.get("part_is_list", part_count > 1):
874
+ return TaskResult(data=data, part=parts)
875
+ else:
876
+ return TaskResult(data=data, part=parts[0])
877
+ return data
722
878
 
723
879
  def _remove_task_data(celery_app, in_folder, out_folder, task_id):
724
880
  celery_app.AsyncResult(task_id).forget()
@@ -744,10 +900,10 @@ class CeleryExecutor(object):
744
900
  )
745
901
  return result
746
902
 
747
- async def send_and_wait_task(self, task_fn, args=None, kwargs=None, timeout=60, **options):
903
+ async def send_and_wait_task(self, task_fn, args=None, kwargs=None, part=None, timeout=60, **options):
748
904
  args = args if args is not None else []
749
905
  kwargs = kwargs if kwargs is not None else {}
750
- task_id = await self.send_task(task_fn, args, kwargs, **options)
906
+ task_id = await self.send_task(task_fn, args, kwargs, part=part, **options)
751
907
  ready = False
752
908
  state = None
753
909
  start_time = time.time()
@@ -1,5 +1,5 @@
1
1
  import asyncio
2
- from fastapi import Request, Body
2
+ from fastapi import Request
3
3
  from fastapi.responses import JSONResponse, StreamingResponse
4
4
  from pydantic import BaseModel
5
5
  from typing import List, Optional, Literal, Any
@@ -17,8 +17,6 @@ import traceback
17
17
  from copy import copy
18
18
  import logging
19
19
 
20
- from octostar.client import make_client
21
-
22
20
  MAX_ERROR_MESSAGE_BYTES = 256
23
21
  MAX_ERROR_TRACEBACK_BYTES = 10240
24
22
 
@@ -99,100 +97,6 @@ class Route(ABC):
99
97
  return self
100
98
 
101
99
 
102
- class OctostarRoute(Route):
103
- def __init__(self, app, tasks_routes, celery_executor=None, router=None):
104
- self.app = app
105
- self._router = router
106
- self.routed_funcs = []
107
- self.tasks_routes = tasks_routes
108
- self.celery_executor = celery_executor
109
- self.endpoints = {}
110
- self.define_routes()
111
-
112
- def register_route(self, op, octostar_task):
113
- self.endpoints[op.strip("/")] = octostar_task
114
-
115
- def define_routes(self):
116
- if self.celery_executor:
117
-
118
- @Route.route(self, path="/task-state/{task_id}")
119
- async def get_task_status(task_id: str) -> JSONResponse:
120
- task_status = await self.tasks_routes.get_task(task_id, pop=False)
121
- task_status = task_status.model_dump(mode="json")["data"]["task_state"]
122
- return JSONResponse(task_status)
123
-
124
- @Route.route(self, path="/task-result/{task_id}")
125
- async def get_task_result(task_id: str) -> JSONResponse:
126
- return_data = await self.tasks_routes.get_task(task_id, pop=True)
127
- return_data = return_data.model_dump(mode="json")["data"]["data"]
128
- return JSONResponse(return_data)
129
-
130
- @Route.route(self, path="/{op}", methods=["POST"])
131
- async def send_task(
132
- op: str,
133
- os_context: dict = Body(...),
134
- jwt: str = Body(...),
135
- params: dict = Body(dict()),
136
- ) -> str:
137
- """
138
- Any request coming from Octostar (e.g. manifest) should enter from here.
139
- """
140
- path_params = []
141
- op = op.split("/")
142
- if len(op) > 1:
143
- path_params = op[1:]
144
- op = op[0]
145
- query_params = params
146
- client = make_client(jwt)
147
- if op not in self.endpoints.keys():
148
- raise StarletteHTTPException(401, f"Route {op} is forbidden for NiFi.")
149
- task_id = await self.celery_executor.send_task(
150
- self.endpoints[op], args=[os_context, client, query_params]
151
- )
152
- return task_id
153
-
154
- else:
155
-
156
- @Route.route(self, path="/{op}", methods=["POST"])
157
- async def call_task(
158
- op: str,
159
- os_context: dict = Body(...),
160
- jwt: str = Body(...),
161
- params: dict = Body(dict()),
162
- ) -> str:
163
- """
164
- Any request coming from Octostar (e.g. manifest) should enter from here.
165
- """
166
- path_params = []
167
- op = op.split("/")
168
- if len(op) > 1:
169
- path_params = op[1:]
170
- op = op[0]
171
- query_params = params
172
- client = make_client(jwt)
173
- if op not in self.endpoints.keys():
174
- raise StarletteHTTPException(401, f"Route {op} is forbidden for NiFi.")
175
- result = await self.endpoints[op](os_context, client, query_params)
176
- return result
177
-
178
- @staticmethod
179
- def octostar_task(celery_executor, *args, **opts):
180
- def decorator(func):
181
- if celery_executor:
182
- serialized_func = celery_executor.serialized_io(func)
183
- task_func = celery_executor.app.task(*args, **opts)(serialized_func)
184
- else:
185
-
186
- @wraps(func)
187
- def octostar_func(*args, **kwargs):
188
- return func(None, *args, **kwargs)
189
-
190
- task_func = octostar_func
191
- return task_func
192
-
193
- return decorator
194
-
195
-
196
100
  class CommonModels(object):
197
101
  class OKResponseModel(BaseModel):
198
102
  message: str = "OK"
@@ -238,6 +238,156 @@ class NifiFragmenter(object):
238
238
  pointer.get("merge_params") or {}, defragmenter_config, lambda _, v2: v2
239
239
  )
240
240
 
241
+ _REQUIRED_FRAGMENT_FIELDS = ("index", "count", "identifier")
242
+
243
+ @staticmethod
244
+ def get_fragment_info(entity, fragmenter_keylist):
245
+ """Read fragment metadata (identifier, count, index, root_uid, merge_params)
246
+ for a given fragmenter level. Read-only -- does not mutate the entity.
247
+
248
+ Args:
249
+ entity: A NifiEntity or NifiEntityProxy.
250
+ fragmenter_keylist: Dot-separated key path into the fragment config
251
+ (e.g. "document_pages" or "audio_split").
252
+
253
+ Returns:
254
+ dict with keys like identifier, count, index, root_uid, merge_params.
255
+ Empty dict if fragmenter_keylist is empty or intermediate keys are
256
+ missing (entity not fragmented at this level).
257
+
258
+ Raises:
259
+ KeyError: If the final fragment key is missing from the config.
260
+ RuntimeError: If the fragment info exists but lacks required fields
261
+ (index, count, identifier).
262
+ """
263
+ if not fragmenter_keylist:
264
+ return {}
265
+ pointer = entity.request["config"]["fragment"]
266
+ for k in fragmenter_keylist.split(".")[:-1]:
267
+ if not pointer.get(k):
268
+ return {}
269
+ pointer = pointer[k]
270
+ info = pointer[fragmenter_keylist.split(".")[-1]]
271
+ missing = [f for f in NifiFragmenter._REQUIRED_FRAGMENT_FIELDS if f not in info]
272
+ if missing:
273
+ raise RuntimeError(
274
+ f"Fragment info for '{fragmenter_keylist}' is missing required "
275
+ f"field(s): {', '.join(missing)}"
276
+ )
277
+ return info
278
+
279
+ @staticmethod
280
+ def identify_fragment_groups(nifi_batches):
281
+ """Find all fragmented entities grouped by their active fragmenter level.
282
+
283
+ Args:
284
+ nifi_batches: List of NifiEntityBatch objects.
285
+
286
+ Returns:
287
+ dict mapping fragmenter_keylist to list of entities at that level.
288
+ Empty dict if no fragments found. Callers use get_fragment_info()
289
+ to fetch metadata per entity, and filter by index==0 to find roots.
290
+ """
291
+ all_entities = list(itertools.chain(*[b.entities for b in nifi_batches]))
292
+ groups = {}
293
+ for e in all_entities:
294
+ stack = e.request["config"].get("fragment", {}).get("fragments_stack", [])
295
+ if stack:
296
+ groups.setdefault(stack[0], []).append(e)
297
+ return groups
298
+
299
+ @staticmethod
300
+ def build_fragment_tree_from_children_entities(root_entity, fragmenter_keylist):
301
+ """Recursively build a tree from a root fragment entity by walking
302
+ its children_entities.
303
+
304
+ Args:
305
+ root_entity: The root entity (index 0) to start from.
306
+ fragmenter_keylist: The fragmenter level to build for.
307
+
308
+ Returns:
309
+ Nested dict with keys:
310
+ "entity": NifiEntity/NifiEntityProxy
311
+ "index": int
312
+ "merge_params": dict or None
313
+ "children": list of child trees
314
+ """
315
+ info = NifiFragmenter.get_fragment_info(root_entity, fragmenter_keylist)
316
+ child_fragments = []
317
+ for e in root_entity.children_entities:
318
+ try:
319
+ child_info = NifiFragmenter.get_fragment_info(e, fragmenter_keylist)
320
+ if child_info:
321
+ child_fragments.append(e)
322
+ except (AttributeError, KeyError):
323
+ pass
324
+ return {
325
+ "entity": root_entity,
326
+ "index": info.get("index"),
327
+ "merge_params": info.get("merge_params"),
328
+ "children": [
329
+ NifiFragmenter.build_fragment_tree_from_children_entities(child, fragmenter_keylist)
330
+ for child in child_fragments
331
+ ],
332
+ }
333
+
334
+ @staticmethod
335
+ def extract_tree_entities(tree):
336
+ """Flatten a fragment tree into a list of all entities (pre-order).
337
+
338
+ Args:
339
+ tree: Fragment tree node (from build_fragment_tree_from_children_entities).
340
+
341
+ Returns:
342
+ List of entities in pre-order traversal.
343
+ """
344
+ entities = [tree["entity"]]
345
+ for child in tree.get("children", []):
346
+ entities.extend(NifiFragmenter.extract_tree_entities(child))
347
+ return entities
348
+
349
+ @staticmethod
350
+ def iterate_fragments_tree(tree, order="post"):
351
+ """Yield tree nodes in traversal order.
352
+
353
+ Args:
354
+ tree: Fragment tree node (from build_fragment_tree_from_children_entities).
355
+ order: "post" (children first, default) or "pre" (parent first).
356
+
357
+ Yields:
358
+ dict nodes with "entity", "index", "merge_params", "children" keys.
359
+ """
360
+ children = sorted(tree.get("children", []), key=lambda x: x["index"])
361
+ if order == "pre":
362
+ yield tree
363
+ for child in children:
364
+ yield from NifiFragmenter.iterate_fragments_tree(child, order)
365
+ if order == "post":
366
+ yield tree
367
+
368
+ @staticmethod
369
+ def reduce_fragments_tree(tree, leaf_fn, parent_fn):
370
+ """Bottom-up tree reduction. Processes leaves first, then folds results up.
371
+
372
+ Args:
373
+ tree: Fragment tree node (from build_fragment_tree_from_children_entities).
374
+ leaf_fn: Callable(node) -> result, called on nodes with no children.
375
+ parent_fn: Callable(node, child_results) -> result, called on
376
+ nodes with children. child_results is a list of results from
377
+ child nodes, sorted by index.
378
+
379
+ Returns:
380
+ The result from the root node.
381
+ """
382
+ children = sorted(tree.get("children", []), key=lambda x: x["index"])
383
+ if not children:
384
+ return leaf_fn(tree)
385
+ child_results = [
386
+ NifiFragmenter.reduce_fragments_tree(child, leaf_fn, parent_fn)
387
+ for child in children
388
+ ]
389
+ return parent_fn(tree, child_results)
390
+
241
391
 
242
392
  class NifiEntityBatch(object):
243
393
  def __init__(self, entities, config, config_key):
@@ -777,25 +927,68 @@ class NifiEntity(object):
777
927
  entity_type = self.record["entity_type"]
778
928
  return entity_type == type or type in self.request["ontology_info"]["parents"]
779
929
 
780
- def is_fragmented(self) -> bool:
781
- return bool(self.request["config"].get("fragment", {}).get("fragments_stack"))
782
-
783
- def is_root_fragment(self, entity) -> bool:
784
- def _is_sub_fragment_recursive(fragment: dict) -> bool:
785
- if not isinstance(fragment, dict):
786
- return False
787
- if all(k in fragment for k in ["index", "count", "identifier"]):
788
- return fragment.get("index", 0) != 0
789
- for value in fragment.values():
790
- if isinstance(value, dict):
791
- if _is_sub_fragment_recursive(value):
792
- return True
793
- return False
794
-
930
+ def is_fragmented(self, fragment_name_or_idx=None) -> bool:
931
+ """Check whether this entity is part of a fragmentation.
932
+
933
+ Args:
934
+ fragment_name_or_idx: If None (default), returns True if the entity
935
+ belongs to any fragmentation level. If an int, checks whether the
936
+ fragments_stack has an entry at that index. If a string, checks
937
+ whether that fragmenter keylist is present in the stack.
938
+
939
+ Returns:
940
+ True if the entity is fragmented (at the specified level, if given).
941
+ """
942
+ stack = self.request["config"].get("fragment", {}).get("fragments_stack", [])
943
+ if fragment_name_or_idx is None:
944
+ return bool(stack)
945
+ if isinstance(fragment_name_or_idx, int):
946
+ return abs(fragment_name_or_idx) <= len(stack)
947
+ return fragment_name_or_idx in stack
948
+
949
+ def is_root_fragment(self, fragment_name_or_idx=-1, recurse=True) -> bool:
950
+ """Check whether this entity is a root fragment (index == 0).
951
+
952
+ Args:
953
+ fragment_name_or_idx: Which fragmentation level to check. An int
954
+ indexes into fragments_stack (default -1 = oldest level),
955
+ a string matches by fragmenter keylist name.
956
+ recurse: If True (default), check from the starting level towards
957
+ index 0 (most recent) and return True only if the entity is
958
+ root at every checked level. If False, only check the single
959
+ specified level.
960
+
961
+ Returns:
962
+ True if the entity is a root fragment at the specified level(s),
963
+ or if the entity is not fragmented at all.
964
+ """
795
965
  if not self.is_fragmented():
796
966
  return True
797
- fragment = entity.request.get("config", {}).get("fragment", {})
798
- return not _is_sub_fragment_recursive(fragment)
967
+ fragments_stack = self.request["config"]["fragment"]["fragments_stack"]
968
+
969
+ if isinstance(fragment_name_or_idx, int):
970
+ try:
971
+ resolved = (
972
+ fragment_name_or_idx
973
+ if fragment_name_or_idx >= 0
974
+ else len(fragments_stack) + fragment_name_or_idx
975
+ )
976
+ key = fragments_stack[resolved]
977
+ except (IndexError, ValueError):
978
+ return True
979
+ else:
980
+ key = fragment_name_or_idx
981
+ if key not in fragments_stack:
982
+ return True
983
+ resolved = fragments_stack.index(key)
984
+
985
+ if recurse:
986
+ keys_to_check = fragments_stack[:resolved + 1]
987
+ return all(
988
+ NifiFragmenter.get_fragment_info(self, k).get("index", 0) == 0
989
+ for k in keys_to_check
990
+ )
991
+ return NifiFragmenter.get_fragment_info(self, key).get("index", 0) == 0
799
992
 
800
993
  def get_fragment_root_uid(self, fragment_name_or_idx) -> str:
801
994
  fragment_config = self.request.get("config", {}).get("fragment", {})
@@ -1216,18 +1409,20 @@ class NifiRoute(Route):
1216
1409
  op = op[0]
1217
1410
  query_params = request.query_params
1218
1411
  processor_suffix = query_params["processor_suffix"]
1219
- body = await request.json()
1220
1412
  processor_name = "processor." + self.processor_name + "." + op + "." + processor_suffix
1221
1413
  if op not in self.endpoints.keys():
1222
1414
  raise StarletteHTTPException(403, f"Route {op} is forbidden for NiFi.")
1223
- task_id = await self.celery_executor.send_task(self.endpoints[op], args=[body, processor_name])
1415
+ task_id = await self.celery_executor.send_task(
1416
+ self.endpoints[op], args=[processor_name], part=request.stream()
1417
+ )
1224
1418
  return task_id
1225
1419
 
1226
1420
  @staticmethod
1227
1421
  def nifi_task(celery_executor, *args, **opts):
1228
1422
  def decorator(func):
1229
1423
  @wraps(func)
1230
- def nifi_func(task, body, processor_name, *args, **kwargs):
1424
+ def nifi_func(task, body_bytes, processor_name, *args, **kwargs):
1425
+ body = json.loads(body_bytes)
1231
1426
  with NifiContextManager(body) as nifi_context:
1232
1427
  entity_batches = nifi_context.receive_input(body, processor_name)
1233
1428
  entity_batches = func(
@@ -15,13 +15,13 @@ def get_label_keys(type, ontology):
15
15
  return list(label_keys.keys())
16
16
 
17
17
  def get_label(record, ontology):
18
- if record.get("os_materialized_label") not in (None, ""):
19
- return str(record.get("os_materialized_label"))
20
- if record.get("os_custom_label") not in (None, ""):
21
- return str(record.get("os_custom_label"))
18
+ if record.get("os_entity_label_materialized") not in (None, ""):
19
+ return str(record.get("os_entity_label_materialized")).strip()
20
+ if record.get("os_entity_label") not in (None, ""):
21
+ return str(record.get("os_entity_label")).strip()
22
22
  label_keys = get_label_keys(record["entity_type"], ontology)
23
23
  fields = [record.get(field) for field in label_keys]
24
24
  fields = [f for f in fields if f is not None]
25
25
  fields = [str(f) for f in fields if f]
26
26
  label = " ".join(fields)
27
- return label or None
27
+ return label.strip() or None