streamlit-octostar-utils 0.4.1__tar.gz → 0.4.2.dev1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. {streamlit_octostar_utils-0.4.1 → streamlit_octostar_utils-0.4.2.dev1}/PKG-INFO +1 -1
  2. {streamlit_octostar_utils-0.4.1 → streamlit_octostar_utils-0.4.2.dev1}/pyproject.toml +1 -1
  3. streamlit_octostar_utils-0.4.2.dev1/streamlit_octostar_utils/api_crafter/contents.py +760 -0
  4. {streamlit_octostar_utils-0.4.1 → streamlit_octostar_utils-0.4.2.dev1}/streamlit_octostar_utils/api_crafter/nifi.py +35 -51
  5. {streamlit_octostar_utils-0.4.1 → streamlit_octostar_utils-0.4.2.dev1}/streamlit_octostar_utils/core/timestamp.py +4 -1
  6. {streamlit_octostar_utils-0.4.1 → streamlit_octostar_utils-0.4.2.dev1}/LICENSE +0 -0
  7. {streamlit_octostar_utils-0.4.1 → streamlit_octostar_utils-0.4.2.dev1}/README.md +0 -0
  8. {streamlit_octostar_utils-0.4.1 → streamlit_octostar_utils-0.4.2.dev1}/streamlit_octostar_utils/__init__.py +0 -0
  9. {streamlit_octostar_utils-0.4.1 → streamlit_octostar_utils-0.4.2.dev1}/streamlit_octostar_utils/api_crafter/__init__.py +0 -0
  10. {streamlit_octostar_utils-0.4.1 → streamlit_octostar_utils-0.4.2.dev1}/streamlit_octostar_utils/api_crafter/celery.py +0 -0
  11. {streamlit_octostar_utils-0.4.1 → streamlit_octostar_utils-0.4.2.dev1}/streamlit_octostar_utils/api_crafter/fastapi.py +0 -0
  12. {streamlit_octostar_utils-0.4.1 → streamlit_octostar_utils-0.4.2.dev1}/streamlit_octostar_utils/api_crafter/parallelism.py +0 -0
  13. {streamlit_octostar_utils-0.4.1 → streamlit_octostar_utils-0.4.2.dev1}/streamlit_octostar_utils/api_crafter/parser/__init__.py +0 -0
  14. {streamlit_octostar_utils-0.4.1 → streamlit_octostar_utils-0.4.2.dev1}/streamlit_octostar_utils/api_crafter/parser/combine_fields.py +0 -0
  15. {streamlit_octostar_utils-0.4.1 → streamlit_octostar_utils-0.4.2.dev1}/streamlit_octostar_utils/api_crafter/parser/entities_parser.py +0 -0
  16. {streamlit_octostar_utils-0.4.1 → streamlit_octostar_utils-0.4.2.dev1}/streamlit_octostar_utils/api_crafter/parser/generics.py +0 -0
  17. {streamlit_octostar_utils-0.4.1 → streamlit_octostar_utils-0.4.2.dev1}/streamlit_octostar_utils/api_crafter/parser/info.py +0 -0
  18. {streamlit_octostar_utils-0.4.1 → streamlit_octostar_utils-0.4.2.dev1}/streamlit_octostar_utils/api_crafter/parser/linkchart_functions.py +0 -0
  19. {streamlit_octostar_utils-0.4.1 → streamlit_octostar_utils-0.4.2.dev1}/streamlit_octostar_utils/api_crafter/parser/matches.py +0 -0
  20. {streamlit_octostar_utils-0.4.1 → streamlit_octostar_utils-0.4.2.dev1}/streamlit_octostar_utils/api_crafter/parser/parameters.py +0 -0
  21. {streamlit_octostar_utils-0.4.1 → streamlit_octostar_utils-0.4.2.dev1}/streamlit_octostar_utils/api_crafter/parser/rules.py +0 -0
  22. {streamlit_octostar_utils-0.4.1 → streamlit_octostar_utils-0.4.2.dev1}/streamlit_octostar_utils/api_crafter/parser/signals.py +0 -0
  23. {streamlit_octostar_utils-0.4.1 → streamlit_octostar_utils-0.4.2.dev1}/streamlit_octostar_utils/core/__init__.py +0 -0
  24. {streamlit_octostar_utils-0.4.1 → streamlit_octostar_utils-0.4.2.dev1}/streamlit_octostar_utils/core/dict.py +0 -0
  25. {streamlit_octostar_utils-0.4.1 → streamlit_octostar_utils-0.4.2.dev1}/streamlit_octostar_utils/core/filetypes.py +0 -0
  26. {streamlit_octostar_utils-0.4.1 → streamlit_octostar_utils-0.4.2.dev1}/streamlit_octostar_utils/core/threading/__init__.py +0 -0
  27. {streamlit_octostar_utils-0.4.1 → streamlit_octostar_utils-0.4.2.dev1}/streamlit_octostar_utils/core/threading/key_queue.py +0 -0
  28. {streamlit_octostar_utils-0.4.1 → streamlit_octostar_utils-0.4.2.dev1}/streamlit_octostar_utils/nlp/__init__.py +0 -0
  29. {streamlit_octostar_utils-0.4.1 → streamlit_octostar_utils-0.4.2.dev1}/streamlit_octostar_utils/nlp/custom_recognizers.py +0 -0
  30. {streamlit_octostar_utils-0.4.1 → streamlit_octostar_utils-0.4.2.dev1}/streamlit_octostar_utils/nlp/language.py +0 -0
  31. {streamlit_octostar_utils-0.4.1 → streamlit_octostar_utils-0.4.2.dev1}/streamlit_octostar_utils/nlp/ner.py +0 -0
  32. {streamlit_octostar_utils-0.4.1 → streamlit_octostar_utils-0.4.2.dev1}/streamlit_octostar_utils/octostar/__init__.py +0 -0
  33. {streamlit_octostar_utils-0.4.1 → streamlit_octostar_utils-0.4.2.dev1}/streamlit_octostar_utils/octostar/client.py +0 -0
  34. {streamlit_octostar_utils-0.4.1 → streamlit_octostar_utils-0.4.2.dev1}/streamlit_octostar_utils/octostar/context.py +0 -0
  35. {streamlit_octostar_utils-0.4.1 → streamlit_octostar_utils-0.4.2.dev1}/streamlit_octostar_utils/octostar/permissions.py +0 -0
  36. {streamlit_octostar_utils-0.4.1 → streamlit_octostar_utils-0.4.2.dev1}/streamlit_octostar_utils/ontology/__init__.py +0 -0
  37. {streamlit_octostar_utils-0.4.1 → streamlit_octostar_utils-0.4.2.dev1}/streamlit_octostar_utils/ontology/inheritance.py +0 -0
  38. {streamlit_octostar_utils-0.4.1 → streamlit_octostar_utils-0.4.2.dev1}/streamlit_octostar_utils/ontology/relationships.py +0 -0
  39. {streamlit_octostar_utils-0.4.1 → streamlit_octostar_utils-0.4.2.dev1}/streamlit_octostar_utils/ontology/validation.py +0 -0
  40. {streamlit_octostar_utils-0.4.1 → streamlit_octostar_utils-0.4.2.dev1}/streamlit_octostar_utils/style/__init__.py +0 -0
  41. {streamlit_octostar_utils-0.4.1 → streamlit_octostar_utils-0.4.2.dev1}/streamlit_octostar_utils/style/common.py +0 -0
  42. {streamlit_octostar_utils-0.4.1 → streamlit_octostar_utils-0.4.2.dev1}/streamlit_octostar_utils/threading/__init__.py +0 -0
  43. {streamlit_octostar_utils-0.4.1 → streamlit_octostar_utils-0.4.2.dev1}/streamlit_octostar_utils/threading/async_task_manager.py +0 -0
  44. {streamlit_octostar_utils-0.4.1 → streamlit_octostar_utils-0.4.2.dev1}/streamlit_octostar_utils/threading/session_callback_manager.py +0 -0
  45. {streamlit_octostar_utils-0.4.1 → streamlit_octostar_utils-0.4.2.dev1}/streamlit_octostar_utils/threading/session_state_hot_swapper.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: streamlit-octostar-utils
3
- Version: 0.4.1
3
+ Version: 0.4.2.dev1
4
4
  Summary:
5
5
  License: MIT
6
6
  License-File: LICENSE
@@ -5,7 +5,7 @@ include = '\.pyi?$'
5
5
 
6
6
  [tool.poetry]
7
7
  name = "streamlit-octostar-utils"
8
- version = "0.4.1"
8
+ version = "0.4.2-dev.1"
9
9
  description = ""
10
10
  license = "MIT"
11
11
  authors = ["Octostar"]
@@ -0,0 +1,760 @@
1
+ """
2
+ Contents abstraction for NiFi entities.
3
+
4
+ Provides a FileIO-like interface for handling entity contents with support for:
5
+ - Full io.BufferedIOBase compatibility (read, write, seek, tell, etc.)
6
+ - Seeking and streaming with HTTP Range requests
7
+ - Multiple storage backends (memory, workspace attachments)
8
+ - Serialization/deserialization via from_locator()/to_locator() methods
9
+ - Lazy loading and efficient partial reads for large files
10
+
11
+ Storage Backends:
12
+ - MemoryContents: In-memory buffer (like BytesIO)
13
+ - WorkspaceAttachmentContents: Octostar workspace attachments with HTTP Range support
14
+ """
15
+
16
+ from abc import ABC, abstractmethod
17
+ from typing import Optional, Dict, Any, Union, BinaryIO, List
18
+ from enum import Enum
19
+ import base64
20
+ import httpx
21
+ import logging
22
+ from io import BytesIO, SEEK_SET, SEEK_CUR, SEEK_END
23
+ from copy import deepcopy
24
+
25
+ _logger = logging.getLogger(__name__)
26
+
27
+
28
class ContentsLocation(Enum):
    """Closed set of storage backends a contents locator may point at.

    The member values are the strings stored under the ``"location"`` key of
    a locator dictionary.
    """

    # Payload travels inline with the locator.
    MEMORY = "memory"
    # Payload lives in a workspace attachment.
    WORKSPACE_ATTACHMENT = "workspace_attachment"
32
+
33
+
34
+ class Contents(ABC):
35
+ """
36
+ Abstract base class for entity contents.
37
+
38
+ Provides a FileIO-like interface with support for seeking, streaming,
39
+ and multiple storage backends. Each implementation manages its own
40
+ locator dictionary for serialization/deserialization.
41
+ """
42
+
43
+ def __init__(
44
+ self,
45
+ mode: str = "rb",
46
+ entity_type: Optional[str] = None,
47
+ filetype: Optional[str] = None,
48
+ **kwargs
49
+ ):
50
+ self._mode = mode
51
+ self._entity_type = entity_type
52
+ self._filetype = filetype
53
+ self._closed = False
54
+
55
+ # ==================== FileIO Interface ====================
56
+
57
+ @abstractmethod
58
+ def read(self, size: int = -1) -> bytes:
59
+ """
60
+ Read and return up to size bytes, or all bytes if size is -1.
61
+
62
+ Args:
63
+ size: Number of bytes to read. -1 means read all.
64
+
65
+ Returns:
66
+ Bytes read from the stream.
67
+ """
68
+ pass
69
+
70
+ @abstractmethod
71
+ def write(self, b: bytes) -> int:
72
+ """
73
+ Write bytes to the stream.
74
+
75
+ Args:
76
+ b: Bytes to write.
77
+
78
+ Returns:
79
+ Number of bytes written.
80
+ """
81
+ pass
82
+
83
+ @abstractmethod
84
+ def seek(self, offset: int, whence: int = SEEK_SET) -> int:
85
+ """
86
+ Change stream position.
87
+
88
+ Args:
89
+ offset: Offset relative to whence.
90
+ whence: SEEK_SET (0) = from start, SEEK_CUR (1) = from current, SEEK_END (2) = from end.
91
+
92
+ Returns:
93
+ New absolute position.
94
+ """
95
+ pass
96
+
97
+ @abstractmethod
98
+ def tell(self) -> int:
99
+ """
100
+ Return current stream position.
101
+
102
+ Returns:
103
+ Current position in bytes.
104
+ """
105
+ pass
106
+
107
+ def readable(self) -> bool:
108
+ """Check if stream is readable."""
109
+ return 'r' in self._mode or '+' in self._mode
110
+
111
+ def writable(self) -> bool:
112
+ """Check if stream is writable."""
113
+ return 'w' in self._mode or 'a' in self._mode or '+' in self._mode
114
+
115
+ def seekable(self) -> bool:
116
+ """Check if stream is seekable."""
117
+ return True
118
+
119
+ @abstractmethod
120
+ def flush(self):
121
+ """Flush write buffers."""
122
+ pass
123
+
124
+ @abstractmethod
125
+ def close(self):
126
+ """Close the stream and release resources."""
127
+ self._closed = True
128
+
129
+ @property
130
+ def closed(self) -> bool:
131
+ """Check if stream is closed."""
132
+ return self._closed
133
+
134
+ @property
135
+ def mode(self) -> str:
136
+ """Get the file mode."""
137
+ return self._mode
138
+
139
+ @abstractmethod
140
+ def truncate(self, size: Optional[int] = None) -> int:
141
+ """
142
+ Resize the stream to the given size.
143
+
144
+ Args:
145
+ size: New size in bytes. If None, use current position.
146
+
147
+ Returns:
148
+ New size.
149
+ """
150
+ pass
151
+
152
+ def __enter__(self):
153
+ """Context manager entry."""
154
+ return self
155
+
156
+ def __exit__(self, exc_type, exc_val, exc_tb):
157
+ """Context manager exit."""
158
+ self.close()
159
+ return False
160
+
161
+ def readline(self, size: int = -1) -> bytes:
162
+ """
163
+ Read and return one line from the stream.
164
+
165
+ Args:
166
+ size: Maximum number of bytes to read. -1 means no limit.
167
+
168
+ Returns:
169
+ Bytes up to and including the newline character, or until EOF.
170
+ """
171
+ result = bytearray()
172
+ while True:
173
+ if size >= 0 and len(result) >= size:
174
+ break
175
+ byte = self.read(1)
176
+ if not byte:
177
+ break
178
+ result.extend(byte)
179
+ if byte == b'\n':
180
+ break
181
+ return bytes(result)
182
+
183
+ def readlines(self, hint: int = -1) -> List[bytes]:
184
+ """
185
+ Read and return a list of lines from the stream.
186
+
187
+ Args:
188
+ hint: Optional size hint. If total size of lines exceeds hint, no more lines are read.
189
+
190
+ Returns:
191
+ List of lines.
192
+ """
193
+ lines = []
194
+ total_size = 0
195
+ while True:
196
+ line = self.readline()
197
+ if not line:
198
+ break
199
+ lines.append(line)
200
+ total_size += len(line)
201
+ if hint >= 0 and total_size >= hint:
202
+ break
203
+ return lines
204
+
205
+ def writelines(self, lines: List[bytes]):
206
+ """
207
+ Write a list of lines to the stream.
208
+
209
+ Args:
210
+ lines: List of byte strings to write.
211
+ """
212
+ for line in lines:
213
+ self.write(line)
214
+
215
+ def readinto(self, b: bytearray) -> int:
216
+ """
217
+ Read bytes into a pre-allocated buffer.
218
+
219
+ Args:
220
+ b: Pre-allocated bytearray to read into.
221
+
222
+ Returns:
223
+ Number of bytes read.
224
+ """
225
+ data = self.read(len(b))
226
+ n = len(data)
227
+ b[:n] = data
228
+ return n
229
+
230
+ def read1(self, size: int = -1) -> bytes:
231
+ """
232
+ Read up to size bytes with at most one read() call to the underlying stream.
233
+
234
+ For most implementations, this is the same as read(). Subclasses may override
235
+ for optimization.
236
+
237
+ Args:
238
+ size: Number of bytes to read. -1 means read all available.
239
+
240
+ Returns:
241
+ Bytes read from the stream.
242
+ """
243
+ return self.read(size)
244
+
245
+ # ==================== Locator Interface ====================
246
+
247
+ @abstractmethod
248
+ def to_locator(self) -> Dict[str, Any]:
249
+ """
250
+ Serialize contents to a locator dictionary.
251
+
252
+ The locator contains:
253
+ - location: The storage backend type
254
+ - Additional backend-specific parameters
255
+
256
+ Returns:
257
+ Dictionary describing how to locate/access this content.
258
+ """
259
+ pass
260
+
261
+ @staticmethod
262
+ def from_locator(locator: Optional[Dict[str, Any]], client=None) -> Optional['Contents']:
263
+ """
264
+ Deserialize contents from a locator dictionary.
265
+
266
+ Factory method that creates the appropriate Contents subclass based on the
267
+ locator's "location" field. This replaces the old base64.b64decode logic in NiFi.
268
+
269
+ Args:
270
+ locator: Locator dictionary (e.g., from request.contents_pointer)
271
+ client: Optional Octostar client for remote operations
272
+
273
+ Returns:
274
+ Contents instance or None if locator is None/empty.
275
+
276
+ Raises:
277
+ ValueError: If the location type is unknown
278
+ """
279
+ if not locator:
280
+ return None
281
+
282
+ location = locator.get("location")
283
+
284
+ match location:
285
+ case ContentsLocation.MEMORY.value:
286
+ return MemoryContents._from_locator(locator)
287
+ case ContentsLocation.WORKSPACE_ATTACHMENT.value:
288
+ return WorkspaceAttachmentContents._from_locator(locator, client)
289
+ case _:
290
+ raise ValueError(f"Unknown contents location type: {location}")
291
+
292
+ # ==================== Utility Methods ====================
293
+
294
+ def read_all(self) -> bytes:
295
+ """Read all contents and return as bytes."""
296
+ current_pos = self.tell()
297
+ self.seek(0, SEEK_SET)
298
+ data = self.read()
299
+ self.seek(current_pos, SEEK_SET)
300
+ return data
301
+
302
+ def getvalue(self) -> bytes:
303
+ """
304
+ Return the entire contents without moving the position.
305
+
306
+ This matches BytesIO.getvalue() behavior.
307
+
308
+ Returns:
309
+ Entire contents as bytes.
310
+ """
311
+ return self.read_all()
312
+
313
+ def write_all(self, data: bytes):
314
+ """Write all data, replacing existing contents."""
315
+ self.seek(0, SEEK_SET)
316
+ self.truncate(0)
317
+ self.write(data)
318
+ self.flush()
319
+
320
+ def __len__(self) -> int:
321
+ """
322
+ Return the length of the contents.
323
+
324
+ Returns:
325
+ Total size in bytes.
326
+ """
327
+ current_pos = self.tell()
328
+ self.seek(0, SEEK_END)
329
+ length = self.tell()
330
+ self.seek(current_pos, SEEK_SET)
331
+ return length
332
+
333
+
334
class MemoryContents(Contents):
    """Contents backend that keeps the whole payload in a BytesIO buffer."""

    def __init__(
        self,
        mode: str = "r+b",
        entity_type: Optional[str] = None,
        filetype: Optional[str] = None,
        *,
        initial_data: Optional[bytes] = None,
        **kwargs
    ):
        """
        Create an in-memory stream.

        Args:
            mode: File-mode string; the default allows reading and writing.
            entity_type: Optional entity type carried into the locator.
            filetype: Optional file type carried into the locator.
            initial_data: Bytes to pre-fill the buffer with; None or empty
                bytes give an empty buffer.
            **kwargs: Forwarded to the base class.
        """
        super().__init__(mode, entity_type, filetype, **kwargs)
        seed = initial_data or b""
        self._buffer = BytesIO(seed)

    def read(self, size: int = -1) -> bytes:
        """Read up to size bytes from the buffer; raises IOError if not readable."""
        if self.readable():
            return self._buffer.read(size)
        raise IOError("Contents not readable")

    def write(self, b: bytes) -> int:
        """Write b into the buffer; raises IOError if not writable."""
        if self.writable():
            return self._buffer.write(b)
        raise IOError("Contents not writable")

    def seek(self, offset: int, whence: int = SEEK_SET) -> int:
        """Move the buffer position and return the new absolute position."""
        position = self._buffer.seek(offset, whence)
        return position

    def tell(self) -> int:
        """Return the current buffer position."""
        position = self._buffer.tell()
        return position

    def flush(self):
        """Flush the underlying buffer."""
        self._buffer.flush()

    def close(self):
        """Close the underlying buffer once; later calls are no-ops."""
        if self._closed:
            return
        self._buffer.close()
        super().close()

    def truncate(self, size: Optional[int] = None) -> int:
        """Resize the buffer to size (current position when None)."""
        new_size = self._buffer.truncate(size)
        return new_size

    def getvalue(self) -> bytes:
        """Return the entire buffer without touching the position."""
        snapshot = self._buffer.getvalue()
        return snapshot

    def to_locator(self) -> Dict[str, Any]:
        """
        Serialize to a locator carrying the payload as base64 text.

        Returns:
            {"location": "memory", "data": "<base64>", "entity_type": "...", "filetype": "..."}
            "data" is None for an empty buffer; "entity_type" and "filetype"
            are only present when set.
        """
        raw = self._buffer.getvalue()
        if raw:
            encoded = base64.b64encode(raw).decode('utf-8')
        else:
            encoded = None

        locator = {"location": ContentsLocation.MEMORY.value, "data": encoded}
        # Optional metadata is only emitted when it carries a value.
        optional_fields = (
            ("entity_type", self._entity_type),
            ("filetype", self._filetype),
        )
        for key, value in optional_fields:
            if value:
                locator[key] = value
        return locator

    @staticmethod
    def _from_locator(locator: Dict[str, Any]) -> 'MemoryContents':
        """
        Create MemoryContents from a locator dictionary.

        Args:
            locator: Locator dictionary with base64-encoded data

        Returns:
            New MemoryContents instance
        """
        encoded = locator.get("data")
        if encoded:
            decoded = base64.b64decode(encoded)
        else:
            decoded = None
        return MemoryContents(
            entity_type=locator.get("entity_type"),
            filetype=locator.get("filetype"),
            initial_data=decoded,
        )
415
+
416
+
417
class WorkspaceAttachmentContents(Contents):
    """
    Contents implementation for Octostar workspace attachments.

    Uses octostar-api utilities (read_attachment, get_attachment_url,
    write_attachment) to:
    - Lazy load from workspace storage via read_attachment()
    - Stream with HTTP Range requests in read-only mode (no need to download the entire file)
    - Buffer modifications in memory and push them with write_attachment() on flush()/close()

    Presigned URL Handling:
    - URLs are obtained via get_attachment_url() and cached
    - On 403 (Forbidden) responses, URLs are refreshed and requests retried

    Note: chunk_size is stored on the instance but is not consulted by any
    method of this class.
    """

    DEFAULT_CHUNK_SIZE = 8192
    DEFAULT_URL_TIMEOUT = 120
    # How many times a request is attempted while the presigned URL keeps expiring.
    _MAX_URL_RETRIES = 3

    def __init__(
        self,
        mode: str = "rb",
        entity_type: Optional[str] = None,
        filetype: Optional[str] = None,
        *,
        workspace_id: str,
        entity_id: str,
        client,
        initial_data: Optional[bytes] = None,
        chunk_size: int = DEFAULT_CHUNK_SIZE,
        **kwargs
    ):
        """
        Create a handle on a workspace attachment.

        Args:
            mode: File-mode string; read-only modes stream, writable modes buffer.
            entity_type: Entity type, required by flush() when data was modified.
            filetype: File type, required by flush() when data was modified.
            workspace_id: Workspace holding the attachment.
            entity_id: Entity the attachment belongs to.
            client: Octostar client passed to the octostar-api helpers.
            initial_data: When given, used as the fully loaded buffer so no
                remote read is needed.
            chunk_size: Stored as-is (see class note).
            **kwargs: Forwarded to the base class.
        """
        super().__init__(mode, entity_type, filetype, **kwargs)
        self._workspace_id = workspace_id
        self._entity_id = entity_id
        self._client = client
        self._chunk_size = chunk_size

        self._buffer: Optional[BytesIO] = None
        self._fully_loaded = False
        self._modified = False
        # Logical position used while streaming (before a buffer exists).
        self._position = 0
        self._size: Optional[int] = None
        self._presigned_url: Optional[str] = None
        self._http_client: Optional[httpx.Client] = None

        if initial_data is not None:
            self._buffer = BytesIO(initial_data)
            self._fully_loaded = True
            self._size = len(initial_data)

    def _has_loaded_buffer(self) -> bool:
        """Return True when the in-memory buffer holds the attachment."""
        return self._buffer is not None and self._fully_loaded

    def _get_presigned_url(self) -> str:
        """Request a fresh presigned URL for the attachment."""
        from octostar.utils.workspace import get_attachment_url

        return get_attachment_url.sync(
            os_workspace=self._workspace_id,
            os_entity_uid=self._entity_id,
            client=self._client
        )

    def _ensure_http_client(self):
        """Ensure HTTP client is initialized."""
        if not self._http_client:
            self._http_client = httpx.Client(timeout=self.DEFAULT_URL_TIMEOUT)

    def _fetch_size(self) -> int:
        """
        Fetch the size of the remote attachment using HEAD request.

        The result is cached in self._size. Handles presigned URL expiration
        with automatic refresh and retry.

        Returns:
            Size in bytes (0 when the response has no content-length header).

        Raises:
            ConnectionError: If no attempt produced a 200 response.
        """
        if self._size is not None:
            return self._size

        if not self._presigned_url:
            self._presigned_url = self._get_presigned_url()

        self._ensure_http_client()
        max_retries = self._MAX_URL_RETRIES

        for attempt in range(max_retries):
            response = self._http_client.head(self._presigned_url)
            if response.status_code == 200:
                self._size = int(response.headers.get('content-length', 0))
                return self._size
            if response.status_code == 403:
                _logger.debug(
                    "Presigned URL expired while fetching size, refreshing... (attempt %s/%s)",
                    attempt + 1,
                    max_retries,
                )
                self._presigned_url = self._get_presigned_url()
                continue
            response.raise_for_status()

        raise ConnectionError(
            f"Failed to fetch file size after {max_retries} attempts (URL kept expiring)"
        )

    def _read_range(self, start: int, end: int) -> bytes:
        """
        Read a specific byte range using HTTP Range request.

        Handles presigned URL expiration with automatic refresh and retry.

        Args:
            start: Start byte (inclusive)
            end: End byte (inclusive)

        Returns:
            Bytes from the specified range (empty on 416 Range Not Satisfiable)

        Raises:
            ConnectionError: If the URL kept expiring on every attempt.
        """
        if not self._presigned_url:
            self._presigned_url = self._get_presigned_url()

        self._ensure_http_client()

        headers = {"Range": f"bytes={start}-{end}"}
        max_retries = self._MAX_URL_RETRIES

        for attempt in range(max_retries):
            response = self._http_client.get(self._presigned_url, headers=headers)
            status = response.status_code

            if status == 206:
                return response.content
            if status == 416:
                return b""
            if status == 403:
                _logger.debug(
                    "Presigned URL expired at byte %s, refreshing... (attempt %s/%s)",
                    start,
                    attempt + 1,
                    max_retries,
                )
                self._presigned_url = self._get_presigned_url()
                continue

            response.raise_for_status()
            if status == 200:
                # The server ignored the Range header and sent the whole
                # body; cut out the requested window so the caller's
                # position bookkeeping stays correct.
                return response.content[start:end + 1]
            return response.content

        raise ConnectionError(
            f"Failed to read range {start}-{end} after {max_retries} attempts (URL kept expiring)"
        )

    def _load_full(self):
        """
        Load the entire attachment into memory buffer using read_attachment().

        The position established while streaming (or by an earlier seek) is
        carried over into the new buffer.
        """
        if self._fully_loaded:
            return

        from octostar.utils.workspace import read_attachment

        data = read_attachment.sync(
            os_workspace=self._workspace_id,
            os_entity_uid=self._entity_id,
            decode=False,
            stream=False,
            client=self._client
        )
        self._buffer = BytesIO(data or b"")
        # Keep the logical position instead of silently rewinding to 0.
        self._buffer.seek(self._position, SEEK_SET)
        self._fully_loaded = True
        self._size = len(data) if data else 0

    def read(self, size: int = -1) -> bytes:
        """
        Read up to size bytes from the current position.

        Writable or already-loaded streams read from the in-memory buffer
        (loading it first if needed); read-only streams issue one HTTP Range
        request per call.

        Raises:
            IOError: If the stream is not readable.
        """
        if not self.readable():
            raise IOError("Contents not readable")

        # If writable or already fully loaded, use buffer
        if self.writable() or self._fully_loaded:
            if self._buffer is None:
                self._load_full()
            return self._buffer.read(size)

        # Otherwise, use HTTP Range requests for efficient streaming
        total = self._fetch_size()
        remaining = total - self._position
        if size == -1 or size > remaining:
            # Read to the end, or clamp a request that overshoots it
            size = remaining

        if size <= 0:
            return b""

        end_byte = self._position + size - 1
        data = self._read_range(self._position, end_byte)
        self._position += len(data)

        return data

    def write(self, b: bytes) -> int:
        """
        Write b into the in-memory buffer and mark the stream as modified.

        Nothing is sent to the workspace until flush() or close().

        Raises:
            IOError: If the stream is not writable.
        """
        if not self.writable():
            raise IOError("Contents not writable")

        if self._buffer is None:
            self._load_full()

        n = self._buffer.write(b)
        self._modified = True
        return n

    def seek(self, offset: int, whence: int = SEEK_SET) -> int:
        """
        Change the stream position.

        Raises:
            ValueError: On an unknown whence or a negative resulting position.
        """
        loaded = self._has_loaded_buffer()

        # Calculate new position
        if whence == SEEK_SET:
            new_pos = offset
        elif whence == SEEK_CUR:
            new_pos = self.tell() + offset
        elif whence == SEEK_END:
            if loaded:
                # Measure the live buffer: the cached remote size is stale
                # once the buffer has been written to or truncated.
                with self._buffer.getbuffer() as view:
                    end = view.nbytes
            else:
                end = self._fetch_size()
            new_pos = end + offset
        else:
            raise ValueError(f"Invalid whence value: {whence}")

        if new_pos < 0:
            raise ValueError("Negative seek position")

        # If fully loaded, use buffer seek
        if loaded:
            return self._buffer.seek(new_pos, SEEK_SET)

        # Otherwise, just update position (HTTP Range will handle it)
        self._position = new_pos
        return self._position

    def tell(self) -> int:
        """Return the current position (buffer position once loaded)."""
        if self._has_loaded_buffer():
            return self._buffer.tell()
        return self._position

    def flush(self):
        """
        Flush the internal buffer and write to workspace if modified.

        Raises:
            ValueError: If there are modifications but entity_type or
                filetype is missing.
        """
        if self._buffer is not None:
            self._buffer.flush()

        if not self._modified or self._buffer is None:
            return

        if not self._entity_type or not self._filetype:
            raise ValueError("entity_type and filetype required to flush to workspace")

        from octostar.utils.workspace import write_attachment

        # getvalue() returns the whole buffer without moving the position.
        data = self._buffer.getvalue()

        write_attachment.sync(
            os_workspace=self._workspace_id,
            os_entity_uid=self._entity_id,
            entity_type=self._entity_type,
            filetype=self._filetype,
            file=data,
            client=self._client
        )
        self._size = len(data)
        self._modified = False

    def close(self):
        """
        Flush pending writes and release the buffer and HTTP client.

        Resources are released even when the flush fails; the flush error is
        then re-raised to the caller.
        """
        if self._closed:
            return

        try:
            # Flush any pending writes before closing
            if self._modified:
                self.flush()
        finally:
            if self._buffer is not None:
                self._buffer.close()
            if self._http_client is not None:
                self._http_client.close()
                self._http_client = None
            super().close()

    def truncate(self, size: Optional[int] = None) -> int:
        """
        Resize the in-memory buffer and mark the stream as modified.

        Raises:
            IOError: If the stream is not writable. Without this guard a
                read-only stream would be marked modified and pushed to the
                workspace on close().
        """
        if not self.writable():
            raise IOError("Contents not writable")

        if self._buffer is None:
            self._load_full()
        self._modified = True
        return self._buffer.truncate(size)

    def getvalue(self) -> bytes:
        """Return the entire contents, loading the attachment if necessary."""
        if not self._has_loaded_buffer():
            self._load_full()
        return self._buffer.getvalue()

    def to_locator(self) -> Dict[str, Any]:
        """
        Serialize to locator with workspace and entity ID.

        Returns:
            {"location": "workspace_attachment", "pointer": "workspace_id/entity_id",
            "entity_type": "...", "filetype": "..."}
            "pointer" is None when either ID is missing.
        """
        if self._workspace_id and self._entity_id:
            pointer = f"{self._workspace_id}/{self._entity_id}"
        else:
            pointer = None

        locator = {
            "location": ContentsLocation.WORKSPACE_ATTACHMENT.value,
            "pointer": pointer
        }
        if self._entity_type:
            locator["entity_type"] = self._entity_type
        if self._filetype:
            locator["filetype"] = self._filetype
        return locator

    @staticmethod
    def _from_locator(locator: Dict[str, Any], client=None) -> 'WorkspaceAttachmentContents':
        """
        Create WorkspaceAttachmentContents from a locator dictionary.

        Args:
            locator: Locator dictionary with pointer "workspace_id/entity_id"
            client: Octostar client for remote operations

        Returns:
            New WorkspaceAttachmentContents instance. Both IDs are None when
            the pointer is missing or contains no "/".
        """
        pointer = locator.get("pointer")
        workspace_id = None
        entity_id = None

        if pointer:
            parts = pointer.split("/")
            if len(parts) >= 2:
                # First segment is the workspace, last segment the entity.
                workspace_id = parts[0]
                entity_id = parts[-1]

        return WorkspaceAttachmentContents(
            entity_type=locator.get("entity_type"),
            filetype=locator.get("filetype"),
            workspace_id=workspace_id,
            entity_id=entity_id,
            client=client
        )
@@ -26,6 +26,7 @@ from ..core.dict import recursive_update_dict, travel_dict, jsondict_hash
26
26
  from ..core.timestamp import now, string_to_datetime
27
27
  from .fastapi import DefaultErrorRoute, Route
28
28
  from ..ontology.inheritance import is_child_concept as is_child_concept_fn, get_label_keys
29
+ from .contents import Contents, MemoryContents, WorkspaceAttachmentContents, ContentsLocation
29
30
 
30
31
  RELATIONSHIP_ENTITY_NAME = "os_relationship"
31
32
  LOCAL_RELATIONSHIP_ENTITY_NAME = "os_workspace_relationship"
@@ -47,11 +48,6 @@ def safe_async_run(coro):
47
48
  return asyncio.run(coro)
48
49
 
49
50
 
50
- class NifiContentsPointerLocationModel(Enum):
51
- LOCAL = "local"
52
- ATTACHMENT = "attachment"
53
-
54
-
55
51
  class NifiProxyEntityModel(BaseModel):
56
52
  entity_id: str
57
53
  entity_type: str
@@ -72,10 +68,6 @@ class NifiEntityModel(BaseModel):
72
68
  relationships: List[str]
73
69
  label_keys: List[str]
74
70
 
75
- class ContentsPointerModel(BaseModel):
76
- location: NifiContentsPointerLocationModel
77
- pointer: Optional[str] = None
78
-
79
71
  jwt: str
80
72
  ontology_name: str
81
73
  ontology_info: OntologyInfoModel
@@ -84,7 +76,7 @@ class NifiEntityModel(BaseModel):
84
76
  nifi_attributes: dict = Field(default_factory=dict)
85
77
  config: dict = Field(default_factory=dict)
86
78
  metrics: dict = Field(default_factory=dict)
87
- contents_pointer: Optional[ContentsPointerModel] = None
79
+ contents_pointer: Optional[dict] = None
88
80
  is_temporary: bool = False
89
81
  exception: dict = Field(default_factory=dict)
90
82
  last_processor_name: Optional[str] = None
@@ -103,7 +95,7 @@ class NifiEntityModel(BaseModel):
103
95
  record: RecordModel
104
96
  annotations: Dict[str, Any] = Field(default_factory=dict)
105
97
  children: List[Union[NifiOTMRelationshipProxyModel, NifiProxyEntityModel]] = []
106
- contents: Optional[bytes] = None
98
+ contents: Optional[Dict[str, Any]] = None
107
99
 
108
100
 
109
101
  NifiEntityModel.model_rebuild()
@@ -295,12 +287,10 @@ class NifiContextManager(object):
295
287
  return client, curr_user_ontology
296
288
 
297
289
  def receive_input(self, json_data, processor_name) -> List["NifiEntityBatch"]:
298
- def _safe_decode(contents):
299
- return base64.b64decode(contents) if contents else None
300
-
301
290
  entities = []
302
291
  all_independent_uids = [e["record"]["entity_id"] for e in json_data]
303
292
  for elem in json_data:
293
+ contents = Contents.from_locator(elem.get("contents"), client=self.client)
304
294
  entities.append(
305
295
  NifiEntity(
306
296
  self,
@@ -309,7 +299,7 @@ class NifiContextManager(object):
309
299
  elem["annotations"],
310
300
  all_independent_uids,
311
301
  elem["children"],
312
- _safe_decode(elem.get("contents")),
302
+ contents,
313
303
  )
314
304
  )
315
305
  entities = sorted(
@@ -467,7 +457,7 @@ class NifiContextManager(object):
467
457
  has_write_flag = entity.sync_params.get(NifiContextManager.SyncFlag.WRITE_CONTENTS)
468
458
  is_temp_with_pointer = entity.request.get("is_temporary") and entity.contents_pointer
469
459
  if has_write_flag or is_temp_with_pointer:
470
- if entity.contents:
460
+ if entity.contents: # Contents instance check
471
461
  files_to_write.append(entity)
472
462
  # FIND ENTITIES TO UPSERT
473
463
  self._find_entities_to_upsert(entities, entities_to_upsert, reserved_fields)
@@ -495,6 +485,10 @@ class NifiContextManager(object):
495
485
  # WRITE FILES
496
486
  if files_to_write:
497
487
  for file in files_to_write:
488
+ if not file.contents:
489
+ continue
490
+
491
+ # Pass Contents instance directly — write_file uses duck typing
498
492
  new_file_record = write_file.sync(
499
493
  file.write_os_workspace,
500
494
  "./" + file.record["os_item_name"],
@@ -510,10 +504,18 @@ class NifiContextManager(object):
510
504
  file.record["entity_label"] = file.label
511
505
  file.request["is_temporary"] = False
512
506
  file.request["entity_timestamp"] = file.record["os_last_updated_at"]
513
- file._contents = None
507
+ file._contents = WorkspaceAttachmentContents(
508
+ workspace_id=file.record['os_workspace'],
509
+ entity_id=file.record['os_entity_uid'],
510
+ client=self.client,
511
+ entity_type=file.record["os_concept"],
512
+ filetype=file.record["os_item_content_type"]
513
+ )
514
514
  file.request["contents_pointer"] = {
515
- "location": NifiContentsPointerLocationModel.ATTACHMENT.value,
516
- "pointer": f"{file.record['os_workspace']}/{file.record['os_entity_uid']}"
515
+ "location": ContentsLocation.WORKSPACE_ATTACHMENT.value,
516
+ "pointer": f"{file.record['os_workspace']}/{file.record['os_entity_uid']}",
517
+ "entity_type": file.record["os_concept"],
518
+ "filetype": file.record["os_item_content_type"]
517
519
  }
518
520
  # UPSERT ENTITIES
519
521
  if entities_to_upsert:
@@ -692,7 +694,7 @@ class NifiEntity(object):
692
694
  c["annotations"],
693
695
  all_independent_uids,
694
696
  c["children"],
695
- c["contents"],
697
+ Contents.from_locator(c.get("contents"), client=self.context.client),
696
698
  )
697
699
  for c in full_entity_children
698
700
  ]
@@ -711,7 +713,7 @@ class NifiEntity(object):
711
713
  for i in range(len(child_uids))
712
714
  ]
713
715
  self.children.extend(proxy_otm_children)
714
- self._contents = contents
716
+ self._contents: Optional[Contents] = contents
715
717
  self.drop_on_output = False
716
718
 
717
719
  def __eq__(self, other):
@@ -741,36 +743,16 @@ class NifiEntity(object):
741
743
  self._annotations = new_annotations
742
744
 
743
745
  @property
744
- def contents(self):
746
+ def contents(self) -> Optional[Contents]:
745
747
  if not self._contents:
746
748
  contents_pointer = self.contents_pointer
747
749
  if not contents_pointer:
748
750
  return None
749
- if contents_pointer["location"] == "attachment":
750
- self._contents = read_file.sync(
751
- contents_pointer["pointer"].split("/")[0],
752
- contents_pointer["pointer"].split("/")[-1],
753
- False,
754
- client=self.context.client,
755
- )
751
+ self._contents = Contents.from_locator(contents_pointer, client=self.context.client)
756
752
  return self._contents
757
753
 
758
- @property
759
- def contents_pointer(self):
760
- contents_pointer = deepcopy(self.request.get("contents_pointer"))
761
- if not self.request.get("contents_pointer"):
762
- return None
763
- ptr_location = contents_pointer.get("location")
764
- if ptr_location == "attachment" and not contents_pointer.get("pointer"):
765
- contents_pointer["pointer"] = f"{self.record['os_workspace']}/{self.record['os_entity_uid']}"
766
- return contents_pointer
767
-
768
- @contents_pointer.setter
769
- def contents_pointer(self, new_value):
770
- self.request["contents_pointer"] = new_value
771
-
772
754
  @contents.setter
773
- def contents(self, new_contents):
755
+ def contents(self, new_contents: Optional[Union[Contents, bytes]]):
774
756
  self._contents = new_contents
775
757
 
776
758
  @property
@@ -866,9 +848,6 @@ class NifiEntity(object):
866
848
  return not _is_sub_fragment_recursive(fragment)
867
849
 
868
850
  def to_json(self):
869
- def _safe_encode(contents):
870
- return base64.b64encode(contents) if contents else None
871
-
872
851
  if self.drop_on_output:
873
852
  return
874
853
  proxy_entity_children = []
@@ -909,7 +888,7 @@ class NifiEntity(object):
909
888
  "record": self.record,
910
889
  "children": children,
911
890
  "annotations": self.annotations,
912
- "contents": _safe_encode(self._contents),
891
+ "contents": self._contents.to_locator() if self._contents else None,
913
892
  }
914
893
 
915
894
  def _add_entity(self, os_workspace, entity_type, fields, os_entity_uid=None):
@@ -1058,7 +1037,7 @@ class NifiEntity(object):
1058
1037
  os_parent_folder,
1059
1038
  filename,
1060
1039
  filetype,
1061
- file,
1040
+ file: Union[Contents, bytes],
1062
1041
  fields={},
1063
1042
  os_relationship_name=FILE_RELATIONSHIP_NAME,
1064
1043
  os_relationship_type="mtm",
@@ -1080,8 +1059,13 @@ class NifiEntity(object):
1080
1059
  os_entity_uid,
1081
1060
  os_relationship_uid,
1082
1061
  )
1083
- child_entity._contents = file
1084
- child_entity.request["contents_pointer"] = NifiEntityModel.RequestModel.ContentsPointerModel(location="local")
1062
+ if isinstance(file, Contents):
1063
+ child_entity._contents = file
1064
+ else:
1065
+ child_entity._contents = MemoryContents(
1066
+ entity_type=FILE_ENTITY_NAME, filetype=filetype, initial_data=file
1067
+ )
1068
+ child_entity.request["contents_pointer"] = child_entity._contents.to_locator()
1085
1069
  return child_entity, child_rel
1086
1070
 
1087
1071
  def add_tag(self, os_workspace, name, group, order, color, fields={}):
@@ -10,7 +10,10 @@ def now():
10
10
 
11
11
 
12
12
  def string_to_datetime(datetime_str):
13
- return dt_parser.parse(
13
+ parsed = dt_parser.parse(
14
14
  datetime_str
15
15
  or dt.datetime.fromtimestamp(0, dt.timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
16
16
  )
17
+ if parsed.tzinfo is None or parsed.tzinfo.utcoffset(parsed) is None:
18
+ parsed = parsed.replace(tzinfo=dt.timezone.utc)
19
+ return parsed