streamlit-octostar-utils 0.4.2.dev14__tar.gz → 0.4.2.dev16__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45):
  1. {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/PKG-INFO +1 -1
  2. {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/pyproject.toml +1 -1
  3. {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/api_crafter/celery.py +137 -36
  4. {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/api_crafter/fastapi.py +13 -8
  5. {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/api_crafter/nifi.py +5 -2
  6. {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/nlp/ner.py +285 -26
  7. {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/LICENSE +0 -0
  8. {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/README.md +0 -0
  9. {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/__init__.py +0 -0
  10. {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/api_crafter/__init__.py +0 -0
  11. {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/api_crafter/contents.py +0 -0
  12. {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/api_crafter/parallelism.py +0 -0
  13. {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/api_crafter/parser/__init__.py +0 -0
  14. {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/api_crafter/parser/combine_fields.py +0 -0
  15. {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/api_crafter/parser/entities_parser.py +0 -0
  16. {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/api_crafter/parser/generics.py +0 -0
  17. {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/api_crafter/parser/info.py +0 -0
  18. {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/api_crafter/parser/linkchart_functions.py +0 -0
  19. {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/api_crafter/parser/matches.py +0 -0
  20. {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/api_crafter/parser/parameters.py +0 -0
  21. {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/api_crafter/parser/rules.py +0 -0
  22. {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/api_crafter/parser/signals.py +0 -0
  23. {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/core/__init__.py +0 -0
  24. {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/core/dict.py +0 -0
  25. {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/core/filetypes.py +0 -0
  26. {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/core/threading/__init__.py +0 -0
  27. {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/core/threading/key_queue.py +0 -0
  28. {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/core/timestamp.py +0 -0
  29. {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/nlp/__init__.py +0 -0
  30. {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/nlp/custom_recognizers.py +0 -0
  31. {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/nlp/language.py +0 -0
  32. {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/octostar/__init__.py +0 -0
  33. {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/octostar/client.py +0 -0
  34. {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/octostar/context.py +0 -0
  35. {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/octostar/permissions.py +0 -0
  36. {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/ontology/__init__.py +0 -0
  37. {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/ontology/inheritance.py +0 -0
  38. {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/ontology/relationships.py +0 -0
  39. {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/ontology/validation.py +0 -0
  40. {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/style/__init__.py +0 -0
  41. {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/style/common.py +0 -0
  42. {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/threading/__init__.py +0 -0
  43. {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/threading/async_task_manager.py +0 -0
  44. {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/threading/session_callback_manager.py +0 -0
  45. {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/threading/session_state_hot_swapper.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: streamlit-octostar-utils
3
- Version: 0.4.2.dev14
3
+ Version: 0.4.2.dev16
4
4
  Summary:
5
5
  License: MIT
6
6
  License-File: LICENSE
@@ -5,7 +5,7 @@ include = '\.pyi?$'
5
5
 
6
6
  [tool.poetry]
7
7
  name = "streamlit-octostar-utils"
8
- version = "0.4.2-dev.14"
8
+ version = "0.4.2-dev.16"
9
9
  description = ""
10
10
  license = "MIT"
11
11
  authors = ["Octostar"]
@@ -14,6 +14,7 @@ import atexit
14
14
  import redis
15
15
  import uuid
16
16
  import json
17
+ import hashlib
17
18
  import shutil
18
19
  import threading
19
20
  from pottery import Redlock
@@ -56,12 +57,14 @@ class CeleryQueueConfig:
56
57
  max_tasks_in_queue=None,
57
58
  max_tasks_per_child=None,
58
59
  max_memory_per_child=None,
60
+ stall_timeout=1200,
59
61
  **options,
60
62
  ):
61
63
  self.n_workers = n_workers
62
64
  self.max_tasks_in_queue = max_tasks_in_queue
63
65
  self.max_tasks_per_child = max_tasks_per_child
64
66
  self.max_memory_per_child = max_memory_per_child # KiB
67
+ self.stall_timeout = stall_timeout # seconds; None or 0 to disable
65
68
  self.options = options
66
69
 
67
70
 
@@ -90,6 +93,11 @@ class CelerySerialized:
90
93
 
91
94
  class CeleryExecutor(object):
92
95
  class QueueFullException(Exception):
96
+ """Queue is full. Try again later."""
97
+ pass
98
+
99
+ class QueueStalledException(Exception):
100
+ """Queue has tasks but none are being consumed — workers may be deadlocked or in a crash loop."""
93
101
  pass
94
102
 
95
103
  AWAITING = "AWAITING"
@@ -169,6 +177,12 @@ class CeleryExecutor(object):
169
177
  self.stop_event = threading.Event()
170
178
  self.worker_health_check_thread = None
171
179
  self.worker_info = {}
180
+
181
+ # Queue stall detection
182
+ self._queue_fingerprints = {}
183
+ self._queue_fingerprint_changed_at = {}
184
+ self._queue_stalled = {}
185
+
172
186
  atexit.register(self.close)
173
187
  self.set_cleanup_task()
174
188
  self.register_state_signals()
@@ -204,9 +218,20 @@ class CeleryExecutor(object):
204
218
  if self.preload_functions:
205
219
  celery_signals.worker_process_init.connect(self.preload_on_worker_init)
206
220
 
221
+ def set_last_completed_time(self, sender=None, task_id=None, task=None, **kwargs):
222
+ try:
223
+ queue = task.request.delivery_info.get(
224
+ "routing_key", self.app.conf.task_default_routing_key
225
+ ) if task else None
226
+ if queue:
227
+ self.redis_client.set(f"queue:last_completed:{queue}", str(time.time()))
228
+ except Exception:
229
+ pass
230
+
207
231
  def register_state_signals(self):
208
232
  celery_signals.before_task_publish.connect(self.set_awaiting_state)
209
233
  celery_signals.task_prerun.connect(self.set_started_state)
234
+ celery_signals.task_postrun.connect(self.set_last_completed_time)
210
235
 
211
236
  def cleanup_task_results(in_dir, out_dir, redis_host, redis_port, task_expires, result_expires):
212
237
  logger.info("Starting cleanup of expired task results...")
@@ -378,37 +403,87 @@ class CeleryExecutor(object):
378
403
  def _worker_health_check_loop(self):
379
404
  while not self.stop_event.is_set():
380
405
  try:
381
- dead_processes = []
382
- for process in self.processes:
383
- poll_result = process.poll()
384
- if poll_result is not None:
385
- queue_name, slot, command = self.worker_info[process]
386
- logger.warning(
387
- f"Worker process dead for queue '{queue_name}' slot {slot}. "
388
- f"Exit code: {poll_result}. Restarting..."
389
- )
390
- dead_processes.append(process)
391
- if self.beat_process and self.beat_process.poll() is not None:
392
- logger.warning(
393
- f"Beat process dead (exit code: {self.beat_process.poll()}). Restarting..."
394
- )
395
- self.beat_process = None
396
- for dead_process in dead_processes:
397
- queue_name, slot, command = self.worker_info[dead_process]
398
- self.processes.remove(dead_process)
399
- del self.worker_info[dead_process]
400
- new_process = subprocess.Popen(command)
401
- self.processes.append(new_process)
402
- self.worker_info[new_process] = (queue_name, slot, command)
403
- logger.info(f"Restarted worker for queue '{queue_name}' slot {slot} (PID: {new_process.pid})")
404
- if self.beat_process is None:
405
- self.beat_process = subprocess.Popen(self.beat_command)
406
- logger.info(f"Restarted beat process (PID: {self.beat_process.pid})")
406
+ self._restart_dead_processes()
407
+ self._check_queue_stalls()
407
408
  time.sleep(5)
408
409
  except Exception as e:
409
410
  logger.error(f"Error in worker health check: {e}")
410
411
  time.sleep(5)
411
412
 
413
+ def _restart_dead_processes(self):
414
+ dead_processes = []
415
+ for process in self.processes:
416
+ poll_result = process.poll()
417
+ if poll_result is not None:
418
+ queue_name, slot, command = self.worker_info[process]
419
+ logger.warning(
420
+ f"Worker process dead for queue '{queue_name}' slot {slot}. "
421
+ f"Exit code: {poll_result}. Restarting..."
422
+ )
423
+ dead_processes.append(process)
424
+ if self.beat_process and self.beat_process.poll() is not None:
425
+ logger.warning(
426
+ f"Beat process dead (exit code: {self.beat_process.poll()}). Restarting..."
427
+ )
428
+ self.beat_process = None
429
+ for dead_process in dead_processes:
430
+ queue_name, slot, command = self.worker_info[dead_process]
431
+ self.processes.remove(dead_process)
432
+ del self.worker_info[dead_process]
433
+ new_process = subprocess.Popen(command)
434
+ self.processes.append(new_process)
435
+ self.worker_info[new_process] = (queue_name, slot, command)
436
+ logger.info(f"Restarted worker for queue '{queue_name}' slot {slot} (PID: {new_process.pid})")
437
+ if self.beat_process is None:
438
+ self.beat_process = subprocess.Popen(self.beat_command)
439
+ logger.info(f"Restarted beat process (PID: {self.beat_process.pid})")
440
+
441
+ def _check_queue_stalls(self):
442
+ for queue_name, queue_config in self.queue_config.items():
443
+ if not queue_config.stall_timeout:
444
+ continue
445
+ try:
446
+ queue_items = self.redis_client.lrange(queue_name, 0, -1)
447
+ if len(queue_items) == 0:
448
+ self._queue_stalled[queue_name] = False
449
+ self._queue_fingerprints.pop(queue_name, None)
450
+ self._queue_fingerprint_changed_at.pop(queue_name, None)
451
+ continue
452
+
453
+ fingerprint = hashlib.md5(b"".join(sorted(queue_items))).hexdigest()
454
+ now_time = time.time()
455
+ prev_fingerprint = self._queue_fingerprints.get(queue_name)
456
+
457
+ if fingerprint != prev_fingerprint:
458
+ self._queue_fingerprints[queue_name] = fingerprint
459
+ self._queue_fingerprint_changed_at[queue_name] = now_time
460
+ self._queue_stalled[queue_name] = False
461
+ continue
462
+
463
+ fingerprint_age = now_time - self._queue_fingerprint_changed_at.get(queue_name, now_time)
464
+
465
+ last_completed_raw = self.redis_client.get(f"queue:last_completed:{queue_name}")
466
+ last_completed = float(last_completed_raw) if last_completed_raw else 0
467
+ time_since_completion = (now_time - last_completed) if last_completed else float("inf")
468
+
469
+ was_stalled = self._queue_stalled.get(queue_name, False)
470
+ is_stalled = (
471
+ fingerprint_age >= queue_config.stall_timeout
472
+ and time_since_completion >= queue_config.stall_timeout
473
+ )
474
+ self._queue_stalled[queue_name] = is_stalled
475
+
476
+ if is_stalled and not was_stalled:
477
+ logger.error(
478
+ f"Queue '{queue_name}' is STALLED: {len(queue_items)} task(s) stuck for "
479
+ f"{fingerprint_age:.0f}s with no completions in {time_since_completion:.0f}s. "
480
+ f"New requests will receive 503."
481
+ )
482
+ elif not is_stalled and was_stalled:
483
+ logger.info(f"Queue '{queue_name}' has recovered from stall.")
484
+ except Exception as e:
485
+ logger.error(f"Error checking stall for queue '{queue_name}': {e}")
486
+
412
487
  def close(self):
413
488
  self.stop_event.set()
414
489
  if self.worker_health_check_thread and self.worker_health_check_thread.is_alive():
@@ -507,6 +582,10 @@ class CeleryExecutor(object):
507
582
  self.app.conf.dev_preload = True
508
583
 
509
584
  def _reserve_queue_slot(queue_name):
585
+ if self._queue_stalled.get(queue_name, False):
586
+ raise CeleryExecutor.QueueStalledException(
587
+ f"Queue '{queue_name}' is stalled. Service temporarily unavailable."
588
+ )
510
589
  limit = self.queue_config[queue_name].max_tasks_in_queue
511
590
  if limit:
512
591
  reservation_key = f"queue:reserved:{queue_name}"
@@ -540,6 +619,12 @@ class CeleryExecutor(object):
540
619
  def _send_task(task_fn, task_id, options):
541
620
  task_fn.apply_async(task_id=task_id, **options)
542
621
 
622
+ def _store_task_queue_mapping(task_id, queue_name):
623
+ self.redis_client.set(
624
+ f"task:queue:{task_id}", queue_name,
625
+ ex=self.app.conf.result_expires,
626
+ )
627
+
543
628
  task_id = str(uuid.uuid4())
544
629
  queue_name = self.app.conf.task_default_routing_key
545
630
  queue_name = getattr(task_fn, "queue", queue_name)
@@ -560,6 +645,9 @@ class CeleryExecutor(object):
560
645
  await asyncio.get_running_loop().run_in_executor(
561
646
  self.set_thread_pool, _send_task, task_fn, task_id, options
562
647
  )
648
+ await asyncio.get_running_loop().run_in_executor(
649
+ self.set_thread_pool, _store_task_queue_mapping, task_id, queue_name
650
+ )
563
651
  except asyncio.CancelledError:
564
652
  logger.info(f"Cancelling task {task_id} due to disconnect!")
565
653
  await self.terminate_task(task_id)
@@ -596,6 +684,14 @@ class CeleryExecutor(object):
596
684
  def _poll_task_state(celery_app, task_id):
597
685
  task = celery_app.AsyncResult(task_id)
598
686
  ready, state = task.ready(), task.state
687
+ if not ready and state == CeleryExecutor.AWAITING:
688
+ task_queue = self.redis_client.get(f"task:queue:{task_id}")
689
+ if task_queue:
690
+ queue_name = task_queue.decode() if isinstance(task_queue, bytes) else task_queue
691
+ if self._queue_stalled.get(queue_name, False):
692
+ raise CeleryExecutor.QueueStalledException(
693
+ f"Task {task_id} is in stalled queue '{queue_name}'. Service temporarily unavailable."
694
+ )
599
695
  return ready, state
600
696
 
601
697
  return await asyncio.get_running_loop().run_in_executor(
@@ -704,7 +800,7 @@ class FastAPICeleryTaskRoute(Route):
704
800
  or (state not in ["SUCCESS", "FAILURE", "RETRY", "REVOKED"])
705
801
  )
706
802
  if state in ["FAILURE", "RETRY", "REVOKED"]:
707
- error_response = DefaultErrorRoute.format_error(exc, debug=True).body.decode("utf-8")
803
+ error_response = DefaultErrorRoute.format_error(exc, internal=True).body.decode("utf-8")
708
804
  data = {
709
805
  "task_state": state,
710
806
  "task_id": task_id,
@@ -747,28 +843,33 @@ class CeleryRoute(Route, ABC):
747
843
 
748
844
 
749
845
  class CeleryErrorRoute(DefaultErrorRoute):
750
- DEFAULT_STATUS_CODE_MAPPINGS = {CeleryExecutor.QueueFullException: lambda exc: 429}
751
- DEFAULT_SILENCED_EXCEPTIONS = {CeleryExecutor.QueueFullException: lambda exc: True}
846
+ DEFAULT_STATUS_CODE_MAPPINGS = {
847
+ CeleryExecutor.QueueFullException: lambda exc: 429,
848
+ CeleryExecutor.QueueStalledException: lambda exc: 503,
849
+ }
850
+ DEFAULT_SILENCED_EXCEPTIONS = {
851
+ CeleryExecutor.QueueFullException: lambda exc: True,
852
+ }
752
853
 
753
854
  def add_default_exceptions_handler(
754
855
  fs_app,
755
- debug=False,
856
+ internal=False,
857
+ internal_prefixes=None,
756
858
  excs_to_status_codes=None,
757
859
  silenced_excs=None,
758
860
  ):
759
- extra_status = {CeleryExecutor.QueueFullException: lambda exc: 429}
760
- extra_silence = {CeleryExecutor.QueueFullException: lambda exc: True}
761
-
762
861
  status_codes = {
763
862
  **DefaultErrorRoute.DEFAULT_STATUS_CODE_MAPPINGS,
863
+ **CeleryErrorRoute.DEFAULT_STATUS_CODE_MAPPINGS,
764
864
  **(excs_to_status_codes or {}),
765
- **extra_status,
766
865
  }
767
866
 
768
867
  silenced = {
769
868
  **DefaultErrorRoute.DEFAULT_SILENCED_EXCEPTIONS,
869
+ **CeleryErrorRoute.DEFAULT_SILENCED_EXCEPTIONS,
770
870
  **(silenced_excs or {}),
771
- **extra_silence,
772
871
  }
773
872
 
774
- super(CeleryErrorRoute, CeleryErrorRoute).add_default_exceptions_handler(fs_app, debug, status_codes, silenced)
873
+ super(CeleryErrorRoute, CeleryErrorRoute).add_default_exceptions_handler(
874
+ fs_app, internal, internal_prefixes, status_codes, silenced,
875
+ )
@@ -21,7 +21,6 @@ from octostar.client import make_client
21
21
 
22
22
  MAX_ERROR_MESSAGE_BYTES = 256
23
23
  MAX_ERROR_TRACEBACK_BYTES = 10240
24
- DEFAULT_PROCESSOR_SUFFIX = "main"
25
24
 
26
25
 
27
26
  class CommonParsers(object):
@@ -314,7 +313,7 @@ class DefaultErrorRoute:
314
313
  },
315
314
  }
316
315
 
317
- def format_error(exc, body=b"", debug=False, excs_to_status_codes=DEFAULT_STATUS_CODE_MAPPINGS):
316
+ def format_error(exc, body=b"", internal=False, excs_to_status_codes=DEFAULT_STATUS_CODE_MAPPINGS):
318
317
  """Generic Error Handler"""
319
318
  status_code = 500
320
319
  for exc_type, handler in excs_to_status_codes.items():
@@ -325,7 +324,7 @@ class DefaultErrorRoute:
325
324
  message = exc.message
326
325
  except:
327
326
  message = str(exc)
328
- if debug:
327
+ if internal:
329
328
  message += "\n" + str(body)
330
329
  if len(message) > MAX_ERROR_MESSAGE_BYTES:
331
330
  message = message[-MAX_ERROR_MESSAGE_BYTES:]
@@ -336,16 +335,18 @@ class DefaultErrorRoute:
336
335
  except:
337
336
  tcbk = None
338
337
  response_content = {"message": message, "status": "error"}
339
- if debug:
338
+ if internal:
339
+ response_content["exception_class"] = f"python.{type(exc).__module__}.{type(exc).__qualname__}"
340
340
  response_content["traceback"] = tcbk
341
341
  return JSONResponse(status_code=status_code, content=response_content)
342
342
 
343
- async def handle_error(body: bytes, exc: Exception, debug: bool, excs_to_status_codes: dict):
344
- return DefaultErrorRoute.format_error(exc, body, debug, excs_to_status_codes)
343
+ async def handle_error(body: bytes, exc: Exception, internal: bool, excs_to_status_codes: dict):
344
+ return DefaultErrorRoute.format_error(exc, body, internal, excs_to_status_codes)
345
345
 
346
346
  def add_default_exceptions_handler(
347
347
  fs_app,
348
- debug=False,
348
+ internal=False,
349
+ internal_prefixes=None,
349
350
  excs_to_status_codes=None,
350
351
  silenced_excs=None,
351
352
  ):
@@ -353,9 +354,13 @@ class DefaultErrorRoute:
353
354
  excs_to_status_codes = DefaultErrorRoute.DEFAULT_STATUS_CODE_MAPPINGS
354
355
  if silenced_excs is None:
355
356
  silenced_excs = DefaultErrorRoute.DEFAULT_SILENCED_EXCEPTIONS
357
+ internal_prefixes = tuple(internal_prefixes or [])
356
358
 
357
359
  async def _async_handle_error(request: Request, exc: Exception):
358
- return await DefaultErrorRoute.handle_error(b"", exc, debug, excs_to_status_codes)
360
+ is_internal = internal or (
361
+ internal_prefixes and request.url.path.startswith(internal_prefixes)
362
+ )
363
+ return await DefaultErrorRoute.handle_error(b"", exc, is_internal, excs_to_status_codes)
359
364
 
360
365
  # Added all three since FastAPI seems to intercept some exceptions before Exception
361
366
  fs_app.add_exception_handler(RequestValidationError, _async_handle_error)
@@ -385,9 +385,12 @@ class NifiContextManager(object):
385
385
  return [entity for entity in self.jsonify(self.out_entities)["content"]]
386
386
 
387
387
  def raise_exception(self, entity, exc):
388
- error_response = DefaultErrorRoute.format_error(exc)
388
+ error_response = DefaultErrorRoute.format_error(exc, internal=True)
389
+ error_body = json.loads(error_response.body)
389
390
  entity.request["exception"]["code"] = error_response.status_code
390
- entity.request["exception"]["body"] = json.loads(error_response.body)["message"]
391
+ entity.request["exception"]["body"] = error_body["message"]
392
+ entity.request["exception"]["exception_class"] = error_body.get("exception_class")
393
+ entity.request["exception"]["traceback"] = error_body.get("traceback")
391
394
  travel_dict(entity.request["nifi_attributes"], ["invokehttp", "response", "body"], "w")(
392
395
  entity.request["exception"]["body"]
393
396
  )
@@ -4,7 +4,7 @@ import math
4
4
  import multiprocessing
5
5
  import re
6
6
  from contextlib import contextmanager
7
- from typing import Optional, List, Tuple
7
+ from typing import Optional, List, Tuple, Union
8
8
 
9
9
  from iso639.exceptions import InvalidLanguageValue
10
10
  from pydantic import BaseModel, ConfigDict, Field
@@ -202,8 +202,57 @@ class FlairRecognizer(EntityRecognizer):
202
202
 
203
203
  results.append(flair_result)
204
204
 
205
+ sentences.clear_embeddings()
206
+
205
207
  return results
206
208
 
209
+ def analyze_batch(
210
+ self, texts: List[str], entities: List[str], nlp_artifacts_list: List[NlpArtifacts] = None
211
+ ) -> List[List[RecognizerResult]]:
212
+ """
213
+ Batch analyze using Flair's native Sentence batching.
214
+
215
+ Creates one Sentence per text, runs model.predict() in a single
216
+ forward pass, then extracts results per-Sentence.
217
+
218
+ Args:
219
+ texts: List of input texts
220
+ entities: Entity types to detect
221
+ nlp_artifacts_list: Ignored (Flair uses its own tokenization)
222
+
223
+ Returns:
224
+ List of lists of RecognizerResult, one inner list per input text
225
+ """
226
+ sentences = [Sentence(text) for text in texts]
227
+ self.model.predict(sentences)
228
+
229
+ if not entities:
230
+ entities = self.supported_entities
231
+
232
+ all_results = []
233
+ for sentence in sentences:
234
+ results = []
235
+ for entity in entities:
236
+ if entity not in self.supported_entities:
237
+ continue
238
+ for ent in sentence.get_spans("ner"):
239
+ if not self.__check_label(
240
+ entity, ent.labels[0].value, self.check_label_groups
241
+ ):
242
+ continue
243
+ textual_explanation = self.DEFAULT_EXPLANATION.format(
244
+ ent.labels[0].value
245
+ )
246
+ explanation = self.build_flair_explanation(
247
+ round(ent.score, 2), textual_explanation
248
+ )
249
+ flair_result = self._convert_to_recognizer_result(ent, explanation)
250
+ results.append(flair_result)
251
+ sentence.clear_embeddings()
252
+ all_results.append(results)
253
+
254
+ return all_results
255
+
207
256
  def build_flair_explanation(self, original_score: float, explanation: str) -> AnalysisExplanation:
208
257
  explanation = AnalysisExplanation(
209
258
  recognizer=self.__class__.__name__,
@@ -740,6 +789,128 @@ def compute_ner_presidio(
740
789
  return ner_objects
741
790
 
742
791
 
792
+ def _ensure_analyze_batch(recognizer):
793
+ """Monkey-patch a loop-based analyze_batch onto recognizers that lack native batch support."""
794
+ if hasattr(recognizer, 'analyze_batch'):
795
+ return
796
+ _analyze = recognizer.analyze
797
+
798
+ def analyze_batch(texts, entities, nlp_artifacts_list=None):
799
+ return [
800
+ _analyze(text, entities, nlp_artifacts_list[i] if nlp_artifacts_list else None)
801
+ for i, text in enumerate(texts)
802
+ ]
803
+
804
+ recognizer.analyze_batch = analyze_batch
805
+
806
+
807
+ def _compute_ner_batch(
808
+ texts,
809
+ language,
810
+ analyzer,
811
+ entities=None,
812
+ score_threshold=0.5,
813
+ context_width=150,
814
+ with_comentions=True,
815
+ with_context=True,
816
+ batch_size=32,
817
+ n_process=None,
818
+ ):
819
+ """
820
+ Batch NER across multiple texts using the analyzer's recognizers.
821
+
822
+ Recognizers with a native analyze_batch (e.g. FlairRecognizer) run a
823
+ single batched forward pass. Others fall back to a per-text loop via
824
+ monkey-patched analyze_batch. Per-text postprocessing (threshold,
825
+ context, co-mentions) is applied individually — each text is standalone.
826
+
827
+ Args:
828
+ texts: List of preprocessed text strings
829
+ language: Language code
830
+ analyzer: Pre-built AnalyzerEngine
831
+ entities: Optional list of entity types to detect (None = all)
832
+ score_threshold: Minimum confidence score
833
+ context_width: Character width for context and co-mention proximity
834
+ with_comentions: Include co-mentioned entities
835
+ with_context: Include surrounding context
836
+ batch_size: Batch size for NLP engine preprocessing
837
+ n_process: Number of processes for NLP engine (default 1)
838
+
839
+ Returns:
840
+ List of lists of NERObject, one inner list per input text
841
+ """
842
+ expanded_entities = expand_entities_for_analyzer(entities) if entities else None
843
+
844
+ # Batch NLP preprocessing (spaCy tokenization / NER)
845
+ with silence_logging(logging.ERROR):
846
+ nlp_artifacts_batch = list(
847
+ analyzer.nlp_engine.process_batch(
848
+ texts=texts,
849
+ language=language,
850
+ batch_size=batch_size,
851
+ )
852
+ )
853
+
854
+ batch_texts = [text for text, _ in nlp_artifacts_batch]
855
+ batch_nlp = [na for _, na in nlp_artifacts_batch]
856
+
857
+ # Ensure all recognizers have analyze_batch
858
+ with silence_logging(logging.ERROR):
859
+ recognizers = analyzer.registry.get_recognizers(
860
+ language=language,
861
+ entities=expanded_entities,
862
+ all_fields=not expanded_entities,
863
+ )
864
+
865
+ for recognizer in recognizers:
866
+ _ensure_analyze_batch(recognizer)
867
+
868
+ # Run batch analysis per recognizer
869
+ per_text_results = [[] for _ in texts]
870
+ for recognizer in recognizers:
871
+ recognizer_entities = expanded_entities
872
+ if not recognizer_entities:
873
+ recognizer_entities = recognizer.get_supported_entities()
874
+
875
+ batch_results = recognizer.analyze_batch(
876
+ batch_texts,
877
+ recognizer_entities,
878
+ batch_nlp,
879
+ )
880
+ for i, results in enumerate(batch_results):
881
+ per_text_results[i].extend(results)
882
+
883
+ # Per-text postprocessing → NERObjects
884
+ all_ner_objects = []
885
+ for text, results in zip(batch_texts, per_text_results):
886
+ ner_objects = []
887
+ for result in results:
888
+ if result.score >= score_threshold:
889
+ context_start = max(0, result.start - math.floor(context_width / 2))
890
+ context_end = min(len(text), result.end + math.ceil(context_width / 2))
891
+ context = text[context_start:context_end] if with_context else None
892
+ ner_objects.append(NERObject(
893
+ name=text[result.start:result.end],
894
+ label=normalize_presidio_label(result.entity_type),
895
+ score=float(result.score),
896
+ start=int(result.start),
897
+ count=1,
898
+ context=context,
899
+ ))
900
+ if with_comentions:
901
+ for i in range(len(ner_objects)):
902
+ entity = ner_objects[i]
903
+ comentions = [
904
+ ner_objects[j].name
905
+ for j in range(len(ner_objects))
906
+ if j != i and abs(ner_objects[j].start - entity.start) < math.ceil(context_width / 2)
907
+ ]
908
+ ner_objects[i].comentions = comentions
909
+ all_ner_objects.append(ner_objects)
910
+
911
+ return all_ner_objects
912
+
913
+
743
914
  def get_extractive_summary(text, language, max_chars, fast=False, with_scores=False):
744
915
  tokenizer = get_nltk_tokenizer(language)
745
916
  stemmer = Stemmer(language)
@@ -837,6 +1008,77 @@ def _strip_honorifics_for_ner(text: str) -> str:
837
1008
  return result
838
1009
 
839
1010
 
1011
+ def _preprocess_text_for_ner(text, language, fast, compression_ratio, preprocess_newlines):
1012
+ """Preprocess a single text for NER (newlines, honorifics, compression)."""
1013
+ if preprocess_newlines:
1014
+ text = _preprocess_newlines_for_ner(text)
1015
+ text = _strip_honorifics_for_ner(text)
1016
+
1017
+ cr = compression_ratio
1018
+ if cr == "auto":
1019
+ cr = max(1.0, len(text) / 15000) if fast else 1.0
1020
+
1021
+ if cr > 1.0:
1022
+ sentences = get_extractive_summary(
1023
+ text, language, int(len(text) / cr), fast=fast, with_scores=True
1024
+ )
1025
+ text = " ".join([s[0] for s in sentences])
1026
+
1027
+ return text
1028
+
1029
+
1030
+ def _ner_pipe_batch(
1031
+ texts,
1032
+ language,
1033
+ model,
1034
+ engine_type="spacy",
1035
+ fast=False,
1036
+ compression_ratio="auto",
1037
+ with_comentions=True,
1038
+ with_context=True,
1039
+ entities=None,
1040
+ score_threshold=0.5,
1041
+ batch_size=32,
1042
+ n_process=None,
1043
+ preprocess_newlines=True,
1044
+ _analyzer=None
1045
+ ):
1046
+ """
1047
+ Internal batch processing path for ner_pipe.
1048
+
1049
+ Uses a unified path for all engine types. Recognizers with native
1050
+ analyze_batch (FlairRecognizer) get true batched inference. Others
1051
+ fall back to a per-text loop via monkey-patched analyze_batch.
1052
+ Per-text postprocessing is applied individually.
1053
+ """
1054
+ processed_texts = []
1055
+ for t in texts:
1056
+ if not isinstance(t, str):
1057
+ raise TypeError(f"Each text must be str, not {type(t).__name__}")
1058
+ processed_texts.append(
1059
+ _preprocess_text_for_ner(t, language, fast, compression_ratio, preprocess_newlines)
1060
+ )
1061
+
1062
+ if _analyzer is None:
1063
+ _analyzer = build_presidio_analyzer(
1064
+ language=language,
1065
+ engine_type=engine_type,
1066
+ model=model,
1067
+ )
1068
+
1069
+ return _compute_ner_batch(
1070
+ processed_texts,
1071
+ language,
1072
+ _analyzer,
1073
+ entities=entities,
1074
+ score_threshold=score_threshold,
1075
+ with_comentions=with_comentions,
1076
+ with_context=with_context,
1077
+ batch_size=batch_size,
1078
+ n_process=n_process,
1079
+ )
1080
+
1081
+
840
1082
  def ner_pipe(
841
1083
  text,
842
1084
  language,
@@ -851,15 +1093,21 @@ def ner_pipe(
851
1093
  score_threshold=0.5,
852
1094
  batch_size=32,
853
1095
  n_process=None,
854
- preprocess_newlines=True
1096
+ preprocess_newlines=True,
1097
+ _analyzer=None
855
1098
  ):
856
1099
  """
857
- Run NER pipeline on text.
858
-
1100
+ Run NER pipeline on text or a batch of texts.
1101
+
859
1102
  Args:
860
- text: Input text (str). For multiple texts, iterate and call this function for each.
1103
+ text: Input text (str) or list of texts (list[str]).
1104
+ When a list is provided and engine_type is 'flair', texts are
1105
+ processed in a single batched forward pass using native Flair
1106
+ Sentence objects. For other engines, texts are processed
1107
+ individually through Presidio (still benefiting from a reused
1108
+ analyzer when called via get_ner_handler).
861
1109
  language: Language code (e.g., 'en', 'de', 'fr')
862
- model: Model name for spacy/flair engine
1110
+ model: Model name or instance for spacy/flair engine
863
1111
  engine_type: 'regex', 'flair', 'spacy' or 'custom'
864
1112
  fast: Use fast summarization for long texts
865
1113
  compression_ratio: Compression ratio for long texts ('auto' or float)
@@ -868,38 +1116,42 @@ def ner_pipe(
868
1116
  with_context: Include surrounding context
869
1117
  entities: List of entity types to detect (None = all)
870
1118
  score_threshold: Minimum confidence score
871
- batch_size: Batch size for processing
1119
+ batch_size: Batch size for processing (used as mini_batch_size for Flair)
872
1120
  n_process: Number of parallel processes
873
1121
  preprocess_newlines: Replace newlines with ' — ' to prevent entity merging
1122
+
1123
+ Returns:
1124
+ list[NERObject] when text is a str
1125
+ list[list[NERObject]] when text is a list[str]
874
1126
  """
875
1127
  if with_scores:
876
1128
  raise NotImplementedError("with_scores functionality is not implemented yet")
877
-
878
- if not isinstance(text, str):
879
- raise TypeError(f"text must be str, not {type(text).__name__}")
880
1129
 
881
- analyzer = build_presidio_analyzer(
882
- language=language,
883
- engine_type=engine_type,
884
- model=model,
885
- )
1130
+ if isinstance(text, list):
1131
+ return _ner_pipe_batch(
1132
+ text, language, model, engine_type, fast, compression_ratio,
1133
+ with_comentions=with_comentions, with_context=with_context,
1134
+ entities=entities, score_threshold=score_threshold,
1135
+ batch_size=batch_size, n_process=n_process,
1136
+ preprocess_newlines=preprocess_newlines, _analyzer=_analyzer
1137
+ )
886
1138
 
887
- if preprocess_newlines:
888
- text = _preprocess_newlines_for_ner(text)
889
- text = _strip_honorifics_for_ner(text)
1139
+ if not isinstance(text, str):
1140
+ raise TypeError(f"text must be str or list[str], not {type(text).__name__}")
890
1141
 
891
- if compression_ratio == "auto":
892
- compression_ratio = max(1.0, len(text) / 15000) if fast else 1.0
1142
+ if _analyzer is None:
1143
+ _analyzer = build_presidio_analyzer(
1144
+ language=language,
1145
+ engine_type=engine_type,
1146
+ model=model,
1147
+ )
893
1148
 
894
- if compression_ratio > 1.0:
895
- sentences = get_extractive_summary(text, language, int(len(text) / compression_ratio), fast=fast,
896
- with_scores=True)
897
- text = " ".join([s[0] for s in sentences])
1149
+ text = _preprocess_text_for_ner(text, language, fast, compression_ratio, preprocess_newlines)
898
1150
 
899
1151
  ner = compute_ner_presidio(
900
1152
  text,
901
1153
  language,
902
- analyzer,
1154
+ _analyzer,
903
1155
  engine_type,
904
1156
  entities,
905
1157
  score_threshold,
@@ -928,6 +1180,12 @@ def get_ner_handler(
928
1180
  except LookupError:
929
1181
  language = "en"
930
1182
 
1183
+ analyzer = build_presidio_analyzer(
1184
+ language=language,
1185
+ engine_type=engine_type,
1186
+ model=model,
1187
+ )
1188
+
931
1189
  return lambda text, compression_ratio="auto", with_scores=False, with_comentions=True, with_context=True: ner_pipe(
932
1190
  text,
933
1191
  language,
@@ -942,7 +1200,8 @@ def get_ner_handler(
942
1200
  score_threshold,
943
1201
  batch_size,
944
1202
  n_process,
945
- preprocess_newlines
1203
+ preprocess_newlines,
1204
+ _analyzer=analyzer
946
1205
  )
947
1206
 
948
1207