streamlit-octostar-utils 0.4.2.dev14__tar.gz → 0.4.2.dev16__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/PKG-INFO +1 -1
- {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/pyproject.toml +1 -1
- {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/api_crafter/celery.py +137 -36
- {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/api_crafter/fastapi.py +13 -8
- {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/api_crafter/nifi.py +5 -2
- {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/nlp/ner.py +285 -26
- {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/LICENSE +0 -0
- {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/README.md +0 -0
- {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/__init__.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/api_crafter/__init__.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/api_crafter/contents.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/api_crafter/parallelism.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/api_crafter/parser/__init__.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/api_crafter/parser/combine_fields.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/api_crafter/parser/entities_parser.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/api_crafter/parser/generics.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/api_crafter/parser/info.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/api_crafter/parser/linkchart_functions.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/api_crafter/parser/matches.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/api_crafter/parser/parameters.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/api_crafter/parser/rules.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/api_crafter/parser/signals.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/core/__init__.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/core/dict.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/core/filetypes.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/core/threading/__init__.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/core/threading/key_queue.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/core/timestamp.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/nlp/__init__.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/nlp/custom_recognizers.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/nlp/language.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/octostar/__init__.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/octostar/client.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/octostar/context.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/octostar/permissions.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/ontology/__init__.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/ontology/inheritance.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/ontology/relationships.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/ontology/validation.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/style/__init__.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/style/common.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/threading/__init__.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/threading/async_task_manager.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/threading/session_callback_manager.py +0 -0
- {streamlit_octostar_utils-0.4.2.dev14 → streamlit_octostar_utils-0.4.2.dev16}/streamlit_octostar_utils/threading/session_state_hot_swapper.py +0 -0
|
@@ -14,6 +14,7 @@ import atexit
|
|
|
14
14
|
import redis
|
|
15
15
|
import uuid
|
|
16
16
|
import json
|
|
17
|
+
import hashlib
|
|
17
18
|
import shutil
|
|
18
19
|
import threading
|
|
19
20
|
from pottery import Redlock
|
|
@@ -56,12 +57,14 @@ class CeleryQueueConfig:
|
|
|
56
57
|
max_tasks_in_queue=None,
|
|
57
58
|
max_tasks_per_child=None,
|
|
58
59
|
max_memory_per_child=None,
|
|
60
|
+
stall_timeout=1200,
|
|
59
61
|
**options,
|
|
60
62
|
):
|
|
61
63
|
self.n_workers = n_workers
|
|
62
64
|
self.max_tasks_in_queue = max_tasks_in_queue
|
|
63
65
|
self.max_tasks_per_child = max_tasks_per_child
|
|
64
66
|
self.max_memory_per_child = max_memory_per_child # KiB
|
|
67
|
+
self.stall_timeout = stall_timeout # seconds; None or 0 to disable
|
|
65
68
|
self.options = options
|
|
66
69
|
|
|
67
70
|
|
|
@@ -90,6 +93,11 @@ class CelerySerialized:
|
|
|
90
93
|
|
|
91
94
|
class CeleryExecutor(object):
|
|
92
95
|
class QueueFullException(Exception):
|
|
96
|
+
"""Queue is full. Try again later."""
|
|
97
|
+
pass
|
|
98
|
+
|
|
99
|
+
class QueueStalledException(Exception):
|
|
100
|
+
"""Queue has tasks but none are being consumed — workers may be deadlocked or in a crash loop."""
|
|
93
101
|
pass
|
|
94
102
|
|
|
95
103
|
AWAITING = "AWAITING"
|
|
@@ -169,6 +177,12 @@ class CeleryExecutor(object):
|
|
|
169
177
|
self.stop_event = threading.Event()
|
|
170
178
|
self.worker_health_check_thread = None
|
|
171
179
|
self.worker_info = {}
|
|
180
|
+
|
|
181
|
+
# Queue stall detection
|
|
182
|
+
self._queue_fingerprints = {}
|
|
183
|
+
self._queue_fingerprint_changed_at = {}
|
|
184
|
+
self._queue_stalled = {}
|
|
185
|
+
|
|
172
186
|
atexit.register(self.close)
|
|
173
187
|
self.set_cleanup_task()
|
|
174
188
|
self.register_state_signals()
|
|
@@ -204,9 +218,20 @@ class CeleryExecutor(object):
|
|
|
204
218
|
if self.preload_functions:
|
|
205
219
|
celery_signals.worker_process_init.connect(self.preload_on_worker_init)
|
|
206
220
|
|
|
221
|
+
def set_last_completed_time(self, sender=None, task_id=None, task=None, **kwargs):
    """Record when a task last finished on its queue.

    Writes the current UNIX time (as a string) to the Redis key
    ``queue:last_completed:<queue>``, where ``<queue>`` is the
    ``routing_key`` entry of the task's delivery info, falling back to the
    app's default routing key when that entry is absent.  Nothing is written
    when ``task`` is not supplied.

    The signature accepts ``sender``, ``task_id``, ``task`` and arbitrary
    extra keyword arguments so it can be connected as a signal receiver.

    This is best-effort bookkeeping: a failure here is never propagated to
    the caller, but it is logged at debug level instead of being silently
    discarded.
    """
    if not task:
        return
    try:
        queue = task.request.delivery_info.get(
            "routing_key", self.app.conf.task_default_routing_key
        )
        if queue:
            self.redis_client.set(f"queue:last_completed:{queue}", str(time.time()))
    except Exception:
        # Deliberately swallowed (bookkeeping must not fail the task), but
        # leave a trace so a broken Redis connection can be diagnosed.
        logger.debug(
            "Could not record last completion time for task %s", task_id, exc_info=True
        )
|
|
230
|
+
|
|
207
231
|
def register_state_signals(self):
|
|
208
232
|
celery_signals.before_task_publish.connect(self.set_awaiting_state)
|
|
209
233
|
celery_signals.task_prerun.connect(self.set_started_state)
|
|
234
|
+
celery_signals.task_postrun.connect(self.set_last_completed_time)
|
|
210
235
|
|
|
211
236
|
def cleanup_task_results(in_dir, out_dir, redis_host, redis_port, task_expires, result_expires):
|
|
212
237
|
logger.info("Starting cleanup of expired task results...")
|
|
@@ -378,37 +403,87 @@ class CeleryExecutor(object):
|
|
|
378
403
|
def _worker_health_check_loop(self):
|
|
379
404
|
while not self.stop_event.is_set():
|
|
380
405
|
try:
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
poll_result = process.poll()
|
|
384
|
-
if poll_result is not None:
|
|
385
|
-
queue_name, slot, command = self.worker_info[process]
|
|
386
|
-
logger.warning(
|
|
387
|
-
f"Worker process dead for queue '{queue_name}' slot {slot}. "
|
|
388
|
-
f"Exit code: {poll_result}. Restarting..."
|
|
389
|
-
)
|
|
390
|
-
dead_processes.append(process)
|
|
391
|
-
if self.beat_process and self.beat_process.poll() is not None:
|
|
392
|
-
logger.warning(
|
|
393
|
-
f"Beat process dead (exit code: {self.beat_process.poll()}). Restarting..."
|
|
394
|
-
)
|
|
395
|
-
self.beat_process = None
|
|
396
|
-
for dead_process in dead_processes:
|
|
397
|
-
queue_name, slot, command = self.worker_info[dead_process]
|
|
398
|
-
self.processes.remove(dead_process)
|
|
399
|
-
del self.worker_info[dead_process]
|
|
400
|
-
new_process = subprocess.Popen(command)
|
|
401
|
-
self.processes.append(new_process)
|
|
402
|
-
self.worker_info[new_process] = (queue_name, slot, command)
|
|
403
|
-
logger.info(f"Restarted worker for queue '{queue_name}' slot {slot} (PID: {new_process.pid})")
|
|
404
|
-
if self.beat_process is None:
|
|
405
|
-
self.beat_process = subprocess.Popen(self.beat_command)
|
|
406
|
-
logger.info(f"Restarted beat process (PID: {self.beat_process.pid})")
|
|
406
|
+
self._restart_dead_processes()
|
|
407
|
+
self._check_queue_stalls()
|
|
407
408
|
time.sleep(5)
|
|
408
409
|
except Exception as e:
|
|
409
410
|
logger.error(f"Error in worker health check: {e}")
|
|
410
411
|
time.sleep(5)
|
|
411
412
|
|
|
413
|
+
def _restart_dead_processes(self):
|
|
414
|
+
dead_processes = []
|
|
415
|
+
for process in self.processes:
|
|
416
|
+
poll_result = process.poll()
|
|
417
|
+
if poll_result is not None:
|
|
418
|
+
queue_name, slot, command = self.worker_info[process]
|
|
419
|
+
logger.warning(
|
|
420
|
+
f"Worker process dead for queue '{queue_name}' slot {slot}. "
|
|
421
|
+
f"Exit code: {poll_result}. Restarting..."
|
|
422
|
+
)
|
|
423
|
+
dead_processes.append(process)
|
|
424
|
+
if self.beat_process and self.beat_process.poll() is not None:
|
|
425
|
+
logger.warning(
|
|
426
|
+
f"Beat process dead (exit code: {self.beat_process.poll()}). Restarting..."
|
|
427
|
+
)
|
|
428
|
+
self.beat_process = None
|
|
429
|
+
for dead_process in dead_processes:
|
|
430
|
+
queue_name, slot, command = self.worker_info[dead_process]
|
|
431
|
+
self.processes.remove(dead_process)
|
|
432
|
+
del self.worker_info[dead_process]
|
|
433
|
+
new_process = subprocess.Popen(command)
|
|
434
|
+
self.processes.append(new_process)
|
|
435
|
+
self.worker_info[new_process] = (queue_name, slot, command)
|
|
436
|
+
logger.info(f"Restarted worker for queue '{queue_name}' slot {slot} (PID: {new_process.pid})")
|
|
437
|
+
if self.beat_process is None:
|
|
438
|
+
self.beat_process = subprocess.Popen(self.beat_command)
|
|
439
|
+
logger.info(f"Restarted beat process (PID: {self.beat_process.pid})")
|
|
440
|
+
|
|
441
|
+
def _check_queue_stalls(self):
|
|
442
|
+
for queue_name, queue_config in self.queue_config.items():
|
|
443
|
+
if not queue_config.stall_timeout:
|
|
444
|
+
continue
|
|
445
|
+
try:
|
|
446
|
+
queue_items = self.redis_client.lrange(queue_name, 0, -1)
|
|
447
|
+
if len(queue_items) == 0:
|
|
448
|
+
self._queue_stalled[queue_name] = False
|
|
449
|
+
self._queue_fingerprints.pop(queue_name, None)
|
|
450
|
+
self._queue_fingerprint_changed_at.pop(queue_name, None)
|
|
451
|
+
continue
|
|
452
|
+
|
|
453
|
+
fingerprint = hashlib.md5(b"".join(sorted(queue_items))).hexdigest()
|
|
454
|
+
now_time = time.time()
|
|
455
|
+
prev_fingerprint = self._queue_fingerprints.get(queue_name)
|
|
456
|
+
|
|
457
|
+
if fingerprint != prev_fingerprint:
|
|
458
|
+
self._queue_fingerprints[queue_name] = fingerprint
|
|
459
|
+
self._queue_fingerprint_changed_at[queue_name] = now_time
|
|
460
|
+
self._queue_stalled[queue_name] = False
|
|
461
|
+
continue
|
|
462
|
+
|
|
463
|
+
fingerprint_age = now_time - self._queue_fingerprint_changed_at.get(queue_name, now_time)
|
|
464
|
+
|
|
465
|
+
last_completed_raw = self.redis_client.get(f"queue:last_completed:{queue_name}")
|
|
466
|
+
last_completed = float(last_completed_raw) if last_completed_raw else 0
|
|
467
|
+
time_since_completion = (now_time - last_completed) if last_completed else float("inf")
|
|
468
|
+
|
|
469
|
+
was_stalled = self._queue_stalled.get(queue_name, False)
|
|
470
|
+
is_stalled = (
|
|
471
|
+
fingerprint_age >= queue_config.stall_timeout
|
|
472
|
+
and time_since_completion >= queue_config.stall_timeout
|
|
473
|
+
)
|
|
474
|
+
self._queue_stalled[queue_name] = is_stalled
|
|
475
|
+
|
|
476
|
+
if is_stalled and not was_stalled:
|
|
477
|
+
logger.error(
|
|
478
|
+
f"Queue '{queue_name}' is STALLED: {len(queue_items)} task(s) stuck for "
|
|
479
|
+
f"{fingerprint_age:.0f}s with no completions in {time_since_completion:.0f}s. "
|
|
480
|
+
f"New requests will receive 503."
|
|
481
|
+
)
|
|
482
|
+
elif not is_stalled and was_stalled:
|
|
483
|
+
logger.info(f"Queue '{queue_name}' has recovered from stall.")
|
|
484
|
+
except Exception as e:
|
|
485
|
+
logger.error(f"Error checking stall for queue '{queue_name}': {e}")
|
|
486
|
+
|
|
412
487
|
def close(self):
|
|
413
488
|
self.stop_event.set()
|
|
414
489
|
if self.worker_health_check_thread and self.worker_health_check_thread.is_alive():
|
|
@@ -507,6 +582,10 @@ class CeleryExecutor(object):
|
|
|
507
582
|
self.app.conf.dev_preload = True
|
|
508
583
|
|
|
509
584
|
def _reserve_queue_slot(queue_name):
|
|
585
|
+
if self._queue_stalled.get(queue_name, False):
|
|
586
|
+
raise CeleryExecutor.QueueStalledException(
|
|
587
|
+
f"Queue '{queue_name}' is stalled. Service temporarily unavailable."
|
|
588
|
+
)
|
|
510
589
|
limit = self.queue_config[queue_name].max_tasks_in_queue
|
|
511
590
|
if limit:
|
|
512
591
|
reservation_key = f"queue:reserved:{queue_name}"
|
|
@@ -540,6 +619,12 @@ class CeleryExecutor(object):
|
|
|
540
619
|
def _send_task(task_fn, task_id, options):
|
|
541
620
|
task_fn.apply_async(task_id=task_id, **options)
|
|
542
621
|
|
|
622
|
+
def _store_task_queue_mapping(task_id, queue_name):
|
|
623
|
+
self.redis_client.set(
|
|
624
|
+
f"task:queue:{task_id}", queue_name,
|
|
625
|
+
ex=self.app.conf.result_expires,
|
|
626
|
+
)
|
|
627
|
+
|
|
543
628
|
task_id = str(uuid.uuid4())
|
|
544
629
|
queue_name = self.app.conf.task_default_routing_key
|
|
545
630
|
queue_name = getattr(task_fn, "queue", queue_name)
|
|
@@ -560,6 +645,9 @@ class CeleryExecutor(object):
|
|
|
560
645
|
await asyncio.get_running_loop().run_in_executor(
|
|
561
646
|
self.set_thread_pool, _send_task, task_fn, task_id, options
|
|
562
647
|
)
|
|
648
|
+
await asyncio.get_running_loop().run_in_executor(
|
|
649
|
+
self.set_thread_pool, _store_task_queue_mapping, task_id, queue_name
|
|
650
|
+
)
|
|
563
651
|
except asyncio.CancelledError:
|
|
564
652
|
logger.info(f"Cancelling task {task_id} due to disconnect!")
|
|
565
653
|
await self.terminate_task(task_id)
|
|
@@ -596,6 +684,14 @@ class CeleryExecutor(object):
|
|
|
596
684
|
def _poll_task_state(celery_app, task_id):
|
|
597
685
|
task = celery_app.AsyncResult(task_id)
|
|
598
686
|
ready, state = task.ready(), task.state
|
|
687
|
+
if not ready and state == CeleryExecutor.AWAITING:
|
|
688
|
+
task_queue = self.redis_client.get(f"task:queue:{task_id}")
|
|
689
|
+
if task_queue:
|
|
690
|
+
queue_name = task_queue.decode() if isinstance(task_queue, bytes) else task_queue
|
|
691
|
+
if self._queue_stalled.get(queue_name, False):
|
|
692
|
+
raise CeleryExecutor.QueueStalledException(
|
|
693
|
+
f"Task {task_id} is in stalled queue '{queue_name}'. Service temporarily unavailable."
|
|
694
|
+
)
|
|
599
695
|
return ready, state
|
|
600
696
|
|
|
601
697
|
return await asyncio.get_running_loop().run_in_executor(
|
|
@@ -704,7 +800,7 @@ class FastAPICeleryTaskRoute(Route):
|
|
|
704
800
|
or (state not in ["SUCCESS", "FAILURE", "RETRY", "REVOKED"])
|
|
705
801
|
)
|
|
706
802
|
if state in ["FAILURE", "RETRY", "REVOKED"]:
|
|
707
|
-
error_response = DefaultErrorRoute.format_error(exc,
|
|
803
|
+
error_response = DefaultErrorRoute.format_error(exc, internal=True).body.decode("utf-8")
|
|
708
804
|
data = {
|
|
709
805
|
"task_state": state,
|
|
710
806
|
"task_id": task_id,
|
|
@@ -747,28 +843,33 @@ class CeleryRoute(Route, ABC):
|
|
|
747
843
|
|
|
748
844
|
|
|
749
845
|
class CeleryErrorRoute(DefaultErrorRoute):
|
|
750
|
-
DEFAULT_STATUS_CODE_MAPPINGS = {
|
|
751
|
-
|
|
846
|
+
DEFAULT_STATUS_CODE_MAPPINGS = {
|
|
847
|
+
CeleryExecutor.QueueFullException: lambda exc: 429,
|
|
848
|
+
CeleryExecutor.QueueStalledException: lambda exc: 503,
|
|
849
|
+
}
|
|
850
|
+
DEFAULT_SILENCED_EXCEPTIONS = {
|
|
851
|
+
CeleryExecutor.QueueFullException: lambda exc: True,
|
|
852
|
+
}
|
|
752
853
|
|
|
753
854
|
def add_default_exceptions_handler(
|
|
754
855
|
fs_app,
|
|
755
|
-
|
|
856
|
+
internal=False,
|
|
857
|
+
internal_prefixes=None,
|
|
756
858
|
excs_to_status_codes=None,
|
|
757
859
|
silenced_excs=None,
|
|
758
860
|
):
|
|
759
|
-
extra_status = {CeleryExecutor.QueueFullException: lambda exc: 429}
|
|
760
|
-
extra_silence = {CeleryExecutor.QueueFullException: lambda exc: True}
|
|
761
|
-
|
|
762
861
|
status_codes = {
|
|
763
862
|
**DefaultErrorRoute.DEFAULT_STATUS_CODE_MAPPINGS,
|
|
863
|
+
**CeleryErrorRoute.DEFAULT_STATUS_CODE_MAPPINGS,
|
|
764
864
|
**(excs_to_status_codes or {}),
|
|
765
|
-
**extra_status,
|
|
766
865
|
}
|
|
767
866
|
|
|
768
867
|
silenced = {
|
|
769
868
|
**DefaultErrorRoute.DEFAULT_SILENCED_EXCEPTIONS,
|
|
869
|
+
**CeleryErrorRoute.DEFAULT_SILENCED_EXCEPTIONS,
|
|
770
870
|
**(silenced_excs or {}),
|
|
771
|
-
**extra_silence,
|
|
772
871
|
}
|
|
773
872
|
|
|
774
|
-
super(CeleryErrorRoute, CeleryErrorRoute).add_default_exceptions_handler(
|
|
873
|
+
super(CeleryErrorRoute, CeleryErrorRoute).add_default_exceptions_handler(
|
|
874
|
+
fs_app, internal, internal_prefixes, status_codes, silenced,
|
|
875
|
+
)
|
|
@@ -21,7 +21,6 @@ from octostar.client import make_client
|
|
|
21
21
|
|
|
22
22
|
MAX_ERROR_MESSAGE_BYTES = 256
|
|
23
23
|
MAX_ERROR_TRACEBACK_BYTES = 10240
|
|
24
|
-
DEFAULT_PROCESSOR_SUFFIX = "main"
|
|
25
24
|
|
|
26
25
|
|
|
27
26
|
class CommonParsers(object):
|
|
@@ -314,7 +313,7 @@ class DefaultErrorRoute:
|
|
|
314
313
|
},
|
|
315
314
|
}
|
|
316
315
|
|
|
317
|
-
def format_error(exc, body=b"",
|
|
316
|
+
def format_error(exc, body=b"", internal=False, excs_to_status_codes=DEFAULT_STATUS_CODE_MAPPINGS):
|
|
318
317
|
"""Generic Error Handler"""
|
|
319
318
|
status_code = 500
|
|
320
319
|
for exc_type, handler in excs_to_status_codes.items():
|
|
@@ -325,7 +324,7 @@ class DefaultErrorRoute:
|
|
|
325
324
|
message = exc.message
|
|
326
325
|
except:
|
|
327
326
|
message = str(exc)
|
|
328
|
-
if
|
|
327
|
+
if internal:
|
|
329
328
|
message += "\n" + str(body)
|
|
330
329
|
if len(message) > MAX_ERROR_MESSAGE_BYTES:
|
|
331
330
|
message = message[-MAX_ERROR_MESSAGE_BYTES:]
|
|
@@ -336,16 +335,18 @@ class DefaultErrorRoute:
|
|
|
336
335
|
except:
|
|
337
336
|
tcbk = None
|
|
338
337
|
response_content = {"message": message, "status": "error"}
|
|
339
|
-
if
|
|
338
|
+
if internal:
|
|
339
|
+
response_content["exception_class"] = f"python.{type(exc).__module__}.{type(exc).__qualname__}"
|
|
340
340
|
response_content["traceback"] = tcbk
|
|
341
341
|
return JSONResponse(status_code=status_code, content=response_content)
|
|
342
342
|
|
|
343
|
-
async def handle_error(body: bytes, exc: Exception,
|
|
344
|
-
return DefaultErrorRoute.format_error(exc, body,
|
|
343
|
+
async def handle_error(body: bytes, exc: Exception, internal: bool, excs_to_status_codes: dict):
|
|
344
|
+
return DefaultErrorRoute.format_error(exc, body, internal, excs_to_status_codes)
|
|
345
345
|
|
|
346
346
|
def add_default_exceptions_handler(
|
|
347
347
|
fs_app,
|
|
348
|
-
|
|
348
|
+
internal=False,
|
|
349
|
+
internal_prefixes=None,
|
|
349
350
|
excs_to_status_codes=None,
|
|
350
351
|
silenced_excs=None,
|
|
351
352
|
):
|
|
@@ -353,9 +354,13 @@ class DefaultErrorRoute:
|
|
|
353
354
|
excs_to_status_codes = DefaultErrorRoute.DEFAULT_STATUS_CODE_MAPPINGS
|
|
354
355
|
if silenced_excs is None:
|
|
355
356
|
silenced_excs = DefaultErrorRoute.DEFAULT_SILENCED_EXCEPTIONS
|
|
357
|
+
internal_prefixes = tuple(internal_prefixes or [])
|
|
356
358
|
|
|
357
359
|
async def _async_handle_error(request: Request, exc: Exception):
|
|
358
|
-
|
|
360
|
+
is_internal = internal or (
|
|
361
|
+
internal_prefixes and request.url.path.startswith(internal_prefixes)
|
|
362
|
+
)
|
|
363
|
+
return await DefaultErrorRoute.handle_error(b"", exc, is_internal, excs_to_status_codes)
|
|
359
364
|
|
|
360
365
|
# Added all three since FastAPI seems to intercept some exceptions before Exception
|
|
361
366
|
fs_app.add_exception_handler(RequestValidationError, _async_handle_error)
|
|
@@ -385,9 +385,12 @@ class NifiContextManager(object):
|
|
|
385
385
|
return [entity for entity in self.jsonify(self.out_entities)["content"]]
|
|
386
386
|
|
|
387
387
|
def raise_exception(self, entity, exc):
|
|
388
|
-
error_response = DefaultErrorRoute.format_error(exc)
|
|
388
|
+
error_response = DefaultErrorRoute.format_error(exc, internal=True)
|
|
389
|
+
error_body = json.loads(error_response.body)
|
|
389
390
|
entity.request["exception"]["code"] = error_response.status_code
|
|
390
|
-
entity.request["exception"]["body"] =
|
|
391
|
+
entity.request["exception"]["body"] = error_body["message"]
|
|
392
|
+
entity.request["exception"]["exception_class"] = error_body.get("exception_class")
|
|
393
|
+
entity.request["exception"]["traceback"] = error_body.get("traceback")
|
|
391
394
|
travel_dict(entity.request["nifi_attributes"], ["invokehttp", "response", "body"], "w")(
|
|
392
395
|
entity.request["exception"]["body"]
|
|
393
396
|
)
|
|
@@ -4,7 +4,7 @@ import math
|
|
|
4
4
|
import multiprocessing
|
|
5
5
|
import re
|
|
6
6
|
from contextlib import contextmanager
|
|
7
|
-
from typing import Optional, List, Tuple
|
|
7
|
+
from typing import Optional, List, Tuple, Union
|
|
8
8
|
|
|
9
9
|
from iso639.exceptions import InvalidLanguageValue
|
|
10
10
|
from pydantic import BaseModel, ConfigDict, Field
|
|
@@ -202,8 +202,57 @@ class FlairRecognizer(EntityRecognizer):
|
|
|
202
202
|
|
|
203
203
|
results.append(flair_result)
|
|
204
204
|
|
|
205
|
+
sentences.clear_embeddings()
|
|
206
|
+
|
|
205
207
|
return results
|
|
206
208
|
|
|
209
|
+
def analyze_batch(
    self, texts: List[str], entities: List[str], nlp_artifacts_list: List[NlpArtifacts] = None
) -> List[List[RecognizerResult]]:
    """
    Batch analyze using Flair's native Sentence batching.

    Creates one Sentence per text, passes all of them to model.predict()
    in a single call, then extracts results per-Sentence.

    Args:
        texts: List of input texts
        entities: Entity types to detect; when empty, all supported
            entities are used. Unsupported entries are skipped.
        nlp_artifacts_list: Ignored (not read by this method)

    Returns:
        List of lists of RecognizerResult, one inner list per input text
    """
    if not texts:
        # Nothing to predict; avoid handing the model an empty batch.
        return []

    sentences = [Sentence(text) for text in texts]
    self.model.predict(sentences)

    if not entities:
        entities = self.supported_entities
    # The supported-entity filter does not depend on the sentence, so it is
    # computed once rather than once per sentence.
    requested_entities = [
        entity for entity in entities if entity in self.supported_entities
    ]

    all_results = []
    for sentence in sentences:
        # Fetch the spans once per sentence instead of once per entity.
        spans = sentence.get_spans("ner")
        results = []
        for entity in requested_entities:
            for ent in spans:
                label = ent.labels[0].value
                if not self.__check_label(entity, label, self.check_label_groups):
                    continue
                textual_explanation = self.DEFAULT_EXPLANATION.format(label)
                explanation = self.build_flair_explanation(
                    round(ent.score, 2), textual_explanation
                )
                flair_result = self._convert_to_recognizer_result(ent, explanation)
                results.append(flair_result)
        # Drop the embeddings attached to the sentence once results are extracted.
        sentence.clear_embeddings()
        all_results.append(results)

    return all_results
|
|
255
|
+
|
|
207
256
|
def build_flair_explanation(self, original_score: float, explanation: str) -> AnalysisExplanation:
|
|
208
257
|
explanation = AnalysisExplanation(
|
|
209
258
|
recognizer=self.__class__.__name__,
|
|
@@ -740,6 +789,128 @@ def compute_ner_presidio(
|
|
|
740
789
|
return ner_objects
|
|
741
790
|
|
|
742
791
|
|
|
792
|
+
def _ensure_analyze_batch(recognizer):
|
|
793
|
+
"""Monkey-patch a loop-based analyze_batch onto recognizers that lack native batch support."""
|
|
794
|
+
if hasattr(recognizer, 'analyze_batch'):
|
|
795
|
+
return
|
|
796
|
+
_analyze = recognizer.analyze
|
|
797
|
+
|
|
798
|
+
def analyze_batch(texts, entities, nlp_artifacts_list=None):
|
|
799
|
+
return [
|
|
800
|
+
_analyze(text, entities, nlp_artifacts_list[i] if nlp_artifacts_list else None)
|
|
801
|
+
for i, text in enumerate(texts)
|
|
802
|
+
]
|
|
803
|
+
|
|
804
|
+
recognizer.analyze_batch = analyze_batch
|
|
805
|
+
|
|
806
|
+
|
|
807
|
+
def _compute_ner_batch(
    texts,
    language,
    analyzer,
    entities=None,
    score_threshold=0.5,
    context_width=150,
    with_comentions=True,
    with_context=True,
    batch_size=32,
    n_process=None,
):
    """
    Batch NER across multiple texts using the analyzer's recognizers.

    Recognizers with a native analyze_batch (e.g. FlairRecognizer) are
    called once for the whole batch. Others fall back to a per-text loop
    via monkey-patched analyze_batch. Per-text postprocessing (threshold,
    context, co-mentions) is applied individually — each text is standalone.

    Args:
        texts: List of preprocessed text strings
        language: Language code
        analyzer: Pre-built AnalyzerEngine
        entities: Optional list of entity types to detect (None = all)
        score_threshold: Minimum confidence score
        context_width: Character width for context and co-mention proximity
        with_comentions: Include co-mentioned entities
        with_context: Include surrounding context
        batch_size: Batch size for NLP engine preprocessing
        n_process: Number of processes for NLP engine; forwarded to the
            engine only when given, otherwise the engine's own default applies

    Returns:
        List of lists of NERObject, one inner list per input text
    """
    expanded_entities = expand_entities_for_analyzer(entities) if entities else None

    # Forward n_process only when the caller set it, so the default call is
    # unchanged. NOTE(review): assumes the engine's process_batch accepts an
    # n_process keyword — confirm for every supported engine type.
    engine_kwargs = {"batch_size": batch_size}
    if n_process is not None:
        engine_kwargs["n_process"] = n_process

    # Batch NLP preprocessing through the analyzer's NLP engine
    with silence_logging(logging.ERROR):
        nlp_artifacts_batch = list(
            analyzer.nlp_engine.process_batch(
                texts=texts,
                language=language,
                **engine_kwargs,
            )
        )

    batch_texts = [text for text, _ in nlp_artifacts_batch]
    batch_nlp = [na for _, na in nlp_artifacts_batch]

    # Ensure all recognizers have analyze_batch
    with silence_logging(logging.ERROR):
        recognizers = analyzer.registry.get_recognizers(
            language=language,
            entities=expanded_entities,
            all_fields=not expanded_entities,
        )

    for recognizer in recognizers:
        _ensure_analyze_batch(recognizer)

    # Run batch analysis per recognizer
    per_text_results = [[] for _ in texts]
    for recognizer in recognizers:
        recognizer_entities = expanded_entities or recognizer.get_supported_entities()
        batch_results = recognizer.analyze_batch(
            batch_texts,
            recognizer_entities,
            batch_nlp,
        )
        for i, results in enumerate(batch_results):
            per_text_results[i].extend(results)

    # The context window is split around the entity; both halves are
    # loop-invariant, so compute them once.
    left_width = math.floor(context_width / 2)
    right_width = math.ceil(context_width / 2)

    # Per-text postprocessing → NERObjects
    all_ner_objects = []
    for text, results in zip(batch_texts, per_text_results):
        ner_objects = []
        for result in results:
            if result.score < score_threshold:
                continue
            context = None
            if with_context:
                context_start = max(0, result.start - left_width)
                context_end = min(len(text), result.end + right_width)
                context = text[context_start:context_end]
            ner_objects.append(NERObject(
                name=text[result.start:result.end],
                label=normalize_presidio_label(result.entity_type),
                score=float(result.score),
                start=int(result.start),
                count=1,
                context=context,
            ))
        if with_comentions:
            # Co-mentions: every OTHER entity of the same text whose start
            # offset lies within half the context width of this one.
            for i, entity in enumerate(ner_objects):
                entity.comentions = [
                    other.name
                    for j, other in enumerate(ner_objects)
                    if j != i and abs(other.start - entity.start) < right_width
                ]
        all_ner_objects.append(ner_objects)

    return all_ner_objects
|
|
912
|
+
|
|
913
|
+
|
|
743
914
|
def get_extractive_summary(text, language, max_chars, fast=False, with_scores=False):
|
|
744
915
|
tokenizer = get_nltk_tokenizer(language)
|
|
745
916
|
stemmer = Stemmer(language)
|
|
@@ -837,6 +1008,77 @@ def _strip_honorifics_for_ner(text: str) -> str:
|
|
|
837
1008
|
return result
|
|
838
1009
|
|
|
839
1010
|
|
|
1011
|
+
def _preprocess_text_for_ner(text, language, fast, compression_ratio, preprocess_newlines):
    """Prepare one text for NER: newline handling, honorific stripping, compression.

    Newline preprocessing runs only when ``preprocess_newlines`` is truthy;
    honorific stripping always runs.  A ``compression_ratio`` of ``"auto"``
    resolves to ``max(1.0, len(text) / 15000)`` in fast mode and to ``1.0``
    otherwise.  When the resulting ratio is greater than 1, the text is
    replaced by the space-joined sentences of an extractive summary limited
    to ``int(len(text) / ratio)`` characters.

    Returns:
        The preprocessed text.
    """
    cleaned = _preprocess_newlines_for_ner(text) if preprocess_newlines else text
    cleaned = _strip_honorifics_for_ner(cleaned)

    if compression_ratio == "auto":
        ratio = max(1.0, len(cleaned) / 15000) if fast else 1.0
    else:
        ratio = compression_ratio

    if ratio > 1.0:
        max_chars = int(len(cleaned) / ratio)
        scored_sentences = get_extractive_summary(
            cleaned, language, max_chars, fast=fast, with_scores=True
        )
        # Each entry's first element is the sentence text.
        cleaned = " ".join([entry[0] for entry in scored_sentences])

    return cleaned
|
|
1028
|
+
|
|
1029
|
+
|
|
1030
|
+
def _ner_pipe_batch(
    texts,
    language,
    model,
    engine_type="spacy",
    fast=False,
    compression_ratio="auto",
    with_comentions=True,
    with_context=True,
    entities=None,
    score_threshold=0.5,
    batch_size=32,
    n_process=None,
    preprocess_newlines=True,
    _analyzer=None
):
    """
    Batch code path behind ner_pipe.

    Every engine type goes through the same route. Recognizers that ship a
    native analyze_batch (FlairRecognizer) run real batched inference; the
    rest use a per-text loop through a monkey-patched analyze_batch.
    Preprocessing and postprocessing are still done text by text.

    Args:
        texts: Iterable of input strings.
        language: Language code used for preprocessing and analysis.
        model: Model handed to ``build_presidio_analyzer`` when no
            analyzer is supplied.
        engine_type: Engine type handed to ``build_presidio_analyzer``.
        fast: Forwarded to ``_preprocess_text_for_ner``.
        compression_ratio: Forwarded to ``_preprocess_text_for_ner``.
        with_comentions: Forwarded to ``_compute_ner_batch``.
        with_context: Forwarded to ``_compute_ner_batch``.
        entities: Forwarded to ``_compute_ner_batch``.
        score_threshold: Forwarded to ``_compute_ner_batch``.
        batch_size: Forwarded to ``_compute_ner_batch``.
        n_process: Forwarded to ``_compute_ner_batch``.
        preprocess_newlines: Forwarded to ``_preprocess_text_for_ner``.
        _analyzer: Optional pre-built analyzer; one is built only when
            this is None.

    Returns:
        Whatever ``_compute_ner_batch`` returns for the preprocessed texts.

    Raises:
        TypeError: If any element of ``texts`` is not a str.
    """

    def _prepare(item):
        # Validate and preprocess one element; elements are handled strictly
        # in input order, so a bad element raises only after the earlier
        # ones have been preprocessed.
        if isinstance(item, str):
            return _preprocess_text_for_ner(
                item, language, fast, compression_ratio, preprocess_newlines
            )
        raise TypeError(f"Each text must be str, not {type(item).__name__}")

    processed_texts = [_prepare(item) for item in texts]

    # Reuse the caller's analyzer when given; otherwise build one after
    # preprocessing has finished.
    analyzer = (
        _analyzer
        if _analyzer is not None
        else build_presidio_analyzer(
            language=language,
            engine_type=engine_type,
            model=model,
        )
    )

    batch_options = {
        "entities": entities,
        "score_threshold": score_threshold,
        "with_comentions": with_comentions,
        "with_context": with_context,
        "batch_size": batch_size,
        "n_process": n_process,
    }
    return _compute_ner_batch(processed_texts, language, analyzer, **batch_options)
|
|
1080
|
+
|
|
1081
|
+
|
|
840
1082
|
def ner_pipe(
|
|
841
1083
|
text,
|
|
842
1084
|
language,
|
|
@@ -851,15 +1093,21 @@ def ner_pipe(
|
|
|
851
1093
|
score_threshold=0.5,
|
|
852
1094
|
batch_size=32,
|
|
853
1095
|
n_process=None,
|
|
854
|
-
preprocess_newlines=True
|
|
1096
|
+
preprocess_newlines=True,
|
|
1097
|
+
_analyzer=None
|
|
855
1098
|
):
|
|
856
1099
|
"""
|
|
857
|
-
Run NER pipeline on text.
|
|
858
|
-
|
|
1100
|
+
Run NER pipeline on text or a batch of texts.
|
|
1101
|
+
|
|
859
1102
|
Args:
|
|
860
|
-
text: Input text (str)
|
|
1103
|
+
text: Input text (str) or list of texts (list[str]).
|
|
1104
|
+
When a list is provided and engine_type is 'flair', texts are
|
|
1105
|
+
processed in a single batched forward pass using native Flair
|
|
1106
|
+
Sentence objects. For other engines, texts are processed
|
|
1107
|
+
individually through Presidio (still benefiting from a reused
|
|
1108
|
+
analyzer when called via get_ner_handler).
|
|
861
1109
|
language: Language code (e.g., 'en', 'de', 'fr')
|
|
862
|
-
model: Model name for spacy/flair engine
|
|
1110
|
+
model: Model name or instance for spacy/flair engine
|
|
863
1111
|
engine_type: 'regex', 'flair', 'spacy' or 'custom'
|
|
864
1112
|
fast: Use fast summarization for long texts
|
|
865
1113
|
compression_ratio: Compression ratio for long texts ('auto' or float)
|
|
@@ -868,38 +1116,42 @@ def ner_pipe(
|
|
|
868
1116
|
with_context: Include surrounding context
|
|
869
1117
|
entities: List of entity types to detect (None = all)
|
|
870
1118
|
score_threshold: Minimum confidence score
|
|
871
|
-
batch_size: Batch size for processing
|
|
1119
|
+
batch_size: Batch size for processing (used as mini_batch_size for Flair)
|
|
872
1120
|
n_process: Number of parallel processes
|
|
873
1121
|
preprocess_newlines: Replace newlines with ' — ' to prevent entity merging
|
|
1122
|
+
|
|
1123
|
+
Returns:
|
|
1124
|
+
list[NERObject] when text is a str
|
|
1125
|
+
list[list[NERObject]] when text is a list[str]
|
|
874
1126
|
"""
|
|
875
1127
|
if with_scores:
|
|
876
1128
|
raise NotImplementedError("with_scores functionality is not implemented yet")
|
|
877
|
-
|
|
878
|
-
if not isinstance(text, str):
|
|
879
|
-
raise TypeError(f"text must be str, not {type(text).__name__}")
|
|
880
1129
|
|
|
881
|
-
|
|
882
|
-
|
|
883
|
-
|
|
884
|
-
|
|
885
|
-
|
|
1130
|
+
if isinstance(text, list):
|
|
1131
|
+
return _ner_pipe_batch(
|
|
1132
|
+
text, language, model, engine_type, fast, compression_ratio,
|
|
1133
|
+
with_comentions=with_comentions, with_context=with_context,
|
|
1134
|
+
entities=entities, score_threshold=score_threshold,
|
|
1135
|
+
batch_size=batch_size, n_process=n_process,
|
|
1136
|
+
preprocess_newlines=preprocess_newlines, _analyzer=_analyzer
|
|
1137
|
+
)
|
|
886
1138
|
|
|
887
|
-
if
|
|
888
|
-
text
|
|
889
|
-
text = _strip_honorifics_for_ner(text)
|
|
1139
|
+
if not isinstance(text, str):
|
|
1140
|
+
raise TypeError(f"text must be str or list[str], not {type(text).__name__}")
|
|
890
1141
|
|
|
891
|
-
if
|
|
892
|
-
|
|
1142
|
+
if _analyzer is None:
|
|
1143
|
+
_analyzer = build_presidio_analyzer(
|
|
1144
|
+
language=language,
|
|
1145
|
+
engine_type=engine_type,
|
|
1146
|
+
model=model,
|
|
1147
|
+
)
|
|
893
1148
|
|
|
894
|
-
|
|
895
|
-
sentences = get_extractive_summary(text, language, int(len(text) / compression_ratio), fast=fast,
|
|
896
|
-
with_scores=True)
|
|
897
|
-
text = " ".join([s[0] for s in sentences])
|
|
1149
|
+
text = _preprocess_text_for_ner(text, language, fast, compression_ratio, preprocess_newlines)
|
|
898
1150
|
|
|
899
1151
|
ner = compute_ner_presidio(
|
|
900
1152
|
text,
|
|
901
1153
|
language,
|
|
902
|
-
|
|
1154
|
+
_analyzer,
|
|
903
1155
|
engine_type,
|
|
904
1156
|
entities,
|
|
905
1157
|
score_threshold,
|
|
@@ -928,6 +1180,12 @@ def get_ner_handler(
|
|
|
928
1180
|
except LookupError:
|
|
929
1181
|
language = "en"
|
|
930
1182
|
|
|
1183
|
+
analyzer = build_presidio_analyzer(
|
|
1184
|
+
language=language,
|
|
1185
|
+
engine_type=engine_type,
|
|
1186
|
+
model=model,
|
|
1187
|
+
)
|
|
1188
|
+
|
|
931
1189
|
return lambda text, compression_ratio="auto", with_scores=False, with_comentions=True, with_context=True: ner_pipe(
|
|
932
1190
|
text,
|
|
933
1191
|
language,
|
|
@@ -942,7 +1200,8 @@ def get_ner_handler(
|
|
|
942
1200
|
score_threshold,
|
|
943
1201
|
batch_size,
|
|
944
1202
|
n_process,
|
|
945
|
-
preprocess_newlines
|
|
1203
|
+
preprocess_newlines,
|
|
1204
|
+
_analyzer=analyzer
|
|
946
1205
|
)
|
|
947
1206
|
|
|
948
1207
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|