lexiredact 0.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lexiredact/README.md +9 -0
- lexiredact/__init__.py +41 -0
- lexiredact/adapters/__init__.py +5 -0
- lexiredact/adapters/chunk_adapter.py +93 -0
- lexiredact/app_logging.py +57 -0
- lexiredact/cache/__init__.py +11 -0
- lexiredact/cache/redis_cache.py +98 -0
- lexiredact/cli.py +489 -0
- lexiredact/config/__init__.py +6 -0
- lexiredact/config/loader.py +81 -0
- lexiredact/config/schema.py +112 -0
- lexiredact/exceptions.py +57 -0
- lexiredact/models/__init__.py +6 -0
- lexiredact/models/chunk.py +40 -0
- lexiredact/models/result.py +88 -0
- lexiredact/pipeline/__init__.py +10 -0
- lexiredact/pipeline/embedder/__init__.py +37 -0
- lexiredact/pipeline/embedder/base.py +95 -0
- lexiredact/pipeline/embedder/default.py +13 -0
- lexiredact/pipeline/embedder/huggingface.py +213 -0
- lexiredact/pipeline/embedder/registry.py +55 -0
- lexiredact/pipeline/embedder/sentence_transformers.py +172 -0
- lexiredact/pipeline/orchestrator.py +310 -0
- lexiredact/pipeline/pii/__init__.py +15 -0
- lexiredact/pipeline/pii/detector.py +133 -0
- lexiredact/pipeline/pii/engine_factory.py +91 -0
- lexiredact/pipeline/pii/redactor.py +78 -0
- lexiredact/pipeline/store/__init__.py +23 -0
- lexiredact/pipeline/store/base.py +61 -0
- lexiredact/pipeline/store/chroma.py +122 -0
- lexiredact/pipeline_api.py +90 -0
- lexiredact-0.0.2.dist-info/METADATA +95 -0
- lexiredact-0.0.2.dist-info/RECORD +36 -0
- lexiredact-0.0.2.dist-info/WHEEL +4 -0
- lexiredact-0.0.2.dist-info/entry_points.txt +2 -0
- lexiredact-0.0.2.dist-info/licenses/LICENSE +21 -0
lexiredact/README.md
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
# Lexiredact
|
|
2
|
+
|
|
3
|
+
Privacy-preserving RAG ingestion middleware with dual-pipeline processing.
|
|
4
|
+
|
|
5
|
+
```bash
|
|
6
|
+
pip install -e ".[dev]" # core + dev tools
|
|
7
|
+
pip install -e ".[pii,embed]" # add Presidio + sentence-transformers
|
|
8
|
+
pip install -e ".[all]" # everything
|
|
9
|
+
```
|
lexiredact/__init__.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"""
|
|
2
|
+
LexiRedact — Privacy-preserving RAG ingestion middleware with dual-pipeline processing.
|
|
3
|
+
|
|
4
|
+
Import everything you need from here:
|
|
5
|
+
from lexiredact import load_config, ProcessingResult, configure_logging
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from lexiredact.config.loader import load_config
|
|
11
|
+
from lexiredact.pipeline_api import LexiredactPipeline
|
|
12
|
+
from lexiredact.config.schema import LexiredactConfig
|
|
13
|
+
from lexiredact.exceptions import (
|
|
14
|
+
LexiredactCacheError,
|
|
15
|
+
LexiredactConfigError,
|
|
16
|
+
LexiredactError,
|
|
17
|
+
LexiredactInputError,
|
|
18
|
+
LexiredactStorageError,
|
|
19
|
+
)
|
|
20
|
+
from lexiredact.app_logging import configure_logging, get_logger
|
|
21
|
+
from lexiredact.models.chunk import Chunk
|
|
22
|
+
from lexiredact.models.result import DetectedEntity, ProcessingResult
|
|
23
|
+
|
|
24
|
+
__version__ = "0.0.2"
|
|
25
|
+
|
|
26
|
+
__all__ = [
|
|
27
|
+
"LexiredactConfig",
|
|
28
|
+
"LexiredactPipeline",
|
|
29
|
+
"load_config",
|
|
30
|
+
"ProcessingResult",
|
|
31
|
+
"DetectedEntity",
|
|
32
|
+
"Chunk",
|
|
33
|
+
"LexiredactError",
|
|
34
|
+
"LexiredactConfigError",
|
|
35
|
+
"LexiredactInputError",
|
|
36
|
+
"LexiredactStorageError",
|
|
37
|
+
"LexiredactCacheError",
|
|
38
|
+
"configure_logging",
|
|
39
|
+
"get_logger",
|
|
40
|
+
"__version__",
|
|
41
|
+
]
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
"""
|
|
2
|
+
adapters/chunk_adapter.py — Maps raw user input dicts to internal Chunk objects.
|
|
3
|
+
|
|
4
|
+
Field names are never hardcoded here; they are always read from InputSchemaConfig.
|
|
5
|
+
This is the only place in the codebase that performs per-chunk input validation.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
from lexiredact.config.schema import InputSchemaConfig
|
|
13
|
+
from lexiredact.exceptions import LexiredactInputError
|
|
14
|
+
from lexiredact.app_logging import get_logger
|
|
15
|
+
from lexiredact.models.chunk import Chunk
|
|
16
|
+
|
|
17
|
+
logger = get_logger(__name__)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class ChunkAdapter:
    """Converts arbitrary user input dicts into validated Chunk objects.

    The names of the id, text and metadata fields are read from the supplied
    configuration object; none of them are hardcoded in this class.

    Args:
        config: Field-name mapping configuration from InputSchemaConfig.
            This class reads ``id_field``, ``text_field`` and
            ``metadata_fields`` from it.
    """

    def __init__(self, config: InputSchemaConfig) -> None:
        self._config = config

    def adapt(self, raw: dict[str, Any]) -> Chunk:
        """Convert a single raw input mapping into a Chunk.

        Args:
            raw: Mapping holding at least the configured id and text fields.

        Returns:
            A Chunk built from the stringified id, the text value, and the
            subset of configured metadata fields that are present in ``raw``.

        Raises:
            LexiredactInputError: If ``raw`` is not a mapping, the id field is
                missing, the text field is missing, or the text value is not
                a string. Empty-text rejection is delegated to Chunk (see the
                note above the final ``return``).
        """
        # Local import keeps this block self-contained without touching the
        # module's import header.
        from collections.abc import Mapping

        cfg = self._config

        # Reject non-mapping items up front. Without this guard a ``None``
        # item raises a bare TypeError from the ``in`` test below, which
        # adapt_batch does not catch, so one bad item would abort the batch.
        if not isinstance(raw, Mapping):
            raise LexiredactInputError(
                f"Input item must be a mapping, got {type(raw).__name__}.",
                context={"got_type": type(raw).__name__},
            )

        if cfg.id_field not in raw:
            raise LexiredactInputError(
                f"Missing required id field '{cfg.id_field}' in input dict.",
                context={
                    "expected_id_field": cfg.id_field,
                    "available_keys": list(raw.keys()),
                },
            )

        # Ids are normalised to strings regardless of their input type.
        chunk_id = str(raw[cfg.id_field])

        if cfg.text_field not in raw:
            raise LexiredactInputError(
                f"Missing required text field '{cfg.text_field}' in chunk '{chunk_id}'.",
                context={
                    "expected_text_field": cfg.text_field,
                    "available_keys": list(raw.keys()),
                    "chunk_id": chunk_id,
                },
            )

        text = raw[cfg.text_field]

        # The text must really be a string before it is handed to Chunk;
        # a None or numeric value is reported as an input error here rather
        # than surfacing later as an unrelated exception type.
        if not isinstance(text, str):
            raise LexiredactInputError(
                f"Text field '{cfg.text_field}' in chunk '{chunk_id}' must be a string, "
                f"got {type(text).__name__}.",
                context={
                    "expected_text_field": cfg.text_field,
                    "got_type": type(text).__name__,
                    "chunk_id": chunk_id,
                },
            )

        # Only configured metadata fields that are actually present are copied.
        metadata: dict[str, Any] = {
            key: raw[key] for key in cfg.metadata_fields if key in raw
        }

        # NOTE(review): Chunk.__post_init__ is expected to raise
        # LexiredactInputError if text is empty; Chunk is defined in
        # lexiredact.models.chunk and is not visible here -- confirm.
        return Chunk(id=chunk_id, text=text, metadata=metadata)

    def adapt_batch(
        self, raws: list[dict[str, Any]]
    ) -> tuple[list[Chunk], list[dict[str, Any]]]:
        """Convert a list of raw dicts, collecting -- not raising -- per-item errors.

        Args:
            raws: Raw input items, each passed to :meth:`adapt` in order.

        Returns:
            (successful_chunks, failed_items)
            failed_items entries: {"index": int, "error": str, "raw": dict}
        """
        successful_chunks: list[Chunk] = []
        failed_items: list[dict[str, Any]] = []

        for index, raw in enumerate(raws):
            try:
                successful_chunks.append(self.adapt(raw))
            except LexiredactInputError as exc:
                logger.warning("Chunk at index %d failed adaptation: %s", index, exc)
                failed_items.append({"index": index, "error": str(exc), "raw": raw})

        # One summary line per batch, emitted only when something failed.
        if failed_items:
            logger.warning(
                "adapt_batch: %d/%d chunks failed. Proceeding with %d successful.",
                len(failed_items), len(raws), len(successful_chunks),
            )

        return successful_chunks, failed_items
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""
|
|
2
|
+
app_logging.py — Centralised logging configuration for lexiredact.
|
|
3
|
+
|
|
4
|
+
Registers a NullHandler at import time so that library consumers who have
|
|
5
|
+
not configured logging do not receive "No handlers could be found" warnings.
|
|
6
|
+
|
|
7
|
+
Usage inside library modules:
|
|
8
|
+
from lexiredact.app_logging import get_logger
|
|
9
|
+
logger = get_logger(__name__)
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import logging
|
|
15
|
+
|
|
16
|
+
# Register a NullHandler at module-import time so that applications which have
# not configured logging do not receive "No handlers could be found" warnings.
logging.getLogger("lexiredact").addHandler(logging.NullHandler())

_LOG_FORMAT = "%(asctime)s [%(name)s] %(levelname)s: %(message)s"


def configure_logging(level: str = "INFO") -> None:
    """Configure the top-level ``lexiredact`` logger with a StreamHandler.

    Adds a single StreamHandler (stderr by default) with the standard format.
    Safe to call multiple times: no duplicate handler is added, and a repeated
    call re-applies ``level`` to both the logger and the existing handler.

    Args:
        level: Logging level string, e.g. "DEBUG", "INFO", "WARNING".
            Case-insensitive. Unknown names fall back to INFO.
    """
    package_logger = logging.getLogger("lexiredact")

    # getattr can return a non-level attribute of the logging module (for
    # example the string ``BASIC_FORMAT``); only integer levels are accepted.
    numeric_level = getattr(logging, level.upper(), logging.INFO)
    if not isinstance(numeric_level, int) or isinstance(numeric_level, bool):
        numeric_level = logging.INFO
    package_logger.setLevel(numeric_level)

    # Reuse an already-installed console handler instead of stacking a second
    # one. FileHandler subclasses StreamHandler, so it is excluded explicitly.
    for existing in package_logger.handlers:
        if isinstance(existing, logging.StreamHandler) and not isinstance(
            existing, logging.FileHandler
        ):
            # Keep the handler threshold in sync with the requested level;
            # otherwise a later, more verbose call would still be filtered.
            existing.setLevel(numeric_level)
            return

    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(numeric_level)
    stream_handler.setFormatter(logging.Formatter(_LOG_FORMAT))
    package_logger.addHandler(stream_handler)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def get_logger(name: str) -> logging.Logger:
    """Return a logger scoped under the ``lexiredact`` namespace.

    Args:
        name: Typically ``__name__`` of the calling module. The "lexiredact."
            prefix is added only when it is not already present, so modules
            inside the package (whose ``__name__`` already starts with
            "lexiredact.") do not end up as "lexiredact.lexiredact.…".

    Returns:
        A logging.Logger instance namespaced under lexiredact.
    """
    # Already inside the namespace: use the name as-is to avoid a doubled prefix.
    if name == "lexiredact" or name.startswith("lexiredact."):
        return logging.getLogger(name)
    return logging.getLogger(f"lexiredact.{name}")
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
"""
|
|
2
|
+
cache — Redis-backed embedding cache for lexiredact.
|
|
3
|
+
|
|
4
|
+
EmbeddingCache — transparent cache layer keyed on SHA-256 of chunk text.
|
|
5
|
+
Any Redis failure is caught silently; callers never see exceptions.
|
|
6
|
+
Disabled entirely when CacheConfig.enabled=False.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from lexiredact.cache.redis_cache import EmbeddingCache
|
|
10
|
+
|
|
11
|
+
__all__ = ["EmbeddingCache"]
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
"""
|
|
2
|
+
cache/redis_cache.py — Redis-backed embedding cache.
|
|
3
|
+
|
|
4
|
+
Failure contract (critical):
|
|
5
|
+
- LexiredactCacheError is raised internally but NEVER propagated to callers.
|
|
6
|
+
- Any Redis error, JSON decode error, or connection failure results in a cache
|
|
7
|
+
miss (get → None) or a silent no-op (set → return). The pipeline continues.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import hashlib
|
|
13
|
+
import json
|
|
14
|
+
|
|
15
|
+
from lexiredact.config.schema import CacheConfig
|
|
16
|
+
from lexiredact.exceptions import LexiredactCacheError
|
|
17
|
+
from lexiredact.app_logging import get_logger
|
|
18
|
+
|
|
19
|
+
logger = get_logger(__name__)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class EmbeddingCache:
    """Redis-backed embedding cache. Completely transparent to callers.

    When config.enabled is False, ``get`` always returns None and ``set`` is
    a no-op. No Redis connection is attempted.

    Any failure while talking to Redis or decoding a cached value is logged
    at warning level and turned into a cache miss (``get``) or a no-op
    (``set``); it is not propagated to the caller.

    Args:
        config: Cache configuration (redis_url, ttl_seconds, key_prefix, enabled).
    """

    def __init__(self, config: CacheConfig) -> None:
        self._config = config
        self._client = None  # redis.Redis instance; created lazily on first use.
        logger.debug("EmbeddingCache initialized (enabled=%s)", config.enabled)

    def get(self, text: str) -> list[float] | None:
        """Return cached embedding for text or None on any miss/error.

        Args:
            text: Chunk text whose embedding is looked up.

        Returns:
            The cached list decoded from JSON, or None when the cache is
            disabled, the key is absent, or any error occurs.
        """
        if not self._config.enabled:
            return None
        key = self._make_key(text)
        try:
            self._ensure_connected()
            raw = self._client.get(key)  # type: ignore[union-attr]
            if raw is None:
                return None
            vector: list[float] = json.loads(raw)
            # Only the container type is checked; element types are not.
            if not isinstance(vector, list):
                raise LexiredactCacheError(
                    "Cached value is not a list",
                    context={"key": key, "got_type": type(vector).__name__},
                )
            return vector
        except LexiredactCacheError as exc:
            logger.warning("Cache get failed for key '%s': %s", key, exc)
            return None
        except json.JSONDecodeError as exc:
            logger.warning("Cache get: JSON decode error for key '%s': %s", key, exc)
            return None
        except Exception as exc:  # noqa: BLE001
            # Deliberately broad: a cache problem must never fail the caller.
            logger.warning("Cache get: Redis error for key '%s': %s", key, exc)
            return None

    def set(self, text: str, vector: list[float]) -> None:
        """Store embedding in Redis with configured TTL. Silent on any error.

        Args:
            text: Chunk text the embedding belongs to (hashed into the key).
            vector: Embedding to store; serialised with ``json.dumps``.
        """
        if not self._config.enabled:
            return
        key = self._make_key(text)
        try:
            self._ensure_connected()
            self._client.setex(key, self._config.ttl_seconds, json.dumps(vector))  # type: ignore[union-attr]
            logger.debug("Cache set: stored %d-dim vector at key '%s'.", len(vector), key)
        except Exception as exc:  # noqa: BLE001
            # Deliberately broad: a cache problem must never fail the caller.
            logger.warning("Cache set failed for key '%s': %s", key, exc)

    def _make_key(self, text: str) -> str:
        """Build Redis key: {prefix}:emb:{sha256(text)[:16]}"""
        # This method runs outside the try blocks in get/set, so it must not
        # raise. "surrogatepass" yields the same UTF-8 bytes as the default
        # encoding for valid text (existing keys are unchanged) but does not
        # raise UnicodeEncodeError on lone surrogates.
        encoded = text.encode("utf-8", errors="surrogatepass")
        hash_fragment = hashlib.sha256(encoded).hexdigest()[:16]
        return f"{self._config.key_prefix}:emb:{hash_fragment}"

    @staticmethod
    def _redact_url(url: str) -> str:
        """Return ``url`` with any userinfo password replaced by ``***``.

        Only the ``user:password@`` part of the URL is handled; a password
        passed as a query parameter is not redacted.
        """
        from urllib.parse import urlsplit, urlunsplit

        try:
            parts = urlsplit(url)
        except ValueError:
            return "<unparseable redis url>"

        userinfo, at_sign, host_port = parts.netloc.rpartition("@")
        if not at_sign or ":" not in userinfo:
            return url
        user = userinfo.partition(":")[0]
        return urlunsplit(parts._replace(netloc=f"{user}:***@{host_port}"))

    def _ensure_connected(self) -> None:
        """Lazily initialise the Redis client. Raises LexiredactCacheError on failure."""
        if self._client is not None:
            return
        # Never write the raw URL to logs or error context: it may embed
        # credentials (redis://user:password@host).
        safe_url = self._redact_url(self._config.redis_url)
        try:
            # Imported here so the redis package is only needed when the
            # cache is actually used.
            import redis  # type: ignore[import-untyped]

            # NOTE(review): from_url builds the client; presumably no socket
            # is opened until the first command -- confirm against redis-py.
            self._client = redis.Redis.from_url(
                self._config.redis_url,
                decode_responses=True,
                socket_connect_timeout=2,
                socket_timeout=2,
            )
            logger.debug("Redis client connected to %s.", safe_url)
        except Exception as exc:
            raise LexiredactCacheError(
                "Failed to initialise Redis client",
                context={"redis_url": safe_url, "error": str(exc)},
            ) from exc
|