lexiredact 0.0.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. lexiredact-0.0.2/.gitignore +13 -0
  2. lexiredact-0.0.2/LICENSE +21 -0
  3. lexiredact-0.0.2/PKG-INFO +95 -0
  4. lexiredact-0.0.2/README.md +41 -0
  5. lexiredact-0.0.2/lexiredact/README.md +9 -0
  6. lexiredact-0.0.2/lexiredact/__init__.py +41 -0
  7. lexiredact-0.0.2/lexiredact/adapters/__init__.py +5 -0
  8. lexiredact-0.0.2/lexiredact/adapters/chunk_adapter.py +93 -0
  9. lexiredact-0.0.2/lexiredact/app_logging.py +57 -0
  10. lexiredact-0.0.2/lexiredact/cache/__init__.py +11 -0
  11. lexiredact-0.0.2/lexiredact/cache/redis_cache.py +98 -0
  12. lexiredact-0.0.2/lexiredact/cli.py +489 -0
  13. lexiredact-0.0.2/lexiredact/config/__init__.py +6 -0
  14. lexiredact-0.0.2/lexiredact/config/loader.py +81 -0
  15. lexiredact-0.0.2/lexiredact/config/schema.py +112 -0
  16. lexiredact-0.0.2/lexiredact/exceptions.py +57 -0
  17. lexiredact-0.0.2/lexiredact/models/__init__.py +6 -0
  18. lexiredact-0.0.2/lexiredact/models/chunk.py +40 -0
  19. lexiredact-0.0.2/lexiredact/models/result.py +88 -0
  20. lexiredact-0.0.2/lexiredact/pipeline/__init__.py +10 -0
  21. lexiredact-0.0.2/lexiredact/pipeline/embedder/__init__.py +37 -0
  22. lexiredact-0.0.2/lexiredact/pipeline/embedder/base.py +95 -0
  23. lexiredact-0.0.2/lexiredact/pipeline/embedder/default.py +13 -0
  24. lexiredact-0.0.2/lexiredact/pipeline/embedder/huggingface.py +213 -0
  25. lexiredact-0.0.2/lexiredact/pipeline/embedder/registry.py +55 -0
  26. lexiredact-0.0.2/lexiredact/pipeline/embedder/sentence_transformers.py +172 -0
  27. lexiredact-0.0.2/lexiredact/pipeline/orchestrator.py +310 -0
  28. lexiredact-0.0.2/lexiredact/pipeline/pii/__init__.py +15 -0
  29. lexiredact-0.0.2/lexiredact/pipeline/pii/detector.py +133 -0
  30. lexiredact-0.0.2/lexiredact/pipeline/pii/engine_factory.py +91 -0
  31. lexiredact-0.0.2/lexiredact/pipeline/pii/redactor.py +78 -0
  32. lexiredact-0.0.2/lexiredact/pipeline/store/__init__.py +23 -0
  33. lexiredact-0.0.2/lexiredact/pipeline/store/base.py +61 -0
  34. lexiredact-0.0.2/lexiredact/pipeline/store/chroma.py +122 -0
  35. lexiredact-0.0.2/lexiredact/pipeline_api.py +90 -0
  36. lexiredact-0.0.2/lexiredact_config.yaml +43 -0
  37. lexiredact-0.0.2/pyproject.toml +131 -0
@@ -0,0 +1,13 @@
1
+ .venv/
2
+ venv/
3
+ __pycache__/
4
+ *.py[cod]
5
+ *.egg-info/
6
+ .pytest_cache/
7
+ .mypy_cache/
8
+ .ruff_cache/
9
+ build/
10
+ dist/
11
+ chroma_db/
12
+ .env
13
+ .DS_Store
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Baihela Abid Hussain, Shwetan Bharat Londhe, Varad Limbkar
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,95 @@
1
+ Metadata-Version: 2.4
2
+ Name: lexiredact
3
+ Version: 0.0.2
4
+ Summary: Privacy-preserving RAG ingestion middleware with dual-pipeline processing
5
+ Project-URL: Homepage, https://github.com/baihelahusain/lexiredact
6
+ Project-URL: Repository, https://github.com/baihelahusain/LexiRedact
7
+ Project-URL: Issues, https://github.com/baihelahusain/LexiRedact/issues
8
+ Project-URL: Documentation, https://github.com/baihelahusain/LexiRedact
9
+ Author: Baihela Abid Hussain, Shwetan Bharat Londhe, Varad Limbkar
10
+ License: MIT
11
+ License-File: LICENSE
12
+ Keywords: embeddings,llm,pii,privacy,rag,redaction,retrieval-augmented-generation,security,vector-database
13
+ Classifier: Development Status :: 4 - Beta
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: Intended Audience :: Information Technology
16
+ Classifier: License :: OSI Approved :: MIT License
17
+ Classifier: Operating System :: OS Independent
18
+ Classifier: Programming Language :: Python :: 3
19
+ Classifier: Programming Language :: Python :: 3.10
20
+ Classifier: Programming Language :: Python :: 3.11
21
+ Classifier: Programming Language :: Python :: 3.12
22
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
23
+ Classifier: Topic :: Security
24
+ Classifier: Topic :: Software Development :: Libraries
25
+ Requires-Python: >=3.10
26
+ Requires-Dist: pydantic<3.0,>=2.0
27
+ Requires-Dist: pyyaml<7.0,>=6.0
28
+ Provides-Extra: all
29
+ Requires-Dist: chromadb>=0.4; extra == 'all'
30
+ Requires-Dist: click>=8.0; extra == 'all'
31
+ Requires-Dist: presidio-analyzer; extra == 'all'
32
+ Requires-Dist: presidio-anonymizer; extra == 'all'
33
+ Requires-Dist: redis>=5.0; extra == 'all'
34
+ Requires-Dist: sentence-transformers>=2.0; extra == 'all'
35
+ Requires-Dist: spacy; extra == 'all'
36
+ Provides-Extra: cache
37
+ Requires-Dist: redis>=5.0; extra == 'cache'
38
+ Provides-Extra: cli
39
+ Requires-Dist: click>=8.0; extra == 'cli'
40
+ Provides-Extra: dev
41
+ Requires-Dist: mypy>=1.10; extra == 'dev'
42
+ Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
43
+ Requires-Dist: pytest>=8.0; extra == 'dev'
44
+ Requires-Dist: ruff>=0.6; extra == 'dev'
45
+ Provides-Extra: embed
46
+ Requires-Dist: sentence-transformers>=2.0; extra == 'embed'
47
+ Provides-Extra: pii
48
+ Requires-Dist: presidio-analyzer; extra == 'pii'
49
+ Requires-Dist: presidio-anonymizer; extra == 'pii'
50
+ Requires-Dist: spacy; extra == 'pii'
51
+ Provides-Extra: store
52
+ Requires-Dist: chromadb>=0.4; extra == 'store'
53
+ Description-Content-Type: text/markdown
54
+
55
+ # LexiRedact
56
+
57
+ LexiRedact is a privacy-preserving RAG ingestion middleware package for detecting and redacting PII before content is embedded and written to a vector store.
58
+
59
+ ## Installation
60
+
61
+ ```bash
62
+ pip install lexiredact
63
+ ```
64
+
65
+ Optional dependency groups are available for specific integrations:
66
+
67
+ ```bash
68
+ pip install "lexiredact[pii,embed,store,cache,cli]"
69
+ ```
70
+
71
+ ## Quick Start
72
+
73
+ ```python
74
+ from lexiredact import LexiredactPipeline, load_config
75
+
76
+ config = load_config("lexiredact_config.yaml")
77
+ pipeline = LexiredactPipeline(config)
78
+ ```
79
+
80
+ ## CLI
81
+
82
+ ```bash
83
+ lexiredact --help
84
+ ```
85
+
86
+ ## Release
87
+
88
+ This project publishes to PyPI from GitHub Actions when a version tag is pushed:
89
+
90
+ ```bash
91
+ git tag v0.0.2
92
+ git push origin v0.0.2
93
+ ```
94
+
95
+ The PyPI project must be configured for Trusted Publishing with the `pypi` environment and the `.github/workflows/publish.yml` workflow.
@@ -0,0 +1,41 @@
1
+ # LexiRedact
2
+
3
+ LexiRedact is a privacy-preserving RAG ingestion middleware package for detecting and redacting PII before content is embedded and written to a vector store.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ pip install lexiredact
9
+ ```
10
+
11
+ Optional dependency groups are available for specific integrations:
12
+
13
+ ```bash
14
+ pip install "lexiredact[pii,embed,store,cache,cli]"
15
+ ```
16
+
17
+ ## Quick Start
18
+
19
+ ```python
20
+ from lexiredact import LexiredactPipeline, load_config
21
+
22
+ config = load_config("lexiredact_config.yaml")
23
+ pipeline = LexiredactPipeline(config)
24
+ ```
25
+
26
+ ## CLI
27
+
28
+ ```bash
29
+ lexiredact --help
30
+ ```
31
+
32
+ ## Release
33
+
34
+ This project publishes to PyPI from GitHub Actions when a version tag is pushed:
35
+
36
+ ```bash
37
+ git tag v0.0.2
38
+ git push origin v0.0.2
39
+ ```
40
+
41
+ The PyPI project must be configured for Trusted Publishing with the `pypi` environment and the `.github/workflows/publish.yml` workflow.
@@ -0,0 +1,9 @@
1
+ # Lexiredact
2
+
3
+ Privacy-preserving RAG ingestion middleware with dual-pipeline processing.
4
+
5
+ ```bash
6
+ pip install -e ".[dev]" # core + dev tools
7
+ pip install -e ".[pii,embed]" # add Presidio + sentence-transformers
8
+ pip install -e ".[all]" # everything
9
+ ```
@@ -0,0 +1,41 @@
1
+ """
2
+ LexiRedact — Privacy-preserving RAG ingestion middleware with dual-pipeline processing.
3
+
4
+ Import everything you need from here:
5
+ from lexiredact import load_config, ProcessingResult, configure_logging
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from lexiredact.config.loader import load_config
11
+ from lexiredact.pipeline_api import LexiredactPipeline
12
+ from lexiredact.config.schema import LexiredactConfig
13
+ from lexiredact.exceptions import (
14
+ LexiredactCacheError,
15
+ LexiredactConfigError,
16
+ LexiredactError,
17
+ LexiredactInputError,
18
+ LexiredactStorageError,
19
+ )
20
+ from lexiredact.app_logging import configure_logging, get_logger
21
+ from lexiredact.models.chunk import Chunk
22
+ from lexiredact.models.result import DetectedEntity, ProcessingResult
23
+
24
+ __version__ = "0.0.2"
25
+
26
+ __all__ = [
27
+ "LexiredactConfig",
28
+ "LexiredactPipeline",
29
+ "load_config",
30
+ "ProcessingResult",
31
+ "DetectedEntity",
32
+ "Chunk",
33
+ "LexiredactError",
34
+ "LexiredactConfigError",
35
+ "LexiredactInputError",
36
+ "LexiredactStorageError",
37
+ "LexiredactCacheError",
38
+ "configure_logging",
39
+ "get_logger",
40
+ "__version__",
41
+ ]
@@ -0,0 +1,5 @@
1
+ """
2
+ adapters — Input adapters that translate caller-supplied dicts to internal models.
3
+
4
+ - chunk_adapter.py — ChunkAdapter: maps raw dicts → Chunk using InputSchemaConfig.
5
+ """
@@ -0,0 +1,93 @@
1
+ """
2
+ adapters/chunk_adapter.py — Maps raw user input dicts to internal Chunk objects.
3
+
4
+ Field names are never hardcoded here; they are always read from InputSchemaConfig.
5
+ This is the only place in the codebase that performs per-chunk input validation.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from typing import Any
11
+
12
+ from lexiredact.config.schema import InputSchemaConfig
13
+ from lexiredact.exceptions import LexiredactInputError
14
+ from lexiredact.app_logging import get_logger
15
+ from lexiredact.models.chunk import Chunk
16
+
17
+ logger = get_logger(__name__)
18
+
19
+
20
class ChunkAdapter:
    """Converts arbitrary user input dicts into validated Chunk objects.

    Field names (id, text, metadata) are never hardcoded here; they are read
    from the supplied InputSchemaConfig on every call.

    Args:
        config: Field-name mapping configuration from InputSchemaConfig.
    """

    def __init__(self, config: InputSchemaConfig) -> None:
        self._config = config

    def adapt(self, raw: dict[str, Any]) -> Chunk:
        """Convert a single raw input dict into a Chunk.

        Args:
            raw: Caller-supplied mapping holding at least the configured id
                and text fields. Configured metadata fields are copied when
                present and silently skipped when absent.

        Returns:
            A Chunk built from the stringified id, the text, and the
            selected metadata.

        Raises:
            LexiredactInputError: If ``raw`` is not a mapping, the id field
                is missing, the text field is missing, or the text value is
                not a string. Chunk construction may additionally raise it
                for empty text.
        """
        # Local import keeps this block self-contained; the module itself
        # only imports ``Any`` from typing.
        from collections.abc import Mapping

        cfg = self._config

        # Reject non-mapping items up front so the failure is reported as an
        # input error (and collected by adapt_batch) rather than surfacing as
        # a TypeError from the membership tests below.
        if not isinstance(raw, Mapping):
            raise LexiredactInputError(
                f"Expected input item to be a dict, got {type(raw).__name__}.",
                context={"got_type": type(raw).__name__},
            )

        if cfg.id_field not in raw:
            raise LexiredactInputError(
                f"Missing required id field '{cfg.id_field}' in input dict.",
                context={
                    "expected_id_field": cfg.id_field,
                    "available_keys": list(raw.keys()),
                },
            )

        # Ids are normalised to str so integer / UUID ids behave uniformly.
        chunk_id = str(raw[cfg.id_field])

        if cfg.text_field not in raw:
            raise LexiredactInputError(
                f"Missing required text field '{cfg.text_field}' in chunk '{chunk_id}'.",
                context={
                    "expected_text_field": cfg.text_field,
                    "available_keys": list(raw.keys()),
                    "chunk_id": chunk_id,
                },
            )

        text = raw[cfg.text_field]
        # A present-but-non-string value (e.g. None from a nullable column)
        # must be reported as an input error, not left to fail later with an
        # unrelated exception type.
        if not isinstance(text, str):
            raise LexiredactInputError(
                f"Text field '{cfg.text_field}' in chunk '{chunk_id}' must be a "
                f"string, got {type(text).__name__}.",
                context={
                    "text_field": cfg.text_field,
                    "chunk_id": chunk_id,
                    "got_type": type(text).__name__,
                },
            )

        metadata: dict[str, Any] = {
            key: raw[key] for key in cfg.metadata_fields if key in raw
        }

        # Chunk.__post_init__ raises LexiredactInputError if text is empty.
        return Chunk(id=chunk_id, text=text, metadata=metadata)

    def adapt_batch(
        self, raws: list[dict[str, Any]]
    ) -> tuple[list[Chunk], list[dict[str, Any]]]:
        """Convert a list of raw dicts, collecting — not raising — per-item errors.

        Args:
            raws: Raw input items, each passed to :meth:`adapt` in order.

        Returns:
            (successful_chunks, failed_items)
            failed_items entries: {"index": int, "error": str, "raw": dict}
        """
        successful_chunks: list[Chunk] = []
        failed_items: list[dict[str, Any]] = []

        for index, raw in enumerate(raws):
            try:
                successful_chunks.append(self.adapt(raw))
            except LexiredactInputError as exc:
                logger.warning("Chunk at index %d failed adaptation: %s", index, exc)
                failed_items.append({"index": index, "error": str(exc), "raw": raw})

        if failed_items:
            # One summary line per batch so partial failures are easy to spot.
            logger.warning(
                "adapt_batch: %d/%d chunks failed. Proceeding with %d successful.",
                len(failed_items), len(raws), len(successful_chunks),
            )

        return successful_chunks, failed_items
@@ -0,0 +1,57 @@
1
+ """
2
+ app_logging.py — Centralised logging configuration for lexiredact.
3
+
4
+ Registers a NullHandler at import time so that library consumers who have
5
+ not configured logging do not receive "No handlers could be found" warnings.
6
+
7
+ Usage inside library modules:
8
+ from lexiredact.app_logging import get_logger
9
+ logger = get_logger(__name__)
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import logging
15
+
16
+ # Register NullHandler at module-import time so LexiRedact is a well-behaved library.
17
+ logging.getLogger("lexiredact").addHandler(logging.NullHandler())
18
+
19
_LOG_FORMAT = "%(asctime)s [%(name)s] %(levelname)s: %(message)s"


def configure_logging(level: str = "INFO") -> None:
    """Configure the package-level ``lexiredact`` logger with a StreamHandler.

    Adds a single StreamHandler (default stream, i.e. stderr) using the
    standard format. Safe to call multiple times — no duplicate handler is
    added, and a repeated call with a different level also updates the
    handler this function installed earlier so the new level takes effect.

    Args:
        level: Logging level name, e.g. "DEBUG", "INFO", "WARNING"
            (case-insensitive). Unknown names fall back to INFO.
    """
    root_logger = logging.getLogger("lexiredact")

    numeric_level = getattr(logging, level.upper(), logging.INFO)
    # Some upper-case attributes of ``logging`` are not levels (for example
    # BASIC_FORMAT is a string); treat those like any other unknown name.
    if not isinstance(numeric_level, int):
        numeric_level = logging.INFO
    root_logger.setLevel(numeric_level)

    for existing in root_logger.handlers:
        is_plain_stream = isinstance(existing, logging.StreamHandler) and not isinstance(
            existing, logging.FileHandler
        )
        if not is_plain_stream:
            continue
        # Only re-level handlers created by this function; a stream handler
        # attached by the application keeps whatever level its owner chose.
        if getattr(existing, "_lexiredact_managed", False):
            existing.setLevel(numeric_level)
        return

    handler = logging.StreamHandler()
    # Marker used above to recognise our own handler on later calls.
    setattr(handler, "_lexiredact_managed", True)
    handler.setLevel(numeric_level)
    handler.setFormatter(logging.Formatter(_LOG_FORMAT))
    root_logger.addHandler(handler)
45
+
46
+
47
def get_logger(name: str) -> logging.Logger:
    """Return a logger scoped under the ``lexiredact`` namespace.

    Args:
        name: Typically ``__name__`` of the calling module. The
            "lexiredact." prefix is added only when it is not already
            present, so module names inside the package are used as-is
            instead of becoming "lexiredact.lexiredact.<module>".

    Returns:
        A logging.Logger instance namespaced under lexiredact.
    """
    namespace = "lexiredact"
    if name == namespace or name.startswith(f"{namespace}."):
        return logging.getLogger(name)
    return logging.getLogger(f"{namespace}.{name}")
@@ -0,0 +1,11 @@
1
+ """
2
+ cache — Redis-backed embedding cache for lexiredact.
3
+
4
+ EmbeddingCache — transparent cache layer keyed on a truncated SHA-256 digest of chunk text.
5
+ Any Redis failure is caught silently; callers never see exceptions.
6
+ Disabled entirely when CacheConfig.enabled=False.
7
+ """
8
+
9
+ from lexiredact.cache.redis_cache import EmbeddingCache
10
+
11
+ __all__ = ["EmbeddingCache"]
@@ -0,0 +1,98 @@
1
+ """
2
+ cache/redis_cache.py — Redis-backed embedding cache.
3
+
4
+ Failure contract (critical):
5
+ - LexiredactCacheError is raised internally but NEVER propagated to callers.
6
+ - Any Redis error, JSON decode error, or connection failure results in a cache
7
+ miss (get → None) or a silent no-op (set → return). The pipeline continues.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import hashlib
13
+ import json
14
+
15
+ from lexiredact.config.schema import CacheConfig
16
+ from lexiredact.exceptions import LexiredactCacheError
17
+ from lexiredact.app_logging import get_logger
18
+
19
+ logger = get_logger(__name__)
20
+
21
+
22
class EmbeddingCache:
    """Redis-backed embedding cache. Completely transparent to callers.

    When config.enabled is False, every method is a no-op and get always
    returns None. No Redis connection is attempted.

    Failures never propagate: ``get`` degrades to a miss and ``set`` to a
    no-op, each logging a warning.

    Args:
        config: Cache configuration (redis_url, ttl_seconds, key_prefix, enabled).
    """

    def __init__(self, config: CacheConfig) -> None:
        self._config = config
        self._client = None  # redis.Redis instance; created lazily on first use.
        logger.debug("EmbeddingCache initialized (enabled=%s)", config.enabled)

    def get(self, text: str) -> list[float] | None:
        """Return cached embedding for text or None on any miss/error.

        Args:
            text: Chunk text whose embedding is being looked up.

        Returns:
            The cached vector, or None when the cache is disabled, the key is
            absent, the stored value is not a JSON list, or Redis fails.
        """
        if not self._config.enabled:
            return None
        key = self._make_key(text)
        try:
            self._ensure_connected()
            raw = self._client.get(key)  # type: ignore[union-attr]
            if raw is None:
                return None
            vector: list[float] = json.loads(raw)
            if not isinstance(vector, list):
                raise LexiredactCacheError(
                    "Cached value is not a list",
                    context={"key": key, "got_type": type(vector).__name__},
                )
            return vector
        except LexiredactCacheError as exc:
            logger.warning("Cache get failed for key '%s': %s", key, exc)
            return None
        except json.JSONDecodeError as exc:
            logger.warning("Cache get: JSON decode error for key '%s': %s", key, exc)
            return None
        except Exception as exc:  # noqa: BLE001
            # Deliberate catch-all: a cache problem must never break the pipeline.
            logger.warning("Cache get: Redis error for key '%s': %s", key, exc)
            return None

    def set(self, text: str, vector: list[float]) -> None:
        """Store embedding in Redis with configured TTL. Silent on any error.

        Args:
            text: Chunk text the embedding was computed from.
            vector: Embedding to store; serialised as JSON.
        """
        if not self._config.enabled:
            return
        key = self._make_key(text)
        try:
            self._ensure_connected()
            self._client.setex(key, self._config.ttl_seconds, json.dumps(vector))  # type: ignore[union-attr]
            logger.debug("Cache set: stored %d-dim vector at key '%s'.", len(vector), key)
        except Exception as exc:  # noqa: BLE001
            # Deliberate catch-all: writes are best-effort.
            logger.warning("Cache set failed for key '%s': %s", key, exc)

    def _make_key(self, text: str) -> str:
        """Build Redis key: {prefix}:emb:{sha256(text)[:16]}"""
        # Only the first 16 hex characters (64 bits) of the digest are kept.
        # NOTE(review): the key depends on the prefix and the text only, so
        # caches shared between different embedding models need distinct
        # key_prefix values.
        hash_fragment = hashlib.sha256(text.encode()).hexdigest()[:16]
        return f"{self._config.key_prefix}:emb:{hash_fragment}"

    @staticmethod
    def _redact_url(url: str) -> str:
        """Return ``url`` reduced to scheme, host[:port] and path.

        Userinfo, query string and fragment are dropped because a Redis URL
        can carry credentials in any of them; the result is only meant for
        log lines and error context.
        """
        from urllib.parse import urlsplit, urlunsplit

        try:
            parts = urlsplit(str(url))
        except ValueError:
            return "<redacted>"
        # Everything before the last '@' in the netloc is userinfo.
        host = parts.netloc.rpartition("@")[2]
        return urlunsplit((parts.scheme, host, parts.path, "", ""))

    def _ensure_connected(self) -> None:
        """Lazily initialise the Redis client. Raises LexiredactCacheError on failure.

        The configured URL is redacted before it is logged or attached to the
        error, so credentials embedded in it do not end up in log output.
        """
        if self._client is not None:
            return
        safe_url = self._redact_url(self._config.redis_url)
        try:
            # Imported here so the redis package is only needed when the
            # cache is actually used.
            import redis  # type: ignore[import-untyped]
            self._client = redis.Redis.from_url(
                self._config.redis_url,
                decode_responses=True,
                socket_connect_timeout=2,
                socket_timeout=2,
            )
            logger.debug("Redis client connected to %s.", safe_url)
        except Exception as exc:
            raise LexiredactCacheError(
                "Failed to initialise Redis client",
                context={"redis_url": safe_url, "error": str(exc)},
            ) from exc