inconnu 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inconnu/__init__.py +235 -0
- inconnu/config.py +7 -0
- inconnu/exceptions.py +48 -0
- inconnu/model_installer.py +200 -0
- inconnu/nlp/entity_redactor.py +229 -0
- inconnu/nlp/interfaces.py +23 -0
- inconnu/nlp/patterns.py +144 -0
- inconnu/nlp/utils.py +97 -0
- inconnu-0.1.0.dist-info/METADATA +524 -0
- inconnu-0.1.0.dist-info/RECORD +13 -0
- inconnu-0.1.0.dist-info/WHEEL +4 -0
- inconnu-0.1.0.dist-info/entry_points.txt +2 -0
- inconnu-0.1.0.dist-info/licenses/LICENSE +21 -0
inconnu/__init__.py
ADDED
@@ -0,0 +1,235 @@
import asyncio
import hashlib
import time
from datetime import datetime

from .config import Config
from .exceptions import (
    ConfigurationError,
    InconnuError,
    ModelNotFoundError,
    ProcessingError,
    TextTooLongError,
)
from .nlp.entity_redactor import EntityRedactor
from .nlp.interfaces import NERComponent, ProcessedData

# Package version
__version__ = "0.1.0"

# Export key classes and exceptions for easy importing
__all__ = [
    "Config",
    "Inconnu",
    "NERComponent",
    "InconnuError",
    "ProcessedData",
    "ProcessingError",
    "TextTooLongError",
    "ConfigurationError",
    "ModelNotFoundError",
    "__version__",
]


class Inconnu:
    __slots__ = ["entity_redactor", "deanonymize", "config", "add_custom_components"]

    def __init__(
        self,
        language: str = "en",
        *,
        custom_components: list[NERComponent] | None = None,
        config: Config | None = None,
        data_retention_days: int = 30,
        max_text_length: int = 75_000,
    ):
        # Use provided config or create default from parameters
        if config is None:
            config = Config(
                data_retention_days=data_retention_days, max_text_length=max_text_length
            )

        self.entity_redactor = EntityRedactor(
            custom_components=custom_components,
            language=language,
        )
        self.add_custom_components = self.entity_redactor.add_custom_components
        self.deanonymize = self.entity_redactor.deanonymize
        self.config = config

    def _log(self, *args, **kwargs):
        print(*args, **kwargs)

    def _hash_text(self, text: str) -> str:
        return hashlib.sha256(text.encode()).hexdigest()

    def __call__(
        self, *, text: str, deanonymize: bool = True, store_original: bool = False
    ) -> ProcessedData:
        start_time = time.time()
        self._log(f"Processing text ({deanonymize=}): {len(text)} characters")
        if len(text) > self.config.max_text_length:
            raise TextTooLongError(len(text), self.config.max_text_length)

        processed_data = ProcessedData(
            timestamp=datetime.now().isoformat(),
            hashed_id=self._hash_text(text),
            text_length=len(text),
            processing_time_ms=0,
            original_text=text
            if store_original
            else "",  # Security: don't store original by default
            redacted_text="",
            entity_map={},
        )

        pseudonymized_text, entity_map = self.entity_redactor.redact(
            text=text, deanonymize=deanonymize
        )
        processed_data.redacted_text = pseudonymized_text
        processed_data.entity_map = entity_map

        end_time = time.time()
        processed_data.processing_time_ms = (end_time - start_time) * 1000
        self._log(f"Processing time: {processed_data.processing_time_ms:.2f} ms")
        return processed_data

    def redact(self, text: str) -> str:
        """Simple anonymization: returns just the redacted text string.

        Args:
            text: The text to anonymize

        Returns:
            The anonymized text with entities replaced by generic labels like [PERSON]

        Raises:
            TextTooLongError: If text exceeds maximum length
            ProcessingError: If text processing fails
        """
        if len(text) > self.config.max_text_length:
            raise TextTooLongError(len(text), self.config.max_text_length)

        try:
            result, _ = self.entity_redactor.redact(text=text, deanonymize=False)
            return result
        except Exception as e:
            raise ProcessingError("Failed to anonymize text", e)

    def anonymize(self, text: str) -> str:
        """Alias for redact() - simple anonymization that returns just the redacted text.

        Args:
            text: The text to anonymize

        Returns:
            The anonymized text with entities replaced by generic labels like [PERSON]
        """
        return self.redact(text)

    def pseudonymize(self, text: str) -> tuple[str, dict[str, str]]:
        """Simple pseudonymization: returns redacted text and entity mapping.

        Args:
            text: The text to pseudonymize

        Returns:
            Tuple of (pseudonymized_text, entity_map) where entity_map allows de-anonymization

        Raises:
            TextTooLongError: If text exceeds maximum length
            ProcessingError: If text processing fails
        """
        if len(text) > self.config.max_text_length:
            raise TextTooLongError(len(text), self.config.max_text_length)

        try:
            return self.entity_redactor.redact(text=text, deanonymize=True)
        except Exception as e:
            raise ProcessingError("Failed to pseudonymize text", e)

    # Async methods for non-blocking operations
    async def redact_async(self, text: str) -> str:
        """Async version of redact() for non-blocking anonymization.

        Args:
            text: The text to anonymize

        Returns:
            The anonymized text with entities replaced by generic labels like [PERSON]
        """
        loop = asyncio.get_event_loop()
        return await loop.run_in_executor(None, self.redact, text)

    async def anonymize_async(self, text: str) -> str:
        """Async alias for redact_async() - non-blocking anonymization.

        Args:
            text: The text to anonymize

        Returns:
            The anonymized text with entities replaced by generic labels like [PERSON]
        """
        return await self.redact_async(text)

    async def pseudonymize_async(self, text: str) -> tuple[str, dict[str, str]]:
        """Async version of pseudonymize() for non-blocking operations.

        Args:
            text: The text to pseudonymize

        Returns:
            Tuple of (pseudonymized_text, entity_map) where entity_map allows de-anonymization
        """
        loop = asyncio.get_event_loop()
        return await loop.run_in_executor(None, self.pseudonymize, text)

    # Batch processing methods
    def redact_batch(self, texts: list[str]) -> list[str]:
        """Process multiple texts for anonymization in batch.

        Args:
            texts: List of texts to anonymize

        Returns:
            List of anonymized texts
        """
        return [self.redact(text) for text in texts]

    def pseudonymize_batch(self, texts: list[str]) -> list[tuple[str, dict[str, str]]]:
        """Process multiple texts for pseudonymization in batch.

        Args:
            texts: List of texts to pseudonymize

        Returns:
            List of tuples (pseudonymized_text, entity_map)
        """
        return [self.pseudonymize(text) for text in texts]

    async def redact_batch_async(self, texts: list[str]) -> list[str]:
        """Async batch processing for anonymization.

        Args:
            texts: List of texts to anonymize

        Returns:
            List of anonymized texts
        """
        tasks = [self.redact_async(text) for text in texts]
        return await asyncio.gather(*tasks)

    async def pseudonymize_batch_async(
        self, texts: list[str]
    ) -> list[tuple[str, dict[str, str]]]:
        """Async batch processing for pseudonymization.

        Args:
            texts: List of texts to pseudonymize

        Returns:
            List of tuples (pseudonymized_text, entity_map)
        """
        tasks = [self.pseudonymize_async(text) for text in texts]
        return await asyncio.gather(*tasks)
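The file above is the public API surface of the package. As a quick orientation, here is a minimal usage sketch based only on the methods shown in this diff; it assumes an English spaCy model is already installed (see inconnu/model_installer.py later in this diff), and which entities get redacted depends entirely on that model and the package's NLP components.

# Minimal usage sketch for the Inconnu API defined above (assumes a spaCy
# English model such as en_core_web_sm is installed).
from inconnu import Inconnu

inconnu = Inconnu(language="en")

# Anonymization: entities are replaced with generic labels, no mapping kept.
redacted = inconnu.redact("Contact Jane Doe in Berlin.")

# Pseudonymization: also returns the entity map needed for de-anonymization.
pseudonymized, entity_map = inconnu.pseudonymize("Contact Jane Doe in Berlin.")

# Full processing via __call__ returns a ProcessedData record with metadata
# (timestamp, hashed_id, text_length, processing_time_ms, entity_map, ...).
result = inconnu(text="Contact Jane Doe in Berlin.", deanonymize=True)
print(result.redacted_text, result.entity_map)

Batch and async variants (redact_batch, pseudonymize_async, and so on) wrap these same calls via list comprehensions and run_in_executor.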
inconnu/config.py
ADDED
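The 7-line body of config.py is not expanded in this diff. Judging only from how Config is used in __init__.py (constructed with data_retention_days and max_text_length keywords, and read via self.config.max_text_length), it is presumably a small settings container along the following lines; this is a hypothetical sketch, not the published file content.

# Hypothetical sketch only - the actual inconnu/config.py is not shown in
# this diff. Inferred from Config(data_retention_days=..., max_text_length=...)
# and self.config.max_text_length in __init__.py.
from dataclasses import dataclass


@dataclass
class Config:
    data_retention_days: int = 30
    max_text_length: int = 75_000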
inconnu/exceptions.py
ADDED
@@ -0,0 +1,48 @@
"""Custom exceptions for Inconnu library."""


class InconnuError(Exception):
    """Base exception for all Inconnu-related errors."""

    pass


class TextTooLongError(InconnuError):
    """Raised when input text exceeds maximum length limit."""

    def __init__(self, text_length: int, max_length: int):
        self.text_length = text_length
        self.max_length = max_length
        super().__init__(
            f"Text length ({text_length}) exceeds maximum allowed length ({max_length}). "
            f"Consider increasing max_text_length parameter or splitting the text into smaller chunks."
        )


class ModelNotFoundError(InconnuError):
    """Raised when required spaCy model is not found."""

    def __init__(self, model_name: str):
        self.model_name = model_name
        super().__init__(
            f"spaCy model '{model_name}' not found. "
            f"Install it with: uv run python -m spacy download {model_name}"
        )


class ProcessingError(InconnuError):
    """Raised when text processing fails."""

    def __init__(self, message: str, original_error: Exception | None = None):
        self.original_error = original_error
        error_msg = f"Text processing failed: {message}"
        if original_error:
            error_msg += f" (Original error: {str(original_error)})"
        super().__init__(error_msg)


class ConfigurationError(InconnuError):
    """Raised when configuration is invalid."""

    def __init__(self, message: str):
        super().__init__(f"Configuration error: {message}")
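Because every library error derives from InconnuError, callers can handle failures by specific type or with a single catch-all. A short sketch using only the attributes defined above (assumes the Inconnu class from inconnu/__init__.py and an installed spaCy model):

# Sketch: handling Inconnu's exception hierarchy.
from inconnu import Inconnu, InconnuError, ProcessingError, TextTooLongError

inconnu = Inconnu(language="en", max_text_length=1_000)
sample = "Contact Jane Doe in Berlin."

try:
    redacted = inconnu.redact(sample)
except TextTooLongError as e:
    # Exposes text_length and max_length so the caller can chunk or raise limits.
    print(f"Input too long: {e.text_length} > {e.max_length}")
except ProcessingError as e:
    # Wraps the underlying failure in original_error.
    print(f"Processing failed: {e.original_error}")
except InconnuError as e:
    # Catch-all for any other library error (e.g. ModelNotFoundError).
    print(f"Inconnu error: {e}")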
inconnu/model_installer.py
ADDED
@@ -0,0 +1,200 @@
#!/usr/bin/env python3
"""
Model installer for Inconnu - downloads spaCy language models.
"""

import argparse
import sys
from subprocess import run
from typing import Optional

# Mapping of language codes to spaCy model names
LANGUAGE_MODELS = {
    "en": ["en_core_web_sm", "en_core_web_lg", "en_core_web_trf"],
    "de": ["de_core_news_sm", "de_core_news_md", "de_core_news_lg"],
    "it": ["it_core_news_sm", "it_core_news_md", "it_core_news_lg"],
    "es": ["es_core_news_sm", "es_core_news_md", "es_core_news_lg"],
    "fr": ["fr_core_news_sm", "fr_core_news_md", "fr_core_news_lg"],
}

# Default models (small versions for quick installation)
DEFAULT_MODELS = {
    "en": "en_core_web_sm",
    "de": "de_core_news_sm",
    "it": "it_core_news_sm",
    "es": "es_core_news_sm",
    "fr": "fr_core_news_sm",
}


def download_model(model_name: str, upgrade: bool = False) -> bool:
    """Download a spaCy model using subprocess."""
    try:
        cmd = [sys.executable, "-m", "spacy", "download", model_name]
        if upgrade:
            cmd.append("--upgrade")

        print(f"Downloading spaCy model: {model_name}")
        result = run(cmd, capture_output=True, text=True)  # noqa: S603

        if result.returncode == 0:
            print(f"✓ Successfully downloaded {model_name}")
            return True
        else:
            print(f"✗ Failed to download {model_name}")
            print(f"Error: {result.stderr}")
            return False
    except Exception as e:
        print(f"✗ Error downloading {model_name}: {e}")
        return False


def check_model_installed(model_name: str) -> bool:
    """Check if a spaCy model is already installed."""
    try:
        import spacy

        spacy.load(model_name)
        return True
    except (ImportError, OSError):
        return False


def list_available_models():
    """List all available models for each language."""
    print("\nAvailable spaCy models for Inconnu:\n")
    for lang, models in LANGUAGE_MODELS.items():
        print(f"{lang.upper()}:")
        for model in models:
            size = (
                "small" if "_sm" in model else "medium" if "_md" in model else "large"
            )
            if "_trf" in model:
                size = "transformer"
            default = " (default)" if model == DEFAULT_MODELS.get(lang) else ""
            installed = " [installed]" if check_model_installed(model) else ""
            print(f" - {model} ({size}){default}{installed}")
        print()


def download_language_models(
    language: str, model_size: Optional[str] = None, upgrade: bool = False
) -> bool:
    """Download models for a specific language."""
    if language not in LANGUAGE_MODELS:
        print(f"✗ Language '{language}' not supported.")
        print(f"Supported languages: {', '.join(LANGUAGE_MODELS.keys())}")
        return False

    available_models = LANGUAGE_MODELS[language]

    if model_size:
        # Find model matching the requested size
        size_map = {
            "small": "_sm",
            "medium": "_md",
            "large": "_lg",
            "transformer": "_trf",
        }
        suffix = size_map.get(model_size.lower())
        if not suffix:
            print(f"✗ Invalid model size: {model_size}")
            print("Valid sizes: small, medium, large, transformer")
            return False

        model_to_download = None
        for model in available_models:
            if suffix in model:
                model_to_download = model
                break

        if not model_to_download:
            print(f"✗ No {model_size} model available for {language}")
            return False
    else:
        # Use default model
        model_to_download = DEFAULT_MODELS[language]

    # Check if already installed
    if check_model_installed(model_to_download) and not upgrade:
        print(f"✓ Model {model_to_download} is already installed")
        return True

    return download_model(model_to_download, upgrade)


def download_all_default_models(upgrade: bool = False) -> bool:
    """Download all default models."""
    success = True
    for lang, model in DEFAULT_MODELS.items():
        if not download_model(model, upgrade):
            success = False
    return success


def main():
    """Main CLI entry point."""
    parser = argparse.ArgumentParser(
        description="Download spaCy language models for Inconnu",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  inconnu-download en              # Download default English model (small)
  inconnu-download en --size large # Download large English model
  inconnu-download de fr           # Download German and French models
  inconnu-download all             # Download all default models
  inconnu-download --list          # List all available models
  inconnu-download en --upgrade    # Upgrade English model
""",
    )

    parser.add_argument(
        "languages",
        nargs="*",
        help="Language code(s) to download models for (en, de, it, es, fr) or 'all'",
    )
    parser.add_argument(
        "--size",
        choices=["small", "medium", "large", "transformer"],
        help="Model size to download (default: small)",
    )
    parser.add_argument(
        "--upgrade", action="store_true", help="Upgrade existing models"
    )
    parser.add_argument("--list", action="store_true", help="List all available models")

    args = parser.parse_args()

    # Handle list command
    if args.list:
        list_available_models()
        return

    # Require at least one language if not listing
    if not args.languages:
        parser.error("Please specify language(s) to download or use --list")

    # Handle 'all' keyword
    if "all" in args.languages:
        if download_all_default_models(args.upgrade):
            print("\n✓ All default models downloaded successfully!")
        else:
            print("\n✗ Some models failed to download")
            sys.exit(1)
        return

    # Download specific languages
    success = True
    for lang in args.languages:
        if not download_language_models(lang, args.size, args.upgrade):
            success = False

    if success:
        print("\n✓ All requested models downloaded successfully!")
    else:
        print("\n✗ Some models failed to download")
        sys.exit(1)


if __name__ == "__main__":
    main()
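Besides the inconnu-download command referenced in the epilog (and registered via the package's entry_points.txt), the same helpers can be driven programmatically. A small sketch, assuming spaCy itself is already installed:

# Sketch: calling the installer helpers above directly instead of the CLI.
from inconnu.model_installer import check_model_installed, download_language_models

if not check_model_installed("en_core_web_sm"):
    # Runs `python -m spacy download en_core_web_sm` under the hood.
    if not download_language_models("en"):
        raise SystemExit("model download failed")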