inconnu 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
inconnu/__init__.py ADDED
@@ -0,0 +1,235 @@
1
+ import asyncio
2
+ import hashlib
3
+ import time
4
+ from datetime import datetime
5
+
6
+ from .config import Config
7
+ from .exceptions import (
8
+ ConfigurationError,
9
+ InconnuError,
10
+ ModelNotFoundError,
11
+ ProcessingError,
12
+ TextTooLongError,
13
+ )
14
+ from .nlp.entity_redactor import EntityRedactor
15
+ from .nlp.interfaces import NERComponent, ProcessedData
16
+
17
# Package version
__version__ = "0.1.0"

# Public API re-exported at package level, grouped by role.
__all__ = [
    # Entry point
    "Inconnu",
    # Configuration and NLP interfaces
    "Config",
    "NERComponent",
    "ProcessedData",
    # Exception hierarchy
    "InconnuError",
    "ConfigurationError",
    "ModelNotFoundError",
    "ProcessingError",
    "TextTooLongError",
    # Metadata
    "__version__",
]
33
+
34
+
35
class Inconnu:
    """High-level entry point for redacting and pseudonymizing text.

    Wraps an ``EntityRedactor`` and enforces ``config.max_text_length`` before
    any text is handed to it. Three flavours of API are offered:

    * ``__call__`` returns a ``ProcessedData`` record with timing metadata;
    * ``redact`` / ``anonymize`` / ``pseudonymize`` return plain values;
    * ``*_async`` and ``*_batch`` variants wrap the plain helpers.

    Attributes:
        entity_redactor: The ``EntityRedactor`` doing the actual work.
        deanonymize: Bound ``entity_redactor.deanonymize``, re-exposed here.
        add_custom_components: Bound ``entity_redactor.add_custom_components``.
        config: The active ``Config``.
    """

    __slots__ = ["entity_redactor", "deanonymize", "config", "add_custom_components"]

    def __init__(
        self,
        language: str = "en",
        *,
        custom_components: list[NERComponent] | None = None,
        config: Config | None = None,
        data_retention_days: int = 30,
        max_text_length: int = 75_000,
    ):
        """Build the redactor for ``language`` and resolve the configuration.

        Args:
            language: Language code forwarded to ``EntityRedactor``.
            custom_components: Optional extra NER components forwarded to
                ``EntityRedactor``.
            config: Ready-made configuration. When given, it wins and
                ``data_retention_days`` / ``max_text_length`` are ignored.
            data_retention_days: Used only to build a default ``Config``.
            max_text_length: Used only to build a default ``Config``.
        """
        # Use provided config or create default from parameters
        if config is None:
            config = Config(
                data_retention_days=data_retention_days, max_text_length=max_text_length
            )

        self.entity_redactor = EntityRedactor(
            custom_components=custom_components,
            language=language,
        )
        self.add_custom_components = self.entity_redactor.add_custom_components
        self.deanonymize = self.entity_redactor.deanonymize
        self.config = config

    def _log(self, *args, **kwargs):
        """Emit a diagnostic message; currently a thin wrapper over ``print``."""
        print(*args, **kwargs)

    def _hash_text(self, text: str) -> str:
        """Return the hex SHA-256 digest of ``text`` (encoded with ``str.encode``)."""
        return hashlib.sha256(text.encode()).hexdigest()

    def _check_length(self, text: str) -> None:
        """Raise ``TextTooLongError`` if ``text`` exceeds ``config.max_text_length``."""
        if len(text) > self.config.max_text_length:
            raise TextTooLongError(len(text), self.config.max_text_length)

    def __call__(
        self, *, text: str, deanonymize: bool = True, store_original: bool = False
    ) -> ProcessedData:
        """Process ``text`` and return the result together with metadata.

        Args:
            text: The text to process.
            deanonymize: Forwarded unchanged to ``EntityRedactor.redact``
                (``pseudonymize`` passes True, ``redact`` passes False).
            store_original: When True the input is kept in
                ``ProcessedData.original_text``; otherwise that field is ``""``.

        Returns:
            A ``ProcessedData`` holding the redacted text, the entity map, a
            SHA-256 id of the input, its length, an ISO timestamp and the
            measured processing time in milliseconds.

        Raises:
            TextTooLongError: If text exceeds maximum length
        """
        # perf_counter is monotonic, so the measured duration cannot be
        # distorted by system clock adjustments.
        start_time = time.perf_counter()
        self._log(f"Processing text ({deanonymize=}): {len(text)} characters")
        self._check_length(text)

        processed_data = ProcessedData(
            timestamp=datetime.now().isoformat(),
            hashed_id=self._hash_text(text),
            text_length=len(text),
            processing_time_ms=0,
            # Security: don't store original by default
            original_text=text if store_original else "",
            redacted_text="",
            entity_map={},
        )

        pseudonymized_text, entity_map = self.entity_redactor.redact(
            text=text, deanonymize=deanonymize
        )
        processed_data.redacted_text = pseudonymized_text
        processed_data.entity_map = entity_map

        # Report the real elapsed time; it must not be clamped, otherwise slow
        # runs become indistinguishable from fast ones.
        processed_data.processing_time_ms = (time.perf_counter() - start_time) * 1000
        self._log(f"Processing time: {processed_data.processing_time_ms:.2f} ms")
        return processed_data

    def redact(self, text: str) -> str:
        """Simple anonymization: returns just the redacted text string.

        Args:
            text: The text to anonymize

        Returns:
            The anonymized text with entities replaced by generic labels like [PERSON]

        Raises:
            TextTooLongError: If text exceeds maximum length
            ProcessingError: If text processing fails
        """
        self._check_length(text)

        try:
            result, _ = self.entity_redactor.redact(text=text, deanonymize=False)
            return result
        except Exception as e:
            # Chain explicitly so the traceback shows the root cause.
            raise ProcessingError("Failed to anonymize text", e) from e

    def anonymize(self, text: str) -> str:
        """Alias for redact() - simple anonymization that returns just the redacted text.

        Args:
            text: The text to anonymize

        Returns:
            The anonymized text with entities replaced by generic labels like [PERSON]
        """
        return self.redact(text)

    def pseudonymize(self, text: str) -> tuple[str, dict[str, str]]:
        """Simple pseudonymization: returns redacted text and entity mapping.

        Args:
            text: The text to pseudonymize

        Returns:
            Tuple of (pseudonymized_text, entity_map) where entity_map allows de-anonymization

        Raises:
            TextTooLongError: If text exceeds maximum length
            ProcessingError: If text processing fails
        """
        self._check_length(text)

        try:
            return self.entity_redactor.redact(text=text, deanonymize=True)
        except Exception as e:
            # Chain explicitly so the traceback shows the root cause.
            raise ProcessingError("Failed to pseudonymize text", e) from e

    # Async methods for non-blocking operations
    async def redact_async(self, text: str) -> str:
        """Async version of redact() for non-blocking anonymization.

        Runs ``redact`` in the event loop's default executor.

        Args:
            text: The text to anonymize

        Returns:
            The anonymized text with entities replaced by generic labels like [PERSON]
        """
        # Inside a coroutine a loop is always running; get_running_loop() is
        # the supported way to obtain it.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self.redact, text)

    async def anonymize_async(self, text: str) -> str:
        """Async alias for redact_async() - non-blocking anonymization.

        Args:
            text: The text to anonymize

        Returns:
            The anonymized text with entities replaced by generic labels like [PERSON]
        """
        return await self.redact_async(text)

    async def pseudonymize_async(self, text: str) -> tuple[str, dict[str, str]]:
        """Async version of pseudonymize() for non-blocking operations.

        Runs ``pseudonymize`` in the event loop's default executor.

        Args:
            text: The text to pseudonymize

        Returns:
            Tuple of (pseudonymized_text, entity_map) where entity_map allows de-anonymization
        """
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self.pseudonymize, text)

    # Batch processing methods
    def redact_batch(self, texts: list[str]) -> list[str]:
        """Process multiple texts for anonymization in batch.

        Texts are processed one after another; the first failure propagates.

        Args:
            texts: List of texts to anonymize

        Returns:
            List of anonymized texts
        """
        return [self.redact(text) for text in texts]

    def pseudonymize_batch(self, texts: list[str]) -> list[tuple[str, dict[str, str]]]:
        """Process multiple texts for pseudonymization in batch.

        Texts are processed one after another; the first failure propagates.

        Args:
            texts: List of texts to pseudonymize

        Returns:
            List of tuples (pseudonymized_text, entity_map)
        """
        return [self.pseudonymize(text) for text in texts]

    async def redact_batch_async(self, texts: list[str]) -> list[str]:
        """Async batch processing for anonymization.

        Schedules one ``redact_async`` call per text and gathers the results
        in input order.

        Args:
            texts: List of texts to anonymize

        Returns:
            List of anonymized texts
        """
        tasks = [self.redact_async(text) for text in texts]
        return await asyncio.gather(*tasks)

    async def pseudonymize_batch_async(
        self, texts: list[str]
    ) -> list[tuple[str, dict[str, str]]]:
        """Async batch processing for pseudonymization.

        Schedules one ``pseudonymize_async`` call per text and gathers the
        results in input order.

        Args:
            texts: List of texts to pseudonymize

        Returns:
            List of tuples (pseudonymized_text, entity_map)
        """
        tasks = [self.pseudonymize_async(text) for text in texts]
        return await asyncio.gather(*tasks)
inconnu/config.py ADDED
@@ -0,0 +1,7 @@
1
+ from dataclasses import dataclass
2
+
3
+
4
@dataclass
class Config:
    """Tunable settings for an Inconnu instance.

    Both fields have defaults, so ``Config()`` yields a usable configuration.
    """

    # Retention period in days. Presumably consumed by storage/cleanup code
    # outside this module -- TODO confirm.
    data_retention_days: int = 30
    # Upper bound on the length of an input text (compared against len(text)).
    max_text_length: int = 75_000
inconnu/exceptions.py ADDED
@@ -0,0 +1,48 @@
1
+ """Custom exceptions for Inconnu library."""
2
+
3
+
4
class InconnuError(Exception):
    """Root of the Inconnu exception hierarchy.

    Catch this type to handle any error raised by the library.
    """
8
+
9
+
10
class TextTooLongError(InconnuError):
    """Signals that an input text is longer than the configured maximum.

    Attributes:
        text_length: Length of the rejected text.
        max_length: The limit that was exceeded.
    """

    def __init__(self, text_length: int, max_length: int):
        message = (
            f"Text length ({text_length}) exceeds maximum allowed length ({max_length}). "
            "Consider increasing max_text_length parameter or splitting the text into smaller chunks."
        )
        super().__init__(message)
        # Keep the raw numbers so callers can react programmatically.
        self.text_length = text_length
        self.max_length = max_length
20
+
21
+
22
class ModelNotFoundError(InconnuError):
    """Signals that a required spaCy model is not available.

    Attributes:
        model_name: Name of the missing model.
    """

    def __init__(self, model_name: str):
        # The message includes the install command for the missing model.
        hint = f"Install it with: uv run python -m spacy download {model_name}"
        super().__init__(f"spaCy model '{model_name}' not found. " + hint)
        self.model_name = model_name
31
+
32
+
33
class ProcessingError(InconnuError):
    """Signals that processing a text failed.

    Attributes:
        original_error: The underlying exception, or None when there is none.
    """

    def __init__(self, message: str, original_error: Exception | None = None):
        # Append the underlying error's text only when one was supplied.
        detail = f" (Original error: {str(original_error)})" if original_error else ""
        super().__init__(f"Text processing failed: {message}" + detail)
        self.original_error = original_error
42
+
43
+
44
class ConfigurationError(InconnuError):
    """Signals an invalid configuration."""

    def __init__(self, message: str):
        # Prefix the caller's message so the error category is visible.
        prefixed = f"Configuration error: {message}"
        super().__init__(prefixed)
@@ -0,0 +1,200 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Model installer for Inconnu - downloads spaCy language models.
4
+ """
5
+
6
+ import argparse
7
+ import sys
8
+ from subprocess import run
9
+ from typing import Optional
10
+
11
# Language code -> spaCy model names offered for that language.
# The first entry of each list is the small model.
LANGUAGE_MODELS = {
    "en": ["en_core_web_sm", "en_core_web_lg", "en_core_web_trf"],
    "de": ["de_core_news_sm", "de_core_news_md", "de_core_news_lg"],
    "it": ["it_core_news_sm", "it_core_news_md", "it_core_news_lg"],
    "es": ["es_core_news_sm", "es_core_news_md", "es_core_news_lg"],
    "fr": ["fr_core_news_sm", "fr_core_news_md", "fr_core_news_lg"],
}

# Default model per language: the small variant, for quick installation.
DEFAULT_MODELS = {lang: models[0] for lang, models in LANGUAGE_MODELS.items()}
28
+
29
+
30
def download_model(model_name: str, upgrade: bool = False) -> bool:
    """Run ``python -m spacy download <model_name>`` in a child process.

    Args:
        model_name: Name of the spaCy model to download.
        upgrade: When True, ``--upgrade`` is appended to the command.

    Returns:
        True if the child process exited with status 0, False otherwise
        (including when launching it raised). Progress and errors are printed.
    """
    command = [sys.executable, "-m", "spacy", "download", model_name]
    if upgrade:
        command += ["--upgrade"]

    try:
        print(f"Downloading spaCy model: {model_name}")
        # Output is captured so stderr can be shown only on failure.
        outcome = run(command, capture_output=True, text=True)  # noqa: S603

        if outcome.returncode != 0:
            print(f"✗ Failed to download {model_name}")
            print(f"Error: {outcome.stderr}")
            return False

        print(f"✓ Successfully downloaded {model_name}")
        return True
    except Exception as exc:
        # Best-effort CLI helper: report and signal failure instead of crashing.
        print(f"✗ Error downloading {model_name}: {exc}")
        return False
50
+
51
+
52
def check_model_installed(model_name: str) -> bool:
    """Check if a spaCy model package is already installed.

    Uses spaCy's package-metadata lookup instead of ``spacy.load``: loading
    deserialises the whole pipeline, which is far too heavy for a presence
    check, especially when listing every available model.

    Args:
        model_name: Name of the spaCy model package, e.g. ``en_core_web_sm``.

    Returns:
        True if a package with that name is installed; False if it is not,
        or if spaCy itself cannot be imported.
    """
    try:
        from spacy.util import is_package
    except ImportError:
        # Without spaCy no model can be usable.
        return False

    return is_package(model_name)
61
+
62
+
63
def list_available_models():
    """Print every known model, grouped by language.

    Each line shows the model name, its size label, a ``(default)`` marker if
    it is the language's default, and ``[installed]`` if it is installed.
    """
    print("\nAvailable spaCy models for Inconnu:\n")
    for lang, models in LANGUAGE_MODELS.items():
        print(f"{lang.upper()}:")
        for model in models:
            # Derive the size label from the name; "_trf" takes precedence.
            if "_trf" in model:
                size = "transformer"
            elif "_sm" in model:
                size = "small"
            elif "_md" in model:
                size = "medium"
            else:
                size = "large"

            default = ""
            if model == DEFAULT_MODELS.get(lang):
                default = " (default)"

            installed = ""
            if check_model_installed(model):
                installed = " [installed]"

            print(f" - {model} ({size}){default}{installed}")
        print()
78
+
79
+
80
def download_language_models(
    language: str, model_size: Optional[str] = None, upgrade: bool = False
) -> bool:
    """Download one model for ``language``.

    Args:
        language: Language code; must be a key of ``LANGUAGE_MODELS``.
        model_size: One of small/medium/large/transformer (case-insensitive).
            When omitted, the language's default model is used.
        upgrade: Download even if the model is already installed.

    Returns:
        True if the model is installed or was downloaded; False for an
        unsupported language, an invalid or unavailable size, or a failed
        download.
    """
    if language not in LANGUAGE_MODELS:
        print(f"✗ Language '{language}' not supported.")
        print(f"Supported languages: {', '.join(LANGUAGE_MODELS.keys())}")
        return False

    candidates = LANGUAGE_MODELS[language]

    if not model_size:
        # Use default model
        chosen = DEFAULT_MODELS[language]
    else:
        # Translate the size into the suffix used in model names.
        suffix_by_size = {
            "small": "_sm",
            "medium": "_md",
            "large": "_lg",
            "transformer": "_trf",
        }
        suffix = suffix_by_size.get(model_size.lower())
        if not suffix:
            print(f"✗ Invalid model size: {model_size}")
            print("Valid sizes: small, medium, large, transformer")
            return False

        # First model whose name carries the suffix, if any.
        chosen = next((name for name in candidates if suffix in name), None)
        if not chosen:
            print(f"✗ No {model_size} model available for {language}")
            return False

    # Nothing to do when the model is present and no upgrade was requested.
    if check_model_installed(chosen) and not upgrade:
        print(f"✓ Model {chosen} is already installed")
        return True

    return download_model(chosen, upgrade)
124
+
125
+
126
def download_all_default_models(upgrade: bool = False) -> bool:
    """Download the default model of every supported language.

    Args:
        upgrade: Forwarded to ``download_model`` for each model.

    Returns:
        True only if every download succeeded.
    """
    # Build the full list first so every download is attempted, even after
    # an earlier one has failed.
    outcomes = [download_model(name, upgrade) for name in DEFAULT_MODELS.values()]
    return all(outcomes)
133
+
134
+
135
def main():
    """Command-line entry point: parse arguments and dispatch.

    ``--list`` prints the model overview and returns. Otherwise at least one
    language (or ``all``) is required. The process exits with status 1 when
    any download fails.
    """
    parser = argparse.ArgumentParser(
        description="Download spaCy language models for Inconnu",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
inconnu-download en # Download default English model (small)
inconnu-download en --size large # Download large English model
inconnu-download de fr # Download German and French models
inconnu-download all # Download all default models
inconnu-download --list # List all available models
inconnu-download en --upgrade # Upgrade English model
""",
    )
    parser.add_argument(
        "languages",
        nargs="*",
        help="Language code(s) to download models for (en, de, it, es, fr) or 'all'",
    )
    parser.add_argument(
        "--size",
        choices=["small", "medium", "large", "transformer"],
        help="Model size to download (default: small)",
    )
    parser.add_argument(
        "--upgrade", action="store_true", help="Upgrade existing models"
    )
    parser.add_argument("--list", action="store_true", help="List all available models")

    args = parser.parse_args()

    # Listing takes precedence over everything else.
    if args.list:
        list_available_models()
        return

    # Without --list there must be something to download.
    if not args.languages:
        parser.error("Please specify language(s) to download or use --list")

    # 'all' anywhere in the list selects every default model.
    if "all" in args.languages:
        if not download_all_default_models(args.upgrade):
            print("\n✗ Some models failed to download")
            sys.exit(1)
        print("\n✓ All default models downloaded successfully!")
        return

    # Attempt every requested language before deciding the exit status.
    outcomes = [
        download_language_models(code, args.size, args.upgrade)
        for code in args.languages
    ]
    if not all(outcomes):
        print("\n✗ Some models failed to download")
        sys.exit(1)
    print("\n✓ All requested models downloaded successfully!")
197
+
198
+
199
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()