inconnu 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inconnu/__init__.py +1 -1
- inconnu/model_installer.py +86 -2
- inconnu/nlp/entity_redactor.py +6 -0
- {inconnu-0.1.0.dist-info → inconnu-0.1.1.dist-info}/METADATA +85 -81
- inconnu-0.1.1.dist-info/RECORD +13 -0
- inconnu-0.1.0.dist-info/RECORD +0 -13
- {inconnu-0.1.0.dist-info → inconnu-0.1.1.dist-info}/WHEEL +0 -0
- {inconnu-0.1.0.dist-info → inconnu-0.1.1.dist-info}/entry_points.txt +0 -0
- {inconnu-0.1.0.dist-info → inconnu-0.1.1.dist-info}/licenses/LICENSE +0 -0
inconnu/__init__.py
CHANGED
inconnu/model_installer.py
CHANGED
@@ -4,8 +4,10 @@ Model installer for Inconnu - downloads spaCy language models.
|
|
4
4
|
"""
|
5
5
|
|
6
6
|
import argparse
|
7
|
+
import os
|
7
8
|
import sys
|
8
|
-
from
|
9
|
+
from pathlib import Path
|
10
|
+
from subprocess import CalledProcessError, run
|
9
11
|
from typing import Optional
|
10
12
|
|
11
13
|
# Mapping of language codes to spaCy model names
|
@@ -27,8 +29,60 @@ DEFAULT_MODELS = {
|
|
27
29
|
}
|
28
30
|
|
29
31
|
|
32
|
+
def is_uv_environment() -> bool:
|
33
|
+
"""Check if running in a UV environment."""
|
34
|
+
# Check for UV environment markers
|
35
|
+
return (
|
36
|
+
os.environ.get("UV_PROJECT_ROOT") is not None
|
37
|
+
or Path(sys.prefix).name == ".venv"
|
38
|
+
and Path(sys.prefix).parent.joinpath("uv.lock").exists()
|
39
|
+
)
|
40
|
+
|
41
|
+
|
42
|
+
def ensure_pip_available() -> bool:
|
43
|
+
"""Ensure pip is available, install it if running in UV environment."""
|
44
|
+
try:
|
45
|
+
# Try importing pip to check if it's available
|
46
|
+
import pip # type: ignore # noqa: F401
|
47
|
+
|
48
|
+
return True
|
49
|
+
except ImportError:
|
50
|
+
if is_uv_environment():
|
51
|
+
print("📦 UV environment detected. Installing pip...")
|
52
|
+
try:
|
53
|
+
result = run( # noqa: S603
|
54
|
+
["uv", "pip", "install", "pip", "--upgrade"], # noqa: S607
|
55
|
+
capture_output=True,
|
56
|
+
text=True,
|
57
|
+
)
|
58
|
+
if result.returncode == 0:
|
59
|
+
print("✓ pip installed successfully")
|
60
|
+
return True
|
61
|
+
else:
|
62
|
+
print(f"✗ Failed to install pip: {result.stderr}")
|
63
|
+
return False
|
64
|
+
except (CalledProcessError, FileNotFoundError) as e:
|
65
|
+
print(f"✗ Error installing pip: {e}")
|
66
|
+
return False
|
67
|
+
return False
|
68
|
+
|
69
|
+
|
30
70
|
def download_model(model_name: str, upgrade: bool = False) -> bool:
|
31
71
|
"""Download a spaCy model using subprocess."""
|
72
|
+
# First, ensure pip is available
|
73
|
+
if not ensure_pip_available():
|
74
|
+
print("\n⚠️ pip is not available and could not be installed.")
|
75
|
+
if is_uv_environment():
|
76
|
+
print("\n💡 For UV environments, you can install models directly:")
|
77
|
+
print(
|
78
|
+
f" uv add 'inconnu[{model_name.split('_')[0]}]' # for default model"
|
79
|
+
)
|
80
|
+
print(
|
81
|
+
f" uv add 'inconnu[{model_name.split('_')[0]}-lg]' # for large model"
|
82
|
+
)
|
83
|
+
print("\n Or install all languages: uv add 'inconnu[all]'")
|
84
|
+
return False
|
85
|
+
|
32
86
|
try:
|
33
87
|
cmd = [sys.executable, "-m", "spacy", "download", model_name]
|
34
88
|
if upgrade:
|
@@ -126,12 +180,26 @@ def download_language_models(
|
|
126
180
|
def download_all_default_models(upgrade: bool = False) -> bool:
|
127
181
|
"""Download all default models."""
|
128
182
|
success = True
|
129
|
-
for
|
183
|
+
for model in DEFAULT_MODELS.values():
|
130
184
|
if not download_model(model, upgrade):
|
131
185
|
success = False
|
132
186
|
return success
|
133
187
|
|
134
188
|
|
189
|
+
def print_uv_instructions():
|
190
|
+
"""Print instructions for UV users."""
|
191
|
+
print("\n📘 UV Installation Instructions:")
|
192
|
+
print("\nFor UV environments, models can be installed as dependencies:")
|
193
|
+
print(" uv add 'inconnu[en]' # English")
|
194
|
+
print(" uv add 'inconnu[de]' # German")
|
195
|
+
print(" uv add 'inconnu[en,de,fr]' # Multiple languages")
|
196
|
+
print(" uv add 'inconnu[all]' # All languages")
|
197
|
+
print("\n Larger models:")
|
198
|
+
print(" uv add 'inconnu[en-lg]' # English large")
|
199
|
+
print(" uv add 'inconnu[en-trf]' # English transformer")
|
200
|
+
print("\n Available sizes: sm (default), md, lg, trf (English only)")
|
201
|
+
|
202
|
+
|
135
203
|
def main():
|
136
204
|
"""Main CLI entry point."""
|
137
205
|
parser = argparse.ArgumentParser(
|
@@ -162,9 +230,17 @@ Examples:
|
|
162
230
|
"--upgrade", action="store_true", help="Upgrade existing models"
|
163
231
|
)
|
164
232
|
parser.add_argument("--list", action="store_true", help="List all available models")
|
233
|
+
parser.add_argument(
|
234
|
+
"--uv-help", action="store_true", help="Show UV installation instructions"
|
235
|
+
)
|
165
236
|
|
166
237
|
args = parser.parse_args()
|
167
238
|
|
239
|
+
# Handle UV help
|
240
|
+
if args.uv_help:
|
241
|
+
print_uv_instructions()
|
242
|
+
return
|
243
|
+
|
168
244
|
# Handle list command
|
169
245
|
if args.list:
|
170
246
|
list_available_models()
|
@@ -172,6 +248,14 @@ Examples:
|
|
172
248
|
|
173
249
|
# Require at least one language if not listing
|
174
250
|
if not args.languages:
|
251
|
+
if is_uv_environment():
|
252
|
+
print("⚠️ UV environment detected!")
|
253
|
+
print_uv_instructions()
|
254
|
+
print("\nOr use 'inconnu-download --list' to see available models")
|
255
|
+
print(
|
256
|
+
"Or use 'inconnu-download LANG' to download via this tool (requires pip)"
|
257
|
+
)
|
258
|
+
return
|
175
259
|
parser.error("Please specify language(s) to download or use --list")
|
176
260
|
|
177
261
|
# Handle 'all' keyword
|
inconnu/nlp/entity_redactor.py
CHANGED
@@ -19,6 +19,8 @@ class SpacyModels(StrEnum):
|
|
19
19
|
EN_CORE_WEB_TRF = "en_core_web_trf"
|
20
20
|
DE_CORE_NEWS_SM = "de_core_news_sm"
|
21
21
|
IT_CORE_NEWS_SM = "it_core_news_sm"
|
22
|
+
ES_CORE_NEWS_SM = "es_core_news_sm"
|
23
|
+
FR_CORE_NEWS_SM = "fr_core_news_sm"
|
22
24
|
EN_CORE_WEB_SM = "en_core_web_sm"
|
23
25
|
|
24
26
|
|
@@ -155,6 +157,10 @@ class EntityRedactor:
|
|
155
157
|
model_name = SpacyModels.EN_CORE_WEB_SM
|
156
158
|
case "it":
|
157
159
|
model_name = SpacyModels.IT_CORE_NEWS_SM
|
160
|
+
case "es":
|
161
|
+
model_name = SpacyModels.ES_CORE_NEWS_SM
|
162
|
+
case "fr":
|
163
|
+
model_name = SpacyModels.FR_CORE_NEWS_SM
|
158
164
|
case _:
|
159
165
|
# Default to English small model for unsupported languages
|
160
166
|
model_name = SpacyModels.EN_CORE_WEB_SM
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: inconnu
|
3
|
-
Version: 0.1.
|
3
|
+
Version: 0.1.1
|
4
4
|
Summary: GDPR-compliant data privacy tool for entity redaction and de-anonymization
|
5
5
|
Project-URL: Homepage, https://github.com/0xjgv/inconnu
|
6
6
|
Project-URL: Documentation, https://github.com/0xjgv/inconnu#readme
|
@@ -26,16 +26,14 @@ Classifier: Topic :: Text Processing :: Linguistic
|
|
26
26
|
Requires-Python: >=3.10
|
27
27
|
Requires-Dist: phonenumbers>=9.0.8
|
28
28
|
Requires-Dist: spacy>=3.8.7
|
29
|
-
Provides-Extra: all
|
30
|
-
Provides-Extra: de
|
31
|
-
Provides-Extra: en
|
32
|
-
Provides-Extra: es
|
33
|
-
Provides-Extra: fr
|
34
|
-
Provides-Extra: it
|
35
29
|
Description-Content-Type: text/markdown
|
36
30
|
|
37
31
|
# Inconnu
|
38
32
|
|
33
|
+
[](https://github.com/0xjgv/inconnu)
|
34
|
+
[](https://inconnu.ai)
|
35
|
+
[](https://pypi.org/project/inconnu/)
|
36
|
+
|
39
37
|
## What is Inconnu?
|
40
38
|
|
41
39
|
Inconnu is a GDPR-compliant data privacy tool designed for entity redaction and de-anonymization. It provides cutting-edge NLP-based tools for anonymizing and pseudonymizing text data while maintaining data utility, ensuring your business meets stringent privacy regulations.
|
@@ -62,40 +60,60 @@ Inconnu is a GDPR-compliant data privacy tool designed for entity redaction and
|
|
62
60
|
### Install from PyPI
|
63
61
|
|
64
62
|
```bash
|
65
|
-
#
|
63
|
+
# Using pip
|
66
64
|
pip install inconnu
|
67
65
|
|
68
|
-
#
|
69
|
-
|
66
|
+
# Using UV (Recommended)
|
67
|
+
uv add inconnu
|
68
|
+
```
|
69
|
+
|
70
|
+
**Note**: Language models are NOT included as optional dependencies. You'll need to download them separately using the `inconnu-download` command after installation (see below).
|
70
71
|
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
72
|
+
### Download Language Models
|
73
|
+
|
74
|
+
After installing Inconnu, use the `inconnu-download` command to download spaCy language models:
|
75
|
+
|
76
|
+
```bash
|
77
|
+
# Download default (small) models
|
78
|
+
inconnu-download en # English
|
79
|
+
inconnu-download de # German
|
80
|
+
inconnu-download en de fr # Multiple languages
|
81
|
+
inconnu-download all # All default models
|
76
82
|
|
77
|
-
#
|
78
|
-
|
83
|
+
# Download specific model sizes
|
84
|
+
inconnu-download en --size large # Large English model
|
85
|
+
inconnu-download en --size transformer # Transformer model (English only)
|
79
86
|
|
80
|
-
#
|
81
|
-
|
87
|
+
# List available models and check what's installed
|
88
|
+
inconnu-download --list
|
89
|
+
|
90
|
+
# Upgrade existing models
|
91
|
+
inconnu-download en --upgrade
|
92
|
+
|
93
|
+
# Get help for UV environments
|
94
|
+
inconnu-download --uv-help
|
82
95
|
```
|
83
96
|
|
84
|
-
|
97
|
+
#### How Model Installation Works
|
98
|
+
|
99
|
+
1. **No Optional Dependencies**: spaCy models are NOT included as pip/uv optional dependencies to avoid unnecessary downloads during dependency resolution
|
100
|
+
2. **On-Demand Downloads**: The `inconnu-download` command downloads only the models you need
|
101
|
+
3. **Smart Environment Detection**: Automatically detects UV environments and provides appropriate guidance
|
102
|
+
4. **Verification**: Checks if models are already installed before downloading
|
103
|
+
|
104
|
+
#### Available Model Sizes
|
105
|
+
|
106
|
+
- **Small (sm)**: Default, fast processing, ~15-50MB, good for high-volume
|
107
|
+
- **Medium (md)**: Better accuracy, ~50-200MB, moderate speed
|
108
|
+
- **Large (lg)**: High accuracy, ~200-600MB, slower processing
|
109
|
+
- **Transformer (trf)**: Highest accuracy, ~400MB+, GPU-optimized (English only)
|
85
110
|
|
86
|
-
|
111
|
+
#### Alternative: Direct spaCy Download
|
87
112
|
|
113
|
+
You can also use spaCy directly if preferred:
|
88
114
|
```bash
|
89
|
-
|
90
|
-
|
91
|
-
inconnu-download de fr # Download German and French models
|
92
|
-
inconnu-download en --size large # Download large English model
|
93
|
-
inconnu-download all # Download all default models
|
94
|
-
inconnu-download --list # List all available models
|
95
|
-
|
96
|
-
# Or using spaCy directly
|
97
|
-
python -m spacy download en_core_web_sm
|
98
|
-
python -m spacy download de_core_news_sm
|
115
|
+
python -m spacy download en_core_web_sm # English small
|
116
|
+
python -m spacy download de_core_news_lg # German large
|
99
117
|
```
|
100
118
|
|
101
119
|
### Install from Source
|
@@ -108,9 +126,9 @@ python -m spacy download de_core_news_sm
|
|
108
126
|
|
109
127
|
2. **Install with UV (recommended for development)**:
|
110
128
|
```bash
|
111
|
-
|
112
|
-
|
113
|
-
make test
|
129
|
+
uv sync # Install dependencies
|
130
|
+
inconnu-download en de # Download language models
|
131
|
+
make test # Run tests
|
114
132
|
```
|
115
133
|
|
116
134
|
3. **Or install with pip**:
|
@@ -119,69 +137,55 @@ python -m spacy download de_core_news_sm
|
|
119
137
|
python -m spacy download en_core_web_sm
|
120
138
|
```
|
121
139
|
|
122
|
-
###
|
140
|
+
### Development Commands
|
123
141
|
|
124
|
-
|
142
|
+
For development, the Makefile provides convenience targets:
|
125
143
|
|
126
|
-
#### English Models
|
127
144
|
```bash
|
128
|
-
#
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
145
|
+
# Download models using make commands
|
146
|
+
make model-en # English small
|
147
|
+
make model-de # German small
|
148
|
+
make model-it # Italian small
|
149
|
+
make model-es # Spanish small
|
150
|
+
make model-fr # French small
|
151
|
+
|
152
|
+
# Other development commands
|
153
|
+
make test # Run tests
|
154
|
+
make lint # Check code with ruff
|
155
|
+
make format # Format code
|
156
|
+
make clean # Clean cache and format code
|
136
157
|
```
|
137
158
|
|
138
|
-
|
139
|
-
```bash
|
140
|
-
# German model
|
141
|
-
make model-de
|
142
|
-
uv run python -m spacy download de_core_news_sm
|
143
|
-
|
144
|
-
# Italian model
|
145
|
-
make model-it
|
146
|
-
uv run python -m spacy download it_core_news_sm
|
159
|
+
### Using Different Models in Code
|
147
160
|
|
148
|
-
|
149
|
-
make model-es
|
150
|
-
uv run python -m spacy download es_core_news_sm
|
161
|
+
To use a different model size, first download it, then specify it when initializing:
|
151
162
|
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
# For enhanced accuracy (manual installation)
|
157
|
-
# Medium German model - better accuracy
|
158
|
-
uv run python -m spacy download de_core_news_md
|
159
|
-
|
160
|
-
# Large German model - highest accuracy
|
161
|
-
uv run python -m spacy download de_core_news_lg
|
162
|
-
```
|
163
|
-
|
164
|
-
#### Using Different Models
|
163
|
+
```python
|
164
|
+
from inconnu import Inconnu
|
165
|
+
from inconnu.nlp.entity_redactor import SpacyModels
|
165
166
|
|
166
|
-
|
167
|
+
# First, download the model you want
|
168
|
+
# $ inconnu-download en --size large
|
167
169
|
|
168
|
-
|
169
|
-
|
170
|
+
# Then use it in your code
|
171
|
+
inconnu = Inconnu(
|
172
|
+
language="en",
|
173
|
+
model_name=SpacyModels.EN_CORE_WEB_LG # Use large model
|
174
|
+
)
|
170
175
|
|
171
|
-
#
|
172
|
-
|
173
|
-
custom_components=None,
|
176
|
+
# For highest accuracy (transformer model)
|
177
|
+
inconnu_trf = Inconnu(
|
174
178
|
language="en",
|
175
|
-
model_name=SpacyModels.EN_CORE_WEB_TRF
|
179
|
+
model_name=SpacyModels.EN_CORE_WEB_TRF
|
176
180
|
)
|
177
181
|
```
|
178
182
|
|
179
183
|
**Model Selection Guide:**
|
180
|
-
- `en_core_web_sm`: Fast processing, good for high-volume
|
181
|
-
- `en_core_web_lg`: Better accuracy, moderate
|
182
|
-
- `en_core_web_trf`: Highest accuracy,
|
184
|
+
- `en_core_web_sm`: Fast processing, good for high-volume
|
185
|
+
- `en_core_web_lg`: Better accuracy, moderate speed
|
186
|
+
- `en_core_web_trf`: Highest accuracy, GPU-optimized (recommended for sensitive data)
|
183
187
|
|
184
|
-
For
|
188
|
+
For a complete list of supported models, run `inconnu-download --list`
|
185
189
|
|
186
190
|
## Development Setup
|
187
191
|
|
@@ -0,0 +1,13 @@
|
|
1
|
+
inconnu/__init__.py,sha256=FZgRj2vfQsd8H57_h5qoRIkOcEJbb8lgoqQrDW38LXc,7567
|
2
|
+
inconnu/config.py,sha256=SFZjg0IpzOfac8RNmCnq9sjxqHmbhAkA1LfGHqfYiP8,129
|
3
|
+
inconnu/exceptions.py,sha256=9qEqqwiRLvy5gDEPTiiTyyr_U5SQdzivBFPFx7HErG4,1547
|
4
|
+
inconnu/model_installer.py,sha256=U8J9Qay6ioFCkcuC98XbcH8pS2w6i449MT4YfdsufHI,9651
|
5
|
+
inconnu/nlp/entity_redactor.py,sha256=S6f1VCc7LcGlHEZ8bb24P8V3eWGJIz5CtYRUteLl9cw,8243
|
6
|
+
inconnu/nlp/interfaces.py,sha256=B9FhChpPBg7nmFOJltWga5nWzMsnP9yj7SxfnBjJydg,495
|
7
|
+
inconnu/nlp/patterns.py,sha256=VxwgetKRd22esnjeya86j4oNKGzcHXIiQ6VE1LAVNzE,5662
|
8
|
+
inconnu/nlp/utils.py,sha256=700Tz-wR4JFYvnvuAvyu2x2YNwkOPtvQx007H-wS-7Y,2775
|
9
|
+
inconnu-0.1.1.dist-info/METADATA,sha256=x7p2rhMnLWJhfzyh575m77fFWv4Hj3eRRaLxKPcXRws,17231
|
10
|
+
inconnu-0.1.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
11
|
+
inconnu-0.1.1.dist-info/entry_points.txt,sha256=jBJr5LeX-XGEBh5iMQIJr5zdzqbyOUyw3rSgWZfQcDk,66
|
12
|
+
inconnu-0.1.1.dist-info/licenses/LICENSE,sha256=LMGDpdSqFgydJ63Q0EjrcYxFvATmqE_bdNHrdsAEqNE,1089
|
13
|
+
inconnu-0.1.1.dist-info/RECORD,,
|
inconnu-0.1.0.dist-info/RECORD
DELETED
@@ -1,13 +0,0 @@
|
|
1
|
-
inconnu/__init__.py,sha256=FHDRvMfesj7UYM1JSLwzWcDQs7eqp-zFoljNCU--YZk,7567
|
2
|
-
inconnu/config.py,sha256=SFZjg0IpzOfac8RNmCnq9sjxqHmbhAkA1LfGHqfYiP8,129
|
3
|
-
inconnu/exceptions.py,sha256=9qEqqwiRLvy5gDEPTiiTyyr_U5SQdzivBFPFx7HErG4,1547
|
4
|
-
inconnu/model_installer.py,sha256=_PphTFdkJXsz0vwqrY0W9RTbxPaYYJylgBT1H9w7AHk,6433
|
5
|
-
inconnu/nlp/entity_redactor.py,sha256=TD1G8qDX4bI9bAi5zR5oR1IbJJSst80dF2wXBCloj1Y,8003
|
6
|
-
inconnu/nlp/interfaces.py,sha256=B9FhChpPBg7nmFOJltWga5nWzMsnP9yj7SxfnBjJydg,495
|
7
|
-
inconnu/nlp/patterns.py,sha256=VxwgetKRd22esnjeya86j4oNKGzcHXIiQ6VE1LAVNzE,5662
|
8
|
-
inconnu/nlp/utils.py,sha256=700Tz-wR4JFYvnvuAvyu2x2YNwkOPtvQx007H-wS-7Y,2775
|
9
|
-
inconnu-0.1.0.dist-info/METADATA,sha256=CHGP-uLQ2xf5HOOT_aGO1ePE_qXkEG3lV8LrQZ-ctWM,16533
|
10
|
-
inconnu-0.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
11
|
-
inconnu-0.1.0.dist-info/entry_points.txt,sha256=jBJr5LeX-XGEBh5iMQIJr5zdzqbyOUyw3rSgWZfQcDk,66
|
12
|
-
inconnu-0.1.0.dist-info/licenses/LICENSE,sha256=LMGDpdSqFgydJ63Q0EjrcYxFvATmqE_bdNHrdsAEqNE,1089
|
13
|
-
inconnu-0.1.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|