armos 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,8 @@
1
+ {
2
+ "permissions": {
3
+ "allow": [
4
+ "Bash(pip3 --version)",
5
+ "Bash(pip3 install *)"
6
+ ]
7
+ }
8
+ }
armos-0.1.0/.gitignore ADDED
@@ -0,0 +1,16 @@
1
+ __pycache__/
2
+ *.py[cod]
3
+ *.egg-info/
4
+ dist/
5
+ build/
6
+ .eggs/
7
+ *.egg
8
+ .env
9
+ .venv
10
+ venv/
11
+ env/
12
+ .pytest_cache/
13
+ .coverage
14
+ htmlcov/
15
+ *.pyc
16
+ .DS_Store
armos-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Armos
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
armos-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,221 @@
1
+ Metadata-Version: 2.4
2
+ Name: armos
3
+ Version: 0.1.0
4
+ Summary: Automatic PII masking for OpenAI and Anthropic SDKs
5
+ License: MIT
6
+ License-File: LICENSE
7
+ Requires-Python: >=3.9
8
+ Requires-Dist: presidio-analyzer>=2.2.0
9
+ Requires-Dist: presidio-anonymizer>=2.2.0
10
+ Requires-Dist: spacy>=3.7.0
11
+ Provides-Extra: all
12
+ Requires-Dist: anthropic>=0.20.0; extra == 'all'
13
+ Requires-Dist: openai>=1.0.0; extra == 'all'
14
+ Requires-Dist: redis>=5.0.0; extra == 'all'
15
+ Provides-Extra: anthropic
16
+ Requires-Dist: anthropic>=0.20.0; extra == 'anthropic'
17
+ Provides-Extra: dev
18
+ Requires-Dist: pytest-asyncio>=0.23.0; extra == 'dev'
19
+ Requires-Dist: pytest>=8.0.0; extra == 'dev'
20
+ Requires-Dist: python-dotenv>=1.0.0; extra == 'dev'
21
+ Provides-Extra: openai
22
+ Requires-Dist: openai>=1.0.0; extra == 'openai'
23
+ Provides-Extra: redis
24
+ Requires-Dist: redis>=5.0.0; extra == 'redis'
25
+ Description-Content-Type: text/markdown
26
+
27
+ # Armos
28
+
29
+ **PII never reaches your LLM. One line of code.**
30
+
31
+ Armos wraps the OpenAI and Anthropic SDKs to automatically detect and mask personally identifiable information (PII) before it leaves your server — and restore the real values in the response. Your application code changes by exactly one word.
32
+
33
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
34
+ [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)
35
+
36
+ ---
37
+
38
+ ## The problem
39
+
40
+ Every time your application calls an LLM, it sends raw text to a third-party server. If a user's message contains their name, Aadhaar number, email, PAN card, or credit card — that data leaves your infrastructure.
41
+
42
+ This matters for:
43
+ - **Healthcare apps** — patient names, dates of birth, medical IDs
44
+ - **Fintech apps** — PAN, Aadhaar, bank details
45
+ - **Customer support tools** — names, emails, phone numbers, addresses
46
+ - **Any app** where users type free text that gets sent to OpenAI or Anthropic
47
+
48
+ Most teams know this is a risk. Few have time to build a proper masking layer before shipping. Armos is that layer, pre-built.
49
+
50
+ ---
51
+
52
+ ## How it works
53
+
54
+ ![How Armos works](assets/how-it-works.png)
55
+
56
+ **Detection runs entirely on your machine.** Presidio + spaCy analyse the text locally. No data is sent to any Armos server — there is no Armos server. The vault (token ↔ real value map) lives in your process memory, or optionally in your own Redis instance.
57
+
58
+ ---
59
+
60
+ ## Quickstart
61
+
62
+ ### Install
63
+
64
+ ```bash
65
+ pip install armos
66
+ ```
67
+
68
+ For Redis-backed persistence across requests:
69
+ ```bash
70
+ pip install "armos[redis]"
71
+ ```
72
+
73
+ > **Note:** Before first use, download the spaCy language model:
74
+ > ```bash
75
+ > python -m spacy download en_core_web_lg
76
+ > ```
77
+
78
+ ### OpenAI
79
+
80
+ ```python
81
+ # Before
82
+ from openai import OpenAI
83
+ client = OpenAI()
84
+
85
+ # After — one import added, one word changed
86
+ from openai import OpenAI
87
+ from armos import ArmosOpenAI
88
+
89
+ client = ArmosOpenAI(OpenAI())
90
+
91
+ # Everything else is identical
92
+ response = client.chat.completions.create(
93
+ model="gpt-4o",
94
+ messages=[{
95
+ "role": "user",
96
+ "content": "Summarise the case for patient John Smith, Aadhaar 2345 6789 0123"
97
+ }]
98
+ )
99
+
100
+ # Real values are restored in the response automatically
101
+ print(response.choices[0].message.content)
102
+ ```
103
+
104
+ ### Anthropic
105
+
106
+ ```python
107
+ from anthropic import Anthropic
108
+ from armos import ArmosAnthropic
109
+
110
+ client = ArmosAnthropic(Anthropic())
111
+
112
+ message = client.messages.create(
113
+ model="claude-sonnet-4-6",
114
+ max_tokens=1024,
115
+ messages=[{
116
+ "role": "user",
117
+ "content": "Patient John Smith, DOB 12/04/1982, PAN ABCDE1234F"
118
+ }]
119
+ )
120
+
121
+ print(message.content[0].text) # real values restored
122
+ ```
123
+
124
+ ### With Redis (persistent vault across requests)
125
+
126
+ ```python
127
+ # Token mappings survive across processes and requests
128
+ client = ArmosOpenAI(OpenAI(), store="redis://localhost:6379")
129
+ client = ArmosAnthropic(Anthropic(), store="redis://localhost:6379")
130
+
131
+ # Custom TTL (default: 24 hours)
132
+ client = ArmosOpenAI(OpenAI(), store="redis://localhost:6379", vault_ttl=3600)
133
+ ```
134
+
135
+ ### Standalone (any LLM or framework)
136
+
137
+ ```python
138
+ from armos import Armos
139
+
140
+ guard = Armos()
141
+
142
+ result = guard.mask("Patient John Smith, Aadhaar 2345 6789 0123, email john@hospital.com")
143
+ print(result.text)
144
+ # → "Patient [PII:NAME:a1b2c3d4], Aadhaar [PII:AADHAAR:b2c3d4e5], email [PII:EMAIL:e5f6g7h8]"
145
+
146
+ print(result.has_pii) # True
147
+
148
+ restored = guard.demask(result.text)
149
+ print(restored)
150
+ # → "Patient John Smith, Aadhaar 2345 6789 0123, email john@hospital.com"
151
+ ```
152
+
153
+ ---
154
+
155
+ ## What gets detected
156
+
157
+ | Entity | Token | Example |
158
+ |--------|-------|---------|
159
+ | Person name | `[PII:NAME:…]` | John Smith |
160
+ | Email address | `[PII:EMAIL:…]` | john@hospital.com |
161
+ | Phone number | `[PII:PHONE:…]` | +91 98765 43210 |
162
+ | Aadhaar number | `[PII:AADHAAR:…]` | 2345 6789 0123 |
163
+ | PAN card | `[PII:PAN:…]` | ABCDE1234F |
164
+ | Credit / debit card | `[PII:CARD:…]` | 4111 1111 1111 1111 |
165
+ | IP address | `[PII:IP:…]` | 192.168.1.100 |
166
+ | API keys & secrets | `[PII:APIKEY:…]` | sk-abc123… / AKIA… / ghp_… |
167
+
168
+ ---
169
+
170
+ ## Token design
171
+
172
+ Tokens are **deterministic** and **normalisation-aware**:
173
+
174
+ ```
175
+ "john smith" → [PII:NAME:a1b2c3d4] ← stored: "john smith"
176
+ "John Smith" → [PII:NAME:a1b2c3d4] ← same token, vault unchanged
177
+ "JOHN SMITH" → [PII:NAME:a1b2c3d4] ← same token, vault unchanged
178
+ ```
179
+
180
+ All casing variants of the same name map to one token. The LLM sees one consistent entity across a conversation — not three different people. De-masking restores the first-seen value.
181
+
182
+ ---
183
+
184
+ ## Vault options
185
+
186
+ | Option | Constructor | Use when |
187
+ |--------|---------|----------|
188
+ | In-memory | `Armos()` | Single request or single process |
189
+ | Redis | `Armos(store="redis://…")` | Multi-turn conversations, multiple workers, or across requests |
190
+
191
+ In-memory vault is zero configuration and the default. Redis vault persists token mappings so a token created in request 1 can be de-masked in request 5.
192
+
193
+ ---
194
+
195
+ ## v1 limitations
196
+
197
+ 1. **Streaming not supported** — `stream=True` passes through without masking. (v1.1)
198
+ 2. **Async clients not supported** — `AsyncOpenAI`, `AsyncAnthropic` pass through without masking. (v1.1)
199
+ 3. **OpenAI Responses API not intercepted** — `client.responses.create()` passes through. (v1.1)
200
+ 4. **Embeddings not masked** — `client.embeddings.create()` sends text as-is. (v1.1)
201
+ 5. **Indian name accuracy** — `en_core_web_lg` is trained on English text; Indian names have lower recall than Western names. Fine-tuning planned for v2.
202
+ 6. **Casing: first-seen wins** — De-masking always restores the first-seen casing of an entity. Use consistent casing in your prompts for exact restoration.
203
+ 7. **Token length** — `[PII:NAME:a1b2c3d4]` is 19 chars vs `John` (4 chars). Near context-window limits this may push content over. Rare in practice.
204
+
205
+ ---
206
+
207
+ ## Contributing
208
+
209
+ Armos is open source and MIT licensed. Issues and pull requests welcome.
210
+
211
+ ```bash
212
+ git clone https://github.com/armos-ai/armos
213
+ cd armos
214
+ pip install -e ".[dev,all]"
215
+ python -m spacy download en_core_web_lg
216
+ pytest tests/ -v
217
+ ```
218
+
219
+ ## License
220
+
221
+ MIT
armos-0.1.0/README.md ADDED
@@ -0,0 +1,195 @@
1
+ # Armos
2
+
3
+ **PII never reaches your LLM. One line of code.**
4
+
5
+ Armos wraps the OpenAI and Anthropic SDKs to automatically detect and mask personally identifiable information (PII) before it leaves your server — and restore the real values in the response. Your application code changes by exactly one word.
6
+
7
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
8
+ [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)
9
+
10
+ ---
11
+
12
+ ## The problem
13
+
14
+ Every time your application calls an LLM, it sends raw text to a third-party server. If a user's message contains their name, Aadhaar number, email, PAN card, or credit card — that data leaves your infrastructure.
15
+
16
+ This matters for:
17
+ - **Healthcare apps** — patient names, dates of birth, medical IDs
18
+ - **Fintech apps** — PAN, Aadhaar, bank details
19
+ - **Customer support tools** — names, emails, phone numbers, addresses
20
+ - **Any app** where users type free text that gets sent to OpenAI or Anthropic
21
+
22
+ Most teams know this is a risk. Few have time to build a proper masking layer before shipping. Armos is that layer, pre-built.
23
+
24
+ ---
25
+
26
+ ## How it works
27
+
28
+ ![How Armos works](assets/how-it-works.png)
29
+
30
+ **Detection runs entirely on your machine.** Presidio + spaCy analyse the text locally. No data is sent to any Armos server — there is no Armos server. The vault (token ↔ real value map) lives in your process memory, or optionally in your own Redis instance.
31
+
32
+ ---
33
+
34
+ ## Quickstart
35
+
36
+ ### Install
37
+
38
+ ```bash
39
+ pip install armos
40
+ ```
41
+
42
+ For Redis-backed persistence across requests:
43
+ ```bash
44
+ pip install "armos[redis]"
45
+ ```
46
+
47
+ > **Note:** Before first use, download the spaCy language model:
48
+ > ```bash
49
+ > python -m spacy download en_core_web_lg
50
+ > ```
51
+
52
+ ### OpenAI
53
+
54
+ ```python
55
+ # Before
56
+ from openai import OpenAI
57
+ client = OpenAI()
58
+
59
+ # After — one import added, one word changed
60
+ from openai import OpenAI
61
+ from armos import ArmosOpenAI
62
+
63
+ client = ArmosOpenAI(OpenAI())
64
+
65
+ # Everything else is identical
66
+ response = client.chat.completions.create(
67
+ model="gpt-4o",
68
+ messages=[{
69
+ "role": "user",
70
+ "content": "Summarise the case for patient John Smith, Aadhaar 2345 6789 0123"
71
+ }]
72
+ )
73
+
74
+ # Real values are restored in the response automatically
75
+ print(response.choices[0].message.content)
76
+ ```
77
+
78
+ ### Anthropic
79
+
80
+ ```python
81
+ from anthropic import Anthropic
82
+ from armos import ArmosAnthropic
83
+
84
+ client = ArmosAnthropic(Anthropic())
85
+
86
+ message = client.messages.create(
87
+ model="claude-sonnet-4-6",
88
+ max_tokens=1024,
89
+ messages=[{
90
+ "role": "user",
91
+ "content": "Patient John Smith, DOB 12/04/1982, PAN ABCDE1234F"
92
+ }]
93
+ )
94
+
95
+ print(message.content[0].text) # real values restored
96
+ ```
97
+
98
+ ### With Redis (persistent vault across requests)
99
+
100
+ ```python
101
+ # Token mappings survive across processes and requests
102
+ client = ArmosOpenAI(OpenAI(), store="redis://localhost:6379")
103
+ client = ArmosAnthropic(Anthropic(), store="redis://localhost:6379")
104
+
105
+ # Custom TTL (default: 24 hours)
106
+ client = ArmosOpenAI(OpenAI(), store="redis://localhost:6379", vault_ttl=3600)
107
+ ```
108
+
109
+ ### Standalone (any LLM or framework)
110
+
111
+ ```python
112
+ from armos import Armos
113
+
114
+ guard = Armos()
115
+
116
+ result = guard.mask("Patient John Smith, Aadhaar 2345 6789 0123, email john@hospital.com")
117
+ print(result.text)
118
+ # → "Patient [PII:NAME:a1b2c3d4], Aadhaar [PII:AADHAAR:b2c3d4e5], email [PII:EMAIL:e5f6g7h8]"
119
+
120
+ print(result.has_pii) # True
121
+
122
+ restored = guard.demask(result.text)
123
+ print(restored)
124
+ # → "Patient John Smith, Aadhaar 2345 6789 0123, email john@hospital.com"
125
+ ```
126
+
127
+ ---
128
+
129
+ ## What gets detected
130
+
131
+ | Entity | Token | Example |
132
+ |--------|-------|---------|
133
+ | Person name | `[PII:NAME:…]` | John Smith |
134
+ | Email address | `[PII:EMAIL:…]` | john@hospital.com |
135
+ | Phone number | `[PII:PHONE:…]` | +91 98765 43210 |
136
+ | Aadhaar number | `[PII:AADHAAR:…]` | 2345 6789 0123 |
137
+ | PAN card | `[PII:PAN:…]` | ABCDE1234F |
138
+ | Credit / debit card | `[PII:CARD:…]` | 4111 1111 1111 1111 |
139
+ | IP address | `[PII:IP:…]` | 192.168.1.100 |
140
+ | API keys & secrets | `[PII:APIKEY:…]` | sk-abc123… / AKIA… / ghp_… |
141
+
142
+ ---
143
+
144
+ ## Token design
145
+
146
+ Tokens are **deterministic** and **normalisation-aware**:
147
+
148
+ ```
149
+ "john smith" → [PII:NAME:a1b2c3d4] ← stored: "john smith"
150
+ "John Smith" → [PII:NAME:a1b2c3d4] ← same token, vault unchanged
151
+ "JOHN SMITH" → [PII:NAME:a1b2c3d4] ← same token, vault unchanged
152
+ ```
153
+
154
+ All casing variants of the same name map to one token. The LLM sees one consistent entity across a conversation — not three different people. De-masking restores the first-seen value.
155
+
156
+ ---
157
+
158
+ ## Vault options
159
+
160
+ | Option | Constructor | Use when |
161
+ |--------|---------|----------|
162
+ | In-memory | `Armos()` | Single request or single process |
163
+ | Redis | `Armos(store="redis://…")` | Multi-turn conversations, multiple workers, or across requests |
164
+
165
+ In-memory vault is zero configuration and the default. Redis vault persists token mappings so a token created in request 1 can be de-masked in request 5.
166
+
167
+ ---
168
+
169
+ ## v1 limitations
170
+
171
+ 1. **Streaming not supported** — `stream=True` passes through without masking. (v1.1)
172
+ 2. **Async clients not supported** — `AsyncOpenAI`, `AsyncAnthropic` pass through without masking. (v1.1)
173
+ 3. **OpenAI Responses API not intercepted** — `client.responses.create()` passes through. (v1.1)
174
+ 4. **Embeddings not masked** — `client.embeddings.create()` sends text as-is. (v1.1)
175
+ 5. **Indian name accuracy** — `en_core_web_lg` is trained on English text; Indian names have lower recall than Western names. Fine-tuning planned for v2.
176
+ 6. **Casing: first-seen wins** — De-masking always restores the first-seen casing of an entity. Use consistent casing in your prompts for exact restoration.
177
+ 7. **Token length** — `[PII:NAME:a1b2c3d4]` is 19 chars vs `John` (4 chars). Near context-window limits this may push content over. Rare in practice.
178
+
179
+ ---
180
+
181
+ ## Contributing
182
+
183
+ Armos is open source and MIT licensed. Issues and pull requests welcome.
184
+
185
+ ```bash
186
+ git clone https://github.com/armos-ai/armos
187
+ cd armos
188
+ pip install -e ".[dev,all]"
189
+ python -m spacy download en_core_web_lg
190
+ pytest tests/ -v
191
+ ```
192
+
193
+ ## License
194
+
195
+ MIT
Binary file
@@ -0,0 +1,32 @@
1
+ [project]
2
+ name = "armos"
3
+ version = "0.1.0"
4
+ description = "Automatic PII masking for OpenAI and Anthropic SDKs"
5
+ readme = "README.md"
6
+ requires-python = ">=3.9"
7
+ license = {text = "MIT"}
8
+ license-files = ["LICENSE"]
9
+
10
+ dependencies = [
11
+ "presidio-analyzer>=2.2.0",
12
+ "presidio-anonymizer>=2.2.0",
13
+ "spacy>=3.7.0",
14
+ ]
15
+
16
+ [project.optional-dependencies]
17
+ redis = ["redis>=5.0.0"]
18
+ openai = ["openai>=1.0.0"]
19
+ anthropic = ["anthropic>=0.20.0"]
20
+ all = ["redis>=5.0.0", "openai>=1.0.0", "anthropic>=0.20.0"]
21
+ dev = [
22
+ "pytest>=8.0.0",
23
+ "pytest-asyncio>=0.23.0",
24
+ "python-dotenv>=1.0.0",
25
+ ]
26
+
27
+ [tool.hatch.build.targets.wheel]
28
+ packages = ["src/armos"]
29
+
30
+ [build-system]
31
+ requires = ["hatchling"]
32
+ build-backend = "hatchling.build"
@@ -0,0 +1,29 @@
1
+ # SPDX-License-Identifier: MIT
2
+ """
3
+ Armos — Automatic PII masking for OpenAI and Anthropic SDKs.
4
+
5
+ One line change. PII never reaches your LLM provider.
6
+
7
+ Quick start:
8
+ from openai import OpenAI
9
+ from armos import ArmosOpenAI
10
+
11
+ client = ArmosOpenAI(OpenAI())
12
+ # Use exactly as you would the normal OpenAI client.
13
+ # PII is masked automatically before every request.
14
+ # Real values are restored automatically in every response.
15
+ """
16
+
17
+ from .guard import Armos
18
+ from .wrappers.openai import ArmosOpenAI
19
+ from .wrappers.anthropic import ArmosAnthropic
20
+ from .models import MaskResult, DetectedEntity
21
+
22
+ __version__ = "0.1.0"
23
+ __all__ = [
24
+ "Armos",
25
+ "ArmosOpenAI",
26
+ "ArmosAnthropic",
27
+ "MaskResult",
28
+ "DetectedEntity",
29
+ ]
@@ -0,0 +1 @@
1
+ # SPDX-License-Identifier: MIT
@@ -0,0 +1,99 @@
1
+ # SPDX-License-Identifier: MIT
2
+ from typing import List
3
+ from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
4
+ from presidio_analyzer.nlp_engine import NlpEngineProvider
5
+
6
+ from ..models import DetectedEntity
7
+ from .recognisers.aadhaar import AadhaarRecogniser
8
+ from .recognisers.pan import PANRecogniser
9
+ from .recognisers.standard import APIKeyRecogniser
10
+
11
+
12
# Single source of truth: (analyzer entity label, short code) pairs.
# The short code is what ends up in DetectedEntity.entity_type.
_ENTITY_TABLE = (
    ("PERSON", "NAME"),
    ("EMAIL_ADDRESS", "EMAIL"),
    ("PHONE_NUMBER", "PHONE"),
    ("AADHAAR_NUMBER", "AADHAAR"),
    ("IN_PAN", "PAN"),
    ("CREDIT_CARD", "CARD"),
    ("IP_ADDRESS", "IP"),
    ("API_KEY", "APIKEY"),
)

# Entity labels requested from the analyzer, in table order.
ENTITY_TYPES = [label for label, _ in _ENTITY_TABLE]

# Lookup from analyzer entity label to its short code.
ENTITY_SHORT_CODES = dict(_ENTITY_TABLE)
33
+
34
+
35
class DetectionEngine:
    """
    Orchestrates all PII recognisers.
    Detection runs entirely locally — no text is sent anywhere.
    """

    def __init__(self):
        # Built once per engine and reused by every detect() call.
        self._analyzer = self._build_analyzer()

    def _build_analyzer(self) -> "AnalyzerEngine":
        """
        Build the Presidio analyzer used by this engine.

        Creates a spaCy NLP engine configured for the ``en_core_web_lg``
        model, loads Presidio's predefined recognisers into a fresh registry,
        then registers the package's Aadhaar, PAN and API-key recognisers.

        Returns:
            An ``AnalyzerEngine`` wired to that NLP engine and registry.
        """
        configuration = {
            "nlp_engine_name": "spacy",
            "models": [{"lang_code": "en", "model_name": "en_core_web_lg"}],
        }
        provider = NlpEngineProvider(nlp_configuration=configuration)
        nlp_engine = provider.create_engine()

        registry = RecognizerRegistry()
        registry.load_predefined_recognizers(nlp_engine=nlp_engine)

        registry.add_recognizer(AadhaarRecogniser())
        registry.add_recognizer(PANRecogniser())
        registry.add_recognizer(APIKeyRecogniser())

        return AnalyzerEngine(nlp_engine=nlp_engine, registry=registry)

    def detect(self, text: str, language: str = "en") -> List["DetectedEntity"]:
        """
        Detect all PII in text. Returns entities sorted by position.

        Args:
            text: The text to scan. Empty or whitespace-only input yields
                an empty list without invoking the analyzer.
            language: Language code forwarded to the analyzer.

        Returns:
            Non-overlapping entities ordered by start offset. Each entity's
            type is the short code from ``ENTITY_SHORT_CODES`` (falling back
            to the analyzer's own label when no short code is defined).
        """
        if not text or not text.strip():
            return []

        results = self._analyzer.analyze(
            text=text,
            entities=ENTITY_TYPES,
            language=language,
        )

        entities = [
            DetectedEntity(
                entity_type=ENTITY_SHORT_CODES.get(r.entity_type, r.entity_type),
                text=text[r.start:r.end],
                start=r.start,
                end=r.end,
                score=r.score,
            )
            for r in results
        ]

        entities.sort(key=lambda e: e.start)
        return self._resolve_overlaps(entities)

    def _resolve_overlaps(self, entities: List["DetectedEntity"]) -> List["DetectedEntity"]:
        """
        Remove overlapping detections. Higher confidence wins.

        When two overlapping detections have the same confidence, the longer
        span wins so that a shorter match (e.g. a first name) cannot displace
        a longer one (the full name) and leave part of the value unmasked.
        If both confidence and length are equal, the earlier detection stays.

        Args:
            entities: Detections already sorted by ``start``.

        Returns:
            A list in which no two detections overlap.
        """
        if not entities:
            return entities

        resolved = [entities[0]]
        for current in entities[1:]:
            previous = resolved[-1]
            if current.start >= previous.end:
                # No overlap with the last kept detection.
                resolved.append(current)
                continue

            # Rank by confidence first, then by span length as the tie-break.
            current_rank = (current.score, current.end - current.start)
            previous_rank = (previous.score, previous.end - previous.start)
            if current_rank > previous_rank:
                resolved[-1] = current
        return resolved
@@ -0,0 +1 @@
1
+ # SPDX-License-Identifier: MIT
@@ -0,0 +1,45 @@
1
+ # SPDX-License-Identifier: MIT
2
+ from presidio_analyzer import Pattern, PatternRecognizer
3
+
4
+
5
class AadhaarRecogniser(PatternRecognizer):
    """
    Pattern-based recogniser for Indian Aadhaar numbers.

    A match is 12 digits whose leading digit is 2-9, in one of three layouts:
        2345 6789 0123   groups of four split by whitespace (score 0.95)
        2345-6789-0123   groups of four split by hyphens (score 0.95)
        234567890123     unbroken digits (score 0.6, lower confidence)
    """

    # One Pattern per supported layout: (pattern name, regex, base score).
    PATTERNS = [
        Pattern(name=label, regex=expression, score=confidence)
        for label, expression, confidence in (
            ("aadhaar_spaces", r"\b[2-9]\d{3}\s\d{4}\s\d{4}\b", 0.95),
            ("aadhaar_hyphens", r"\b[2-9]\d{3}-\d{4}-\d{4}\b", 0.95),
            ("aadhaar_plain", r"\b[2-9]\d{11}\b", 0.6),
        )
    ]

    # Words handed to the base recogniser as its context list.
    CONTEXT = [
        "aadhaar",
        "aadhar",
        "uid",
        "uidai",
        "unique identification",
        "biometric id",
        "aadhaaar",
    ]

    def __init__(self):
        """Register the patterns and context words under AADHAAR_NUMBER for English."""
        recogniser_options = {
            "supported_entity": "AADHAAR_NUMBER",
            "patterns": self.PATTERNS,
            "context": self.CONTEXT,
            "supported_language": "en",
        }
        super().__init__(**recogniser_options)