armos 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- armos-0.1.0/.claude/settings.local.json +8 -0
- armos-0.1.0/.gitignore +16 -0
- armos-0.1.0/LICENSE +21 -0
- armos-0.1.0/PKG-INFO +221 -0
- armos-0.1.0/README.md +195 -0
- armos-0.1.0/assets/how-it-works.png +0 -0
- armos-0.1.0/pyproject.toml +32 -0
- armos-0.1.0/src/armos/__init__.py +29 -0
- armos-0.1.0/src/armos/detection/__init__.py +1 -0
- armos-0.1.0/src/armos/detection/engine.py +99 -0
- armos-0.1.0/src/armos/detection/recognisers/__init__.py +1 -0
- armos-0.1.0/src/armos/detection/recognisers/aadhaar.py +45 -0
- armos-0.1.0/src/armos/detection/recognisers/pan.py +37 -0
- armos-0.1.0/src/armos/detection/recognisers/standard.py +31 -0
- armos-0.1.0/src/armos/guard.py +38 -0
- armos-0.1.0/src/armos/masking/__init__.py +1 -0
- armos-0.1.0/src/armos/masking/tokenizer.py +54 -0
- armos-0.1.0/src/armos/masking/vault/__init__.py +28 -0
- armos-0.1.0/src/armos/masking/vault/base.py +39 -0
- armos-0.1.0/src/armos/masking/vault/memory.py +35 -0
- armos-0.1.0/src/armos/masking/vault/redis.py +65 -0
- armos-0.1.0/src/armos/models.py +24 -0
- armos-0.1.0/src/armos/wrappers/__init__.py +1 -0
- armos-0.1.0/src/armos/wrappers/anthropic.py +82 -0
- armos-0.1.0/src/armos/wrappers/openai.py +100 -0
- armos-0.1.0/tests/__init__.py +0 -0
- armos-0.1.0/tests/test_detection.py +60 -0
- armos-0.1.0/tests/test_guard.py +82 -0
- armos-0.1.0/tests/test_masking.py +51 -0
- armos-0.1.0/tests/test_vault.py +57 -0
- armos-0.1.0/tests/test_wrapper_anthropic.py +57 -0
- armos-0.1.0/tests/test_wrapper_openai.py +77 -0
armos-0.1.0/.gitignore
ADDED
armos-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Armos
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
armos-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: armos
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Automatic PII masking for OpenAI and Anthropic SDKs
|
|
5
|
+
License: MIT
|
|
6
|
+
License-File: LICENSE
|
|
7
|
+
Requires-Python: >=3.9
|
|
8
|
+
Requires-Dist: presidio-analyzer>=2.2.0
|
|
9
|
+
Requires-Dist: presidio-anonymizer>=2.2.0
|
|
10
|
+
Requires-Dist: spacy>=3.7.0
|
|
11
|
+
Provides-Extra: all
|
|
12
|
+
Requires-Dist: anthropic>=0.20.0; extra == 'all'
|
|
13
|
+
Requires-Dist: openai>=1.0.0; extra == 'all'
|
|
14
|
+
Requires-Dist: redis>=5.0.0; extra == 'all'
|
|
15
|
+
Provides-Extra: anthropic
|
|
16
|
+
Requires-Dist: anthropic>=0.20.0; extra == 'anthropic'
|
|
17
|
+
Provides-Extra: dev
|
|
18
|
+
Requires-Dist: pytest-asyncio>=0.23.0; extra == 'dev'
|
|
19
|
+
Requires-Dist: pytest>=8.0.0; extra == 'dev'
|
|
20
|
+
Requires-Dist: python-dotenv>=1.0.0; extra == 'dev'
|
|
21
|
+
Provides-Extra: openai
|
|
22
|
+
Requires-Dist: openai>=1.0.0; extra == 'openai'
|
|
23
|
+
Provides-Extra: redis
|
|
24
|
+
Requires-Dist: redis>=5.0.0; extra == 'redis'
|
|
25
|
+
Description-Content-Type: text/markdown
|
|
26
|
+
|
|
27
|
+
# Armos
|
|
28
|
+
|
|
29
|
+
**PII never reaches your LLM. One line of code.**
|
|
30
|
+
|
|
31
|
+
Armos wraps the OpenAI and Anthropic SDKs to automatically detect and mask personally identifiable information (PII) before it leaves your server — and restore the real values in the response. Your application code changes by exactly one word.
|
|
32
|
+
|
|
33
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
34
|
+
[Python 3.9+](https://www.python.org/downloads/)
|
|
35
|
+
|
|
36
|
+
---
|
|
37
|
+
|
|
38
|
+
## The problem
|
|
39
|
+
|
|
40
|
+
Every time your application calls an LLM, it sends raw text to a third-party server. If a user's message contains their name, Aadhaar number, email, PAN card, or credit card — that data leaves your infrastructure.
|
|
41
|
+
|
|
42
|
+
This matters for:
|
|
43
|
+
- **Healthcare apps** — patient names, dates of birth, medical IDs
|
|
44
|
+
- **Fintech apps** — PAN, Aadhaar, bank details
|
|
45
|
+
- **Customer support tools** — names, emails, phone numbers, addresses
|
|
46
|
+
- **Any app** where users type free text that gets sent to OpenAI or Anthropic
|
|
47
|
+
|
|
48
|
+
Most teams know this is a risk. Few have time to build a proper masking layer before shipping. Armos is that layer, pre-built.
|
|
49
|
+
|
|
50
|
+
---
|
|
51
|
+
|
|
52
|
+
## How it works
|
|
53
|
+
|
|
54
|
+

|
|
55
|
+
|
|
56
|
+
**Detection runs entirely on your machine.** Presidio + spaCy analyse the text locally. No data is sent to any Armos server — there is no Armos server. The vault (token ↔ real value map) lives in your process memory, or optionally in your own Redis instance.
|
|
57
|
+
|
|
58
|
+
---
|
|
59
|
+
|
|
60
|
+
## Quickstart
|
|
61
|
+
|
|
62
|
+
### Install
|
|
63
|
+
|
|
64
|
+
```bash
|
|
65
|
+
pip install armos
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
For Redis-backed persistence across requests:
|
|
69
|
+
```bash
|
|
70
|
+
pip install armos[redis]
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
> **Note:** On first use, download the spaCy language model:
|
|
74
|
+
> ```bash
|
|
75
|
+
> python -m spacy download en_core_web_lg
|
|
76
|
+
> ```
|
|
77
|
+
|
|
78
|
+
### OpenAI
|
|
79
|
+
|
|
80
|
+
```python
|
|
81
|
+
# Before
|
|
82
|
+
from openai import OpenAI
|
|
83
|
+
client = OpenAI()
|
|
84
|
+
|
|
85
|
+
# After — one import added, one word changed
|
|
86
|
+
from openai import OpenAI
|
|
87
|
+
from armos import ArmosOpenAI
|
|
88
|
+
|
|
89
|
+
client = ArmosOpenAI(OpenAI())
|
|
90
|
+
|
|
91
|
+
# Everything else is identical
|
|
92
|
+
response = client.chat.completions.create(
|
|
93
|
+
model="gpt-4o",
|
|
94
|
+
messages=[{
|
|
95
|
+
"role": "user",
|
|
96
|
+
"content": "Summarise the case for patient John Smith, Aadhaar 2345 6789 0123"
|
|
97
|
+
}]
|
|
98
|
+
)
|
|
99
|
+
|
|
100
|
+
# Real values are restored in the response automatically
|
|
101
|
+
print(response.choices[0].message.content)
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
### Anthropic
|
|
105
|
+
|
|
106
|
+
```python
|
|
107
|
+
from anthropic import Anthropic
|
|
108
|
+
from armos import ArmosAnthropic
|
|
109
|
+
|
|
110
|
+
client = ArmosAnthropic(Anthropic())
|
|
111
|
+
|
|
112
|
+
message = client.messages.create(
|
|
113
|
+
model="claude-sonnet-4-6",
|
|
114
|
+
max_tokens=1024,
|
|
115
|
+
messages=[{
|
|
116
|
+
"role": "user",
|
|
117
|
+
"content": "Patient John Smith, DOB 12/04/1982, PAN ABCDE1234F"
|
|
118
|
+
}]
|
|
119
|
+
)
|
|
120
|
+
|
|
121
|
+
print(message.content[0].text) # real values restored
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
### With Redis (persistent vault across requests)
|
|
125
|
+
|
|
126
|
+
```python
|
|
127
|
+
# Token mappings survive across processes and requests
|
|
128
|
+
client = ArmosOpenAI(OpenAI(), store="redis://localhost:6379")
|
|
129
|
+
client = ArmosAnthropic(Anthropic(), store="redis://localhost:6379")
|
|
130
|
+
|
|
131
|
+
# Custom TTL (default: 24 hours)
|
|
132
|
+
client = ArmosOpenAI(OpenAI(), store="redis://localhost:6379", vault_ttl=3600)
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
### Standalone (any LLM or framework)
|
|
136
|
+
|
|
137
|
+
```python
|
|
138
|
+
from armos import Armos
|
|
139
|
+
|
|
140
|
+
guard = Armos()
|
|
141
|
+
|
|
142
|
+
result = guard.mask("Patient John Smith, Aadhaar 2345 6789 0123, email john@hospital.com")
|
|
143
|
+
print(result.text)
|
|
144
|
+
# → "Patient [PII:NAME:a1b2c3d4], Aadhaar [PII:AADHAAR:b2c3d4e5], email [PII:EMAIL:e5f6g7h8]"
|
|
145
|
+
|
|
146
|
+
print(result.has_pii) # True
|
|
147
|
+
|
|
148
|
+
restored = guard.demask(result.text)
|
|
149
|
+
print(restored)
|
|
150
|
+
# → "Patient John Smith, Aadhaar 2345 6789 0123, email john@hospital.com"
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
---
|
|
154
|
+
|
|
155
|
+
## What gets detected
|
|
156
|
+
|
|
157
|
+
| Entity | Token | Example |
|
|
158
|
+
|--------|-------|---------|
|
|
159
|
+
| Person name | `[PII:NAME:…]` | John Smith |
|
|
160
|
+
| Email address | `[PII:EMAIL:…]` | john@hospital.com |
|
|
161
|
+
| Phone number | `[PII:PHONE:…]` | +91 98765 43210 |
|
|
162
|
+
| Aadhaar number | `[PII:AADHAAR:…]` | 2345 6789 0123 |
|
|
163
|
+
| PAN card | `[PII:PAN:…]` | ABCDE1234F |
|
|
164
|
+
| Credit / debit card | `[PII:CARD:…]` | 4111 1111 1111 1111 |
|
|
165
|
+
| IP address | `[PII:IP:…]` | 192.168.1.100 |
|
|
166
|
+
| API keys & secrets | `[PII:APIKEY:…]` | sk-abc123… / AKIA… / ghp_… |
|
|
167
|
+
|
|
168
|
+
---
|
|
169
|
+
|
|
170
|
+
## Token design
|
|
171
|
+
|
|
172
|
+
Tokens are **deterministic** and **normalisation-aware**:
|
|
173
|
+
|
|
174
|
+
```
|
|
175
|
+
"john smith" → [PII:NAME:a1b2c3d4] ← stored: "john smith"
|
|
176
|
+
"John Smith" → [PII:NAME:a1b2c3d4] ← same token, vault unchanged
|
|
177
|
+
"JOHN SMITH" → [PII:NAME:a1b2c3d4] ← same token, vault unchanged
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
All casing variants of the same name map to one token. The LLM sees one consistent entity across a conversation — not three different people. De-masking restores the first-seen value.
|
|
181
|
+
|
|
182
|
+
---
|
|
183
|
+
|
|
184
|
+
## Vault options
|
|
185
|
+
|
|
186
|
+
| Option | How to enable | Use when |
|
|
187
|
+
|--------|---------|----------|
|
|
188
|
+
| In-memory | `Armos()` | Single request or single process |
|
|
189
|
+
| Redis | `Armos(store="redis://…")` | Multi-turn conversations, multiple workers, or across requests |
|
|
190
|
+
|
|
191
|
+
In-memory vault is zero configuration and the default. Redis vault persists token mappings so a token created in request 1 can be de-masked in request 5.
|
|
192
|
+
|
|
193
|
+
---
|
|
194
|
+
|
|
195
|
+
## v1 limitations
|
|
196
|
+
|
|
197
|
+
1. **Streaming not supported** — `stream=True` passes through without masking. (v1.1)
|
|
198
|
+
2. **Async clients not supported** — `AsyncOpenAI`, `AsyncAnthropic` pass through without masking. (v1.1)
|
|
199
|
+
3. **OpenAI Responses API not intercepted** — `client.responses.create()` passes through. (v1.1)
|
|
200
|
+
4. **Embeddings not masked** — `client.embeddings.create()` sends text as-is. (v1.1)
|
|
201
|
+
5. **Indian name accuracy** — `en_core_web_lg` is trained on English text; Indian names have lower recall than Western names. Fine-tuning planned for v2.
|
|
202
|
+
6. **Casing: first-seen wins** — De-masking always restores the first-seen casing of an entity. Use consistent casing in your prompts for exact restoration.
|
|
203
|
+
7. **Token length** — `[PII:NAME:a1b2c3d4]` is 19 chars vs `John` (4 chars). Near context-window limits, the extra length may push content over the limit. Rare in practice.
|
|
204
|
+
|
|
205
|
+
---
|
|
206
|
+
|
|
207
|
+
## Contributing
|
|
208
|
+
|
|
209
|
+
Armos is open source and MIT licensed. Issues and pull requests welcome.
|
|
210
|
+
|
|
211
|
+
```bash
|
|
212
|
+
git clone https://github.com/armos-ai/armos
|
|
213
|
+
cd armos
|
|
214
|
+
pip install -e ".[dev,all]"
|
|
215
|
+
python -m spacy download en_core_web_lg
|
|
216
|
+
pytest tests/ -v
|
|
217
|
+
```
|
|
218
|
+
|
|
219
|
+
## License
|
|
220
|
+
|
|
221
|
+
MIT
|
armos-0.1.0/README.md
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
# Armos
|
|
2
|
+
|
|
3
|
+
**PII never reaches your LLM. One line of code.**
|
|
4
|
+
|
|
5
|
+
Armos wraps the OpenAI and Anthropic SDKs to automatically detect and mask personally identifiable information (PII) before it leaves your server — and restore the real values in the response. Your application code changes by exactly one word.
|
|
6
|
+
|
|
7
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
8
|
+
[Python 3.9+](https://www.python.org/downloads/)
|
|
9
|
+
|
|
10
|
+
---
|
|
11
|
+
|
|
12
|
+
## The problem
|
|
13
|
+
|
|
14
|
+
Every time your application calls an LLM, it sends raw text to a third-party server. If a user's message contains their name, Aadhaar number, email, PAN card, or credit card — that data leaves your infrastructure.
|
|
15
|
+
|
|
16
|
+
This matters for:
|
|
17
|
+
- **Healthcare apps** — patient names, dates of birth, medical IDs
|
|
18
|
+
- **Fintech apps** — PAN, Aadhaar, bank details
|
|
19
|
+
- **Customer support tools** — names, emails, phone numbers, addresses
|
|
20
|
+
- **Any app** where users type free text that gets sent to OpenAI or Anthropic
|
|
21
|
+
|
|
22
|
+
Most teams know this is a risk. Few have time to build a proper masking layer before shipping. Armos is that layer, pre-built.
|
|
23
|
+
|
|
24
|
+
---
|
|
25
|
+
|
|
26
|
+
## How it works
|
|
27
|
+
|
|
28
|
+

|
|
29
|
+
|
|
30
|
+
**Detection runs entirely on your machine.** Presidio + spaCy analyse the text locally. No data is sent to any Armos server — there is no Armos server. The vault (token ↔ real value map) lives in your process memory, or optionally in your own Redis instance.
|
|
31
|
+
|
|
32
|
+
---
|
|
33
|
+
|
|
34
|
+
## Quickstart
|
|
35
|
+
|
|
36
|
+
### Install
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
pip install armos
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
For Redis-backed persistence across requests:
|
|
43
|
+
```bash
|
|
44
|
+
pip install armos[redis]
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
> **Note:** On first use, download the spaCy language model:
|
|
48
|
+
> ```bash
|
|
49
|
+
> python -m spacy download en_core_web_lg
|
|
50
|
+
> ```
|
|
51
|
+
|
|
52
|
+
### OpenAI
|
|
53
|
+
|
|
54
|
+
```python
|
|
55
|
+
# Before
|
|
56
|
+
from openai import OpenAI
|
|
57
|
+
client = OpenAI()
|
|
58
|
+
|
|
59
|
+
# After — one import added, one word changed
|
|
60
|
+
from openai import OpenAI
|
|
61
|
+
from armos import ArmosOpenAI
|
|
62
|
+
|
|
63
|
+
client = ArmosOpenAI(OpenAI())
|
|
64
|
+
|
|
65
|
+
# Everything else is identical
|
|
66
|
+
response = client.chat.completions.create(
|
|
67
|
+
model="gpt-4o",
|
|
68
|
+
messages=[{
|
|
69
|
+
"role": "user",
|
|
70
|
+
"content": "Summarise the case for patient John Smith, Aadhaar 2345 6789 0123"
|
|
71
|
+
}]
|
|
72
|
+
)
|
|
73
|
+
|
|
74
|
+
# Real values are restored in the response automatically
|
|
75
|
+
print(response.choices[0].message.content)
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
### Anthropic
|
|
79
|
+
|
|
80
|
+
```python
|
|
81
|
+
from anthropic import Anthropic
|
|
82
|
+
from armos import ArmosAnthropic
|
|
83
|
+
|
|
84
|
+
client = ArmosAnthropic(Anthropic())
|
|
85
|
+
|
|
86
|
+
message = client.messages.create(
|
|
87
|
+
model="claude-sonnet-4-6",
|
|
88
|
+
max_tokens=1024,
|
|
89
|
+
messages=[{
|
|
90
|
+
"role": "user",
|
|
91
|
+
"content": "Patient John Smith, DOB 12/04/1982, PAN ABCDE1234F"
|
|
92
|
+
}]
|
|
93
|
+
)
|
|
94
|
+
|
|
95
|
+
print(message.content[0].text) # real values restored
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
### With Redis (persistent vault across requests)
|
|
99
|
+
|
|
100
|
+
```python
|
|
101
|
+
# Token mappings survive across processes and requests
|
|
102
|
+
client = ArmosOpenAI(OpenAI(), store="redis://localhost:6379")
|
|
103
|
+
client = ArmosAnthropic(Anthropic(), store="redis://localhost:6379")
|
|
104
|
+
|
|
105
|
+
# Custom TTL (default: 24 hours)
|
|
106
|
+
client = ArmosOpenAI(OpenAI(), store="redis://localhost:6379", vault_ttl=3600)
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
### Standalone (any LLM or framework)
|
|
110
|
+
|
|
111
|
+
```python
|
|
112
|
+
from armos import Armos
|
|
113
|
+
|
|
114
|
+
guard = Armos()
|
|
115
|
+
|
|
116
|
+
result = guard.mask("Patient John Smith, Aadhaar 2345 6789 0123, email john@hospital.com")
|
|
117
|
+
print(result.text)
|
|
118
|
+
# → "Patient [PII:NAME:a1b2c3d4], Aadhaar [PII:AADHAAR:b2c3d4e5], email [PII:EMAIL:e5f6g7h8]"
|
|
119
|
+
|
|
120
|
+
print(result.has_pii) # True
|
|
121
|
+
|
|
122
|
+
restored = guard.demask(result.text)
|
|
123
|
+
print(restored)
|
|
124
|
+
# → "Patient John Smith, Aadhaar 2345 6789 0123, email john@hospital.com"
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
---
|
|
128
|
+
|
|
129
|
+
## What gets detected
|
|
130
|
+
|
|
131
|
+
| Entity | Token | Example |
|
|
132
|
+
|--------|-------|---------|
|
|
133
|
+
| Person name | `[PII:NAME:…]` | John Smith |
|
|
134
|
+
| Email address | `[PII:EMAIL:…]` | john@hospital.com |
|
|
135
|
+
| Phone number | `[PII:PHONE:…]` | +91 98765 43210 |
|
|
136
|
+
| Aadhaar number | `[PII:AADHAAR:…]` | 2345 6789 0123 |
|
|
137
|
+
| PAN card | `[PII:PAN:…]` | ABCDE1234F |
|
|
138
|
+
| Credit / debit card | `[PII:CARD:…]` | 4111 1111 1111 1111 |
|
|
139
|
+
| IP address | `[PII:IP:…]` | 192.168.1.100 |
|
|
140
|
+
| API keys & secrets | `[PII:APIKEY:…]` | sk-abc123… / AKIA… / ghp_… |
|
|
141
|
+
|
|
142
|
+
---
|
|
143
|
+
|
|
144
|
+
## Token design
|
|
145
|
+
|
|
146
|
+
Tokens are **deterministic** and **normalisation-aware**:
|
|
147
|
+
|
|
148
|
+
```
|
|
149
|
+
"john smith" → [PII:NAME:a1b2c3d4] ← stored: "john smith"
|
|
150
|
+
"John Smith" → [PII:NAME:a1b2c3d4] ← same token, vault unchanged
|
|
151
|
+
"JOHN SMITH" → [PII:NAME:a1b2c3d4] ← same token, vault unchanged
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
All casing variants of the same name map to one token. The LLM sees one consistent entity across a conversation — not three different people. De-masking restores the first-seen value.
|
|
155
|
+
|
|
156
|
+
---
|
|
157
|
+
|
|
158
|
+
## Vault options
|
|
159
|
+
|
|
160
|
+
| Option | How to enable | Use when |
|
|
161
|
+
|--------|---------|----------|
|
|
162
|
+
| In-memory | `Armos()` | Single request or single process |
|
|
163
|
+
| Redis | `Armos(store="redis://…")` | Multi-turn conversations, multiple workers, or across requests |
|
|
164
|
+
|
|
165
|
+
In-memory vault is zero configuration and the default. Redis vault persists token mappings so a token created in request 1 can be de-masked in request 5.
|
|
166
|
+
|
|
167
|
+
---
|
|
168
|
+
|
|
169
|
+
## v1 limitations
|
|
170
|
+
|
|
171
|
+
1. **Streaming not supported** — `stream=True` passes through without masking. (v1.1)
|
|
172
|
+
2. **Async clients not supported** — `AsyncOpenAI`, `AsyncAnthropic` pass through without masking. (v1.1)
|
|
173
|
+
3. **OpenAI Responses API not intercepted** — `client.responses.create()` passes through. (v1.1)
|
|
174
|
+
4. **Embeddings not masked** — `client.embeddings.create()` sends text as-is. (v1.1)
|
|
175
|
+
5. **Indian name accuracy** — `en_core_web_lg` is trained on English text; Indian names have lower recall than Western names. Fine-tuning planned for v2.
|
|
176
|
+
6. **Casing: first-seen wins** — De-masking always restores the first-seen casing of an entity. Use consistent casing in your prompts for exact restoration.
|
|
177
|
+
7. **Token length** — `[PII:NAME:a1b2c3d4]` is 19 chars vs `John` (4 chars). Near context-window limits, the extra length may push content over the limit. Rare in practice.
|
|
178
|
+
|
|
179
|
+
---
|
|
180
|
+
|
|
181
|
+
## Contributing
|
|
182
|
+
|
|
183
|
+
Armos is open source and MIT licensed. Issues and pull requests welcome.
|
|
184
|
+
|
|
185
|
+
```bash
|
|
186
|
+
git clone https://github.com/armos-ai/armos
|
|
187
|
+
cd armos
|
|
188
|
+
pip install -e ".[dev,all]"
|
|
189
|
+
python -m spacy download en_core_web_lg
|
|
190
|
+
pytest tests/ -v
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
## License
|
|
194
|
+
|
|
195
|
+
MIT
|
|
Binary file
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "armos"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Automatic PII masking for OpenAI and Anthropic SDKs"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.9"
|
|
7
|
+
license = {text = "MIT"}
|
|
8
|
+
license-files = ["LICENSE"]
|
|
9
|
+
|
|
10
|
+
dependencies = [
|
|
11
|
+
"presidio-analyzer>=2.2.0",
|
|
12
|
+
"presidio-anonymizer>=2.2.0",
|
|
13
|
+
"spacy>=3.7.0",
|
|
14
|
+
]
|
|
15
|
+
|
|
16
|
+
[project.optional-dependencies]
|
|
17
|
+
redis = ["redis>=5.0.0"]
|
|
18
|
+
openai = ["openai>=1.0.0"]
|
|
19
|
+
anthropic = ["anthropic>=0.20.0"]
|
|
20
|
+
all = ["redis>=5.0.0", "openai>=1.0.0", "anthropic>=0.20.0"]
|
|
21
|
+
dev = [
|
|
22
|
+
"pytest>=8.0.0",
|
|
23
|
+
"pytest-asyncio>=0.23.0",
|
|
24
|
+
"python-dotenv>=1.0.0",
|
|
25
|
+
]
|
|
26
|
+
|
|
27
|
+
[tool.hatch.build.targets.wheel]
|
|
28
|
+
packages = ["src/armos"]
|
|
29
|
+
|
|
30
|
+
[build-system]
|
|
31
|
+
requires = ["hatchling"]
|
|
32
|
+
build-backend = "hatchling.build"
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# SPDX-License-Identifier: MIT
|
|
2
|
+
"""
|
|
3
|
+
Armos — Automatic PII masking for OpenAI and Anthropic SDKs.
|
|
4
|
+
|
|
5
|
+
One line change. PII never reaches your LLM provider.
|
|
6
|
+
|
|
7
|
+
Quick start:
|
|
8
|
+
from openai import OpenAI
|
|
9
|
+
from armos import ArmosOpenAI
|
|
10
|
+
|
|
11
|
+
client = ArmosOpenAI(OpenAI())
|
|
12
|
+
# Use exactly as you would the normal OpenAI client.
|
|
13
|
+
# PII is masked automatically before every request.
|
|
14
|
+
# Real values are restored automatically in every response.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from .guard import Armos
|
|
18
|
+
from .wrappers.openai import ArmosOpenAI
|
|
19
|
+
from .wrappers.anthropic import ArmosAnthropic
|
|
20
|
+
from .models import MaskResult, DetectedEntity
|
|
21
|
+
|
|
22
|
+
__version__ = "0.1.0"
|
|
23
|
+
__all__ = [
|
|
24
|
+
"Armos",
|
|
25
|
+
"ArmosOpenAI",
|
|
26
|
+
"ArmosAnthropic",
|
|
27
|
+
"MaskResult",
|
|
28
|
+
"DetectedEntity",
|
|
29
|
+
]
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# SPDX-License-Identifier: MIT
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
# SPDX-License-Identifier: MIT
|
|
2
|
+
from typing import List
|
|
3
|
+
from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
|
|
4
|
+
from presidio_analyzer.nlp_engine import NlpEngineProvider
|
|
5
|
+
|
|
6
|
+
from ..models import DetectedEntity
|
|
7
|
+
from .recognisers.aadhaar import AadhaarRecogniser
|
|
8
|
+
from .recognisers.pan import PANRecogniser
|
|
9
|
+
from .recognisers.standard import APIKeyRecogniser
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
# Analyzer entity type -> short code reported on DetectedEntity.entity_type.
# Insertion order matters: ENTITY_TYPES below is derived from it.
ENTITY_SHORT_CODES = {
    "PERSON": "NAME",
    "EMAIL_ADDRESS": "EMAIL",
    "PHONE_NUMBER": "PHONE",
    "AADHAAR_NUMBER": "AADHAAR",
    "IN_PAN": "PAN",
    "CREDIT_CARD": "CARD",
    "IP_ADDRESS": "IP",
    "API_KEY": "APIKEY",
}

# Entity types requested from the analyzer on every detect() call.
ENTITY_TYPES = list(ENTITY_SHORT_CODES)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class DetectionEngine:
    """
    Orchestrates all PII recognisers.

    Wraps a Presidio ``AnalyzerEngine`` whose registry holds Presidio's
    predefined recognisers plus the Aadhaar, PAN and API-key recognisers
    of this package.
    Detection runs entirely locally — no text is sent anywhere.
    """

    def __init__(self):
        # Built once per engine instance and reused by every detect() call.
        self._analyzer = self._build_analyzer()

    def _build_analyzer(self) -> "AnalyzerEngine":
        """Build the analyzer: spaCy NLP engine plus the recogniser registry."""
        configuration = {
            "nlp_engine_name": "spacy",
            "models": [{"lang_code": "en", "model_name": "en_core_web_lg"}],
        }
        provider = NlpEngineProvider(nlp_configuration=configuration)
        nlp_engine = provider.create_engine()

        registry = RecognizerRegistry()
        registry.load_predefined_recognizers(nlp_engine=nlp_engine)

        # Package-specific recognisers are added on top of the predefined ones.
        registry.add_recognizer(AadhaarRecogniser())
        registry.add_recognizer(PANRecogniser())
        registry.add_recognizer(APIKeyRecogniser())

        return AnalyzerEngine(nlp_engine=nlp_engine, registry=registry)

    def detect(self, text: str, language: str = "en") -> List["DetectedEntity"]:
        """
        Detect all PII in text.

        Args:
            text: Text to analyse. Empty or whitespace-only text returns
                ``[]`` without invoking the analyzer.
            language: Language code forwarded to the analyzer.

        Returns:
            Non-overlapping entities sorted by start offset. Entity types
            are mapped through ``ENTITY_SHORT_CODES``; a type without a
            short code is kept unchanged.
        """
        if not text or not text.strip():
            return []

        results = self._analyzer.analyze(
            text=text,
            entities=ENTITY_TYPES,
            language=language,
        )

        entities = [
            DetectedEntity(
                entity_type=ENTITY_SHORT_CODES.get(r.entity_type, r.entity_type),
                text=text[r.start:r.end],
                start=r.start,
                end=r.end,
                score=r.score,
            )
            for r in results
        ]

        entities.sort(key=lambda e: e.start)
        return self._resolve_overlaps(entities)

    def _resolve_overlaps(self, entities: List["DetectedEntity"]) -> List["DetectedEntity"]:
        """
        Remove overlapping detections. Higher confidence wins.

        Candidates are considered from highest to lowest score and kept
        only when they do not overlap an already-kept span. A detection is
        therefore dropped only when a span that overlaps it is actually
        kept; it is never lost because of a span that is itself discarded
        later. On equal scores the earlier-starting span wins (for equal
        starts, the one that appears first in ``entities``).

        Args:
            entities: Detections exposing ``start``, ``end`` and ``score``.

        Returns:
            A new list of non-overlapping detections sorted by start offset.
        """
        # sorted() is stable, so full ties keep their input order.
        ranked = sorted(entities, key=lambda e: (-e.score, e.start))

        kept = []
        for candidate in ranked:
            # Spans are half-open [start, end): touching spans do not overlap.
            if all(
                candidate.end <= other.start or candidate.start >= other.end
                for other in kept
            ):
                kept.append(candidate)

        kept.sort(key=lambda e: e.start)
        return kept
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# SPDX-License-Identifier: MIT
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
# SPDX-License-Identifier: MIT
|
|
2
|
+
from presidio_analyzer import Pattern, PatternRecognizer
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class AadhaarRecogniser(PatternRecognizer):
    """
    Pattern-based recogniser for Indian Aadhaar numbers.

    An Aadhaar number is 12 digits long and never starts with 0 or 1.
    Accepted layouts:
        2345 6789 0123   groups of four separated by whitespace (most common)
        2345-6789-0123   groups of four separated by hyphens
        234567890123     unbroken digits (scored lower)
    """

    # One Pattern per layout, built from (name, regex, score) rows.
    PATTERNS = [
        Pattern(name=label, regex=expression, score=confidence)
        for label, expression, confidence in (
            ("aadhaar_spaces", r"\b[2-9]\d{3}\s\d{4}\s\d{4}\b", 0.95),
            ("aadhaar_hyphens", r"\b[2-9]\d{3}-\d{4}-\d{4}\b", 0.95),
            ("aadhaar_plain", r"\b[2-9]\d{11}\b", 0.6),
        )
    ]

    # Context words handed to PatternRecognizer alongside the patterns.
    CONTEXT = [
        "aadhaar",
        "aadhar",
        "uid",
        "uidai",
        "unique identification",
        "biometric id",
        "aadhaaar",
    ]

    def __init__(self):
        """Register the patterns and context under ``AADHAAR_NUMBER`` for English."""
        super().__init__(
            supported_entity="AADHAAR_NUMBER",
            patterns=self.PATTERNS,
            context=self.CONTEXT,
            supported_language="en",
        )
|