vipii 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vipii-0.1.0/.github/workflows/ci.yml +27 -0
- vipii-0.1.0/.gitignore +60 -0
- vipii-0.1.0/AGENTS.md +75 -0
- vipii-0.1.0/LICENSE +21 -0
- vipii-0.1.0/PKG-INFO +161 -0
- vipii-0.1.0/README.md +132 -0
- vipii-0.1.0/examples/_helpers.py +16 -0
- vipii-0.1.0/examples/basic_detection.py +26 -0
- vipii-0.1.0/examples/concurrent_detection.py +80 -0
- vipii-0.1.0/examples/custom_only.py +24 -0
- vipii-0.1.0/examples/custom_pattern.py +31 -0
- vipii-0.1.0/examples/custom_recognizers.yml +7 -0
- vipii-0.1.0/examples/customer_service.txt +6 -0
- vipii-0.1.0/examples/ner_model.py +44 -0
- vipii-0.1.0/examples/redaction.py +24 -0
- vipii-0.1.0/examples/usage.py +30 -0
- vipii-0.1.0/examples/yaml_config.py +27 -0
- vipii-0.1.0/pyproject.toml +53 -0
- vipii-0.1.0/src/vipii/__init__.py +16 -0
- vipii-0.1.0/src/vipii/builtin_recognizers.yml +57 -0
- vipii-0.1.0/src/vipii/cli.py +124 -0
- vipii-0.1.0/src/vipii/config.py +241 -0
- vipii-0.1.0/src/vipii/constants.py +126 -0
- vipii-0.1.0/src/vipii/detector.py +126 -0
- vipii-0.1.0/src/vipii/models.py +48 -0
- vipii-0.1.0/src/vipii/presidio.py +59 -0
- vipii-0.1.0/src/vipii/recognizers/__init__.py +39 -0
- vipii-0.1.0/src/vipii/recognizers/base.py +17 -0
- vipii-0.1.0/src/vipii/recognizers/ner.py +112 -0
- vipii-0.1.0/src/vipii/recognizers/pattern.py +59 -0
- vipii-0.1.0/src/vipii/recognizers/registry.py +26 -0
- vipii-0.1.0/src/vipii/recognizers/validators.py +77 -0
- vipii-0.1.0/src/vipii/scoring.py +44 -0
- vipii-0.1.0/tests/.gitignore +12 -0
- vipii-0.1.0/tests/conftest.py +8 -0
- vipii-0.1.0/tests/fixtures/synthetic_vietnamese_pii.jsonl +50 -0
- vipii-0.1.0/tests/test_cli.py +100 -0
- vipii-0.1.0/tests/test_config.py +86 -0
- vipii-0.1.0/tests/test_detector.py +120 -0
- vipii-0.1.0/tests/test_fixture_dataset.py +23 -0
- vipii-0.1.0/tests/test_ner.py +109 -0
- vipii-0.1.0/uv.lock +3279 -0
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
pull_request:
|
|
6
|
+
|
|
7
|
+
jobs:
|
|
8
|
+
test:
|
|
9
|
+
runs-on: ubuntu-latest
|
|
10
|
+
strategy:
|
|
11
|
+
matrix:
|
|
12
|
+
python-version: ["3.9", "3.13"]
|
|
13
|
+
steps:
|
|
14
|
+
- uses: actions/checkout@v4
|
|
15
|
+
- uses: actions/setup-python@v5
|
|
16
|
+
with:
|
|
17
|
+
python-version: ${{ matrix.python-version }}
|
|
18
|
+
- name: Install
|
|
19
|
+
run: |
|
|
20
|
+
python -m pip install --upgrade pip
|
|
21
|
+
python -m pip install -e ".[dev]"
|
|
22
|
+
- name: Ruff
|
|
23
|
+
run: |
|
|
24
|
+
ruff check .
|
|
25
|
+
ruff format --check .
|
|
26
|
+
- name: Pytest
|
|
27
|
+
run: pytest
|
vipii-0.1.0/.gitignore
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# C extensions
|
|
7
|
+
*.so
|
|
8
|
+
|
|
9
|
+
# Distribution / packaging
|
|
10
|
+
build/
|
|
11
|
+
dist/
|
|
12
|
+
downloads/
|
|
13
|
+
eggs/
|
|
14
|
+
.eggs/
|
|
15
|
+
parts/
|
|
16
|
+
sdist/
|
|
17
|
+
var/
|
|
18
|
+
wheels/
|
|
19
|
+
share/python-wheels/
|
|
20
|
+
*.egg-info/
|
|
21
|
+
.installed.cfg
|
|
22
|
+
*.egg
|
|
23
|
+
MANIFEST
|
|
24
|
+
|
|
25
|
+
# Test, coverage, and tooling caches
|
|
26
|
+
.coverage
|
|
27
|
+
.coverage.*
|
|
28
|
+
.pytest_cache/
|
|
29
|
+
.ruff_cache/
|
|
30
|
+
.mypy_cache/
|
|
31
|
+
.pyre/
|
|
32
|
+
.tox/
|
|
33
|
+
.nox/
|
|
34
|
+
htmlcov/
|
|
35
|
+
coverage.xml
|
|
36
|
+
|
|
37
|
+
# Virtual environments
|
|
38
|
+
.venv/
|
|
39
|
+
venv/
|
|
40
|
+
ENV/
|
|
41
|
+
env/
|
|
42
|
+
|
|
43
|
+
# Environment files
|
|
44
|
+
.env
|
|
45
|
+
.env.*
|
|
46
|
+
!.env.example
|
|
47
|
+
|
|
48
|
+
# IDE and editor files
|
|
49
|
+
.vscode/
|
|
50
|
+
.idea/
|
|
51
|
+
*.swp
|
|
52
|
+
*.swo
|
|
53
|
+
*~
|
|
54
|
+
|
|
55
|
+
# OS files
|
|
56
|
+
.DS_Store
|
|
57
|
+
Thumbs.db
|
|
58
|
+
|
|
59
|
+
# Local example/runtime output
|
|
60
|
+
*.log
|
vipii-0.1.0/AGENTS.md
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
# Repository Guidelines
|
|
2
|
+
|
|
3
|
+
## Project Shape
|
|
4
|
+
|
|
5
|
+
- `vipii` is a Python package using a `src/` layout: package code lives in `src/vipii`.
|
|
6
|
+
- The public API is exported from `src/vipii/__init__.py`.
|
|
7
|
+
- Core modules:
|
|
8
|
+
- `models.py`: frozen dataclasses for `PIIMatch` and regex `Pattern`.
|
|
9
|
+
- `recognizers/`: package with the built-in Vietnamese structured PII recognizers (`pattern.py`, `ner.py`), validators (`validators.py`), and registry (`registry.py`).
|
|
10
|
+
- `scoring.py`: context-window normalization and score boosting.
|
|
11
|
+
- `detector.py`: detector orchestration, overlap resolution, and redaction.
|
|
12
|
+
- `cli.py`: `argparse` CLI for `vipii scan`.
|
|
13
|
+
- `presidio.py`: optional Presidio adapter; importing the module should not require Presidio.
|
|
14
|
+
|
|
15
|
+
## Coding Style
|
|
16
|
+
|
|
17
|
+
- Target Python is `>=3.9`; keep compatibility with Python 3.9 through 3.13.
|
|
18
|
+
- Use `from __future__ import annotations` in Python modules.
|
|
19
|
+
- Prefer small, typed functions and dataclasses over large classes.
|
|
20
|
+
- Use absolute imports from `vipii`, matching the existing modules.
|
|
21
|
+
- Keep source formatted for Ruff with a 100-character line length.
|
|
22
|
+
- Existing lint rules come from Ruff: `E`, `F`, `I`, `UP`, `B`, and `SIM`.
|
|
23
|
+
- Keep user-facing text and file IO UTF-8 friendly; tests and examples contain Vietnamese text.
|
|
24
|
+
- Use the `X | None` union syntax (for example `Any | None`) for optional return types in functions instead of `typing.Optional`.
|
|
25
|
+
- In interface functions, use `...` as the body instead of leaving it empty.
|
|
26
|
+
|
|
27
|
+
## Naming Patterns
|
|
28
|
+
|
|
29
|
+
- PII entity labels are uppercase strings such as `CCCD`, `PHONE_NUMBER`, and `BANK_ACCOUNT`.
|
|
30
|
+
- Recognizer names are lowercase snake_case such as `phone_number` and `vehicle_plate`.
|
|
31
|
+
- Validators use `valid_*` names and return `bool`.
|
|
32
|
+
- Helper functions use snake_case and are module-level unless they need object state.
|
|
33
|
+
- CLI command functions are named around the action, for example `scan_input`, `scan_file`,
|
|
34
|
+
and `scan_text`.
|
|
35
|
+
|
|
36
|
+
## Architecture Patterns
|
|
37
|
+
|
|
38
|
+
- Built-in recognizers are regex `Pattern` objects plus optional validators and context words.
|
|
39
|
+
- Scores start from `base_score` and are boosted by nearby context words in `scoring.py`.
|
|
40
|
+
- `PIIDetector.detect()` gathers candidates from the registry, then resolves overlapping spans.
|
|
41
|
+
- `PIIDetector.redact()` masks detected spans while preserving surrounding text.
|
|
42
|
+
- Custom patterns are added through `PIIDetector.add_pattern()` and wrapped as a recognizer.
|
|
43
|
+
- Optional dependencies should stay lazy, as in `presidio.py`.
|
|
44
|
+
|
|
45
|
+
## Testing Style
|
|
46
|
+
|
|
47
|
+
- Tests use pytest and live under `tests/`.
|
|
48
|
+
- Prefer behavior-focused tests against the public API or CLI entry points.
|
|
49
|
+
- CLI tests call `vipii.cli.main(...)` directly and assert stdout with `capsys`.
|
|
50
|
+
- Use `tmp_path` for file-based CLI tests.
|
|
51
|
+
- Fixture-driven detector coverage uses JSONL in `tests/fixtures/`.
|
|
52
|
+
- Some tests use `# type: ignore[no-untyped-def]` for pytest fixtures without annotations.
|
|
53
|
+
|
|
54
|
+
## Commands
|
|
55
|
+
|
|
56
|
+
Install for development:
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
pip install -e ".[dev]"
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
Run lint and format checks:
|
|
63
|
+
|
|
64
|
+
```bash
|
|
65
|
+
ruff check .
|
|
66
|
+
ruff format --check .
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
Run tests:
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
pytest
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
CI runs the same Ruff and pytest commands on Python 3.9 and 3.13.
|
vipii-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 vipii contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
vipii-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: vipii
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Vietnamese PII detection with regex recognizers, validators, and context scoring.
|
|
5
|
+
Author: vipii contributors
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
License-File: LICENSE
|
|
8
|
+
Keywords: nlp,pii,privacy,redaction,vietnamese
|
|
9
|
+
Classifier: Development Status :: 3 - Alpha
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
18
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
19
|
+
Requires-Python: >=3.9
|
|
20
|
+
Provides-Extra: dev
|
|
21
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
22
|
+
Requires-Dist: ruff>=0.5.0; extra == 'dev'
|
|
23
|
+
Provides-Extra: ner
|
|
24
|
+
Requires-Dist: torch>=2.0.0; extra == 'ner'
|
|
25
|
+
Requires-Dist: transformers>=4.40.0; extra == 'ner'
|
|
26
|
+
Provides-Extra: presidio
|
|
27
|
+
Requires-Dist: presidio-analyzer>=2.2.0; extra == 'presidio'
|
|
28
|
+
Description-Content-Type: text/markdown
|
|
29
|
+
|
|
30
|
+
# vipii
|
|
31
|
+
|
|
32
|
+
`vipii` is a Python library for detecting Vietnamese personally identifiable information (PII) in
|
|
33
|
+
UTF-8 text. It combines deterministic regex-based recognizers, validator functions, overlap
|
|
34
|
+
resolution, and Vietnamese context-window scoring to identify structured entities such as national
|
|
35
|
+
IDs, phone numbers, tax codes, bank identifiers, passports, and vehicle plates.
|
|
36
|
+
|
|
37
|
+
## Install
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
pip install vipii
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
For local development:
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
pip install -e ".[dev]"
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
## Python API
|
|
50
|
+
|
|
51
|
+
```python
|
|
52
|
+
from vipii import PIIDetector, Pattern
|
|
53
|
+
|
|
54
|
+
detector = PIIDetector()
|
|
55
|
+
detector.add_pattern(
|
|
56
|
+
Pattern(label="CUSTOMER_ID", regex=r"\bKH-\d{6}\b", context_words=["mã khách hàng"])
|
|
57
|
+
)
|
|
58
|
+
|
|
59
|
+
matches = detector.detect(
|
|
60
|
+
"Khách hàng Nguyễn Văn A, số điện thoại 0912 345 678, CCCD 001203000123."
|
|
61
|
+
)
|
|
62
|
+
|
|
63
|
+
for match in matches:
|
|
64
|
+
print(match.label, match.text, match.score)
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## Optional NER
|
|
68
|
+
|
|
69
|
+
Regex recognizers cover structured PII. For free-form names, locations, organizations, and addresses,
|
|
70
|
+
enable an external Hugging Face token-classification model:
|
|
71
|
+
|
|
72
|
+
```bash
|
|
73
|
+
pip install "vipii[ner]"
|
|
74
|
+
vipii scan "Nguyễn Văn A sống tại Hà Nội" --ner-model your-vietnamese-ner-model
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
```python
|
|
78
|
+
from vipii import PIIDetector
|
|
79
|
+
|
|
80
|
+
detector = PIIDetector(ner_model="your-vietnamese-ner-model")
|
|
81
|
+
matches = detector.detect("Nguyễn Văn A sống tại Hà Nội")
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
The NER layer maps model labels such as `PER`, `LOC`, and `ORG` to `PERSON`, `LOCATION`, and
|
|
85
|
+
`ORGANIZATION`. The model is not bundled; choose and evaluate one for your domain before production
|
|
86
|
+
use.
|
|
87
|
+
|
|
88
|
+
## CLI
|
|
89
|
+
|
|
90
|
+
```bash
|
|
91
|
+
vipii scan "Số điện thoại 0912 345 678 và CCCD 001203000123"
|
|
92
|
+
vipii scan examples/customer_service.txt
|
|
93
|
+
vipii scan examples/customer_service.txt --format json
|
|
94
|
+
vipii scan examples/customer_service.txt --redact
|
|
95
|
+
vipii scan "CCCD 001203000123" --redact
|
|
96
|
+
vipii scan "Mã khách hàng KH-123456" --config examples/custom_recognizers.yml
|
|
97
|
+
vipii scan "Nguyễn Văn A sống tại Hà Nội" --ner-model your-vietnamese-ner-model
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
## YAML recognizer config
|
|
101
|
+
|
|
102
|
+
Built-in recognizers are loaded from `src/vipii/builtin_recognizers.yml`. You can append your own
|
|
103
|
+
recognizers from a YAML file without writing Python:
|
|
104
|
+
|
|
105
|
+
```yaml
|
|
106
|
+
recognizers:
|
|
107
|
+
- name: customer_id
|
|
108
|
+
label: CUSTOMER_ID
|
|
109
|
+
patterns:
|
|
110
|
+
- regex: '\bKH-\d{6}\b'
|
|
111
|
+
context_words: ["mã khách hàng", "customer id"]
|
|
112
|
+
base_score: 0.6
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
Use `validator` only when you want one of vipii's built-in validators: `cccd`, `cmnd`, `phone`,
|
|
116
|
+
`tax_code`, `bank_card`, `bank_account`, `passport`, or `vehicle_plate`.
|
|
117
|
+
|
|
118
|
+
## Built-in recognizers
|
|
119
|
+
|
|
120
|
+
- `CCCD` and `CMND`
|
|
121
|
+
- `PHONE_NUMBER`
|
|
122
|
+
- `MST`
|
|
123
|
+
- `BANK_CARD`
|
|
124
|
+
- `BANK_ACCOUNT`
|
|
125
|
+
- `PASSPORT`
|
|
126
|
+
- `VEHICLE_PLATE`
|
|
127
|
+
|
|
128
|
+
The recognizers intentionally favor clear structured PII plus nearby Vietnamese context words such as
|
|
129
|
+
`số điện thoại`, `cccd`, `mã số thuế`, and `biển số xe`. Names and free-form addresses can be handled
|
|
130
|
+
by the optional NER layer.
|
|
131
|
+
|
|
132
|
+
## Development
|
|
133
|
+
|
|
134
|
+
```bash
|
|
135
|
+
pip install -e ".[dev]"
|
|
136
|
+
ruff check .
|
|
137
|
+
ruff format --check .
|
|
138
|
+
pytest
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
## Publishing
|
|
142
|
+
|
|
143
|
+
Build and inspect the package before uploading:
|
|
144
|
+
|
|
145
|
+
```bash
|
|
146
|
+
python -m pip install --upgrade build twine
|
|
147
|
+
python -m build
|
|
148
|
+
python -m twine check dist/*
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
Upload to TestPyPI first:
|
|
152
|
+
|
|
153
|
+
```bash
|
|
154
|
+
python -m twine upload --repository testpypi dist/*
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
Then upload the same checked artifacts to PyPI:
|
|
158
|
+
|
|
159
|
+
```bash
|
|
160
|
+
python -m twine upload dist/*
|
|
161
|
+
```
|
vipii-0.1.0/README.md
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
# vipii
|
|
2
|
+
|
|
3
|
+
`vipii` is a Python library for detecting Vietnamese personally identifiable information (PII) in
|
|
4
|
+
UTF-8 text. It combines deterministic regex-based recognizers, validator functions, overlap
|
|
5
|
+
resolution, and Vietnamese context-window scoring to identify structured entities such as national
|
|
6
|
+
IDs, phone numbers, tax codes, bank identifiers, passports, and vehicle plates.
|
|
7
|
+
|
|
8
|
+
## Install
|
|
9
|
+
|
|
10
|
+
```bash
|
|
11
|
+
pip install vipii
|
|
12
|
+
```
|
|
13
|
+
|
|
14
|
+
For local development:
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
pip install -e ".[dev]"
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
## Python API
|
|
21
|
+
|
|
22
|
+
```python
|
|
23
|
+
from vipii import PIIDetector, Pattern
|
|
24
|
+
|
|
25
|
+
detector = PIIDetector()
|
|
26
|
+
detector.add_pattern(
|
|
27
|
+
Pattern(label="CUSTOMER_ID", regex=r"\bKH-\d{6}\b", context_words=["mã khách hàng"])
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
matches = detector.detect(
|
|
31
|
+
"Khách hàng Nguyễn Văn A, số điện thoại 0912 345 678, CCCD 001203000123."
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
for match in matches:
|
|
35
|
+
print(match.label, match.text, match.score)
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## Optional NER
|
|
39
|
+
|
|
40
|
+
Regex recognizers cover structured PII. For free-form names, locations, organizations, and addresses,
|
|
41
|
+
enable an external Hugging Face token-classification model:
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
pip install "vipii[ner]"
|
|
45
|
+
vipii scan "Nguyễn Văn A sống tại Hà Nội" --ner-model your-vietnamese-ner-model
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
```python
|
|
49
|
+
from vipii import PIIDetector
|
|
50
|
+
|
|
51
|
+
detector = PIIDetector(ner_model="your-vietnamese-ner-model")
|
|
52
|
+
matches = detector.detect("Nguyễn Văn A sống tại Hà Nội")
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
The NER layer maps model labels such as `PER`, `LOC`, and `ORG` to `PERSON`, `LOCATION`, and
|
|
56
|
+
`ORGANIZATION`. The model is not bundled; choose and evaluate one for your domain before production
|
|
57
|
+
use.
|
|
58
|
+
|
|
59
|
+
## CLI
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
vipii scan "Số điện thoại 0912 345 678 và CCCD 001203000123"
|
|
63
|
+
vipii scan examples/customer_service.txt
|
|
64
|
+
vipii scan examples/customer_service.txt --format json
|
|
65
|
+
vipii scan examples/customer_service.txt --redact
|
|
66
|
+
vipii scan "CCCD 001203000123" --redact
|
|
67
|
+
vipii scan "Mã khách hàng KH-123456" --config examples/custom_recognizers.yml
|
|
68
|
+
vipii scan "Nguyễn Văn A sống tại Hà Nội" --ner-model your-vietnamese-ner-model
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
## YAML recognizer config
|
|
72
|
+
|
|
73
|
+
Built-in recognizers are loaded from `src/vipii/builtin_recognizers.yml`. You can append your own
|
|
74
|
+
recognizers from a YAML file without writing Python:
|
|
75
|
+
|
|
76
|
+
```yaml
|
|
77
|
+
recognizers:
|
|
78
|
+
- name: customer_id
|
|
79
|
+
label: CUSTOMER_ID
|
|
80
|
+
patterns:
|
|
81
|
+
- regex: '\bKH-\d{6}\b'
|
|
82
|
+
context_words: ["mã khách hàng", "customer id"]
|
|
83
|
+
base_score: 0.6
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
Use `validator` only when you want one of vipii's built-in validators: `cccd`, `cmnd`, `phone`,
|
|
87
|
+
`tax_code`, `bank_card`, `bank_account`, `passport`, or `vehicle_plate`.
|
|
88
|
+
|
|
89
|
+
## Built-in recognizers
|
|
90
|
+
|
|
91
|
+
- `CCCD` and `CMND`
|
|
92
|
+
- `PHONE_NUMBER`
|
|
93
|
+
- `MST`
|
|
94
|
+
- `BANK_CARD`
|
|
95
|
+
- `BANK_ACCOUNT`
|
|
96
|
+
- `PASSPORT`
|
|
97
|
+
- `VEHICLE_PLATE`
|
|
98
|
+
|
|
99
|
+
The recognizers intentionally favor clear structured PII plus nearby Vietnamese context words such as
|
|
100
|
+
`số điện thoại`, `cccd`, `mã số thuế`, and `biển số xe`. Names and free-form addresses can be handled
|
|
101
|
+
by the optional NER layer.
|
|
102
|
+
|
|
103
|
+
## Development
|
|
104
|
+
|
|
105
|
+
```bash
|
|
106
|
+
pip install -e ".[dev]"
|
|
107
|
+
ruff check .
|
|
108
|
+
ruff format --check .
|
|
109
|
+
pytest
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
## Publishing
|
|
113
|
+
|
|
114
|
+
Build and inspect the package before uploading:
|
|
115
|
+
|
|
116
|
+
```bash
|
|
117
|
+
python -m pip install --upgrade build twine
|
|
118
|
+
python -m build
|
|
119
|
+
python -m twine check dist/*
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
Upload to TestPyPI first:
|
|
123
|
+
|
|
124
|
+
```bash
|
|
125
|
+
python -m twine upload --repository testpypi dist/*
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
Then upload the same checked artifacts to PyPI:
|
|
129
|
+
|
|
130
|
+
```bash
|
|
131
|
+
python -m twine upload dist/*
|
|
132
|
+
```
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
"""Shared helpers for example scripts."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from vipii import PIIDetector
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def print_matches(title: str, text: str, detector: PIIDetector) -> None:
    """Print a titled report of the matches ``detector`` finds in ``text``.

    Output, in order: a blank line followed by the title, a dashed underline
    with the same length as the title, the scanned text, and one formatted row
    per item returned by ``detector.detect(text)``.

    Args:
        title: Heading printed above the report.
        text: Text passed to ``detector.detect``.
        detector: Object whose ``detect`` method yields matches exposing
            ``label``, ``start``, ``end``, ``score`` and ``text``.
    """
    underline = "-" * len(title)
    print(f"\n{title}")
    print(underline)
    print(text)
    for found in detector.detect(text):
        # Row layout: left-aligned label, "start:end" span, score, quoted text.
        span = f"{found.start:>2}:{found.end:<2}"
        details = f"score={found.score:.2f} text={found.text!r}"
        print(f"{found.label:<14} {span} {details}")
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
"""Detect built-in Vietnamese structured PII.
|
|
2
|
+
|
|
3
|
+
Run from the repository after installing the package:
|
|
4
|
+
|
|
5
|
+
python examples/basic_detection.py
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from _helpers import print_matches
|
|
11
|
+
|
|
12
|
+
from vipii import PIIDetector
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def main() -> None:
    """Scan one sample sentence with a detector built with no arguments.

    The sentence carries a phone number, a CCCD number, a tax code and a
    vehicle plate; the matches are printed through ``print_matches``.
    """
    sample = (
        "Khách hàng có số điện thoại 0912 345 678, CCCD 001203000123, "
        "mã số thuế 0312345678 và biển số xe 51F-123.45."
    )
    print_matches("Built-in structured PII", sample, PIIDetector())
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
if __name__ == "__main__":
|
|
26
|
+
main()
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
"""Run recognizers concurrently.
|
|
2
|
+
|
|
3
|
+
Run from the repository after installing the package:
|
|
4
|
+
|
|
5
|
+
python examples/concurrent_detection.py
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import re
|
|
11
|
+
from dataclasses import dataclass
|
|
12
|
+
from time import perf_counter, sleep
|
|
13
|
+
|
|
14
|
+
from _helpers import print_matches
|
|
15
|
+
|
|
16
|
+
from vipii import PIIDetector
|
|
17
|
+
from vipii.models import PIIMatch
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass
class SlowRegexRecognizer:
    """Regex recognizer that sleeps before matching, used to make timing visible.

    Fields:
        name: Value stored in each match's ``recognizer`` field.
        label: Value stored in each match's ``label`` field.
        regex: Pattern searched for in the input text.
        delay_seconds: How long ``recognize`` sleeps before searching.
    """

    name: str
    label: str
    regex: str
    delay_seconds: float = 0.5

    def recognize(self, text: str) -> list[PIIMatch]:
        """Sleep for ``delay_seconds``, then return one ``PIIMatch`` per regex hit.

        Every match is given a fixed score of 0.8.
        """
        sleep(self.delay_seconds)
        hits = re.compile(self.regex).finditer(text)
        return [
            PIIMatch(
                label=self.label,
                start=hit.start(),
                end=hit.end(),
                text=hit[0],
                score=0.8,
                recognizer=self.name,
            )
            for hit in hits
        ]
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def timed_detect(title: str, detector: PIIDetector, text: str) -> None:
    """Time a single ``detector.detect(text)`` call and print the outcome.

    Prints a blank line and the title, a dashed underline as long as the
    title, a summary with the match count and elapsed seconds, and then one
    formatted row per match.

    Args:
        title: Heading printed above the timing report.
        detector: Object whose ``detect`` method returns a sized collection of
            matches exposing ``label``, ``start``, ``end``, ``score`` and ``text``.
        text: Text passed to ``detector.detect``.
    """
    t0 = perf_counter()
    found = detector.detect(text)
    # Only the detect() call is timed; printing happens afterwards.
    elapsed = perf_counter() - t0

    underline = "-" * len(title)
    print(f"\n{title}")
    print(underline)
    print(f"Detected {len(found)} matches in {elapsed:.2f}s")
    for item in found:
        span = f"{item.start:>2}:{item.end:<2}"
        details = f"score={item.score:.2f} text={item.text!r}"
        print(f"{item.label:<14} {span} {details}")
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def main() -> None:
    """Run two slow recognizers through two detectors and print the timings.

    Both detectors share the same recognizer list and are created with
    ``include_builtins=False``. The second is additionally created with
    ``max_workers=1``; this example labels that run "Sequential timing".
    """
    text = "Mã khách hàng KH-123456 có mã đơn hàng DH-98765."
    recognizers = [
        SlowRegexRecognizer("customer_id", "CUSTOMER_ID", r"\bKH-\d{6}\b"),
        SlowRegexRecognizer("order_id", "ORDER_ID", r"\bDH-\d{5}\b"),
    ]

    # Keyword arguments common to both detectors.
    shared = {"recognizers": recognizers, "include_builtins": False}
    concurrent_detector = PIIDetector(**shared)
    sequential_detector = PIIDetector(**shared, max_workers=1)

    print_matches("Concurrent recognizers", text, concurrent_detector)
    for heading, detector in (
        ("Concurrent timing", concurrent_detector),
        ("Sequential timing", sequential_detector),
    ):
        timed_detect(heading, detector, text)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
if __name__ == "__main__":
|
|
80
|
+
main()
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
"""Run a detector with only custom recognizers.
|
|
2
|
+
|
|
3
|
+
Run from the repository after installing the package:
|
|
4
|
+
|
|
5
|
+
python examples/custom_only.py
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from _helpers import print_matches
|
|
11
|
+
|
|
12
|
+
from vipii import Pattern, PIIDetector
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def main() -> None:
    """Scan a sample sentence with a detector that has one custom pattern.

    The detector is created with ``include_builtins=False`` and is given a
    single ``ORDER_ID`` pattern before the matches are printed.
    """
    order_pattern = Pattern(label="ORDER_ID", regex=r"\bDH-\d{5}\b")
    detector = PIIDetector(include_builtins=False)
    detector.add_pattern(order_pattern)

    sample = "Đơn hàng DH-12345 của số điện thoại 0912345678."
    print_matches("Custom-only detector", sample, detector)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
if __name__ == "__main__":
|
|
24
|
+
main()
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""Add a custom regex pattern in Python.
|
|
2
|
+
|
|
3
|
+
Run from the repository after installing the package:
|
|
4
|
+
|
|
5
|
+
python examples/custom_pattern.py
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from _helpers import print_matches
|
|
11
|
+
|
|
12
|
+
from vipii import Pattern, PIIDetector
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def main() -> None:
    """Add a ``CUSTOMER_ID`` pattern to a detector and print its matches.

    The detector is constructed with no arguments, then extended with one
    pattern that has context words and a base score of 0.6.
    """
    customer_pattern = Pattern(
        label="CUSTOMER_ID",
        regex=r"\bKH-\d{6}\b",
        context_words=["mã khách hàng", "customer id"],
        base_score=0.6,
    )
    detector = PIIDetector()
    detector.add_pattern(customer_pattern)

    sample = "Mã khách hàng KH-123456 có số điện thoại 0912 345 678."
    print_matches("Custom pattern from Python", sample, detector)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
if __name__ == "__main__":
|
|
31
|
+
main()
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
Lưu ý: toàn bộ dữ liệu trong ví dụ này là giả.
|
|
2
|
+
|
|
3
|
+
Khách hàng Trần Văn Minh gọi lên tổng đài, số điện thoại 090 123 4567.
|
|
4
|
+
Bạn ấy cung cấp CCCD 001203000123 và mã số thuế cá nhân 0312345678.
|
|
5
|
+
Nhân viên ghi nhận biển số xe 51F-123.45 và hộ chiếu B1234567.
|
|
6
|
+
Thông tin thanh toán có thẻ 9704 0000 1234 5678 và tài khoản ngân hàng 123456789012 tại ngân hàng thử nghiệm.
|