vipii 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. vipii-0.1.0/.github/workflows/ci.yml +27 -0
  2. vipii-0.1.0/.gitignore +60 -0
  3. vipii-0.1.0/AGENTS.md +75 -0
  4. vipii-0.1.0/LICENSE +21 -0
  5. vipii-0.1.0/PKG-INFO +161 -0
  6. vipii-0.1.0/README.md +132 -0
  7. vipii-0.1.0/examples/_helpers.py +16 -0
  8. vipii-0.1.0/examples/basic_detection.py +26 -0
  9. vipii-0.1.0/examples/concurrent_detection.py +80 -0
  10. vipii-0.1.0/examples/custom_only.py +24 -0
  11. vipii-0.1.0/examples/custom_pattern.py +31 -0
  12. vipii-0.1.0/examples/custom_recognizers.yml +7 -0
  13. vipii-0.1.0/examples/customer_service.txt +6 -0
  14. vipii-0.1.0/examples/ner_model.py +44 -0
  15. vipii-0.1.0/examples/redaction.py +24 -0
  16. vipii-0.1.0/examples/usage.py +30 -0
  17. vipii-0.1.0/examples/yaml_config.py +27 -0
  18. vipii-0.1.0/pyproject.toml +53 -0
  19. vipii-0.1.0/src/vipii/__init__.py +16 -0
  20. vipii-0.1.0/src/vipii/builtin_recognizers.yml +57 -0
  21. vipii-0.1.0/src/vipii/cli.py +124 -0
  22. vipii-0.1.0/src/vipii/config.py +241 -0
  23. vipii-0.1.0/src/vipii/constants.py +126 -0
  24. vipii-0.1.0/src/vipii/detector.py +126 -0
  25. vipii-0.1.0/src/vipii/models.py +48 -0
  26. vipii-0.1.0/src/vipii/presidio.py +59 -0
  27. vipii-0.1.0/src/vipii/recognizers/__init__.py +39 -0
  28. vipii-0.1.0/src/vipii/recognizers/base.py +17 -0
  29. vipii-0.1.0/src/vipii/recognizers/ner.py +112 -0
  30. vipii-0.1.0/src/vipii/recognizers/pattern.py +59 -0
  31. vipii-0.1.0/src/vipii/recognizers/registry.py +26 -0
  32. vipii-0.1.0/src/vipii/recognizers/validators.py +77 -0
  33. vipii-0.1.0/src/vipii/scoring.py +44 -0
  34. vipii-0.1.0/tests/.gitignore +12 -0
  35. vipii-0.1.0/tests/conftest.py +8 -0
  36. vipii-0.1.0/tests/fixtures/synthetic_vietnamese_pii.jsonl +50 -0
  37. vipii-0.1.0/tests/test_cli.py +100 -0
  38. vipii-0.1.0/tests/test_config.py +86 -0
  39. vipii-0.1.0/tests/test_detector.py +120 -0
  40. vipii-0.1.0/tests/test_fixture_dataset.py +23 -0
  41. vipii-0.1.0/tests/test_ner.py +109 -0
  42. vipii-0.1.0/uv.lock +3279 -0
@@ -0,0 +1,27 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ pull_request:
6
+
7
+ jobs:
8
+ test:
9
+ runs-on: ubuntu-latest
10
+ strategy:
11
+ matrix:
12
+ python-version: ["3.9", "3.13"]
13
+ steps:
14
+ - uses: actions/checkout@v4
15
+ - uses: actions/setup-python@v5
16
+ with:
17
+ python-version: ${{ matrix.python-version }}
18
+ - name: Install
19
+ run: |
20
+ python -m pip install --upgrade pip
21
+ python -m pip install -e ".[dev]"
22
+ - name: Ruff
23
+ run: |
24
+ ruff check .
25
+ ruff format --check .
26
+ - name: Pytest
27
+ run: pytest
vipii-0.1.0/.gitignore ADDED
@@ -0,0 +1,60 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ build/
11
+ dist/
12
+ downloads/
13
+ eggs/
14
+ .eggs/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ share/python-wheels/
20
+ *.egg-info/
21
+ .installed.cfg
22
+ *.egg
23
+ MANIFEST
24
+
25
+ # Test, coverage, and tooling caches
26
+ .coverage
27
+ .coverage.*
28
+ .pytest_cache/
29
+ .ruff_cache/
30
+ .mypy_cache/
31
+ .pyre/
32
+ .tox/
33
+ .nox/
34
+ htmlcov/
35
+ coverage.xml
36
+
37
+ # Virtual environments
38
+ .venv/
39
+ venv/
40
+ ENV/
41
+ env/
42
+
43
+ # Environment files
44
+ .env
45
+ .env.*
46
+ !.env.example
47
+
48
+ # IDE and editor files
49
+ .vscode/
50
+ .idea/
51
+ *.swp
52
+ *.swo
53
+ *~
54
+
55
+ # OS files
56
+ .DS_Store
57
+ Thumbs.db
58
+
59
+ # Local example/runtime output
60
+ *.log
vipii-0.1.0/AGENTS.md ADDED
@@ -0,0 +1,75 @@
1
+ # Repository Guidelines
2
+
3
+ ## Project Shape
4
+
5
+ - `vipii` is a Python package using a `src/` layout: package code lives in `src/vipii`.
6
+ - The public API is exported from `src/vipii/__init__.py`.
7
+ - Core modules:
8
+ - `models.py`: frozen dataclasses for `PIIMatch` and regex `Pattern`.
9
+ - `recognizers/`: package of built-in Vietnamese structured PII recognizers, validators, and registry.
10
+ - `scoring.py`: context-window normalization and score boosting.
11
+ - `detector.py`: detector orchestration, overlap resolution, and redaction.
12
+ - `cli.py`: `argparse` CLI for `vipii scan`.
13
+ - `presidio.py`: optional Presidio adapter; importing the module should not require Presidio.
14
+
15
+ ## Coding Style
16
+
17
+ - Target Python is `>=3.9`; keep compatibility with Python 3.9 through 3.13.
18
+ - Use `from __future__ import annotations` in Python modules.
19
+ - Prefer small, typed functions and dataclasses over large classes.
20
+ - Use absolute imports from `vipii`, matching the existing modules.
21
+ - Keep source formatted for Ruff with a 100-character line length.
22
+ - Existing lint rules come from Ruff: `E`, `F`, `I`, `UP`, `B`, and `SIM`.
23
+ - Keep user-facing text and file IO UTF-8 friendly; tests and examples contain Vietnamese text.
24
+ - Use `X | None` (for example `str | None`) for optional return types in functions instead of `typing.Optional`.
25
+ - In interface functions, use `...` as the body instead of leaving it empty.
26
+
27
+ ## Naming Patterns
28
+
29
+ - PII entity labels are uppercase strings such as `CCCD`, `PHONE_NUMBER`, and `BANK_ACCOUNT`.
30
+ - Recognizer names are lowercase snake_case such as `phone_number` and `vehicle_plate`.
31
+ - Validators use `valid_*` names and return `bool`.
32
+ - Helper functions use snake_case and are module-level unless they need object state.
33
+ - CLI command functions are named around the action, for example `scan_input`, `scan_file`,
34
+ and `scan_text`.
35
+
36
+ ## Architecture Patterns
37
+
38
+ - Built-in recognizers are regex `Pattern` objects plus optional validators and context words.
39
+ - Scores start from `base_score` and are boosted by nearby context words in `scoring.py`.
40
+ - `PIIDetector.detect()` gathers candidates from the registry, then resolves overlapping spans.
41
+ - `PIIDetector.redact()` masks detected spans while preserving surrounding text.
42
+ - Custom patterns are added through `PIIDetector.add_pattern()` and wrapped as a recognizer.
43
+ - Optional dependencies should stay lazy, as in `presidio.py`.
44
+
45
+ ## Testing Style
46
+
47
+ - Tests use pytest and live under `tests/`.
48
+ - Prefer behavior-focused tests against the public API or CLI entry points.
49
+ - CLI tests call `vipii.cli.main(...)` directly and assert stdout with `capsys`.
50
+ - Use `tmp_path` for file-based CLI tests.
51
+ - Fixture-driven detector coverage uses JSONL in `tests/fixtures/`.
52
+ - Some tests use `# type: ignore[no-untyped-def]` for pytest fixtures without annotations.
53
+
54
+ ## Commands
55
+
56
+ Install for development:
57
+
58
+ ```bash
59
+ pip install -e ".[dev]"
60
+ ```
61
+
62
+ Run lint and format checks:
63
+
64
+ ```bash
65
+ ruff check .
66
+ ruff format --check .
67
+ ```
68
+
69
+ Run tests:
70
+
71
+ ```bash
72
+ pytest
73
+ ```
74
+
75
+ CI runs the same Ruff and pytest commands on Python 3.9 and 3.13.
vipii-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 vipii contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
vipii-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,161 @@
1
+ Metadata-Version: 2.4
2
+ Name: vipii
3
+ Version: 0.1.0
4
+ Summary: Vietnamese PII detection with regex recognizers, validators, and context scoring.
5
+ Author: vipii contributors
6
+ License-Expression: MIT
7
+ License-File: LICENSE
8
+ Keywords: nlp,pii,privacy,redaction,vietnamese
9
+ Classifier: Development Status :: 3 - Alpha
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.9
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Programming Language :: Python :: 3.13
18
+ Classifier: Topic :: Text Processing :: Linguistic
19
+ Requires-Python: >=3.9
20
+ Provides-Extra: dev
21
+ Requires-Dist: pytest>=8.0; extra == 'dev'
22
+ Requires-Dist: ruff>=0.5.0; extra == 'dev'
23
+ Provides-Extra: ner
24
+ Requires-Dist: torch>=2.0.0; extra == 'ner'
25
+ Requires-Dist: transformers>=4.40.0; extra == 'ner'
26
+ Provides-Extra: presidio
27
+ Requires-Dist: presidio-analyzer>=2.2.0; extra == 'presidio'
28
+ Description-Content-Type: text/markdown
29
+
30
+ # vipii
31
+
32
+ `vipii` is a Python library for detecting Vietnamese personally identifiable information (PII) in
33
+ UTF-8 text. It combines deterministic regex-based recognizers, validator functions, overlap
34
+ resolution, and Vietnamese context-window scoring to identify structured entities such as national
35
+ IDs, phone numbers, tax codes, bank identifiers, passports, and vehicle plates.
36
+
37
+ ## Install
38
+
39
+ ```bash
40
+ pip install vipii
41
+ ```
42
+
43
+ For local development:
44
+
45
+ ```bash
46
+ pip install -e ".[dev]"
47
+ ```
48
+
49
+ ## Python API
50
+
51
+ ```python
52
+ from vipii import PIIDetector, Pattern
53
+
54
+ detector = PIIDetector()
55
+ detector.add_pattern(
56
+ Pattern(label="CUSTOMER_ID", regex=r"\bKH-\d{6}\b", context_words=["mã khách hàng"])
57
+ )
58
+
59
+ matches = detector.detect(
60
+ "Khách hàng Nguyễn Văn A, số điện thoại 0912 345 678, CCCD 001203000123."
61
+ )
62
+
63
+ for match in matches:
64
+ print(match.label, match.text, match.score)
65
+ ```
66
+
67
+ ## Optional NER
68
+
69
+ Regex recognizers cover structured PII. For free-form names, locations, organizations, and addresses,
70
+ enable an external Hugging Face token-classification model:
71
+
72
+ ```bash
73
+ pip install "vipii[ner]"
74
+ vipii scan "Nguyễn Văn A sống tại Hà Nội" --ner-model your-vietnamese-ner-model
75
+ ```
76
+
77
+ ```python
78
+ from vipii import PIIDetector
79
+
80
+ detector = PIIDetector(ner_model="your-vietnamese-ner-model")
81
+ matches = detector.detect("Nguyễn Văn A sống tại Hà Nội")
82
+ ```
83
+
84
+ The NER layer maps model labels such as `PER`, `LOC`, and `ORG` to `PERSON`, `LOCATION`, and
85
+ `ORGANIZATION`. The model is not bundled; choose and evaluate one for your domain before production
86
+ use.
87
+
88
+ ## CLI
89
+
90
+ ```bash
91
+ vipii scan "Số điện thoại 0912 345 678 và CCCD 001203000123"
92
+ vipii scan examples/customer_service.txt
93
+ vipii scan examples/customer_service.txt --format json
94
+ vipii scan examples/customer_service.txt --redact
95
+ vipii scan "CCCD 001203000123" --redact
96
+ vipii scan "Mã khách hàng KH-123456" --config examples/custom_recognizers.yml
97
+ vipii scan "Nguyễn Văn A sống tại Hà Nội" --ner-model your-vietnamese-ner-model
98
+ ```
99
+
100
+ ## YAML recognizer config
101
+
102
+ Built-in recognizers are loaded from `src/vipii/builtin_recognizers.yml`. You can append your own
103
+ recognizers from a YAML file without writing Python:
104
+
105
+ ```yaml
106
+ recognizers:
107
+ - name: customer_id
108
+ label: CUSTOMER_ID
109
+ patterns:
110
+ - regex: '\bKH-\d{6}\b'
111
+ context_words: ["mã khách hàng", "customer id"]
112
+ base_score: 0.6
113
+ ```
114
+
115
+ Use `validator` only when you want one of vipii's built-in validators: `cccd`, `cmnd`, `phone`,
116
+ `tax_code`, `bank_card`, `bank_account`, `passport`, or `vehicle_plate`.
117
+
118
+ ## Built-in recognizers
119
+
120
+ - `CCCD` and `CMND`
121
+ - `PHONE_NUMBER`
122
+ - `MST`
123
+ - `BANK_CARD`
124
+ - `BANK_ACCOUNT`
125
+ - `PASSPORT`
126
+ - `VEHICLE_PLATE`
127
+
128
+ The recognizers intentionally favor clear structured PII plus nearby Vietnamese context words such as
129
+ `số điện thoại`, `cccd`, `mã số thuế`, and `biển số xe`. Names and free-form addresses can be handled
130
+ by the optional NER layer.
131
+
132
+ ## Development
133
+
134
+ ```bash
135
+ pip install -e ".[dev]"
136
+ ruff check .
137
+ ruff format --check .
138
+ pytest
139
+ ```
140
+
141
+ ## Publishing
142
+
143
+ Build and inspect the package before uploading:
144
+
145
+ ```bash
146
+ python -m pip install --upgrade build twine
147
+ python -m build
148
+ python -m twine check dist/*
149
+ ```
150
+
151
+ Upload to TestPyPI first:
152
+
153
+ ```bash
154
+ python -m twine upload --repository testpypi dist/*
155
+ ```
156
+
157
+ Then upload the same checked artifacts to PyPI:
158
+
159
+ ```bash
160
+ python -m twine upload dist/*
161
+ ```
vipii-0.1.0/README.md ADDED
@@ -0,0 +1,132 @@
1
+ # vipii
2
+
3
+ `vipii` is a Python library for detecting Vietnamese personally identifiable information (PII) in
4
+ UTF-8 text. It combines deterministic regex-based recognizers, validator functions, overlap
5
+ resolution, and Vietnamese context-window scoring to identify structured entities such as national
6
+ IDs, phone numbers, tax codes, bank identifiers, passports, and vehicle plates.
7
+
8
+ ## Install
9
+
10
+ ```bash
11
+ pip install vipii
12
+ ```
13
+
14
+ For local development:
15
+
16
+ ```bash
17
+ pip install -e ".[dev]"
18
+ ```
19
+
20
+ ## Python API
21
+
22
+ ```python
23
+ from vipii import PIIDetector, Pattern
24
+
25
+ detector = PIIDetector()
26
+ detector.add_pattern(
27
+ Pattern(label="CUSTOMER_ID", regex=r"\bKH-\d{6}\b", context_words=["mã khách hàng"])
28
+ )
29
+
30
+ matches = detector.detect(
31
+ "Khách hàng Nguyễn Văn A, số điện thoại 0912 345 678, CCCD 001203000123."
32
+ )
33
+
34
+ for match in matches:
35
+ print(match.label, match.text, match.score)
36
+ ```
37
+
38
+ ## Optional NER
39
+
40
+ Regex recognizers cover structured PII. For free-form names, locations, organizations, and addresses,
41
+ enable an external Hugging Face token-classification model:
42
+
43
+ ```bash
44
+ pip install "vipii[ner]"
45
+ vipii scan "Nguyễn Văn A sống tại Hà Nội" --ner-model your-vietnamese-ner-model
46
+ ```
47
+
48
+ ```python
49
+ from vipii import PIIDetector
50
+
51
+ detector = PIIDetector(ner_model="your-vietnamese-ner-model")
52
+ matches = detector.detect("Nguyễn Văn A sống tại Hà Nội")
53
+ ```
54
+
55
+ The NER layer maps model labels such as `PER`, `LOC`, and `ORG` to `PERSON`, `LOCATION`, and
56
+ `ORGANIZATION`. The model is not bundled; choose and evaluate one for your domain before production
57
+ use.
58
+
59
+ ## CLI
60
+
61
+ ```bash
62
+ vipii scan "Số điện thoại 0912 345 678 và CCCD 001203000123"
63
+ vipii scan examples/customer_service.txt
64
+ vipii scan examples/customer_service.txt --format json
65
+ vipii scan examples/customer_service.txt --redact
66
+ vipii scan "CCCD 001203000123" --redact
67
+ vipii scan "Mã khách hàng KH-123456" --config examples/custom_recognizers.yml
68
+ vipii scan "Nguyễn Văn A sống tại Hà Nội" --ner-model your-vietnamese-ner-model
69
+ ```
70
+
71
+ ## YAML recognizer config
72
+
73
+ Built-in recognizers are loaded from `src/vipii/builtin_recognizers.yml`. You can append your own
74
+ recognizers from a YAML file without writing Python:
75
+
76
+ ```yaml
77
+ recognizers:
78
+ - name: customer_id
79
+ label: CUSTOMER_ID
80
+ patterns:
81
+ - regex: '\bKH-\d{6}\b'
82
+ context_words: ["mã khách hàng", "customer id"]
83
+ base_score: 0.6
84
+ ```
85
+
86
+ Use `validator` only when you want one of vipii's built-in validators: `cccd`, `cmnd`, `phone`,
87
+ `tax_code`, `bank_card`, `bank_account`, `passport`, or `vehicle_plate`.
88
+
89
+ ## Built-in recognizers
90
+
91
+ - `CCCD` and `CMND`
92
+ - `PHONE_NUMBER`
93
+ - `MST`
94
+ - `BANK_CARD`
95
+ - `BANK_ACCOUNT`
96
+ - `PASSPORT`
97
+ - `VEHICLE_PLATE`
98
+
99
+ The recognizers intentionally favor clear structured PII plus nearby Vietnamese context words such as
100
+ `số điện thoại`, `cccd`, `mã số thuế`, and `biển số xe`. Names and free-form addresses can be handled
101
+ by the optional NER layer.
102
+
103
+ ## Development
104
+
105
+ ```bash
106
+ pip install -e ".[dev]"
107
+ ruff check .
108
+ ruff format --check .
109
+ pytest
110
+ ```
111
+
112
+ ## Publishing
113
+
114
+ Build and inspect the package before uploading:
115
+
116
+ ```bash
117
+ python -m pip install --upgrade build twine
118
+ python -m build
119
+ python -m twine check dist/*
120
+ ```
121
+
122
+ Upload to TestPyPI first:
123
+
124
+ ```bash
125
+ python -m twine upload --repository testpypi dist/*
126
+ ```
127
+
128
+ Then upload the same checked artifacts to PyPI:
129
+
130
+ ```bash
131
+ python -m twine upload dist/*
132
+ ```
@@ -0,0 +1,16 @@
1
+ """Shared helpers for example scripts."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from vipii import PIIDetector
6
+
7
+
8
+ def print_matches(title: str, text: str, detector: PIIDetector) -> None:
9
+ print(f"\n{title}")
10
+ print("-" * len(title))
11
+ print(text)
12
+ for match in detector.detect(text):
13
+ print(
14
+ f"{match.label:<14} {match.start:>2}:{match.end:<2} "
15
+ f"score={match.score:.2f} text={match.text!r}"
16
+ )
@@ -0,0 +1,26 @@
1
+ """Detect built-in Vietnamese structured PII.
2
+
3
+ Run from the repository after installing the package:
4
+
5
+ python examples/basic_detection.py
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from _helpers import print_matches
11
+
12
+ from vipii import PIIDetector
13
+
14
+
15
+ def main() -> None:
16
+ detector = PIIDetector()
17
+ text = (
18
+ "Khách hàng có số điện thoại 0912 345 678, CCCD 001203000123, "
19
+ "mã số thuế 0312345678 và biển số xe 51F-123.45."
20
+ )
21
+
22
+ print_matches("Built-in structured PII", text, detector)
23
+
24
+
25
+ if __name__ == "__main__":
26
+ main()
@@ -0,0 +1,80 @@
1
+ """Run recognizers concurrently.
2
+
3
+ Run from the repository after installing the package:
4
+
5
+ python examples/concurrent_detection.py
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import re
11
+ from dataclasses import dataclass
12
+ from time import perf_counter, sleep
13
+
14
+ from _helpers import print_matches
15
+
16
+ from vipii import PIIDetector
17
+ from vipii.models import PIIMatch
18
+
19
+
20
+ @dataclass
21
+ class SlowRegexRecognizer:
22
+ name: str
23
+ label: str
24
+ regex: str
25
+ delay_seconds: float = 0.5
26
+
27
+ def recognize(self, text: str) -> list[PIIMatch]:
28
+ sleep(self.delay_seconds)
29
+ return [
30
+ PIIMatch(
31
+ label=self.label,
32
+ start=match.start(),
33
+ end=match.end(),
34
+ text=match.group(0),
35
+ score=0.8,
36
+ recognizer=self.name,
37
+ )
38
+ for match in re.finditer(self.regex, text)
39
+ ]
40
+
41
+
42
+ def timed_detect(title: str, detector: PIIDetector, text: str) -> None:
43
+ started = perf_counter()
44
+ matches = detector.detect(text)
45
+ elapsed = perf_counter() - started
46
+
47
+ print(f"\n{title}")
48
+ print("-" * len(title))
49
+ print(f"Detected {len(matches)} matches in {elapsed:.2f}s")
50
+ for match in matches:
51
+ print(
52
+ f"{match.label:<14} {match.start:>2}:{match.end:<2} "
53
+ f"score={match.score:.2f} text={match.text!r}"
54
+ )
55
+
56
+
57
+ def main() -> None:
58
+ text = "Mã khách hàng KH-123456 có mã đơn hàng DH-98765."
59
+ recognizers = [
60
+ SlowRegexRecognizer("customer_id", "CUSTOMER_ID", r"\bKH-\d{6}\b"),
61
+ SlowRegexRecognizer("order_id", "ORDER_ID", r"\bDH-\d{5}\b"),
62
+ ]
63
+
64
+ concurrent_detector = PIIDetector(
65
+ recognizers=recognizers,
66
+ include_builtins=False,
67
+ )
68
+ sequential_detector = PIIDetector(
69
+ recognizers=recognizers,
70
+ include_builtins=False,
71
+ max_workers=1,
72
+ )
73
+
74
+ print_matches("Concurrent recognizers", text, concurrent_detector)
75
+ timed_detect("Concurrent timing", concurrent_detector, text)
76
+ timed_detect("Sequential timing", sequential_detector, text)
77
+
78
+
79
+ if __name__ == "__main__":
80
+ main()
@@ -0,0 +1,24 @@
1
+ """Run a detector with only custom recognizers.
2
+
3
+ Run from the repository after installing the package:
4
+
5
+ python examples/custom_only.py
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from _helpers import print_matches
11
+
12
+ from vipii import Pattern, PIIDetector
13
+
14
+
15
+ def main() -> None:
16
+ detector = PIIDetector(include_builtins=False)
17
+ detector.add_pattern(Pattern(label="ORDER_ID", regex=r"\bDH-\d{5}\b"))
18
+ text = "Đơn hàng DH-12345 của số điện thoại 0912345678."
19
+
20
+ print_matches("Custom-only detector", text, detector)
21
+
22
+
23
+ if __name__ == "__main__":
24
+ main()
@@ -0,0 +1,31 @@
1
+ """Add a custom regex pattern in Python.
2
+
3
+ Run from the repository after installing the package:
4
+
5
+ python examples/custom_pattern.py
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from _helpers import print_matches
11
+
12
+ from vipii import Pattern, PIIDetector
13
+
14
+
15
+ def main() -> None:
16
+ detector = PIIDetector()
17
+ detector.add_pattern(
18
+ Pattern(
19
+ label="CUSTOMER_ID",
20
+ regex=r"\bKH-\d{6}\b",
21
+ context_words=["mã khách hàng", "customer id"],
22
+ base_score=0.6,
23
+ )
24
+ )
25
+ text = "Mã khách hàng KH-123456 có số điện thoại 0912 345 678."
26
+
27
+ print_matches("Custom pattern from Python", text, detector)
28
+
29
+
30
+ if __name__ == "__main__":
31
+ main()
@@ -0,0 +1,7 @@
1
+ recognizers:
2
+ - name: customer_id
3
+ label: CUSTOMER_ID
4
+ patterns:
5
+ - regex: '\bKH-\d{6}\b'
6
+ context_words: ["mã khách hàng", "customer id"]
7
+ base_score: 0.6
@@ -0,0 +1,6 @@
1
+ Lưu ý: toàn bộ dữ liệu trong ví dụ này là giả.
2
+
3
+ Khách hàng Trần Văn Minh gọi lên tổng đài, số điện thoại 090 123 4567.
4
+ Bạn ấy cung cấp CCCD 001203000123 và mã số thuế cá nhân 0312345678.
5
+ Nhân viên ghi nhận biển số xe 51F-123.45 và hộ chiếu B1234567.
6
+ Thông tin thanh toán có thẻ 9704 0000 1234 5678 và tài khoản ngân hàng 123456789012 tại ngân hàng thử nghiệm.