allyanonimiser 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. allyanonimiser-0.1.0/LICENSE +21 -0
  2. allyanonimiser-0.1.0/MANIFEST.in +25 -0
  3. allyanonimiser-0.1.0/PKG-INFO +252 -0
  4. allyanonimiser-0.1.0/README.md +205 -0
  5. allyanonimiser-0.1.0/allyanonimiser.egg-info/PKG-INFO +252 -0
  6. allyanonimiser-0.1.0/allyanonimiser.egg-info/SOURCES.txt +33 -0
  7. allyanonimiser-0.1.0/allyanonimiser.egg-info/dependency_links.txt +1 -0
  8. allyanonimiser-0.1.0/allyanonimiser.egg-info/requires.txt +15 -0
  9. allyanonimiser-0.1.0/allyanonimiser.egg-info/top_level.txt +5 -0
  10. allyanonimiser-0.1.0/generators/__init__.py +11 -0
  11. allyanonimiser-0.1.0/generators/au_synthetic_data.py +603 -0
  12. allyanonimiser-0.1.0/generators/dataset_publisher.py +609 -0
  13. allyanonimiser-0.1.0/generators/llm_augmenter.py +889 -0
  14. allyanonimiser-0.1.0/insurance/__init__.py +16 -0
  15. allyanonimiser-0.1.0/insurance/claim_notes_analyzer.py +267 -0
  16. allyanonimiser-0.1.0/insurance/email_analyzer.py +411 -0
  17. allyanonimiser-0.1.0/insurance/medical_report_analyzer.py +508 -0
  18. allyanonimiser-0.1.0/patterns/__init__.py +18 -0
  19. allyanonimiser-0.1.0/patterns/au_patterns.py +279 -0
  20. allyanonimiser-0.1.0/patterns/general_patterns.py +195 -0
  21. allyanonimiser-0.1.0/patterns/insurance_patterns.py +205 -0
  22. allyanonimiser-0.1.0/pyproject.toml +71 -0
  23. allyanonimiser-0.1.0/requirements.txt +27 -0
  24. allyanonimiser-0.1.0/setup.cfg +4 -0
  25. allyanonimiser-0.1.0/setup.py +51 -0
  26. allyanonimiser-0.1.0/tests/__init__.py +3 -0
  27. allyanonimiser-0.1.0/tests/conftest.py +160 -0
  28. allyanonimiser-0.1.0/tests/test_analyzer.py +156 -0
  29. allyanonimiser-0.1.0/tests/test_anonymizer.py +201 -0
  30. allyanonimiser-0.1.0/tests/test_generators.py +235 -0
  31. allyanonimiser-0.1.0/tests/test_main_interface.py +179 -0
  32. allyanonimiser-0.1.0/utils/__init__.py +0 -0
  33. allyanonimiser-0.1.0/utils/long_text_processor.py +429 -0
  34. allyanonimiser-0.1.0/utils/presidio_helpers.py +396 -0
  35. allyanonimiser-0.1.0/utils/spacy_helpers.py +308 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Stephen Oates
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,25 @@
1
+ include LICENSE
2
+ include README.md
3
+ include requirements.txt
4
+
5
+ recursive-include tests *.py
6
+ recursive-include allyanonimiser *.py
7
+
8
+ prune .git
9
+ prune .github
10
+ prune venv
11
+ prune env
12
+ prune dist
13
+ prune build
14
+ prune output
15
+ prune temp_files
16
+ prune anonymized_files
17
+ prune challenging_dataset
18
+ prune datasets
19
+ prune cache
20
+
21
+ global-exclude *.py[cod]
22
+ global-exclude __pycache__
23
+ global-exclude *.so
24
+ global-exclude .DS_Store
25
+ global-exclude CLAUDE.md
@@ -0,0 +1,252 @@
1
+ Metadata-Version: 2.1
2
+ Name: allyanonimiser
3
+ Version: 0.1.0
4
+ Summary: Australian-focused PII detection and anonymization for the insurance industry
5
+ Home-page: https://github.com/srepho/Allyanonimiser
6
+ Author: Stephen Oates
7
+ Author-email: Stephen Oates <stephen.j.a.oates@gmail.com>
8
+ License: MIT License
9
+
10
+ Copyright (c) 2025 Stephen Oates
11
+
12
+ Permission is hereby granted, free of charge, to any person obtaining a copy
13
+ of this software and associated documentation files (the "Software"), to deal
14
+ in the Software without restriction, including without limitation the rights
15
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
16
+ copies of the Software, and to permit persons to whom the Software is
17
+ furnished to do so, subject to the following conditions:
18
+
19
+ The above copyright notice and this permission notice shall be included in all
20
+ copies or substantial portions of the Software.
21
+
22
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
23
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
24
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
25
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
26
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
27
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
28
+ SOFTWARE.
29
+
30
+ Project-URL: Homepage, https://github.com/srepho/Allyanonimiser
31
+ Project-URL: Bug Tracker, https://github.com/srepho/Allyanonimiser/issues
32
+ Project-URL: Documentation, https://github.com/srepho/Allyanonimiser#readme
33
+ Keywords: pii,anonymization,privacy,insurance,australia
34
+ Classifier: Programming Language :: Python :: 3
35
+ Classifier: Programming Language :: Python :: 3.8
36
+ Classifier: Programming Language :: Python :: 3.9
37
+ Classifier: Programming Language :: Python :: 3.10
38
+ Classifier: License :: OSI Approved :: MIT License
39
+ Classifier: Operating System :: OS Independent
40
+ Classifier: Topic :: Security
41
+ Classifier: Topic :: Text Processing
42
+ Requires-Python: >=3.8
43
+ Description-Content-Type: text/markdown
44
+ Provides-Extra: dev
45
+ Provides-Extra: llm
46
+ License-File: LICENSE
47
+
48
+ # Allyanonimiser
49
+
50
+ Australian-focused PII detection and anonymization for the insurance industry.
51
+
52
+ ## Features
53
+
54
+ - **Australian-Specific PII Detection**: Specialized recognizers for Australian TFNs, Medicare numbers, driver's licenses, and other Australian-specific identifiers.
55
+ - **Insurance Industry Focus**: Recognition of policy numbers, claim references, vehicle identifiers, and other insurance-specific data.
56
+ - **Long Text Processing**: Optimized for processing lengthy free-text fields like claim notes, medical reports, and emails.
57
+ - **Custom Pattern Creation**: Easy creation of custom entity recognizers for organization-specific data.
58
+ - **Synthetic Data Generation**: Generate realistic Australian test data for validation.
59
+ - **LLM Integration**: Use Language Models to create challenging datasets for testing.
60
+ - **Extensible Architecture**: Built on Presidio and spaCy with a modular, extensible design.
61
+
62
+ ## Installation
63
+
64
+ ```bash
65
+ # Install from PyPI
66
+ pip install allyanonimiser
67
+
68
+ # Install the required spaCy model
69
+ python -m spacy download en_core_web_lg
70
+ ```
71
+
72
+ ## Quick Start
73
+
74
+ ```python
75
+ from allyanonimiser import create_au_insurance_analyzer
76
+
77
+ # Create an analyzer with Australian and insurance patterns
78
+ analyzer = create_au_insurance_analyzer()
79
+
80
+ # Analyze text
81
+ results = analyzer.analyze(
82
+ text="Please reference your policy AU-12345678 for claims related to your vehicle rego XYZ123.",
83
+ language="en"
84
+ )
85
+
86
+ # Print results
87
+ for result in results:
88
+     print(f"Entity: {result.entity_type}, Text: {result.text}, Score: {result.score}")
89
+ ```
90
+
91
+ ## Processing Insurance Documents
92
+
93
+ ### Claim Notes
94
+
95
+ ```python
96
+ from allyanonimiser import analyze_claim_notes
97
+
98
+ # Long claim note text
99
+ claim_note = """
100
+ Claim Details:
101
+ Spoke with the insured John Smith (TFN: 123 456 789) regarding damage to his vehicle ABC123.
102
+ The incident occurred on 14/05/2023 when another vehicle collided with the rear of his car.
103
+ Policy number: POL-987654321
104
+
105
+ Vehicle Details:
106
+ Toyota Corolla 2020
107
+ VIN: 1HGCM82633A123456
108
+ Registration: ABC123
109
+
110
+ Contact Information:
111
+ Phone: 0412 345 678
112
+ Email: john.smith@example.com
113
+ Address: 123 Main St, Sydney NSW 2000
114
+ """
115
+
116
+ # Analyze the claim note
117
+ analysis = analyze_claim_notes(claim_note)
118
+
119
+ # Access structured information
120
+ print("Incident Description:", analysis["incident_description"])
121
+ print("\nPII-rich segments:")
122
+ for segment in analysis["pii_segments"]:
123
+     print(f" - {segment['text'][:50]}... (PII likelihood: {segment['pii_likelihood']:.2f})")
124
+
125
+ # Anonymize the text
126
+ from allyanonimiser import EnhancedAnonymizer, create_au_insurance_analyzer
127
+ anonymizer = EnhancedAnonymizer(analyzer=create_au_insurance_analyzer())
128
+ anonymized = anonymizer.anonymize(claim_note)
129
+ print("\nAnonymized text:")
130
+ print(anonymized["text"])
131
+ ```
132
+
133
+ ### Processing Emails
134
+
135
+ ```python
136
+ from allyanonimiser.insurance import InsuranceEmailAnalyzer
137
+
138
+ email_text = """
139
+ From: adjuster@insurance.com.au
140
+ To: customer@example.com
141
+ Subject: Your Claim CL-12345678
142
+
143
+ Dear Mr. Smith,
144
+
145
+ Thank you for your recent claim submission regarding your vehicle (Registration: XYZ123).
146
+
147
+ We have assigned your claim number CL-12345678. Please reference this number in all future correspondence.
148
+
149
+ Your policy POL-9876543 covers this type of damage, and we'll need the following information:
150
+ 1. Your Medicare number
151
+ 2. Additional photos of the damage
152
+ 3. The repair quote from the mechanic
153
+
154
+ Please call me at 03 9876 5432 if you have any questions.
155
+
156
+ Kind regards,
157
+ Sarah Johnson
158
+ Claims Assessor
159
+ """
160
+
161
+ email_analyzer = InsuranceEmailAnalyzer()
162
+ analysis = email_analyzer.analyze(email_text)
163
+
164
+ print("Email Subject:", analysis["subject"])
165
+ print("Claim Number:", analysis["claim_number"])
166
+ print("Policy Number:", analysis["policy_number"])
167
+ print("Customer Name:", analysis["customer_name"])
168
+ print("Identified PII:", analysis["pii_entities"])
169
+ ```
170
+
171
+ ## Creating Custom Patterns
172
+
173
+ ```python
174
+ from allyanonimiser import CustomPatternDefinition, create_pattern_from_examples
175
+
176
+ # Create a custom pattern for internal reference numbers
177
+ internal_ref_examples = [
178
+ "Internal reference: REF-12345",
179
+ "Ref Number: REF-98765",
180
+ "Reference: REF-55555"
181
+ ]
182
+
183
+ pattern = create_pattern_from_examples(
184
+ entity_type="INTERNAL_REFERENCE",
185
+ examples=internal_ref_examples,
186
+ context=["internal", "reference", "ref"],
187
+ pattern_type="regex"
188
+ )
189
+
190
+ # Add to an existing analyzer (e.g. the one created in Quick Start)
191
+ analyzer.add_pattern(pattern)
192
+ ```
193
+
194
+ ## Using the Pattern Registry
195
+
196
+ ```python
197
+ from allyanonimiser import PatternRegistry, CustomPatternDefinition
198
+
199
+ # Create a registry
200
+ registry = PatternRegistry()
201
+
202
+ # Register patterns
203
+ registry.register_pattern(CustomPatternDefinition(
204
+ entity_type="BROKER_CODE",
205
+ patterns=["BRK-[0-9]{4}"],
206
+ context=["broker", "agent", "representative"],
207
+ name="broker_code_recognizer"
208
+ ))
209
+
210
+ # Share patterns across applications
211
+ registry.export_patterns("insurance_patterns.json")
212
+
213
+ # Later, in another application
214
+ registry.import_patterns("insurance_patterns.json")
215
+ ```
216
+
217
+ ## Working with Australian Data
218
+
219
+ ```python
220
+ from allyanonimiser.patterns import get_au_pattern_definitions
221
+
222
+ # Get all Australian pattern definitions
223
+ au_patterns = get_au_pattern_definitions()
224
+
225
+ # Print information about each pattern
226
+ for pattern in au_patterns:
227
+     print(f"Entity Type: {pattern['entity_type']}")
228
+     print(f"Description: {pattern['description']}")
229
+     print(f"Example Patterns: {pattern['patterns'][:2]}")
230
+     print("Context Terms:", ", ".join(pattern['context'][:5]))
231
+     print()
232
+ ```
233
+
234
+ ## Generating Australian Test Data
235
+
236
+ ```python
237
+ from allyanonimiser.generators import AustralianSyntheticDataGenerator
238
+
239
+ # Create a data generator
240
+ generator = AustralianSyntheticDataGenerator()
241
+
242
+ # Generate a dataset of Australian insurance documents
243
+ generator.generate_dataset(
244
+ num_documents=50,
245
+ output_dir="au_insurance_dataset",
246
+ include_annotations=True
247
+ )
248
+ ```
249
+
250
+ ## License
251
+
252
+ MIT License
@@ -0,0 +1,205 @@
1
+ # Allyanonimiser
2
+
3
+ Australian-focused PII detection and anonymization for the insurance industry.
4
+
5
+ ## Features
6
+
7
+ - **Australian-Specific PII Detection**: Specialized recognizers for Australian TFNs, Medicare numbers, driver's licenses, and other Australian-specific identifiers.
8
+ - **Insurance Industry Focus**: Recognition of policy numbers, claim references, vehicle identifiers, and other insurance-specific data.
9
+ - **Long Text Processing**: Optimized for processing lengthy free-text fields like claim notes, medical reports, and emails.
10
+ - **Custom Pattern Creation**: Easy creation of custom entity recognizers for organization-specific data.
11
+ - **Synthetic Data Generation**: Generate realistic Australian test data for validation.
12
+ - **LLM Integration**: Use Language Models to create challenging datasets for testing.
13
+ - **Extensible Architecture**: Built on Presidio and spaCy with a modular, extensible design.
14
+
15
+ ## Installation
16
+
17
+ ```bash
18
+ # Install from PyPI
19
+ pip install allyanonimiser
20
+
21
+ # Install the required spaCy model
22
+ python -m spacy download en_core_web_lg
23
+ ```
24
+
25
+ ## Quick Start
26
+
27
+ ```python
28
+ from allyanonimiser import create_au_insurance_analyzer
29
+
30
+ # Create an analyzer with Australian and insurance patterns
31
+ analyzer = create_au_insurance_analyzer()
32
+
33
+ # Analyze text
34
+ results = analyzer.analyze(
35
+ text="Please reference your policy AU-12345678 for claims related to your vehicle rego XYZ123.",
36
+ language="en"
37
+ )
38
+
39
+ # Print results
40
+ for result in results:
41
+     print(f"Entity: {result.entity_type}, Text: {result.text}, Score: {result.score}")
42
+ ```
43
+
44
+ ## Processing Insurance Documents
45
+
46
+ ### Claim Notes
47
+
48
+ ```python
49
+ from allyanonimiser import analyze_claim_notes
50
+
51
+ # Long claim note text
52
+ claim_note = """
53
+ Claim Details:
54
+ Spoke with the insured John Smith (TFN: 123 456 789) regarding damage to his vehicle ABC123.
55
+ The incident occurred on 14/05/2023 when another vehicle collided with the rear of his car.
56
+ Policy number: POL-987654321
57
+
58
+ Vehicle Details:
59
+ Toyota Corolla 2020
60
+ VIN: 1HGCM82633A123456
61
+ Registration: ABC123
62
+
63
+ Contact Information:
64
+ Phone: 0412 345 678
65
+ Email: john.smith@example.com
66
+ Address: 123 Main St, Sydney NSW 2000
67
+ """
68
+
69
+ # Analyze the claim note
70
+ analysis = analyze_claim_notes(claim_note)
71
+
72
+ # Access structured information
73
+ print("Incident Description:", analysis["incident_description"])
74
+ print("\nPII-rich segments:")
75
+ for segment in analysis["pii_segments"]:
76
+     print(f" - {segment['text'][:50]}... (PII likelihood: {segment['pii_likelihood']:.2f})")
77
+
78
+ # Anonymize the text
79
+ from allyanonimiser import EnhancedAnonymizer, create_au_insurance_analyzer
80
+ anonymizer = EnhancedAnonymizer(analyzer=create_au_insurance_analyzer())
81
+ anonymized = anonymizer.anonymize(claim_note)
82
+ print("\nAnonymized text:")
83
+ print(anonymized["text"])
84
+ ```
85
+
86
+ ### Processing Emails
87
+
88
+ ```python
89
+ from allyanonimiser.insurance import InsuranceEmailAnalyzer
90
+
91
+ email_text = """
92
+ From: adjuster@insurance.com.au
93
+ To: customer@example.com
94
+ Subject: Your Claim CL-12345678
95
+
96
+ Dear Mr. Smith,
97
+
98
+ Thank you for your recent claim submission regarding your vehicle (Registration: XYZ123).
99
+
100
+ We have assigned your claim number CL-12345678. Please reference this number in all future correspondence.
101
+
102
+ Your policy POL-9876543 covers this type of damage, and we'll need the following information:
103
+ 1. Your Medicare number
104
+ 2. Additional photos of the damage
105
+ 3. The repair quote from the mechanic
106
+
107
+ Please call me at 03 9876 5432 if you have any questions.
108
+
109
+ Kind regards,
110
+ Sarah Johnson
111
+ Claims Assessor
112
+ """
113
+
114
+ email_analyzer = InsuranceEmailAnalyzer()
115
+ analysis = email_analyzer.analyze(email_text)
116
+
117
+ print("Email Subject:", analysis["subject"])
118
+ print("Claim Number:", analysis["claim_number"])
119
+ print("Policy Number:", analysis["policy_number"])
120
+ print("Customer Name:", analysis["customer_name"])
121
+ print("Identified PII:", analysis["pii_entities"])
122
+ ```
123
+
124
+ ## Creating Custom Patterns
125
+
126
+ ```python
127
+ from allyanonimiser import CustomPatternDefinition, create_pattern_from_examples
128
+
129
+ # Create a custom pattern for internal reference numbers
130
+ internal_ref_examples = [
131
+ "Internal reference: REF-12345",
132
+ "Ref Number: REF-98765",
133
+ "Reference: REF-55555"
134
+ ]
135
+
136
+ pattern = create_pattern_from_examples(
137
+ entity_type="INTERNAL_REFERENCE",
138
+ examples=internal_ref_examples,
139
+ context=["internal", "reference", "ref"],
140
+ pattern_type="regex"
141
+ )
142
+
143
+ # Add to an existing analyzer (e.g. the one created in Quick Start)
144
+ analyzer.add_pattern(pattern)
145
+ ```
146
+
147
+ ## Using the Pattern Registry
148
+
149
+ ```python
150
+ from allyanonimiser import PatternRegistry, CustomPatternDefinition
151
+
152
+ # Create a registry
153
+ registry = PatternRegistry()
154
+
155
+ # Register patterns
156
+ registry.register_pattern(CustomPatternDefinition(
157
+ entity_type="BROKER_CODE",
158
+ patterns=["BRK-[0-9]{4}"],
159
+ context=["broker", "agent", "representative"],
160
+ name="broker_code_recognizer"
161
+ ))
162
+
163
+ # Share patterns across applications
164
+ registry.export_patterns("insurance_patterns.json")
165
+
166
+ # Later, in another application
167
+ registry.import_patterns("insurance_patterns.json")
168
+ ```
169
+
170
+ ## Working with Australian Data
171
+
172
+ ```python
173
+ from allyanonimiser.patterns import get_au_pattern_definitions
174
+
175
+ # Get all Australian pattern definitions
176
+ au_patterns = get_au_pattern_definitions()
177
+
178
+ # Print information about each pattern
179
+ for pattern in au_patterns:
180
+     print(f"Entity Type: {pattern['entity_type']}")
181
+     print(f"Description: {pattern['description']}")
182
+     print(f"Example Patterns: {pattern['patterns'][:2]}")
183
+     print("Context Terms:", ", ".join(pattern['context'][:5]))
184
+     print()
185
+ ```
186
+
187
+ ## Generating Australian Test Data
188
+
189
+ ```python
190
+ from allyanonimiser.generators import AustralianSyntheticDataGenerator
191
+
192
+ # Create a data generator
193
+ generator = AustralianSyntheticDataGenerator()
194
+
195
+ # Generate a dataset of Australian insurance documents
196
+ generator.generate_dataset(
197
+ num_documents=50,
198
+ output_dir="au_insurance_dataset",
199
+ include_annotations=True
200
+ )
201
+ ```
202
+
203
+ ## License
204
+
205
+ MIT License