text-mallet 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- text_mallet-0.1.0.dist-info/METADATA +163 -0
- text_mallet-0.1.0.dist-info/RECORD +28 -0
- text_mallet-0.1.0.dist-info/WHEEL +5 -0
- text_mallet-0.1.0.dist-info/licenses/LICENSE +21 -0
- text_mallet-0.1.0.dist-info/top_level.txt +1 -0
- tmallet/__init__.py +3 -0
- tmallet/core/__init__.py +0 -0
- tmallet/core/pipeline.py +314 -0
- tmallet/obfuscators/__init__.py +25 -0
- tmallet/obfuscators/base.py +17 -0
- tmallet/obfuscators/morph/__init__.py +0 -0
- tmallet/obfuscators/morph/lemmatizer.py +17 -0
- tmallet/obfuscators/pos/__init__.py +0 -0
- tmallet/obfuscators/pos/config.py +36 -0
- tmallet/obfuscators/pos/pos_filter.py +226 -0
- tmallet/obfuscators/replacement_token.py +22 -0
- tmallet/obfuscators/shannon/__init__.py +0 -0
- tmallet/obfuscators/shannon/config.py +11 -0
- tmallet/obfuscators/shannon/shannon_bert.py +327 -0
- tmallet/obfuscators/shannon/shannon_filter.py +202 -0
- tmallet/obfuscators/shannon/visualise.py +223 -0
- tmallet/obfuscators/structural/__init__.py +0 -0
- tmallet/obfuscators/structural/config.py +14 -0
- tmallet/obfuscators/structural/scramble_hier.py +182 -0
- tmallet/obfuscators/structural/scramble_linear.py +89 -0
- tmallet/utils/__init__.py +4 -0
- tmallet/utils/helper.py +65 -0
- tmallet/utils/spacy_registry.py +75 -0
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: text-mallet
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Toolkit for Obfuscating Text Through Transformations.
|
|
5
|
+
Author-email: CORAL <gallagher.eu@protonmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://text-mallet.readthedocs.io/
|
|
8
|
+
Project-URL: Repository, https://github.com/DanielGall500/reef
|
|
9
|
+
Project-URL: Documentation, https://text-mallet.readthedocs.io/
|
|
10
|
+
Keywords: nlp,obfuscation,copyright-protection,derived text formats
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Requires-Python: >=3.10
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
License-File: LICENSE
|
|
16
|
+
Requires-Dist: spacy>=3.0.0
|
|
17
|
+
Requires-Dist: wordfreq>=3.1.1
|
|
18
|
+
Requires-Dist: pydantic>=2.13.2
|
|
19
|
+
Requires-Dist: sacremoses>=0.1.1
|
|
20
|
+
Requires-Dist: syntok>=1.4.4
|
|
21
|
+
Requires-Dist: pandas>=3.0.1
|
|
22
|
+
Requires-Dist: transformers>=5.9.0
|
|
23
|
+
Requires-Dist: datasets>=4.8.5
|
|
24
|
+
Requires-Dist: torch>=2.12.0
|
|
25
|
+
Requires-Dist: plotnine>=0.15.4
|
|
26
|
+
Provides-Extra: dev
|
|
27
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
28
|
+
Requires-Dist: black>=23.0.0; extra == "dev"
|
|
29
|
+
Requires-Dist: sphinx>=9.1.0; extra == "dev"
|
|
30
|
+
Requires-Dist: shibuya; extra == "dev"
|
|
31
|
+
Dynamic: license-file
|
|
32
|
+
|
|
33
|
+
[](https://spacy.io)
|
|
34
|
+

|
|
35
|
+

|
|
36
|
+
|
|
37
|
+
<br />
|
|
38
|
+
<div align="center">
|
|
39
|
+
<img src="assets/mallet.svg" alt="Logo" width="200" height="200">
|
|
40
|
+
|
|
41
|
+
<p align="center">
|
|
42
|
+
Smash Text Into Obfuscated Formats
|
|
43
|
+
<br />
|
|
44
|
+
<br />
|
|
45
|
+
<br />
|
|
46
|
+
<a href="https://text-mallet.readthedocs.io/en/latest/">Documentation</a>
|
|
47
|
+
·
|
|
48
|
+
<a href="https://github.com/DanielGall500/text-hammer/issues/new?labels=bug&template=bug-report---.md">Report Bug</a>
|
|
49
|
+
·
|
|
50
|
+
<a href="https://github.com/DanielGall500/text-hammer/issues/new?labels=enhancement&template=feature-request---.md">Request Feature</a>
|
|
51
|
+
</p>
|
|
52
|
+
</div>
|
|
53
|
+
|
|
54
|
+
A lightweight text transformation engine for smashing text into [derived](https://text-plus.org/en/themen-dokumentation/atf) formats. `text-mallet` reduces the risk of privacy or copyright infringement while preserving the structural, semantic, and syntactic utility of your data for downstream NLP tasks like classification, information retrieval, topic modeling, and semantic similarity.
|
|
55
|
+
|
|
56
|
+
The package natively supports text processing in **English** and **German**.
|
|
57
|
+
|
|
58
|
+
## Install
|
|
59
|
+
```bash
|
|
60
|
+
pip install text-mallet
|
|
61
|
+
|
|
62
|
+
# For obfuscating English:
|
|
63
|
+
python -m spacy download en_core_web_trf
|
|
64
|
+
|
|
65
|
+
# For obfuscating German:
|
|
66
|
+
python -m spacy download de_dep_news_trf
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## Strategic Obfuscation
|
|
70
|
+
|
|
71
|
+
The central aim of text obfuscation using this package is to prevent **reconstructability**. Doing so involves eroding various aspects of text, such as:
|
|
72
|
+
* **Word Forms**: The exact sequence of characters.
|
|
73
|
+
* **Syntactic Features**: Morpho-syntactic token properties.
|
|
74
|
+
* **Meanings**: Core semantic associations.
|
|
75
|
+
* **Grammatical Relations**: Hierarchical sentence dependency tree structure.
|
|
76
|
+
* **Sequence Information**: Flat linear sequence boundaries.
|
|
77
|
+
|
|
78
|
+
Each layer contributes to the discoverability and reconstructability of the source text. This package provides four main approaches to obfuscation: POS filtering, mutual information filtering, hierarchical scrambling, and bag-of-words. For a more detailed overview of each method, please see the [documentation](https://text-mallet.readthedocs.io/).
|
|
79
|
+
|
|
80
|
+
### Why Obfuscate Text?
|
|
81
|
+
When training models for text generation, we typically need all of the content and style of the original, fluent text. However, there are many tasks such as classification, semantic similarity scoring, topic modelling, and so on, where the original text may not be required in its original form to help model performance. There is typically a trove of public-domain data that can be used for model training, but there are still many questions around the usage of copyright-protected data in training. This package offers a route to preserve some of the value of copyrighted texts while hindering their reconstruction, whether that be through training-data reconstruction or model outputs.
|
|
82
|
+
|
|
83
|
+
The result is a set of transformed texts that are no longer consumable by humans, but are still useful for training on specific tasks.
|
|
84
|
+
|
|
85
|
+
### Impact on Performance
|
|
86
|
+
While training on transformed text formats can introduce minor performance drops compared to raw text baselines, `text-mallet` is explicitly designed to **accompany** non-obfuscated, public-domain collections. Introducing these anonymised, non-human-consumable formats adds proprietary text signals into your pipeline without exposing the original text to model parameters.
|
|
87
|
+
|
|
88
|
+
### Basic Obfuscation
|
|
89
|
+
There are multiple general obfuscation approaches to choose from, separated into three general categories:
|
|
90
|
+
* Structural Obfuscation (bag-of-words, hierarchical)
|
|
91
|
+
* Part-of-Speech Filtering
|
|
92
|
+
* Mutual-Information Filtering
|
|
93
|
+
|
|
94
|
+
Let's start with an example of bag-of-words scrambling.
|
|
95
|
+
```python
|
|
96
|
+
from tmallet import TMallet
|
|
97
|
+
|
|
98
|
+
# 1. Define the Obfuscation Configuration
|
|
99
|
+
algorithm = "scramble-BoW"
|
|
100
|
+
config = {
|
|
101
|
+
"level": "document",
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
# 2. Define Sample Text
|
|
105
|
+
sample = "Leipzig is the most populous city in the German state of Saxony. The city has a population of 633,592 residents as of 31 December 2025. It is the eighth-largest city in Germany and is part of the Central German Metropolitan Region. Leipzig is located about 150 km (90 mi) southwest of Berlin, in the southernmost part of the North German Plain (the Leipzig Bay), at the confluence of the White Elster and its tributaries Pleiße and Parthe."
|
|
106
|
+
|
|
107
|
+
# 3. Load Text Mallet and Obfuscate
|
|
108
|
+
tmallet = TMallet(lang="en", prefer_gpu=True)
|
|
109
|
+
tmallet.load_obfuscator(algorithm, config)
|
|
110
|
+
|
|
111
|
+
obfuscated_text_sample = tmallet.obfuscate(sample)
|
|
112
|
+
print(obfuscated_text_sample)
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
Output
|
|
116
|
+
```bash
|
|
117
|
+
southernmost 150 eighth-largest most the (90 Central located Bay), Parthe. mi) in city of German Germany part population the is Plain of populous and as Leipzig the German the the km White its Metropolitan Berlin, and Leipzig in the confluence 2025. of of city It Elster is Region. December tributaries The state Saxony. (the southwest the residents the of city Leipzig German is in is Pleiße of a has at part about of 31 and North 633,592
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
**Obfuscate based on an approximation of 'word importance'**
|
|
121
|
+
Mutual information measures how much information context tells you about a word.
|
|
122
|
+
Words which are both _rare_ and _context-dependent_ tend to be _important_ to the meaning of a text.
|
|
123
|
+
We can apply a filter to set upper or lower bounds on such an MI score, filtering at the word level.
|
|
124
|
+
```python
|
|
125
|
+
from tmallet import TMallet
|
|
126
|
+
|
|
127
|
+
algorithm = "shannon"
|
|
128
|
+
config = {
|
|
129
|
+
"threshold": 9.5,
|
|
130
|
+
"bound": "lower",
|
|
131
|
+
"replacement_mechanism": "default",
|
|
132
|
+
"max_context_length": 128,
|
|
133
|
+
}
|
|
134
|
+
|
|
135
|
+
tmallet = TMallet(lang="en", prefer_gpu=True)
|
|
136
|
+
tmallet.load_obfuscator(algorithm, config)
|
|
137
|
+
|
|
138
|
+
text = "Data obfuscation is the process of modifying sensitive data in such a way that it is of no or little value to unauthorized intruders while still being usable by software or authorized personnel. Data masking can also be referred as anonymization, or tokenization, depending on different context."
|
|
139
|
+
obfuscated = tmallet.obfuscate(text)
|
|
140
|
+
print(obfuscated)
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
Output
|
|
144
|
+
```
|
|
145
|
+
_ _ is the _ of _ _ _ in _ a _ that it is of no or little _ to _ _ _ _ _ _ by software or _ _. _ _ can also be referred as _, or _, _ on different _.
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
If we obfuscate too strongly using mutual information, we'll end up with obfuscated sentences like:
|
|
149
|
+
```
|
|
150
|
+
, . . - ., or . the / . the . - –, . -
|
|
151
|
+
```
|
|
152
|
+
That's, well, probably not very useful. Ideally, we can find a balance between the obfuscation of some words and inclusion of others. Using the bound as `lower` instead of `upper` will preserve those words which tend to be more meaningful in a text. Running the above example again with the lower bound of 9, we get an obfuscation which preserves core semantic value while making the text more difficult to reconstruct:
|
|
153
|
+
```
|
|
154
|
+
Data obfuscation _ _ process _ modifying sensitive data _ such _ way _ _ _ _ _ _ _ value _ unauthorized intruders while still being usable _ _ _ authorized personnel _ Data masking _ _ _ _ _ anonymization _ _ tokenization _ depending _ _ context _
|
|
155
|
+
```
|
|
156
|
+
Here's an overview of an approximation of pointwise word-level mutual information, i.e. PMI(word; context), over 12,000 tokens taken from 10 random texts in the FineWeb-Edu dataset, for instance.
|
|
157
|
+

|
|
158
|
+
|
|
159
|
+
#### Contributions
|
|
160
|
+
Contributions are always welcome. This package was primarily developed in conjunction with carrying out research on the potential value of such obfuscation formats in encoder pre-training. Therefore, there are many ways in which the package could be improved upon, and there are likely opportunities for efficiency improvements in particular.
|
|
161
|
+
|
|
162
|
+
#### Acknowledgements
|
|
163
|
+
Part of this work was conducted within the [CORAL project](https://coral-nlp.github.io) funded by the German Federal Ministry of Research, Technology, and Space (BMFTR) under the grant number 16IS24077A. Responsibility for the content of this publication lies with the authors.
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
text_mallet-0.1.0.dist-info/licenses/LICENSE,sha256=FDzyLfdTplJkQ0PoTuGRwyBUOpZZ-IA04Ox2X5aHQLM,1107
|
|
2
|
+
tmallet/__init__.py,sha256=usScnGjg3_WbsWgYJxfRhDKE7KV0dgZr9rb9Wm7IN6w,65
|
|
3
|
+
tmallet/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
|
+
tmallet/core/pipeline.py,sha256=ARNcGUI8k1o4Bao4MqUxVQoZTna_GgFxMxKm-bWnDq0,12075
|
|
5
|
+
tmallet/obfuscators/__init__.py,sha256=FtrtALbi9bQM1lLOQtt1fe5QFNSBeAPAfcQsuk36xnk,943
|
|
6
|
+
tmallet/obfuscators/base.py,sha256=xTkH5AHxKS-AMUvApLpFAEz5m-P0ICrnVI_wpH8mBiE,573
|
|
7
|
+
tmallet/obfuscators/replacement_token.py,sha256=fkdvpBwQ4zV_tzF-7HSb0a39O1Eeb75J0f0H1esBQUk,486
|
|
8
|
+
tmallet/obfuscators/morph/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
9
|
+
tmallet/obfuscators/morph/lemmatizer.py,sha256=dTphm3Mqr6TVIARQCtQE4uBAEFFiXl9VugnKxk9LEWA,463
|
|
10
|
+
tmallet/obfuscators/pos/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
11
|
+
tmallet/obfuscators/pos/config.py,sha256=AsqBJeSBn8qVH9EKXV3HJM8k_RdOSlKpVZ1JU54-zTs,709
|
|
12
|
+
tmallet/obfuscators/pos/pos_filter.py,sha256=IZTqp3tx2DowfTwFEVoUSsHfBjaf4BykMQIwx4Gjap0,8463
|
|
13
|
+
tmallet/obfuscators/shannon/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
14
|
+
tmallet/obfuscators/shannon/config.py,sha256=br5jYkALcz8NDzQ7vD124ppktILWsVLHkfiLlFfL5oc,310
|
|
15
|
+
tmallet/obfuscators/shannon/shannon_bert.py,sha256=KjOteTlHYiwZ8cnPqZhEu731Uo6o-pvMpOHALjNsspQ,11425
|
|
16
|
+
tmallet/obfuscators/shannon/shannon_filter.py,sha256=swIk-10iFMwH_gLczpz4xs52UZNVwQeQ7d8mf7Q870M,7638
|
|
17
|
+
tmallet/obfuscators/shannon/visualise.py,sha256=Rb2zQPAA5k3OUjdw5_qNhoKO05lMQFe6TqYWu0ZrKRE,7068
|
|
18
|
+
tmallet/obfuscators/structural/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
19
|
+
tmallet/obfuscators/structural/config.py,sha256=r8lR8ru-7rk697HSbu_J_IC08kp0YYDPIB6fui2xqPU,314
|
|
20
|
+
tmallet/obfuscators/structural/scramble_hier.py,sha256=TnwBfzB_UBEfEE5cwsKboBSuNKG2BN9XO7ZyRRF8DRE,6064
|
|
21
|
+
tmallet/obfuscators/structural/scramble_linear.py,sha256=jAw3NpVO-6XNFUP717zHvn0vryMTgNCyS04gUuDtvog,3175
|
|
22
|
+
tmallet/utils/__init__.py,sha256=JBSgvXwfvxwyTF_J_3cnqFkY4tSo1wiQS8E1pu39MI8,202
|
|
23
|
+
tmallet/utils/helper.py,sha256=ezNBKOp9ejA4fBNwnEVLeBqg0KM02953v5VTLD2-9UI,2094
|
|
24
|
+
tmallet/utils/spacy_registry.py,sha256=WSgw1zlW56CqhrMQ29R7rbykQ-wPa3KjrUcivUqnl7E,2492
|
|
25
|
+
text_mallet-0.1.0.dist-info/METADATA,sha256=1Gb-jy8f7MKae7Og6hBnUtr0UCI7spW8yWGkX3_-XhY,9181
|
|
26
|
+
text_mallet-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
27
|
+
text_mallet-0.1.0.dist-info/top_level.txt,sha256=trxVebx9xwb6N6JkzEmOBAyBjqslgtY2VIF-f_ZWjT0,8
|
|
28
|
+
text_mallet-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Institute for Applied Informatics (InfAI), Leipzig
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
tmallet
|
tmallet/__init__.py
ADDED
tmallet/core/__init__.py
ADDED
|
File without changes
|
tmallet/core/pipeline.py
ADDED
|
@@ -0,0 +1,314 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from functools import partial
|
|
3
|
+
from itertools import islice
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Literal
|
|
6
|
+
|
|
7
|
+
from datasets import Dataset, concatenate_datasets, load_dataset, load_from_disk
|
|
8
|
+
|
|
9
|
+
from tmallet.obfuscators import (
|
|
10
|
+
HierarchicalScrambleConfig,
|
|
11
|
+
HierarchicalScrambleObfuscator,
|
|
12
|
+
LemmaObfuscator,
|
|
13
|
+
LinearScrambleConfig,
|
|
14
|
+
LinearScrambleObfuscator,
|
|
15
|
+
POSFilter,
|
|
16
|
+
POSFilterConfig,
|
|
17
|
+
ShannonFilter,
|
|
18
|
+
ShannonFilterConfig,
|
|
19
|
+
)
|
|
20
|
+
from tmallet.obfuscators.base import Obfuscator, SpaCyObfuscator
|
|
21
|
+
from tmallet.utils import LangConfig, SpaCyInterface, flatten_dict
|
|
22
|
+
|
|
23
|
+
# String identifiers of the obfuscation techniques the pipeline accepts.
ObfuscationTechnique = Literal[
    # retain or remove specific POS tags
    "pos-filter",
    # dependency-parsing structural obfuscation
    "scramble-hier",
    # randomly shuffle words at the sentence or document level
    "scramble-BoW",
    # filter based on an approximation of word importance
    "shannon",
    # word-level lemmatisation - still available, but no longer supported
    "lemmatize",
]
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class TMallet:
|
|
33
|
+
"""A text obfuscation manager that applies transformations to text.
|
|
34
|
+
|
|
35
|
+
This class applies selected algorithmic obfuscation techniques (such as POS filtering,
|
|
36
|
+
bag-of-words scrambling, or information-theoretic filtering) to strings, lists of text,
|
|
37
|
+
or entire datasets.
|
|
38
|
+
|
|
39
|
+
Arguments:
|
|
40
|
+
lang (LangConfig): The language configuration code (e.g., "en").
|
|
41
|
+
prefer_gpu (bool): Whether spaCy is configured to leverage GPU acceleration.
|
|
42
|
+
"""
|
|
43
|
+
|
|
44
|
+
# apply_spacy_preprocessing: determines whether spacy is used or not
|
|
45
|
+
# for the initial text processing
|
|
46
|
+
# -> determined automatically based on the configuration selected
|
|
47
|
+
apply_spacy_preprocessing: bool = False
|
|
48
|
+
is_obfuscation_set_up: bool = False
|
|
49
|
+
active_obfuscator = None
|
|
50
|
+
active_config: dict | None = None
|
|
51
|
+
active_algorithm: str | None = None
|
|
52
|
+
|
|
53
|
+
def __init__(self, lang: LangConfig = "en", prefer_gpu: bool = False):
    """Set up the pipeline for one language and device preference.

    Args:
        lang (LangConfig, optional): Target language code - either "en"
            (English) or "de" (German). Defaults to "en".
        prefer_gpu (bool, optional): Forwarded to the spaCy interface and
            stored on the instance; if True, spaCy operations are meant to
            be allocated on the GPU. Defaults to False.
    """
    self.lang: LangConfig = lang
    self.prefer_gpu = prefer_gpu
    # The spaCy interface is created with the same language and device
    # preference that are recorded on the instance.
    self.spacy_interface: SpaCyInterface = SpaCyInterface(
        lang=lang,
        prefer_gpu=prefer_gpu,
    )
|
|
66
|
+
|
|
67
|
+
def load_obfuscator(
    self,
    algorithm: "ObfuscationTechnique",
    config: dict[str, object],
) -> "TMallet":
    """Validate a configuration and activate the matching obfuscator.

    The validated config, the obfuscator and the algorithm name are only
    stored on the instance once every step has succeeded, so a failed load
    never leaves `active_config`, `active_obfuscator` and
    `active_algorithm` describing different algorithms.

    Args:
        algorithm (str): The identifier of the obfuscation technique
            (e.g., 'pos-filter').
        config (dict): Key-value pairings containing parameters for the
            specific algorithm. Values are not restricted to strings.

    Raises:
        ValueError: If `algorithm` is not a known obfuscation technique.

    Returns:
        TMallet: The current class instance to allow for method chaining.
    """
    validated_config = self._validate_config(algorithm, config)
    obfuscator = self._get_obfuscator(algorithm)
    obfuscator.set_config(validated_config)

    # Commit the new state in one go, after nothing above has raised.
    self.active_config = validated_config
    self.active_obfuscator = obfuscator
    self.active_algorithm = algorithm
    return self
|
|
86
|
+
|
|
87
|
+
def obfuscate(self, text: str) -> dict | str:
|
|
88
|
+
"""Obfuscates standalone text strings or lists of strings.
|
|
89
|
+
|
|
90
|
+
Requires an obfuscator to be loaded via `load_obfuscator` prior to invocation.
|
|
91
|
+
|
|
92
|
+
Args:
|
|
93
|
+
text (Union[List[str], str]): Single text payload or collection of texts to process.
|
|
94
|
+
|
|
95
|
+
Raises:
|
|
96
|
+
RuntimeError: If an obfuscator and configuration have not been loaded yet.
|
|
97
|
+
|
|
98
|
+
Returns:
|
|
99
|
+
Dict: The modified, obfuscated text or collection of texts in the form of a dictionary.
|
|
100
|
+
"""
|
|
101
|
+
if (
|
|
102
|
+
self.active_obfuscator is None
|
|
103
|
+
and self.active_config is None
|
|
104
|
+
and not self.active_algorithm == "lemmatize"
|
|
105
|
+
):
|
|
106
|
+
raise RuntimeError(
|
|
107
|
+
"Please use `set_obfuscator` to setup the obfuscation details first."
|
|
108
|
+
)
|
|
109
|
+
|
|
110
|
+
if self.apply_spacy_preprocessing:
|
|
111
|
+
text = self.spacy_interface.process(text)
|
|
112
|
+
|
|
113
|
+
return self.active_obfuscator.obfuscate(text)
|
|
114
|
+
|
|
115
|
+
def _obfuscate_batch(
    self,
    batch: dict,
    column: str,
    column_obfuscated: str,
    multi: bool = True,
) -> dict:
    """Obfuscate every text of one batch and store the results in the batch.

    Args:
        batch (dict): A batch slice containing lists mapped to column keys.
        column (str): The column key containing the raw text strings.
        column_obfuscated (str): Column key that receives the output when
            `multi` is False. It is not used when `multi` is True.
        multi (bool, optional): If True, each obfuscation result is passed
            through `flatten_dict` and every resulting key becomes its own
            batch column; the column names are taken from the first sample.
            Defaults to True.

    Raises:
        KeyError: If the specified target data column does not exist inside the batch.

    Returns:
        dict: The batch containing the obfuscated results.
    """
    if column not in batch:
        raise KeyError(
            f"Invalid column provided. Please choose one of {list(batch.keys())}"
        )
    texts = batch[column]

    if not multi:
        batch[column_obfuscated] = [self.obfuscate(text) for text in texts]
        return batch

    obfuscation_output = [flatten_dict(self.obfuscate(text)) for text in texts]
    if not obfuscation_output:
        # An empty batch has no first sample to derive column names from,
        # so there is nothing to add.
        return batch

    batch.update(
        {
            key: [sample[key] for sample in obfuscation_output]
            for key in obfuscation_output[0]
        }
    )
    return batch
|
|
157
|
+
|
|
158
|
+
def obfuscate_dataset(
    self,
    dataset: Dataset,
    column: str,
    column_obfuscated: str,
    batch_size: int = 10,
    num_proc: int | None = None,
):
    """Run the active obfuscator over one column of a dataset via `Dataset.map`.

    Batches are handed to `_obfuscate_batch`; cached map results are not
    reused (`load_from_cache_file=False`).

    Args:
        dataset (Dataset): The dataset collection containing columns of data.
        column (str): Key of the column containing raw target text.
        column_obfuscated (str): Target base column key for saving the output.
        batch_size (int, optional): Number of rows per mapped batch. Defaults to 10.
        num_proc (Optional[int], optional): Forwarded to `Dataset.map` as the
            number of processes. Defaults to None.

    Returns:
        Dataset: The mapped dataset containing the obfuscation columns.
    """
    batch_fn = partial(
        self._obfuscate_batch,
        column=column,
        column_obfuscated=column_obfuscated,
    )
    return dataset.map(
        batch_fn,
        batched=True,
        batch_size=batch_size,
        desc="Obfuscating...",
        num_proc=num_proc,
        load_from_cache_file=False,
    )
|
|
193
|
+
|
|
194
|
+
def obfuscate_dataset_by_chunk(
    self,
    dataset_repo: str,
    column: str,
    column_obfuscated: str,
    save_chunks_to_folder: Path,
    dataset_config: str | None = None,
    dataset_split: str = "train",
    chunk_size: int = 5_000,
    batch_size: int = 100,
    num_proc: int | None = None,
    num_samples: int | None = None,
) -> Dataset:
    """Streams a dataset from the Hub in chunks, obfuscates each chunk,
    and saves checkpoints to disk for fault tolerance.

    Each chunk is saved under `save_chunks_to_folder` as
    `obfuscated_ckpt_{start}_{end}`. When such a checkpoint already exists
    it is loaded instead of recomputed, and the matching rows of the stream
    are skipped.

    Args:
        dataset_repo (str): HuggingFace Hub repo ID or local path for load_dataset.
        column (str): Key of the column containing raw target text.
        column_obfuscated (str): Target base column key for saving the output.
        save_chunks_to_folder (Path): Directory to save/load disk checkpoints.
        dataset_config (Optional[str]): Dataset config/subset name passed to load_dataset.
        dataset_split (str): Split to stream (e.g. "train", "validation"). Defaults to "train".
        chunk_size (int): Number of examples per chunk. Defaults to 5_000.
        batch_size (int): Inner batch size passed to .map. Defaults to 100.
        num_proc (Optional[int]): CPU parallelism for .map. Defaults to None.
        num_samples (Optional[int]): Optional cap on total examples to process.

    Returns:
        Dataset: Concatenated dataset of all processed chunks, or an empty
        dataset when the stream yields no rows and no checkpoint exists.
    """
    stream = load_dataset(
        dataset_repo,
        dataset_config,
        split=dataset_split,
        streaming=True,
    )
    if num_samples:
        stream = stream.take(num_samples)

    iterator = iter(stream)
    checkpoint_dir = Path(save_chunks_to_folder)
    processed_chunks = []
    chunk_index = 0

    while True:
        start = chunk_index * chunk_size
        end = start + chunk_size
        ckpt_path = checkpoint_dir / f"obfuscated_ckpt_{start}_{end}"

        if ckpt_path.exists():
            print(f"Loading checkpoint {ckpt_path}")
            chunk = load_from_disk(ckpt_path)
            # Advance the stream past the already-processed examples without
            # keeping the skipped rows in memory.
            for _ in islice(iterator, chunk_size):
                pass
        else:
            rows = list(islice(iterator, chunk_size))
            if not rows:
                break  # Stream exhausted
            print(f"Processing examples {start}:{end}")
            chunk = Dataset.from_list(rows)
            chunk = self.obfuscate_dataset(
                chunk,
                column=column,
                column_obfuscated=column_obfuscated,
                batch_size=batch_size,
                num_proc=num_proc,
            )
            chunk.save_to_disk(ckpt_path)

        processed_chunks.append(chunk)
        chunk_index += 1

    if not processed_chunks:
        # concatenate_datasets rejects an empty list, so return an empty
        # dataset when nothing was streamed.
        return Dataset.from_dict({})
    return concatenate_datasets(processed_chunks)
|
|
268
|
+
|
|
269
|
+
def get_active_obfuscator(self):
    """Return the value currently stored in `active_obfuscator`."""
    current = self.active_obfuscator
    return current
|
|
271
|
+
|
|
272
|
+
def _validate_config(self, algorithm: str, config: dict):
|
|
273
|
+
match algorithm:
|
|
274
|
+
case "pos-filter":
|
|
275
|
+
return POSFilterConfig(**config)
|
|
276
|
+
case "scramble-BoW":
|
|
277
|
+
return LinearScrambleConfig(**config)
|
|
278
|
+
case "scramble-hier":
|
|
279
|
+
return HierarchicalScrambleConfig(**config)
|
|
280
|
+
case "shannon":
|
|
281
|
+
return ShannonFilterConfig(**config)
|
|
282
|
+
case "lemmatize":
|
|
283
|
+
return None
|
|
284
|
+
|
|
285
|
+
def _get_obfuscator(
    self, algorithm: ObfuscationTechnique
) -> Obfuscator | SpaCyObfuscator:
    """Instantiate the obfuscator for `algorithm` and prepare the pipeline for it.

    As a side effect this sets `apply_spacy_preprocessing` and, for some
    algorithms, selects the spaCy pipeline on `spacy_interface`.

    Raises:
        ValueError: If `algorithm` is not one of the known identifiers.
    """
    if algorithm == "pos-filter":
        self.apply_spacy_preprocessing = True
        self.spacy_interface.set_pipeline("pos")
        return POSFilter()

    if algorithm == "scramble-hier":
        self.apply_spacy_preprocessing = True
        self.spacy_interface.set_pipeline("full")
        return HierarchicalScrambleObfuscator()

    if algorithm == "scramble-BoW":
        self.apply_spacy_preprocessing = False
        return LinearScrambleObfuscator()

    if algorithm == "shannon":
        # The Shannon filter is handed the spaCy interface itself, so the
        # pipeline does no spaCy preprocessing of its own.
        self.apply_spacy_preprocessing = False
        self.spacy_interface.set_pipeline("pos")
        return ShannonFilter(
            lang=self.lang,
            spacy_interface=self.spacy_interface,
            prefer_gpu=self.prefer_gpu,
        )

    if algorithm == "lemmatize":
        self.apply_spacy_preprocessing = True
        return LemmaObfuscator()

    raise ValueError(
        f"Input {algorithm} invalid. Please provide a valid obfuscation algorithm."
    )
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
from tmallet.obfuscators.morph.lemmatizer import LemmaObfuscator
|
|
2
|
+
from tmallet.obfuscators.pos.config import POSFilterConfig
|
|
3
|
+
from tmallet.obfuscators.pos.pos_filter import POSFilter
|
|
4
|
+
from tmallet.obfuscators.shannon.config import ShannonFilterConfig
|
|
5
|
+
from tmallet.obfuscators.shannon.shannon_bert import ShannonBERT
|
|
6
|
+
from tmallet.obfuscators.shannon.shannon_filter import ShannonFilter
|
|
7
|
+
from tmallet.obfuscators.structural.config import (
|
|
8
|
+
HierarchicalScrambleConfig,
|
|
9
|
+
LinearScrambleConfig,
|
|
10
|
+
)
|
|
11
|
+
from tmallet.obfuscators.structural.scramble_hier import HierarchicalScrambleObfuscator
|
|
12
|
+
from tmallet.obfuscators.structural.scramble_linear import LinearScrambleObfuscator
|
|
13
|
+
|
|
14
|
+
__all__ = [
|
|
15
|
+
"POSFilter",
|
|
16
|
+
"POSFilterConfig",
|
|
17
|
+
"ShannonFilter",
|
|
18
|
+
"ShannonFilterConfig",
|
|
19
|
+
"ShannonBERT",
|
|
20
|
+
"LinearScrambleObfuscator",
|
|
21
|
+
"LinearScrambleConfig",
|
|
22
|
+
"HierarchicalScrambleObfuscator",
|
|
23
|
+
"HierarchicalScrambleConfig",
|
|
24
|
+
"LemmaObfuscator",
|
|
25
|
+
]
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
from spacy.tokens import Doc
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class SpaCyObfuscator:
    """Base interface for obfuscators whose `obfuscate` takes a spaCy `Doc`.

    Both methods raise NotImplementedError until a subclass overrides them.
    """

    def set_config(self, config):
        """Store the algorithm configuration; must be overridden."""
        message = "Implement set_config for setting config details."
        raise NotImplementedError(message)

    def obfuscate(self, doc: Doc) -> dict | str:
        """Obfuscate a processed document; must be overridden."""
        message = "SpaCy obfuscator is not implemented."
        raise NotImplementedError(message)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class Obfuscator:
|
|
13
|
+
def set_config(self, config):
|
|
14
|
+
raise NotImplementedError("Implement set_config for setting config details.")
|
|
15
|
+
|
|
16
|
+
def obfuscate(self, text: str) -> dict | str:
|
|
17
|
+
raise NotImplementedError("General-purpose obfuscator is not implemented.")
|
|
File without changes
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
from typing import Dict
|
|
2
|
+
|
|
3
|
+
from spacy.tokens import Doc
|
|
4
|
+
|
|
5
|
+
from tmallet.obfuscators.base import SpaCyObfuscator
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class LemmaObfuscator(SpaCyObfuscator):
    """Obfuscator that replaces every token of a document with its lemma."""

    def obfuscate(self, doc: Doc, config: Dict | None = None) -> str:
        """Return the lemmatised text of `doc`.

        Args:
            doc (Doc): The processed spaCy document.
            config (Dict | None, optional): Accepted for interface
                compatibility; it is not used. Defaults to None rather than
                a shared mutable dict.

        Returns:
            str: The lemmatised text.
        """
        return self._lemmatise(doc)

    def set_config(self, config: dict):
        """Accept a configuration and ignore it; lemmatisation has no options."""

    def _lemmatise(self, doc: Doc) -> str:
        """Join each token's lemma with the token's original trailing whitespace."""
        return "".join(token.lemma_ + token.whitespace_ for token in doc)
|
|
File without changes
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
from enum import Enum
|
|
2
|
+
from typing import List
|
|
3
|
+
|
|
4
|
+
from pydantic import BaseModel
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class FilterType(str, Enum):
    """The two filter modes, "retain" and "remove", as string-valued members."""

    Retain = "retain"
    Remove = "remove"
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class POSTag(str, Enum):
    """Part-of-speech tag labels; each member's value equals its name."""

    ADJ = "ADJ"
    ADP = "ADP"
    ADV = "ADV"
    AUX = "AUX"
    CCONJ = "CCONJ"
    DET = "DET"
    INTJ = "INTJ"
    NOUN = "NOUN"
    NUM = "NUM"
    PART = "PART"
    PRON = "PRON"
    PROPN = "PROPN"
    PUNCT = "PUNCT"
    SCONJ = "SCONJ"
    SYM = "SYM"
    VERB = "VERB"
    X = "X"
    SPACE = "SPACE"
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class POSFilterConfig(BaseModel):
    """Validated settings for the POS filter.

    By default, tokens tagged NOUN or PROPN are retained and the "default"
    replacement mechanism is used.
    """

    # Either a single mode or a list of modes.
    filter_type: FilterType | list[FilterType] = FilterType.Retain
    # Tags the filter acts on.
    pos_tags: list[POSTag] = [POSTag.NOUN, POSTag.PROPN]
    # Either a single mechanism name or a list of names.
    replacement_mechanism: str | list[str] = "default"
|