fable-client 0.4.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fable_client-0.4.1/LICENSE +21 -0
- fable_client-0.4.1/PKG-INFO +489 -0
- fable_client-0.4.1/README.md +454 -0
- fable_client-0.4.1/fable_client/__init__.py +5 -0
- fable_client-0.4.1/fable_client/_cli.py +439 -0
- fable_client-0.4.1/fable_client/_client.py +91 -0
- fable_client-0.4.1/fable_client/_estimate.py +98 -0
- fable_client-0.4.1/fable_client/_model.py +20 -0
- fable_client-0.4.1/fable_client/main.py +9 -0
- fable_client-0.4.1/fable_client/types.py +4 -0
- fable_client-0.4.1/pyproject.toml +73 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 University Medical Center Leipzig, Dept. Medical Data Science
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,489 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: fable-client
|
|
3
|
+
Version: 0.4.1
|
|
4
|
+
Summary: HTTP-based client for interacting with the FABLE service for privacy-preserving record linkage with Bloom filters.
|
|
5
|
+
License-Expression: MIT
|
|
6
|
+
License-File: LICENSE
|
|
7
|
+
Keywords: record linkage,privacy,bloom filter,bitarray,cryptography,service,client,cli
|
|
8
|
+
Author: Maximilian Jugl
|
|
9
|
+
Requires-Python: >=3.10,<4
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Environment :: Console
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Intended Audience :: Education
|
|
14
|
+
Classifier: Intended Audience :: End Users/Desktop
|
|
15
|
+
Classifier: Intended Audience :: Information Technology
|
|
16
|
+
Classifier: Intended Audience :: Science/Research
|
|
17
|
+
Classifier: Topic :: Scientific/Engineering
|
|
18
|
+
Classifier: Topic :: Security :: Cryptography
|
|
19
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
20
|
+
Classifier: Programming Language :: Python :: 3
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
23
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
24
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
25
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
26
|
+
Provides-Extra: faker
|
|
27
|
+
Requires-Dist: click (>=8.0.0,<9.0.0)
|
|
28
|
+
Requires-Dist: fable-core (>=0.1.5,<0.2.0)
|
|
29
|
+
Requires-Dist: fable-model (>=0.1.7,<0.2.0)
|
|
30
|
+
Requires-Dist: faker (>=26.0.0) ; extra == "faker"
|
|
31
|
+
Requires-Dist: httpx (>=0.28.0,<0.29.0)
|
|
32
|
+
Project-URL: Repository, https://github.com/ul-mds/fable-client
|
|
33
|
+
Description-Content-Type: text/markdown
|
|
34
|
+
|
|
35
|
+
[](https://pypi.org/project/fable-client/)
|
|
36
|
+
[](https://pypi.org/project/fable-client/)
|
|
37
|
+

|
|
38
|
+
[](https://pypi.org/project/fable-client/)
|
|
39
|
+
|
|
40
|
+
# FABLE Client
|
|
41
|
+
|
|
42
|
+
This package contains an HTTP-based client for working with the server provided by
|
|
43
|
+
the [PPRL service](https://github.com/ul-mds/fable-pprl-service) which is part of the FABLE
|
|
44
|
+
(**F**ederated **A**nonymized **B**loom filter **L**inkage **E**ngine) ecosystem.
|
|
45
|
+
It also contains a command-line application which uses the library to process CSV files.
|
|
46
|
+
|
|
47
|
+
## Installation
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
pip install fable-client
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
Weight estimation requires additional packages which are not shipped by default.
|
|
54
|
+
To add them, install this package using the following command.
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
pip install fable-client[faker]
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
## Library methods
|
|
61
|
+
|
|
62
|
+
The library exposes functions for entity pre-processing, masking and bit vector matching.
|
|
63
|
+
They follow the data model that is also used by the FABLE PPRL service, which is exposed through
|
|
64
|
+
the [FABLE model package](https://github.com/ul-mds/fable-model).
|
|
65
|
+
|
|
66
|
+
### Entity transformation
|
|
67
|
+
|
|
68
|
+
```python
|
|
69
|
+
import fable_client
|
|
70
|
+
from fable_model import (
|
|
71
|
+
EntityTransformRequest,
|
|
72
|
+
TransformConfig,
|
|
73
|
+
EmptyValueHandling,
|
|
74
|
+
AttributeValueEntity,
|
|
75
|
+
GlobalTransformerConfig,
|
|
76
|
+
NormalizationTransformer,
|
|
77
|
+
)
|
|
78
|
+
|
|
79
|
+
client = fable_client.FableClient(base_url="http://localhost:8080")
|
|
80
|
+
|
|
81
|
+
response = client.transform(
|
|
82
|
+
EntityTransformRequest(
|
|
83
|
+
config=TransformConfig(empty_value=EmptyValueHandling.error),
|
|
84
|
+
entities=[AttributeValueEntity(id="001", attributes={"first_name": "Müller", "last_name": "Ludenscheidt"})],
|
|
85
|
+
global_transformers=GlobalTransformerConfig(before=[NormalizationTransformer()]),
|
|
86
|
+
)
|
|
87
|
+
)
|
|
88
|
+
|
|
89
|
+
print(response.entities)
|
|
90
|
+
# => [AttributeValueEntity(id='001', attributes={'first_name': 'muller', 'last_name': 'ludenscheidt'})]
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
### Entity masking
|
|
94
|
+
|
|
95
|
+
```python
|
|
96
|
+
import fable_client
|
|
97
|
+
from fable_model import (
|
|
98
|
+
EntityMaskRequest,
|
|
99
|
+
MaskConfig,
|
|
100
|
+
HashConfig,
|
|
101
|
+
HashFunction,
|
|
102
|
+
HashAlgorithm,
|
|
103
|
+
RandomHash,
|
|
104
|
+
CLKFilter,
|
|
105
|
+
AttributeValueEntity,
|
|
106
|
+
)
|
|
107
|
+
|
|
108
|
+
client = fable_client.FableClient(base_url="http://localhost:8080")
|
|
109
|
+
|
|
110
|
+
response = client.mask(
|
|
111
|
+
EntityMaskRequest(
|
|
112
|
+
config=MaskConfig(
|
|
113
|
+
token_size=2,
|
|
114
|
+
hash=HashConfig(
|
|
115
|
+
function=HashFunction(algorithms=[HashAlgorithm.sha1], key="s3cr3t_k3y"), strategy=RandomHash()
|
|
116
|
+
),
|
|
117
|
+
filter=CLKFilter(hash_values=5, filter_size=256),
|
|
118
|
+
),
|
|
119
|
+
entities=[AttributeValueEntity(id="001", attributes={"first_name": "muller", "last_name": "ludenscheidt"})],
|
|
120
|
+
)
|
|
121
|
+
)
|
|
122
|
+
|
|
123
|
+
print(response.entities)
|
|
124
|
+
# => [BitVectorEntity(id='001', value='SKkgqBHBCJJCANICEKSpWMAUBYCQEMLuZgEQGBKRC8A=')]
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
### Bit vector matching
|
|
128
|
+
|
|
129
|
+
```python
|
|
130
|
+
import fable_client
|
|
131
|
+
from fable_model import VectorMatchRequest, MatchConfig, SimilarityMeasure, BitVectorEntity
|
|
132
|
+
|
|
133
|
+
client = fable_client.FableClient(base_url="http://localhost:8080")
|
|
134
|
+
|
|
135
|
+
response = client.match(
|
|
136
|
+
VectorMatchRequest(
|
|
137
|
+
config=MatchConfig(measure=SimilarityMeasure.jaccard, threshold=0.8),
|
|
138
|
+
domain=[BitVectorEntity(id="001", value="SKkgqBHBCJJCANICEKSpWMAUBYCQEMLuZgEQGBKRC8A=")],
|
|
139
|
+
range=[
|
|
140
|
+
BitVectorEntity(id="100", value="UKkgqBHBDJJCANICELSpWMAUBYCMEMLrZgEQGBKRC7A="),
|
|
141
|
+
BitVectorEntity(id="101", value="H5DN45iUeEjrjbHZrzHb3AyQk9O4IgxcpENKKzEKRLE="),
|
|
142
|
+
],
|
|
143
|
+
)
|
|
144
|
+
)
|
|
145
|
+
|
|
146
|
+
print(response.matches)
|
|
147
|
+
# => [Match(domain=BitVectorEntity(id='001', value='SKkgqBHBCJJCANICEKSpWMAUBYCQEMLuZgEQGBKRC8A='), range=BitVectorEntity(id='100', value='UKkgqBHBDJJCANICELSpWMAUBYCMEMLrZgEQGBKRC7A='), similarity=0.8536585365853658)]
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
### Attribute weight estimation
|
|
151
|
+
|
|
152
|
+
```python
|
|
153
|
+
import fable_client
|
|
154
|
+
from fable_model import (
|
|
155
|
+
AttributeValueEntity,
|
|
156
|
+
BaseTransformRequest,
|
|
157
|
+
TransformConfig,
|
|
158
|
+
EmptyValueHandling,
|
|
159
|
+
GlobalTransformerConfig,
|
|
160
|
+
NormalizationTransformer,
|
|
161
|
+
)
|
|
162
|
+
|
|
163
|
+
client = fable_client.FableClient(base_url="http://localhost:8080")
|
|
164
|
+
|
|
165
|
+
stats = fable_client.estimate.compute_attribute_stats(
|
|
166
|
+
client,
|
|
167
|
+
[
|
|
168
|
+
AttributeValueEntity(id="001", attributes={"given_name": "Max", "last_name": "Mustermann", "gender": "m"}),
|
|
169
|
+
AttributeValueEntity(id="002", attributes={"given_name": "Maria", "last_name": "Musterfrau", "gender": "f"}),
|
|
170
|
+
],
|
|
171
|
+
BaseTransformRequest(
|
|
172
|
+
config=TransformConfig(empty_value=EmptyValueHandling.skip),
|
|
173
|
+
global_transformers=GlobalTransformerConfig(before=[NormalizationTransformer()]),
|
|
174
|
+
),
|
|
175
|
+
)
|
|
176
|
+
|
|
177
|
+
print(stats)
|
|
178
|
+
# => {'given_name': {'average_tokens': 5.0, 'ngram_entropy': 2.9219280948873623}, 'last_name': {'average_tokens': 11.0, 'ngram_entropy': 3.913977073182751}, 'gender': {'average_tokens': 2.0, 'ngram_entropy': 2.0}}
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
## Command line interface
|
|
182
|
+
|
|
183
|
+
The `fable` command exposes all the library's functions and adapts them to work with CSV files.
|
|
184
|
+
Running `fable --help` provides an overview of the command options.
|
|
185
|
+
|
|
186
|
+
```
|
|
187
|
+
$ fable --help
|
|
188
|
+
Usage: fable [OPTIONS] COMMAND [ARGS]...
|
|
189
|
+
|
|
190
|
+
HTTP client for performing PPRL based on Bloom filters.
|
|
191
|
+
|
|
192
|
+
Options:
|
|
193
|
+
--base-url TEXT base URL to HTTP-based PPRL service
|
|
194
|
+
-b, --batch-size INTEGER RANGE amount of bit vectors to match at a time [x>=1]
|
|
195
|
+
--timeout-secs INTEGER RANGE seconds until a request times out [x>=1]
|
|
196
|
+
--delimiter TEXT column delimiter for CSV files
|
|
197
|
+
--encoding TEXT character encoding for files
|
|
198
|
+
--help Show this message and exit.
|
|
199
|
+
|
|
200
|
+
Commands:
|
|
201
|
+
estimate Estimate attribute weights based on randomly generated data.
|
|
202
|
+
mask Mask a CSV file with entities.
|
|
203
|
+
match Match bit vectors from CSV files against each other.
|
|
204
|
+
transform Perform pre-processing on a CSV file with entities
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
The `fable` command works on two basic types of CSV files that follow a simple structure.
|
|
208
|
+
Entity files are CSV files that contain a column with a unique identifier and arbitrary additional columns which
|
|
209
|
+
contain values for certain attributes that identify an entity.
|
|
210
|
+
Each row is representative of a single entity.
|
|
211
|
+
|
|
212
|
+
```csv
|
|
213
|
+
id,first_name,last_name,date_of_birth,gender
|
|
214
|
+
001,Natalie,Sampson,1956-12-16,female
|
|
215
|
+
002,Eric,Lynch,1910-01-11,female
|
|
216
|
+
003,Pam,Vaughn,1983-10-05,male
|
|
217
|
+
004,David,Jackson,2006-01-27,male
|
|
218
|
+
005,Rachel,Dyer,1904-02-02,female
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
Bit vector files contain an ID column and a value column which contains a representative bit vector.
|
|
222
|
+
These bit vectors are generally generated by masking a record from an entity file.
|
|
223
|
+
|
|
224
|
+
```csv
|
|
225
|
+
id,value
|
|
226
|
+
001,0Dr8t+kE5ltI+xdM85fwx0QLrTIgvFN35/0YvODNdOE0AaUHPphikXYy4LlArE4UqfjPs+wKtT233R7lBzSp5mwkCjTzA1tl0N7s+sFeKyIrOiGk0gNIYvA=
|
|
227
|
+
002,QMEIkE9TN1Quv0K0QAIk1RZD3qF7nQh0IyOYqVDf8IQkyaLGcFjiLHsEgBpU8CRSCuATbWpjEwGi3dilizySQy4miGiJolilYmwKysjseq+IFsAU3T1IRjA=
|
|
228
|
+
003,BqFoNZhrAVBq9SV1wBK0dUZLHDM9hCBoO4XdKCzvasSUELQeAB8+DV5tAhDl5KCSJfDCB6JG4WSoCFbozXqBYSUMqEQJE0JwhpRK6oLOcRRoGwGESDBMZwA=
|
|
229
|
+
004,8C9KItMTwtz4oXQvo8G0t1bTnwspnghmJwyqqcL2RIHASb4XJHAqybMCXQBm5mq6h/kdxGbblxBjhy79jRUcI60haqZhNsst0n7OUAxM/UoZVumIilRIbCA=
|
|
230
|
+
005,CFk4I0sKwnRoiTEOQASy1QZfHCGB1GBgYQDcZwDDtIkGGLOmLRhrQyOSlQDUDoYTbvaBRVqbkRnqmYQbDTEGlG+2y60FMmBEKtxsr0I4I00oMpuoXAsDWmA=
|
|
231
|
+
```
|
|
232
|
+
|
|
233
|
+
Pre-processing is done with the `fable transform` command.
|
|
234
|
+
It requires a base transform request file, an entity file and an output file to write the pre-processed entities to.
|
|
235
|
+
Attribute and global transformer configurations can be provided, but at least one must be specified.
|
|
236
|
+
|
|
237
|
+
In this example, a global normalization transformer which is executed before all other attribute-specific transformers
|
|
238
|
+
is defined.
|
|
239
|
+
Date time reformatting is applied to the "date of birth" column in the input file.
|
|
240
|
+
|
|
241
|
+
_request.json_
|
|
242
|
+
|
|
243
|
+
```json
|
|
244
|
+
{
|
|
245
|
+
"config": {
|
|
246
|
+
"empty_value": "skip"
|
|
247
|
+
},
|
|
248
|
+
"attribute_transformers": [
|
|
249
|
+
{
|
|
250
|
+
"attribute_name": "date_of_birth",
|
|
251
|
+
"transformers": [
|
|
252
|
+
{
|
|
253
|
+
"name": "date_time",
|
|
254
|
+
"input_format": "%Y-%m-%d",
|
|
255
|
+
"output_format": "%Y%m%d"
|
|
256
|
+
}
|
|
257
|
+
]
|
|
258
|
+
}
|
|
259
|
+
],
|
|
260
|
+
"global_transformers": {
|
|
261
|
+
"before": [
|
|
262
|
+
{
|
|
263
|
+
"name": "normalization"
|
|
264
|
+
}
|
|
265
|
+
]
|
|
266
|
+
}
|
|
267
|
+
}
|
|
268
|
+
```
|
|
269
|
+
|
|
270
|
+
```
|
|
271
|
+
$ fable transform ./request.json ./input.csv ./output.csv
|
|
272
|
+
Transforming entities [####################################] 100%
|
|
273
|
+
```
|
|
274
|
+
|
|
275
|
+
_output.csv_
|
|
276
|
+
|
|
277
|
+
```csv
|
|
278
|
+
id,first_name,last_name,date_of_birth,gender
|
|
279
|
+
001,natalie,sampson,19561216,female
|
|
280
|
+
002,eric,lynch,19100111,female
|
|
281
|
+
003,pam,vaughn,19831005,male
|
|
282
|
+
004,david,jackson,20060127,male
|
|
283
|
+
005,rachel,dyer,19040202,female
|
|
284
|
+
```
|
|
285
|
+
|
|
286
|
+
Masking is done with `fable mask` and its subcommands.
|
|
287
|
+
It requires a base mask request file, an entity file and an output file to write the masked entities to.
|
|
288
|
+
|
|
289
|
+
_request.json_
|
|
290
|
+
|
|
291
|
+
```json
|
|
292
|
+
{
|
|
293
|
+
"config": {
|
|
294
|
+
"token_size": 2,
|
|
295
|
+
"hash": {
|
|
296
|
+
"function": {
|
|
297
|
+
"algorithms": ["sha256"],
|
|
298
|
+
"key": "s3cr3t_k3y",
|
|
299
|
+
"strategy": {
|
|
300
|
+
"name": "random_hash"
|
|
301
|
+
}
|
|
302
|
+
}
|
|
303
|
+
},
|
|
304
|
+
"prepend_attribute_name": true,
|
|
305
|
+
"filter": {
|
|
306
|
+
"type": "clk",
|
|
307
|
+
"filter_size": 512,
|
|
308
|
+
"hash_values": 5,
|
|
309
|
+
"padding": "_",
|
|
310
|
+
"hardeners": [
|
|
311
|
+
{
|
|
312
|
+
"name": "permute",
|
|
313
|
+
"seed": 727
|
|
314
|
+
},
|
|
315
|
+
{
|
|
316
|
+
"name": "rehash",
|
|
317
|
+
"window_size": 16,
|
|
318
|
+
"window_step": 8,
|
|
319
|
+
"samples": 2
|
|
320
|
+
}
|
|
321
|
+
]
|
|
322
|
+
}
|
|
323
|
+
}
|
|
324
|
+
}
|
|
325
|
+
```
|
|
326
|
+
|
|
327
|
+
_input.csv_
|
|
328
|
+
|
|
329
|
+
```csv
|
|
330
|
+
id,first_name,last_name,date_of_birth,gender
|
|
331
|
+
001,natalie,sampson,19561216,female
|
|
332
|
+
002,eric,lynch,19100111,female
|
|
333
|
+
003,pam,vaughn,19831005,male
|
|
334
|
+
004,david,jackson,20060127,male
|
|
335
|
+
005,rachel,dyer,19040202,female
|
|
336
|
+
```
|
|
337
|
+
|
|
338
|
+
```
|
|
339
|
+
$ fable mask ./request.json ./input.csv ./output.csv
|
|
340
|
+
Masking entities [####################################] 100%
|
|
341
|
+
```
|
|
342
|
+
|
|
343
|
+
_output.csv_
|
|
344
|
+
|
|
345
|
+
```csv
|
|
346
|
+
id,value
|
|
347
|
+
001,wAWgITvQ1/VACpRYC2EKrfCkWziyEhmyKwi5sMsFrAQVoIBygTQScPRoIIAto0AwS0ihlcAIFAcQRwccY5IOmQ==
|
|
348
|
+
002,cFCwQIABQ+TgSSdlGM/z54BEUgmYhA1GKtCxQAKAXFIWiPAFIQYaFArgM61pUAAeATwBlBEOEw4Oowe0rbcMGw==
|
|
349
|
+
003,IgK16AAISCRoCuVAb1UBZYBBhGgxSEkKeMkTUCKAx4IAsNGJBS4ShgBAGIapBIQWJLiBFEEKAIWAGYS8ZZGMKw==
|
|
350
|
+
004,ZlBkyoYIEWmeaxbPDNng5JjHACkCAJwjlBCJQBJ4ZBSyOAukACUahOAFQ20oNwTQEDRA005+VUUfsUQcKCGNxg==
|
|
351
|
+
005,cUekQFQkI7TpTcRwmcNDoodRRBshlSEiAUjBQiMlxBLTmODMJICmDmxgUqYKonQEMFD58QsogRQFIgYUwJDOHA==
|
|
352
|
+
```
|
|
353
|
+
|
|
354
|
+
Matching is done with the `fable match` command.
|
|
355
|
+
It allows the matching of multiple bit vector input files at once.
|
|
356
|
+
If more than two files are provided, the command will pick out pairs of files and match their contents against one
|
|
357
|
+
another.
|
|
358
|
+
|
|
359
|
+
In this example, the bit vectors of two files are matched against each other.
|
|
360
|
+
The Jaccard index is used as a similarity measure and a match threshold of 70% is applied.
|
|
361
|
+
|
|
362
|
+
_request.json_
|
|
363
|
+
|
|
364
|
+
```json
|
|
365
|
+
{
|
|
366
|
+
"config": {
|
|
367
|
+
"measure": "jaccard",
|
|
368
|
+
"threshold": 0.7
|
|
369
|
+
}
|
|
370
|
+
}
|
|
371
|
+
```
|
|
372
|
+
|
|
373
|
+
_domain.csv_
|
|
374
|
+
|
|
375
|
+
```csv
|
|
376
|
+
id,value
|
|
377
|
+
001,wAWgITvQ1/VACpRYC2EKrfCkWziyEhmyKwi5sMsFrAQVoIBygTQScPRoIIAto0AwS0ihlcAIFAcQRwccY5IOmQ==
|
|
378
|
+
002,cFCwQIABQ+TgSSdlGM/z54BEUgmYhA1GKtCxQAKAXFIWiPAFIQYaFArgM61pUAAeATwBlBEOEw4Oowe0rbcMGw==
|
|
379
|
+
003,IgK16AAISCRoCuVAb1UBZYBBhGgxSEkKeMkTUCKAx4IAsNGJBS4ShgBAGIapBIQWJLiBFEEKAIWAGYS8ZZGMKw==
|
|
380
|
+
004,ZlBkyoYIEWmeaxbPDNng5JjHACkCAJwjlBCJQBJ4ZBSyOAukACUahOAFQ20oNwTQEDRA005+VUUfsUQcKCGNxg==
|
|
381
|
+
005,cUekQFQkI7TpTcRwmcNDoodRRBshlSEiAUjBQiMlxBLTmODMJICmDmxgUqYKonQEMFD58QsogRQFIgYUwJDOHA==
|
|
382
|
+
```
|
|
383
|
+
|
|
384
|
+
_range.csv_
|
|
385
|
+
|
|
386
|
+
```csv
|
|
387
|
+
id,value
|
|
388
|
+
101,kUSyxIgtIDSAB7ZYDkFQRZpFoMkCjCCCbDTWAUJTRAAEBpspBX4PNUZKi1AIVCABAjg6EAoKuwVleeUYgRBYoQ==
|
|
389
|
+
102,IAA0YE4MGexIiYdEjwNzoOKmIA4CEHEiKQASYFPhxQTQlPAAgYW3AWBYmQJ8YMoaAj0ZkoOrFyUmFo52TDcIKw==
|
|
390
|
+
103,BFAwREkkQbTdzddgDHFWgMRJMyxAMW+jq2ASICMBtIEr+YDCBRUgxEDIsQpciO4mAK3h2cIbXFQCMlaVpJPZIQ==
|
|
391
|
+
104,wBWgITvQ2/VACpRYC2EKrfCkWxiyEhmyKwi5sMsFrBQVoIBygTQScPRoIIAto0AwS0ihldAIFAcQRwccY5IOmQ==
|
|
392
|
+
105,QCCwIKQAED5AjaZYmodDcZAEBKkIxgAiDfEUoDKEdgEAEJAMAwcfQEbQkaQ4ANAABqiUscAKPQZEMJxRhTGIGQ==
|
|
393
|
+
```
|
|
394
|
+
|
|
395
|
+
```
|
|
396
|
+
$ fable match request.json domain.csv range.csv output.csv
|
|
397
|
+
Matching bit vectors from domain.csv and range.csv [####################################] 100%
|
|
398
|
+
```
|
|
399
|
+
|
|
400
|
+
_output.csv_
|
|
401
|
+
|
|
402
|
+
```csv
|
|
403
|
+
domain_id,domain_file,range_id,range_file,similarity
|
|
404
|
+
001,domain.csv,104,range.csv,0.9690721649484536
|
|
405
|
+
```
|
|
406
|
+
|
|
407
|
+
Weight estimation is done with the `fable estimate` command.
|
|
408
|
+
It generates random data based on a user-provided specification and computes estimates for attribute weights.
|
|
409
|
+
Data can be generated using [Faker](https://faker.readthedocs.io/).
|
|
410
|
+
|
|
411
|
+
*faker.json*
|
|
412
|
+
|
|
413
|
+
```json
|
|
414
|
+
{
|
|
415
|
+
"seed": 727,
|
|
416
|
+
"count": 5000,
|
|
417
|
+
"locale": ["de_DE"],
|
|
418
|
+
"generators": [
|
|
419
|
+
{"function_name": "first_name_nonbinary", "attribute_name": "given_name"},
|
|
420
|
+
{"function_name": "last_name", "attribute_name": "last_name"},
|
|
421
|
+
{"function_name": "random_element", "attribute_name": "gender", "args": {"elements": ["m", "f"]}},
|
|
422
|
+
{"function_name": "street_name", "attribute_name": "street_name"},
|
|
423
|
+
{"function_name": "city", "attribute_name": "municipality"},
|
|
424
|
+
{"function_name": "postcode", "attribute_name": "postcode"}
|
|
425
|
+
]
|
|
426
|
+
}
|
|
427
|
+
```
|
|
428
|
+
|
|
429
|
+
```
|
|
430
|
+
$ fable estimate faker faker.json faker-output.json
|
|
431
|
+
```
|
|
432
|
+
|
|
433
|
+
*faker-output.json*
|
|
434
|
+
|
|
435
|
+
```json
|
|
436
|
+
[
|
|
437
|
+
{
|
|
438
|
+
"attribute_name": "given_name",
|
|
439
|
+
"weight": 7.657958943890718,
|
|
440
|
+
"average_token_count": 7.5686
|
|
441
|
+
},
|
|
442
|
+
{
|
|
443
|
+
"attribute_name": "last_name",
|
|
444
|
+
"weight": 7.444573503220938,
|
|
445
|
+
"average_token_count": 7.5204
|
|
446
|
+
},
|
|
447
|
+
{
|
|
448
|
+
"attribute_name": "gender",
|
|
449
|
+
"weight": 1.9999971146079947,
|
|
450
|
+
"average_token_count": 2.0
|
|
451
|
+
},
|
|
452
|
+
{
|
|
453
|
+
"attribute_name": "street_name",
|
|
454
|
+
"weight": 7.605565770282046,
|
|
455
|
+
"average_token_count": 16.2188
|
|
456
|
+
},
|
|
457
|
+
{
|
|
458
|
+
"attribute_name": "municipality",
|
|
459
|
+
"weight": 7.659422921807241,
|
|
460
|
+
"average_token_count": 9.952
|
|
461
|
+
},
|
|
462
|
+
{
|
|
463
|
+
"attribute_name": "postcode",
|
|
464
|
+
"weight": 6.7812429085107,
|
|
465
|
+
"average_token_count": 5.9464
|
|
466
|
+
}
|
|
467
|
+
]
|
|
468
|
+
```
|
|
469
|
+
|
|
470
|
+
## Configuring pytest
|
|
471
|
+
|
|
472
|
+
In order to run integration tests, the FABLE PPRL service is needed.
|
|
473
|
+
The first option is to spin up the service independently and direct pytest to it.
|
|
474
|
+
Alternatively, pytest can start a Docker test container for the duration of the test run.
|
|
475
|
+
The following table shows all available configuration options.
|
|
476
|
+
These variables can be defined in `.env` or `.env.test`.
|
|
477
|
+
|
|
478
|
+
| **Environment variable** | **Description** | **Default** |
|
|
479
|
+
|-----------------------------------|-----------------------------------------------------------------------------|-------------|
|
|
480
|
+
| PYTEST_PPRL_BASE_URL<sup>1)</sup> | Base URL for the FABLE PPRL service | |
|
|
481
|
+
| PYTEST_PPRL_SERVICE_VERSION | Tag of the FABLE PPRL service image that will run inside the test container | latest |
|
|
482
|
+
| PYTEST_PRRL_SERVICE_PORT | Port that will be exposed by the test container | 8080 |
|
|
483
|
+
|
|
484
|
+
<sup>1)</sup> If defined, pytest will not spin up a test container.
|
|
485
|
+
|
|
486
|
+
## License
|
|
487
|
+
|
|
488
|
+
MIT.
|
|
489
|
+
|