fable-client 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fable_client/__init__.py +5 -0
- fable_client/_cli.py +439 -0
- fable_client/_client.py +91 -0
- fable_client/_estimate.py +98 -0
- fable_client/_model.py +20 -0
- fable_client/main.py +9 -0
- fable_client/types.py +4 -0
- fable_client-0.4.1.dist-info/METADATA +489 -0
- fable_client-0.4.1.dist-info/RECORD +12 -0
- fable_client-0.4.1.dist-info/WHEEL +4 -0
- fable_client-0.4.1.dist-info/entry_points.txt +3 -0
- fable_client-0.4.1.dist-info/licenses/LICENSE +21 -0
fable_client/__init__.py
ADDED
fable_client/_cli.py
ADDED
|
@@ -0,0 +1,439 @@
|
|
|
1
|
+
import contextlib
|
|
2
|
+
import csv
|
|
3
|
+
import itertools
|
|
4
|
+
import json
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Any, TypeVar
|
|
7
|
+
|
|
8
|
+
import click
|
|
9
|
+
import httpx
|
|
10
|
+
from fable_model import (
|
|
11
|
+
BitVectorEntity,
|
|
12
|
+
BaseMatchRequest,
|
|
13
|
+
MatchMethod,
|
|
14
|
+
BaseTransformRequest,
|
|
15
|
+
AttributeValueEntity,
|
|
16
|
+
BaseMaskRequest,
|
|
17
|
+
TransformConfig,
|
|
18
|
+
EmptyValueHandling,
|
|
19
|
+
GlobalTransformerConfig,
|
|
20
|
+
NormalizationTransformer,
|
|
21
|
+
WeightedAttributeConfig,
|
|
22
|
+
)
|
|
23
|
+
from pydantic import BaseModel
|
|
24
|
+
|
|
25
|
+
from ._client import FableClient
|
|
26
|
+
from ._estimate import compute_attribute_stats
|
|
27
|
+
from ._model import FakerGeneratorConfig, FakerGeneratorSpec
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def create_client(ctx: click.Context) -> FableClient:
    """Build a FableClient from the base URL and timeout stored on the click context."""
    settings = ctx.obj
    http_client = httpx.Client(
        base_url=settings["BASE_URL"],
        # the timeout option is stored as given by click; httpx receives it as an int
        timeout=int(settings["TIMEOUT_SECS"]),
    )

    return FableClient(client=http_client)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def read_bit_vector_entity_file(reader: csv.DictReader, id_column: str, value_column: str):
    """
    Turn every row of a CSV reader into a bit vector entity.

    Args:
        reader: CSV dict reader to consume
        id_column: name of the column holding the vector ID
        value_column: name of the column holding the vector value

    Returns:
        list with one bit vector entity per row, in the order the rows were read
    """
    vectors = []

    for record in reader:
        vectors.append(BitVectorEntity(id=record[id_column], value=record[value_column]))

    return vectors
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def read_attribute_value_entity_file(reader: csv.DictReader, id_column: str):
    """
    Read a CSV file containing attribute value entities.

    Every column except the ID column becomes an attribute of the entity.
    IDs and attribute values are converted to strings.

    Args:
        reader: CSV dict reader instance
        id_column: name of ID column

    Returns:
        tuple of the CSV column names and the list of entities, one per row

    Raises:
        click.exceptions.Exit: if the ID column is not part of the CSV header
    """
    # DictReader.fieldnames is None when the file has no header row (e.g. an empty file);
    # fall back to "no columns" so the user sees the regular error message below
    # instead of a TypeError from list(None)
    field_names: list[str] = list(reader.fieldnames or [])

    if id_column not in field_names:
        click.echo(f"Column {id_column} not found in CSV file", err=True)
        raise click.exceptions.Exit(1)

    def _row_to_entity(row: dict[str, Any]):
        return AttributeValueEntity(
            id=str(row[id_column]),
            attributes={
                attribute_name: str(attribute_value)
                for attribute_name, attribute_value in row.items()
                if attribute_name != id_column
            },
        )

    entities = [_row_to_entity(row) for row in reader]

    return field_names, entities
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
_M = TypeVar("_M", bound=BaseModel)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def parse_json_file_into(ctx: click.Context, path: str | Path, model: type[_M]) -> _M:
    """Load the JSON document at ``path`` and pass its top-level keys to ``model`` as keyword arguments."""
    file_encoding = ctx.obj["ENCODING"]

    with open(path, mode="r", encoding=file_encoding) as json_file:
        payload = json.load(json_file)

    return model(**payload)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
@contextlib.contextmanager
def read_csv_file(ctx: click.Context, path: str | Path, mode: str = "r"):
    """
    Open ``path`` and yield a csv.DictReader over it.

    Encoding and delimiter are taken from the click context. The file is closed
    when the context manager exits.
    """
    options = ctx.obj

    with open(path, mode=mode, encoding=options["ENCODING"], newline="") as csv_file:
        dict_reader = csv.DictReader(csv_file, delimiter=options["DELIMITER"])
        yield dict_reader
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
@contextlib.contextmanager
def write_csv_file(
    ctx: click.Context, path: str | Path, fieldnames: list[str], mode: str = "w", write_header: bool = True
):
    """
    Open ``path`` and yield a csv.DictWriter for the given columns.

    Encoding and delimiter are taken from the click context. The header row is
    written before the writer is handed out unless ``write_header`` is false.
    The file is closed when the context manager exits.
    """
    with open(path, mode=mode, encoding=ctx.obj["ENCODING"], newline="") as csv_file:
        dict_writer = csv.DictWriter(csv_file, fieldnames, delimiter=ctx.obj["DELIMITER"])

        if write_header:
            dict_writer.writeheader()

        yield dict_writer
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
@click.group()
@click.pass_context
@click.option("--base-url", default="http://localhost:8080", help="base URL to HTTP-based PPRL service")
@click.option(
    "-b", "--batch-size", type=click.IntRange(min=1), default=1_000, help="amount of bit vectors to match at a time"
)
@click.option("--timeout-secs", type=click.IntRange(min=1), default=30, help="seconds until a request times out")
@click.option("--delimiter", type=str, default=",", help="column delimiter for CSV files")
@click.option("--encoding", type=str, default="utf-8", help="character encoding for files")
def app(ctx: click.Context, base_url: str, batch_size: int, timeout_secs: int, delimiter: str, encoding: str):
    # Root command group: stores the global options on the context object so that
    # every subcommand can read them from ctx.obj.
    # (Deliberately a comment, not a docstring: click would print a docstring as help text.)
    ctx.ensure_object(dict)
    ctx.obj.update(
        {
            "BASE_URL": base_url,
            "BATCH_SIZE": batch_size,
            "TIMEOUT_SECS": timeout_secs,
            "DELIMITER": delimiter,
            "ENCODING": encoding,
        }
    )
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
@app.command()
@click.pass_context
@click.argument("base_match_request_file_path", type=click.Path(exists=True, path_type=Path))
@click.argument("vector_file_path", type=click.Path(exists=True, path_type=Path, dir_okay=False), nargs=-1)
@click.argument("output_file_path", type=click.Path(path_type=Path, dir_okay=False))
@click.option("--id-column", type=str, default="id", help="column name in input CSV file containing vector ID")
@click.option("--value-column", type=str, default="value", help="column name in input CSV file containing vector value")
def match(
    ctx: click.Context,
    base_match_request_file_path: Path,
    vector_file_path: tuple[Path, ...],
    output_file_path: Path,
    id_column: str,
    value_column: str,
):
    """
    Match bit vectors from CSV files against each other.

    BASE_MATCH_REQUEST_FILE_PATH is the path to a JSON file containing the base match request.
    VECTOR_FILE_PATH is the path to a CSV file containing bit vectors.
    At least two files must be specified.
    OUTPUT_FILE_PATH is the path of the CSV file where the matches should be written to.
    """
    # NOTE: the docstring above is shown by click as the command's help text.
    if len(vector_file_path) < 2:
        click.echo("Must specify at least two CSV files containing vectors", err=True)
        ctx.exit(1)

    client = create_client(ctx)
    base_match_request = parse_json_file_into(ctx, base_match_request_file_path, BaseMatchRequest)

    batch_size = int(ctx.obj["BATCH_SIZE"])
    file_count = len(vector_file_path)
    vectors_per_file: list[list[BitVectorEntity]] = []

    # all vector files are read into memory up front
    for p in vector_file_path:
        with read_csv_file(ctx, p, mode="r") as reader:
            vectors_per_file.append(read_bit_vector_entity_file(reader, id_column, value_column))

    do_pairwise_matching = base_match_request.config.method == MatchMethod.pairwise

    # pairwise matching is performed row-wise, so all files must have the same amount of entries
    if do_pairwise_matching:
        vector_lens = set(len(v) for v in vectors_per_file)

        if len(vector_lens) != 1:
            # list the actual number of vectors per file, in the order the files were given
            vector_counts = ", ".join(str(len(v)) for v in vectors_per_file)
            click.echo(
                "All bit vector files must have the same amount of vectors for pairwise matching, got "
                f"{vector_counts}",
                err=True,
            )
            ctx.exit(1)

    with write_csv_file(
        ctx,
        output_file_path,
        ["domain_id", "domain_file", "range_id", "range_file", "similarity"],
        mode="w",
        write_header=True,
    ) as writer:
        # every unordered pair of files is matched once, the earlier file acting as the domain
        for domain_file_idx, range_file_idx in itertools.combinations(range(file_count), 2):
            # get domain and range vectors
            domain_vectors, range_vectors = vectors_per_file[domain_file_idx], vectors_per_file[range_file_idx]
            # these are tracked for user feedback
            domain_file_path, range_file_path = vector_file_path[domain_file_idx], vector_file_path[range_file_idx]

            # construct the starting indices for batch-wise processing
            domain_start_idx = list(range(0, len(domain_vectors), batch_size))
            range_start_idx = list(range(0, len(range_vectors), batch_size))

            # when doing pairwise matching, matching will be performed row-wise
            if do_pairwise_matching:
                idx_pairs = zip(domain_start_idx, range_start_idx)
                pair_count = min(len(domain_start_idx), len(range_start_idx))
            # otherwise, cross-wise matching is performed
            else:
                idx_pairs = itertools.product(domain_start_idx, range_start_idx)
                pair_count = len(domain_start_idx) * len(range_start_idx)

            # zip/product objects have no length, so it is passed explicitly to let
            # the progress bar show actual progress
            with click.progressbar(
                idx_pairs,
                length=pair_count,
                label=f"Matching bit vectors from {domain_file_path.name} and {range_file_path.name}",
            ) as pbar:
                # iterate over pairs of starting indices for domain and range
                for domain_start, range_start in pbar:
                    # retrieve batch of vectors
                    domain_vector_batch = domain_vectors[domain_start : domain_start + batch_size]
                    range_vector_batch = range_vectors[range_start : range_start + batch_size]

                    # and perform matching
                    r = client.match(
                        base_match_request.with_vectors(
                            domain_lst=domain_vector_batch, range_lst=range_vector_batch
                        )
                    )

                    writer.writerows(
                        [
                            {
                                "domain_id": m.domain.id,
                                "domain_file": domain_file_path.name,
                                "range_id": m.range.id,
                                "range_file": range_file_path.name,
                                "similarity": m.similarity,
                            }
                            for m in r.matches
                        ]
                    )
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
@app.command()
@click.pass_context
@click.argument("base_transform_request_file_path", type=click.Path(exists=True, path_type=Path))
@click.argument("entity_file_path", type=click.Path(exists=True, path_type=Path))
@click.argument("output_file_path", type=click.Path(path_type=Path, dir_okay=False))
@click.option("--entity-id-column", type=str, default="id", help="column name in entity CSV file containing ID")
def transform(
    ctx: click.Context,
    base_transform_request_file_path: Path,
    entity_file_path: Path,
    output_file_path: Path,
    entity_id_column: str,
):
    """
    Perform pre-processing on a CSV file with entities.

    BASE_TRANSFORM_REQUEST_FILE_PATH is the path to a JSON file containing the base transform request.
    ENTITY_FILE_PATH is the path to the CSV file containing entities.
    OUTPUT_FILE_PATH is the path of the CSV file where the pre-processed entities should be written to.
    """
    client = create_client(ctx)
    base_transform_request = parse_json_file_into(ctx, base_transform_request_file_path, BaseTransformRequest)

    # load all entities; the input column names are reused as the output header
    with read_csv_file(ctx, entity_file_path, mode="r") as reader:
        field_names, entities = read_attribute_value_entity_file(reader, entity_id_column)

    # starting offset of every batch
    batch_size = int(ctx.obj["BATCH_SIZE"])
    batch_offsets = list(range(0, len(entities), batch_size))

    with write_csv_file(ctx, output_file_path, field_names, mode="w", write_header=True) as writer:
        with click.progressbar(batch_offsets, label="Transforming entities") as pbar:
            for offset in pbar:
                # send one batch to the service
                response = client.transform(
                    base_transform_request.with_entities(entities[offset : offset + batch_size])
                )

                # one output row per returned entity: ID column first, then its attributes
                rows = []

                for transformed in response.entities:
                    row = {entity_id_column: transformed.id}
                    row.update(transformed.attributes)
                    rows.append(row)

                writer.writerows(rows)
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
@app.command()
@click.pass_context
@click.argument("base_mask_request_file_path", type=click.Path(exists=True, path_type=Path))
@click.argument("entity_file_path", type=click.Path(exists=True, path_type=Path))
@click.argument("output_file_path", type=click.Path(dir_okay=False, file_okay=True, path_type=Path))
@click.option("--entity-id-column", type=str, default="id", help="column name in entity CSV file containing ID")
@click.option(
    "--entity-value-column", type=str, default="value", help="column name in output CSV file containing vector value"
)
def mask(
    ctx: click.Context,
    base_mask_request_file_path: Path,
    entity_file_path: Path,
    output_file_path: Path,
    entity_id_column: str,
    entity_value_column: str,
):
    """
    Mask a CSV file with entities.

    BASE_MASK_REQUEST_FILE_PATH is the path to a JSON file containing the base mask request.
    ENTITY_FILE_PATH is the path to the CSV file containing entities.
    OUTPUT_FILE_PATH is the path of the CSV file where the masked entities should be written to.
    """
    client = create_client(ctx)
    base_mask_request = parse_json_file_into(ctx, base_mask_request_file_path, BaseMaskRequest)

    # only the entities are needed here; the input column names are discarded
    with read_csv_file(ctx, entity_file_path, mode="r") as reader:
        _, entities = read_attribute_value_entity_file(reader, entity_id_column)

    # starting offset of every batch
    batch_size = int(ctx.obj["BATCH_SIZE"])
    batch_offsets = list(range(0, len(entities), batch_size))
    output_columns = [entity_id_column, entity_value_column]

    with write_csv_file(ctx, output_file_path, output_columns, mode="w", write_header=True) as writer:
        with click.progressbar(batch_offsets, label="Masking entities") as pbar:
            for offset in pbar:
                # send one batch to the service
                response = client.mask(base_mask_request.with_entities(entities[offset : offset + batch_size]))

                # one output row per returned entity: its ID and its value
                rows = []

                for masked in response.entities:
                    rows.append({entity_id_column: masked.id, entity_value_column: masked.value})

                writer.writerows(rows)
|
|
317
|
+
|
|
318
|
+
|
|
319
|
+
@app.group()
def estimate():
    """Estimate attribute weights based on randomly generated data."""
    # Pure command group without logic of its own; the docstring above doubles as its help text.
|
|
323
|
+
|
|
324
|
+
|
|
325
|
+
def common_estimate_options(fn):
    """Attach the options shared by the estimate subcommands to ``fn`` and return the decorated function."""
    # the decorators are applied in the order they are listed here
    shared_options = (
        click.option(
            "--base-transform-request-file-path",
            type=click.Path(exists=True, path_type=Path),
            help="path to file containing attribute-level and global transformer definitions",
        ),
        click.option(
            "-q",
            "--token-size",
            type=click.IntRange(min=2),
            default=2,
            help="size of tokens to split each attribute value into",
        ),
        click.option(
            "-p", "--padding", type=str, default="_", help="padding to use when splitting attribute values into tokens"
        ),
    )

    for add_option in shared_options:
        fn = add_option(fn)

    return fn
|
|
343
|
+
|
|
344
|
+
|
|
345
|
+
@estimate.command()
@click.pass_context
@click.argument("GENERATOR_CONFIG_FILE_PATH", type=click.Path(exists=True, path_type=Path))
@click.argument("ATTRIBUTE_CONFIG_OUTPUT_FILE_PATH", type=click.Path(path_type=Path))
@common_estimate_options
def faker(
    ctx: click.Context,
    generator_config_file_path: Path,
    attribute_config_output_file_path: Path,
    base_transform_request_file_path: Path | None,
    token_size: int,
    padding: str,
):
    """
    Estimate attribute weights based on data generated by Faker.

    GENERATOR_CONFIG_FILE_PATH is the file which defines the Faker providers to use.
    ATTRIBUTE_CONFIG_OUTPUT_FILE_PATH is the path to the file where the attribute weights will be written to.
    """

    # Faker is imported lazily so a missing package yields an install hint instead of a traceback
    try:
        from faker import Faker
    except ImportError:
        click.echo("Faker not found, install it with `pip install fable_client[faker]`", err=True)
        raise click.exceptions.Exit(1)

    client = create_client(ctx)
    faker_generator_config = parse_json_file_into(ctx, generator_config_file_path, FakerGeneratorConfig)
    batch_size = int(ctx.obj["BATCH_SIZE"])

    # use the user-supplied base transform request if there is one, otherwise a built-in default
    if base_transform_request_file_path is not None:
        base_transform_request = parse_json_file_into(ctx, base_transform_request_file_path, BaseTransformRequest)
    else:
        base_transform_request = BaseTransformRequest(
            config=TransformConfig(empty_value=EmptyValueHandling.skip),
            global_transformers=GlobalTransformerConfig(before=[NormalizationTransformer()]),
        )

    # seeded faker instance for the configured locale
    fake = Faker(faker_generator_config.locale)
    fake.seed_instance(faker_generator_config.seed)

    def _bind_generator(spec: FakerGeneratorSpec):
        """Return a zero-argument callable that produces one value for the given specification."""
        provider_fn = getattr(fake, spec.function_name, None)

        # reject names that do not resolve to something callable on the faker instance
        if not callable(provider_fn):
            click.echo(f"Invalid faker function: {spec.function_name}", err=True)
            raise click.exceptions.Exit(1)

        return lambda: str(provider_fn(**spec.args))

    # attribute name -> value factory; all specifications are checked before any data is generated
    value_factories = {}

    for spec in faker_generator_config.generators:
        value_factories[spec.attribute_name] = _bind_generator(spec)

    # generate the requested amount of entities, with running numbers as IDs
    entities = []

    for entity_no in range(faker_generator_config.count):
        generated_attributes = {name: make_value() for name, make_value in value_factories.items()}
        entities.append(AttributeValueEntity(id=str(entity_no), attributes=generated_attributes))

    # compute stats for each attribute
    attribute_name_to_stats = compute_attribute_stats(
        client, entities, base_transform_request, token_size, padding, batch_size
    )

    # the n-gram entropy becomes the weight of the attribute
    attribute_configs = []

    for name, stats in attribute_name_to_stats.items():
        attribute_configs.append(
            WeightedAttributeConfig(
                attribute_name=name,
                weight=stats["ngram_entropy"],
                average_token_count=stats["average_tokens"],
            )
        )

    # write the attribute configs as a JSON list
    with open(attribute_config_output_file_path, mode="w", encoding="utf-8") as output_file:
        serialized_configs = [cfg.model_dump(mode="json", exclude_none=True) for cfg in attribute_configs]
        json.dump(serialized_configs, output_file, indent=2)
|
fable_client/_client.py
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
from json import JSONDecodeError
|
|
2
|
+
from typing import TypeVar
|
|
3
|
+
|
|
4
|
+
import httpx
|
|
5
|
+
from fable_model import (
|
|
6
|
+
VectorMatchRequest,
|
|
7
|
+
VectorMatchResponse,
|
|
8
|
+
EntityTransformRequest,
|
|
9
|
+
EntityTransformResponse,
|
|
10
|
+
EntityMaskRequest,
|
|
11
|
+
EntityMaskResponse,
|
|
12
|
+
)
|
|
13
|
+
from pydantic import BaseModel, ValidationError
|
|
14
|
+
|
|
15
|
+
_MI = TypeVar("_MI", bound=BaseModel)
|
|
16
|
+
_MO = TypeVar("_MO", bound=BaseModel)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class GenericErrorResponse(BaseModel):
    """Error body consisting of a single ``detail`` message, as parsed from non-422 error responses."""

    # message text; appended to the error message built in new_error_from_response
    detail: str
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class ValidationErrorDetail(BaseModel):
    """Single entry of the ``detail`` list in a validation error (422) response."""

    # Path to the offending value. FastAPI reports list positions as integers
    # (e.g. ["body", "entities", 0, "id"]), which a plain list[str] would reject
    # and thereby make the whole error response unparseable.
    loc: list[str | int]
    # message describing the problem
    msg: str
    # error type identifier as sent by the service
    type: str
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class ValidationErrorResponse(BaseModel):
    """Error body of a validation error; new_error_from_response parses it from 422 responses."""

    # one entry per reported validation problem
    detail: list[ValidationErrorDetail]
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class FableError(httpx.HTTPError):
    """
    HTTP error that optionally carries the parsed error body of the response.

    ``error_type`` is "validation" for a ValidationErrorResponse, "default" for a
    GenericErrorResponse and "unknown" when no parsed error body is available.
    """

    def __init__(
        self, message: str, request: httpx.Request, error: GenericErrorResponse | ValidationErrorResponse = None
    ):
        super().__init__(message)
        self._request = request
        self.error_response = error

        # classify the error by the kind of body that could be parsed
        if isinstance(error, ValidationErrorResponse):
            self.error_type = "validation"
        elif isinstance(error, GenericErrorResponse):
            self.error_type = "default"
        else:
            self.error_type = "unknown"
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def new_error_from_response(r: httpx.Response):
    """
    Build a FableError for a response with an unexpected status code.

    The response body is parsed on a best-effort basis: if it cannot be read as
    the expected error model, the error only carries the status code.

    Args:
        r: response received from the service

    Returns:
        FableError with the request of the response and, if parseable, the error body
    """
    error_response = None
    error_message = f"received status code {r.status_code}"

    # Failures that simply mean "the body is not the error document we expected":
    # - ValidationError: JSON object with the wrong shape
    # - ValueError: body is not valid JSON or cannot be decoded (includes JSONDecodeError)
    # - TypeError: JSON is valid but not an object (list, string, null, ...), so it
    #   cannot be expanded into keyword arguments
    unparseable_body_errors = (ValidationError, ValueError, TypeError)

    # validation error (422 by default with FastAPI)
    if r.status_code == httpx.codes.UNPROCESSABLE_ENTITY.value:
        try:
            error_response = ValidationErrorResponse(**r.json())
            error_message += ": invalid request"
        except unparseable_body_errors:
            # deliberate: fall back to the status-code-only message
            pass
    else:
        try:
            error_response = GenericErrorResponse(**r.json())
            error_message += f": {error_response.detail}"
        except unparseable_body_errors:
            # deliberate: fall back to the status-code-only message
            pass

    return FableError(error_message, r.request, error_response)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
class FableClient(object):
    """Client that sends match, transform and mask requests to the service over HTTP."""

    def __init__(self, client: httpx.Client = None, base_url: str = None):
        # a caller-supplied client takes precedence; base_url is only used to create one otherwise
        if not client:
            client = httpx.Client(base_url=base_url)

        self._client = client

    def _request(self, path: str, model_in: _MI, model_out: type[_MO]) -> _MO:
        """POST ``model_in`` as JSON to ``path`` and parse a 200 response body into ``model_out``."""
        payload = model_in.model_dump(mode="json")
        response = self._client.post(path, json=payload)

        if response.status_code == httpx.codes.OK.value:
            return model_out(**response.json())

        # anything but a 200 is reported as an error
        raise new_error_from_response(response)

    def match(self, request: VectorMatchRequest):
        """Send a match request and return the parsed VectorMatchResponse."""
        return self._request("match/", request, VectorMatchResponse)

    def transform(self, request: EntityTransformRequest):
        """Send a transform request and return the parsed EntityTransformResponse."""
        return self._request("transform/", request, EntityTransformResponse)

    def mask(self, request: EntityMaskRequest):
        """Send a mask request and return the parsed EntityMaskResponse."""
        return self._request("mask/", request, EntityMaskResponse)
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
__all__ = [
|
|
2
|
+
"split_into_wordlist",
|
|
3
|
+
"tokenize_wordlist",
|
|
4
|
+
"compute_average_tokens_for_token_list",
|
|
5
|
+
"count_tokens_in_token_list",
|
|
6
|
+
"compute_ngram_entropy",
|
|
7
|
+
"compute_attribute_stats",
|
|
8
|
+
"AttributeStats",
|
|
9
|
+
]
|
|
10
|
+
|
|
11
|
+
import math
|
|
12
|
+
from collections import defaultdict, Counter
|
|
13
|
+
from typing import TypedDict
|
|
14
|
+
|
|
15
|
+
import fable_core
|
|
16
|
+
from fable_model import AttributeValueEntity, BaseTransformRequest
|
|
17
|
+
|
|
18
|
+
from ._client import FableClient
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class AttributeStats(TypedDict):
    """Statistics computed for one attribute by compute_attribute_stats."""

    # result of compute_average_tokens_for_token_list for the attribute's values
    average_tokens: float
    # result of compute_ngram_entropy for the attribute's token counts
    ngram_entropy: float
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def split_into_wordlist(entities: list[AttributeValueEntity]) -> dict[str, list[str]]:
    """Group the attribute values of all entities by attribute name, keeping entity order."""
    values_by_attribute: dict[str, list[str]] = defaultdict(list)

    for attributes in (entity.attributes for entity in entities):
        for name in attributes:
            values_by_attribute[name].append(attributes[name])

    return values_by_attribute
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def tokenize_wordlist(wordlist: list[str], token_size=2, padding="_") -> list[set[str]]:
    """Tokenize every word with fable_core's tokenizer and return the results in word order."""
    token_sets = []

    for word in wordlist:
        token_sets.append(fable_core.common.tokenize(word, q=token_size, padding=padding))

    return token_sets
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def compute_average_tokens_for_token_list(token_list: list[set[str]]) -> float:
    """Return the mean number of tokens per entry, or 0 if there are no tokens at all."""
    token_total = 0

    for tokens in token_list:
        token_total += len(tokens)

    # also covers the empty list, which would otherwise divide by zero
    if not token_total:
        return 0

    return token_total / len(token_list)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def count_tokens_in_token_list(token_list: list[set[str]]) -> dict[str, int]:
    """Count in how many entries of ``token_list`` each token occurs."""
    return Counter(token for word_tokens in token_list for token in word_tokens)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def compute_ngram_entropy(token_counts: dict[str, int]) -> float:
    """
    Compute the Shannon entropy (in bits) of a token frequency distribution.

    Args:
        token_counts: mapping of token to the number of times it was counted

    Returns:
        entropy of the distribution, 0.0 if there are no tokens or only one distinct token
    """
    total_ngram_count = sum(token_counts.values())
    # start with a float and subtract each term so that the result is always a
    # float and never the negative zero that negating an accumulated 0.0 yields
    entropy = 0.0

    for count in token_counts.values():
        # a token that never occurred contributes nothing (p * log2(p) tends to 0 for p -> 0)
        # and must be skipped, since log2(0) is undefined
        if count == 0:
            continue

        p = count / total_ngram_count
        entropy -= p * math.log2(p)

    return entropy
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def compute_attribute_stats(
    client: FableClient,
    entities: list[AttributeValueEntity],
    base_transform_request: BaseTransformRequest,
    token_size: int = 2,
    padding: str = "_",
    batch_size: int = 100,
):
    """
    Transform the entities through the service and compute token statistics per attribute.

    Args:
        client: client used to send the transform requests
        entities: entities to compute statistics for
        base_transform_request: request template each batch of entities is attached to
        token_size: size of tokens to split attribute values into
        padding: padding to use when splitting attribute values into tokens
        batch_size: amount of entities sent per transform request

    Returns:
        mapping of attribute name to its average token count and n-gram entropy
    """
    processed_entities: list[AttributeValueEntity] = []

    # run the entities through the transform endpoint, one batch at a time
    for start in range(0, len(entities), batch_size):
        batch = entities[start : start + batch_size]
        response = client.transform(base_transform_request.with_entities(batch))
        processed_entities.extend(response.entities)

    stats_by_attribute = {}

    for attr_name, wordlist in split_into_wordlist(processed_entities).items():
        token_list = tokenize_wordlist(wordlist, token_size=token_size, padding=padding)
        average_tokens = compute_average_tokens_for_token_list(token_list)
        ngram_entropy = compute_ngram_entropy(count_tokens_in_token_list(token_list))

        stats_by_attribute[attr_name] = {"average_tokens": average_tokens, "ngram_entropy": ngram_entropy}

    return stats_by_attribute
|
fable_client/_model.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
from typing import Any
|
|
2
|
+
|
|
3
|
+
from pydantic import BaseModel, Field
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class FakerGeneratorSpec(BaseModel):
    """Specification of one generated attribute in a Faker generator configuration."""

    # name of the function to generate values with
    function_name: str
    # name of the attribute the generated values belong to
    attribute_name: str
    # arguments for the generator function; empty by default
    args: dict[str, Any] = Field(default_factory=dict)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _default_faker_locale() -> list[str]:
    """Return a new list holding the default locale; used as default factory for FakerGeneratorConfig.locale."""
    return ["en_US"]
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class FakerGeneratorConfig(BaseModel):
    """Configuration for generating entities with Faker."""

    # seed value
    seed: int
    # amount of entities; must not be negative
    count: int = Field(ge=0)
    # locales; defaults to ["en_US"]
    locale: list[str] = Field(default_factory=_default_faker_locale)
    # one specification per attribute to generate
    generators: list[FakerGeneratorSpec]
|
fable_client/main.py
ADDED
fable_client/types.py
ADDED
|
@@ -0,0 +1,489 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: fable-client
|
|
3
|
+
Version: 0.4.1
|
|
4
|
+
Summary: HTTP-based client for interacting with the FABLE service for privacy-preserving record linkage with Bloom filters.
|
|
5
|
+
License-Expression: MIT
|
|
6
|
+
License-File: LICENSE
|
|
7
|
+
Keywords: record linkage,privacy,bloom filter,bitarray,cryptography,service,client,cli
|
|
8
|
+
Author: Maximilian Jugl
|
|
9
|
+
Requires-Python: >=3.10,<4
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Environment :: Console
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Intended Audience :: Education
|
|
14
|
+
Classifier: Intended Audience :: End Users/Desktop
|
|
15
|
+
Classifier: Intended Audience :: Information Technology
|
|
16
|
+
Classifier: Intended Audience :: Science/Research
|
|
17
|
+
Classifier: Topic :: Scientific/Engineering
|
|
18
|
+
Classifier: Topic :: Security :: Cryptography
|
|
19
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
20
|
+
Classifier: Programming Language :: Python :: 3
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
23
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
24
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
25
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
26
|
+
Provides-Extra: faker
|
|
27
|
+
Requires-Dist: click (>=8.0.0,<9.0.0)
|
|
28
|
+
Requires-Dist: fable-core (>=0.1.5,<0.2.0)
|
|
29
|
+
Requires-Dist: fable-model (>=0.1.7,<0.2.0)
|
|
30
|
+
Requires-Dist: faker (>=26.0.0) ; extra == "faker"
|
|
31
|
+
Requires-Dist: httpx (>=0.28.0,<0.29.0)
|
|
32
|
+
Project-URL: Repository, https://github.com/ul-mds/fable-client
|
|
33
|
+
Description-Content-Type: text/markdown
|
|
34
|
+
|
|
35
|
+
[](https://pypi.org/project/fable-client/)
|
|
36
|
+
[](https://pypi.org/project/fable-client/)
|
|
37
|
+

|
|
38
|
+
[](https://pypi.org/project/fable-client/)
|
|
39
|
+
|
|
40
|
+
# FABLE Client
|
|
41
|
+
|
|
42
|
+
This package contains an HTTP-based client for working with the server provided by
|
|
43
|
+
the [PPRL service](https://github.com/ul-mds/fable-pprl-service) which is part of the FABLE
|
|
44
|
+
(**F**ederated **A**nonymized **B**loom filter **L**inkage **E**ngine) ecosystem.
|
|
45
|
+
It also contains a command-line application which uses the library to process CSV files.
|
|
46
|
+
|
|
47
|
+
## Installation
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
pip install fable-client
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
Weight estimation requires additional packages which are not shipped by default.
|
|
54
|
+
To add them, install this package using the following command.
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
pip install fable-client[faker]
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
## Library methods
|
|
61
|
+
|
|
62
|
+
The library exposes functions for entity pre-processing, masking and bit vector matching.
|
|
63
|
+
They follow the data model that is also used by the FABLE PPRL service, which is exposed through
|
|
64
|
+
the [FABLE model package](https://github.com/ul-mds/fable-model).
|
|
65
|
+
|
|
66
|
+
### Entity transformation
|
|
67
|
+
|
|
68
|
+
```python
|
|
69
|
+
import fable_client
|
|
70
|
+
from fable_model import (
|
|
71
|
+
EntityTransformRequest,
|
|
72
|
+
TransformConfig,
|
|
73
|
+
EmptyValueHandling,
|
|
74
|
+
AttributeValueEntity,
|
|
75
|
+
GlobalTransformerConfig,
|
|
76
|
+
NormalizationTransformer,
|
|
77
|
+
)
|
|
78
|
+
|
|
79
|
+
client = fable_client.FableClient(base_url="http://localhost:8080")
|
|
80
|
+
|
|
81
|
+
response = client.transform(
|
|
82
|
+
EntityTransformRequest(
|
|
83
|
+
config=TransformConfig(empty_value=EmptyValueHandling.error),
|
|
84
|
+
entities=[AttributeValueEntity(id="001", attributes={"first_name": "Müller", "last_name": "Ludenscheidt"})],
|
|
85
|
+
global_transformers=GlobalTransformerConfig(before=[NormalizationTransformer()]),
|
|
86
|
+
)
|
|
87
|
+
)
|
|
88
|
+
|
|
89
|
+
print(response.entities)
|
|
90
|
+
# => [AttributeValueEntity(id='001', attributes={'first_name': 'muller', 'last_name': 'ludenscheidt'})]
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
### Entity masking
|
|
94
|
+
|
|
95
|
+
```python
|
|
96
|
+
import fable_client
|
|
97
|
+
from fable_model import (
|
|
98
|
+
EntityMaskRequest,
|
|
99
|
+
MaskConfig,
|
|
100
|
+
HashConfig,
|
|
101
|
+
HashFunction,
|
|
102
|
+
HashAlgorithm,
|
|
103
|
+
RandomHash,
|
|
104
|
+
CLKFilter,
|
|
105
|
+
AttributeValueEntity,
|
|
106
|
+
)
|
|
107
|
+
|
|
108
|
+
client = fable_client.FableClient(base_url="http://localhost:8080")
|
|
109
|
+
|
|
110
|
+
response = client.mask(
|
|
111
|
+
EntityMaskRequest(
|
|
112
|
+
config=MaskConfig(
|
|
113
|
+
token_size=2,
|
|
114
|
+
hash=HashConfig(
|
|
115
|
+
function=HashFunction(algorithms=[HashAlgorithm.sha1], key="s3cr3t_k3y"), strategy=RandomHash()
|
|
116
|
+
),
|
|
117
|
+
filter=CLKFilter(hash_values=5, filter_size=256),
|
|
118
|
+
),
|
|
119
|
+
entities=[AttributeValueEntity(id="001", attributes={"first_name": "muller", "last_name": "ludenscheidt"})],
|
|
120
|
+
)
|
|
121
|
+
)
|
|
122
|
+
|
|
123
|
+
print(response.entities)
|
|
124
|
+
# => [BitVectorEntity(id='001', value='SKkgqBHBCJJCANICEKSpWMAUBYCQEMLuZgEQGBKRC8A=')]
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
### Bit vector matching
|
|
128
|
+
|
|
129
|
+
```python
|
|
130
|
+
import fable_client
|
|
131
|
+
from fable_model import VectorMatchRequest, MatchConfig, SimilarityMeasure, BitVectorEntity
|
|
132
|
+
|
|
133
|
+
client = fable_client.FableClient(base_url="http://localhost:8080")
|
|
134
|
+
|
|
135
|
+
response = client.match(
|
|
136
|
+
VectorMatchRequest(
|
|
137
|
+
config=MatchConfig(measure=SimilarityMeasure.jaccard, threshold=0.8),
|
|
138
|
+
domain=[BitVectorEntity(id="001", value="SKkgqBHBCJJCANICEKSpWMAUBYCQEMLuZgEQGBKRC8A=")],
|
|
139
|
+
range=[
|
|
140
|
+
BitVectorEntity(id="100", value="UKkgqBHBDJJCANICELSpWMAUBYCMEMLrZgEQGBKRC7A="),
|
|
141
|
+
BitVectorEntity(id="101", value="H5DN45iUeEjrjbHZrzHb3AyQk9O4IgxcpENKKzEKRLE="),
|
|
142
|
+
],
|
|
143
|
+
)
|
|
144
|
+
)
|
|
145
|
+
|
|
146
|
+
print(response.matches)
|
|
147
|
+
# => [Match(domain=BitVectorEntity(id='001', value='SKkgqBHBCJJCANICEKSpWMAUBYCQEMLuZgEQGBKRC8A='), range=BitVectorEntity(id='100', value='UKkgqBHBDJJCANICELSpWMAUBYCMEMLrZgEQGBKRC7A='), similarity=0.8536585365853658)]
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
### Attribute weight estimation
|
|
151
|
+
|
|
152
|
+
```python
|
|
153
|
+
import fable_client
|
|
154
|
+
from fable_model import (
|
|
155
|
+
AttributeValueEntity,
|
|
156
|
+
BaseTransformRequest,
|
|
157
|
+
TransformConfig,
|
|
158
|
+
EmptyValueHandling,
|
|
159
|
+
GlobalTransformerConfig,
|
|
160
|
+
NormalizationTransformer,
|
|
161
|
+
)
|
|
162
|
+
|
|
163
|
+
client = fable_client.FableClient(base_url="http://localhost:8080")
|
|
164
|
+
|
|
165
|
+
stats = fable_client.estimate.compute_attribute_stats(
|
|
166
|
+
client,
|
|
167
|
+
[
|
|
168
|
+
AttributeValueEntity(id="001", attributes={"given_name": "Max", "last_name": "Mustermann", "gender": "m"}),
|
|
169
|
+
AttributeValueEntity(id="002", attributes={"given_name": "Maria", "last_name": "Musterfrau", "gender": "f"}),
|
|
170
|
+
],
|
|
171
|
+
BaseTransformRequest(
|
|
172
|
+
config=TransformConfig(empty_value=EmptyValueHandling.skip),
|
|
173
|
+
global_transformers=GlobalTransformerConfig(before=[NormalizationTransformer()]),
|
|
174
|
+
),
|
|
175
|
+
)
|
|
176
|
+
|
|
177
|
+
print(stats)
|
|
178
|
+
# => {'given_name': {'average_tokens': 5.0, 'ngram_entropy': 2.9219280948873623}, 'last_name': {'average_tokens': 11.0, 'ngram_entropy': 3.913977073182751}, 'gender': {'average_tokens': 2.0, 'ngram_entropy': 2.0}}
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
## Command line interface
|
|
182
|
+
|
|
183
|
+
The `fable` command exposes all the library's functions and adapts them to work with CSV files.
|
|
184
|
+
Running `fable --help` provides an overview of the command options.
|
|
185
|
+
|
|
186
|
+
```
|
|
187
|
+
$ fable --help
|
|
188
|
+
Usage: fable [OPTIONS] COMMAND [ARGS]...
|
|
189
|
+
|
|
190
|
+
HTTP client for performing PPRL based on Bloom filters.
|
|
191
|
+
|
|
192
|
+
Options:
|
|
193
|
+
--base-url TEXT base URL to HTTP-based PPRL service
|
|
194
|
+
-b, --batch-size INTEGER RANGE amount of bit vectors to match at a time [x>=1]
|
|
195
|
+
--timeout-secs INTEGER RANGE seconds until a request times out [x>=1]
|
|
196
|
+
--delimiter TEXT column delimiter for CSV files
|
|
197
|
+
--encoding TEXT character encoding for files
|
|
198
|
+
--help Show this message and exit.
|
|
199
|
+
|
|
200
|
+
Commands:
|
|
201
|
+
estimate Estimate attribute weights based on randomly generated data.
|
|
202
|
+
mask Mask a CSV file with entities.
|
|
203
|
+
match Match bit vectors from CSV files against each other.
|
|
204
|
+
transform Perform pre-processing on a CSV file with entities
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
The `fable` command works on two basic types of CSV files that follow a simple structure.
|
|
208
|
+
Entity files are CSV files that contain a column with a unique identifier and arbitrary additional columns which
|
|
209
|
+
contain values for certain attributes that identify an entity.
|
|
210
|
+
Each row is representative of a single entity.
|
|
211
|
+
|
|
212
|
+
```csv
|
|
213
|
+
id,first_name,last_name,date_of_birth,gender
|
|
214
|
+
001,Natalie,Sampson,1956-12-16,female
|
|
215
|
+
002,Eric,Lynch,1910-01-11,female
|
|
216
|
+
003,Pam,Vaughn,1983-10-05,male
|
|
217
|
+
004,David,Jackson,2006-01-27,male
|
|
218
|
+
005,Rachel,Dyer,1904-02-02,female
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
Bit vector files contain an ID column and a value column which contains a representative bit vector.
|
|
222
|
+
These bit vectors are generally generated by masking a record from an entity file.
|
|
223
|
+
|
|
224
|
+
```csv
|
|
225
|
+
id,value
|
|
226
|
+
001,0Dr8t+kE5ltI+xdM85fwx0QLrTIgvFN35/0YvODNdOE0AaUHPphikXYy4LlArE4UqfjPs+wKtT233R7lBzSp5mwkCjTzA1tl0N7s+sFeKyIrOiGk0gNIYvA=
|
|
227
|
+
002,QMEIkE9TN1Quv0K0QAIk1RZD3qF7nQh0IyOYqVDf8IQkyaLGcFjiLHsEgBpU8CRSCuATbWpjEwGi3dilizySQy4miGiJolilYmwKysjseq+IFsAU3T1IRjA=
|
|
228
|
+
003,BqFoNZhrAVBq9SV1wBK0dUZLHDM9hCBoO4XdKCzvasSUELQeAB8+DV5tAhDl5KCSJfDCB6JG4WSoCFbozXqBYSUMqEQJE0JwhpRK6oLOcRRoGwGESDBMZwA=
|
|
229
|
+
004,8C9KItMTwtz4oXQvo8G0t1bTnwspnghmJwyqqcL2RIHASb4XJHAqybMCXQBm5mq6h/kdxGbblxBjhy79jRUcI60haqZhNsst0n7OUAxM/UoZVumIilRIbCA=
|
|
230
|
+
005,CFk4I0sKwnRoiTEOQASy1QZfHCGB1GBgYQDcZwDDtIkGGLOmLRhrQyOSlQDUDoYTbvaBRVqbkRnqmYQbDTEGlG+2y60FMmBEKtxsr0I4I00oMpuoXAsDWmA=
|
|
231
|
+
```
|
|
232
|
+
|
|
233
|
+
Pre-processing is done with the `fable transform` command.
|
|
234
|
+
It requires a base transform request file, an entity file and an output file to write the pre-processed entities to.
|
|
235
|
+
Attribute and global transformer configurations can be provided, but at least one must be specified.
|
|
236
|
+
|
|
237
|
+
In this example, a global normalization transformer which is executed before all attribute-specific transformers
|
|
238
|
+
is defined.
|
|
239
|
+
Date time reformatting is applied to the "date of birth" column in the input file.
|
|
240
|
+
|
|
241
|
+
_request.json_
|
|
242
|
+
|
|
243
|
+
```json
|
|
244
|
+
{
|
|
245
|
+
"config": {
|
|
246
|
+
"empty_value": "skip"
|
|
247
|
+
},
|
|
248
|
+
"attribute_transformers": [
|
|
249
|
+
{
|
|
250
|
+
"attribute_name": "date_of_birth",
|
|
251
|
+
"transformers": [
|
|
252
|
+
{
|
|
253
|
+
"name": "date_time",
|
|
254
|
+
"input_format": "%Y-%m-%d",
|
|
255
|
+
"output_format": "%Y%m%d"
|
|
256
|
+
}
|
|
257
|
+
]
|
|
258
|
+
}
|
|
259
|
+
],
|
|
260
|
+
"global_transformers": {
|
|
261
|
+
"before": [
|
|
262
|
+
{
|
|
263
|
+
"name": "normalization"
|
|
264
|
+
}
|
|
265
|
+
]
|
|
266
|
+
}
|
|
267
|
+
}
|
|
268
|
+
```
|
|
269
|
+
|
|
270
|
+
```
|
|
271
|
+
$ fable transform ./request.json ./input.csv ./output.csv
|
|
272
|
+
Transforming entities [####################################] 100%
|
|
273
|
+
```
|
|
274
|
+
|
|
275
|
+
_output.csv_
|
|
276
|
+
|
|
277
|
+
```csv
|
|
278
|
+
id,first_name,last_name,date_of_birth,gender
|
|
279
|
+
001,natalie,sampson,19561216,female
|
|
280
|
+
002,eric,lynch,19100111,female
|
|
281
|
+
003,pam,vaughn,19831005,male
|
|
282
|
+
004,david,jackson,20060127,male
|
|
283
|
+
005,rachel,dyer,19040202,female
|
|
284
|
+
```
|
|
285
|
+
|
|
286
|
+
Masking is done with `fable mask` and its subcommands.
|
|
287
|
+
It requires a base mask request file, an entity file and an output file to write the masked entities to.
|
|
288
|
+
|
|
289
|
+
_request.json_
|
|
290
|
+
|
|
291
|
+
```json
|
|
292
|
+
{
|
|
293
|
+
"config": {
|
|
294
|
+
"token_size": 2,
|
|
295
|
+
"hash": {
|
|
296
|
+
"function": {
|
|
297
|
+
"algorithms": ["sha256"],
|
|
298
|
+
"key": "s3cr3t_k3y"
|
|
299
|
+
},
|
|
300
|
+
"strategy": {
|
|
301
|
+
"name": "random_hash"
|
|
302
|
+
}
|
|
303
|
+
},
|
|
304
|
+
"prepend_attribute_name": true,
|
|
305
|
+
"filter": {
|
|
306
|
+
"type": "clk",
|
|
307
|
+
"filter_size": 512,
|
|
308
|
+
"hash_values": 5,
|
|
309
|
+
"padding": "_",
|
|
310
|
+
"hardeners": [
|
|
311
|
+
{
|
|
312
|
+
"name": "permute",
|
|
313
|
+
"seed": 727
|
|
314
|
+
},
|
|
315
|
+
{
|
|
316
|
+
"name": "rehash",
|
|
317
|
+
"window_size": 16,
|
|
318
|
+
"window_step": 8,
|
|
319
|
+
"samples": 2
|
|
320
|
+
}
|
|
321
|
+
]
|
|
322
|
+
}
|
|
323
|
+
}
|
|
324
|
+
}
|
|
325
|
+
```
|
|
326
|
+
|
|
327
|
+
_input.csv_
|
|
328
|
+
|
|
329
|
+
```csv
|
|
330
|
+
id,first_name,last_name,date_of_birth,gender
|
|
331
|
+
001,natalie,sampson,19561216,female
|
|
332
|
+
002,eric,lynch,19100111,female
|
|
333
|
+
003,pam,vaughn,19831005,male
|
|
334
|
+
004,david,jackson,20060127,male
|
|
335
|
+
005,rachel,dyer,19040202,female
|
|
336
|
+
```
|
|
337
|
+
|
|
338
|
+
```
|
|
339
|
+
$ fable mask ./request.json ./input.csv ./output.csv
|
|
340
|
+
Masking entities [####################################] 100%
|
|
341
|
+
```
|
|
342
|
+
|
|
343
|
+
_output.csv_
|
|
344
|
+
|
|
345
|
+
```csv
|
|
346
|
+
id,value
|
|
347
|
+
001,wAWgITvQ1/VACpRYC2EKrfCkWziyEhmyKwi5sMsFrAQVoIBygTQScPRoIIAto0AwS0ihlcAIFAcQRwccY5IOmQ==
|
|
348
|
+
002,cFCwQIABQ+TgSSdlGM/z54BEUgmYhA1GKtCxQAKAXFIWiPAFIQYaFArgM61pUAAeATwBlBEOEw4Oowe0rbcMGw==
|
|
349
|
+
003,IgK16AAISCRoCuVAb1UBZYBBhGgxSEkKeMkTUCKAx4IAsNGJBS4ShgBAGIapBIQWJLiBFEEKAIWAGYS8ZZGMKw==
|
|
350
|
+
004,ZlBkyoYIEWmeaxbPDNng5JjHACkCAJwjlBCJQBJ4ZBSyOAukACUahOAFQ20oNwTQEDRA005+VUUfsUQcKCGNxg==
|
|
351
|
+
005,cUekQFQkI7TpTcRwmcNDoodRRBshlSEiAUjBQiMlxBLTmODMJICmDmxgUqYKonQEMFD58QsogRQFIgYUwJDOHA==
|
|
352
|
+
```
|
|
353
|
+
|
|
354
|
+
Matching is done with the `fable match` command.
|
|
355
|
+
It allows the matching of multiple bit vector input files at once.
|
|
356
|
+
If more than two files are provided, the command will pick out pairs of files and match their contents against one
|
|
357
|
+
another.
|
|
358
|
+
|
|
359
|
+
In this example, the bit vectors of two files are matched against each other.
|
|
360
|
+
The Jaccard index is used as a similarity measure and a match threshold of 70% is applied.
|
|
361
|
+
|
|
362
|
+
_request.json_
|
|
363
|
+
|
|
364
|
+
```json
|
|
365
|
+
{
|
|
366
|
+
"config": {
|
|
367
|
+
"measure": "jaccard",
|
|
368
|
+
"threshold": 0.7
|
|
369
|
+
}
|
|
370
|
+
}
|
|
371
|
+
```
|
|
372
|
+
|
|
373
|
+
_domain.csv_
|
|
374
|
+
|
|
375
|
+
```csv
|
|
376
|
+
id,value
|
|
377
|
+
001,wAWgITvQ1/VACpRYC2EKrfCkWziyEhmyKwi5sMsFrAQVoIBygTQScPRoIIAto0AwS0ihlcAIFAcQRwccY5IOmQ==
|
|
378
|
+
002,cFCwQIABQ+TgSSdlGM/z54BEUgmYhA1GKtCxQAKAXFIWiPAFIQYaFArgM61pUAAeATwBlBEOEw4Oowe0rbcMGw==
|
|
379
|
+
003,IgK16AAISCRoCuVAb1UBZYBBhGgxSEkKeMkTUCKAx4IAsNGJBS4ShgBAGIapBIQWJLiBFEEKAIWAGYS8ZZGMKw==
|
|
380
|
+
004,ZlBkyoYIEWmeaxbPDNng5JjHACkCAJwjlBCJQBJ4ZBSyOAukACUahOAFQ20oNwTQEDRA005+VUUfsUQcKCGNxg==
|
|
381
|
+
005,cUekQFQkI7TpTcRwmcNDoodRRBshlSEiAUjBQiMlxBLTmODMJICmDmxgUqYKonQEMFD58QsogRQFIgYUwJDOHA==
|
|
382
|
+
```
|
|
383
|
+
|
|
384
|
+
_range.csv_
|
|
385
|
+
|
|
386
|
+
```csv
|
|
387
|
+
id,value
|
|
388
|
+
101,kUSyxIgtIDSAB7ZYDkFQRZpFoMkCjCCCbDTWAUJTRAAEBpspBX4PNUZKi1AIVCABAjg6EAoKuwVleeUYgRBYoQ==
|
|
389
|
+
102,IAA0YE4MGexIiYdEjwNzoOKmIA4CEHEiKQASYFPhxQTQlPAAgYW3AWBYmQJ8YMoaAj0ZkoOrFyUmFo52TDcIKw==
|
|
390
|
+
103,BFAwREkkQbTdzddgDHFWgMRJMyxAMW+jq2ASICMBtIEr+YDCBRUgxEDIsQpciO4mAK3h2cIbXFQCMlaVpJPZIQ==
|
|
391
|
+
104,wBWgITvQ2/VACpRYC2EKrfCkWxiyEhmyKwi5sMsFrBQVoIBygTQScPRoIIAto0AwS0ihldAIFAcQRwccY5IOmQ==
|
|
392
|
+
105,QCCwIKQAED5AjaZYmodDcZAEBKkIxgAiDfEUoDKEdgEAEJAMAwcfQEbQkaQ4ANAABqiUscAKPQZEMJxRhTGIGQ==
|
|
393
|
+
```
|
|
394
|
+
|
|
395
|
+
```
|
|
396
|
+
$ fable match request.json domain.csv range.csv output.csv
|
|
397
|
+
Matching bit vectors from domain.csv and range.csv [####################################] 100%
|
|
398
|
+
```
|
|
399
|
+
|
|
400
|
+
_output.csv_
|
|
401
|
+
|
|
402
|
+
```csv
|
|
403
|
+
domain_id,domain_file,range_id,range_file,similarity
|
|
404
|
+
001,domain.csv,104,range.csv,0.9690721649484536
|
|
405
|
+
```
|
|
406
|
+
|
|
407
|
+
Weight estimation is done with the `fable estimate` command.
|
|
408
|
+
It generates random data based on a user-provided specification and computes estimates for attribute weights.
|
|
409
|
+
Data can be generated using [Faker](https://faker.readthedocs.io/).
|
|
410
|
+
|
|
411
|
+
*faker.json*
|
|
412
|
+
|
|
413
|
+
```json
|
|
414
|
+
{
|
|
415
|
+
"seed": 727,
|
|
416
|
+
"count": 5000,
|
|
417
|
+
"locale": ["de_DE"],
|
|
418
|
+
"generators": [
|
|
419
|
+
{"function_name": "first_name_nonbinary", "attribute_name": "given_name"},
|
|
420
|
+
{"function_name": "last_name", "attribute_name": "last_name"},
|
|
421
|
+
{"function_name": "random_element", "attribute_name": "gender", "args": {"elements": ["m", "f"]}},
|
|
422
|
+
{"function_name": "street_name", "attribute_name": "street_name"},
|
|
423
|
+
{"function_name": "city", "attribute_name": "municipality"},
|
|
424
|
+
{"function_name": "postcode", "attribute_name": "postcode"}
|
|
425
|
+
]
|
|
426
|
+
}
|
|
427
|
+
```
|
|
428
|
+
|
|
429
|
+
```
|
|
430
|
+
$ fable estimate faker faker.json faker-output.json
|
|
431
|
+
```
|
|
432
|
+
|
|
433
|
+
*faker-output.json*
|
|
434
|
+
|
|
435
|
+
```json
|
|
436
|
+
[
|
|
437
|
+
{
|
|
438
|
+
"attribute_name": "given_name",
|
|
439
|
+
"weight": 7.657958943890718,
|
|
440
|
+
"average_token_count": 7.5686
|
|
441
|
+
},
|
|
442
|
+
{
|
|
443
|
+
"attribute_name": "last_name",
|
|
444
|
+
"weight": 7.444573503220938,
|
|
445
|
+
"average_token_count": 7.5204
|
|
446
|
+
},
|
|
447
|
+
{
|
|
448
|
+
"attribute_name": "gender",
|
|
449
|
+
"weight": 1.9999971146079947,
|
|
450
|
+
"average_token_count": 2.0
|
|
451
|
+
},
|
|
452
|
+
{
|
|
453
|
+
"attribute_name": "street_name",
|
|
454
|
+
"weight": 7.605565770282046,
|
|
455
|
+
"average_token_count": 16.2188
|
|
456
|
+
},
|
|
457
|
+
{
|
|
458
|
+
"attribute_name": "municipality",
|
|
459
|
+
"weight": 7.659422921807241,
|
|
460
|
+
"average_token_count": 9.952
|
|
461
|
+
},
|
|
462
|
+
{
|
|
463
|
+
"attribute_name": "postcode",
|
|
464
|
+
"weight": 6.7812429085107,
|
|
465
|
+
"average_token_count": 5.9464
|
|
466
|
+
}
|
|
467
|
+
]
|
|
468
|
+
```
|
|
469
|
+
|
|
470
|
+
## Configuring pytest
|
|
471
|
+
|
|
472
|
+
In order to run integration tests, the FABLE PPRL service is needed.
|
|
473
|
+
The first option is to spin up the service independently and direct pytest to it.
|
|
474
|
+
Alternatively, pytest can start a Docker test container for the duration of the test run.
|
|
475
|
+
The following table shows all available configuration options.
|
|
476
|
+
These variables can be defined in `.env` or `.env.test`.
|
|
477
|
+
|
|
478
|
+
| **Environment variable** | **Description** | **Default** |
|
|
479
|
+
|-----------------------------------|-----------------------------------------------------------------------------|-------------|
|
|
480
|
+
| PYTEST_PPRL_BASE_URL<sup>1)</sup> | Base URL for the FABLE PPRL service | |
|
|
481
|
+
| PYTEST_PPRL_SERVICE_VERSION | Tag of the FABLE PPRL service image that will run inside the test container | latest |
|
|
482
|
+
| PYTEST_PPRL_SERVICE_PORT | Port that will be exposed by the test container | 8080 |
|
|
483
|
+
|
|
484
|
+
<sup>1)</sup> If defined, pytest will not spin up a test container.
|
|
485
|
+
|
|
486
|
+
## License
|
|
487
|
+
|
|
488
|
+
MIT.
|
|
489
|
+
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
fable_client/__init__.py,sha256=Y3swMGPFlHZLsTJyKiBDsRRigHmwildneq-imdBBAf8,163
|
|
2
|
+
fable_client/_cli.py,sha256=_WXKcIhkw0EtR7ma7cX24O0-jCTTjJ134OHRr1mViLM,17095
|
|
3
|
+
fable_client/_client.py,sha256=HfsYRAq8ZBiELu8euJPG-x_xfiEZp1-zscWRmCm1FTc,2711
|
|
4
|
+
fable_client/_estimate.py,sha256=GrMIOxzm9lxsILhhqW78zEOXITr6FQbMtYL0IhvkdqM,3085
|
|
5
|
+
fable_client/_model.py,sha256=SiuTYVBbryJvVI1ophhF4YifgkuAN_EwhwRd6B4LPEU,449
|
|
6
|
+
fable_client/main.py,sha256=TcxUSffggdRNGZhlt_iSvQ7hQzJ3WVfZuuQ4fHDXmGE,113
|
|
7
|
+
fable_client/types.py,sha256=eiYF458YKlLU_QUocVP7t_bmSkXG6TYDnYVH0vk1Peo,175
|
|
8
|
+
fable_client-0.4.1.dist-info/METADATA,sha256=4_9Vgxye_tZOZyWbkZ5_RCrv19xOvIHU8gRybaxAA5c,16211
|
|
9
|
+
fable_client-0.4.1.dist-info/WHEEL,sha256=EGEvSphFYqXKs23-kQBeyNoJP1nrT8ZJKQoi5p5DYL8,88
|
|
10
|
+
fable_client-0.4.1.dist-info/entry_points.txt,sha256=7W7ZrPoF3jcq4IEAcIzs1IdMKP0NcJOHje7yO2Zlxao,51
|
|
11
|
+
fable_client-0.4.1.dist-info/licenses/LICENSE,sha256=Q-Ktj_VJi4SeAGoslkUojX0DD9cZFxaw36u4FR5f61o,1117
|
|
12
|
+
fable_client-0.4.1.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 University Medical Center Leipzig, Dept. Medical Data Science
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|