lethe-cli 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lethe/__init__.py +3 -0
- lethe/cli.py +138 -0
- lethe/config.py +27 -0
- lethe/mapping/__init__.py +0 -0
- lethe/mapping/session_index.py +65 -0
- lethe/multiplier.py +102 -0
- lethe/multiply_pipeline.py +69 -0
- lethe/parsers/__init__.py +96 -0
- lethe/parsers/base.py +17 -0
- lethe/parsers/csv_parser.py +39 -0
- lethe/parsers/txt_parser.py +108 -0
- lethe/pipeline.py +89 -0
- lethe/replacer/__init__.py +0 -0
- lethe/replacer/engine.py +33 -0
- lethe/replacer/freeform.py +57 -0
- lethe/sanitizer.py +100 -0
- lethe/scanner/__init__.py +0 -0
- lethe/scanner/column_heuristics.py +52 -0
- lethe/scanner/confidence.py +46 -0
- lethe/scanner/engine.py +103 -0
- lethe/scanner/pattern_recognizers.py +50 -0
- lethe_cli-0.1.1.dist-info/METADATA +78 -0
- lethe_cli-0.1.1.dist-info/RECORD +26 -0
- lethe_cli-0.1.1.dist-info/WHEEL +4 -0
- lethe_cli-0.1.1.dist-info/entry_points.txt +2 -0
- lethe_cli-0.1.1.dist-info/licenses/LICENSE +21 -0
lethe/__init__.py
ADDED
lethe/cli.py
ADDED
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
"""Lethe CLI: data anonymization for structured files."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Annotated, Optional
|
|
7
|
+
|
|
8
|
+
import typer
|
|
9
|
+
|
|
10
|
+
from lethe.config import DEFAULT_CHUNK_SIZE, DEFAULT_LOCALE, DEFAULT_THRESHOLD, LetheConfig
|
|
11
|
+
from lethe.multiply_pipeline import run_multiply
|
|
12
|
+
from lethe.pipeline import run_pipeline
|
|
13
|
+
|
|
14
|
+
# Top-level Typer application; subcommands are registered below via @app.command().
app = typer.Typer(
    name="lethe",
    help="Anonymize PII in structured data files.",
    no_args_is_help=True,
    invoke_without_command=True,
)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@app.callback()
def main() -> None:
    """Lethe: data anonymization for structured files."""
    # Intentionally empty: registering a callback makes Typer expose
    # 'anonymize' and 'multiply' as named subcommands.
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@app.command()
def anonymize(
    input_file: Annotated[
        Path,
        typer.Argument(help="Input data file to anonymize (CSV, TSV, or TXT).", exists=True, readable=True),
    ],
    output: Annotated[
        Optional[Path],
        typer.Option("-o", "--output", help="Output file path. Defaults to <input>_anonymized.<ext>."),
    ] = None,
    model: Annotated[
        str,
        typer.Option("--model", "-m", help="spaCy model: 'trf' (transformer, default) or 'sm' (small, fast)."),
    ] = "trf",
    threshold: Annotated[
        float,
        typer.Option("--threshold", "-t", help="Minimum confidence score to treat a cell as PII."),
    ] = DEFAULT_THRESHOLD,
    chunk_size: Annotated[
        int,
        typer.Option("--chunk-size", help="Rows per chunk for streaming."),
    ] = DEFAULT_CHUNK_SIZE,
    locale: Annotated[
        str,
        typer.Option("--locale", help="Faker locale for generated replacement values."),
    ] = DEFAULT_LOCALE,
    seed: Annotated[
        Optional[int],
        typer.Option("--seed", help="Random seed for reproducible fake data."),
    ] = None,
    clean: Annotated[
        bool,
        typer.Option("--clean", help="Delete the original input file after successful anonymization."),
    ] = False,
    confirm_clean: Annotated[
        bool,
        typer.Option("--confirm-clean", help="Required confirmation for --clean. Destructive actions must be explicitly confirmed."),
    ] = False,
) -> None:
    """Anonymize PII in a data file (CSV, TSV, or TXT).

    Validates CLI arguments, builds a LetheConfig, and delegates the work
    to run_pipeline. When both --clean and --confirm-clean are given, the
    original input file is deleted after the pipeline completes.

    Exits with status 1 on an unknown model alias or when --clean is used
    without --confirm-clean.
    """
    # Validate the model alias by hand (plain str option, not an enum choice).
    if model not in ("trf", "sm"):
        typer.echo(f"Error: model must be 'trf' or 'sm', got '{model}'", err=True)
        raise typer.Exit(1)

    # Destructive deletion requires an explicit second flag as confirmation.
    if clean and not confirm_clean:
        typer.echo("Error: --clean requires --confirm-clean to confirm deletion of the input file.", err=True)
        raise typer.Exit(1)

    config = LetheConfig(
        model=model,
        threshold=threshold,
        chunk_size=chunk_size,
        locale=locale,
        seed=seed,
    )

    run_pipeline(input_file, output, config)

    # Only reached if run_pipeline did not raise: safe to delete the original.
    if clean:
        input_file.unlink()
        typer.echo(f"Deleted original file: {input_file}")
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
@app.command()
def multiply(
    input_file: Annotated[
        Path,
        typer.Argument(help="Input data file to multiply (CSV or TSV).", exists=True, readable=True),
    ],
    factor: Annotated[
        int,
        typer.Option("--factor", "-f", help="Multiplication factor (output rows = input rows * factor)."),
    ] = 3,
    output: Annotated[
        Optional[Path],
        typer.Option("-o", "--output", help="Output file path. Defaults to <input>_multiplied.<ext>."),
    ] = None,
    locale: Annotated[
        str,
        typer.Option("--locale", help="Faker locale for generated replacement values."),
    ] = DEFAULT_LOCALE,
    seed: Annotated[
        Optional[int],
        typer.Option("--seed", help="Random seed for reproducible fake data."),
    ] = None,
    sanitize: Annotated[
        bool,
        typer.Option("--sanitize", help="Validate and fix emails and URLs to be RFC-compliant ASCII."),
    ] = False,
    clean: Annotated[
        bool,
        typer.Option("--clean", help="Delete the original input file after successful multiplication."),
    ] = False,
    confirm_clean: Annotated[
        bool,
        typer.Option("--confirm-clean", help="Required confirmation for --clean. Destructive actions must be explicitly confirmed."),
    ] = False,
) -> None:
    """Multiply a dataset with synthetic rows (CSV or TSV).

    Validates arguments and delegates to run_multiply. Exits with status 1
    when factor < 1, or when --clean is given without --confirm-clean.
    """
    if factor < 1:
        typer.echo("Error: factor must be >= 1", err=True)
        raise typer.Exit(1)

    # Destructive deletion requires an explicit second flag as confirmation.
    if clean and not confirm_clean:
        typer.echo("Error: --clean requires --confirm-clean to confirm deletion of the input file.", err=True)
        raise typer.Exit(1)

    run_multiply(input_file, output, factor, locale=locale, seed=seed, sanitize=sanitize)

    # Only reached when run_multiply completed without raising.
    if clean:
        input_file.unlink()
        typer.echo(f"Deleted original file: {input_file}")
|
lethe/config.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
"""Configuration for Lethe anonymization runs."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
|
|
7
|
+
# Maps CLI-facing model aliases to installable spaCy pipeline names.
SPACY_MODELS = {
    "trf": "en_core_web_trf",
    "sm": "en_core_web_sm",
}

# Minimum confidence score for a cell to be treated as PII.
DEFAULT_THRESHOLD = 0.35
# Rows processed per chunk when streaming large files.
DEFAULT_CHUNK_SIZE = 5000
# Faker locale used for generated replacement values.
DEFAULT_LOCALE = "en_US"
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclass(frozen=True)
class LetheConfig:
    """Immutable settings for a single anonymization run."""

    model: str = "trf"  # spaCy model alias; must be a key of SPACY_MODELS
    threshold: float = DEFAULT_THRESHOLD  # min confidence to treat a cell as PII
    chunk_size: int = DEFAULT_CHUNK_SIZE  # rows per streamed chunk
    locale: str = DEFAULT_LOCALE  # Faker locale for replacement values
    seed: int | None = None  # optional seed for reproducible fake data

    @property
    def spacy_model(self) -> str:
        """Full spaCy pipeline name for the configured alias.

        Raises KeyError for an unknown alias; the CLI validates the alias
        before constructing a config.
        """
        return SPACY_MODELS[self.model]
|
|
File without changes
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
"""Session-scoped mapping index: original value -> consistent fake replacement."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from faker import Faker
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
# Maps entity type names (e.g. PERSON, EMAIL_ADDRESS) to the Faker method
# used to generate a replacement value. Unknown types fall back to 'text'
# in SessionIndex._generate.
FAKER_GENERATORS: dict[str, str] = {
    "PERSON": "name",
    "EMAIL_ADDRESS": "email",
    "PHONE_NUMBER": "phone_number",
    "LOCATION": "address",
    "US_SSN": "ssn",
    "CREDIT_CARD": "credit_card_number",
    "DATE_TIME": "date",
    "IP_ADDRESS": "ipv4",
    "IBAN_CODE": "iban",
    "US_DRIVER_LICENSE": "bothify",
    "US_PASSPORT": "bothify",
    "UK_NINO": "bothify",
    "NL_BSN": "numerify",
    "URL": "url",
    "CRYPTO": "sha1",
}

# Extra keyword arguments for pattern-based generators. In Faker templates
# '?' is a random letter and '#' a random digit.
FAKER_GENERATOR_KWARGS: dict[str, dict] = {
    "US_DRIVER_LICENSE": {"text": "??#######"},
    "US_PASSPORT": {"text": "#########"},
    "UK_NINO": {"text": "??######?"},
    "NL_BSN": {"text": "#########"},
}
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class SessionIndex:
|
|
35
|
+
"""Maintains a dict mapping (entity_type, original) -> fake value.
|
|
36
|
+
|
|
37
|
+
Ensures cross-table consistency: the same original value always maps
|
|
38
|
+
to the same fake replacement within a session.
|
|
39
|
+
"""
|
|
40
|
+
|
|
41
|
+
def __init__(self, locale: str = "en_US", seed: int | None = None) -> None:
|
|
42
|
+
self._faker = Faker(locale)
|
|
43
|
+
if seed is not None:
|
|
44
|
+
Faker.seed(seed)
|
|
45
|
+
self._index: dict[tuple[str, str], str] = {}
|
|
46
|
+
|
|
47
|
+
def get_or_create(self, entity_type: str, original: str) -> str:
|
|
48
|
+
key = (entity_type, original)
|
|
49
|
+
if key in self._index:
|
|
50
|
+
return self._index[key]
|
|
51
|
+
|
|
52
|
+
fake_value = self._generate(entity_type)
|
|
53
|
+
self._index[key] = fake_value
|
|
54
|
+
return fake_value
|
|
55
|
+
|
|
56
|
+
def _generate(self, entity_type: str) -> str:
|
|
57
|
+
method_name = FAKER_GENERATORS.get(entity_type, "text")
|
|
58
|
+
method = getattr(self._faker, method_name)
|
|
59
|
+
kwargs = FAKER_GENERATOR_KWARGS.get(entity_type, {})
|
|
60
|
+
result = method(**kwargs) if kwargs else method()
|
|
61
|
+
return str(result)
|
|
62
|
+
|
|
63
|
+
@property
|
|
64
|
+
def mapping_count(self) -> int:
|
|
65
|
+
return len(self._index)
|
lethe/multiplier.py
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
"""Multiply datasets: produce N * factor rows with synthetic data."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import random
|
|
6
|
+
|
|
7
|
+
import pandas as pd
|
|
8
|
+
|
|
9
|
+
from lethe.mapping.session_index import SessionIndex
|
|
10
|
+
from lethe.scanner.column_heuristics import infer_pii_type
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class Multiplier:
|
|
14
|
+
"""Takes an input DataFrame and generates a multiplied version.
|
|
15
|
+
|
|
16
|
+
Columns are classified using heuristics:
|
|
17
|
+
- PII columns: fresh Faker values for every row
|
|
18
|
+
- ID columns: sequential integers continuing from max existing value
|
|
19
|
+
- Sample columns (non-PII / unknown): sampled from existing values
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
def __init__(
|
|
23
|
+
self,
|
|
24
|
+
df: pd.DataFrame,
|
|
25
|
+
locale: str = "en_US",
|
|
26
|
+
seed: int | None = None,
|
|
27
|
+
) -> None:
|
|
28
|
+
self._df = df
|
|
29
|
+
self._session = SessionIndex(locale=locale, seed=seed)
|
|
30
|
+
if seed is not None:
|
|
31
|
+
random.seed(seed)
|
|
32
|
+
self._pii_columns: dict[str, str] = {}
|
|
33
|
+
self._id_columns: list[str] = []
|
|
34
|
+
self._sample_columns: list[str] = []
|
|
35
|
+
self.classify_columns()
|
|
36
|
+
|
|
37
|
+
def classify_columns(self) -> None:
|
|
38
|
+
for col in self._df.columns:
|
|
39
|
+
hint = infer_pii_type(col)
|
|
40
|
+
if hint is None:
|
|
41
|
+
self._sample_columns.append(col)
|
|
42
|
+
elif hint == "SKIP":
|
|
43
|
+
if self._is_id_column(col):
|
|
44
|
+
self._id_columns.append(col)
|
|
45
|
+
else:
|
|
46
|
+
self._sample_columns.append(col)
|
|
47
|
+
else:
|
|
48
|
+
self._pii_columns[col] = hint
|
|
49
|
+
|
|
50
|
+
def _is_id_column(self, col: str) -> bool:
|
|
51
|
+
"""Check if a SKIP column is specifically an ID/primary-key column."""
|
|
52
|
+
series = self._df[col]
|
|
53
|
+
if not pd.api.types.is_numeric_dtype(series):
|
|
54
|
+
return False
|
|
55
|
+
clean = series.dropna()
|
|
56
|
+
if clean.empty:
|
|
57
|
+
return False
|
|
58
|
+
as_int = clean.astype(int)
|
|
59
|
+
return as_int.is_unique and as_int.is_monotonic_increasing
|
|
60
|
+
|
|
61
|
+
def generate(self, factor: int) -> pd.DataFrame:
|
|
62
|
+
"""Return a DataFrame with original_rows * factor total rows.
|
|
63
|
+
|
|
64
|
+
Rows 0..N-1 are the originals with PII replaced (anonymized).
|
|
65
|
+
Rows N..N*factor-1 are fully synthetic.
|
|
66
|
+
"""
|
|
67
|
+
n_original = len(self._df)
|
|
68
|
+
n_synthetic = n_original * (factor - 1)
|
|
69
|
+
rows: dict[str, list] = {col: [] for col in self._df.columns}
|
|
70
|
+
|
|
71
|
+
# Phase 1: anonymize original rows
|
|
72
|
+
for _, row in self._df.iterrows():
|
|
73
|
+
for col in self._df.columns:
|
|
74
|
+
val = row[col]
|
|
75
|
+
if col in self._pii_columns:
|
|
76
|
+
rows[col].append(self._session._generate(self._pii_columns[col]))
|
|
77
|
+
elif col in self._id_columns:
|
|
78
|
+
rows[col].append(val)
|
|
79
|
+
else:
|
|
80
|
+
rows[col].append(val)
|
|
81
|
+
|
|
82
|
+
# Phase 2: generate synthetic rows
|
|
83
|
+
id_counters: dict[str, int] = {}
|
|
84
|
+
for col in self._id_columns:
|
|
85
|
+
id_counters[col] = int(self._df[col].max()) + 1
|
|
86
|
+
|
|
87
|
+
col_values: dict[str, list] = {}
|
|
88
|
+
for col in self._sample_columns:
|
|
89
|
+
col_values[col] = self._df[col].dropna().tolist()
|
|
90
|
+
|
|
91
|
+
for _ in range(n_synthetic):
|
|
92
|
+
for col in self._df.columns:
|
|
93
|
+
if col in self._pii_columns:
|
|
94
|
+
rows[col].append(self._session._generate(self._pii_columns[col]))
|
|
95
|
+
elif col in self._id_columns:
|
|
96
|
+
rows[col].append(id_counters[col])
|
|
97
|
+
id_counters[col] += 1
|
|
98
|
+
else:
|
|
99
|
+
pool = col_values.get(col, [])
|
|
100
|
+
rows[col].append(random.choice(pool) if pool else None)
|
|
101
|
+
|
|
102
|
+
return pd.DataFrame(rows)
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
"""Pipeline for the multiply command: read -> multiply -> write."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
import pandas as pd
|
|
8
|
+
from rich.console import Console
|
|
9
|
+
|
|
10
|
+
from lethe.multiplier import Multiplier
|
|
11
|
+
from lethe.parsers import TextFormat, detect_format, make_writer
|
|
12
|
+
from lethe.sanitizer import sanitize_dataframe
|
|
13
|
+
|
|
14
|
+
# Shared Rich console for this module's progress/status output.
console = Console()
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def default_multiply_output(input_path: Path) -> Path:
    """Derive the default output path: '<stem>_multiplied<suffix>' beside the input."""
    return input_path.with_name(f"{input_path.stem}_multiplied{input_path.suffix}")
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def run_multiply(
    input_path: Path,
    output_path: Path | None,
    factor: int,
    locale: str = "en_US",
    seed: int | None = None,
    sanitize: bool = False,
) -> Path:
    """Read a CSV/TSV file, multiply its rows, and write the result.

    Args:
        input_path: Source file; must be columnar (CSV or TSV).
        output_path: Destination; defaults to <input>_multiplied.<ext>.
        factor: Output rows = input rows * factor.
        locale: Faker locale for generated values.
        seed: Optional seed for reproducible output.
        sanitize: When True, post-process emails/URLs in PII columns.

    Returns:
        The path the multiplied data was written to.

    Raises:
        SystemExit: When the detected format is free-form or
            one-value-per-line, which cannot be multiplied column-wise.
    """
    if output_path is None:
        output_path = default_multiply_output(input_path)

    fmt = detect_format(input_path)

    # Multiplication requires columnar data; reject text-only formats early.
    if fmt in (TextFormat.FREEFORM, TextFormat.ONE_VALUE_PER_LINE):
        console.print(
            f"[red]Error:[/red] Cannot multiply {fmt.value} files. "
            "Multiply only supports CSV and TSV formats with columnar data.",
            highlight=False,
        )
        raise SystemExit(1)

    console.print(f"[bold]Lethe[/bold] multiplying [cyan]{input_path.name}[/cyan]")
    console.print(f" Format: [yellow]{fmt.value}[/yellow]")
    console.print(f" Factor: [yellow]{factor}x[/yellow]")
    console.print(f" Output: [cyan]{output_path}[/cyan]")

    # Entire file is loaded at once: multiplication samples from the full
    # value distribution, so chunked streaming does not apply here.
    sep = "\t" if fmt == TextFormat.TSV else ","
    df = pd.read_csv(input_path, sep=sep)
    n_input = len(df)

    console.print(f" Input rows: [yellow]{n_input}[/yellow]")

    multiplier = Multiplier(df, locale=locale, seed=seed)
    result = multiplier.generate(factor)

    if sanitize:
        console.print(" Sanitizing emails and URLs...", style="dim")
        result = sanitize_dataframe(result, multiplier._pii_columns)

    writer = make_writer(output_path, fmt)
    writer.write_chunk(result)
    writer.close()

    n_output = len(result)
    console.print(
        f"\n [green]Done.[/green] {n_input} input rows -> {n_output} output rows "
        f"({factor}x multiplication)."
    )
    return output_path
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
"""Format detection and parser factory for supported file types."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from enum import Enum
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from lethe.parsers.csv_parser import CsvReader, CsvWriter
|
|
9
|
+
from lethe.parsers.txt_parser import (
|
|
10
|
+
FreeformReader,
|
|
11
|
+
FreeformWriter,
|
|
12
|
+
LineReader,
|
|
13
|
+
LineWriter,
|
|
14
|
+
TsvReader,
|
|
15
|
+
TsvWriter,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class TextFormat(Enum):
    """Supported input/output file formats.

    CSV and TSV are columnar; ONE_VALUE_PER_LINE and FREEFORM are the two
    sub-formats detected inside .txt files by content heuristics.
    """

    CSV = "csv"
    TSV = "tsv"
    ONE_VALUE_PER_LINE = "one_value_per_line"
    FREEFORM = "freeform"
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def detect_format(path: Path) -> TextFormat:
    """Detect file format from extension and content heuristics."""
    suffix = path.suffix.lower()

    explicit = {".csv": TextFormat.CSV, ".tsv": TextFormat.TSV}
    if suffix in explicit:
        return explicit[suffix]

    # .txt files need a content inspection to pick the sub-format.
    if suffix == ".txt":
        return _detect_txt_format(path)

    # Unknown extensions fall back to CSV, the most common columnar format.
    return TextFormat.CSV
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _detect_txt_format(path: Path) -> TextFormat:
    """Inspect the first 20 lines of a .txt file to pick its sub-format."""
    sample: list[str] = []
    with open(path, encoding="utf-8") as f:
        for lineno, raw in enumerate(f):
            if lineno >= 20:
                break
            sample.append(raw.rstrip("\n\r"))

    non_empty = [line for line in sample if line.strip()]
    if not non_empty:
        # Empty or whitespace-only files carry no structure to detect.
        return TextFormat.FREEFORM

    # Tab-separated: every non-empty line has the same, nonzero tab count.
    tab_counts = {line.count("\t") for line in non_empty}
    if len(tab_counts) == 1 and tab_counts.pop() >= 1:
        return TextFormat.TSV

    # Short lines with few words look like a plain list of values.
    avg_chars = sum(len(line) for line in non_empty) / len(non_empty)
    max_words = max(len(line.split()) for line in non_empty)
    if avg_chars < 60 and max_words <= 4:
        return TextFormat.ONE_VALUE_PER_LINE

    return TextFormat.FREEFORM
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def make_reader(path: Path, fmt: TextFormat, chunk_size: int = 5000):
    """Create the appropriate reader for a given format."""
    readers = {
        TextFormat.CSV: CsvReader,
        TextFormat.TSV: TsvReader,
        TextFormat.ONE_VALUE_PER_LINE: LineReader,
        TextFormat.FREEFORM: FreeformReader,
    }
    # Unknown formats default to CSV, mirroring detect_format's fallback.
    reader_cls = readers.get(fmt, CsvReader)
    return reader_cls(path, chunk_size=chunk_size)
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def make_writer(path: Path, fmt: TextFormat):
    """Create the appropriate writer for a given format."""
    writers = {
        TextFormat.CSV: CsvWriter,
        TextFormat.TSV: TsvWriter,
        TextFormat.ONE_VALUE_PER_LINE: LineWriter,
        TextFormat.FREEFORM: FreeformWriter,
    }
    # Unknown formats default to CSV, mirroring detect_format's fallback.
    return writers.get(fmt, CsvWriter)(path)
|
lethe/parsers/base.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""Protocols for chunked file I/O."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Iterator, Protocol
|
|
6
|
+
|
|
7
|
+
import pandas as pd
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class ChunkedReader(Protocol):
    """Structural interface: yields a file's contents as DataFrame chunks."""

    def read_chunks(self) -> Iterator[pd.DataFrame]: ...
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class ChunkedWriter(Protocol):
    """Structural interface: appends DataFrame chunks to a file, then closes it."""

    def write_chunk(self, chunk: pd.DataFrame, *, header: bool = False) -> None: ...

    def close(self) -> None: ...
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""Chunked CSV reader and writer using pandas."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import csv
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Iterator
|
|
8
|
+
|
|
9
|
+
import pandas as pd
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class CsvReader:
    """Streams a CSV file as DataFrames of at most `chunk_size` rows."""

    def __init__(self, path: Path, chunk_size: int = 5000) -> None:
        self.path = path
        self.chunk_size = chunk_size

    def read_chunks(self) -> Iterator[pd.DataFrame]:
        """Yield successive DataFrame chunks read from `self.path`."""
        with pd.read_csv(self.path, chunksize=self.chunk_size) as chunks:
            for frame in chunks:
                yield frame
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class CsvWriter:
    """Appends DataFrame chunks to a CSV file, quoting every field."""

    def __init__(self, path: Path) -> None:
        self.path = path
        self._first = True  # next write truncates the file and emits the header

    def write_chunk(self, chunk: pd.DataFrame, *, header: bool = False) -> None:
        """Append `chunk`; the first call creates the file and writes a header."""
        if self._first:
            mode, write_header = "w", True
        else:
            mode, write_header = "a", header
        chunk.to_csv(
            self.path,
            mode=mode,
            header=write_header,
            index=False,
            quoting=csv.QUOTE_ALL,
        )
        self._first = False

    def close(self) -> None:
        """No-op: each write_chunk opens and closes the file itself."""
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
"""Parsers for text-based file formats: TSV, one-value-per-line, free-form."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Iterator
|
|
7
|
+
|
|
8
|
+
import pandas as pd
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class TsvReader:
    """Streams a tab-separated file as DataFrames of at most `chunk_size` rows."""

    def __init__(self, path: Path, chunk_size: int = 5000) -> None:
        self.path = path
        self.chunk_size = chunk_size

    def read_chunks(self) -> Iterator[pd.DataFrame]:
        """Yield successive DataFrame chunks read from `self.path`."""
        with pd.read_csv(self.path, sep="\t", chunksize=self.chunk_size) as chunks:
            for frame in chunks:
                yield frame
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class TsvWriter:
    """Appends DataFrame chunks to a tab-separated file."""

    def __init__(self, path: Path) -> None:
        self.path = path
        self._first = True  # next write truncates the file and emits the header

    def write_chunk(self, chunk: pd.DataFrame, *, header: bool = False) -> None:
        """Append `chunk`; the first call creates the file and writes a header."""
        if self._first:
            mode, write_header = "w", True
        else:
            mode, write_header = "a", header
        chunk.to_csv(
            self.path,
            mode=mode,
            header=write_header,
            index=False,
            sep="\t",
        )
        self._first = False

    def close(self) -> None:
        """No-op: each write_chunk opens and closes the file itself."""
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class LineReader:
    """Reads a one-value-per-line text file into single-column DataFrames."""

    def __init__(self, path: Path, chunk_size: int = 5000) -> None:
        self.path = path
        self.chunk_size = chunk_size

    def read_chunks(self) -> Iterator[pd.DataFrame]:
        """Yield DataFrames with a 'value' column; blank lines are skipped."""
        pending: list[str] = []
        with open(self.path, encoding="utf-8") as handle:
            for raw in handle:
                value = raw.rstrip("\n\r")
                if value:
                    pending.append(value)
                    if len(pending) == self.chunk_size:
                        yield pd.DataFrame({"value": pending})
                        pending = []
        if pending:
            yield pd.DataFrame({"value": pending})
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class LineWriter:
    """Writes a 'value' column back out as one value per line."""

    def __init__(self, path: Path) -> None:
        self.path = path
        self._first = True  # first write truncates; later writes append

    def write_chunk(self, chunk: pd.DataFrame, *, header: bool = False) -> None:
        """Append the chunk's 'value' column, one entry per line (no header)."""
        with open(self.path, "w" if self._first else "a", encoding="utf-8") as handle:
            handle.writelines(f"{value}\n" for value in chunk["value"])
        self._first = False

    def close(self) -> None:
        """No-op: the file is opened and closed per write_chunk call."""
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
class FreeformReader:
    """Reads arbitrary text line-by-line into single-column DataFrames."""

    def __init__(self, path: Path, chunk_size: int = 5000) -> None:
        self.path = path
        self.chunk_size = chunk_size

    def read_chunks(self) -> Iterator[pd.DataFrame]:
        """Yield DataFrames with a 'text' column; blank lines are preserved."""
        pending: list[str] = []
        with open(self.path, encoding="utf-8") as handle:
            for raw in handle:
                pending.append(raw.rstrip("\n\r"))
                if len(pending) == self.chunk_size:
                    yield pd.DataFrame({"text": pending})
                    pending = []
        if pending:
            yield pd.DataFrame({"text": pending})
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
class FreeformWriter:
    """Writes a 'text' column back out as plain lines of text."""

    def __init__(self, path: Path) -> None:
        self.path = path
        self._first = True  # first write truncates; later writes append

    def write_chunk(self, chunk: pd.DataFrame, *, header: bool = False) -> None:
        """Append the chunk's 'text' column, one entry per line (no header)."""
        with open(self.path, "w" if self._first else "a", encoding="utf-8") as handle:
            handle.writelines(f"{text}\n" for text in chunk["text"])
        self._first = False

    def close(self) -> None:
        """No-op: the file is opened and closed per write_chunk call."""
|