lethe-cli 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lethe/__init__.py ADDED
@@ -0,0 +1,3 @@
1
"""Lethe: data anonymization CLI tool."""

# Single source of truth for the package version; keep in sync with the
# distribution metadata when releasing.
__version__ = "0.1.1"
lethe/cli.py ADDED
@@ -0,0 +1,138 @@
1
+ """Lethe CLI: data anonymization for structured files."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+ from typing import Annotated, Optional
7
+
8
+ import typer
9
+
10
+ from lethe.config import DEFAULT_CHUNK_SIZE, DEFAULT_LOCALE, DEFAULT_THRESHOLD, LetheConfig
11
+ from lethe.multiply_pipeline import run_multiply
12
+ from lethe.pipeline import run_pipeline
13
+
14
# Root CLI application. `no_args_is_help` prints usage when invoked bare;
# `invoke_without_command` lets the callback below run so the group-level
# help/docstring is honored even with no subcommand.
app = typer.Typer(
    name="lethe",
    help="Anonymize PII in structured data files.",
    no_args_is_help=True,
    invoke_without_command=True,
)
20
+
21
+
22
@app.callback()
def main() -> None:
    """Lethe: data anonymization for structured files."""
    # Intentionally empty: the callback exists so Typer treats `anonymize`
    # and `multiply` as subcommands and uses this docstring as top-level help.
25
+
26
+
27
@app.command()
def anonymize(
    input_file: Annotated[
        Path,
        typer.Argument(help="Input data file to anonymize (CSV, TSV, or TXT).", exists=True, readable=True),
    ],
    output: Annotated[
        Optional[Path],
        typer.Option("-o", "--output", help="Output file path. Defaults to <input>_anonymized.<ext>."),
    ] = None,
    model: Annotated[
        str,
        typer.Option("--model", "-m", help="spaCy model: 'trf' (transformer, default) or 'sm' (small, fast)."),
    ] = "trf",
    threshold: Annotated[
        float,
        typer.Option("--threshold", "-t", help="Minimum confidence score to treat a cell as PII."),
    ] = DEFAULT_THRESHOLD,
    chunk_size: Annotated[
        int,
        typer.Option("--chunk-size", help="Rows per chunk for streaming."),
    ] = DEFAULT_CHUNK_SIZE,
    locale: Annotated[
        str,
        typer.Option("--locale", help="Faker locale for generated replacement values."),
    ] = DEFAULT_LOCALE,
    seed: Annotated[
        Optional[int],
        typer.Option("--seed", help="Random seed for reproducible fake data."),
    ] = None,
    clean: Annotated[
        bool,
        typer.Option("--clean", help="Delete the original input file after successful anonymization."),
    ] = False,
    confirm_clean: Annotated[
        bool,
        typer.Option("--confirm-clean", help="Required confirmation for --clean. Destructive actions must be explicitly confirmed."),
    ] = False,
) -> None:
    """Anonymize PII in a data file (CSV, TSV, or TXT)."""
    # Validate option values up front so we fail before any file I/O.
    if model not in ("trf", "sm"):
        typer.echo(f"Error: model must be 'trf' or 'sm', got '{model}'", err=True)
        raise typer.Exit(1)

    # --clean is destructive; require an explicit second flag as confirmation.
    if clean and not confirm_clean:
        typer.echo("Error: --clean requires --confirm-clean to confirm deletion of the input file.", err=True)
        raise typer.Exit(1)

    # Bundle all tuning knobs into one immutable config object for the pipeline.
    config = LetheConfig(
        model=model,
        threshold=threshold,
        chunk_size=chunk_size,
        locale=locale,
        seed=seed,
    )

    run_pipeline(input_file, output, config)

    # Deletion happens only after run_pipeline has returned, so the
    # anonymized output exists before the original is removed.
    if clean:
        input_file.unlink()
        typer.echo(f"Deleted original file: {input_file}")
88
+
89
+
90
@app.command()
def multiply(
    input_file: Annotated[
        Path,
        typer.Argument(help="Input data file to multiply (CSV or TSV).", exists=True, readable=True),
    ],
    factor: Annotated[
        int,
        typer.Option("--factor", "-f", help="Multiplication factor (output rows = input rows * factor)."),
    ] = 3,
    output: Annotated[
        Optional[Path],
        typer.Option("-o", "--output", help="Output file path. Defaults to <input>_multiplied.<ext>."),
    ] = None,
    locale: Annotated[
        str,
        typer.Option("--locale", help="Faker locale for generated replacement values."),
    ] = DEFAULT_LOCALE,
    seed: Annotated[
        Optional[int],
        typer.Option("--seed", help="Random seed for reproducible fake data."),
    ] = None,
    sanitize: Annotated[
        bool,
        typer.Option("--sanitize", help="Validate and fix emails and URLs to be RFC-compliant ASCII."),
    ] = False,
    clean: Annotated[
        bool,
        typer.Option("--clean", help="Delete the original input file after successful multiplication."),
    ] = False,
    confirm_clean: Annotated[
        bool,
        typer.Option("--confirm-clean", help="Required confirmation for --clean. Destructive actions must be explicitly confirmed."),
    ] = False,
) -> None:
    """Multiply a dataset with synthetic rows (CSV or TSV)."""
    # Guard against nonsensical factors before touching the input file.
    if factor < 1:
        typer.echo("Error: factor must be >= 1", err=True)
        raise typer.Exit(1)

    # --clean is destructive; require the explicit confirmation flag.
    if clean and not confirm_clean:
        typer.echo("Error: --clean requires --confirm-clean to confirm deletion of the input file.", err=True)
        raise typer.Exit(1)

    run_multiply(input_file, output, factor, locale=locale, seed=seed, sanitize=sanitize)

    # Deletion happens only after run_multiply has returned, so the
    # multiplied output exists before the original is removed.
    if clean:
        input_file.unlink()
        typer.echo(f"Deleted original file: {input_file}")
lethe/config.py ADDED
@@ -0,0 +1,27 @@
1
+ """Configuration for Lethe anonymization runs."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+
7
# Mapping from CLI model aliases to installable spaCy package names.
SPACY_MODELS = {
    "trf": "en_core_web_trf",
    "sm": "en_core_web_sm",
}

# Default knobs shared by the CLI and the pipeline.
DEFAULT_THRESHOLD = 0.35   # minimum confidence for treating a cell as PII
DEFAULT_CHUNK_SIZE = 5000  # rows per streamed chunk
DEFAULT_LOCALE = "en_US"   # Faker locale for replacement values


@dataclass(frozen=True)
class LetheConfig:
    """Immutable settings for a single anonymization run.

    Raises:
        ValueError: if ``model`` is not a key of ``SPACY_MODELS``.
    """

    model: str = "trf"
    threshold: float = DEFAULT_THRESHOLD
    chunk_size: int = DEFAULT_CHUNK_SIZE
    locale: str = DEFAULT_LOCALE
    seed: int | None = None

    def __post_init__(self) -> None:
        # Fail fast on an unknown model alias instead of deferring to a
        # bare KeyError the first time `spacy_model` is accessed.
        if self.model not in SPACY_MODELS:
            valid = ", ".join(sorted(SPACY_MODELS))
            raise ValueError(f"model must be one of: {valid} (got '{self.model}')")

    @property
    def spacy_model(self) -> str:
        """Full spaCy package name for the configured model alias."""
        return SPACY_MODELS[self.model]
File without changes
@@ -0,0 +1,65 @@
1
+ """Session-scoped mapping index: original value -> consistent fake replacement."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from faker import Faker
6
+
7
+
8
# Maps entity-type labels to the Faker generator method used to fabricate a
# replacement. Types not listed here fall back to Faker's generic "text"
# (see SessionIndex._generate).
FAKER_GENERATORS: dict[str, str] = {
    "PERSON": "name",
    "EMAIL_ADDRESS": "email",
    "PHONE_NUMBER": "phone_number",
    "LOCATION": "address",
    "US_SSN": "ssn",
    "CREDIT_CARD": "credit_card_number",
    "DATE_TIME": "date",
    "IP_ADDRESS": "ipv4",
    "IBAN_CODE": "iban",
    "US_DRIVER_LICENSE": "bothify",
    "US_PASSPORT": "bothify",
    "UK_NINO": "bothify",
    "NL_BSN": "numerify",
    "URL": "url",
    "CRYPTO": "sha1",
}

# Extra keyword arguments for pattern-based generators. In Faker patterns,
# '#' is replaced with a random digit and '?' with a random letter.
FAKER_GENERATOR_KWARGS: dict[str, dict] = {
    "US_DRIVER_LICENSE": {"text": "??#######"},
    "US_PASSPORT": {"text": "#########"},
    "UK_NINO": {"text": "??######?"},
    "NL_BSN": {"text": "#########"},
}
32
+
33
+
34
class SessionIndex:
    """Maintains a dict mapping (entity_type, original) -> fake value.

    Ensures cross-table consistency: the same original value always maps
    to the same fake replacement within a session.
    """

    def __init__(self, locale: str = "en_US", seed: int | None = None) -> None:
        self._faker = Faker(locale)
        if seed is not None:
            # Faker.seed() is a classmethod that seeds the shared random
            # source, making generated values reproducible.
            Faker.seed(seed)
        self._index: dict[tuple[str, str], str] = {}

    def get_or_create(self, entity_type: str, original: str) -> str:
        """Return the cached replacement for this pair, creating it on first use."""
        key = (entity_type, original)
        try:
            return self._index[key]
        except KeyError:
            replacement = self._generate(entity_type)
            self._index[key] = replacement
            return replacement

    def _generate(self, entity_type: str) -> str:
        """Produce a fresh fake value for the given entity type."""
        # Unknown entity types fall back to Faker's generic "text".
        generator = getattr(self._faker, FAKER_GENERATORS.get(entity_type, "text"))
        extra = FAKER_GENERATOR_KWARGS.get(entity_type)
        produced = generator(**extra) if extra else generator()
        return str(produced)

    @property
    def mapping_count(self) -> int:
        """Number of distinct (entity_type, original) pairs replaced so far."""
        return len(self._index)
lethe/multiplier.py ADDED
@@ -0,0 +1,102 @@
1
+ """Multiply datasets: produce N * factor rows with synthetic data."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import random
6
+
7
+ import pandas as pd
8
+
9
+ from lethe.mapping.session_index import SessionIndex
10
+ from lethe.scanner.column_heuristics import infer_pii_type
11
+
12
+
13
+ class Multiplier:
14
+ """Takes an input DataFrame and generates a multiplied version.
15
+
16
+ Columns are classified using heuristics:
17
+ - PII columns: fresh Faker values for every row
18
+ - ID columns: sequential integers continuing from max existing value
19
+ - Sample columns (non-PII / unknown): sampled from existing values
20
+ """
21
+
22
+ def __init__(
23
+ self,
24
+ df: pd.DataFrame,
25
+ locale: str = "en_US",
26
+ seed: int | None = None,
27
+ ) -> None:
28
+ self._df = df
29
+ self._session = SessionIndex(locale=locale, seed=seed)
30
+ if seed is not None:
31
+ random.seed(seed)
32
+ self._pii_columns: dict[str, str] = {}
33
+ self._id_columns: list[str] = []
34
+ self._sample_columns: list[str] = []
35
+ self.classify_columns()
36
+
37
+ def classify_columns(self) -> None:
38
+ for col in self._df.columns:
39
+ hint = infer_pii_type(col)
40
+ if hint is None:
41
+ self._sample_columns.append(col)
42
+ elif hint == "SKIP":
43
+ if self._is_id_column(col):
44
+ self._id_columns.append(col)
45
+ else:
46
+ self._sample_columns.append(col)
47
+ else:
48
+ self._pii_columns[col] = hint
49
+
50
+ def _is_id_column(self, col: str) -> bool:
51
+ """Check if a SKIP column is specifically an ID/primary-key column."""
52
+ series = self._df[col]
53
+ if not pd.api.types.is_numeric_dtype(series):
54
+ return False
55
+ clean = series.dropna()
56
+ if clean.empty:
57
+ return False
58
+ as_int = clean.astype(int)
59
+ return as_int.is_unique and as_int.is_monotonic_increasing
60
+
61
+ def generate(self, factor: int) -> pd.DataFrame:
62
+ """Return a DataFrame with original_rows * factor total rows.
63
+
64
+ Rows 0..N-1 are the originals with PII replaced (anonymized).
65
+ Rows N..N*factor-1 are fully synthetic.
66
+ """
67
+ n_original = len(self._df)
68
+ n_synthetic = n_original * (factor - 1)
69
+ rows: dict[str, list] = {col: [] for col in self._df.columns}
70
+
71
+ # Phase 1: anonymize original rows
72
+ for _, row in self._df.iterrows():
73
+ for col in self._df.columns:
74
+ val = row[col]
75
+ if col in self._pii_columns:
76
+ rows[col].append(self._session._generate(self._pii_columns[col]))
77
+ elif col in self._id_columns:
78
+ rows[col].append(val)
79
+ else:
80
+ rows[col].append(val)
81
+
82
+ # Phase 2: generate synthetic rows
83
+ id_counters: dict[str, int] = {}
84
+ for col in self._id_columns:
85
+ id_counters[col] = int(self._df[col].max()) + 1
86
+
87
+ col_values: dict[str, list] = {}
88
+ for col in self._sample_columns:
89
+ col_values[col] = self._df[col].dropna().tolist()
90
+
91
+ for _ in range(n_synthetic):
92
+ for col in self._df.columns:
93
+ if col in self._pii_columns:
94
+ rows[col].append(self._session._generate(self._pii_columns[col]))
95
+ elif col in self._id_columns:
96
+ rows[col].append(id_counters[col])
97
+ id_counters[col] += 1
98
+ else:
99
+ pool = col_values.get(col, [])
100
+ rows[col].append(random.choice(pool) if pool else None)
101
+
102
+ return pd.DataFrame(rows)
@@ -0,0 +1,69 @@
1
+ """Pipeline for the multiply command: read -> multiply -> write."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+
7
+ import pandas as pd
8
+ from rich.console import Console
9
+
10
+ from lethe.multiplier import Multiplier
11
+ from lethe.parsers import TextFormat, detect_format, make_writer
12
+ from lethe.sanitizer import sanitize_dataframe
13
+
14
+ console = Console()
15
+
16
+
17
+ def default_multiply_output(input_path: Path) -> Path:
18
+ return input_path.with_stem(input_path.stem + "_multiplied")
19
+
20
+
21
+ def run_multiply(
22
+ input_path: Path,
23
+ output_path: Path | None,
24
+ factor: int,
25
+ locale: str = "en_US",
26
+ seed: int | None = None,
27
+ sanitize: bool = False,
28
+ ) -> Path:
29
+ if output_path is None:
30
+ output_path = default_multiply_output(input_path)
31
+
32
+ fmt = detect_format(input_path)
33
+
34
+ if fmt in (TextFormat.FREEFORM, TextFormat.ONE_VALUE_PER_LINE):
35
+ console.print(
36
+ f"[red]Error:[/red] Cannot multiply {fmt.value} files. "
37
+ "Multiply only supports CSV and TSV formats with columnar data.",
38
+ highlight=False,
39
+ )
40
+ raise SystemExit(1)
41
+
42
+ console.print(f"[bold]Lethe[/bold] multiplying [cyan]{input_path.name}[/cyan]")
43
+ console.print(f" Format: [yellow]{fmt.value}[/yellow]")
44
+ console.print(f" Factor: [yellow]{factor}x[/yellow]")
45
+ console.print(f" Output: [cyan]{output_path}[/cyan]")
46
+
47
+ sep = "\t" if fmt == TextFormat.TSV else ","
48
+ df = pd.read_csv(input_path, sep=sep)
49
+ n_input = len(df)
50
+
51
+ console.print(f" Input rows: [yellow]{n_input}[/yellow]")
52
+
53
+ multiplier = Multiplier(df, locale=locale, seed=seed)
54
+ result = multiplier.generate(factor)
55
+
56
+ if sanitize:
57
+ console.print(" Sanitizing emails and URLs...", style="dim")
58
+ result = sanitize_dataframe(result, multiplier._pii_columns)
59
+
60
+ writer = make_writer(output_path, fmt)
61
+ writer.write_chunk(result)
62
+ writer.close()
63
+
64
+ n_output = len(result)
65
+ console.print(
66
+ f"\n [green]Done.[/green] {n_input} input rows -> {n_output} output rows "
67
+ f"({factor}x multiplication)."
68
+ )
69
+ return output_path
@@ -0,0 +1,96 @@
1
+ """Format detection and parser factory for supported file types."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from enum import Enum
6
+ from pathlib import Path
7
+
8
+ from lethe.parsers.csv_parser import CsvReader, CsvWriter
9
+ from lethe.parsers.txt_parser import (
10
+ FreeformReader,
11
+ FreeformWriter,
12
+ LineReader,
13
+ LineWriter,
14
+ TsvReader,
15
+ TsvWriter,
16
+ )
17
+
18
+
19
class TextFormat(Enum):
    """Supported input layouts. Values are user-facing strings (printed in CLI output)."""

    CSV = "csv"
    TSV = "tsv"
    # One short value per line (e.g. a list of emails) — no columns/header.
    ONE_VALUE_PER_LINE = "one_value_per_line"
    # Arbitrary prose; processed line-by-line as unstructured text.
    FREEFORM = "freeform"
24
+
25
+
26
def detect_format(path: Path) -> TextFormat:
    """Detect file format from extension and content heuristics."""
    suffix = path.suffix.lower()

    by_extension = {".csv": TextFormat.CSV, ".tsv": TextFormat.TSV}
    if suffix in by_extension:
        return by_extension[suffix]

    # Only .txt files need content sniffing to pick a sub-format.
    if suffix == ".txt":
        return _detect_txt_format(path)

    # Unknown extensions are optimistically treated as CSV.
    return TextFormat.CSV
39
+
40
+
41
def _detect_txt_format(path: Path) -> TextFormat:
    """Inspect .txt file content to determine the sub-format."""
    # Sample at most the first 20 lines; that is enough for the heuristics.
    sample: list[str] = []
    with open(path, encoding="utf-8") as handle:
        for raw in handle:
            sample.append(raw.rstrip("\n\r"))
            if len(sample) >= 20:
                break

    content = [line for line in sample if line.strip()]
    if not content:
        # Empty or whitespace-only files default to free-form.
        return TextFormat.FREEFORM

    # A uniform, nonzero tab count on every sampled line signals TSV.
    distinct_tab_counts = {line.count("\t") for line in content}
    if len(distinct_tab_counts) == 1 and min(distinct_tab_counts) >= 1:
        return TextFormat.TSV

    # Short lines with few words look like one value per line.
    avg_len = sum(len(line) for line in content) / len(content)
    widest = max(len(line.split()) for line in content)
    if avg_len < 60 and widest <= 4:
        return TextFormat.ONE_VALUE_PER_LINE

    return TextFormat.FREEFORM
71
+
72
+
73
def make_reader(path: Path, fmt: TextFormat, chunk_size: int = 5000):
    """Create the appropriate reader for a given format."""
    reader_classes = {
        TextFormat.CSV: CsvReader,
        TextFormat.TSV: TsvReader,
        TextFormat.ONE_VALUE_PER_LINE: LineReader,
        TextFormat.FREEFORM: FreeformReader,
    }
    # Unknown formats fall back to CSV, mirroring detect_format's default.
    reader_cls = reader_classes.get(fmt, CsvReader)
    return reader_cls(path, chunk_size=chunk_size)
84
+
85
+
86
def make_writer(path: Path, fmt: TextFormat):
    """Create the appropriate writer for a given format."""
    writer_classes = {
        TextFormat.CSV: CsvWriter,
        TextFormat.TSV: TsvWriter,
        TextFormat.ONE_VALUE_PER_LINE: LineWriter,
        TextFormat.FREEFORM: FreeformWriter,
    }
    # Unknown formats fall back to CSV, mirroring make_reader.
    writer_cls = writer_classes.get(fmt, CsvWriter)
    return writer_cls(path)
lethe/parsers/base.py ADDED
@@ -0,0 +1,17 @@
1
+ """Protocols for chunked file I/O."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Iterator, Protocol
6
+
7
+ import pandas as pd
8
+
9
+
10
class ChunkedReader(Protocol):
    """Structural interface for readers that stream a file as DataFrame chunks."""

    def read_chunks(self) -> Iterator[pd.DataFrame]: ...
12
+
13
+
14
class ChunkedWriter(Protocol):
    """Structural interface for writers that accept DataFrame chunks incrementally."""

    # `header=True` asks for a header row even on non-initial chunks.
    def write_chunk(self, chunk: pd.DataFrame, *, header: bool = False) -> None: ...

    def close(self) -> None: ...
@@ -0,0 +1,39 @@
1
+ """Chunked CSV reader and writer using pandas."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import csv
6
+ from pathlib import Path
7
+ from typing import Iterator
8
+
9
+ import pandas as pd
10
+
11
+
12
class CsvReader:
    """Stream a CSV file as DataFrame chunks of at most `chunk_size` rows."""

    def __init__(self, path: Path, chunk_size: int = 5000) -> None:
        # Reading is deferred until read_chunks() is iterated.
        self.path = path
        self.chunk_size = chunk_size

    def read_chunks(self) -> Iterator[pd.DataFrame]:
        """Yield successive DataFrame chunks read from the file."""
        with pd.read_csv(self.path, chunksize=self.chunk_size) as frames:
            for frame in frames:
                yield frame
20
+
21
+
22
class CsvWriter:
    """Append-mode CSV writer; the first chunk creates the file and writes the header."""

    def __init__(self, path: Path) -> None:
        self.path = path
        self._first = True  # next write truncates/creates the file

    def write_chunk(self, chunk: pd.DataFrame, *, header: bool = False) -> None:
        """Write one chunk; all fields are quoted (csv.QUOTE_ALL)."""
        if self._first:
            mode, emit_header = "w", True
        else:
            mode, emit_header = "a", header
        chunk.to_csv(
            self.path,
            mode=mode,
            header=emit_header,
            index=False,
            quoting=csv.QUOTE_ALL,
        )
        self._first = False

    def close(self) -> None:
        """No-op: each write opens and closes the file itself."""
@@ -0,0 +1,108 @@
1
+ """Parsers for text-based file formats: TSV, one-value-per-line, free-form."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+ from typing import Iterator
7
+
8
+ import pandas as pd
9
+
10
+
11
class TsvReader:
    """Stream a tab-separated file as DataFrame chunks of at most `chunk_size` rows."""

    def __init__(self, path: Path, chunk_size: int = 5000) -> None:
        self.path = path
        self.chunk_size = chunk_size

    def read_chunks(self) -> Iterator[pd.DataFrame]:
        """Yield successive DataFrame chunks read from the file."""
        with pd.read_csv(self.path, sep="\t", chunksize=self.chunk_size) as frames:
            for frame in frames:
                yield frame
19
+
20
+
21
class TsvWriter:
    """Append-mode TSV writer; the header is emitted with the first chunk."""

    def __init__(self, path: Path) -> None:
        self.path = path
        self._first = True  # next write truncates/creates the file

    def write_chunk(self, chunk: pd.DataFrame, *, header: bool = False) -> None:
        """Append one chunk; `header=True` forces a header on later chunks too."""
        if self._first:
            mode, emit_header = "w", True
        else:
            mode, emit_header = "a", header
        chunk.to_csv(self.path, mode=mode, header=emit_header, index=False, sep="\t")
        self._first = False

    def close(self) -> None:
        """No-op: each write opens and closes the file itself."""
39
+
40
+
41
class LineReader:
    """Read a one-value-per-line text file into single-column ('value') DataFrames."""

    def __init__(self, path: Path, chunk_size: int = 5000) -> None:
        self.path = path
        self.chunk_size = chunk_size

    def read_chunks(self) -> Iterator[pd.DataFrame]:
        """Yield DataFrames of up to `chunk_size` values; blank lines are skipped."""
        pending: list[str] = []
        with open(self.path, encoding="utf-8") as handle:
            for raw in handle:
                value = raw.rstrip("\n\r")
                if not value:
                    continue
                pending.append(value)
                if len(pending) >= self.chunk_size:
                    yield pd.DataFrame({"value": pending})
                    pending = []
        if pending:
            yield pd.DataFrame({"value": pending})
59
+
60
+
61
class LineWriter:
    """Write a single-column ('value') DataFrame back out, one value per line."""

    def __init__(self, path: Path) -> None:
        self.path = path
        self._first = True  # first chunk truncates, later chunks append

    def write_chunk(self, chunk: pd.DataFrame, *, header: bool = False) -> None:
        """Append every entry of chunk['value'] as its own line (header is ignored)."""
        with open(self.path, "w" if self._first else "a", encoding="utf-8") as handle:
            handle.writelines(str(item) + "\n" for item in chunk["value"])
        self._first = False

    def close(self) -> None:
        """No-op: each write opens and closes the file itself."""
75
+
76
+
77
class FreeformReader:
    """Stream arbitrary text line-by-line into single-column ('text') DataFrames."""

    def __init__(self, path: Path, chunk_size: int = 5000) -> None:
        self.path = path
        self.chunk_size = chunk_size

    def read_chunks(self) -> Iterator[pd.DataFrame]:
        """Yield DataFrames of up to `chunk_size` lines; blank lines are preserved."""
        batch: list[str] = []
        with open(self.path, encoding="utf-8") as handle:
            for raw in handle:
                batch.append(raw.rstrip("\n\r"))
                if len(batch) >= self.chunk_size:
                    yield pd.DataFrame({"text": batch})
                    batch = []
        if batch:
            yield pd.DataFrame({"text": batch})
+
94
+
95
+ class FreeformWriter:
96
+ def __init__(self, path: Path) -> None:
97
+ self.path = path
98
+ self._first = True
99
+
100
+ def write_chunk(self, chunk: pd.DataFrame, *, header: bool = False) -> None:
101
+ mode = "w" if self._first else "a"
102
+ with open(self.path, mode, encoding="utf-8") as f:
103
+ for text in chunk["text"]:
104
+ f.write(str(text) + "\n")
105
+ self._first = False
106
+
107
+ def close(self) -> None:
108
+ pass