pybutt 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,93 @@
1
+ from pathlib import Path
2
+
3
+ import pyarrow as pa
4
+ import pyarrow.parquet as pq
5
+
6
+ from pybutt.exceptions import InvalidManifestError
7
+ from pybutt.files.manifest import (
8
+ load_file_manifest,
9
+ validate_manifest_entries,
10
+ write_manifest,
11
+ )
12
+
13
+
14
+ def _write_table_chunks(writer, table, rowgroup_size: int):
15
+ if table.num_rows < rowgroup_size:
16
+ return table
17
+
18
+ offset = 0
19
+ while offset + rowgroup_size <= table.num_rows:
20
+ chunk = table.slice(offset, rowgroup_size)
21
+ writer.write_table(chunk, row_group_size=rowgroup_size)
22
+ offset += rowgroup_size
23
+
24
+ if offset < table.num_rows:
25
+ return table.slice(offset)
26
+
27
+ return None
28
+
29
+
30
def combine_parquet_files(
    manifest_path: Path,
    output_file: Path,
    rowgroup_size: int = 1_048_576,
    delete_originals: bool = False,
    new_manifest_name: str | None = None,
):
    """Combine all Parquet files listed in a manifest into a single Parquet file.

    The resulting file uses the schema of the first file. All subsequent
    files must be schema-compatible (column names/types) or behavior is
    undefined.

    After the data is written, a new manifest listing only the combined file
    is written into the original manifest's directory. It records
    ``output_file.name``, so it only resolves correctly when ``output_file``
    lives in that same directory.

    Args:
        manifest_path: Path of the file manifest listing the inputs.
        output_file: Path of the combined Parquet file to create.
        rowgroup_size: Number of rows per row group in the output.
        delete_originals: When true, delete the input files and the original
            manifest once the combined file has been written.
        new_manifest_name: File name for the new manifest. Defaults to the
            original manifest's stem with ``_combined`` appended.

    Returns:
        ``output_file`` as a :class:`Path`.

    Raises:
        ValueError: If ``rowgroup_size`` is smaller than 1, or if
            ``output_file`` is also one of the manifest entries.
        InvalidManifestError: If the manifest lists no entries.
    """
    # Validate before touching the filesystem so a bad size cannot leave an
    # empty output file behind.
    if rowgroup_size < 1:
        raise ValueError(
            f"rowgroup_size must be a positive integer, got {rowgroup_size}"
        )

    manifest_path = Path(manifest_path)
    base_dir = manifest_path.parent

    manifest = load_file_manifest(manifest_path, operation="combine")

    entries = validate_manifest_entries(manifest, base_dir)
    if not entries:
        raise InvalidManifestError("Manifest contains no entries to combine")

    output_file = Path(output_file)
    sources = [base_dir / entry for entry in entries]

    # Opening the writer truncates output_file, so writing onto one of the
    # inputs would destroy that input before it is read.
    resolved_output = output_file.resolve()
    if any(src.resolve() == resolved_output for src in sources):
        raise ValueError(
            f"Output file must not be one of the manifest entries: {output_file}"
        )

    output_file.parent.mkdir(parents=True, exist_ok=True)

    # Use schema from first file; close the handle as soon as it is read.
    with pq.ParquetFile(sources[0]) as first_pf:
        schema = first_pf.schema_arrow

    with pq.ParquetWriter(output_file, schema, compression="snappy") as writer:
        buffered_table = None

        for src in sources:
            # Close each input after reading it so no handle is left open
            # when the originals are deleted below.
            with pq.ParquetFile(src) as pf:
                # If schema differs, let pyarrow handle or raise downstream
                for batch in pf.iter_batches():
                    table = pa.Table.from_batches([batch])
                    if buffered_table is None:
                        buffered_table = table
                    else:
                        buffered_table = pa.concat_tables([buffered_table, table])

                    # Flush complete row groups; keep the remainder buffered.
                    buffered_table = _write_table_chunks(
                        writer, buffered_table, rowgroup_size
                    )

        if buffered_table is not None and buffered_table.num_rows > 0:
            writer.write_table(buffered_table, row_group_size=rowgroup_size)

    if delete_originals:
        # The output cannot alias an input here (checked above).
        for src in sources:
            src.unlink(missing_ok=True)
        manifest_path.unlink(missing_ok=True)

    new_manifest_name = (
        new_manifest_name or f"{manifest_path.stem}_combined{manifest_path.suffix}"
    )
    write_manifest(base_dir / new_manifest_name, [output_file.name])

    return output_file
@@ -0,0 +1,51 @@
1
+ from pathlib import Path
2
+
3
+ import pyarrow.parquet as pq
4
+
5
+ from pybutt.files.manifest import load_file_manifest
6
+
7
+
8
def inspect_parquet_file(filepath: str | Path, verbose: bool = False) -> dict:
    """Collect row and row-group statistics from a Parquet file's metadata.

    Args:
        filepath: Location of the Parquet file; a string is accepted too.
        verbose: When true, also report each column's name and type.

    Returns:
        A dict with ``file`` (file name), ``rows`` (total row count),
        ``row_groups`` (number of row groups) and ``row_group_sizes`` (the
        set of distinct row counts across row groups). With ``verbose`` it
        also holds ``columns``, mapping column name to its type as a string.
    """
    filepath = Path(filepath)

    # Use the file as a context manager so the handle is released on return.
    with pq.ParquetFile(filepath) as pf:
        metadata = pf.metadata

        info = {
            "file": filepath.name,
            "rows": metadata.num_rows,
            "row_groups": metadata.num_row_groups,
            "row_group_sizes": {
                metadata.row_group(i).num_rows
                for i in range(metadata.num_row_groups)
            },
        }

        if verbose:
            schema = pf.schema_arrow
            info["columns"] = {field.name: str(field.type) for field in schema}

    return info
25
+
26
+
27
def inspect_manifest(manifest_path: str | Path, verbose: bool = False):
    """Print a summary of every file listed in a file manifest.

    Entries are resolved relative to the manifest's directory. A listed file
    that does not exist is reported and skipped. With ``verbose`` the column
    names and types of each file are printed as well.
    """
    manifest_file = Path(manifest_path)
    root = manifest_file.parent

    manifest = load_file_manifest(manifest_file, operation="Inspect")

    for entry in manifest["entries"]:
        target = root / entry
        if not target.exists():
            print(f"Missing file: {target}")
            continue

        details = inspect_parquet_file(target, verbose=verbose)

        summary = (
            details["file"],
            f" rows: {details['rows']}",
            f" row groups: {details['row_groups']}",
            f" group sizes: {details['row_group_sizes']}",
        )
        for line in summary:
            print(line)

        if verbose:
            print(" columns:")
            for name, dtype in details["columns"].items():
                print(f" {name}: {dtype}")

        # Blank line between files.
        print()
@@ -0,0 +1,160 @@
1
+ import json
2
+ from pathlib import Path
3
+
4
+ from pybutt.core.config import validate_identifier
5
+ from pybutt.exceptions import (
6
+ DuplicateManifestEntryError,
7
+ InvalidManifestEntryError,
8
+ InvalidManifestError,
9
+ ManifestNotFoundError,
10
+ MissingManifestEntryError,
11
+ PathTraversalError,
12
+ UnsupportedManifestTypeError,
13
+ UnsupportedManifestVersionError,
14
+ )
15
+
16
# Manifest format versions accepted by the loader.
MANIFEST_VERSION_1 = 1
MANIFEST_VERSION_2 = 2

# Values allowed for a manifest's "type" field.
SUPPORTED_MANIFEST_TYPES = frozenset(("files", "tables"))
19
+
20
+
21
def _parse_manifest_dict(data):
    """Validate an object-style manifest and return it in normalized form.

    Args:
        data: The decoded JSON document; must be a dict.

    Returns:
        A dict with ``version``, ``type`` and ``entries`` keys. For version 1
        a missing or empty type defaults to ``"files"``. For ``"tables"``
        manifests every entry is checked by ``_validate_table_name``.

    Raises:
        InvalidManifestError: If ``data`` is not a dict, ``entries`` is not a
            list, or a version 2 manifest has an unsupported type.
        UnsupportedManifestVersionError: If the version is not 1 or 2.
        UnsupportedManifestTypeError: If a version 1 manifest declares a type
            other than ``"files"``.
    """
    if not isinstance(data, dict):
        raise InvalidManifestError(
            "Manifest must be a list or an object with version, type, and entries"
        )

    version = data.get("version")
    # Tuple membership compares by equality, so an unhashable JSON value
    # (list/object) is rejected here and does not raise TypeError the way set
    # hashing would. bool is excluded explicitly because True == 1.
    if isinstance(version, bool) or version not in (
        MANIFEST_VERSION_1,
        MANIFEST_VERSION_2,
    ):
        raise UnsupportedManifestVersionError(
            f"Unsupported manifest version: {version}"
        )

    manifest_type = data.get("type")
    if version == MANIFEST_VERSION_1:
        manifest_type = manifest_type or "files"
        if manifest_type != "files":
            raise UnsupportedManifestTypeError(
                "Version 1 manifests support only type 'files'"
            )
    else:
        # Check the type is a string first: an unhashable value would raise
        # TypeError from the frozenset lookup.
        if (
            not isinstance(manifest_type, str)
            or manifest_type not in SUPPORTED_MANIFEST_TYPES
        ):
            raise InvalidManifestError(
                "Manifest type must be 'files' or 'tables' for version 2"
            )

    entries = data.get("entries")
    if not isinstance(entries, list):
        raise InvalidManifestError("Manifest entries must be a list")

    if manifest_type == "tables":
        return {
            "version": version,
            "type": manifest_type,
            "entries": [_validate_table_name(e) for e in entries],
        }

    return {"version": version, "type": manifest_type, "entries": entries}
58
+
59
+
60
def _validate_table_name(value):
    """Return ``value`` unchanged when it is a valid ``schema.table`` name.

    Raises:
        InvalidManifestEntryError: If ``value`` is not a string or does not
            consist of exactly two dot-separated parts.

    Both parts are additionally passed to ``validate_identifier``.
    """
    if not isinstance(value, str):
        raise InvalidManifestEntryError(
            f"Invalid manifest table entry (not string): {value}"
        )

    # Exactly one dot is the same as exactly two dot-separated parts.
    if value.count(".") != 1:
        raise InvalidManifestEntryError(
            f"Invalid table name format, expected schema.table: {value}"
        )

    schema_name, _, table_name = value.partition(".")
    for identifier in (schema_name, table_name):
        validate_identifier(identifier)
    return value
76
+
77
+
78
def default_manifest_filename(
    schema: str, table: str, op_type: str = "", suffix: str = "manifest"
) -> str:
    """Build the default manifest file name for a schema/table pair.

    The result has the form ``<schema>_<table>_<op_type><suffix>.json``;
    ``op_type`` is inserted verbatim, so it should carry its own trailing
    separator when one is wanted.
    """
    template = "{}_{}_{}{}.json"
    return template.format(schema, table, op_type, suffix)
82
+
83
+
84
def default_import_manifest_filename(
    schema: str,
    table: str,
) -> str:
    """Return the default manifest file name used for imports of a table.

    Delegates to ``default_manifest_filename`` with the ``import_`` operation
    prefix.
    """
    import_prefix = "import_"
    return default_manifest_filename(schema, table, op_type=import_prefix)
89
+
90
+
91
def write_manifest(
    path: str | Path,
    entries: list[str],
    manifest_type: str = "files",
    version: int = MANIFEST_VERSION_2,
) -> Path:
    """Serialize a manifest to ``path`` as JSON and return the path.

    The document is an object with ``version``, ``type`` and ``entries``
    keys, indented by four spaces. The returned value is always a
    :class:`Path`, even when ``path`` was given as a string.
    """
    target = Path(path)
    payload = {"version": version, "type": manifest_type, "entries": entries}
    with target.open("w") as handle:
        json.dump(payload, handle, indent=4)
    return target
106
+
107
+
108
def load_manifest(manifest_path: str | Path) -> dict:
    """Read a manifest file and return it in normalized dict form.

    A bare JSON list is treated as a version 1 file manifest whose entries
    are the list items. Any other document is validated by
    ``_parse_manifest_dict``.

    Args:
        manifest_path: Location of the manifest JSON file.

    Returns:
        A dict with ``version``, ``type`` and ``entries`` keys.

    Raises:
        ManifestNotFoundError: If ``manifest_path`` does not exist.
    """
    manifest_path = Path(manifest_path)

    if not manifest_path.exists():
        raise ManifestNotFoundError(f"Manifest not found: {manifest_path}")

    # JSON text is UTF-8; decoding with the platform's locale encoding would
    # corrupt non-ASCII entries on systems whose default is not UTF-8.
    with open(manifest_path, encoding="utf-8") as f:
        data = json.load(f)

    if isinstance(data, list):
        return {"version": MANIFEST_VERSION_1, "type": "files", "entries": data}

    return _parse_manifest_dict(data)
121
+
122
+
123
def load_file_manifest(
    manifest_path: str | Path, *, operation: str = "Operation"
) -> dict:
    """Load a manifest, accepting only manifests of type ``"files"``.

    ``operation`` names the caller's action and appears only in the error
    message raised for any other manifest type.
    """
    manifest = load_manifest(manifest_path)
    found_type = manifest["type"]
    if found_type == "files":
        return manifest

    raise UnsupportedManifestTypeError(
        f"{operation} only supports file manifests, got: {found_type}"
    )
133
+
134
+
135
def validate_manifest_entries(manifest: dict, base_dir: Path) -> list[str]:
    """Check every manifest entry and return the entries in manifest order.

    Each entry must be a string and must not repeat. For ``"files"``
    manifests each entry is also resolved against ``base_dir``; it must stay
    inside that directory, must exist, and must not point at a file that an
    earlier entry already named under a different spelling.

    Args:
        manifest: Normalized manifest dict with ``type`` and ``entries``.
        base_dir: Directory that file entries are relative to.

    Returns:
        The validated entries, unchanged and in their original order.

    Raises:
        InvalidManifestEntryError: If an entry is not a string.
        DuplicateManifestEntryError: If an entry repeats, either literally or
            by resolving to the same file as an earlier entry.
        PathTraversalError: If a file entry resolves outside ``base_dir``.
        MissingManifestEntryError: If a file entry does not exist.
    """
    seen: set[str] = set()
    seen_paths: set[Path] = set()
    validated: list[str] = []
    # Resolved lazily and only once: base_dir is only needed for file
    # manifests, and resolving it per entry repeats filesystem work.
    resolved_base = None

    for item in manifest["entries"]:
        if not isinstance(item, str):
            raise InvalidManifestEntryError(
                f"Invalid manifest entry (not string): {item}"
            )

        if item in seen:
            raise DuplicateManifestEntryError(f"Duplicate file in manifest: {item}")

        if manifest["type"] == "files":
            if resolved_base is None:
                resolved_base = base_dir.resolve()

            filepath = (base_dir / item).resolve()
            if not filepath.is_relative_to(resolved_base):
                raise PathTraversalError(
                    f"Manifest entry escapes base directory: {item}"
                )
            # "a.parquet" and "./a.parquet" differ as strings but name the
            # same file, so compare the resolved paths as well.
            if filepath in seen_paths:
                raise DuplicateManifestEntryError(
                    f"Duplicate file in manifest: {item}"
                )
            if not filepath.exists():
                raise MissingManifestEntryError(f"Missing file: {filepath}")
            seen_paths.add(filepath)

        seen.add(item)
        validated.append(item)

    return validated
pybutt/io/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ from .combiner import TableCombine
2
+ from .exporter import Exporter
3
+ from .importer import Importer
4
+ from .purger import TablePurger
5
+
6
+ __all__ = ["Exporter", "Importer", "TableCombine", "TablePurger"]
pybutt/io/combiner.py ADDED
@@ -0,0 +1,119 @@
1
+ from collections.abc import Iterable
2
+
3
+ from pybutt.core.base import SqlServerIOBase
4
+ from pybutt.core.config import (
5
+ SCHEMA_DEFAULT,
6
+ TransactionMode,
7
+ coerce_transaction_mode,
8
+ quote_identifier,
9
+ validate_engine,
10
+ validate_identifier,
11
+ )
12
+ from pybutt.core.logobs import context, get_logger
13
+ from pybutt.exceptions import SchemaMismatchError
14
+
15
+ logger = get_logger("combiner")
16
+
17
+
18
class TableCombine(SqlServerIOBase):
    """Combine multiple SQL tables into a single target table.

    Sources should be provided as fully-qualified schema.table strings.
    """

    def __init__(
        self,
        config,
        table: str,
        sources: Iterable[str],
        schema: str = SCHEMA_DEFAULT,
        transaction_mode: TransactionMode = TransactionMode.BATCH,
        engine: str = "pyodbc",
    ):
        """Store and validate the combine parameters.

        Args:
            config: Passed through to ``SqlServerIOBase``.
            table: Name of the target table.
            sources: Source tables as ``schema.table`` strings. A single
                string is treated as one source.
            schema: Schema of the target table.
            transaction_mode: Stored after ``coerce_transaction_mode``; not
                read by the methods of this class.
            engine: ``"pyodbc"`` or ``"duckdb"``; stored, and not read by the
                methods of this class.
        """
        super().__init__(config)
        self.schema = validate_identifier(schema)
        self.table = validate_identifier(table)
        # list("dbo.t") would split a lone string into characters.
        if isinstance(sources, str):
            sources = [sources]
        self.sources: list[str] = list(sources)
        self.transaction_mode = coerce_transaction_mode(transaction_mode)
        validate_engine(engine, allowed=frozenset({"pyodbc", "duckdb"}))
        self.engine = engine

    def _parse_schema_table(self, fq: str) -> tuple[str, str]:
        """Split ``schema.table`` into its validated parts.

        Raises:
            ValueError: If ``fq`` does not have exactly two dot-separated
                parts.
        """
        parts = fq.split(".")
        if len(parts) != 2:
            raise ValueError(f"Invalid source table name: {fq}")
        schema, table = parts
        validate_identifier(schema)
        validate_identifier(table)
        return schema, table

    def _ensure_target_exists_and_schema(self, cur, first_source: str):
        """Create the target from ``first_source`` or verify it matches.

        If the target table does not exist it is created empty with the
        columns of ``first_source`` and the creation is committed. Otherwise
        the target's column names must equal the source's, in the same order.

        Returns:
            The column names of ``first_source``.

        Raises:
            SchemaMismatchError: If the existing target's columns differ from
                the source's in name or order.
        """
        cur.execute("SELECT OBJECT_ID(?)", (f"{self.schema}.{self.table}",))
        exists = cur.fetchone()[0] is not None

        # Get column list for source
        src_schema, src_table = self._parse_schema_table(first_source)
        q_src = f"{quote_identifier(src_schema)}.{quote_identifier(src_table)}"
        q_tgt = f"{quote_identifier(self.schema)}.{quote_identifier(self.table)}"
        cur.execute(f"SELECT TOP 0 * FROM {q_src}")
        src_cols = [c[0] for c in cur.description]

        if not exists:
            # Create target table with same schema as source (no rows)
            cur.execute(f"SELECT TOP 0 * INTO {q_tgt} FROM {q_src}")
            cur.connection.commit()
            return src_cols

        # Target exists: get target columns
        cur.execute(f"SELECT TOP 0 * FROM {q_tgt}")
        tgt_cols = [c[0] for c in cur.description]

        # combine() inserts with SELECT *, which maps columns by position, so
        # the order has to match as well as the names.
        if src_cols != tgt_cols:
            raise SchemaMismatchError(
                "Source and target schemas differ for "
                f"{first_source} vs {self.schema}.{self.table}"
            )

        return src_cols

    def combine(self):
        """Combine all source tables into the target table.

        Implementation: create target if missing using first source schema,
        then run `INSERT INTO target SELECT * FROM source` for each source.
        Each source is committed on its own; a failing insert is rolled back
        and re-raised, leaving earlier sources committed.

        Raises:
            ValueError: If there are no sources or a source name is not a
                valid ``schema.table`` string.
            SchemaMismatchError: If an existing target does not match the
                first source.
        """
        if not self.sources:
            raise ValueError("No source tables to combine")

        # Parse every source before opening a connection so a malformed name
        # fails before any rows have been inserted and committed.
        parsed_sources = [self._parse_schema_table(src) for src in self.sources]

        q_tgt = f"{quote_identifier(self.schema)}.{quote_identifier(self.table)}"

        with self.connection_p(autocommit=False) as conn:
            with conn.cursor() as cur:
                # Ensure first source schema compatible / create target
                self._ensure_target_exists_and_schema(cur, self.sources[0])

                # Insert from each source
                for src_schema, src_table in parsed_sources:
                    q_src = (
                        f"{quote_identifier(src_schema)}.{quote_identifier(src_table)}"
                    )
                    logger.info(
                        "Combining "
                        + context(
                            source=f"{src_schema}.{src_table}",
                            target=f"{self.schema}.{self.table}",
                        )
                    )
                    try:
                        cur.execute(f"INSERT INTO {q_tgt} SELECT * FROM {q_src}")
                        conn.commit()
                    except Exception:
                        conn.rollback()
                        raise

        logger.info("Table combine completed")