pybutt 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- old_tests/app.py +713 -0
- pybutt/__init__.py +17 -0
- pybutt/cli/__init__.py +11 -0
- pybutt/cli/app.py +94 -0
- pybutt/cli/combine_command.py +236 -0
- pybutt/cli/export_command.py +317 -0
- pybutt/cli/import_command.py +286 -0
- pybutt/cli/inspect_command.py +30 -0
- pybutt/cli/purge_command.py +235 -0
- pybutt/core/__init__.py +30 -0
- pybutt/core/base.py +124 -0
- pybutt/core/config.py +144 -0
- pybutt/core/logobs.py +445 -0
- pybutt/exceptions.py +82 -0
- pybutt/files/__init__.py +28 -0
- pybutt/files/combine.py +93 -0
- pybutt/files/inspect.py +51 -0
- pybutt/files/manifest.py +160 -0
- pybutt/io/__init__.py +6 -0
- pybutt/io/combiner.py +119 -0
- pybutt/io/exporter.py +612 -0
- pybutt/io/importer.py +928 -0
- pybutt/io/purger.py +44 -0
- pybutt-2.0.0.dist-info/METADATA +756 -0
- pybutt-2.0.0.dist-info/RECORD +39 -0
- pybutt-2.0.0.dist-info/WHEEL +5 -0
- pybutt-2.0.0.dist-info/entry_points.txt +2 -0
- pybutt-2.0.0.dist-info/licenses/LICENSE +21 -0
- pybutt-2.0.0.dist-info/top_level.txt +3 -0
- tests/conftest.py +22 -0
- tests/test_cli.py +979 -0
- tests/test_cli_help.py +130 -0
- tests/test_combiner.py +259 -0
- tests/test_core.py +1009 -0
- tests/test_exporter.py +637 -0
- tests/test_files.py +178 -0
- tests/test_import_retry_logic.py +837 -0
- tests/test_logobs.py +491 -0
- tests/test_purge.py +219 -0
pybutt/files/combine.py
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
import pyarrow as pa
|
|
4
|
+
import pyarrow.parquet as pq
|
|
5
|
+
|
|
6
|
+
from pybutt.exceptions import InvalidManifestError
|
|
7
|
+
from pybutt.files.manifest import (
|
|
8
|
+
load_file_manifest,
|
|
9
|
+
validate_manifest_entries,
|
|
10
|
+
write_manifest,
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _write_table_chunks(writer, table, rowgroup_size: int):
|
|
15
|
+
if table.num_rows < rowgroup_size:
|
|
16
|
+
return table
|
|
17
|
+
|
|
18
|
+
offset = 0
|
|
19
|
+
while offset + rowgroup_size <= table.num_rows:
|
|
20
|
+
chunk = table.slice(offset, rowgroup_size)
|
|
21
|
+
writer.write_table(chunk, row_group_size=rowgroup_size)
|
|
22
|
+
offset += rowgroup_size
|
|
23
|
+
|
|
24
|
+
if offset < table.num_rows:
|
|
25
|
+
return table.slice(offset)
|
|
26
|
+
|
|
27
|
+
return None
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def combine_parquet_files(
    manifest_path: Path,
    output_file: Path,
    rowgroup_size: int = 1_048_576,
    delete_originals: bool = False,
    new_manifest_name: str | None = None,
):
    """Combine all Parquet files listed in a manifest into a single Parquet file.

    The resulting file will use the schema of the first file. All subsequent
    files must be schema-compatible (column names/types) or behavior is undefined.

    Args:
        manifest_path: Path of the file manifest; entries are resolved relative
            to its parent directory.
        output_file: Path of the combined Parquet file; parent directories are
            created when missing.
        rowgroup_size: Number of rows per row group in the output; must be
            positive.
        delete_originals: When true, remove the source files and the original
            manifest after the combined file has been written.
        new_manifest_name: File name of the manifest written next to the
            original one; defaults to ``<stem>_combined<suffix>``.

    Returns:
        ``output_file`` as a :class:`Path`.

    Raises:
        InvalidManifestError: If the manifest lists no entries.
        ValueError: If ``rowgroup_size`` is not positive, or if ``output_file``
            is also one of the manifest entries.
    """
    if rowgroup_size <= 0:
        raise ValueError(
            f"rowgroup_size must be a positive integer, got {rowgroup_size}"
        )

    manifest_path = Path(manifest_path)
    base_dir = manifest_path.parent

    manifest = load_file_manifest(manifest_path, operation="combine")

    entries = validate_manifest_entries(manifest, base_dir)
    if not entries:
        raise InvalidManifestError("Manifest contains no entries to combine")

    output_file = Path(output_file)
    output_resolved = output_file.resolve()
    sources = [base_dir / entry for entry in entries]

    # Opening the writer truncates the output path, so writing onto one of the
    # inputs would destroy it before it is read. Compare resolved paths so
    # that differently spelled references to the same file are caught.
    if any(src.resolve() == output_resolved for src in sources):
        raise ValueError(
            f"Output file must not be one of the manifest entries: {output_file}"
        )

    output_file.parent.mkdir(parents=True, exist_ok=True)

    # Use schema from first file; close it again right away so no handle is
    # still open when the originals are deleted below.
    with pq.ParquetFile(sources[0]) as first_pf:
        schema = first_pf.schema_arrow

    with pq.ParquetWriter(output_file, schema, compression="snappy") as writer:
        buffered_table = None

        for src in sources:
            # If schema differs, let pyarrow handle or raise downstream
            with pq.ParquetFile(src) as pf:
                for batch in pf.iter_batches():
                    table = pa.Table.from_batches([batch])
                    if buffered_table is None:
                        buffered_table = table
                    else:
                        buffered_table = pa.concat_tables([buffered_table, table])

                    # Flush every complete row group; keep the remainder
                    # buffered so it can be merged with the next batch.
                    buffered_table = _write_table_chunks(
                        writer, buffered_table, rowgroup_size
                    )

        # Whatever is left is smaller than one row group: write it as the
        # final, partial group.
        if buffered_table is not None and buffered_table.num_rows > 0:
            writer.write_table(buffered_table, row_group_size=rowgroup_size)

    if delete_originals:
        for src in sources:
            # The output was already rejected as a source above, so every
            # source can be removed.
            src.unlink(missing_ok=True)
        manifest_path.unlink(missing_ok=True)

    new_manifest_name = (
        new_manifest_name or f"{manifest_path.stem}_combined{manifest_path.suffix}"
    )

    # Manifest entries are resolved relative to the manifest directory, so
    # record the relative location when the output lives below it. Fall back
    # to the bare file name otherwise.
    try:
        manifest_entry = output_resolved.relative_to(base_dir.resolve()).as_posix()
    except ValueError:
        manifest_entry = output_file.name
    write_manifest(base_dir / new_manifest_name, [manifest_entry])

    return output_file
|
pybutt/files/inspect.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
import pyarrow.parquet as pq
|
|
4
|
+
|
|
5
|
+
from pybutt.files.manifest import load_file_manifest
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def inspect_parquet_file(filepath: str | Path, verbose: bool = False) -> dict:
    """Collect summary metadata for a single Parquet file.

    Args:
        filepath: Location of the Parquet file; a plain string is accepted.
        verbose: When true, also report the column names and types.

    Returns:
        A dict with the keys ``"file"`` (file name), ``"rows"``,
        ``"row_groups"`` and ``"row_group_sizes"`` (the *set* of distinct
        row-group row counts), plus ``"columns"`` (name -> type string) when
        ``verbose`` is true.
    """
    filepath = Path(filepath)

    # The context manager closes the file as soon as the metadata is read.
    with pq.ParquetFile(filepath) as pf:
        metadata = pf.metadata

        info = {
            "file": filepath.name,
            "rows": metadata.num_rows,
            "row_groups": metadata.num_row_groups,
            # A set, so repeated sizes are reported only once.
            "row_group_sizes": {
                metadata.row_group(i).num_rows
                for i in range(metadata.num_row_groups)
            },
        }

        if verbose:
            schema = pf.schema_arrow
            info["columns"] = {field.name: str(field.type) for field in schema}

    return info
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def inspect_manifest(manifest_path: str | Path, verbose: bool = False):
    """Print a summary of every Parquet file listed in a file manifest.

    Entries are resolved relative to the manifest's directory. A missing file
    is reported and skipped; every other file gets its row count, row-group
    count and row-group sizes printed, followed by its columns when
    ``verbose`` is true.
    """
    manifest_file = Path(manifest_path)
    manifest = load_file_manifest(manifest_file, operation="Inspect")
    root = manifest_file.parent

    for entry in manifest["entries"]:
        filepath = root / entry

        if filepath.exists():
            info = inspect_parquet_file(filepath, verbose=verbose)

            print(info["file"])
            print(f" rows: {info['rows']}")
            print(f" row groups: {info['row_groups']}")
            print(f" group sizes: {info['row_group_sizes']}")

            if verbose:
                print(" columns:")
                for col, typ in info["columns"].items():
                    print(f" {col}: {typ}")

            # Blank line between files.
            print()
        else:
            print(f"Missing file: {filepath}")
|
pybutt/files/manifest.py
ADDED
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
from pybutt.core.config import validate_identifier
|
|
5
|
+
from pybutt.exceptions import (
|
|
6
|
+
DuplicateManifestEntryError,
|
|
7
|
+
InvalidManifestEntryError,
|
|
8
|
+
InvalidManifestError,
|
|
9
|
+
ManifestNotFoundError,
|
|
10
|
+
MissingManifestEntryError,
|
|
11
|
+
PathTraversalError,
|
|
12
|
+
UnsupportedManifestTypeError,
|
|
13
|
+
UnsupportedManifestVersionError,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
# Manifest format version 1: assigned by load_manifest to bare-list manifests;
# only the "files" type is accepted for this version.
MANIFEST_VERSION_1 = 1
|
|
17
|
+
# Manifest format version 2: the default version emitted by write_manifest.
MANIFEST_VERSION_2 = 2
|
|
18
|
+
# Values of the manifest "type" field accepted for version 2 manifests.
SUPPORTED_MANIFEST_TYPES = frozenset({"files", "tables"})
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _parse_manifest_dict(data):
    """Validate an object-style manifest and return it in normalized form.

    Args:
        data: Decoded JSON value expected to be a dict with the keys
            ``version``, ``type`` and ``entries``.

    Returns:
        A dict with exactly the keys ``"version"``, ``"type"`` and
        ``"entries"``. For ``"tables"`` manifests every entry has been checked
        by ``_validate_table_name``.

    Raises:
        InvalidManifestError: If ``data`` is not a dict, ``entries`` is not a
            list, or a version 2 manifest has an unsupported type.
        UnsupportedManifestVersionError: If the version is not 1 or 2.
        UnsupportedManifestTypeError: If a version 1 manifest declares a type
            other than ``"files"``.
    """
    if not isinstance(data, dict):
        raise InvalidManifestError(
            "Manifest must be a list or an object with version, type, and entries"
        )

    version = data.get("version")
    # Tuple membership compares with ``==`` and never hashes the value, so an
    # unhashable JSON value (list/object) is rejected with the manifest error
    # instead of an incidental TypeError.
    if version not in (MANIFEST_VERSION_1, MANIFEST_VERSION_2):
        raise UnsupportedManifestVersionError(
            f"Unsupported manifest version: {version}"
        )

    manifest_type = data.get("type")
    if version == MANIFEST_VERSION_1:
        # Version 1 manifests may omit the type; it then defaults to "files".
        manifest_type = manifest_type or "files"
        if manifest_type != "files":
            raise UnsupportedManifestTypeError(
                "Version 1 manifests support only type 'files'"
            )
    else:
        # Check for ``str`` first so an unhashable value is never looked up
        # in the frozenset.
        if (
            not isinstance(manifest_type, str)
            or manifest_type not in SUPPORTED_MANIFEST_TYPES
        ):
            raise InvalidManifestError(
                "Manifest type must be 'files' or 'tables' for version 2"
            )

    entries = data.get("entries")
    if not isinstance(entries, list):
        raise InvalidManifestError("Manifest entries must be a list")

    if manifest_type == "tables":
        return {
            "version": version,
            "type": manifest_type,
            "entries": [_validate_table_name(e) for e in entries],
        }

    return {"version": version, "type": manifest_type, "entries": entries}
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _validate_table_name(value):
    """Check that ``value`` is a ``schema.table`` string and return it unchanged.

    Raises ``InvalidManifestEntryError`` when ``value`` is not a string or does
    not consist of exactly two dot-separated parts. Both parts are then passed
    to ``validate_identifier``.
    """
    if not isinstance(value, str):
        raise InvalidManifestEntryError(
            f"Invalid manifest table entry (not string): {value}"
        )

    # Exactly one dot means exactly two parts.
    if value.count(".") != 1:
        raise InvalidManifestEntryError(
            f"Invalid table name format, expected schema.table: {value}"
        )

    schema_part, _, table_part = value.partition(".")
    for identifier in (schema_part, table_part):
        validate_identifier(identifier)
    return value
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def default_manifest_filename(
    schema: str, table: str, op_type: str = "", suffix: str = "manifest"
) -> str:
    """Build the default manifest file name for a table.

    The result has the shape ``<schema>_<table>_<op_type><suffix>.json``.
    ``op_type`` is inserted verbatim, so it has to carry its own trailing
    separator if one is wanted.
    """
    return "{}_{}_{}{}.json".format(schema, table, op_type, suffix)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def default_import_manifest_filename(
    schema: str,
    table: str,
) -> str:
    """Return the default manifest file name for an import of ``schema.table``.

    Delegates to ``default_manifest_filename`` with the ``"import_"``
    operation prefix.
    """
    import_prefix = "import_"
    return default_manifest_filename(schema, table, import_prefix)
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def write_manifest(
    path: str | Path,
    entries: list[str],
    manifest_type: str = "files",
    version: int = MANIFEST_VERSION_2,
) -> Path:
    """Serialize a versioned manifest to ``path`` as JSON and return the :class:`Path`.

    The document is an object with the keys ``version``, ``type`` and
    ``entries``, indented by four spaces. An existing file is overwritten.
    """
    target = Path(path)
    document = {"version": version, "type": manifest_type, "entries": entries}
    with target.open("w") as handle:
        json.dump(document, handle, indent=4)
    return target
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def load_manifest(manifest_path: str | Path) -> dict:
    """Read a manifest file and return it in normalized dict form.

    Args:
        manifest_path: Location of the JSON manifest.

    Returns:
        A dict with the keys ``"version"``, ``"type"`` and ``"entries"``. A
        bare JSON list is treated as a version 1 ``"files"`` manifest whose
        entries are the list itself; any other value is handed to
        ``_parse_manifest_dict``.

    Raises:
        ManifestNotFoundError: If ``manifest_path`` does not exist.
    """
    manifest_path = Path(manifest_path)

    if not manifest_path.exists():
        raise ManifestNotFoundError(f"Manifest not found: {manifest_path}")

    # JSON text is UTF-8; decoding with the platform's locale encoding would
    # garble non-ASCII entries on systems whose default is not UTF-8.
    with open(manifest_path, encoding="utf-8") as f:
        data = json.load(f)

    if isinstance(data, list):
        return {"version": MANIFEST_VERSION_1, "type": "files", "entries": data}

    return _parse_manifest_dict(data)
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def load_file_manifest(
    manifest_path: str | Path, *, operation: str = "Operation"
) -> dict:
    """Load a manifest, accepting only manifests of type ``"files"``.

    ``operation`` is used solely to label the error raised for any other
    manifest type.
    """
    manifest = load_manifest(manifest_path)
    if manifest["type"] == "files":
        return manifest
    raise UnsupportedManifestTypeError(
        f"{operation} only supports file manifests, got: {manifest['type']}"
    )
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def validate_manifest_entries(manifest: dict, base_dir: Path) -> list[str]:
    """Validate the entries of a normalized manifest and return them in order.

    Args:
        manifest: Dict with an ``"entries"`` list and a ``"type"`` value.
        base_dir: Directory that file entries are resolved against.

    Returns:
        The entries, unchanged and in manifest order.

    Raises:
        InvalidManifestEntryError: If an entry is not a string.
        DuplicateManifestEntryError: If an entry occurs twice, or (for file
            manifests) two entries resolve to the same file.
        PathTraversalError: If a file entry resolves outside ``base_dir``.
        MissingManifestEntryError: If a file entry does not exist.
    """
    seen = set()
    seen_paths = set()
    validated = []
    # Loop-invariant: resolve the base directory once rather than per entry.
    resolved_base = base_dir.resolve()

    for item in manifest["entries"]:
        if not isinstance(item, str):
            raise InvalidManifestEntryError(
                f"Invalid manifest entry (not string): {item}"
            )

        if item in seen:
            raise DuplicateManifestEntryError(f"Duplicate file in manifest: {item}")

        if manifest["type"] == "files":
            filepath = (base_dir / item).resolve()
            if not filepath.is_relative_to(resolved_base):
                raise PathTraversalError(
                    f"Manifest entry escapes base directory: {item}"
                )
            if not filepath.exists():
                raise MissingManifestEntryError(f"Missing file: {filepath}")
            # Differently spelled entries ("a.parquet" vs "./a.parquet") can
            # name the same file; treat that as a duplicate as well.
            if filepath in seen_paths:
                raise DuplicateManifestEntryError(
                    f"Duplicate file in manifest: {item}"
                )
            seen_paths.add(filepath)

        seen.add(item)
        validated.append(item)

    return validated
|
pybutt/io/__init__.py
ADDED
pybutt/io/combiner.py
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
from collections.abc import Iterable
|
|
2
|
+
|
|
3
|
+
from pybutt.core.base import SqlServerIOBase
|
|
4
|
+
from pybutt.core.config import (
|
|
5
|
+
SCHEMA_DEFAULT,
|
|
6
|
+
TransactionMode,
|
|
7
|
+
coerce_transaction_mode,
|
|
8
|
+
quote_identifier,
|
|
9
|
+
validate_engine,
|
|
10
|
+
validate_identifier,
|
|
11
|
+
)
|
|
12
|
+
from pybutt.core.logobs import context, get_logger
|
|
13
|
+
from pybutt.exceptions import SchemaMismatchError
|
|
14
|
+
|
|
15
|
+
# Module-level logger, obtained from pybutt's get_logger under the name "combiner".
logger = get_logger("combiner")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class TableCombine(SqlServerIOBase):
    """Combine multiple SQL tables into a single target table.

    Sources should be provided as fully-qualified schema.table strings.
    """

    def __init__(
        self,
        config,
        table: str,
        sources: Iterable[str],
        schema: str = SCHEMA_DEFAULT,
        transaction_mode: TransactionMode = TransactionMode.BATCH,
        engine: str = "pyodbc",
    ):
        """Store and validate the combine parameters.

        Args:
            config: Passed unchanged to the base class initializer.
            table: Name of the target table.
            sources: ``schema.table`` names of the tables to combine; a single
                string is treated as one source.
            schema: Schema of the target table.
            transaction_mode: Stored after ``coerce_transaction_mode``.
            engine: Must be ``"pyodbc"`` or ``"duckdb"``.
        """
        super().__init__(config)
        self.schema = validate_identifier(schema)
        self.table = validate_identifier(table)
        # A bare string is itself iterable; wrap it so it is not split into
        # single characters.
        if isinstance(sources, str):
            sources = [sources]
        self.sources: list[str] = list(sources)
        self.transaction_mode = coerce_transaction_mode(transaction_mode)
        validate_engine(engine, allowed=frozenset({"pyodbc", "duckdb"}))
        self.engine = engine

    def _parse_schema_table(self, fq: str) -> tuple[str, str]:
        """Split a ``schema.table`` string into its two validated parts.

        Raises:
            ValueError: If ``fq`` does not have exactly two dot-separated parts.
        """
        parts = fq.split(".")
        if len(parts) != 2:
            raise ValueError(f"Invalid source table name: {fq}")
        schema, table = parts
        validate_identifier(schema)
        validate_identifier(table)
        return schema, table

    def _ensure_target_exists_and_schema(
        self,
        cur,
        first_source: str,
        schema: str | None = None,
        table: str | None = None,
    ):
        """Create the target from ``first_source`` or check its columns match.

        Args:
            cur: Open database cursor.
            first_source: ``schema.table`` name of the reference source table.
            schema: Target schema; defaults to ``self.schema``.
            table: Target table; defaults to ``self.table``.

        Returns:
            The column names of ``first_source``.

        Raises:
            SchemaMismatchError: If the target exists and its set of column
                names differs from the source's.
        """
        # ``combine`` passes the target explicitly; fall back to the instance
        # values so two-argument calls work as well.
        tgt_schema = self.schema if schema is None else validate_identifier(schema)
        tgt_table = self.table if table is None else validate_identifier(table)

        # If target exists, validate schema equality;
        # otherwise create from first source (no rows)
        cur.execute("SELECT OBJECT_ID(?)", (f"{tgt_schema}.{tgt_table}",))
        exists = cur.fetchone()[0] is not None

        # Get column list for source
        src_schema, src_table = self._parse_schema_table(first_source)
        q_src = f"{quote_identifier(src_schema)}.{quote_identifier(src_table)}"
        q_tgt = f"{quote_identifier(tgt_schema)}.{quote_identifier(tgt_table)}"
        cur.execute(f"SELECT TOP 0 * FROM {q_src}")
        src_cols = [c[0] for c in cur.description]

        if not exists:
            # Create target table with same schema as source
            cur.execute(f"SELECT TOP 0 * INTO {q_tgt} FROM {q_src}")
            cur.connection.commit()
            return src_cols

        # Target exists: get target columns
        cur.execute(f"SELECT TOP 0 * FROM {q_tgt}")
        tgt_cols = [c[0] for c in cur.description]

        # NOTE: only column *names* are compared, ignoring their order, while
        # ``combine`` inserts with ``SELECT *`` (positional).
        if set(src_cols) != set(tgt_cols):
            raise SchemaMismatchError(
                "Source and target schemas differ for "
                f"{first_source} vs {tgt_schema}.{tgt_table}"
            )

        return src_cols

    def combine(self):
        """Combine all source tables into the target table.

        Implementation: create target if missing using first source schema, then
        run `INSERT INTO target SELECT * FROM source` for each source.

        Each source is committed on its own. If one insert fails it is rolled
        back and the error re-raised; sources committed earlier stay in place.

        Raises:
            ValueError: If no source tables were provided.
        """
        if not self.sources:
            # Fail before a connection is opened instead of with an
            # IndexError on ``self.sources[0]``.
            raise ValueError("No source tables provided to combine")

        with self.connection_p(autocommit=False) as conn:
            with conn.cursor() as cur:
                # Ensure first source schema compatible / create target
                first_source = self.sources[0]
                self._ensure_target_exists_and_schema(
                    cur, first_source, self.schema, self.table
                )

                # Insert from each source
                q_tgt = (
                    f"{quote_identifier(self.schema)}"
                    f".{quote_identifier(self.table)}"
                )
                for src in self.sources:
                    src_schema, src_table = self._parse_schema_table(src)
                    q_src = (
                        f"{quote_identifier(src_schema)}.{quote_identifier(src_table)}"
                    )
                    logger.info(
                        "Combining "
                        + context(
                            source=f"{src_schema}.{src_table}",
                            target=f"{self.schema}.{self.table}",
                        )
                    )
                    try:
                        cur.execute(f"INSERT INTO {q_tgt} SELECT * FROM {q_src}")
                        conn.commit()
                    except Exception:
                        conn.rollback()
                        raise

        logger.info("Table combine completed")
|