permafrost-framework 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- permafrost/__init__.py +114 -0
- permafrost/__main__.py +64 -0
- permafrost/catalog.py +441 -0
- permafrost/chunk_mode.py +225 -0
- permafrost/cli.py +419 -0
- permafrost/cluster.py +621 -0
- permafrost/codec.py +535 -0
- permafrost/schema_detector.py +250 -0
- permafrost/spark.py +464 -0
- permafrost/storage.py +620 -0
- permafrost_framework-0.6.0.dist-info/METADATA +97 -0
- permafrost_framework-0.6.0.dist-info/RECORD +16 -0
- permafrost_framework-0.6.0.dist-info/WHEEL +5 -0
- permafrost_framework-0.6.0.dist-info/entry_points.txt +2 -0
- permafrost_framework-0.6.0.dist-info/licenses/LICENSE +121 -0
- permafrost_framework-0.6.0.dist-info/top_level.txt +1 -0
permafrost/__init__.py
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Permafrost Data Framework
|
|
3
|
+
=========================
|
|
4
|
+
Plataforma distribuída de compressão inteligente para arquivamento digital de longo prazo.
|
|
5
|
+
|
|
6
|
+
Uso rápido:
|
|
7
|
+
from permafrost import freeze, thaw, audit
|
|
8
|
+
from permafrost import PermafrostCatalog, SchemaDetector
|
|
9
|
+
from permafrost import freeze_to, thaw_from # cloud
|
|
10
|
+
from permafrost import PermafrostMaster, PermafrostWorker, PermafrostClient # cluster
|
|
11
|
+
|
|
12
|
+
Formatos suportados:
|
|
13
|
+
freeze(df, "arquivo.permafrost") # DataFrame tabular
|
|
14
|
+
freeze(detector.detect("dados.jsonl")[0], ...) # JSONL / NoSQL
|
|
15
|
+
freeze_file("dados.csv", "saida.permafrost") # streaming, sem carregar tudo na RAM
|
|
16
|
+
freeze_to(df, "s3://bucket/dados.permafrost") # direto para cloud
|
|
17
|
+
|
|
18
|
+
Exemplos:
|
|
19
|
+
>>> import permafrost as pf
|
|
20
|
+
>>> metrics = pf.freeze(df, "vendas.permafrost", codec=pf.CODEC_LZMA2)
|
|
21
|
+
>>> print(f"Ratio: {metrics['ratio']:.2f}x")
|
|
22
|
+
>>> df_back = pf.thaw("vendas.permafrost")
|
|
23
|
+
>>> info = pf.audit("vendas.permafrost") # sem descomprimir
|
|
24
|
+
|
|
25
|
+
Links:
|
|
26
|
+
GitHub: https://github.com/caua-ferreira/permafrost-framework
|
|
27
|
+
Docs: https://github.com/caua-ferreira/permafrost-framework/tree/main/docs
|
|
28
|
+
"""
|
|
29
|
+
|
|
30
|
+
__version__ = "0.6.0"
|
|
31
|
+
__author__ = "Permafrost Contributors"
|
|
32
|
+
__license__ = "Apache-2.0"
|
|
33
|
+
|
|
34
|
+
# ── Core codec ────────────────────────────────────────────────────────────────
|
|
35
|
+
from permafrost.codec import (
|
|
36
|
+
freeze,
|
|
37
|
+
thaw,
|
|
38
|
+
audit,
|
|
39
|
+
# Codec IDs
|
|
40
|
+
CODEC_ZSTD,
|
|
41
|
+
CODEC_LZMA2,
|
|
42
|
+
CODEC_ZPAQ,
|
|
43
|
+
# Quantization levels
|
|
44
|
+
QUANT_NONE,
|
|
45
|
+
QUANT_HIGH,
|
|
46
|
+
QUANT_MEDIUM,
|
|
47
|
+
QUANT_LOW,
|
|
48
|
+
# Format constants
|
|
49
|
+
MAGIC,
|
|
50
|
+
EOF_MAGIC,
|
|
51
|
+
)
|
|
52
|
+
|
|
53
|
+
# ── Schema detection (SQL + NoSQL + JSONL) ────────────────────────────────────
|
|
54
|
+
from permafrost.schema_detector import (
|
|
55
|
+
SchemaDetector,
|
|
56
|
+
DataType,
|
|
57
|
+
FieldKind,
|
|
58
|
+
)
|
|
59
|
+
|
|
60
|
+
# ── Chunk mode (streaming — datasets > RAM) ───────────────────────────────────
|
|
61
|
+
from permafrost.chunk_mode import (
|
|
62
|
+
freeze_stream,
|
|
63
|
+
freeze_file,
|
|
64
|
+
thaw_iter,
|
|
65
|
+
)
|
|
66
|
+
|
|
67
|
+
# ── Catalog (DuckDB index) ────────────────────────────────────────────────────
|
|
68
|
+
from permafrost.catalog import PermafrostCatalog
|
|
69
|
+
|
|
70
|
+
# ── Cloud storage adapters ────────────────────────────────────────────────────
|
|
71
|
+
from permafrost.storage import (
|
|
72
|
+
LocalAdapter,
|
|
73
|
+
S3Adapter,
|
|
74
|
+
GCSAdapter,
|
|
75
|
+
AzureAdapter,
|
|
76
|
+
storage_from_uri,
|
|
77
|
+
parse_uri,
|
|
78
|
+
freeze_to,
|
|
79
|
+
thaw_from,
|
|
80
|
+
audit_remote,
|
|
81
|
+
)
|
|
82
|
+
|
|
83
|
+
# ── Cluster (distributed processing) ─────────────────────────────────────────
|
|
84
|
+
from permafrost.cluster import (
|
|
85
|
+
PermafrostMaster,
|
|
86
|
+
PermafrostWorker,
|
|
87
|
+
PermafrostClient,
|
|
88
|
+
)
|
|
89
|
+
|
|
90
|
+
# Names re-exported by ``from permafrost import *``.  Every entry must be bound
# by one of the imports earlier in this module.
__all__ = [
    # Core
    "freeze", "thaw", "audit",
    # Codecs
    "CODEC_ZSTD", "CODEC_LZMA2", "CODEC_ZPAQ",
    # Quant levels
    "QUANT_NONE", "QUANT_HIGH", "QUANT_MEDIUM", "QUANT_LOW",
    # Format constants (imported from permafrost.codec alongside the codec IDs)
    "MAGIC", "EOF_MAGIC",
    # Schema
    "SchemaDetector", "DataType", "FieldKind",
    # Chunk mode
    "freeze_stream", "freeze_file", "thaw_iter",
    # Catalog
    "PermafrostCatalog",
    # Storage
    "LocalAdapter", "S3Adapter", "GCSAdapter", "AzureAdapter",
    "storage_from_uri", "parse_uri", "freeze_to", "thaw_from", "audit_remote",
    # Cluster
    "PermafrostMaster", "PermafrostWorker", "PermafrostClient",
]

# ── Spark DataSource API v2 ───────────────────────────────────────────────────
# Optional integration: the Spark names are exported only when the import
# succeeds.  NOTE(review): presumably the ImportError comes from PySpark not
# being installed — any other ImportError raised inside permafrost.spark is
# silenced here as well.
try:
    from permafrost.spark import PermafrostDataSource, register as spark_register
except ImportError:
    pass  # Spark integration unavailable — nothing extra is exported
else:
    __all__ += ["PermafrostDataSource", "spark_register"]
|
permafrost/__main__.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Entrypoint para execução via `python -m permafrost`.
|
|
3
|
+
|
|
4
|
+
Uso:
|
|
5
|
+
python -m permafrost master [--host HOST] [--port PORT]
|
|
6
|
+
python -m permafrost worker --master URL [--host HOST] [--port PORT] [--id ID]
|
|
7
|
+
python -m permafrost freeze arquivo.csv
|
|
8
|
+
python -m permafrost thaw arquivo.permafrost
|
|
9
|
+
"""
|
|
10
|
+
import sys
|
|
11
|
+
|
|
12
|
+
def main():
    """Dispatch ``python -m permafrost <command>`` to the matching entry point.

    ``master`` and ``worker`` parse their own options with argparse and start
    the corresponding cluster node.  ``freeze``, ``thaw``, ``audit``,
    ``verify`` and ``catalog`` are forwarded to the CLI application in
    ``permafrost.cli``.  A missing or unknown command prints a usage message
    and exits with status 1.
    """
    if len(sys.argv) < 2:
        print("Uso: python -m permafrost <comando>")
        print("Comandos: master | worker | freeze | thaw | audit | catalog")
        sys.exit(1)

    command = sys.argv[1]

    if command == "master":
        # Heavy dependencies are imported lazily so other commands do not need them.
        import argparse
        import uvicorn
        from permafrost.cluster import PermafrostMaster

        parser = argparse.ArgumentParser(description="Permafrost Master node")
        parser.add_argument("--host", default="0.0.0.0")
        parser.add_argument("--port", type=int, default=8700)
        parser.add_argument("--max-retries", type=int, default=3)
        opts = parser.parse_args(sys.argv[2:])

        node = PermafrostMaster(host=opts.host, port=opts.port)
        # The retry limit is overridden on the instance after construction.
        node.MAX_RETRIES = opts.max_retries
        print(f"❄ Permafrost Master iniciando em {opts.host}:{opts.port}")
        uvicorn.run(node.app, host=opts.host, port=opts.port, log_level="info")
        return

    if command == "worker":
        import argparse
        import uvicorn
        from permafrost.cluster import PermafrostWorker

        parser = argparse.ArgumentParser(description="Permafrost Worker node")
        parser.add_argument("--master", required=True, help="URL do master (ex: http://master:8700)")
        parser.add_argument("--host", default="0.0.0.0")
        parser.add_argument("--port", type=int, default=8801)
        parser.add_argument("--id", default=None, help="ID único do worker")
        opts = parser.parse_args(sys.argv[2:])

        node = PermafrostWorker(
            master_url=opts.master,
            host=opts.host,
            port=opts.port,
            worker_id=opts.id,
        )
        print(f"❄ Permafrost Worker {node.worker_id} → {opts.master}")
        node.run(auto_register=True)
        return

    if command in ("freeze", "thaw", "audit", "verify", "catalog"):
        # Hand over to the CLI app, replacing the program name in argv[0]
        # so the CLI sees "permafrost <command> ...".
        from permafrost.cli import app

        sys.argv = ["permafrost", *sys.argv[1:]]
        app()
        return

    print(f"Comando desconhecido: {command}")
    print("Comandos disponíveis: master | worker | freeze | thaw | audit | catalog")
    sys.exit(1)


if __name__ == "__main__":
    main()
|
permafrost/catalog.py
ADDED
|
@@ -0,0 +1,441 @@
|
|
|
1
|
+
"""
|
|
2
|
+
PermafrostCatalog v1.0
|
|
3
|
+
Índice centralizado de arquivos .permafrost usando DuckDB.
|
|
4
|
+
|
|
5
|
+
Features:
|
|
6
|
+
- register(path) → indexa um arquivo lendo apenas header+footer
|
|
7
|
+
- register_dir(dir) → registra todos os .permafrost de um diretório
|
|
8
|
+
- search(...) → query SQL-like com filtros
|
|
9
|
+
- thaw(...) → thaw seletivo usando o catalog como roteador
|
|
10
|
+
- cost_report() → custo estimado por tier de storage
|
|
11
|
+
- integrity_check() → verifica SHA-256 de todos os arquivos registrados
|
|
12
|
+
- stats() → métricas gerais do catalog
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
import os, json, hashlib, time, re
|
|
16
|
+
import duckdb
|
|
17
|
+
import pandas as pd
|
|
18
|
+
import numpy as np
|
|
19
|
+
|
|
20
|
+
# Importar o codec
|
|
21
|
+
|
|
22
|
+
from permafrost.codec import audit as pf_audit, thaw as pf_thaw
|
|
23
|
+
|
|
24
|
+
# ── STORAGE PRICING ($/GB/mês) ────────────────────────────────────────────────
|
|
25
|
+
# Price table keyed by storage tier name; values are USD per GB per month.
STORAGE_PRICES = {
    's3_standard': 0.023,
    's3_ia': 0.0125,
    'glacier': 0.004,
    'glacier_deep': 0.00099,
}

# DDL executed when a catalog is opened.  Every statement uses IF NOT EXISTS,
# so running it against an existing catalog is a no-op.  The two sequences
# supply the integer primary keys of ``datasets`` and ``chunks``.
CATALOG_SCHEMA = """
CREATE TABLE IF NOT EXISTS datasets (
    id INTEGER PRIMARY KEY,
    name VARCHAR NOT NULL,
    path VARCHAR NOT NULL UNIQUE,
    registered_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    freeze_date TIMESTAMP,
    codec VARCHAR,
    quant_level INTEGER,
    orig_rows BIGINT,
    n_chunks INTEGER,
    chunk_rows INTEGER,
    file_size_bytes BIGINT,
    file_size_mb DOUBLE,
    partition_col VARCHAR,
    partition_keys VARCHAR, -- JSON array
    columns VARCHAR, -- JSON array
    comment VARCHAR,
    tags VARCHAR, -- JSON array
    schema_hash VARCHAR, -- SHA-256 dos nomes das colunas
    last_verified TIMESTAMP,
    verified_ok BOOLEAN
);

CREATE TABLE IF NOT EXISTS chunks (
    id INTEGER PRIMARY KEY,
    dataset_id INTEGER REFERENCES datasets(id),
    chunk_id INTEGER,
    row_start BIGINT,
    row_end BIGINT,
    part_key VARCHAR,
    part_col VARCHAR,
    byte_offset BIGINT,
    byte_len BIGINT,
    sha256 VARCHAR
);

CREATE SEQUENCE IF NOT EXISTS dataset_seq START 1;
CREATE SEQUENCE IF NOT EXISTS chunk_seq START 1;
"""
|
|
72
|
+
|
|
73
|
+
class PermafrostCatalog:
    """Centralized index of ``.permafrost`` files kept in an embedded DuckDB database.

    Metadata comes from the codec's ``audit()`` function (per the upstream
    documentation it reads only the header and the sparse index — confirm
    against ``permafrost.codec``) and is stored in two tables created from
    ``CATALOG_SCHEMA``:

    - ``datasets`` — one row per registered file
    - ``chunks``   — one row per sparse-index entry of that file

    Example:
        >>> import permafrost as pf
        >>> cat = pf.PermafrostCatalog(".permafrost_catalog.db")
        >>> cat.register_dir("/dados/cold/", tags=["producao"])
        >>> cat.search(partition_key="2023", lossless_only=True)
        >>> cat.cost_report("glacier_deep")
        >>> cat.integrity_check()
    """

    def __init__(self, catalog_path: str = ".permafrost_catalog.db"):
        """Open (or create) the DuckDB catalog at ``catalog_path``.

        The schema DDL is executed on every open and a short summary is
        printed to stdout.

        Args:
            catalog_path: Path of the DuckDB file.  ``":memory:"`` gives a
                non-persistent catalog, handy for tests.

        Example:
            >>> cat = PermafrostCatalog(".permafrost_catalog.db")
            >>> cat = PermafrostCatalog(":memory:")  # tests
        """
        self.catalog_path = catalog_path
        self.con = duckdb.connect(catalog_path)
        self.con.execute(CATALOG_SCHEMA)
        self._print_header()

    def _print_header(self):
        """Print the catalog path and the number of registered datasets."""
        n = self.con.execute("SELECT COUNT(*) FROM datasets").fetchone()[0]
        print(f"PermafrostCatalog → {self.catalog_path}")
        print(f" {n} dataset(s) registrado(s)\n")

    # ── REGISTER ──────────────────────────────────────────────────────────────
    def register(self, path: str, tags: list = None, name: str = None) -> dict:
        """Register one ``.permafrost`` file in the catalog.

        Args:
            path: File to register.  The string is stored and compared as
                given (no normalization), so the same file reached through a
                different spelling counts as a different dataset.
            tags: Optional list of tags, stored as a JSON array.
            name: Dataset name; defaults to the file name without extension.

        Returns:
            ``{'status': 'already_registered', 'path', 'id'}`` when ``path``
            is already present, otherwise ``{'status': 'registered', 'id',
            'name', 'path', 'rows', 'file_mb', 'n_chunks'}``.

        Raises:
            FileNotFoundError: ``path`` does not exist.
            KeyError: the audit result lacks a required field.
        """
        if not os.path.exists(path):
            raise FileNotFoundError(f"Arquivo não encontrado: {path}")

        existing = self.con.execute(
            "SELECT id FROM datasets WHERE path = ?", [path]
        ).fetchone()
        if existing:
            return {'status': 'already_registered', 'path': path, 'id': existing[0]}

        info = pf_audit(path)

        ds_name = name or os.path.splitext(os.path.basename(path))[0]
        # Short fingerprint of the column *names* (order-insensitive).
        schema_hash = hashlib.sha256(
            json.dumps(sorted(info['columns'])).encode()
        ).hexdigest()[:16]
        part_keys = json.dumps(info.get('partition_keys', []))
        columns_j = json.dumps(info['columns'])
        tags_j = json.dumps(tags or [])
        freeze_ts = info['freeze_date']

        # Materialize every chunk row *before* the first INSERT: a malformed
        # index entry then fails here instead of leaving a dataset row behind
        # with only part of its chunks (which would afterwards be reported as
        # 'already_registered').
        chunk_fields = (
            'chunk_id', 'row_start', 'row_end', 'part_key', 'part_col',
            'byte_offset', 'byte_len', 'sha256',
        )
        chunk_rows = [
            [entry[field] for field in chunk_fields]
            for entry in info.get('index_entries', [])
        ]

        ds_id = self.con.execute("SELECT nextval('dataset_seq')").fetchone()[0]
        self.con.execute("""
            INSERT INTO datasets
            (id, name, path, freeze_date, codec, quant_level, orig_rows,
             n_chunks, chunk_rows, file_size_bytes, file_size_mb,
             partition_col, partition_keys, columns, comment, tags, schema_hash)
            VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)
        """, [
            ds_id, ds_name, path, freeze_ts,
            info['codec'], info['quant'],
            info['orig_rows'], info['n_chunks'], info['chunk_rows'],
            # Exact on-disk size rather than a value re-derived from rounded MB.
            os.path.getsize(path), info['file_size_mb'],
            info.get('partition_col'), part_keys,
            columns_j, info.get('comment', ''), tags_j, schema_hash,
        ])

        for row in chunk_rows:
            chunk_pk = self.con.execute("SELECT nextval('chunk_seq')").fetchone()[0]
            self.con.execute("""
                INSERT INTO chunks
                (id, dataset_id, chunk_id, row_start, row_end,
                 part_key, part_col, byte_offset, byte_len, sha256)
                VALUES (?,?,?,?,?,?,?,?,?,?)
            """, [chunk_pk, ds_id, *row])

        return {
            'status': 'registered', 'id': ds_id, 'name': ds_name,
            'path': path, 'rows': info['orig_rows'],
            'file_mb': info['file_size_mb'], 'n_chunks': info['n_chunks'],
        }

    def register_dir(self, directory: str, tags: list = None, recursive: bool = False) -> list:
        """Register every ``*.permafrost`` file found in ``directory``.

        Best-effort: a file that fails to register is reported on stdout and
        recorded as ``{'status': 'error', 'path', 'error'}``; the scan goes on.

        Args:
            directory: Directory to scan.
            tags: Tags applied to every newly registered file.
            recursive: Also descend into sub-directories.

        Returns:
            One result dict per matching file, in scan order.
        """
        results = []
        if recursive:
            walk = os.walk(directory)
        else:
            walk = [(directory, [], os.listdir(directory))]

        for root, _, files in walk:
            for fname in sorted(files):
                if not fname.endswith('.permafrost'):
                    continue
                path = os.path.join(root, fname)
                try:
                    r = self.register(path, tags=tags)
                    results.append(r)
                    if r['status'] == 'registered':
                        print(f" ✓ {fname:30s} {r.get('rows',0):>8,} linhas | {r.get('file_mb',0):.3f} MB")
                    else:
                        print(f" ~ {fname:30s} já registrado")
                except Exception as e:
                    # Deliberate catch-all: one bad file must not abort the scan.
                    print(f" ✗ {fname}: {e}")
                    results.append({'status': 'error', 'path': path, 'error': str(e)})
        return results

    # ── SEARCH ────────────────────────────────────────────────────────────────
    def search(self,
               name: str = None,
               codec: str = None,
               partition_col: str = None,
               partition_key: str = None,
               columns_contain: str = None,
               min_rows: int = None,
               max_mb: float = None,
               tags_contain: str = None,
               lossless_only: bool = False) -> pd.DataFrame:
        """Search registered datasets; all filters are optional and AND-ed.

        Text filters flagged "substring" are matched with SQL ``LIKE
        '%value%'`` against the stored text (JSON text for ``partition_keys``,
        ``columns`` and ``tags``), so ``%`` and ``_`` in the value act as
        wildcards.

        Args:
            name: Substring of the dataset name.
            codec: Exact codec value.
            partition_col: Exact partition column name.
            partition_key: Substring of the stored partition-key list.
            columns_contain: Substring of the stored column list.
            min_rows: Minimum ``orig_rows`` (inclusive).
            max_mb: Maximum ``file_size_mb`` (inclusive); ``0`` is honoured.
            tags_contain: Substring of the stored tag list.
            lossless_only: Keep only datasets with ``quant_level = 0``.

        Returns:
            DataFrame with columns id, name, codec, quant, rows, n_chunks, mb,
            partition_col, freeze_date, comment — newest ``freeze_date`` first.
        """
        conditions = ["1=1"]
        params = []

        if name:
            conditions.append("name LIKE ?")
            params.append(f"%{name}%")
        if codec:
            conditions.append("codec = ?")
            params.append(codec)
        if partition_col:
            conditions.append("partition_col = ?")
            params.append(partition_col)
        if partition_key:
            conditions.append("partition_keys LIKE ?")
            params.append(f"%{partition_key}%")
        if columns_contain:
            conditions.append("columns LIKE ?")
            params.append(f"%{columns_contain}%")
        # Numeric bounds are tested against None: 0 is a legitimate bound and
        # must not be dropped by a truthiness check.
        if min_rows is not None:
            conditions.append("orig_rows >= ?")
            params.append(min_rows)
        if max_mb is not None:
            conditions.append("file_size_mb <= ?")
            params.append(max_mb)
        if tags_contain:
            conditions.append("tags LIKE ?")
            params.append(f"%{tags_contain}%")
        if lossless_only:
            conditions.append("quant_level = 0")

        # Only fixed clause text is interpolated; user values travel as parameters.
        where = " AND ".join(conditions)
        sql = f"""
            SELECT id, name, codec, quant_level as quant,
                   orig_rows as rows, n_chunks, file_size_mb as mb,
                   partition_col, freeze_date, comment
            FROM datasets
            WHERE {where}
            ORDER BY freeze_date DESC
        """
        return self.con.execute(sql, params).df()

    def search_chunks(self, dataset_name: str, part_key: str = None) -> pd.DataFrame:
        """List the chunks of every dataset whose name contains ``dataset_name``.

        Args:
            dataset_name: Substring of the dataset name (SQL ``LIKE``).
            part_key: Optional substring of the chunk's partition key.

        Returns:
            DataFrame with chunk_id, row_start, row_end, part_key, byte_offset,
            byte_len, sha256 and ``kb`` (byte_len / 1024, one decimal),
            ordered by chunk_id.
        """
        sql = """
            SELECT c.chunk_id, c.row_start, c.row_end,
                   c.part_key, c.byte_offset, c.byte_len,
                   c.sha256, round(c.byte_len/1024.0, 1) as kb
            FROM chunks c
            JOIN datasets d ON c.dataset_id = d.id
            WHERE d.name LIKE ?
        """
        params = [f"%{dataset_name}%"]
        if part_key:
            sql += " AND c.part_key LIKE ?"
            params.append(f"%{part_key}%")
        sql += " ORDER BY c.chunk_id"
        return self.con.execute(sql, params).df()

    # ── THAW via CATALOG ──────────────────────────────────────────────────────
    def thaw(self, name: str, filter: dict = None, row_range: tuple = None,
             verify: bool = True) -> pd.DataFrame:
        """Resolve a dataset by name and thaw it through the codec.

        The name is matched as a substring.  When several datasets match, an
        exact name match wins, then the lowest catalog id, so the choice is
        deterministic.

        Args:
            name: Dataset name or a substring of it.
            filter: Passed through unchanged to the codec's ``thaw``.
            row_range: Passed through unchanged to the codec's ``thaw``.
            verify: Passed through unchanged to the codec's ``thaw``.

        Returns:
            The DataFrame produced by the codec.

        Raises:
            KeyError: no registered dataset matches ``name``.
        """
        result = self.con.execute(
            """
            SELECT path FROM datasets
            WHERE name LIKE ?
            ORDER BY CASE WHEN name = ? THEN 0 ELSE 1 END, id
            LIMIT 1
            """,
            [f"%{name}%", name],
        ).fetchone()
        if not result:
            raise KeyError(f"Dataset '{name}' não encontrado no catalog. Use search() para listar.")
        path = result[0]

        print(f" thaw: {os.path.basename(path)}", end="")
        t0 = time.time()
        df = pf_thaw(path, verify=verify, filter=filter, row_range=row_range)
        tt = time.time() - t0
        print(f" → {len(df):,} linhas em {tt:.3f}s")
        return df

    # ── COST REPORT ───────────────────────────────────────────────────────────
    def cost_report(self, tier: str = 'glacier_deep') -> pd.DataFrame:
        """Estimate storage cost per dataset for one pricing tier.

        Args:
            tier: Key of ``STORAGE_PRICES`` (``s3_standard``, ``s3_ia``,
                ``glacier``, ``glacier_deep``).  An unknown tier is priced
                with the ``glacier_deep`` rate, while the ``tier`` column
                still shows the label that was passed in.

        Returns:
            DataFrame (largest file first) with name, codec, quant, rows,
            size_mb, n_chunks, freeze_date plus ``cost_monthly_usd``,
            ``cost_annual_usd``, ``cost_3yr_usd`` and ``tier``.
        """
        price = STORAGE_PRICES.get(tier, STORAGE_PRICES['glacier_deep'])
        sql = """
            SELECT
                name,
                codec,
                CASE quant_level
                    WHEN 0 THEN 'lossless'
                    WHEN 1 THEN 'high'
                    WHEN 2 THEN 'medium'
                    ELSE 'low'
                END as quant,
                orig_rows as rows,
                round(file_size_mb, 3) as size_mb,
                n_chunks,
                freeze_date
            FROM datasets
            ORDER BY file_size_mb DESC
        """
        df = self.con.execute(sql).df()
        # Prices are per GB per month; size_mb / 1024 converts MB to GB.
        df['cost_monthly_usd'] = (df['size_mb'] / 1024) * price
        df['cost_annual_usd'] = df['cost_monthly_usd'] * 12
        df['cost_3yr_usd'] = df['cost_monthly_usd'] * 36
        df['tier'] = tier
        return df

    # ── INTEGRITY CHECK ───────────────────────────────────────────────────────
    def integrity_check(self, name_filter: str = None) -> pd.DataFrame:
        """Verify the SHA-256 of every catalogued chunk against the file on disk.

        For each chunk the byte range ``[byte_offset, byte_offset + byte_len)``
        is read from the file and hashed as stored; nothing is decompressed.
        Chunks are read one at a time, so memory use is bounded by the largest
        chunk instead of the whole archive.  The ``last_verified`` and
        ``verified_ok`` columns of each checked dataset are updated.

        Args:
            name_filter: Optional substring of the dataset name (SQL ``LIKE``).

        Returns:
            DataFrame with one row per dataset: name, path, status (``OK``,
            ``CORRUPTED`` or ``FILE_MISSING``), chunks_ok and chunks_fail.
        """
        sql = "SELECT id, name, path FROM datasets"
        params = []
        if name_filter:
            sql += " WHERE name LIKE ?"
            params.append(f"%{name_filter}%")

        datasets_rows = self.con.execute(sql, params).fetchall()
        results = []

        for ds_id, ds_name, path in datasets_rows:
            if not os.path.exists(path):
                results.append({'name': ds_name, 'path': path, 'status': 'FILE_MISSING',
                                'chunks_ok': 0, 'chunks_fail': 0})
                continue

            # Ascending offsets turn the verification into a forward scan.
            chunks = self.con.execute(
                "SELECT chunk_id, byte_offset, byte_len, sha256 FROM chunks "
                "WHERE dataset_id = ? ORDER BY byte_offset",
                [ds_id]
            ).fetchall()

            ok_count = fail_count = 0
            with open(path, 'rb') as f:
                for _chunk_id, offset, length, sha_stored in chunks:
                    f.seek(offset)
                    # A truncated file yields a short read, which simply
                    # fails the hash comparison below.
                    blob = f.read(length)
                    if hashlib.sha256(blob).hexdigest() == sha_stored:
                        ok_count += 1
                    else:
                        fail_count += 1

            status = 'OK' if fail_count == 0 else 'CORRUPTED'
            self.con.execute("""
                UPDATE datasets SET last_verified = CURRENT_TIMESTAMP, verified_ok = ?
                WHERE id = ?
            """, [fail_count == 0, ds_id])
            results.append({
                'name': ds_name, 'status': status,
                'chunks_ok': ok_count, 'chunks_fail': fail_count,
                'path': path,
            })

        return pd.DataFrame(results)

    # ── STATS ─────────────────────────────────────────────────────────────────
    def stats(self) -> dict:
        """Return aggregate metrics over all registered datasets.

        Returns:
            Dictionary such as::

                {
                    "total_datasets": 4,
                    "total_rows": 540000,
                    "total_mb": 2.964,
                    "total_chunks": 54,
                    "avg_mb_per_1k_rows": 0.0055,
                    "distinct_codecs": 2,
                    "distinct_partitions": 1,
                    "lossless_count": 3,   # quant_level = 0
                    "vault_count": 1,      # quant_level > 0
                }

            The SUM/AVG based values are ``None`` when the catalog is empty.

        Example:
            >>> s = cat.stats()
            >>> print(f"{s['total_datasets']} datasets, {s['total_rows']:,} linhas")
        """
        r = self.con.execute("""
            SELECT
                COUNT(*) as total_datasets,
                SUM(orig_rows) as total_rows,
                SUM(file_size_mb) as total_mb,
                SUM(n_chunks) as total_chunks,
                AVG(file_size_mb/NULLIF(orig_rows/1000.0,0)) as avg_mb_per_1k_rows,
                COUNT(DISTINCT codec) as distinct_codecs,
                COUNT(DISTINCT partition_col) as distinct_partitions,
                SUM(CASE WHEN quant_level=0 THEN 1 ELSE 0 END) as lossless_count,
                SUM(CASE WHEN quant_level>0 THEN 1 ELSE 0 END) as vault_count
            FROM datasets
        """).fetchone()

        # Labels follow the SELECT column order exactly.
        labels = ['total_datasets', 'total_rows', 'total_mb', 'total_chunks',
                  'avg_mb_per_1k_rows', 'distinct_codecs', 'distinct_partitions',
                  'lossless_count', 'vault_count']
        return dict(zip(labels, r))

    # ── DIRECT SQL ────────────────────────────────────────────────────────────
    def sql(self, query: str) -> pd.DataFrame:
        """Run ``query`` verbatim on the catalog connection and return a DataFrame.

        The statement is not restricted or sanitized; do not pass untrusted text.
        """
        return self.con.execute(query).df()

    def __repr__(self):
        n = self.con.execute("SELECT COUNT(*) FROM datasets").fetchone()[0]
        return f"<PermafrostCatalog path='{self.catalog_path}' datasets={n}>"
|
|
436
|
+
|
|
437
|
+
|
|
438
|
+
# Banner written to stdout every time this module is imported.
for _banner_line in (
    "permafrost_catalog.py carregado",
    " Classes: PermafrostCatalog",
    " Métodos: register, register_dir, search, search_chunks, thaw,",
    " cost_report, integrity_check, stats, sql",
):
    print(_banner_line)
|