permafrost-framework 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
permafrost/__init__.py ADDED
@@ -0,0 +1,114 @@
1
+ """
2
+ Permafrost Data Framework
3
+ =========================
4
+ Plataforma distribuída de compressão inteligente para arquivamento digital de longo prazo.
5
+
6
+ Uso rápido:
7
+ from permafrost import freeze, thaw, audit
8
+ from permafrost import PermafrostCatalog, SchemaDetector
9
+ from permafrost import freeze_to, thaw_from # cloud
10
+ from permafrost import PermafrostMaster, PermafrostWorker, PermafrostClient # cluster
11
+
12
+ Formatos suportados:
13
+ freeze(df, "arquivo.permafrost") # DataFrame tabular
14
+ freeze(detector.detect("dados.jsonl")[0], ...) # JSONL / NoSQL
15
+ freeze_file("dados.csv", "saida.permafrost") # streaming, sem carregar tudo na RAM
16
+ freeze_to(df, "s3://bucket/dados.permafrost") # direto para cloud
17
+
18
+ Exemplos:
19
+ >>> import permafrost as pf
20
+ >>> metrics = pf.freeze(df, "vendas.permafrost", codec=pf.CODEC_LZMA2)
21
+ >>> print(f"Ratio: {metrics['ratio']:.2f}x")
22
+ >>> df_back = pf.thaw("vendas.permafrost")
23
+ >>> info = pf.audit("vendas.permafrost") # sem descomprimir
24
+
25
+ Links:
26
+ GitHub: https://github.com/caua-ferreira/permafrost-framework
27
+ Docs: https://github.com/caua-ferreira/permafrost-framework/tree/main/docs
28
+ """
29
+
30
# Package metadata exposed as module-level attributes of ``permafrost``.
__version__ = "0.6.0"
__author__ = "Permafrost Contributors"
__license__ = "Apache-2.0"
33
+
34
+ # ── Core codec ────────────────────────────────────────────────────────────────
35
+ from permafrost.codec import (
36
+ freeze,
37
+ thaw,
38
+ audit,
39
+ # Codec IDs
40
+ CODEC_ZSTD,
41
+ CODEC_LZMA2,
42
+ CODEC_ZPAQ,
43
+ # Quantization levels
44
+ QUANT_NONE,
45
+ QUANT_HIGH,
46
+ QUANT_MEDIUM,
47
+ QUANT_LOW,
48
+ # Format constants
49
+ MAGIC,
50
+ EOF_MAGIC,
51
+ )
52
+
53
+ # ── Schema detection (SQL + NoSQL + JSONL) ────────────────────────────────────
54
+ from permafrost.schema_detector import (
55
+ SchemaDetector,
56
+ DataType,
57
+ FieldKind,
58
+ )
59
+
60
+ # ── Chunk mode (streaming — datasets > RAM) ───────────────────────────────────
61
+ from permafrost.chunk_mode import (
62
+ freeze_stream,
63
+ freeze_file,
64
+ thaw_iter,
65
+ )
66
+
67
+ # ── Catalog (DuckDB index) ────────────────────────────────────────────────────
68
+ from permafrost.catalog import PermafrostCatalog
69
+
70
+ # ── Cloud storage adapters ────────────────────────────────────────────────────
71
+ from permafrost.storage import (
72
+ LocalAdapter,
73
+ S3Adapter,
74
+ GCSAdapter,
75
+ AzureAdapter,
76
+ storage_from_uri,
77
+ parse_uri,
78
+ freeze_to,
79
+ thaw_from,
80
+ audit_remote,
81
+ )
82
+
83
+ # ── Cluster (distributed processing) ─────────────────────────────────────────
84
+ from permafrost.cluster import (
85
+ PermafrostMaster,
86
+ PermafrostWorker,
87
+ PermafrostClient,
88
+ )
89
+
90
# Names exported by ``from permafrost import *``, grouped by the submodule
# they are imported from.
# NOTE(review): MAGIC and EOF_MAGIC are imported from permafrost.codec in this
# file but are not listed here — confirm whether that omission is intentional.
__all__ = [
    # Core
    "freeze",
    "thaw",
    "audit",
    # Codecs
    "CODEC_ZSTD",
    "CODEC_LZMA2",
    "CODEC_ZPAQ",
    # Quantization levels
    "QUANT_NONE",
    "QUANT_HIGH",
    "QUANT_MEDIUM",
    "QUANT_LOW",
    # Schema
    "SchemaDetector",
    "DataType",
    "FieldKind",
    # Chunk mode
    "freeze_stream",
    "freeze_file",
    "thaw_iter",
    # Catalog
    "PermafrostCatalog",
    # Storage
    "LocalAdapter",
    "S3Adapter",
    "GCSAdapter",
    "AzureAdapter",
    "storage_from_uri",
    "parse_uri",
    "freeze_to",
    "thaw_from",
    "audit_remote",
    # Cluster
    "PermafrostMaster",
    "PermafrostWorker",
    "PermafrostClient",
]

# ── Spark DataSource API v2 ───────────────────────────────────────────────────
# The Spark integration is optional: when importing it raises ImportError the
# package simply does not export the Spark names.
try:
    from permafrost.spark import PermafrostDataSource, register as spark_register
except ImportError:
    pass  # Spark integration unavailable — fine
else:
    __all__.extend(["PermafrostDataSource", "spark_register"])
permafrost/__main__.py ADDED
@@ -0,0 +1,64 @@
1
+ """
2
+ Entrypoint para execução via `python -m permafrost`.
3
+
4
+ Uso:
5
+ python -m permafrost master [--host HOST] [--port PORT]
6
+ python -m permafrost worker --master URL [--host HOST] [--port PORT] [--id ID]
7
+ python -m permafrost freeze arquivo.csv
8
+ python -m permafrost thaw arquivo.permafrost
9
+ """
10
+ import sys
11
+
12
def main():
    """Dispatch ``python -m permafrost <command>`` to the matching entry point.

    Commands:
        master: parse ``--host``/``--port``/``--max-retries`` and serve the
            ``PermafrostMaster`` app with uvicorn.
        worker: parse ``--master``/``--host``/``--port``/``--id`` and run a
            ``PermafrostWorker`` with ``auto_register=True``.
        freeze, thaw, audit, verify, catalog: forwarded unchanged to the
            ``permafrost.cli`` app.

    Exits with status 1 (after printing the list of commands) when no command
    or an unknown command is given.
    """
    # Single source of truth for the commands forwarded to the CLI app, so the
    # help text can never drift from what the dispatcher actually accepts.
    cli_commands = ("freeze", "thaw", "audit", "verify", "catalog")
    command_list = " | ".join(("master", "worker") + cli_commands)

    if len(sys.argv) < 2:
        print("Uso: python -m permafrost <comando>")
        print(f"Comandos: {command_list}")
        sys.exit(1)

    cmd = sys.argv[1]

    if cmd == "master":
        # Imported lazily so the other commands do not require these modules.
        import argparse
        import uvicorn
        from permafrost.cluster import PermafrostMaster

        p = argparse.ArgumentParser(
            # Without an explicit prog, --help would show "__main__.py".
            prog="python -m permafrost master",
            description="Permafrost Master node",
        )
        p.add_argument("--host", default="0.0.0.0")
        p.add_argument("--port", type=int, default=8700)
        p.add_argument("--max-retries", type=int, default=3)
        args = p.parse_args(sys.argv[2:])
        master = PermafrostMaster(host=args.host, port=args.port)
        master.MAX_RETRIES = args.max_retries
        print(f"❄ Permafrost Master iniciando em {args.host}:{args.port}")
        uvicorn.run(master.app, host=args.host, port=args.port, log_level="info")

    elif cmd == "worker":
        import argparse
        # Not referenced directly in this branch; presumably needed by
        # worker.run() — kept so a missing install still fails at this point,
        # as it did before. TODO confirm.
        import uvicorn  # noqa: F401
        from permafrost.cluster import PermafrostWorker

        p = argparse.ArgumentParser(
            prog="python -m permafrost worker",
            description="Permafrost Worker node",
        )
        p.add_argument("--master", required=True, help="URL do master (ex: http://master:8700)")
        p.add_argument("--host", default="0.0.0.0")
        p.add_argument("--port", type=int, default=8801)
        p.add_argument("--id", default=None, help="ID único do worker")
        args = p.parse_args(sys.argv[2:])
        worker = PermafrostWorker(
            master_url=args.master,
            host=args.host,
            port=args.port,
            worker_id=args.id,
        )
        print(f"❄ Permafrost Worker {worker.worker_id} → {args.master}")
        worker.run(auto_register=True)

    elif cmd in cli_commands:
        # Delegate to the CLI app, presenting "permafrost" as the program name.
        from permafrost.cli import app

        sys.argv = ["permafrost"] + sys.argv[1:]
        app()

    else:
        print(f"Comando desconhecido: {cmd}")
        print(f"Comandos disponíveis: {command_list}")
        sys.exit(1)


if __name__ == "__main__":
    main()
permafrost/catalog.py ADDED
@@ -0,0 +1,441 @@
1
+ """
2
+ PermafrostCatalog v1.0
3
+ Índice centralizado de arquivos .permafrost usando DuckDB.
4
+
5
+ Features:
6
+ - register(path) → indexa um arquivo lendo apenas header+footer
7
+ - register_dir(dir) → registra todos os .permafrost de um diretório
8
+ - search(...) → query SQL-like com filtros
9
+ - thaw(...) → thaw seletivo usando o catalog como roteador
10
+ - cost_report() → custo estimado por tier de storage
11
+ - integrity_check() → verifica SHA-256 de todos os arquivos registrados
12
+ - stats() → métricas gerais do catalog
13
+ """
14
+
15
+ import os, json, hashlib, time, re
16
+ import duckdb
17
+ import pandas as pd
18
+ import numpy as np
19
+
20
+ # Importar o codec
21
+
22
+ from permafrost.codec import audit as pf_audit, thaw as pf_thaw
23
+
24
+ # ── STORAGE PRICING ($/GB/mês) ────────────────────────────────────────────────
25
# Price per storage tier, keyed by tier name. The section heading in this file
# gives the unit as $/GB/month. PermafrostCatalog.cost_report() looks tiers up
# in this table.
STORAGE_PRICES = {
    's3_standard': 0.023,
    's3_ia': 0.0125,
    'glacier': 0.004,
    'glacier_deep':0.00099,
}
31
+
32
# DDL executed once per PermafrostCatalog construction. Every statement uses
# IF NOT EXISTS, so running it against an existing catalog is a no-op.
#   datasets    — one row per registered .permafrost file
#   chunks      — one row per sparse-index entry, linked by dataset_id
#   dataset_seq / chunk_seq — sequences that supply the integer primary keys
CATALOG_SCHEMA = """
CREATE TABLE IF NOT EXISTS datasets (
id INTEGER PRIMARY KEY,
name VARCHAR NOT NULL,
path VARCHAR NOT NULL UNIQUE,
registered_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
freeze_date TIMESTAMP,
codec VARCHAR,
quant_level INTEGER,
orig_rows BIGINT,
n_chunks INTEGER,
chunk_rows INTEGER,
file_size_bytes BIGINT,
file_size_mb DOUBLE,
partition_col VARCHAR,
partition_keys VARCHAR, -- JSON array
columns VARCHAR, -- JSON array
comment VARCHAR,
tags VARCHAR, -- JSON array
schema_hash VARCHAR, -- SHA-256 dos nomes das colunas
last_verified TIMESTAMP,
verified_ok BOOLEAN
);

CREATE TABLE IF NOT EXISTS chunks (
id INTEGER PRIMARY KEY,
dataset_id INTEGER REFERENCES datasets(id),
chunk_id INTEGER,
row_start BIGINT,
row_end BIGINT,
part_key VARCHAR,
part_col VARCHAR,
byte_offset BIGINT,
byte_len BIGINT,
sha256 VARCHAR
);

CREATE SEQUENCE IF NOT EXISTS dataset_seq START 1;
CREATE SEQUENCE IF NOT EXISTS chunk_seq START 1;
"""
72
+
73
class PermafrostCatalog:
    """Central index of ``.permafrost`` files stored in an embedded DuckDB database.

    Files are registered from the metadata returned by the codec's ``audit()``
    function. The database has two tables:

    - ``datasets`` — one row per registered file
    - ``chunks``   — one row per sparse-index entry of a file

    The catalog can be used as a context manager so the DuckDB connection is
    closed deterministically.

    Example:
        >>> import permafrost as pf
        >>> cat = pf.PermafrostCatalog(".permafrost_catalog.db")
        >>> cat.register_dir("/dados/cold/", tags=["producao"])
        >>> cat.search(partition_key="2023", lossless_only=True)
        >>> cat.cost_report("glacier_deep")
        >>> cat.integrity_check()
    """

    def __init__(self, catalog_path: str = ".permafrost_catalog.db"):
        """Open (or create) the DuckDB catalog at ``catalog_path``.

        Creates the tables and sequences if they do not exist yet and prints a
        short header with the number of registered datasets.

        Args:
            catalog_path: Path of the DuckDB file. Use ``":memory:"`` for a
                non-persistent catalog (e.g. in tests).

        Example:
            >>> cat = PermafrostCatalog(".permafrost_catalog.db")
            >>> cat = PermafrostCatalog(":memory:")  # tests
        """
        self.catalog_path = catalog_path
        self.con = duckdb.connect(catalog_path)
        self.con.execute(CATALOG_SCHEMA)
        self._print_header()

    # ── LIFECYCLE ─────────────────────────────────────────────────────────────
    def close(self) -> None:
        """Close the underlying DuckDB connection."""
        self.con.close()

    def __enter__(self):
        """Return the catalog itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        """Close the connection on leaving the ``with`` block."""
        self.close()

    def _print_header(self):
        """Print the catalog path and the number of registered datasets."""
        n = self.con.execute("SELECT COUNT(*) FROM datasets").fetchone()[0]
        print(f"PermafrostCatalog → {self.catalog_path}")
        print(f" {n} dataset(s) registrado(s)\n")

    # ── REGISTER ──────────────────────────────────────────────────────────────
    def register(self, path: str, tags: list = None, name: str = None) -> dict:
        """Register one ``.permafrost`` file from its ``audit()`` metadata.

        The dataset row and all of its chunk rows are written in a single
        transaction, so a failure cannot leave a half-registered dataset.

        Args:
            path: Path of the file to register. Stored exactly as given.
            tags: Optional list of tags, stored as a JSON array.
            name: Dataset name; defaults to the file name without extension.

        Returns:
            ``{'status': 'already_registered', 'path', 'id'}`` when ``path`` is
            already in the catalog, otherwise ``{'status': 'registered', 'id',
            'name', 'path', 'rows', 'file_mb', 'n_chunks'}``.

        Raises:
            FileNotFoundError: If ``path`` does not exist.
        """
        if not os.path.exists(path):
            raise FileNotFoundError(f"Arquivo não encontrado: {path}")

        existing = self.con.execute(
            "SELECT id FROM datasets WHERE path = ?", [path]
        ).fetchone()
        if existing:
            return {'status': 'already_registered', 'path': path, 'id': existing[0]}

        info = pf_audit(path)

        ds_name = name or os.path.splitext(os.path.basename(path))[0]
        # Short fingerprint of the (sorted) column names.
        schema_hash = hashlib.sha256(
            json.dumps(sorted(info['columns'])).encode()
        ).hexdigest()[:16]
        part_keys = json.dumps(info.get('partition_keys', []))
        columns_j = json.dumps(info['columns'])
        tags_j = json.dumps(tags or [])
        freeze_ts = info['freeze_date']
        # Exact on-disk size; deriving it from the MB figure would only give an
        # approximation of the byte count.
        file_size_bytes = os.path.getsize(path)

        ds_id = self.con.execute("SELECT nextval('dataset_seq')").fetchone()[0]

        self.con.begin()
        try:
            self.con.execute("""
                INSERT INTO datasets
                (id, name, path, freeze_date, codec, quant_level, orig_rows,
                 n_chunks, chunk_rows, file_size_bytes, file_size_mb,
                 partition_col, partition_keys, columns, comment, tags, schema_hash)
                VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)
            """, [
                ds_id, ds_name, path, freeze_ts,
                info['codec'], info['quant'],
                info['orig_rows'], info['n_chunks'], info['chunk_rows'],
                file_size_bytes, info['file_size_mb'],
                info.get('partition_col'), part_keys,
                columns_j, info.get('comment', ''), tags_j, schema_hash,
            ])

            # Mirror the file's sparse index, one row per entry.
            for entry in info.get('index_entries', []):
                chunk_pk = self.con.execute("SELECT nextval('chunk_seq')").fetchone()[0]
                self.con.execute("""
                    INSERT INTO chunks
                    (id, dataset_id, chunk_id, row_start, row_end,
                     part_key, part_col, byte_offset, byte_len, sha256)
                    VALUES (?,?,?,?,?,?,?,?,?,?)
                """, [
                    chunk_pk, ds_id,
                    entry['chunk_id'], entry['row_start'], entry['row_end'],
                    entry['part_key'], entry['part_col'],
                    entry['byte_offset'], entry['byte_len'], entry['sha256'],
                ])
            self.con.commit()
        except Exception:
            # Undo the partial insert, then let the caller see the real error.
            self.con.rollback()
            raise

        return {
            'status': 'registered', 'id': ds_id, 'name': ds_name,
            'path': path, 'rows': info['orig_rows'],
            'file_mb': info['file_size_mb'], 'n_chunks': info['n_chunks'],
        }

    def register_dir(self, directory: str, tags: list = None, recursive: bool = False) -> list:
        """Register every ``*.permafrost`` file found in ``directory``.

        Prints one progress line per file. A file that fails to register does
        not stop the scan; it is reported as an ``'error'`` entry instead.

        Args:
            directory: Directory to scan.
            tags: Tags applied to every file registered by this call.
            recursive: Also descend into sub-directories when true.

        Returns:
            One result dict per matching file: the ``register()`` result, or
            ``{'status': 'error', 'path', 'error'}`` on failure.
        """
        results = []
        walk = os.walk(directory) if recursive else [(directory, [], os.listdir(directory))]
        for root, _, files in walk:
            for fname in sorted(files):
                if not fname.endswith('.permafrost'):
                    continue
                path = os.path.join(root, fname)
                try:
                    r = self.register(path, tags=tags)
                    results.append(r)
                    if r['status'] == 'registered':
                        print(f" ✓ {fname:30s} {r.get('rows',0):>8,} linhas | {r.get('file_mb',0):.3f} MB")
                    else:
                        print(f" ~ {fname:30s} já registrado")
                except Exception as e:
                    # Deliberate best effort: report the failure and keep going.
                    print(f" ✗ {fname}: {e}")
                    results.append({'status': 'error', 'path': path, 'error': str(e)})
        return results

    # ── SEARCH ────────────────────────────────────────────────────────────────
    def search(self,
               name: str = None,
               codec: str = None,
               partition_col: str = None,
               partition_key: str = None,
               columns_contain: str = None,
               min_rows: int = None,
               max_mb: float = None,
               tags_contain: str = None,
               lossless_only: bool = False) -> pd.DataFrame:
        """Search registered datasets; all filters are optional and AND-ed.

        Args:
            name: Substring of the dataset name.
            codec: Exact codec value.
            partition_col: Exact partition column name.
            partition_key: Substring of the JSON-encoded partition key list.
            columns_contain: Substring of the JSON-encoded column list.
            min_rows: Minimum ``orig_rows`` (``0`` is a valid bound).
            max_mb: Maximum ``file_size_mb`` (``0`` is a valid bound).
            tags_contain: Substring of the JSON-encoded tag list.
            lossless_only: Keep only datasets with ``quant_level = 0``.

        Returns:
            DataFrame of matching datasets, newest ``freeze_date`` first.
        """
        conditions = ["1=1"]
        params = []

        if name:
            conditions.append("name LIKE ?")
            params.append(f"%{name}%")
        if codec:
            conditions.append("codec = ?")
            params.append(codec)
        if partition_col:
            conditions.append("partition_col = ?")
            params.append(partition_col)
        if partition_key:
            conditions.append("partition_keys LIKE ?")
            params.append(f"%{partition_key}%")
        if columns_contain:
            conditions.append("columns LIKE ?")
            params.append(f"%{columns_contain}%")
        # Numeric bounds are compared against None so that an explicit 0 is
        # applied instead of being treated as "no filter".
        if min_rows is not None:
            conditions.append("orig_rows >= ?")
            params.append(min_rows)
        if max_mb is not None:
            conditions.append("file_size_mb <= ?")
            params.append(max_mb)
        if tags_contain:
            conditions.append("tags LIKE ?")
            params.append(f"%{tags_contain}%")
        if lossless_only:
            conditions.append("quant_level = 0")

        # Only fixed condition fragments are interpolated; values go via params.
        where = " AND ".join(conditions)
        sql = f"""
            SELECT id, name, codec, quant_level as quant,
                   orig_rows as rows, n_chunks, file_size_mb as mb,
                   partition_col, freeze_date, comment
            FROM datasets
            WHERE {where}
            ORDER BY freeze_date DESC
        """
        return self.con.execute(sql, params).df()

    def search_chunks(self, dataset_name: str, part_key: str = None) -> pd.DataFrame:
        """List the chunks of datasets whose name contains ``dataset_name``.

        Args:
            dataset_name: Substring of the dataset name.
            part_key: Optional substring of the chunk's partition key.

        Returns:
            DataFrame of chunk rows ordered by ``chunk_id``, with the size in
            KB added as column ``kb``.
        """
        sql = """
            SELECT c.chunk_id, c.row_start, c.row_end,
                   c.part_key, c.byte_offset, c.byte_len,
                   c.sha256, round(c.byte_len/1024.0, 1) as kb
            FROM chunks c
            JOIN datasets d ON c.dataset_id = d.id
            WHERE d.name LIKE ?
        """
        params = [f"%{dataset_name}%"]
        if part_key:
            sql += " AND c.part_key LIKE ?"
            params.append(f"%{part_key}%")
        sql += " ORDER BY c.chunk_id"
        return self.con.execute(sql, params).df()

    # ── THAW via CATALOG ──────────────────────────────────────────────────────
    def thaw(self, name: str, filter: dict = None, row_range: tuple = None,
             verify: bool = True) -> pd.DataFrame:
        """Look a dataset up by name and thaw it with the codec.

        The name is matched as a substring. When several datasets match, an
        exact name match wins, then the lowest id, so the choice is
        deterministic.

        Args:
            name: Dataset name (or part of it).
            filter: Passed through to the codec's ``thaw``.
            row_range: Passed through to the codec's ``thaw``.
            verify: Passed through to the codec's ``thaw``.

        Returns:
            The DataFrame returned by the codec's ``thaw``.

        Raises:
            KeyError: If no registered dataset matches ``name``.
        """
        result = self.con.execute(
            "SELECT path FROM datasets WHERE name LIKE ? "
            "ORDER BY CASE WHEN name = ? THEN 0 ELSE 1 END, id LIMIT 1",
            [f"%{name}%", name]
        ).fetchone()
        if not result:
            raise KeyError(f"Dataset '{name}' não encontrado no catalog. Use search() para listar.")
        path = result[0]

        print(f" thaw: {os.path.basename(path)}", end="")
        t0 = time.time()
        df = pf_thaw(path, verify=verify, filter=filter, row_range=row_range)
        tt = time.time() - t0
        print(f" → {len(df):,} linhas em {tt:.3f}s")
        return df

    # ── COST REPORT ───────────────────────────────────────────────────────────
    def cost_report(self, tier: str = 'glacier_deep') -> pd.DataFrame:
        """Estimate storage cost per dataset for one storage tier.

        Args:
            tier: Key of ``STORAGE_PRICES`` (``s3_standard``, ``s3_ia``,
                ``glacier`` or ``glacier_deep``). An unknown tier falls back to
                a price of 0.00099 while the ``tier`` column still shows the
                requested name.

        Returns:
            DataFrame with one row per dataset, largest first, including
            ``cost_monthly_usd``, ``cost_annual_usd``, ``cost_3yr_usd`` and
            ``tier`` columns.
        """
        price = STORAGE_PRICES.get(tier, 0.00099)
        sql = """
            SELECT
                name,
                codec,
                CASE quant_level
                    WHEN 0 THEN 'lossless'
                    WHEN 1 THEN 'high'
                    WHEN 2 THEN 'medium'
                    ELSE 'low'
                END as quant,
                orig_rows as rows,
                round(file_size_mb, 3) as size_mb,
                n_chunks,
                freeze_date
            FROM datasets
            ORDER BY file_size_mb DESC
        """
        df = self.con.execute(sql).df()
        # size_mb / 1024 converts MB to GB before applying the per-GB price.
        df['cost_monthly_usd'] = (df['size_mb'] / 1024) * price
        df['cost_annual_usd'] = df['cost_monthly_usd'] * 12
        df['cost_3yr_usd'] = df['cost_monthly_usd'] * 36
        df['tier'] = tier
        return df

    # ── INTEGRITY CHECK ───────────────────────────────────────────────────────
    def integrity_check(self, name_filter: str = None) -> pd.DataFrame:
        """Compare the SHA-256 of every stored chunk blob with the catalog.

        Each chunk is read with a seek to its recorded byte range, so an
        archive is never loaded into memory as a whole. Datasets whose file
        exists get ``last_verified`` and ``verified_ok`` updated.

        Args:
            name_filter: Optional substring restricting the datasets checked.

        Returns:
            DataFrame with columns ``name``, ``status`` (``OK``, ``CORRUPTED``
            or ``FILE_MISSING``), ``chunks_ok``, ``chunks_fail`` and ``path``.
            The columns are present even when no dataset was checked.
        """
        sql = "SELECT id, name, path FROM datasets"
        params = []
        if name_filter:
            sql += " WHERE name LIKE ?"
            params.append(f"%{name_filter}%")

        datasets_rows = self.con.execute(sql, params).fetchall()
        results = []

        for ds_id, ds_name, path in datasets_rows:
            if not os.path.exists(path):
                results.append({'name': ds_name, 'path': path, 'status': 'FILE_MISSING',
                                'chunks_ok': 0, 'chunks_fail': 0})
                continue

            chunks = self.con.execute(
                "SELECT chunk_id, byte_offset, byte_len, sha256 "
                "FROM chunks WHERE dataset_id = ?",
                [ds_id]
            ).fetchall()

            ok_count = fail_count = 0
            with open(path, 'rb') as f:
                for _chunk_id, offset, length, sha_stored in chunks:
                    f.seek(offset)
                    blob = f.read(length)
                    if hashlib.sha256(blob).hexdigest() == sha_stored:
                        ok_count += 1
                    else:
                        fail_count += 1

            status = 'OK' if fail_count == 0 else 'CORRUPTED'
            self.con.execute("""
                UPDATE datasets SET last_verified = CURRENT_TIMESTAMP, verified_ok = ?
                WHERE id = ?
            """, [fail_count == 0, ds_id])
            results.append({
                'name': ds_name, 'status': status,
                'chunks_ok': ok_count, 'chunks_fail': fail_count,
                'path': path,
            })

        # Explicit columns keep the layout stable regardless of which kind of
        # result comes first, and for an empty result.
        return pd.DataFrame(
            results,
            columns=['name', 'status', 'chunks_ok', 'chunks_fail', 'path'],
        )

    # ── STATS ─────────────────────────────────────────────────────────────────
    def stats(self) -> dict:
        """Return aggregate metrics over all registered datasets.

        Returns:
            Dictionary with the keys ``total_datasets``, ``total_rows``,
            ``total_mb``, ``total_chunks``, ``avg_mb_per_1k_rows``,
            ``distinct_codecs``, ``distinct_partitions``, ``lossless_count``
            (datasets with ``quant_level = 0``) and ``vault_count`` (datasets
            with ``quant_level > 0``).

        Example:
            >>> s = cat.stats()
            >>> print(f"{s['total_datasets']} datasets, {s['total_rows']:,} linhas")
        """
        r = self.con.execute("""
            SELECT
                COUNT(*) as total_datasets,
                SUM(orig_rows) as total_rows,
                SUM(file_size_mb) as total_mb,
                SUM(n_chunks) as total_chunks,
                AVG(file_size_mb/NULLIF(orig_rows/1000.0,0)) as avg_mb_per_1k_rows,
                COUNT(DISTINCT codec) as distinct_codecs,
                COUNT(DISTINCT partition_col) as distinct_partitions,
                SUM(CASE WHEN quant_level=0 THEN 1 ELSE 0 END) as lossless_count,
                SUM(CASE WHEN quant_level>0 THEN 1 ELSE 0 END) as vault_count
            FROM datasets
        """).fetchone()

        labels = ['total_datasets', 'total_rows', 'total_mb', 'total_chunks',
                  'avg_mb_per_1k_rows', 'distinct_codecs', 'distinct_partitions',
                  'lossless_count', 'vault_count']
        return dict(zip(labels, r))

    # ── DIRECT SQL ────────────────────────────────────────────────────────────
    def sql(self, query: str) -> pd.DataFrame:
        """Run ``query`` as-is on the catalog database and return a DataFrame.

        The query text is executed verbatim; do not build it from untrusted input.
        """
        return self.con.execute(query).df()

    def __repr__(self):
        n = self.con.execute("SELECT COUNT(*) FROM datasets").fetchone()[0]
        return f"<PermafrostCatalog path='{self.catalog_path}' datasets={n}>"
436
+
437
+
438
# Banner written to stdout every time this module is imported.
# NOTE(review): this is an import-time side effect; it is kept as-is because
# callers may rely on the output.
print("\n".join((
    "permafrost_catalog.py carregado",
    " Classes: PermafrostCatalog",
    " Métodos: register, register_dir, search, search_chunks, thaw,",
    " cost_report, integrity_check, stats, sql",
)))