@dataif/cli 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +16 -0
- package/bin/dataif.js +623 -0
- package/package.json +26 -0
- package/scripts/build-template.mjs +72 -0
- package/templates/dataif/README.md +157 -0
- package/templates/dataif/infra/.env.example +119 -0
- package/templates/dataif/infra/.env.stg.example +119 -0
- package/templates/dataif/infra/airflow/Dockerfile +11 -0
- package/templates/dataif/infra/airflow/Dockerfile.release +17 -0
- package/templates/dataif/infra/airflow/requirements.txt +3 -0
- package/templates/dataif/infra/docker-compose.yml +306 -0
- package/templates/dataif/infra/init-db/01-init-dataif.sh +129 -0
- package/templates/dataif/infra/init-db/pnp-curated-views.sqlinc +444 -0
- package/templates/dataif/infra/init-db/pnp-raw-staging-curated.sqlinc +701 -0
- package/templates/dataif/infra/keycloak/Dockerfile +4 -0
- package/templates/dataif/infra/keycloak/realm-dataif.json +73 -0
- package/templates/dataif/infra/ollama/Dockerfile +9 -0
- package/templates/dataif/infra/ollama/bootstrap-model.sh +100 -0
- package/templates/dataif/infra/ollama/sabia-7b.Modelfile +14 -0
- package/templates/dataif/infra/postgres/Dockerfile +4 -0
- package/templates/dataif/pipelines/airflow/dags/generated/.gitkeep +1 -0
- package/templates/dataif/pipelines/airflow/dags/generated/2020_financeiro_fcc6f1f3_sync.py +9 -0
- package/templates/dataif/pipelines/dataif_pipelines/__init__.py +1 -0
- package/templates/dataif/pipelines/dataif_pipelines/airflow/__init__.py +1 -0
- package/templates/dataif/pipelines/dataif_pipelines/airflow/pnp_pipeline_factory.py +167 -0
- package/templates/dataif/pipelines/dataif_pipelines/connectors/__init__.py +1 -0
- package/templates/dataif/pipelines/dataif_pipelines/connectors/base/__init__.py +1 -0
- package/templates/dataif/pipelines/dataif_pipelines/connectors/base/connector.py +28 -0
- package/templates/dataif/pipelines/dataif_pipelines/connectors/base/types.py +14 -0
- package/templates/dataif/pipelines/dataif_pipelines/connectors/nilo_pecanha/__init__.py +1 -0
- package/templates/dataif/pipelines/dataif_pipelines/connectors/nilo_pecanha/config.py +19 -0
- package/templates/dataif/pipelines/dataif_pipelines/connectors/nilo_pecanha/connector.py +558 -0
- package/templates/dataif/pipelines/dataif_pipelines/connectors/nilo_pecanha/powerbi_microdados.py +728 -0
- package/templates/dataif/pipelines/dataif_pipelines/connectors/nilo_pecanha/transform.py +296 -0
- package/templates/dataif/pipelines/dataif_pipelines/jobs/__init__.py +1 -0
- package/templates/dataif/pipelines/dataif_pipelines/jobs/nilo_pipeline.py +112 -0
- package/templates/dataif/pipelines/dataif_pipelines/orchestration/__init__.py +21 -0
- package/templates/dataif/pipelines/dataif_pipelines/orchestration/pnp_workflow.py +783 -0
- package/templates/dataif/pipelines/dataif_pipelines/repositories/__init__.py +1 -0
- package/templates/dataif/pipelines/dataif_pipelines/repositories/pnp_raw_repository.py +860 -0
- package/templates/dataif/pipelines/dataif_pipelines/services/__init__.py +19 -0
- package/templates/dataif/pipelines/dataif_pipelines/services/pnp_curated_service.py +66 -0
- package/templates/dataif/pipelines/dataif_pipelines/services/pnp_download_service.py +534 -0
- package/templates/dataif/pipelines/dataif_pipelines/services/pnp_quality_service.py +9 -0
- package/templates/dataif/pipelines/dataif_pipelines/services/pnp_raw_ingestion_service.py +124 -0
- package/templates/dataif/pipelines/dataif_pipelines/services/pnp_staging_service.py +271 -0
- package/templates/dataif/pipelines/dataif_pipelines/services/powerbi_catalog_service.py +159 -0
- package/templates/dataif/pipelines/sql/staging/020_pnp_matriculas.sql +112 -0
- package/templates/dataif/pipelines/sql/staging/030_pnp_eficiencia_academica.sql +83 -0
- package/templates/dataif/pipelines/sql/staging/040_pnp_servidores.sql +90 -0
- package/templates/dataif/pipelines/sql/staging/050_pnp_financeiro.sql +72 -0
- package/templates/dataif/pipelines/sql/views_curated/004_mv_pnp_dashboard_fast.sql +204 -0
- package/templates/dataif/pipelines/sql/views_curated/010_vw_pnp_admin_ingestao.sql +51 -0
- package/templates/dataif/pipelines/sql/views_curated/020_vw_pnp_qualidade_dados.sql +114 -0
- package/templates/dataif/pipelines/sql/views_curated/030_vw_pnp_matriculas.sql +67 -0
- package/templates/dataif/pipelines/sql/views_curated/040_vw_pnp_eficiencia.sql +33 -0
- package/templates/dataif/pipelines/sql/views_curated/050_vw_pnp_servidores.sql +30 -0
- package/templates/dataif/pipelines/sql/views_curated/060_vw_pnp_financeiro.sql +22 -0
- package/templates/dataif/pipelines/sql/views_curated/070_vw_pnp_vanna.sql +115 -0
- package/templates/dataif/scripts/configure-env.sh +149 -0
- package/templates/dataif/scripts/create_metabase_pnp_dashboard.py +943 -0
- package/templates/dataif/scripts/create_metabase_pnp_matriculas_dashboard.py +580 -0
- package/templates/dataif/scripts/deploy.sh +79 -0
- package/templates/dataif/scripts/fix_metabase_template_tag_ids.py +91 -0
- package/templates/dataif/scripts/pnp_powerbi_microdados_probe.py +14 -0
- package/templates/dataif/scripts/pnp_validate_raw_run.py +330 -0
- package/templates/dataif/scripts/publish-images.sh +31 -0
- package/templates/dataif/scripts/sync_metabase_dashboard_field_filters.py +241 -0
- package/templates/dataif/scripts/use-vanna-ollama.sh +139 -0
- package/templates/dataif/services/api/.dockerignore +18 -0
- package/templates/dataif/services/api/Dockerfile +12 -0
- package/templates/dataif/services/api/app/__init__.py +1 -0
- package/templates/dataif/services/api/app/auth.py +48 -0
- package/templates/dataif/services/api/app/config.py +59 -0
- package/templates/dataif/services/api/app/keycloak_admin.py +215 -0
- package/templates/dataif/services/api/app/main.py +2432 -0
- package/templates/dataif/services/api/app/metabase_admin.py +191 -0
- package/templates/dataif/services/api/app/metabase_bootstrap.py +44 -0
- package/templates/dataif/services/api/app/metabase_embed.py +15 -0
- package/templates/dataif/services/api/app/pnp_dag_provisioner.py +113 -0
- package/templates/dataif/services/api/app/pnp_instance_repository.py +951 -0
- package/templates/dataif/services/api/app/pnp_powerbi.py +438 -0
- package/templates/dataif/services/api/app/vanna_client.py +32 -0
- package/templates/dataif/services/api/requirements.txt +9 -0
- package/templates/dataif/services/vanna/.dockerignore +18 -0
- package/templates/dataif/services/vanna/Dockerfile +12 -0
- package/templates/dataif/services/vanna/app/config.py +57 -0
- package/templates/dataif/services/vanna/app/main.py +108 -0
- package/templates/dataif/services/vanna/app/runtime_config.py +114 -0
- package/templates/dataif/services/vanna/app/sql_guard.py +123 -0
- package/templates/dataif/services/vanna/app/vanna_engine.py +382 -0
- package/templates/dataif/services/vanna/requirements.txt +8 -0
- package/templates/dataif/services/web/.dockerignore +13 -0
- package/templates/dataif/services/web/Dockerfile +16 -0
- package/templates/dataif/services/web/index.html +12 -0
- package/templates/dataif/services/web/nginx.conf +74 -0
- package/templates/dataif/services/web/package-lock.json +4397 -0
- package/templates/dataif/services/web/package.json +32 -0
- package/templates/dataif/services/web/postcss.config.mjs +5 -0
- package/templates/dataif/services/web/src/App.jsx +2817 -0
- package/templates/dataif/services/web/src/adminAuth.js +245 -0
- package/templates/dataif/services/web/src/assets/avatar_placeholder.png +0 -0
- package/templates/dataif/services/web/src/assets/github_logo_icon_229278.svg +1 -0
- package/templates/dataif/services/web/src/assets/if-logo.png +0 -0
- package/templates/dataif/services/web/src/assets/if.svg +0 -0
- package/templates/dataif/services/web/src/assets/pnp-horizontal.svg +1 -0
- package/templates/dataif/services/web/src/components/AppHeader.jsx +233 -0
- package/templates/dataif/services/web/src/components/application/app-navigation/base-components/mobile-header.tsx +56 -0
- package/templates/dataif/services/web/src/components/application/app-navigation/base-components/nav-account-card.tsx +209 -0
- package/templates/dataif/services/web/src/components/application/app-navigation/base-components/nav-item-button.tsx +67 -0
- package/templates/dataif/services/web/src/components/application/app-navigation/base-components/nav-item.tsx +108 -0
- package/templates/dataif/services/web/src/components/application/app-navigation/base-components/nav-list.tsx +83 -0
- package/templates/dataif/services/web/src/components/application/app-navigation/config.ts +23 -0
- package/templates/dataif/services/web/src/components/application/app-navigation/header-navigation.tsx +240 -0
- package/templates/dataif/services/web/src/components/application/pagination/pagination-base.tsx +376 -0
- package/templates/dataif/services/web/src/components/application/pagination/pagination-dot.tsx +52 -0
- package/templates/dataif/services/web/src/components/application/pagination/pagination-line.tsx +48 -0
- package/templates/dataif/services/web/src/components/application/pagination/pagination.tsx +328 -0
- package/templates/dataif/services/web/src/components/application/tabs/tabs.tsx +223 -0
- package/templates/dataif/services/web/src/components/base/avatar/avatar-label-group.tsx +28 -0
- package/templates/dataif/services/web/src/components/base/avatar/avatar.tsx +129 -0
- package/templates/dataif/services/web/src/components/base/avatar/base-components/avatar-add-button.tsx +32 -0
- package/templates/dataif/services/web/src/components/base/avatar/base-components/avatar-company-icon.tsx +24 -0
- package/templates/dataif/services/web/src/components/base/avatar/base-components/avatar-online-indicator.tsx +29 -0
- package/templates/dataif/services/web/src/components/base/avatar/base-components/index.tsx +4 -0
- package/templates/dataif/services/web/src/components/base/avatar/base-components/verified-tick.tsx +32 -0
- package/templates/dataif/services/web/src/components/base/badges/badge-types.ts +264 -0
- package/templates/dataif/services/web/src/components/base/badges/badges.tsx +415 -0
- package/templates/dataif/services/web/src/components/base/button-group/button-group.tsx +104 -0
- package/templates/dataif/services/web/src/components/base/buttons/button.tsx +267 -0
- package/templates/dataif/services/web/src/components/base/input/hint-text.tsx +31 -0
- package/templates/dataif/services/web/src/components/base/input/input.tsx +269 -0
- package/templates/dataif/services/web/src/components/base/input/label.tsx +48 -0
- package/templates/dataif/services/web/src/components/base/radio-buttons/radio-buttons.tsx +127 -0
- package/templates/dataif/services/web/src/components/base/select/combobox.tsx +150 -0
- package/templates/dataif/services/web/src/components/base/select/multi-select.tsx +361 -0
- package/templates/dataif/services/web/src/components/base/select/popover.tsx +32 -0
- package/templates/dataif/services/web/src/components/base/select/select-item.tsx +95 -0
- package/templates/dataif/services/web/src/components/base/select/select-native.tsx +67 -0
- package/templates/dataif/services/web/src/components/base/select/select.tsx +144 -0
- package/templates/dataif/services/web/src/components/base/tags/base-components/tag-close-x.tsx +32 -0
- package/templates/dataif/services/web/src/components/base/tooltip/tooltip.tsx +107 -0
- package/templates/dataif/services/web/src/components/foundations/dot-icon.tsx +22 -0
- package/templates/dataif/services/web/src/components/foundations/logo/untitledui-logo-minimal.tsx +170 -0
- package/templates/dataif/services/web/src/components/foundations/logo/untitledui-logo.tsx +58 -0
- package/templates/dataif/services/web/src/hooks/use-breakpoint.ts +34 -0
- package/templates/dataif/services/web/src/hooks/use-resize-observer.ts +67 -0
- package/templates/dataif/services/web/src/main.jsx +14 -0
- package/templates/dataif/services/web/src/providers/theme-provider.jsx +62 -0
- package/templates/dataif/services/web/src/styles/globals.css +60 -0
- package/templates/dataif/services/web/src/styles/theme.css +1326 -0
- package/templates/dataif/services/web/src/styles/typography.css +430 -0
- package/templates/dataif/services/web/src/styles.css +1287 -0
- package/templates/dataif/services/web/src/utils/cx.ts +24 -0
- package/templates/dataif/services/web/src/utils/is-react-component.ts +33 -0
- package/templates/dataif/services/web/vite.config.js +14 -0
- package/templates/dataif/sql/ddl/001_schemas.sql +6 -0
- package/templates/dataif/sql/ddl/003_pnp_raw_staging_curated.sql +699 -0
- package/templates/dataif/sql/migrations/001_pnp_phase1_backfill.sql +3 -0
- package/templates/dataif/sql/migrations/002_pnp_phase2_admin_config_backfill.sql +184 -0
- package/templates/dataif/sql/migrations/003_pnp_phase3_raw_tabular_backfill.sql +3 -0
- package/templates/dataif/sql/migrations/004_pnp_phase3_raw_backfill_support_index.sql +3 -0
- package/templates/dataif/sql/migrations/005_pnp_phase7_staging_support_indexes.sql +2 -0
- package/templates/dataif/sql/migrations/006_pnp_phase7_staging_autovacuum_tuning.sql +2 -0
- package/templates/dataif/sql/migrations/007_pnp_phase7b_run_packages.sql +20 -0
- package/templates/dataif/sql/migrations/008_pnp_phase7a_pipeline_endpoints.sql +169 -0
- package/templates/dataif/sql/migrations/009_pnp_phase8_curated.sql +35 -0
- package/templates/dataif/sql/migrations/010_pnp_phase10_staging_incremental_upsert.sql +3 -0
- package/templates/dataif/sql/migrations/010_pnp_pipeline_uuid.sql +51 -0
- package/templates/dataif/sql/migrations/011_app_settings.sql +7 -0
- package/templates/dataif/sql/staging/020_pnp_matriculas.sql +112 -0
- package/templates/dataif/sql/staging/030_pnp_eficiencia_academica.sql +83 -0
- package/templates/dataif/sql/staging/040_pnp_servidores.sql +90 -0
- package/templates/dataif/sql/staging/050_pnp_financeiro.sql +72 -0
- package/templates/dataif/sql/views_curated/003_vw_pnp_microdados_admin.sql +160 -0
- package/templates/dataif/sql/views_curated/004_mv_pnp_dashboard_fast.sql +204 -0
- package/templates/dataif/sql/views_curated/010_vw_pnp_admin_ingestao.sql +51 -0
- package/templates/dataif/sql/views_curated/020_vw_pnp_qualidade_dados.sql +114 -0
- package/templates/dataif/sql/views_curated/030_vw_pnp_matriculas.sql +67 -0
- package/templates/dataif/sql/views_curated/040_vw_pnp_eficiencia.sql +33 -0
- package/templates/dataif/sql/views_curated/050_vw_pnp_servidores.sql +30 -0
- package/templates/dataif/sql/views_curated/060_vw_pnp_financeiro.sql +22 -0
- package/templates/dataif/sql/views_curated/070_vw_pnp_vanna.sql +115 -0
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import os
|
|
5
|
+
import urllib.error
|
|
6
|
+
import urllib.request
|
|
7
|
+
from hashlib import md5
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
# Runtime configuration, read from the environment once at import time.
# Trailing slashes are removed from the base URL so request paths that start
# with "/" can be appended to it directly.
API_BASE = os.environ.get("METABASE_API_URL", "http://localhost:3001/api").rstrip("/")
API_KEY = os.environ.get("METABASE_API_KEY")
# Only cards whose name starts with this prefix are processed.
CARD_NAME_PREFIX = os.environ.get("METABASE_CARD_PREFIX", "PNP 2024 - ")

# Stop right away when no API key (unset or empty) is configured.
if API_KEY is None or API_KEY == "":
    raise SystemExit("METABASE_API_KEY is required")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def api(method: str, path: str, payload: dict | list | None = None, timeout: float = 30.0) -> dict | list:
    """Send one JSON request to the Metabase API and return the decoded reply.

    Args:
        method: HTTP verb, e.g. ``"GET"`` or ``"PUT"``.
        path: Path appended verbatim to ``API_BASE`` (callers pass it with a
            leading ``/``).
        payload: Optional JSON-serialisable body. When given it is encoded as
            UTF-8 JSON and a ``Content-Type: application/json`` header is set.
        timeout: Socket timeout in seconds handed to ``urlopen``. Without it a
            stalled connection would block the script indefinitely.

    Returns:
        The decoded JSON document, or ``{}`` when the response body is empty.

    Raises:
        RuntimeError: When the server answers with an HTTP error status; the
            message carries the method, path, status code and response body.
    """
    data = None
    headers = {"x-api-key": API_KEY, "Accept": "application/json"}
    if payload is not None:
        data = json.dumps(payload).encode("utf-8")
        headers["Content-Type"] = "application/json"

    request = urllib.request.Request(f"{API_BASE}{path}", data=data, headers=headers, method=method)
    try:
        with urllib.request.urlopen(request, timeout=timeout) as response:
            body = response.read().decode("utf-8")
            return json.loads(body) if body else {}
    except urllib.error.HTTPError as exc:
        # Surface the server's own error text; undecodable bytes are replaced
        # rather than raising a second exception while reporting the first.
        body = exc.read().decode("utf-8", errors="replace")
        raise RuntimeError(f"{method} {path} failed: {exc.code} {body}") from exc
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def ensure_template_tag_ids() -> dict[str, int]:
    """Add a deterministic ``id`` to every template tag that lacks one.

    Lists all cards, keeps those whose name starts with ``CARD_NAME_PREFIX``,
    fetches each one in full and inspects
    ``dataset_query.native["template-tags"]``. Dict-valued tags without a
    truthy ``id`` get one derived from the first 12 hex characters of
    ``md5("<card_id>:<slug>")``; non-dict tag entries are passed through
    untouched. Cards that needed at least one id are written back with PUT.

    Returns:
        ``{"updated": n, "skipped": m}`` where ``skipped`` counts prefix-matching
        cards that had no template tags or whose tags all had ids already.
    """
    counters = {"updated": 0, "skipped": 0}

    for stub in api("GET", "/card"):
        card_id = int(stub["id"])
        if not (stub.get("name") or "").startswith(CARD_NAME_PREFIX):
            continue

        card = api("GET", f"/card/{card_id}")
        dataset_query = card.get("dataset_query") or {}
        native = dataset_query.get("native") or {}
        original_tags = native.get("template-tags") or {}

        patched_tags = {}
        missing_id_count = 0
        for slug, tag in original_tags.items():
            if isinstance(tag, dict):
                # Work on a copy so the fetched card is never mutated.
                tag = dict(tag)
                if not tag.get("id"):
                    digest = md5(f"{card_id}:{slug}".encode("utf-8")).hexdigest()
                    tag["id"] = digest[:12]
                    missing_id_count += 1
            patched_tags[slug] = tag

        # Covers both "no template tags at all" and "every tag already has an id".
        if missing_id_count == 0:
            counters["skipped"] += 1
            continue

        patched_native = dict(native)
        patched_native["template-tags"] = patched_tags
        patched_query = dict(dataset_query)
        patched_query["native"] = patched_native

        api(
            "PUT",
            f"/card/{card_id}",
            {
                "name": card["name"],
                "description": card.get("description"),
                "display": card["display"],
                "dataset_query": patched_query,
                "visualization_settings": card.get("visualization_settings") or {},
                "parameters": [],
            },
        )
        counters["updated"] += 1

    return counters


if __name__ == "__main__":
    summary = ensure_template_tag_ids()
    print(json.dumps(summary, ensure_ascii=False))
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
#!/usr/bin/env python3
"""Command-line launcher for the Power BI microdata module's ``main``.

The ``pipelines`` directory that sits next to this script's parent folder is
put at the front of ``sys.path`` so ``dataif_pipelines`` can be imported without
installing it.
"""
from __future__ import annotations

import sys
from pathlib import Path

# Directory one level above the folder containing this script.
ROOT = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(ROOT.joinpath("pipelines")))

# Imported only after sys.path has been extended above.
from dataif_pipelines.connectors.nilo_pecanha.powerbi_microdados import main


if __name__ == "__main__":
    # main()'s return value becomes the process exit status.
    exit_status = main()
    raise SystemExit(exit_status)
|
|
@@ -0,0 +1,330 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import argparse
|
|
5
|
+
import json
|
|
6
|
+
import os
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
import psycopg2
|
|
11
|
+
from psycopg2.extras import RealDictCursor
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# Directory one level above the folder that contains this script.
ROOT = Path(__file__).resolve().parents[1]
# Dotenv file consulted for database settings that are not set in the environment.
ENV_PATH = ROOT.joinpath("infra", ".env")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def load_env_file(path: Path) -> dict[str, str]:
    """Parse a dotenv-style file into a ``{name: value}`` mapping.

    Blank lines, lines starting with ``#`` and lines without ``=`` are ignored.
    Each remaining line is split on the first ``=``; key and value are stripped
    of surrounding whitespace. A leading ``export `` on the key is dropped and
    one pair of matching single or double quotes around the value is removed,
    so ``export KEY="some value"`` yields ``{"KEY": "some value"}`` instead of
    leaking the shell syntax into the result.

    Args:
        path: Location of the file to read (UTF-8).

    Returns:
        The parsed mapping; empty when ``path`` does not exist. Later
        assignments of the same key override earlier ones.
    """
    values: dict[str, str] = {}
    if not path.exists():
        return values

    for line in path.read_text(encoding="utf-8").splitlines():
        stripped = line.strip()
        if not stripped or stripped.startswith("#") or "=" not in stripped:
            continue

        key, value = stripped.split("=", 1)
        key = key.strip()
        if key.startswith("export "):
            key = key[len("export "):].strip()

        value = value.strip()
        # Only strip quotes when they are balanced; a lone quote is kept as data.
        if len(value) >= 2 and value[0] == value[-1] and value[0] in ("'", '"'):
            value = value[1:-1]

        values[key] = value
    return values
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def build_dsn() -> str:
    """Build the PostgreSQL connection URI used by this script.

    ``WAREHOUSE_DSN`` wins when set and is returned untouched. Otherwise each
    component is taken from the process environment, then from the file at
    ``ENV_PATH``, then from a built-in default.

    User, password and database name are percent-encoded so that characters
    with a meaning inside a URI (``@``, ``/``, ``:``, ``#`` ...) cannot corrupt
    the resulting connection string.

    Returns:
        A ``postgresql://user:password@host:port/dbname`` URI, or the explicit
        ``WAREHOUSE_DSN`` value.
    """
    # Imported locally because the module header does not import urllib.parse.
    from urllib.parse import quote

    explicit = os.getenv("WAREHOUSE_DSN")
    if explicit:
        return explicit

    env_file = load_env_file(ENV_PATH)
    db_name = os.getenv("DATAIF_DB_NAME") or env_file.get("DATAIF_DB_NAME", "dataif")
    user = os.getenv("DATAIF_ETL_USER") or env_file.get("DATAIF_ETL_USER", "etl_user")
    password = os.getenv("DATAIF_ETL_PASSWORD") or env_file.get("DATAIF_ETL_PASSWORD", "etl_password")
    host = os.getenv("POSTGRES_HOST") or env_file.get("POSTGRES_HOST", "localhost")
    port = os.getenv("POSTGRES_EXPOSE_PORT") or env_file.get("POSTGRES_EXPOSE_PORT", "5433")

    # NOTE(review): "postgres" is presumably the container-network host name,
    # which would not resolve where this script runs — confirm.
    if host == "postgres":
        host = "localhost"

    return (
        f"postgresql://{quote(user, safe='')}:{quote(password, safe='')}"
        f"@{host}:{port}/{quote(db_name, safe='')}"
    )
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def fetch_one(cur: RealDictCursor, query: str, params: tuple[Any, ...]) -> dict[str, Any] | None:
    """Execute ``query`` with ``params`` on ``cur`` and return ``cur.fetchone()``."""
    cur.execute(query, params)
    first_row = cur.fetchone()
    return first_row
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def fetch_all(cur: RealDictCursor, query: str, params: tuple[Any, ...]) -> list[dict[str, Any]]:
    """Execute ``query`` with ``params`` on ``cur`` and return every row as a new list."""
    cur.execute(query, params)
    rows = cur.fetchall()
    return [*rows]
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def latest_raw_run(cur: RealDictCursor, run_id: str | None) -> dict[str, Any] | None:
    """Look up the ``audit.etl_run_log`` entry to validate.

    With a ``run_id`` the most recent log row of that run is returned whatever
    its status. Without one, the most recent row whose status is
    ``'raw_loaded'`` is returned. Both lookups are limited to the
    ``nilo_pecanha`` connector and order by ``finished_at`` (NULLs last), then
    ``started_at``, newest first.

    Returns:
        The selected row, or ``None`` when nothing matches.
    """
    if not run_id:
        query = """
        SELECT run_id, status, extracted_count, loaded_count, details, started_at, finished_at
        FROM audit.etl_run_log
        WHERE connector_id = 'nilo_pecanha'
          AND status = 'raw_loaded'
        ORDER BY finished_at DESC NULLS LAST, started_at DESC
        LIMIT 1
        """
        params: tuple[Any, ...] = ()
    else:
        query = """
            SELECT run_id, status, extracted_count, loaded_count, details, started_at, finished_at
            FROM audit.etl_run_log
            WHERE connector_id = 'nilo_pecanha'
              AND run_id = %s
            ORDER BY finished_at DESC NULLS LAST, started_at DESC
            LIMIT 1
            """
        params = (run_id,)

    return fetch_one(cur, query, params)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def load_manifest(cur: RealDictCursor, run_id: str) -> dict[str, Any] | None:
    """Fetch the newest Power BI microdata manifest stored for ``run_id``.

    Reads ``raw.nilo_pecanha_assets`` rows with asset type
    ``'powerbi_microdados_manifest'``, newest ``ingested_at`` first, and casts
    ``content_text`` to ``jsonb`` in the query.

    Returns:
        A plain ``dict`` copy of the manifest, or ``None`` when no row exists or
        the manifest value is empty.
    """
    manifest_row = fetch_one(
        cur,
        """
        SELECT content_text::jsonb AS manifest
        FROM raw.nilo_pecanha_assets
        WHERE run_id = %s
          AND asset_type = 'powerbi_microdados_manifest'
        ORDER BY ingested_at DESC
        LIMIT 1
        """,
        (run_id,),
    )
    if not manifest_row:
        return None
    if not manifest_row.get("manifest"):
        return None
    return dict(manifest_row["manifest"])
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def load_raw_counts_by_file(cur: RealDictCursor, run_id: str) -> list[dict[str, Any]]:
    """Count the raw records of ``run_id`` per microdata type and source file.

    Returns:
        One row per ``(tipo_microdados, source_file_name)`` pair read from the
        JSON ``payload`` of ``raw.nilo_pecanha_records``, each with a
        ``raw_rows`` count, ordered by those two columns.
    """
    query = """
        SELECT
            payload->>'tipo_microdados' AS tipo_microdados,
            payload->>'source_file_name' AS source_file_name,
            COUNT(*)::bigint AS raw_rows
        FROM raw.nilo_pecanha_records
        WHERE run_id = %s
        GROUP BY 1, 2
        ORDER BY 1, 2
        """
    params = (run_id,)
    return fetch_all(cur, query, params)
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def load_core_checks(cur: RealDictCursor, run_id: str) -> dict[str, Any]:
    """Compute the basic data-quality counters for the raw records of ``run_id``.

    A single aggregate query over ``raw.nilo_pecanha_records`` yields:

    * ``total_rows`` - number of records in the run;
    * ``missing_*`` - records whose payload field is absent or empty
      (``id``, ``ano``, ``tipo_microdados``, ``source_file_name``,
      ``microdados_url``);
    * ``unexpected_source_method`` - records whose ``source_method`` is not
      ``powerbi_microdados``;
    * ``ano_not_four_digits`` - non-empty ``ano`` values that are not exactly
      four digits;
    * ``ano_file_mismatch`` - four-digit ``ano`` values that differ from the
      first four-digit group found in ``source_file_name``.

    Returns:
        The row of counters, or ``{}`` when the query yields no usable row.
    """
    query = """
        SELECT
            COUNT(*)::bigint AS total_rows,
            SUM(CASE WHEN COALESCE(payload->>'id', '') = '' THEN 1 ELSE 0 END)::bigint AS missing_id,
            SUM(CASE WHEN COALESCE(payload->>'ano', '') = '' THEN 1 ELSE 0 END)::bigint AS missing_ano,
            SUM(CASE WHEN COALESCE(payload->>'tipo_microdados', '') = '' THEN 1 ELSE 0 END)::bigint AS missing_tipo_microdados,
            SUM(CASE WHEN COALESCE(payload->>'source_file_name', '') = '' THEN 1 ELSE 0 END)::bigint AS missing_source_file_name,
            SUM(CASE WHEN COALESCE(payload->>'microdados_url', '') = '' THEN 1 ELSE 0 END)::bigint AS missing_microdados_url,
            SUM(
                CASE
                    WHEN COALESCE(payload->>'source_method', '') <> 'powerbi_microdados' THEN 1
                    ELSE 0
                END
            )::bigint AS unexpected_source_method,
            SUM(
                CASE
                    WHEN COALESCE(payload->>'ano', '') <> ''
                     AND COALESCE(payload->>'ano', '') !~ '^[0-9]{4}$'
                    THEN 1
                    ELSE 0
                END
            )::bigint AS ano_not_four_digits,
            SUM(
                CASE
                    WHEN COALESCE(payload->>'ano', '') ~ '^[0-9]{4}$'
                     AND COALESCE(substring(payload->>'source_file_name' FROM '([0-9]{4})'), '') <> ''
                     AND payload->>'ano' <> substring(payload->>'source_file_name' FROM '([0-9]{4})')
                    THEN 1
                    ELSE 0
                END
            )::bigint AS ano_file_mismatch
        FROM raw.nilo_pecanha_records
        WHERE run_id = %s
        """
    counters = fetch_one(cur, query, (run_id,))
    if counters:
        return counters
    return {}
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def load_sample_profile(cur: RealDictCursor, run_id: str, sample_size: int, max_examples: int) -> list[dict[str, Any]]:
    """Profile the payload columns of a sample of raw records.

    The first ``sample_size`` records of ``run_id`` (ordered by ``id``) are
    exploded into ``(key, value)`` pairs with ``jsonb_each_text``; values are
    trimmed and empty strings are treated as NULL.

    Args:
        cur: Cursor used to run the query.
        run_id: Run whose records are sampled.
        sample_size: Maximum number of records in the sample.
        max_examples: Maximum number of example values returned per key.

    Returns:
        One row per payload key with ``populated_rows`` (non-NULL occurrences),
        ``distinct_values`` and ``example_values`` (the first ``max_examples``
        distinct values in sort order, or an empty array), ordered by
        ``populated_rows`` descending and then by key.
    """
    # ARRAY_AGG uses DISTINCT because the join yields one row per *occurrence*
    # of a value; without it a value present in N sampled records would be
    # listed N times in example_values instead of once.
    return fetch_all(
        cur,
        """
        WITH sample AS (
            SELECT payload
            FROM raw.nilo_pecanha_records
            WHERE run_id = %s
            ORDER BY id
            LIMIT %s
        ),
        pairs AS (
            SELECT
                e.key,
                NULLIF(BTRIM(e.value), '') AS value
            FROM sample s
            CROSS JOIN LATERAL jsonb_each_text(s.payload) AS e(key, value)
        ),
        distinct_values AS (
            SELECT DISTINCT key, value
            FROM pairs
            WHERE value IS NOT NULL
        ),
        ranked_values AS (
            SELECT
                key,
                value,
                ROW_NUMBER() OVER (PARTITION BY key ORDER BY value) AS rn
            FROM distinct_values
        )
        SELECT
            p.key,
            COUNT(*) FILTER (WHERE p.value IS NOT NULL)::bigint AS populated_rows,
            COUNT(DISTINCT p.value)::bigint AS distinct_values,
            COALESCE(
                ARRAY_AGG(DISTINCT rv.value ORDER BY rv.value) FILTER (WHERE rv.rn <= %s),
                ARRAY[]::text[]
            ) AS example_values
        FROM pairs p
        LEFT JOIN ranked_values rv
          ON rv.key = p.key
         AND rv.value = p.value
        GROUP BY p.key
        ORDER BY populated_rows DESC, p.key
        """,
        (run_id, sample_size, max_examples),
    )
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def print_operational_summary(run_row: dict[str, Any], manifest: dict[str, Any], raw_counts: list[dict[str, Any]]) -> list[str]:
    """Print the operational reconciliation of a run and return its failures.

    Compares three sources: the audit row (``details.runtime.download_count``
    and ``details.checks.raw_count``), the manifest's ``downloads`` list and
    the per-file raw counts read from the database.

    Args:
        run_row: Row from the audit log; ``run_id``, ``status``, ``started_at``
            and ``finished_at`` are printed, ``details`` is read when present.
        manifest: Manifest document; only its ``downloads`` list is used.
        raw_counts: Rows with ``tipo_microdados``, ``source_file_name`` and
            ``raw_rows``.

    Returns:
        Human-readable failure messages (empty when everything matches):
        total mismatches, one entry per manifest file whose row count differs
        from the raw count, and one entry per raw file that the manifest does
        not list.
    """
    details = dict(run_row.get("details") or {})
    runtime = dict(details.get("runtime") or {})
    checks = dict(details.get("checks") or {})
    manifest_downloads = list(manifest.get("downloads") or [])
    raw_total = sum(int(row["raw_rows"]) for row in raw_counts)
    manifest_total = sum(int(item.get("row_count") or 0) for item in manifest_downloads)

    print("Validação operacional")
    print(f"- run_id: {run_row['run_id']}")
    print(f"- status audit: {run_row['status']}")
    print(f"- periodo: {run_row['started_at']} -> {run_row['finished_at']}")
    print(f"- downloads reportados pela pipeline: {runtime.get('download_count', 0)}")
    print(f"- raw_count reportado no audit: {checks.get('raw_count', 0)}")
    print(f"- asset_count reportado no audit: {checks.get('asset_count', 0)}")
    print(f"- manifesto: {len(manifest_downloads)} downloads / {manifest_total} linhas esperadas")
    print(f"- raw agregado por arquivo: {len(raw_counts)} arquivos / {raw_total} linhas persistidas")

    failures: list[str] = []
    if int(runtime.get("download_count") or 0) != len(manifest_downloads):
        failures.append("download_count do audit difere do manifesto")
    if int(checks.get("raw_count") or 0) != manifest_total:
        failures.append("raw_count do audit difere da soma do manifesto")
    if raw_total != manifest_total:
        failures.append("raw agregado por arquivo difere da soma do manifesto")

    print("")
    print("Conferencia por arquivo")

    # Raw keys are normalised exactly like manifest keys (missing/NULL -> "")
    # so a NULL field on both sides compares equal instead of "None" vs "".
    # Counts are summed in case NULL and "" groups collapse onto the same key.
    counts_by_key: dict[tuple[str, str], int] = {}
    for row in raw_counts:
        raw_key = (str(row.get("tipo_microdados") or ""), str(row.get("source_file_name") or ""))
        counts_by_key[raw_key] = counts_by_key.get(raw_key, 0) + int(row["raw_rows"])

    manifest_keys: set[tuple[str, str]] = set()
    for item in manifest_downloads:
        key = (str(item.get("tipo_microdados") or ""), str(item.get("source_file_name") or ""))
        manifest_keys.add(key)
        manifest_rows = int(item.get("row_count") or 0)
        raw_rows = counts_by_key.get(key, 0)
        marker = "OK" if manifest_rows == raw_rows else "DIVERGENTE"
        print(f"- {key[0]} | {key[1]} | manifesto={manifest_rows} | raw={raw_rows} | {marker}")
        if manifest_rows != raw_rows:
            failures.append(f"arquivo divergente: {key[1]}")

    # Files that were persisted but never announced by the manifest would
    # otherwise only show up as an unexplained difference in the totals.
    for key, raw_rows in counts_by_key.items():
        if key in manifest_keys:
            continue
        print(f"- {key[0]} | {key[1]} | manifesto=ausente | raw={raw_rows} | DIVERGENTE")
        failures.append(f"arquivo ausente no manifesto: {key[1]}")

    return failures
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
def print_analytical_summary(core_checks: dict[str, Any], profile: list[dict[str, Any]], sample_size: int) -> list[str]:
    """Print the data-quality counters and the column profile of a run.

    Args:
        core_checks: Counter name -> value; missing or NULL counters count as 0.
        profile: Per-column rows with ``key``, ``populated_rows``,
            ``distinct_values`` and ``example_values``; only the first 20 rows
            and the first 4 examples of each are printed.
        sample_size: Number of sampled records, shown in the profile heading.

    Returns:
        One ``"<counter>=<value>"`` entry for every counter greater than zero.
    """
    counter_names = (
        "missing_id",
        "missing_ano",
        "missing_tipo_microdados",
        "missing_source_file_name",
        "missing_microdados_url",
        "unexpected_source_method",
        "ano_not_four_digits",
        "ano_file_mismatch",
    )
    problems: list[str] = []

    print("")
    print("Primeira validação analitica")
    print(f"- total_rows: {core_checks.get('total_rows', 0)}")
    for name in counter_names:
        count = int(core_checks.get(name) or 0)
        print(f"- {name}: {count}")
        if count <= 0:
            continue
        problems.append(f"{name}={count}")

    print("")
    print(f"Perfil amostral de colunas ({sample_size} registros)")
    for column in profile[:20]:
        if column["example_values"]:
            examples = ", ".join(column["example_values"][:4])
        else:
            examples = "-"
        line = (
            f"- {column['key']}: preenchidos={column['populated_rows']} "
            f"| distintos={column['distinct_values']} | exemplos={examples}"
        )
        print(line)

    return problems
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
def main() -> int:
    """Validate one raw PNP run in Postgres and print the findings.

    Parses the command line, selects the run (``--run-id`` or the latest
    ``raw_loaded`` one), loads its manifest, per-file counts, core checks and
    sample profile, and prints the operational and analytical summaries.

    Returns:
        ``1`` when no run or no manifest is found, or when ``--strict`` is given
        and at least one inconsistency was detected; ``0`` otherwise.
    """
    parser = argparse.ArgumentParser(description="Valida a ultima run raw do conector PNP no Postgres.")
    parser.add_argument("--run-id", help="Run especifica para validar. Se omitido, usa a ultima raw_loaded.")
    parser.add_argument("--sample-size", type=int, default=1000, help="Quantidade de registros usados no perfil amostral.")
    parser.add_argument("--max-examples", type=int, default=4, help="Quantidade maxima de exemplos por coluna no perfil.")
    parser.add_argument("--strict", action="store_true", help="Retorna codigo 1 se qualquer inconsistência for encontrada.")
    args = parser.parse_args()

    dsn = build_dsn()
    conn = psycopg2.connect(dsn, cursor_factory=RealDictCursor)
    try:
        # In psycopg2, ``with conn`` only commits or rolls back the transaction;
        # it does not close the connection, hence the explicit close() below.
        with conn, conn.cursor() as cur:
            run_row = latest_raw_run(cur, args.run_id)
            if not run_row:
                print("Nenhuma run encontrada para o conector nilo_pecanha.")
                return 1

            run_id = str(run_row["run_id"])
            manifest = load_manifest(cur, run_id)
            if not manifest:
                print(f"Run {run_row['run_id']} nao possui manifesto em raw.nilo_pecanha_assets.")
                return 1

            raw_counts = load_raw_counts_by_file(cur, run_id)
            core_checks = load_core_checks(cur, run_id)
            profile = load_sample_profile(cur, run_id, args.sample_size, args.max_examples)

            operational_failures = print_operational_summary(run_row, manifest, raw_counts)
            analytical_failures = print_analytical_summary(core_checks, profile, args.sample_size)

            all_failures = [*operational_failures, *analytical_failures]
            print("")
            if all_failures:
                print("Resultado final: inconsistencias encontradas")
                for failure in all_failures:
                    print(f"- {failure}")
                # Inconsistencies only affect the exit status in strict mode.
                return 1 if args.strict else 0

            print("Resultado final: sem inconsistencias nas validacoes executadas")
            return 0
    finally:
        conn.close()


if __name__ == "__main__":
    raise SystemExit(main())
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
#!/usr/bin/env bash
# Builds and pushes the DataIF container images.
#
# Environment:
#   DATAIF_IMAGE_REGISTRY  registry/namespace prefix (default: docker.io/dataif)
#   DATAIF_IMAGE_TAG       tag applied to every image (default: latest)
#   DATAIF_BUILD_FLAGS     extra `docker build` flags, split on whitespace
set -euo pipefail

REGISTRY="${DATAIF_IMAGE_REGISTRY:-docker.io/dataif}"
TAG="${DATAIF_IMAGE_TAG:-latest}"
# Parent of the directory that holds this script.
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"

# `--pull` is always passed; caller-supplied flags are appended after it.
BUILD_FLAGS=(--pull)
if [[ -n "${DATAIF_BUILD_FLAGS:-}" ]]; then
  read -r -a EXTRA_BUILD_FLAGS <<<"${DATAIF_BUILD_FLAGS}"
  BUILD_FLAGS+=("${EXTRA_BUILD_FLAGS[@]}")
fi
|
|
13
|
+
|
|
14
|
+
#######################################
# Build one image with `docker build` and push it with `docker push`.
# Globals:   REGISTRY, TAG, BUILD_FLAGS (read)
# Arguments: $1 - image name (without registry or tag)
#            $2 - build context directory
#            $3 - path to the Dockerfile
# Outputs:   progress lines on stdout
#######################################
build_and_push() {
  local image_name="$1" context_dir="$2" dockerfile_path="$3"
  local image_ref="${REGISTRY}/${image_name}:${TAG}"

  printf 'Building %s\n' "${image_ref}"
  docker build "${BUILD_FLAGS[@]}" -t "${image_ref}" -f "${dockerfile_path}" "${context_dir}"

  printf 'Pushing %s\n' "${image_ref}"
  docker push "${image_ref}"
}
|
|
24
|
+
|
|
25
|
+
# Images to publish, as flat triples: image name, build context, Dockerfile.
# The postgres, keycloak and airflow images use the project root as context.
release_images=(
  "dataif-postgres"         "${ROOT_DIR}"                "${ROOT_DIR}/infra/postgres/Dockerfile"
  "dataif-keycloak"         "${ROOT_DIR}"                "${ROOT_DIR}/infra/keycloak/Dockerfile"
  "dataif-airflow"          "${ROOT_DIR}"                "${ROOT_DIR}/infra/airflow/Dockerfile.release"
  "dataif-api"              "${ROOT_DIR}/services/api"   "${ROOT_DIR}/services/api/Dockerfile"
  "dataif-web"              "${ROOT_DIR}/services/web"   "${ROOT_DIR}/services/web/Dockerfile"
  "dataif-vanna"            "${ROOT_DIR}/services/vanna" "${ROOT_DIR}/services/vanna/Dockerfile"
  "dataif-ollama-bootstrap" "${ROOT_DIR}/infra/ollama"   "${ROOT_DIR}/infra/ollama/Dockerfile"
)

# Walk the list three entries at a time, in the order listed above.
for ((i = 0; i < ${#release_images[@]}; i += 3)); do
  build_and_push "${release_images[i]}" "${release_images[i + 1]}" "${release_images[i + 2]}"
done
|