@dataif/cli 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (183)
  1. package/README.md +16 -0
  2. package/bin/dataif.js +623 -0
  3. package/package.json +26 -0
  4. package/scripts/build-template.mjs +72 -0
  5. package/templates/dataif/README.md +157 -0
  6. package/templates/dataif/infra/.env.example +119 -0
  7. package/templates/dataif/infra/.env.stg.example +119 -0
  8. package/templates/dataif/infra/airflow/Dockerfile +11 -0
  9. package/templates/dataif/infra/airflow/Dockerfile.release +17 -0
  10. package/templates/dataif/infra/airflow/requirements.txt +3 -0
  11. package/templates/dataif/infra/docker-compose.yml +306 -0
  12. package/templates/dataif/infra/init-db/01-init-dataif.sh +129 -0
  13. package/templates/dataif/infra/init-db/pnp-curated-views.sqlinc +444 -0
  14. package/templates/dataif/infra/init-db/pnp-raw-staging-curated.sqlinc +701 -0
  15. package/templates/dataif/infra/keycloak/Dockerfile +4 -0
  16. package/templates/dataif/infra/keycloak/realm-dataif.json +73 -0
  17. package/templates/dataif/infra/ollama/Dockerfile +9 -0
  18. package/templates/dataif/infra/ollama/bootstrap-model.sh +100 -0
  19. package/templates/dataif/infra/ollama/sabia-7b.Modelfile +14 -0
  20. package/templates/dataif/infra/postgres/Dockerfile +4 -0
  21. package/templates/dataif/pipelines/airflow/dags/generated/.gitkeep +1 -0
  22. package/templates/dataif/pipelines/airflow/dags/generated/2020_financeiro_fcc6f1f3_sync.py +9 -0
  23. package/templates/dataif/pipelines/dataif_pipelines/__init__.py +1 -0
  24. package/templates/dataif/pipelines/dataif_pipelines/airflow/__init__.py +1 -0
  25. package/templates/dataif/pipelines/dataif_pipelines/airflow/pnp_pipeline_factory.py +167 -0
  26. package/templates/dataif/pipelines/dataif_pipelines/connectors/__init__.py +1 -0
  27. package/templates/dataif/pipelines/dataif_pipelines/connectors/base/__init__.py +1 -0
  28. package/templates/dataif/pipelines/dataif_pipelines/connectors/base/connector.py +28 -0
  29. package/templates/dataif/pipelines/dataif_pipelines/connectors/base/types.py +14 -0
  30. package/templates/dataif/pipelines/dataif_pipelines/connectors/nilo_pecanha/__init__.py +1 -0
  31. package/templates/dataif/pipelines/dataif_pipelines/connectors/nilo_pecanha/config.py +19 -0
  32. package/templates/dataif/pipelines/dataif_pipelines/connectors/nilo_pecanha/connector.py +558 -0
  33. package/templates/dataif/pipelines/dataif_pipelines/connectors/nilo_pecanha/powerbi_microdados.py +728 -0
  34. package/templates/dataif/pipelines/dataif_pipelines/connectors/nilo_pecanha/transform.py +296 -0
  35. package/templates/dataif/pipelines/dataif_pipelines/jobs/__init__.py +1 -0
  36. package/templates/dataif/pipelines/dataif_pipelines/jobs/nilo_pipeline.py +112 -0
  37. package/templates/dataif/pipelines/dataif_pipelines/orchestration/__init__.py +21 -0
  38. package/templates/dataif/pipelines/dataif_pipelines/orchestration/pnp_workflow.py +783 -0
  39. package/templates/dataif/pipelines/dataif_pipelines/repositories/__init__.py +1 -0
  40. package/templates/dataif/pipelines/dataif_pipelines/repositories/pnp_raw_repository.py +860 -0
  41. package/templates/dataif/pipelines/dataif_pipelines/services/__init__.py +19 -0
  42. package/templates/dataif/pipelines/dataif_pipelines/services/pnp_curated_service.py +66 -0
  43. package/templates/dataif/pipelines/dataif_pipelines/services/pnp_download_service.py +534 -0
  44. package/templates/dataif/pipelines/dataif_pipelines/services/pnp_quality_service.py +9 -0
  45. package/templates/dataif/pipelines/dataif_pipelines/services/pnp_raw_ingestion_service.py +124 -0
  46. package/templates/dataif/pipelines/dataif_pipelines/services/pnp_staging_service.py +271 -0
  47. package/templates/dataif/pipelines/dataif_pipelines/services/powerbi_catalog_service.py +159 -0
  48. package/templates/dataif/pipelines/sql/staging/020_pnp_matriculas.sql +112 -0
  49. package/templates/dataif/pipelines/sql/staging/030_pnp_eficiencia_academica.sql +83 -0
  50. package/templates/dataif/pipelines/sql/staging/040_pnp_servidores.sql +90 -0
  51. package/templates/dataif/pipelines/sql/staging/050_pnp_financeiro.sql +72 -0
  52. package/templates/dataif/pipelines/sql/views_curated/004_mv_pnp_dashboard_fast.sql +204 -0
  53. package/templates/dataif/pipelines/sql/views_curated/010_vw_pnp_admin_ingestao.sql +51 -0
  54. package/templates/dataif/pipelines/sql/views_curated/020_vw_pnp_qualidade_dados.sql +114 -0
  55. package/templates/dataif/pipelines/sql/views_curated/030_vw_pnp_matriculas.sql +67 -0
  56. package/templates/dataif/pipelines/sql/views_curated/040_vw_pnp_eficiencia.sql +33 -0
  57. package/templates/dataif/pipelines/sql/views_curated/050_vw_pnp_servidores.sql +30 -0
  58. package/templates/dataif/pipelines/sql/views_curated/060_vw_pnp_financeiro.sql +22 -0
  59. package/templates/dataif/pipelines/sql/views_curated/070_vw_pnp_vanna.sql +115 -0
  60. package/templates/dataif/scripts/configure-env.sh +149 -0
  61. package/templates/dataif/scripts/create_metabase_pnp_dashboard.py +943 -0
  62. package/templates/dataif/scripts/create_metabase_pnp_matriculas_dashboard.py +580 -0
  63. package/templates/dataif/scripts/deploy.sh +79 -0
  64. package/templates/dataif/scripts/fix_metabase_template_tag_ids.py +91 -0
  65. package/templates/dataif/scripts/pnp_powerbi_microdados_probe.py +14 -0
  66. package/templates/dataif/scripts/pnp_validate_raw_run.py +330 -0
  67. package/templates/dataif/scripts/publish-images.sh +31 -0
  68. package/templates/dataif/scripts/sync_metabase_dashboard_field_filters.py +241 -0
  69. package/templates/dataif/scripts/use-vanna-ollama.sh +139 -0
  70. package/templates/dataif/services/api/.dockerignore +18 -0
  71. package/templates/dataif/services/api/Dockerfile +12 -0
  72. package/templates/dataif/services/api/app/__init__.py +1 -0
  73. package/templates/dataif/services/api/app/auth.py +48 -0
  74. package/templates/dataif/services/api/app/config.py +59 -0
  75. package/templates/dataif/services/api/app/keycloak_admin.py +215 -0
  76. package/templates/dataif/services/api/app/main.py +2432 -0
  77. package/templates/dataif/services/api/app/metabase_admin.py +191 -0
  78. package/templates/dataif/services/api/app/metabase_bootstrap.py +44 -0
  79. package/templates/dataif/services/api/app/metabase_embed.py +15 -0
  80. package/templates/dataif/services/api/app/pnp_dag_provisioner.py +113 -0
  81. package/templates/dataif/services/api/app/pnp_instance_repository.py +951 -0
  82. package/templates/dataif/services/api/app/pnp_powerbi.py +438 -0
  83. package/templates/dataif/services/api/app/vanna_client.py +32 -0
  84. package/templates/dataif/services/api/requirements.txt +9 -0
  85. package/templates/dataif/services/vanna/.dockerignore +18 -0
  86. package/templates/dataif/services/vanna/Dockerfile +12 -0
  87. package/templates/dataif/services/vanna/app/config.py +57 -0
  88. package/templates/dataif/services/vanna/app/main.py +108 -0
  89. package/templates/dataif/services/vanna/app/runtime_config.py +114 -0
  90. package/templates/dataif/services/vanna/app/sql_guard.py +123 -0
  91. package/templates/dataif/services/vanna/app/vanna_engine.py +382 -0
  92. package/templates/dataif/services/vanna/requirements.txt +8 -0
  93. package/templates/dataif/services/web/.dockerignore +13 -0
  94. package/templates/dataif/services/web/Dockerfile +16 -0
  95. package/templates/dataif/services/web/index.html +12 -0
  96. package/templates/dataif/services/web/nginx.conf +74 -0
  97. package/templates/dataif/services/web/package-lock.json +4397 -0
  98. package/templates/dataif/services/web/package.json +32 -0
  99. package/templates/dataif/services/web/postcss.config.mjs +5 -0
  100. package/templates/dataif/services/web/src/App.jsx +2817 -0
  101. package/templates/dataif/services/web/src/adminAuth.js +245 -0
  102. package/templates/dataif/services/web/src/assets/avatar_placeholder.png +0 -0
  103. package/templates/dataif/services/web/src/assets/github_logo_icon_229278.svg +1 -0
  104. package/templates/dataif/services/web/src/assets/if-logo.png +0 -0
  105. package/templates/dataif/services/web/src/assets/if.svg +0 -0
  106. package/templates/dataif/services/web/src/assets/pnp-horizontal.svg +1 -0
  107. package/templates/dataif/services/web/src/components/AppHeader.jsx +233 -0
  108. package/templates/dataif/services/web/src/components/application/app-navigation/base-components/mobile-header.tsx +56 -0
  109. package/templates/dataif/services/web/src/components/application/app-navigation/base-components/nav-account-card.tsx +209 -0
  110. package/templates/dataif/services/web/src/components/application/app-navigation/base-components/nav-item-button.tsx +67 -0
  111. package/templates/dataif/services/web/src/components/application/app-navigation/base-components/nav-item.tsx +108 -0
  112. package/templates/dataif/services/web/src/components/application/app-navigation/base-components/nav-list.tsx +83 -0
  113. package/templates/dataif/services/web/src/components/application/app-navigation/config.ts +23 -0
  114. package/templates/dataif/services/web/src/components/application/app-navigation/header-navigation.tsx +240 -0
  115. package/templates/dataif/services/web/src/components/application/pagination/pagination-base.tsx +376 -0
  116. package/templates/dataif/services/web/src/components/application/pagination/pagination-dot.tsx +52 -0
  117. package/templates/dataif/services/web/src/components/application/pagination/pagination-line.tsx +48 -0
  118. package/templates/dataif/services/web/src/components/application/pagination/pagination.tsx +328 -0
  119. package/templates/dataif/services/web/src/components/application/tabs/tabs.tsx +223 -0
  120. package/templates/dataif/services/web/src/components/base/avatar/avatar-label-group.tsx +28 -0
  121. package/templates/dataif/services/web/src/components/base/avatar/avatar.tsx +129 -0
  122. package/templates/dataif/services/web/src/components/base/avatar/base-components/avatar-add-button.tsx +32 -0
  123. package/templates/dataif/services/web/src/components/base/avatar/base-components/avatar-company-icon.tsx +24 -0
  124. package/templates/dataif/services/web/src/components/base/avatar/base-components/avatar-online-indicator.tsx +29 -0
  125. package/templates/dataif/services/web/src/components/base/avatar/base-components/index.tsx +4 -0
  126. package/templates/dataif/services/web/src/components/base/avatar/base-components/verified-tick.tsx +32 -0
  127. package/templates/dataif/services/web/src/components/base/badges/badge-types.ts +264 -0
  128. package/templates/dataif/services/web/src/components/base/badges/badges.tsx +415 -0
  129. package/templates/dataif/services/web/src/components/base/button-group/button-group.tsx +104 -0
  130. package/templates/dataif/services/web/src/components/base/buttons/button.tsx +267 -0
  131. package/templates/dataif/services/web/src/components/base/input/hint-text.tsx +31 -0
  132. package/templates/dataif/services/web/src/components/base/input/input.tsx +269 -0
  133. package/templates/dataif/services/web/src/components/base/input/label.tsx +48 -0
  134. package/templates/dataif/services/web/src/components/base/radio-buttons/radio-buttons.tsx +127 -0
  135. package/templates/dataif/services/web/src/components/base/select/combobox.tsx +150 -0
  136. package/templates/dataif/services/web/src/components/base/select/multi-select.tsx +361 -0
  137. package/templates/dataif/services/web/src/components/base/select/popover.tsx +32 -0
  138. package/templates/dataif/services/web/src/components/base/select/select-item.tsx +95 -0
  139. package/templates/dataif/services/web/src/components/base/select/select-native.tsx +67 -0
  140. package/templates/dataif/services/web/src/components/base/select/select.tsx +144 -0
  141. package/templates/dataif/services/web/src/components/base/tags/base-components/tag-close-x.tsx +32 -0
  142. package/templates/dataif/services/web/src/components/base/tooltip/tooltip.tsx +107 -0
  143. package/templates/dataif/services/web/src/components/foundations/dot-icon.tsx +22 -0
  144. package/templates/dataif/services/web/src/components/foundations/logo/untitledui-logo-minimal.tsx +170 -0
  145. package/templates/dataif/services/web/src/components/foundations/logo/untitledui-logo.tsx +58 -0
  146. package/templates/dataif/services/web/src/hooks/use-breakpoint.ts +34 -0
  147. package/templates/dataif/services/web/src/hooks/use-resize-observer.ts +67 -0
  148. package/templates/dataif/services/web/src/main.jsx +14 -0
  149. package/templates/dataif/services/web/src/providers/theme-provider.jsx +62 -0
  150. package/templates/dataif/services/web/src/styles/globals.css +60 -0
  151. package/templates/dataif/services/web/src/styles/theme.css +1326 -0
  152. package/templates/dataif/services/web/src/styles/typography.css +430 -0
  153. package/templates/dataif/services/web/src/styles.css +1287 -0
  154. package/templates/dataif/services/web/src/utils/cx.ts +24 -0
  155. package/templates/dataif/services/web/src/utils/is-react-component.ts +33 -0
  156. package/templates/dataif/services/web/vite.config.js +14 -0
  157. package/templates/dataif/sql/ddl/001_schemas.sql +6 -0
  158. package/templates/dataif/sql/ddl/003_pnp_raw_staging_curated.sql +699 -0
  159. package/templates/dataif/sql/migrations/001_pnp_phase1_backfill.sql +3 -0
  160. package/templates/dataif/sql/migrations/002_pnp_phase2_admin_config_backfill.sql +184 -0
  161. package/templates/dataif/sql/migrations/003_pnp_phase3_raw_tabular_backfill.sql +3 -0
  162. package/templates/dataif/sql/migrations/004_pnp_phase3_raw_backfill_support_index.sql +3 -0
  163. package/templates/dataif/sql/migrations/005_pnp_phase7_staging_support_indexes.sql +2 -0
  164. package/templates/dataif/sql/migrations/006_pnp_phase7_staging_autovacuum_tuning.sql +2 -0
  165. package/templates/dataif/sql/migrations/007_pnp_phase7b_run_packages.sql +20 -0
  166. package/templates/dataif/sql/migrations/008_pnp_phase7a_pipeline_endpoints.sql +169 -0
  167. package/templates/dataif/sql/migrations/009_pnp_phase8_curated.sql +35 -0
  168. package/templates/dataif/sql/migrations/010_pnp_phase10_staging_incremental_upsert.sql +3 -0
  169. package/templates/dataif/sql/migrations/010_pnp_pipeline_uuid.sql +51 -0
  170. package/templates/dataif/sql/migrations/011_app_settings.sql +7 -0
  171. package/templates/dataif/sql/staging/020_pnp_matriculas.sql +112 -0
  172. package/templates/dataif/sql/staging/030_pnp_eficiencia_academica.sql +83 -0
  173. package/templates/dataif/sql/staging/040_pnp_servidores.sql +90 -0
  174. package/templates/dataif/sql/staging/050_pnp_financeiro.sql +72 -0
  175. package/templates/dataif/sql/views_curated/003_vw_pnp_microdados_admin.sql +160 -0
  176. package/templates/dataif/sql/views_curated/004_mv_pnp_dashboard_fast.sql +204 -0
  177. package/templates/dataif/sql/views_curated/010_vw_pnp_admin_ingestao.sql +51 -0
  178. package/templates/dataif/sql/views_curated/020_vw_pnp_qualidade_dados.sql +114 -0
  179. package/templates/dataif/sql/views_curated/030_vw_pnp_matriculas.sql +67 -0
  180. package/templates/dataif/sql/views_curated/040_vw_pnp_eficiencia.sql +33 -0
  181. package/templates/dataif/sql/views_curated/050_vw_pnp_servidores.sql +30 -0
  182. package/templates/dataif/sql/views_curated/060_vw_pnp_financeiro.sql +22 -0
  183. package/templates/dataif/sql/views_curated/070_vw_pnp_vanna.sql +115 -0
@@ -0,0 +1,91 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import os
5
+ import urllib.error
6
+ import urllib.request
7
+ from hashlib import md5
8
+
9
+
10
# Metabase connection settings; each one can be overridden through the environment.
API_KEY = os.environ.get("METABASE_API_KEY")
API_BASE = os.environ.get("METABASE_API_URL", "http://localhost:3001/api").rstrip("/")
CARD_NAME_PREFIX = os.environ.get("METABASE_CARD_PREFIX", "PNP 2024 - ")

# Stop immediately when no API key is configured (unset or empty).
if API_KEY is None or API_KEY == "":
    raise SystemExit("METABASE_API_KEY is required")
16
+
17
+
18
def api(
    method: str,
    path: str,
    payload: dict | list | None = None,
    timeout: float = 60.0,
) -> dict | list:
    """Send one request to the Metabase API and return the decoded JSON body.

    Args:
        method: HTTP verb, e.g. ``"GET"`` or ``"PUT"``.
        path: Path appended verbatim to ``API_BASE``.
        payload: Optional JSON-serialisable body. When given, it is sent as
            UTF-8 encoded JSON with a ``Content-Type: application/json`` header.
        timeout: Seconds to wait on the socket. Without it ``urlopen`` may
            block indefinitely when the server stops answering.

    Returns:
        The parsed JSON response, or an empty dict when the body is empty.

    Raises:
        RuntimeError: when the server answers with an HTTP error status; the
            message carries the status code and the response body.
        Other ``urllib`` / socket errors (connection failures, timeouts)
        propagate unchanged.
    """
    data = None
    headers = {"x-api-key": API_KEY, "Accept": "application/json"}
    if payload is not None:
        data = json.dumps(payload).encode("utf-8")
        headers["Content-Type"] = "application/json"

    request = urllib.request.Request(f"{API_BASE}{path}", data=data, headers=headers, method=method)
    try:
        with urllib.request.urlopen(request, timeout=timeout) as response:
            body = response.read().decode("utf-8")
            return json.loads(body) if body else {}
    except urllib.error.HTTPError as exc:
        # Surface the server's own error text; it usually explains the rejection.
        body = exc.read().decode("utf-8", errors="replace")
        raise RuntimeError(f"{method} {path} failed: {exc.code} {body}") from exc
33
+
34
+
35
def _fill_missing_tag_ids(card_id: int, template_tags: dict) -> tuple[dict, bool]:
    """Copy ``template_tags`` giving every dict-valued tag an ``id``.

    Returns the copied mapping and a flag telling whether any id was added.
    Non-dict entries are carried over untouched.
    """
    filled = {}
    touched = False
    for slug, tag in template_tags.items():
        if isinstance(tag, dict):
            tag = dict(tag)
            if not tag.get("id"):
                # Deterministic id derived from the card id and the tag slug.
                digest = md5(f"{card_id}:{slug}".encode("utf-8")).hexdigest()
                tag["id"] = digest[:12]
                touched = True
        filled[slug] = tag
    return filled, touched


def ensure_template_tag_ids() -> dict[str, int]:
    """Give an ``id`` to every native-query template tag that lacks one.

    Only cards whose name starts with ``CARD_NAME_PREFIX`` are inspected.
    Cards without template tags, or whose tags all have an id already, are
    counted as skipped; the others are rewritten through ``PUT /card/{id}``.

    Returns:
        ``{"updated": <count>, "skipped": <count>}``.
    """
    counters = {"updated": 0, "skipped": 0}

    for stub in api("GET", "/card"):
        card_id = int(stub["id"])
        if not (stub.get("name") or "").startswith(CARD_NAME_PREFIX):
            continue

        card = api("GET", f"/card/{card_id}")
        dataset_query = card.get("dataset_query") or {}
        native = dataset_query.get("native") or {}
        template_tags = native.get("template-tags") or {}

        # An empty tag mapping yields ({}, False) and is therefore skipped too.
        normalized_tags, touched = _fill_missing_tag_ids(card_id, template_tags)
        if not touched:
            counters["skipped"] += 1
            continue

        patched_native = {**native, "template-tags": normalized_tags}
        patched_query = {**dataset_query, "native": patched_native}
        api(
            "PUT",
            f"/card/{card_id}",
            {
                "name": card["name"],
                "description": card.get("description"),
                "display": card["display"],
                "dataset_query": patched_query,
                "visualization_settings": card.get("visualization_settings") or {},
                "parameters": [],
            },
        )
        counters["updated"] += 1

    return counters


if __name__ == "__main__":
    summary = ensure_template_tag_ids()
    print(json.dumps(summary, ensure_ascii=False))
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ import sys
5
+ from pathlib import Path
6
+
7
# Put the "pipelines" directory (two levels above this file, then down) first on
# sys.path so the dataif_pipelines package can be imported when run directly.
ROOT = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(ROOT.joinpath("pipelines")))

from dataif_pipelines.connectors.nilo_pecanha.powerbi_microdados import main  # noqa: E402


if __name__ == "__main__":
    exit_status = main()
    raise SystemExit(exit_status)
@@ -0,0 +1,330 @@
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ import json
6
+ import os
7
+ from pathlib import Path
8
+ from typing import Any
9
+
10
+ import psycopg2
11
+ from psycopg2.extras import RealDictCursor
12
+
13
+
14
# ROOT is the directory two levels above this file; ENV_PATH is the env file
# consulted as a fallback when a setting is not present in the process environment.
ROOT = Path(__file__).resolve().parents[1]
ENV_PATH = ROOT.joinpath("infra", ".env")
16
+
17
+
18
def load_env_file(path: Path) -> dict[str, str]:
    """Parse a dotenv-style file into a ``{KEY: value}`` dict.

    Blank lines, ``#`` comment lines and lines without ``=`` are ignored.
    Keys and values are stripped of surrounding whitespace. A leading
    ``export `` on the key and one pair of matching single or double quotes
    around the value are removed, so ``PASSWORD="s3cret"`` yields ``s3cret``
    rather than a value that still contains the quote characters.

    Args:
        path: Location of the env file.

    Returns:
        The parsed mapping; empty when the file does not exist.
    """
    values: dict[str, str] = {}
    if not path.exists():
        return values

    for line in path.read_text(encoding="utf-8").splitlines():
        stripped = line.strip()
        if not stripped or stripped.startswith("#") or "=" not in stripped:
            continue

        # Split on the first "=" only, so values may themselves contain "=".
        key, value = stripped.split("=", 1)
        key = key.strip()
        if key.startswith("export "):
            key = key[len("export "):].strip()

        value = value.strip()
        if len(value) >= 2 and value[0] == value[-1] and value[0] in ("'", '"'):
            value = value[1:-1]

        values[key] = value
    return values
30
+
31
+
32
def build_dsn() -> str:
    """Return the PostgreSQL connection URI used by the validation script.

    ``WAREHOUSE_DSN`` wins when set and is returned unchanged. Otherwise each
    component is read from the process environment first, then from the env
    file at ``ENV_PATH``, then from a built-in default.

    User, password and database name are percent-encoded so that characters
    such as ``@``, ``:``, ``/`` or ``#`` in a password cannot corrupt the URI.

    Returns:
        A ``postgresql://user:password@host:port/dbname`` URI.
    """
    from urllib.parse import quote

    explicit = os.getenv("WAREHOUSE_DSN")
    if explicit:
        return explicit

    env_file = load_env_file(ENV_PATH)
    db_name = os.getenv("DATAIF_DB_NAME") or env_file.get("DATAIF_DB_NAME", "dataif")
    user = os.getenv("DATAIF_ETL_USER") or env_file.get("DATAIF_ETL_USER", "etl_user")
    password = os.getenv("DATAIF_ETL_PASSWORD") or env_file.get("DATAIF_ETL_PASSWORD", "etl_password")
    host = os.getenv("POSTGRES_HOST") or env_file.get("POSTGRES_HOST", "localhost")
    port = os.getenv("POSTGRES_EXPOSE_PORT") or env_file.get("POSTGRES_EXPOSE_PORT", "5433")

    # The configured host "postgres" is replaced by localhost; the port used
    # above is the exposed one (POSTGRES_EXPOSE_PORT).
    if host == "postgres":
        host = "localhost"

    safe_user = quote(user, safe="")
    safe_password = quote(password, safe="")
    safe_db_name = quote(db_name, safe="")
    return f"postgresql://{safe_user}:{safe_password}@{host}:{port}/{safe_db_name}"
48
+
49
+
50
def fetch_one(cur: RealDictCursor, query: str, params: tuple[Any, ...]) -> dict[str, Any] | None:
    """Execute ``query`` with ``params`` and return the first row, or None if there is none."""
    cur.execute(query, params)
    first_row = cur.fetchone()
    return first_row
53
+
54
+
55
def fetch_all(cur: RealDictCursor, query: str, params: tuple[Any, ...]) -> list[dict[str, Any]]:
    """Execute ``query`` with ``params`` and return every row as a new list."""
    cur.execute(query, params)
    rows = cur.fetchall()
    return [*rows]
58
+
59
+
60
def latest_raw_run(cur: RealDictCursor, run_id: str | None) -> dict[str, Any] | None:
    """Fetch the audit-log row of the run to validate.

    With ``run_id`` the newest ``nilo_pecanha`` audit row for that run is
    returned, whatever its status. Without it, the newest row whose status is
    ``raw_loaded`` is returned. None means no matching row exists.
    """
    if not run_id:
        sql = """
            SELECT run_id, status, extracted_count, loaded_count, details, started_at, finished_at
            FROM audit.etl_run_log
            WHERE connector_id = 'nilo_pecanha'
              AND status = 'raw_loaded'
            ORDER BY finished_at DESC NULLS LAST, started_at DESC
            LIMIT 1
            """
        params: tuple[Any, ...] = ()
    else:
        sql = """
            SELECT run_id, status, extracted_count, loaded_count, details, started_at, finished_at
            FROM audit.etl_run_log
            WHERE connector_id = 'nilo_pecanha'
              AND run_id = %s
            ORDER BY finished_at DESC NULLS LAST, started_at DESC
            LIMIT 1
            """
        params = (run_id,)

    return fetch_one(cur, sql, params)
87
+
88
+
89
def load_manifest(cur: RealDictCursor, run_id: str) -> dict[str, Any] | None:
    """Return the most recently ingested Power BI microdata manifest of a run.

    The manifest is read from ``raw.nilo_pecanha_assets`` (asset type
    ``powerbi_microdados_manifest``) with its text content cast to jsonb.
    None is returned when no such asset exists or the manifest is empty.
    """
    sql = """
        SELECT content_text::jsonb AS manifest
        FROM raw.nilo_pecanha_assets
        WHERE run_id = %s
          AND asset_type = 'powerbi_microdados_manifest'
        ORDER BY ingested_at DESC
        LIMIT 1
        """
    row = fetch_one(cur, sql, (run_id,))
    if not row:
        return None

    manifest = row.get("manifest")
    if not manifest:
        return None
    return dict(manifest)
103
+
104
+
105
def load_raw_counts_by_file(cur: RealDictCursor, run_id: str) -> list[dict[str, Any]]:
    """Count the raw records of a run per microdata type and source file.

    Each returned row has ``tipo_microdados``, ``source_file_name`` (both read
    from the JSON payload) and ``raw_rows``, ordered by type then file name.
    """
    sql = """
        SELECT
            payload->>'tipo_microdados' AS tipo_microdados,
            payload->>'source_file_name' AS source_file_name,
            COUNT(*)::bigint AS raw_rows
        FROM raw.nilo_pecanha_records
        WHERE run_id = %s
        GROUP BY 1, 2
        ORDER BY 1, 2
        """
    per_file_counts = fetch_all(cur, sql, (run_id,))
    return per_file_counts
120
+
121
+
122
def load_core_checks(cur: RealDictCursor, run_id: str) -> dict[str, Any]:
    """Compute the aggregate data-quality counters for one run's raw records.

    A single row is produced with ``total_rows`` plus one counter per check:
    missing ``id`` / ``ano`` / ``tipo_microdados`` / ``source_file_name`` /
    ``microdados_url``, a ``source_method`` other than ``powerbi_microdados``,
    an ``ano`` that is not exactly four digits, and an ``ano`` that differs
    from the first four-digit group found in the source file name.

    Returns an empty dict when the query yields no row.
    """
    sql = """
        SELECT
            COUNT(*)::bigint AS total_rows,
            SUM(CASE WHEN COALESCE(payload->>'id', '') = '' THEN 1 ELSE 0 END)::bigint AS missing_id,
            SUM(CASE WHEN COALESCE(payload->>'ano', '') = '' THEN 1 ELSE 0 END)::bigint AS missing_ano,
            SUM(CASE WHEN COALESCE(payload->>'tipo_microdados', '') = '' THEN 1 ELSE 0 END)::bigint AS missing_tipo_microdados,
            SUM(CASE WHEN COALESCE(payload->>'source_file_name', '') = '' THEN 1 ELSE 0 END)::bigint AS missing_source_file_name,
            SUM(CASE WHEN COALESCE(payload->>'microdados_url', '') = '' THEN 1 ELSE 0 END)::bigint AS missing_microdados_url,
            SUM(
                CASE
                    WHEN COALESCE(payload->>'source_method', '') <> 'powerbi_microdados' THEN 1
                    ELSE 0
                END
            )::bigint AS unexpected_source_method,
            SUM(
                CASE
                    WHEN COALESCE(payload->>'ano', '') <> ''
                     AND COALESCE(payload->>'ano', '') !~ '^[0-9]{4}$'
                    THEN 1
                    ELSE 0
                END
            )::bigint AS ano_not_four_digits,
            SUM(
                CASE
                    WHEN COALESCE(payload->>'ano', '') ~ '^[0-9]{4}$'
                     AND COALESCE(substring(payload->>'source_file_name' FROM '([0-9]{4})'), '') <> ''
                     AND payload->>'ano' <> substring(payload->>'source_file_name' FROM '([0-9]{4})')
                    THEN 1
                    ELSE 0
                END
            )::bigint AS ano_file_mismatch
        FROM raw.nilo_pecanha_records
        WHERE run_id = %s
        """
    counters = fetch_one(cur, sql, (run_id,))
    if counters:
        return counters
    return {}
162
+
163
+
164
def load_sample_profile(cur: RealDictCursor, run_id: str, sample_size: int, max_examples: int) -> list[dict[str, Any]]:
    """Profile the payload keys of the first ``sample_size`` raw records of a run.

    Args:
        cur: Cursor used to run the query.
        run_id: Run whose records are sampled (ordered by ``id``).
        sample_size: Maximum number of records included in the sample.
        max_examples: Maximum number of distinct example values kept per key.

    Returns:
        One row per payload key with ``key``, ``populated_rows`` (non-blank
        values), ``distinct_values`` and ``example_values`` (the first
        ``max_examples`` distinct values in sort order), ordered by
        ``populated_rows`` descending and then by key.
    """
    # ranked_values holds one row per distinct (key, value), but the final join
    # goes through ``pairs``, which has one row per sampled record. DISTINCT in
    # ARRAY_AGG is therefore required: without it each example value is
    # repeated once for every sampled record that carries it.
    sql = """
        WITH sample AS (
            SELECT payload
            FROM raw.nilo_pecanha_records
            WHERE run_id = %s
            ORDER BY id
            LIMIT %s
        ),
        pairs AS (
            SELECT
                e.key,
                NULLIF(BTRIM(e.value), '') AS value
            FROM sample s
            CROSS JOIN LATERAL jsonb_each_text(s.payload) AS e(key, value)
        ),
        distinct_values AS (
            SELECT DISTINCT key, value
            FROM pairs
            WHERE value IS NOT NULL
        ),
        ranked_values AS (
            SELECT
                key,
                value,
                ROW_NUMBER() OVER (PARTITION BY key ORDER BY value) AS rn
            FROM distinct_values
        )
        SELECT
            p.key,
            COUNT(*) FILTER (WHERE p.value IS NOT NULL)::bigint AS populated_rows,
            COUNT(DISTINCT p.value)::bigint AS distinct_values,
            COALESCE(
                ARRAY_AGG(DISTINCT rv.value ORDER BY rv.value) FILTER (WHERE rv.rn <= %s),
                ARRAY[]::text[]
            ) AS example_values
        FROM pairs p
        LEFT JOIN ranked_values rv
            ON rv.key = p.key
           AND rv.value = p.value
        GROUP BY p.key
        ORDER BY populated_rows DESC, p.key
        """
    return fetch_all(cur, sql, (run_id, sample_size, max_examples))
211
+
212
+
213
def print_operational_summary(run_row: dict[str, Any], manifest: dict[str, Any], raw_counts: list[dict[str, Any]]) -> list[str]:
    """Print the operational cross-check of a run and return the failures found.

    The audit row, the download manifest and the per-file raw counts are
    compared in two steps: totals first (download count, raw count, sum of
    rows), then one line per manifest download with its expected and
    persisted row counts.

    Args:
        run_row: Audit-log row; must contain ``run_id``, ``status``,
            ``started_at`` and ``finished_at``; ``details`` is optional.
        manifest: Manifest dict; its ``downloads`` list supplies
            ``tipo_microdados``, ``source_file_name`` and ``row_count``.
        raw_counts: Rows with ``tipo_microdados``, ``source_file_name`` and
            ``raw_rows``.

    Returns:
        Human-readable failure messages; empty when everything matches.
    """
    details = dict(run_row.get("details") or {})
    runtime = dict(details.get("runtime") or {})
    checks = dict(details.get("checks") or {})
    manifest_downloads = list(manifest.get("downloads") or [])
    raw_total = sum(int(row["raw_rows"]) for row in raw_counts)
    manifest_total = sum(int(item.get("row_count") or 0) for item in manifest_downloads)

    print("Validação operacional")
    print(f"- run_id: {run_row['run_id']}")
    print(f"- status audit: {run_row['status']}")
    print(f"- periodo: {run_row['started_at']} -> {run_row['finished_at']}")
    print(f"- downloads reportados pela pipeline: {runtime.get('download_count', 0)}")
    print(f"- raw_count reportado no audit: {checks.get('raw_count', 0)}")
    print(f"- asset_count reportado no audit: {checks.get('asset_count', 0)}")
    print(f"- manifesto: {len(manifest_downloads)} downloads / {manifest_total} linhas esperadas")
    print(f"- raw agregado por arquivo: {len(raw_counts)} arquivos / {raw_total} linhas persistidas")

    failures: list[str] = []
    if int(runtime.get("download_count") or 0) != len(manifest_downloads):
        failures.append("download_count do audit difere do manifesto")
    if int(checks.get("raw_count") or 0) != manifest_total:
        failures.append("raw_count do audit difere da soma do manifesto")
    if raw_total != manifest_total:
        failures.append("raw agregado por arquivo difere da soma do manifesto")

    print("")
    print("Conferencia por arquivo")
    # Missing values are normalised to "" on BOTH sides. Using str(None) here
    # would produce the key "None", which can never match the manifest side.
    # Counts are summed because NULL and "" groups collapse into one key.
    counts_by_key: dict[tuple[str, str], int] = {}
    for row in raw_counts:
        raw_key = (str(row["tipo_microdados"] or ""), str(row["source_file_name"] or ""))
        counts_by_key[raw_key] = counts_by_key.get(raw_key, 0) + int(row["raw_rows"])

    for item in manifest_downloads:
        key = (str(item.get("tipo_microdados") or ""), str(item.get("source_file_name") or ""))
        manifest_rows = int(item.get("row_count") or 0)
        raw_rows = counts_by_key.get(key, 0)
        marker = "OK" if manifest_rows == raw_rows else "DIVERGENTE"
        print(f"- {key[0]} | {key[1]} | manifesto={manifest_rows} | raw={raw_rows} | {marker}")
        if manifest_rows != raw_rows:
            failures.append(f"arquivo divergente: {key[1]}")

    return failures
255
+
256
+
257
def print_analytical_summary(core_checks: dict[str, Any], profile: list[dict[str, Any]], sample_size: int) -> list[str]:
    """Print the analytical counters and column profile; return the failed counters.

    Every tracked counter is printed, and each one greater than zero is
    reported as ``"<counter>=<value>"``. At most 20 profile rows are printed,
    each with up to four example values.
    """
    tracked_counters = (
        "missing_id",
        "missing_ano",
        "missing_tipo_microdados",
        "missing_source_file_name",
        "missing_microdados_url",
        "unexpected_source_method",
        "ano_not_four_digits",
        "ano_file_mismatch",
    )

    print("")
    print("Primeira validação analitica")
    print(f"- total_rows: {core_checks.get('total_rows', 0)}")

    failures: list[str] = []
    for counter in tracked_counters:
        # Absent or NULL counters count as zero.
        count = int(core_checks.get(counter) or 0)
        print(f"- {counter}: {count}")
        if count > 0:
            failures.append(f"{counter}={count}")

    print("")
    print(f"Perfil amostral de colunas ({sample_size} registros)")
    for column in profile[:20]:
        if column["example_values"]:
            examples = ", ".join(column["example_values"][:4])
        else:
            examples = "-"
        print(
            f"- {column['key']}: preenchidos={column['populated_rows']} "
            f"| distintos={column['distinct_values']} | exemplos={examples}"
        )

    return failures
288
+
289
+
290
def main() -> int:
    """Validate one raw run of the PNP connector and print a report.

    The run is chosen by ``--run-id`` or, when omitted, is the latest run with
    status ``raw_loaded``. Operational and analytical checks are printed.

    Returns:
        1 when no run or no manifest is found, or when ``--strict`` is set and
        any inconsistency was detected; 0 otherwise.
    """
    parser = argparse.ArgumentParser(description="Valida a ultima run raw do conector PNP no Postgres.")
    parser.add_argument("--run-id", help="Run especifica para validar. Se omitido, usa a ultima raw_loaded.")
    parser.add_argument("--sample-size", type=int, default=1000, help="Quantidade de registros usados no perfil amostral.")
    parser.add_argument("--max-examples", type=int, default=4, help="Quantidade maxima de exemplos por coluna no perfil.")
    parser.add_argument("--strict", action="store_true", help="Retorna codigo 1 se qualquer inconsistência for encontrada.")
    args = parser.parse_args()

    dsn = build_dsn()
    conn = psycopg2.connect(dsn, cursor_factory=RealDictCursor)
    try:
        # "with conn" only commits or rolls back the transaction; psycopg2 does
        # not close the connection on exit, hence the explicit close below.
        with conn, conn.cursor() as cur:
            run_row = latest_raw_run(cur, args.run_id)
            if not run_row:
                print("Nenhuma run encontrada para o conector nilo_pecanha.")
                return 1

            manifest = load_manifest(cur, str(run_row["run_id"]))
            if not manifest:
                print(f"Run {run_row['run_id']} nao possui manifesto em raw.nilo_pecanha_assets.")
                return 1

            raw_counts = load_raw_counts_by_file(cur, str(run_row["run_id"]))
            core_checks = load_core_checks(cur, str(run_row["run_id"]))
            profile = load_sample_profile(cur, str(run_row["run_id"]), args.sample_size, args.max_examples)
    finally:
        conn.close()

    operational_failures = print_operational_summary(run_row, manifest, raw_counts)
    analytical_failures = print_analytical_summary(core_checks, profile, args.sample_size)

    all_failures = [*operational_failures, *analytical_failures]
    print("")
    if all_failures:
        print("Resultado final: inconsistencias encontradas")
        for failure in all_failures:
            print(f"- {failure}")
        # Inconsistencies only affect the exit code in strict mode.
        return 1 if args.strict else 0

    print("Resultado final: sem inconsistencias nas validacoes executadas")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
@@ -0,0 +1,31 @@
1
#!/usr/bin/env bash
# Configuration for building and pushing the container images.
#
# Environment:
#   DATAIF_IMAGE_REGISTRY  registry/namespace prefix (default: docker.io/dataif)
#   DATAIF_IMAGE_TAG       tag applied to every image (default: latest)
#   DATAIF_BUILD_FLAGS     extra whitespace-separated flags for `docker build`
set -euo pipefail

# Directory one level above the one containing this script.
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
REGISTRY="${DATAIF_IMAGE_REGISTRY:-docker.io/dataif}"
TAG="${DATAIF_IMAGE_TAG:-latest}"

# --pull is always passed; caller-supplied flags are appended after it.
BUILD_FLAGS=(--pull)
if [[ -n "${DATAIF_BUILD_FLAGS:-}" ]]; then
  read -r -a EXTRA_BUILD_FLAGS <<<"${DATAIF_BUILD_FLAGS}"
  BUILD_FLAGS+=("${EXTRA_BUILD_FLAGS[@]}")
fi
13
+
14
#######################################
# Build one image and push it to the registry.
# Globals:   REGISTRY, TAG, BUILD_FLAGS (all read)
# Arguments: $1 - image name (without registry or tag)
#            $2 - docker build context directory
#            $3 - path to the Dockerfile
# Outputs:   progress lines on stdout
#######################################
build_and_push() {
  local image_name="$1" context_dir="$2" dockerfile_path="$3"
  local image_ref="${REGISTRY}/${image_name}:${TAG}"

  printf 'Building %s/%s:%s\n' "${REGISTRY}" "${image_name}" "${TAG}"
  docker build "${BUILD_FLAGS[@]}" \
    -t "${image_ref}" \
    -f "${dockerfile_path}" \
    "${context_dir}"

  printf 'Pushing %s/%s:%s\n' "${REGISTRY}" "${image_name}" "${TAG}"
  docker push "${image_ref}"
}
24
+
25
# Images to publish, as consecutive triples: name, build context, Dockerfile.
# The first three build from the repository root; the others from their own directory.
RELEASE_IMAGES=(
  "dataif-postgres" "${ROOT_DIR}" "${ROOT_DIR}/infra/postgres/Dockerfile"
  "dataif-keycloak" "${ROOT_DIR}" "${ROOT_DIR}/infra/keycloak/Dockerfile"
  "dataif-airflow" "${ROOT_DIR}" "${ROOT_DIR}/infra/airflow/Dockerfile.release"
  "dataif-api" "${ROOT_DIR}/services/api" "${ROOT_DIR}/services/api/Dockerfile"
  "dataif-web" "${ROOT_DIR}/services/web" "${ROOT_DIR}/services/web/Dockerfile"
  "dataif-vanna" "${ROOT_DIR}/services/vanna" "${ROOT_DIR}/services/vanna/Dockerfile"
  "dataif-ollama-bootstrap" "${ROOT_DIR}/infra/ollama" "${ROOT_DIR}/infra/ollama/Dockerfile"
)

for ((idx = 0; idx < ${#RELEASE_IMAGES[@]}; idx += 3)); do
  build_and_push \
    "${RELEASE_IMAGES[idx]}" \
    "${RELEASE_IMAGES[idx + 1]}" \
    "${RELEASE_IMAGES[idx + 2]}"
done