@dataif/cli 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (183) hide show
  1. package/README.md +16 -0
  2. package/bin/dataif.js +623 -0
  3. package/package.json +26 -0
  4. package/scripts/build-template.mjs +72 -0
  5. package/templates/dataif/README.md +157 -0
  6. package/templates/dataif/infra/.env.example +119 -0
  7. package/templates/dataif/infra/.env.stg.example +119 -0
  8. package/templates/dataif/infra/airflow/Dockerfile +11 -0
  9. package/templates/dataif/infra/airflow/Dockerfile.release +17 -0
  10. package/templates/dataif/infra/airflow/requirements.txt +3 -0
  11. package/templates/dataif/infra/docker-compose.yml +306 -0
  12. package/templates/dataif/infra/init-db/01-init-dataif.sh +129 -0
  13. package/templates/dataif/infra/init-db/pnp-curated-views.sqlinc +444 -0
  14. package/templates/dataif/infra/init-db/pnp-raw-staging-curated.sqlinc +701 -0
  15. package/templates/dataif/infra/keycloak/Dockerfile +4 -0
  16. package/templates/dataif/infra/keycloak/realm-dataif.json +73 -0
  17. package/templates/dataif/infra/ollama/Dockerfile +9 -0
  18. package/templates/dataif/infra/ollama/bootstrap-model.sh +100 -0
  19. package/templates/dataif/infra/ollama/sabia-7b.Modelfile +14 -0
  20. package/templates/dataif/infra/postgres/Dockerfile +4 -0
  21. package/templates/dataif/pipelines/airflow/dags/generated/.gitkeep +1 -0
  22. package/templates/dataif/pipelines/airflow/dags/generated/2020_financeiro_fcc6f1f3_sync.py +9 -0
  23. package/templates/dataif/pipelines/dataif_pipelines/__init__.py +1 -0
  24. package/templates/dataif/pipelines/dataif_pipelines/airflow/__init__.py +1 -0
  25. package/templates/dataif/pipelines/dataif_pipelines/airflow/pnp_pipeline_factory.py +167 -0
  26. package/templates/dataif/pipelines/dataif_pipelines/connectors/__init__.py +1 -0
  27. package/templates/dataif/pipelines/dataif_pipelines/connectors/base/__init__.py +1 -0
  28. package/templates/dataif/pipelines/dataif_pipelines/connectors/base/connector.py +28 -0
  29. package/templates/dataif/pipelines/dataif_pipelines/connectors/base/types.py +14 -0
  30. package/templates/dataif/pipelines/dataif_pipelines/connectors/nilo_pecanha/__init__.py +1 -0
  31. package/templates/dataif/pipelines/dataif_pipelines/connectors/nilo_pecanha/config.py +19 -0
  32. package/templates/dataif/pipelines/dataif_pipelines/connectors/nilo_pecanha/connector.py +558 -0
  33. package/templates/dataif/pipelines/dataif_pipelines/connectors/nilo_pecanha/powerbi_microdados.py +728 -0
  34. package/templates/dataif/pipelines/dataif_pipelines/connectors/nilo_pecanha/transform.py +296 -0
  35. package/templates/dataif/pipelines/dataif_pipelines/jobs/__init__.py +1 -0
  36. package/templates/dataif/pipelines/dataif_pipelines/jobs/nilo_pipeline.py +112 -0
  37. package/templates/dataif/pipelines/dataif_pipelines/orchestration/__init__.py +21 -0
  38. package/templates/dataif/pipelines/dataif_pipelines/orchestration/pnp_workflow.py +783 -0
  39. package/templates/dataif/pipelines/dataif_pipelines/repositories/__init__.py +1 -0
  40. package/templates/dataif/pipelines/dataif_pipelines/repositories/pnp_raw_repository.py +860 -0
  41. package/templates/dataif/pipelines/dataif_pipelines/services/__init__.py +19 -0
  42. package/templates/dataif/pipelines/dataif_pipelines/services/pnp_curated_service.py +66 -0
  43. package/templates/dataif/pipelines/dataif_pipelines/services/pnp_download_service.py +534 -0
  44. package/templates/dataif/pipelines/dataif_pipelines/services/pnp_quality_service.py +9 -0
  45. package/templates/dataif/pipelines/dataif_pipelines/services/pnp_raw_ingestion_service.py +124 -0
  46. package/templates/dataif/pipelines/dataif_pipelines/services/pnp_staging_service.py +271 -0
  47. package/templates/dataif/pipelines/dataif_pipelines/services/powerbi_catalog_service.py +159 -0
  48. package/templates/dataif/pipelines/sql/staging/020_pnp_matriculas.sql +112 -0
  49. package/templates/dataif/pipelines/sql/staging/030_pnp_eficiencia_academica.sql +83 -0
  50. package/templates/dataif/pipelines/sql/staging/040_pnp_servidores.sql +90 -0
  51. package/templates/dataif/pipelines/sql/staging/050_pnp_financeiro.sql +72 -0
  52. package/templates/dataif/pipelines/sql/views_curated/004_mv_pnp_dashboard_fast.sql +204 -0
  53. package/templates/dataif/pipelines/sql/views_curated/010_vw_pnp_admin_ingestao.sql +51 -0
  54. package/templates/dataif/pipelines/sql/views_curated/020_vw_pnp_qualidade_dados.sql +114 -0
  55. package/templates/dataif/pipelines/sql/views_curated/030_vw_pnp_matriculas.sql +67 -0
  56. package/templates/dataif/pipelines/sql/views_curated/040_vw_pnp_eficiencia.sql +33 -0
  57. package/templates/dataif/pipelines/sql/views_curated/050_vw_pnp_servidores.sql +30 -0
  58. package/templates/dataif/pipelines/sql/views_curated/060_vw_pnp_financeiro.sql +22 -0
  59. package/templates/dataif/pipelines/sql/views_curated/070_vw_pnp_vanna.sql +115 -0
  60. package/templates/dataif/scripts/configure-env.sh +149 -0
  61. package/templates/dataif/scripts/create_metabase_pnp_dashboard.py +943 -0
  62. package/templates/dataif/scripts/create_metabase_pnp_matriculas_dashboard.py +580 -0
  63. package/templates/dataif/scripts/deploy.sh +79 -0
  64. package/templates/dataif/scripts/fix_metabase_template_tag_ids.py +91 -0
  65. package/templates/dataif/scripts/pnp_powerbi_microdados_probe.py +14 -0
  66. package/templates/dataif/scripts/pnp_validate_raw_run.py +330 -0
  67. package/templates/dataif/scripts/publish-images.sh +31 -0
  68. package/templates/dataif/scripts/sync_metabase_dashboard_field_filters.py +241 -0
  69. package/templates/dataif/scripts/use-vanna-ollama.sh +139 -0
  70. package/templates/dataif/services/api/.dockerignore +18 -0
  71. package/templates/dataif/services/api/Dockerfile +12 -0
  72. package/templates/dataif/services/api/app/__init__.py +1 -0
  73. package/templates/dataif/services/api/app/auth.py +48 -0
  74. package/templates/dataif/services/api/app/config.py +59 -0
  75. package/templates/dataif/services/api/app/keycloak_admin.py +215 -0
  76. package/templates/dataif/services/api/app/main.py +2432 -0
  77. package/templates/dataif/services/api/app/metabase_admin.py +191 -0
  78. package/templates/dataif/services/api/app/metabase_bootstrap.py +44 -0
  79. package/templates/dataif/services/api/app/metabase_embed.py +15 -0
  80. package/templates/dataif/services/api/app/pnp_dag_provisioner.py +113 -0
  81. package/templates/dataif/services/api/app/pnp_instance_repository.py +951 -0
  82. package/templates/dataif/services/api/app/pnp_powerbi.py +438 -0
  83. package/templates/dataif/services/api/app/vanna_client.py +32 -0
  84. package/templates/dataif/services/api/requirements.txt +9 -0
  85. package/templates/dataif/services/vanna/.dockerignore +18 -0
  86. package/templates/dataif/services/vanna/Dockerfile +12 -0
  87. package/templates/dataif/services/vanna/app/config.py +57 -0
  88. package/templates/dataif/services/vanna/app/main.py +108 -0
  89. package/templates/dataif/services/vanna/app/runtime_config.py +114 -0
  90. package/templates/dataif/services/vanna/app/sql_guard.py +123 -0
  91. package/templates/dataif/services/vanna/app/vanna_engine.py +382 -0
  92. package/templates/dataif/services/vanna/requirements.txt +8 -0
  93. package/templates/dataif/services/web/.dockerignore +13 -0
  94. package/templates/dataif/services/web/Dockerfile +16 -0
  95. package/templates/dataif/services/web/index.html +12 -0
  96. package/templates/dataif/services/web/nginx.conf +74 -0
  97. package/templates/dataif/services/web/package-lock.json +4397 -0
  98. package/templates/dataif/services/web/package.json +32 -0
  99. package/templates/dataif/services/web/postcss.config.mjs +5 -0
  100. package/templates/dataif/services/web/src/App.jsx +2817 -0
  101. package/templates/dataif/services/web/src/adminAuth.js +245 -0
  102. package/templates/dataif/services/web/src/assets/avatar_placeholder.png +0 -0
  103. package/templates/dataif/services/web/src/assets/github_logo_icon_229278.svg +1 -0
  104. package/templates/dataif/services/web/src/assets/if-logo.png +0 -0
  105. package/templates/dataif/services/web/src/assets/if.svg +0 -0
  106. package/templates/dataif/services/web/src/assets/pnp-horizontal.svg +1 -0
  107. package/templates/dataif/services/web/src/components/AppHeader.jsx +233 -0
  108. package/templates/dataif/services/web/src/components/application/app-navigation/base-components/mobile-header.tsx +56 -0
  109. package/templates/dataif/services/web/src/components/application/app-navigation/base-components/nav-account-card.tsx +209 -0
  110. package/templates/dataif/services/web/src/components/application/app-navigation/base-components/nav-item-button.tsx +67 -0
  111. package/templates/dataif/services/web/src/components/application/app-navigation/base-components/nav-item.tsx +108 -0
  112. package/templates/dataif/services/web/src/components/application/app-navigation/base-components/nav-list.tsx +83 -0
  113. package/templates/dataif/services/web/src/components/application/app-navigation/config.ts +23 -0
  114. package/templates/dataif/services/web/src/components/application/app-navigation/header-navigation.tsx +240 -0
  115. package/templates/dataif/services/web/src/components/application/pagination/pagination-base.tsx +376 -0
  116. package/templates/dataif/services/web/src/components/application/pagination/pagination-dot.tsx +52 -0
  117. package/templates/dataif/services/web/src/components/application/pagination/pagination-line.tsx +48 -0
  118. package/templates/dataif/services/web/src/components/application/pagination/pagination.tsx +328 -0
  119. package/templates/dataif/services/web/src/components/application/tabs/tabs.tsx +223 -0
  120. package/templates/dataif/services/web/src/components/base/avatar/avatar-label-group.tsx +28 -0
  121. package/templates/dataif/services/web/src/components/base/avatar/avatar.tsx +129 -0
  122. package/templates/dataif/services/web/src/components/base/avatar/base-components/avatar-add-button.tsx +32 -0
  123. package/templates/dataif/services/web/src/components/base/avatar/base-components/avatar-company-icon.tsx +24 -0
  124. package/templates/dataif/services/web/src/components/base/avatar/base-components/avatar-online-indicator.tsx +29 -0
  125. package/templates/dataif/services/web/src/components/base/avatar/base-components/index.tsx +4 -0
  126. package/templates/dataif/services/web/src/components/base/avatar/base-components/verified-tick.tsx +32 -0
  127. package/templates/dataif/services/web/src/components/base/badges/badge-types.ts +264 -0
  128. package/templates/dataif/services/web/src/components/base/badges/badges.tsx +415 -0
  129. package/templates/dataif/services/web/src/components/base/button-group/button-group.tsx +104 -0
  130. package/templates/dataif/services/web/src/components/base/buttons/button.tsx +267 -0
  131. package/templates/dataif/services/web/src/components/base/input/hint-text.tsx +31 -0
  132. package/templates/dataif/services/web/src/components/base/input/input.tsx +269 -0
  133. package/templates/dataif/services/web/src/components/base/input/label.tsx +48 -0
  134. package/templates/dataif/services/web/src/components/base/radio-buttons/radio-buttons.tsx +127 -0
  135. package/templates/dataif/services/web/src/components/base/select/combobox.tsx +150 -0
  136. package/templates/dataif/services/web/src/components/base/select/multi-select.tsx +361 -0
  137. package/templates/dataif/services/web/src/components/base/select/popover.tsx +32 -0
  138. package/templates/dataif/services/web/src/components/base/select/select-item.tsx +95 -0
  139. package/templates/dataif/services/web/src/components/base/select/select-native.tsx +67 -0
  140. package/templates/dataif/services/web/src/components/base/select/select.tsx +144 -0
  141. package/templates/dataif/services/web/src/components/base/tags/base-components/tag-close-x.tsx +32 -0
  142. package/templates/dataif/services/web/src/components/base/tooltip/tooltip.tsx +107 -0
  143. package/templates/dataif/services/web/src/components/foundations/dot-icon.tsx +22 -0
  144. package/templates/dataif/services/web/src/components/foundations/logo/untitledui-logo-minimal.tsx +170 -0
  145. package/templates/dataif/services/web/src/components/foundations/logo/untitledui-logo.tsx +58 -0
  146. package/templates/dataif/services/web/src/hooks/use-breakpoint.ts +34 -0
  147. package/templates/dataif/services/web/src/hooks/use-resize-observer.ts +67 -0
  148. package/templates/dataif/services/web/src/main.jsx +14 -0
  149. package/templates/dataif/services/web/src/providers/theme-provider.jsx +62 -0
  150. package/templates/dataif/services/web/src/styles/globals.css +60 -0
  151. package/templates/dataif/services/web/src/styles/theme.css +1326 -0
  152. package/templates/dataif/services/web/src/styles/typography.css +430 -0
  153. package/templates/dataif/services/web/src/styles.css +1287 -0
  154. package/templates/dataif/services/web/src/utils/cx.ts +24 -0
  155. package/templates/dataif/services/web/src/utils/is-react-component.ts +33 -0
  156. package/templates/dataif/services/web/vite.config.js +14 -0
  157. package/templates/dataif/sql/ddl/001_schemas.sql +6 -0
  158. package/templates/dataif/sql/ddl/003_pnp_raw_staging_curated.sql +699 -0
  159. package/templates/dataif/sql/migrations/001_pnp_phase1_backfill.sql +3 -0
  160. package/templates/dataif/sql/migrations/002_pnp_phase2_admin_config_backfill.sql +184 -0
  161. package/templates/dataif/sql/migrations/003_pnp_phase3_raw_tabular_backfill.sql +3 -0
  162. package/templates/dataif/sql/migrations/004_pnp_phase3_raw_backfill_support_index.sql +3 -0
  163. package/templates/dataif/sql/migrations/005_pnp_phase7_staging_support_indexes.sql +2 -0
  164. package/templates/dataif/sql/migrations/006_pnp_phase7_staging_autovacuum_tuning.sql +2 -0
  165. package/templates/dataif/sql/migrations/007_pnp_phase7b_run_packages.sql +20 -0
  166. package/templates/dataif/sql/migrations/008_pnp_phase7a_pipeline_endpoints.sql +169 -0
  167. package/templates/dataif/sql/migrations/009_pnp_phase8_curated.sql +35 -0
  168. package/templates/dataif/sql/migrations/010_pnp_phase10_staging_incremental_upsert.sql +3 -0
  169. package/templates/dataif/sql/migrations/010_pnp_pipeline_uuid.sql +51 -0
  170. package/templates/dataif/sql/migrations/011_app_settings.sql +7 -0
  171. package/templates/dataif/sql/staging/020_pnp_matriculas.sql +112 -0
  172. package/templates/dataif/sql/staging/030_pnp_eficiencia_academica.sql +83 -0
  173. package/templates/dataif/sql/staging/040_pnp_servidores.sql +90 -0
  174. package/templates/dataif/sql/staging/050_pnp_financeiro.sql +72 -0
  175. package/templates/dataif/sql/views_curated/003_vw_pnp_microdados_admin.sql +160 -0
  176. package/templates/dataif/sql/views_curated/004_mv_pnp_dashboard_fast.sql +204 -0
  177. package/templates/dataif/sql/views_curated/010_vw_pnp_admin_ingestao.sql +51 -0
  178. package/templates/dataif/sql/views_curated/020_vw_pnp_qualidade_dados.sql +114 -0
  179. package/templates/dataif/sql/views_curated/030_vw_pnp_matriculas.sql +67 -0
  180. package/templates/dataif/sql/views_curated/040_vw_pnp_eficiencia.sql +33 -0
  181. package/templates/dataif/sql/views_curated/050_vw_pnp_servidores.sql +30 -0
  182. package/templates/dataif/sql/views_curated/060_vw_pnp_financeiro.sql +22 -0
  183. package/templates/dataif/sql/views_curated/070_vw_pnp_vanna.sql +115 -0
@@ -0,0 +1,124 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from dataclasses import dataclass
5
+ from typing import Any
6
+
7
+ from dataif_pipelines.connectors.base.types import NormalizedRecord, RawRecord, RunContext
8
+ from dataif_pipelines.connectors.nilo_pecanha.transform import normalize_domain_record, normalize_record
9
+ from dataif_pipelines.repositories import pnp_raw_repository
10
+
11
+
12
@dataclass(frozen=True)
class NormalizationResult:
    """Immutable pair of outputs produced by one normalization pass."""

    # Records that were normalized successfully.
    normalized_records: list[NormalizedRecord]
    # One dict per record that could not be normalized and was set aside.
    quarantine_rows: list[dict[str, Any]]
16
+
17
+
18
def normalize_raw_records(raw_records: list[RawRecord], run_context: RunContext) -> NormalizationResult:
    """Normalize raw records, quarantining those the domain step rejects.

    Records whose ``payload`` is not a dict are skipped without a trace.
    For every other record ``normalize_record`` runs first (its errors
    propagate to the caller), then ``normalize_domain_record``; any exception
    raised by the latter turns the record into a quarantine row instead of
    aborting the batch.

    Args:
        raw_records: Records read through ``.get``; ``payload``,
            ``source_url``, ``endpoint_id``, ``endpoint_key`` and
            ``source_kind`` are consulted.
        run_context: Supplies ``run_id`` and the fallback ``source_url``.

    Returns:
        A ``NormalizationResult`` holding the merged legacy+domain records and
        the quarantine rows collected along the way.
    """

    def instance_key_of(source: dict[str, Any]) -> str | None:
        # Blank or missing instance keys collapse to None.
        return str(source.get("instance_key") or "").strip() or None

    accepted: list[NormalizedRecord] = []
    rejected: list[dict[str, Any]] = []

    for record in raw_records:
        payload = record.get("payload")
        if not isinstance(payload, dict):
            continue

        # The record's own URL wins; the run-level URL is only a fallback.
        source_url = str(record.get("source_url") or run_context.source_url)
        legacy_part = normalize_record(
            payload=payload,
            source_url=source_url,
            run_id=run_context.run_id,
            endpoint_id=int(record.get("endpoint_id")),
            endpoint_key=str(record.get("endpoint_key") or "default"),
            source_kind=str(record.get("source_kind") or "powerbi_microdados"),
        )

        try:
            domain_part = normalize_domain_record(
                payload=payload,
                run_id=run_context.run_id,
                instance_key=instance_key_of(payload),
                source_url=source_url,
            )
        except Exception as exc:
            # Keep the offending payload verbatim so it can be inspected later.
            quarantine_row = {
                "run_id": run_context.run_id,
                "instance_key": instance_key_of(payload),
                "source_url": source_url,
                "source_row_number": payload.get("source_row_number"),
                "error_type": "unsupported_domain",
                "error_message": str(exc),
                "raw_line_text": json.dumps(payload, ensure_ascii=True, sort_keys=True),
                "details_json": {"tipo_microdados": payload.get("tipo_microdados")},
            }
            rejected.append(quarantine_row)
        else:
            # Domain fields override legacy fields on key collisions.
            accepted.append({**legacy_part, **domain_part})

    return NormalizationResult(normalized_records=accepted, quarantine_rows=rejected)
62
+
63
+
64
def load_raw_batch(
    dsn: str,
    *,
    normalized_records: list[dict[str, Any]],
    pending_assets: list[dict[str, Any]],
    pending_catalog_entries: list[dict[str, Any]],
    pending_run_selection: list[dict[str, Any]],
    pending_downloads: list[dict[str, Any]],
    pending_quarantine: list[dict[str, Any]],
    write_legacy: bool = False,
) -> dict[str, int]:
    """Forward a raw batch to ``pnp_raw_repository.load_raw_batch``.

    Every argument is passed through unchanged and the repository's return
    value is returned as-is.
    """
    forwarded = {
        "normalized_records": normalized_records,
        "pending_assets": pending_assets,
        "pending_catalog_entries": pending_catalog_entries,
        "pending_run_selection": pending_run_selection,
        "pending_downloads": pending_downloads,
        "pending_quarantine": pending_quarantine,
        "write_legacy": write_legacy,
    }
    return pnp_raw_repository.load_raw_batch(dsn, **forwarded)
85
+
86
+
87
def upsert_raw_metadata(
    dsn: str,
    *,
    pending_assets: list[dict[str, Any]],
    pending_catalog_entries: list[dict[str, Any]],
    pending_run_selection: list[dict[str, Any]],
    pending_downloads: list[dict[str, Any]],
    write_legacy: bool = False,
    include_download_columns: bool = True,
) -> dict[str, Any]:
    """Forward raw metadata to ``pnp_raw_repository.upsert_raw_metadata``.

    Every argument is passed through unchanged and the repository's return
    value is returned as-is.
    """
    forwarded = {
        "pending_assets": pending_assets,
        "pending_catalog_entries": pending_catalog_entries,
        "pending_run_selection": pending_run_selection,
        "pending_downloads": pending_downloads,
        "write_legacy": write_legacy,
        "include_download_columns": include_download_columns,
    }
    return pnp_raw_repository.upsert_raw_metadata(dsn, **forwarded)
106
+
107
+
108
def load_raw_record_chunk(
    dsn: str,
    *,
    normalized_records: list[dict[str, Any]],
    pending_quarantine: list[dict[str, Any]],
    download_id_by_url: dict[str, int],
    pending_assets: list[dict[str, Any]] | None = None,
    write_legacy: bool = False,
) -> dict[str, int]:
    """Forward one record chunk to ``pnp_raw_repository.load_raw_record_chunk``.

    Every argument is passed through unchanged and the repository's return
    value is returned as-is.
    """
    forwarded = {
        "normalized_records": normalized_records,
        "pending_quarantine": pending_quarantine,
        "download_id_by_url": download_id_by_url,
        "pending_assets": pending_assets,
        "write_legacy": write_legacy,
    }
    return pnp_raw_repository.load_raw_record_chunk(dsn, **forwarded)
@@ -0,0 +1,271 @@
1
+ from __future__ import annotations
2
+
3
+ from datetime import UTC, datetime
4
+ from pathlib import Path
5
+ from typing import Any
6
+
7
+ import psycopg2
8
+ from psycopg2.extras import Json, RealDictCursor
9
+
10
+ from dataif_pipelines.repositories import pnp_raw_repository
11
+
12
# Maps each supported ``tipo_microdados`` label to the staging SQL script
# (stored under the "domain" key) that is executed for downloads of that type.
_ENDPOINT_SQL = {
    "Matrículas": {"domain": "020_pnp_matriculas.sql"},
    "Eficiência Acadêmica": {"domain": "030_pnp_eficiencia_academica.sql"},
    "Servidores": {"domain": "040_pnp_servidores.sql"},
    "Financeiro": {"domain": "050_pnp_financeiro.sql"},
}
26
+
27
+
28
def _resolve_sql_dir() -> Path:
    """Locate the ``sql/staging`` directory relative to this module.

    Two candidates are probed in order: ``parents[2]`` and ``parents[3]`` of
    this file's resolved path, each joined with ``sql/staging``.

    Returns:
        The first candidate that exists on disk.

    Raises:
        FileNotFoundError: If neither candidate exists.
    """
    module_path = Path(__file__).resolve()
    # Both candidates are built up front, before any existence check.
    candidates = tuple(module_path.parents[depth] / "sql" / "staging" for depth in (2, 3))
    found = next((path for path in candidates if path.exists()), None)
    if found is None:
        raise FileNotFoundError("staging SQL directory not found in expected locations")
    return found
37
+
38
+
39
def _read_sql_file(filename: str) -> str:
    """Return the UTF-8 text of ``filename`` inside the staging SQL directory."""
    sql_path = _resolve_sql_dir() / filename
    return sql_path.read_text(encoding="utf-8")
41
+
42
+
43
def _list_run_download_batches(dsn: str, *, run_id: str, instance_key: str | None) -> list[dict[str, Any]]:
    """List the successful downloads of a run in staging order.

    Selects ``download_id`` and ``tipo_microdados`` from ``raw.pnp_downloads``
    for the given run and instance (a NULL instance matches NULL) where
    ``status = 'success'``. Rows are ordered Matrículas, Eficiência Acadêmica,
    Servidores, Financeiro, then any other type, and by ``download_id`` within
    each type.

    Args:
        dsn: Connection string passed to ``psycopg2.connect``.
        run_id: Run whose downloads are listed.
        instance_key: Instance filter; ``None`` matches rows with a NULL key.

    Returns:
        One plain dict per matching download row.
    """
    conn = psycopg2.connect(dsn, cursor_factory=RealDictCursor)
    try:
        # ``with conn`` only ends the transaction; the connection itself is
        # closed explicitly in the ``finally`` below.
        with conn, conn.cursor() as cur:
            cur.execute(
                """
                SELECT
                    download_id,
                    tipo_microdados
                FROM raw.pnp_downloads
                WHERE run_id = %s
                  AND instance_key IS NOT DISTINCT FROM %s
                  AND status = 'success'
                ORDER BY
                    CASE tipo_microdados
                        WHEN 'Matrículas' THEN 1
                        WHEN 'Eficiência Acadêmica' THEN 2
                        WHEN 'Servidores' THEN 3
                        WHEN 'Financeiro' THEN 4
                        ELSE 99
                    END,
                    download_id
                """,
                (run_id, instance_key),
            )
            return [dict(row) for row in cur.fetchall()]
    finally:
        conn.close()
67
+
68
+
69
def _collect_materialized_counts(dsn: str, *, run_id: str) -> dict[str, int]:
    """Count the staging rows of a run in each of the four staging tables.

    Args:
        dsn: Connection string passed to ``psycopg2.connect``.
        run_id: Run whose staging rows are counted.

    Returns:
        A dict with ``matriculas_count``, ``eficiencia_academica_count``,
        ``servidores_count`` and ``financeiro_count``, plus
        ``deduplicated_count`` holding the sum of those four.
    """
    table_keys = (
        ("staging.pnp_matriculas", "matriculas_count"),
        ("staging.pnp_eficiencia_academica", "eficiencia_academica_count"),
        ("staging.pnp_servidores", "servidores_count"),
        ("staging.pnp_financeiro", "financeiro_count"),
    )
    counts: dict[str, int] = {}
    conn = psycopg2.connect(dsn, cursor_factory=RealDictCursor)
    try:
        # ``with conn`` only ends the transaction; the connection itself is
        # closed explicitly in the ``finally`` below.
        with conn, conn.cursor() as cur:
            for table_name, result_key in table_keys:
                # table_name comes from the fixed tuple above, never from
                # caller input, so interpolating it into the SQL is safe.
                cur.execute(f"SELECT COUNT(*) AS count_value FROM {table_name} WHERE run_id = %s", (run_id,))
                counts[result_key] = int((cur.fetchone() or {}).get("count_value") or 0)
    finally:
        conn.close()
    counts["deduplicated_count"] = sum(counts[result_key] for _table_name, result_key in table_keys)
    return counts
87
+
88
+
89
def _upsert_ingestion_run(
    dsn: str,
    *,
    run_id: str,
    instance_key: str | None,
    raw_checks: dict[str, Any],
    started_at: datetime,
    status: str,
    quality_summary: dict[str, Any],
    finished: bool,
) -> None:
    """Insert or reset the ``staging.pnp_ingestion_runs`` row of a run.

    The deduplicated count is reset to 0 and ``quality_status`` to NULL;
    ``finished_at`` is set to ``NOW()`` only when ``finished`` is true.
    """
    conn = psycopg2.connect(dsn, cursor_factory=RealDictCursor)
    try:
        # ``with conn`` commits (or rolls back) the transaction; the
        # connection itself is closed in the ``finally`` below.
        with conn, conn.cursor() as cur:
            cur.execute(
                """
                INSERT INTO staging.pnp_ingestion_runs (
                    run_id,
                    instance_key,
                    status,
                    selected_download_count,
                    downloaded_file_count,
                    raw_record_count,
                    deduplicated_record_count,
                    quality_status,
                    quality_summary_json,
                    started_at,
                    finished_at,
                    updated_at
                )
                VALUES (
                    %(run_id)s,
                    %(instance_key)s,
                    %(status)s,
                    %(selected_download_count)s,
                    %(downloaded_file_count)s,
                    %(raw_record_count)s,
                    0,
                    NULL,
                    %(quality_summary)s::jsonb,
                    %(started_at)s,
                    CASE WHEN %(finished)s THEN NOW() ELSE NULL END,
                    NOW()
                )
                ON CONFLICT (run_id) DO UPDATE
                SET
                    instance_key = EXCLUDED.instance_key,
                    status = EXCLUDED.status,
                    selected_download_count = EXCLUDED.selected_download_count,
                    downloaded_file_count = EXCLUDED.downloaded_file_count,
                    raw_record_count = EXCLUDED.raw_record_count,
                    deduplicated_record_count = 0,
                    quality_status = NULL,
                    quality_summary_json = EXCLUDED.quality_summary_json,
                    started_at = EXCLUDED.started_at,
                    finished_at = EXCLUDED.finished_at,
                    updated_at = NOW()
                """,
                {
                    "run_id": run_id,
                    "instance_key": instance_key,
                    "status": status,
                    "selected_download_count": int(raw_checks.get("run_selection_count") or 0),
                    "downloaded_file_count": int(raw_checks.get("download_count") or 0),
                    "raw_record_count": int(raw_checks.get("raw_count") or 0),
                    "quality_summary": Json(quality_summary),
                    "started_at": started_at,
                    "finished": finished,
                },
            )
    finally:
        conn.close()


def materialize_instance_staging(dsn: str, *, run_id: str, instance_key: str | None) -> dict[str, Any]:
    """Materialize the staging tables for one run and track it in the run log.

    Steps:
      1. Upsert the run row in ``staging.pnp_ingestion_runs`` as ``running``.
      2. For every successful download of the run whose ``tipo_microdados``
         has an entry in ``_ENDPOINT_SQL``, execute the matching staging SQL
         script in its own connection and transaction.
      3. Count the staged rows and mark the run ``success``.

    If any step raises, the run row is upserted as failed (with the error
    text in ``quality_summary_json``) and the exception is re-raised, so a
    broken run no longer stays in the ``running`` state.

    Args:
        dsn: Connection string passed to ``psycopg2.connect``.
        run_id: Run to materialize.
        instance_key: Instance the run belongs to; may be ``None``.

    Returns:
        The run id, instance key, the four per-table counts,
        ``deduplicated_record_count`` and ``batch_count``.

    Raises:
        Exception: Whatever the database or SQL file access raised, after the
            failure has been recorded.
    """
    raw_checks = pnp_raw_repository.collect_run_checks(dsn, run_id)
    started_at = datetime.now(tz=UTC)
    batch_rows = _list_run_download_batches(dsn, run_id=run_id, instance_key=instance_key)

    try:
        _upsert_ingestion_run(
            dsn,
            run_id=run_id,
            instance_key=instance_key,
            raw_checks=raw_checks,
            started_at=started_at,
            status="running",
            quality_summary={},
            finished=False,
        )

        # Each script is read from disk once even when several downloads share it.
        sql_text_by_file: dict[str, str] = {}
        for batch in batch_rows:
            tipo_microdados = str(batch.get("tipo_microdados") or "")
            sql_files = _ENDPOINT_SQL.get(tipo_microdados)
            if not sql_files:
                # Download types without a staging script are skipped.
                continue
            sql_name = sql_files["domain"]
            if sql_name not in sql_text_by_file:
                sql_text_by_file[sql_name] = _read_sql_file(sql_name)
            params = {
                "instance_key": instance_key,
                "run_id": run_id,
                "download_id": int(batch["download_id"]),
            }
            conn = psycopg2.connect(dsn, cursor_factory=RealDictCursor)
            try:
                # One transaction per download; leaving ``with conn`` commits it.
                with conn, conn.cursor() as cur:
                    cur.execute("SET LOCAL synchronous_commit = OFF")
                    cur.execute(sql_text_by_file[sql_name], params)
            finally:
                conn.close()

        counts = _collect_materialized_counts(dsn, run_id=run_id)

        conn = psycopg2.connect(dsn, cursor_factory=RealDictCursor)
        try:
            with conn, conn.cursor() as cur:
                cur.execute(
                    """
                    UPDATE staging.pnp_ingestion_runs
                    SET
                        status = 'success',
                        deduplicated_record_count = %s,
                        quality_status = 'passed',
                        quality_summary_json = %s,
                        finished_at = NOW(),
                        updated_at = NOW()
                    WHERE run_id = %s
                    """,
                    (
                        counts["deduplicated_count"],
                        Json(
                            {
                                "raw_checks": raw_checks,
                                "staging_counts": {
                                    "matriculas_count": counts["matriculas_count"],
                                    "eficiencia_academica_count": counts["eficiencia_academica_count"],
                                    "servidores_count": counts["servidores_count"],
                                    "financeiro_count": counts["financeiro_count"],
                                    "deduplicated_count": counts["deduplicated_count"],
                                },
                                "batch_count": len(batch_rows),
                            }
                        ),
                        run_id,
                    ),
                )
        finally:
            conn.close()
    except Exception as exc:
        # NOTE(review): 'failed' is assumed to be an accepted value for
        # staging.pnp_ingestion_runs.status; confirm against the table DDL.
        _upsert_ingestion_run(
            dsn,
            run_id=run_id,
            instance_key=instance_key,
            raw_checks=raw_checks,
            started_at=started_at,
            status="failed",
            quality_summary={"error_type": type(exc).__name__, "error": str(exc)},
            finished=True,
        )
        raise

    return {
        "run_id": run_id,
        "instance_key": instance_key,
        "matriculas_count": counts["matriculas_count"],
        "eficiencia_academica_count": counts["eficiencia_academica_count"],
        "servidores_count": counts["servidores_count"],
        "financeiro_count": counts["financeiro_count"],
        "deduplicated_record_count": counts["deduplicated_count"],
        "batch_count": len(batch_rows),
    }
247
+
248
+
249
def collect_staging_checks(dsn: str, run_id: str) -> dict[str, Any]:
    """Fetch the ``staging.pnp_ingestion_runs`` row of a run.

    Args:
        dsn: Connection string passed to ``psycopg2.connect``.
        run_id: Run to look up.

    Returns:
        The row as a plain dict, or ``{"run_id": run_id}`` when no row exists.
    """
    conn = psycopg2.connect(dsn, cursor_factory=RealDictCursor)
    try:
        # ``with conn`` only ends the transaction; the connection itself is
        # closed explicitly in the ``finally`` below.
        with conn, conn.cursor() as cur:
            cur.execute(
                """
                SELECT
                    run_id,
                    instance_key,
                    status,
                    selected_download_count,
                    downloaded_file_count,
                    raw_record_count,
                    deduplicated_record_count,
                    quality_status,
                    quality_summary_json,
                    started_at,
                    finished_at
                FROM staging.pnp_ingestion_runs
                WHERE run_id = %s
                """,
                (run_id,),
            )
            row = cur.fetchone()
            return dict(row) if row else {"run_id": run_id}
    finally:
        conn.close()
@@ -0,0 +1,159 @@
1
+ from __future__ import annotations
2
+
3
+ import hashlib
4
+ from dataclasses import dataclass
5
+ from typing import Any
6
+
7
+ from dataif_pipelines.connectors.nilo_pecanha.powerbi_microdados import (
8
+ MicrodadosCatalogEntry,
9
+ PowerBIMicrodadosClient,
10
+ PowerBIMicrodadosContext,
11
+ )
12
+
13
+
14
@dataclass(frozen=True)
class CatalogSelectionResult:
    """Immutable description of which catalog entries a request selected."""

    # Instance the request belongs to, if any.
    instance_key: str | None
    # Years named by the request.
    selected_years: tuple[str, ...]
    # Microdados types named by the request.
    selected_microdados_types: tuple[str, ...]
    # Label for how the selection was made.
    selection_source: str
    # Catalog context the selection was resolved against.
    context: PowerBIMicrodadosContext
    # Every entry of the catalog.
    catalog_entries: tuple[MicrodadosCatalogEntry, ...]
    # The entries that were selected.
    selected_entries: tuple[MicrodadosCatalogEntry, ...]
23
+
24
+
25
def create_powerbi_client(*, page_url: str, timeout_seconds: int) -> PowerBIMicrodadosClient:
    """Build a ``PowerBIMicrodadosClient`` for the given page URL and timeout."""
    client = PowerBIMicrodadosClient(
        page_url=page_url,
        timeout_seconds=timeout_seconds,
    )
    return client
27
+
28
+
29
def resolve_catalog_selection(
    *,
    client: PowerBIMicrodadosClient,
    request_params: dict[str, Any],
) -> CatalogSelectionResult:
    """Resolve the request parameters against the public catalog.

    ``selected_years`` and ``selected_microdados_types`` are always required.
    When ``selected_downloads`` contains usable dicts, the selection is the
    set of catalog entries matching their (ano_base, tipo_microdados,
    microdados_url) triples, and every requested triple must exist in the
    catalog. Otherwise the catalog is filtered by year and type.

    Args:
        client: Object whose ``fetch_catalog()`` returns ``(context, entries)``.
        request_params: Request payload; only string items of the two list
            parameters and dict items of ``selected_downloads`` are used.

    Returns:
        The resolved ``CatalogSelectionResult``.

    Raises:
        RuntimeError: If years or types are missing, a requested download is
            absent from the catalog, or nothing matches.
    """

    def non_blank_strings(values: Any) -> tuple[str, ...]:
        # Non-string and blank items are dropped; the rest are stripped.
        return tuple(str(value).strip() for value in (values or []) if isinstance(value, str) and value.strip())

    def field_text(item: dict[str, Any], field: str) -> str:
        return str(item.get(field) or "").strip()

    def key_of(entry: Any) -> tuple[Any, Any, Any]:
        return (entry.ano_base, entry.tipo_microdados, entry.microdados_url)

    instance_key = str(request_params.get("instance_key") or "").strip() or None
    selected_years = non_blank_strings(request_params.get("selected_years"))
    selected_microdados_types = non_blank_strings(request_params.get("selected_microdados_types"))

    # Both filters are mandatory, even when explicit downloads are supplied.
    if not selected_years:
        raise RuntimeError("powerbi_microdados mode requires selected_years")
    if not selected_microdados_types:
        raise RuntimeError("powerbi_microdados mode requires selected_microdados_types")

    context, catalog_entries = client.fetch_catalog()
    requested_downloads = [
        item for item in (request_params.get("selected_downloads") or []) if isinstance(item, dict)
    ]

    if requested_downloads:
        # Downloads missing any of the three fields are ignored.
        wanted_keys = set()
        for item in requested_downloads:
            triple = (
                field_text(item, "ano_base"),
                field_text(item, "tipo_microdados"),
                field_text(item, "microdados_url"),
            )
            if all(triple):
                wanted_keys.add(triple)

        matched_entries = [entry for entry in catalog_entries if key_of(entry) in wanted_keys]
        absent_keys = wanted_keys - {key_of(entry) for entry in matched_entries}
        if absent_keys:
            missing_text = ", ".join(
                f"{ano_base}/{tipo_microdados}"
                for ano_base, tipo_microdados, _microdados_url in sorted(absent_keys)
            )
            raise RuntimeError(f"powerbi_microdados selected_downloads missing from public catalog: {missing_text}")
        selection_source = "selected_downloads"
    else:
        matched_entries = [
            entry
            for entry in catalog_entries
            if entry.ano_base in selected_years and entry.tipo_microdados in selected_microdados_types
        ]
        selection_source = "catalog_filter"

    if not matched_entries:
        raise RuntimeError("powerbi_microdados selection matched no download links in the public catalog")

    return CatalogSelectionResult(
        instance_key=instance_key,
        selected_years=selected_years,
        selected_microdados_types=selected_microdados_types,
        selection_source=selection_source,
        context=context,
        catalog_entries=tuple(catalog_entries),
        selected_entries=tuple(matched_entries),
    )
105
+
106
+
107
def build_catalog_entry_rows(
    *,
    run_id: str,
    selection: CatalogSelectionResult,
) -> list[dict[str, Any]]:
    """Build one row dict per catalog entry, flagging the selected ones.

    Args:
        run_id: Run the rows belong to.
        selection: Resolved selection providing the catalog entries, the
            selected entries, the instance key and the catalog context.

    Returns:
        Rows in catalog order. ``catalog_hash`` is the SHA-256 hex digest of
        ``"ano_base|tipo_microdados|microdados_url"`` and ``is_selected``
        tells whether that triple is among the selected entries.
    """
    chosen_keys = {
        (chosen.ano_base, chosen.tipo_microdados, chosen.microdados_url)
        for chosen in selection.selected_entries
    }

    def row_for(entry: Any) -> dict[str, Any]:
        identity = (entry.ano_base, entry.tipo_microdados, entry.microdados_url)
        ano_base, tipo_microdados, microdados_url = identity
        fingerprint = hashlib.sha256(
            f"{ano_base}|{tipo_microdados}|{microdados_url}".encode("utf-8")
        ).hexdigest()
        return {
            "run_id": run_id,
            "instance_key": selection.instance_key,
            "ano_base": ano_base,
            "tipo_microdados": tipo_microdados,
            "microdados_url": microdados_url,
            "resource_key": selection.context.resource_key,
            "visual_id": selection.context.visual_id,
            "api_base_url": selection.context.api_base_url,
            "catalog_hash": fingerprint,
            "is_selected": identity in chosen_keys,
        }

    return [row_for(entry) for entry in selection.catalog_entries]
135
+
136
+
137
def build_run_selection_rows(
    *,
    run_id: str,
    selection: CatalogSelectionResult,
) -> list[dict[str, Any]]:
    """Build one ranked row dict per selected catalog entry.

    Args:
        run_id: Run the rows belong to.
        selection: Resolved selection providing the selected entries, the
            instance key, the selection source and the catalog context.

    Returns:
        Rows in selection order; ``selection_rank`` starts at 1 and
        ``details_json`` carries the context's page URL and resource key.
    """
    return [
        {
            "run_id": run_id,
            "instance_key": selection.instance_key,
            "ano_base": chosen.ano_base,
            "tipo_microdados": chosen.tipo_microdados,
            "microdados_url": chosen.microdados_url,
            "selection_source": selection.selection_source,
            "selection_rank": rank,
            "details_json": {
                "page_url": selection.context.page_url,
                "resource_key": selection.context.resource_key,
            },
        }
        for rank, chosen in enumerate(selection.selected_entries, start=1)
    ]
@@ -0,0 +1,112 @@
1
-- Upsert the raw matriculas rows of one run, instance and download into
-- staging.pnp_matriculas.
--
-- Bound parameters are run_id, instance_key and download_id.
--
-- A raw row is kept only when an active row of raw.pnp_instance_selection
-- matches its instance_key, ano_base and tipo_microdados and, when that
-- selection has a configured URL, the URL recorded for the download.
-- Within each (instance_key, record_hash) group only the row with the
-- highest raw_record_id is written.
WITH candidate_rows AS (
    SELECT
        raw_src.*,
        ROW_NUMBER() OVER newest_first AS dedup_rank
    FROM raw.pnp_matriculas_src AS raw_src
    -- Outer join, so a raw row without a download record still qualifies
    -- when the selection has no configured URL.
    LEFT JOIN raw.pnp_downloads AS dl
        ON dl.download_id = raw_src.download_id
    INNER JOIN raw.pnp_instance_selection AS sel
        ON sel.instance_key = raw_src.instance_key
        AND sel.is_active = TRUE
        AND sel.ano_base = raw_src.ano_base
        AND sel.tipo_microdados = raw_src.tipo_microdados
        AND (
            sel.configured_microdados_url IS NULL
            OR sel.configured_microdados_url = dl.microdados_url
        )
    WHERE raw_src.run_id = %(run_id)s
        AND raw_src.instance_key IS NOT DISTINCT FROM %(instance_key)s
        AND raw_src.download_id = %(download_id)s
    WINDOW newest_first AS (
        PARTITION BY raw_src.instance_key, raw_src.record_hash
        ORDER BY raw_src.raw_record_id DESC
    )
),
latest_rows AS (
    SELECT *
    FROM candidate_rows
    WHERE dedup_rank = 1
)
INSERT INTO staging.pnp_matriculas (
    raw_record_id,
    run_id,
    instance_key,
    ano,
    instituicao,
    regiao,
    uf,
    municipio,
    sexo,
    cor_raca,
    renda_familiar,
    faixa_etaria,
    situacao_matricula,
    modalidade_ensino,
    tipo_curso,
    tipo_oferta,
    turno,
    eixo_tecnologico,
    subeixo_tecnologico,
    nome_curso,
    total_inscritos,
    vagas_ofertadas,
    processed_at
)
SELECT
    latest.raw_record_id,
    latest.run_id,
    latest.instance_key,
    -- Cast only when the text is one to four digits, anything else is NULL.
    CASE
        WHEN NULLIF(latest.ano, '') ~ '^[0-9]{1,4}$'
            THEN CAST(latest.ano AS INTEGER)
    END AS ano,
    -- Empty strings are stored as NULL for every text attribute.
    NULLIF(latest.instituicao, '') AS instituicao,
    NULLIF(latest.regiao, '') AS regiao,
    NULLIF(latest.uf, '') AS uf,
    NULLIF(latest.municipio, '') AS municipio,
    NULLIF(latest.sexo, '') AS sexo,
    NULLIF(latest.cor_raca, '') AS cor_raca,
    NULLIF(latest.renda_familiar, '') AS renda_familiar,
    NULLIF(latest.faixa_etaria, '') AS faixa_etaria,
    NULLIF(latest.situacao_de_matricula, '') AS situacao_matricula,
    NULLIF(latest.modalidade_de_ensino, '') AS modalidade_ensino,
    NULLIF(latest.tipo_de_curso, '') AS tipo_curso,
    NULLIF(latest.tipo_de_oferta, '') AS tipo_oferta,
    NULLIF(latest.turno, '') AS turno,
    NULLIF(latest.eixo_tecnologico, '') AS eixo_tecnologico,
    NULLIF(latest.subeixo_tecnologico, '') AS subeixo_tecnologico,
    NULLIF(latest.nome_de_curso, '') AS nome_curso,
    -- Numeric text is trimmed, dots are dropped and a comma becomes the
    -- decimal point before the cast (presumably Brazilian number formatting,
    -- to be confirmed against the source files). Non-numeric text is NULL.
    CASE
        WHEN REPLACE(REPLACE(NULLIF(BTRIM(latest.total_de_inscritos), ''), '.', ''), ',', '.') ~ '^-?[0-9]+(\.[0-9]+)?$'
            THEN CAST(REPLACE(REPLACE(NULLIF(BTRIM(latest.total_de_inscritos), ''), '.', ''), ',', '.') AS NUMERIC)
    END AS total_inscritos,
    CASE
        WHEN REPLACE(REPLACE(NULLIF(BTRIM(latest.vagas_ofertadas), ''), '.', ''), ',', '.') ~ '^-?[0-9]+(\.[0-9]+)?$'
            THEN CAST(REPLACE(REPLACE(NULLIF(BTRIM(latest.vagas_ofertadas), ''), '.', ''), ',', '.') AS NUMERIC)
    END AS vagas_ofertadas,
    NOW()
FROM latest_rows AS latest
-- A raw_record_id that is already staged is overwritten in place.
ON CONFLICT (raw_record_id) DO UPDATE
SET
    run_id              = EXCLUDED.run_id,
    instance_key        = EXCLUDED.instance_key,
    ano                 = EXCLUDED.ano,
    instituicao         = EXCLUDED.instituicao,
    regiao              = EXCLUDED.regiao,
    uf                  = EXCLUDED.uf,
    municipio           = EXCLUDED.municipio,
    sexo                = EXCLUDED.sexo,
    cor_raca            = EXCLUDED.cor_raca,
    renda_familiar      = EXCLUDED.renda_familiar,
    faixa_etaria        = EXCLUDED.faixa_etaria,
    situacao_matricula  = EXCLUDED.situacao_matricula,
    modalidade_ensino   = EXCLUDED.modalidade_ensino,
    tipo_curso          = EXCLUDED.tipo_curso,
    tipo_oferta         = EXCLUDED.tipo_oferta,
    turno               = EXCLUDED.turno,
    eixo_tecnologico    = EXCLUDED.eixo_tecnologico,
    subeixo_tecnologico = EXCLUDED.subeixo_tecnologico,
    nome_curso          = EXCLUDED.nome_curso,
    total_inscritos     = EXCLUDED.total_inscritos,
    vagas_ofertadas     = EXCLUDED.vagas_ofertadas,
    processed_at        = NOW();