@dataif/cli 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (183) hide show
  1. package/README.md +16 -0
  2. package/bin/dataif.js +623 -0
  3. package/package.json +26 -0
  4. package/scripts/build-template.mjs +72 -0
  5. package/templates/dataif/README.md +157 -0
  6. package/templates/dataif/infra/.env.example +119 -0
  7. package/templates/dataif/infra/.env.stg.example +119 -0
  8. package/templates/dataif/infra/airflow/Dockerfile +11 -0
  9. package/templates/dataif/infra/airflow/Dockerfile.release +17 -0
  10. package/templates/dataif/infra/airflow/requirements.txt +3 -0
  11. package/templates/dataif/infra/docker-compose.yml +306 -0
  12. package/templates/dataif/infra/init-db/01-init-dataif.sh +129 -0
  13. package/templates/dataif/infra/init-db/pnp-curated-views.sqlinc +444 -0
  14. package/templates/dataif/infra/init-db/pnp-raw-staging-curated.sqlinc +701 -0
  15. package/templates/dataif/infra/keycloak/Dockerfile +4 -0
  16. package/templates/dataif/infra/keycloak/realm-dataif.json +73 -0
  17. package/templates/dataif/infra/ollama/Dockerfile +9 -0
  18. package/templates/dataif/infra/ollama/bootstrap-model.sh +100 -0
  19. package/templates/dataif/infra/ollama/sabia-7b.Modelfile +14 -0
  20. package/templates/dataif/infra/postgres/Dockerfile +4 -0
  21. package/templates/dataif/pipelines/airflow/dags/generated/.gitkeep +1 -0
  22. package/templates/dataif/pipelines/airflow/dags/generated/2020_financeiro_fcc6f1f3_sync.py +9 -0
  23. package/templates/dataif/pipelines/dataif_pipelines/__init__.py +1 -0
  24. package/templates/dataif/pipelines/dataif_pipelines/airflow/__init__.py +1 -0
  25. package/templates/dataif/pipelines/dataif_pipelines/airflow/pnp_pipeline_factory.py +167 -0
  26. package/templates/dataif/pipelines/dataif_pipelines/connectors/__init__.py +1 -0
  27. package/templates/dataif/pipelines/dataif_pipelines/connectors/base/__init__.py +1 -0
  28. package/templates/dataif/pipelines/dataif_pipelines/connectors/base/connector.py +28 -0
  29. package/templates/dataif/pipelines/dataif_pipelines/connectors/base/types.py +14 -0
  30. package/templates/dataif/pipelines/dataif_pipelines/connectors/nilo_pecanha/__init__.py +1 -0
  31. package/templates/dataif/pipelines/dataif_pipelines/connectors/nilo_pecanha/config.py +19 -0
  32. package/templates/dataif/pipelines/dataif_pipelines/connectors/nilo_pecanha/connector.py +558 -0
  33. package/templates/dataif/pipelines/dataif_pipelines/connectors/nilo_pecanha/powerbi_microdados.py +728 -0
  34. package/templates/dataif/pipelines/dataif_pipelines/connectors/nilo_pecanha/transform.py +296 -0
  35. package/templates/dataif/pipelines/dataif_pipelines/jobs/__init__.py +1 -0
  36. package/templates/dataif/pipelines/dataif_pipelines/jobs/nilo_pipeline.py +112 -0
  37. package/templates/dataif/pipelines/dataif_pipelines/orchestration/__init__.py +21 -0
  38. package/templates/dataif/pipelines/dataif_pipelines/orchestration/pnp_workflow.py +783 -0
  39. package/templates/dataif/pipelines/dataif_pipelines/repositories/__init__.py +1 -0
  40. package/templates/dataif/pipelines/dataif_pipelines/repositories/pnp_raw_repository.py +860 -0
  41. package/templates/dataif/pipelines/dataif_pipelines/services/__init__.py +19 -0
  42. package/templates/dataif/pipelines/dataif_pipelines/services/pnp_curated_service.py +66 -0
  43. package/templates/dataif/pipelines/dataif_pipelines/services/pnp_download_service.py +534 -0
  44. package/templates/dataif/pipelines/dataif_pipelines/services/pnp_quality_service.py +9 -0
  45. package/templates/dataif/pipelines/dataif_pipelines/services/pnp_raw_ingestion_service.py +124 -0
  46. package/templates/dataif/pipelines/dataif_pipelines/services/pnp_staging_service.py +271 -0
  47. package/templates/dataif/pipelines/dataif_pipelines/services/powerbi_catalog_service.py +159 -0
  48. package/templates/dataif/pipelines/sql/staging/020_pnp_matriculas.sql +112 -0
  49. package/templates/dataif/pipelines/sql/staging/030_pnp_eficiencia_academica.sql +83 -0
  50. package/templates/dataif/pipelines/sql/staging/040_pnp_servidores.sql +90 -0
  51. package/templates/dataif/pipelines/sql/staging/050_pnp_financeiro.sql +72 -0
  52. package/templates/dataif/pipelines/sql/views_curated/004_mv_pnp_dashboard_fast.sql +204 -0
  53. package/templates/dataif/pipelines/sql/views_curated/010_vw_pnp_admin_ingestao.sql +51 -0
  54. package/templates/dataif/pipelines/sql/views_curated/020_vw_pnp_qualidade_dados.sql +114 -0
  55. package/templates/dataif/pipelines/sql/views_curated/030_vw_pnp_matriculas.sql +67 -0
  56. package/templates/dataif/pipelines/sql/views_curated/040_vw_pnp_eficiencia.sql +33 -0
  57. package/templates/dataif/pipelines/sql/views_curated/050_vw_pnp_servidores.sql +30 -0
  58. package/templates/dataif/pipelines/sql/views_curated/060_vw_pnp_financeiro.sql +22 -0
  59. package/templates/dataif/pipelines/sql/views_curated/070_vw_pnp_vanna.sql +115 -0
  60. package/templates/dataif/scripts/configure-env.sh +149 -0
  61. package/templates/dataif/scripts/create_metabase_pnp_dashboard.py +943 -0
  62. package/templates/dataif/scripts/create_metabase_pnp_matriculas_dashboard.py +580 -0
  63. package/templates/dataif/scripts/deploy.sh +79 -0
  64. package/templates/dataif/scripts/fix_metabase_template_tag_ids.py +91 -0
  65. package/templates/dataif/scripts/pnp_powerbi_microdados_probe.py +14 -0
  66. package/templates/dataif/scripts/pnp_validate_raw_run.py +330 -0
  67. package/templates/dataif/scripts/publish-images.sh +31 -0
  68. package/templates/dataif/scripts/sync_metabase_dashboard_field_filters.py +241 -0
  69. package/templates/dataif/scripts/use-vanna-ollama.sh +139 -0
  70. package/templates/dataif/services/api/.dockerignore +18 -0
  71. package/templates/dataif/services/api/Dockerfile +12 -0
  72. package/templates/dataif/services/api/app/__init__.py +1 -0
  73. package/templates/dataif/services/api/app/auth.py +48 -0
  74. package/templates/dataif/services/api/app/config.py +59 -0
  75. package/templates/dataif/services/api/app/keycloak_admin.py +215 -0
  76. package/templates/dataif/services/api/app/main.py +2432 -0
  77. package/templates/dataif/services/api/app/metabase_admin.py +191 -0
  78. package/templates/dataif/services/api/app/metabase_bootstrap.py +44 -0
  79. package/templates/dataif/services/api/app/metabase_embed.py +15 -0
  80. package/templates/dataif/services/api/app/pnp_dag_provisioner.py +113 -0
  81. package/templates/dataif/services/api/app/pnp_instance_repository.py +951 -0
  82. package/templates/dataif/services/api/app/pnp_powerbi.py +438 -0
  83. package/templates/dataif/services/api/app/vanna_client.py +32 -0
  84. package/templates/dataif/services/api/requirements.txt +9 -0
  85. package/templates/dataif/services/vanna/.dockerignore +18 -0
  86. package/templates/dataif/services/vanna/Dockerfile +12 -0
  87. package/templates/dataif/services/vanna/app/config.py +57 -0
  88. package/templates/dataif/services/vanna/app/main.py +108 -0
  89. package/templates/dataif/services/vanna/app/runtime_config.py +114 -0
  90. package/templates/dataif/services/vanna/app/sql_guard.py +123 -0
  91. package/templates/dataif/services/vanna/app/vanna_engine.py +382 -0
  92. package/templates/dataif/services/vanna/requirements.txt +8 -0
  93. package/templates/dataif/services/web/.dockerignore +13 -0
  94. package/templates/dataif/services/web/Dockerfile +16 -0
  95. package/templates/dataif/services/web/index.html +12 -0
  96. package/templates/dataif/services/web/nginx.conf +74 -0
  97. package/templates/dataif/services/web/package-lock.json +4397 -0
  98. package/templates/dataif/services/web/package.json +32 -0
  99. package/templates/dataif/services/web/postcss.config.mjs +5 -0
  100. package/templates/dataif/services/web/src/App.jsx +2817 -0
  101. package/templates/dataif/services/web/src/adminAuth.js +245 -0
  102. package/templates/dataif/services/web/src/assets/avatar_placeholder.png +0 -0
  103. package/templates/dataif/services/web/src/assets/github_logo_icon_229278.svg +1 -0
  104. package/templates/dataif/services/web/src/assets/if-logo.png +0 -0
  105. package/templates/dataif/services/web/src/assets/if.svg +0 -0
  106. package/templates/dataif/services/web/src/assets/pnp-horizontal.svg +1 -0
  107. package/templates/dataif/services/web/src/components/AppHeader.jsx +233 -0
  108. package/templates/dataif/services/web/src/components/application/app-navigation/base-components/mobile-header.tsx +56 -0
  109. package/templates/dataif/services/web/src/components/application/app-navigation/base-components/nav-account-card.tsx +209 -0
  110. package/templates/dataif/services/web/src/components/application/app-navigation/base-components/nav-item-button.tsx +67 -0
  111. package/templates/dataif/services/web/src/components/application/app-navigation/base-components/nav-item.tsx +108 -0
  112. package/templates/dataif/services/web/src/components/application/app-navigation/base-components/nav-list.tsx +83 -0
  113. package/templates/dataif/services/web/src/components/application/app-navigation/config.ts +23 -0
  114. package/templates/dataif/services/web/src/components/application/app-navigation/header-navigation.tsx +240 -0
  115. package/templates/dataif/services/web/src/components/application/pagination/pagination-base.tsx +376 -0
  116. package/templates/dataif/services/web/src/components/application/pagination/pagination-dot.tsx +52 -0
  117. package/templates/dataif/services/web/src/components/application/pagination/pagination-line.tsx +48 -0
  118. package/templates/dataif/services/web/src/components/application/pagination/pagination.tsx +328 -0
  119. package/templates/dataif/services/web/src/components/application/tabs/tabs.tsx +223 -0
  120. package/templates/dataif/services/web/src/components/base/avatar/avatar-label-group.tsx +28 -0
  121. package/templates/dataif/services/web/src/components/base/avatar/avatar.tsx +129 -0
  122. package/templates/dataif/services/web/src/components/base/avatar/base-components/avatar-add-button.tsx +32 -0
  123. package/templates/dataif/services/web/src/components/base/avatar/base-components/avatar-company-icon.tsx +24 -0
  124. package/templates/dataif/services/web/src/components/base/avatar/base-components/avatar-online-indicator.tsx +29 -0
  125. package/templates/dataif/services/web/src/components/base/avatar/base-components/index.tsx +4 -0
  126. package/templates/dataif/services/web/src/components/base/avatar/base-components/verified-tick.tsx +32 -0
  127. package/templates/dataif/services/web/src/components/base/badges/badge-types.ts +264 -0
  128. package/templates/dataif/services/web/src/components/base/badges/badges.tsx +415 -0
  129. package/templates/dataif/services/web/src/components/base/button-group/button-group.tsx +104 -0
  130. package/templates/dataif/services/web/src/components/base/buttons/button.tsx +267 -0
  131. package/templates/dataif/services/web/src/components/base/input/hint-text.tsx +31 -0
  132. package/templates/dataif/services/web/src/components/base/input/input.tsx +269 -0
  133. package/templates/dataif/services/web/src/components/base/input/label.tsx +48 -0
  134. package/templates/dataif/services/web/src/components/base/radio-buttons/radio-buttons.tsx +127 -0
  135. package/templates/dataif/services/web/src/components/base/select/combobox.tsx +150 -0
  136. package/templates/dataif/services/web/src/components/base/select/multi-select.tsx +361 -0
  137. package/templates/dataif/services/web/src/components/base/select/popover.tsx +32 -0
  138. package/templates/dataif/services/web/src/components/base/select/select-item.tsx +95 -0
  139. package/templates/dataif/services/web/src/components/base/select/select-native.tsx +67 -0
  140. package/templates/dataif/services/web/src/components/base/select/select.tsx +144 -0
  141. package/templates/dataif/services/web/src/components/base/tags/base-components/tag-close-x.tsx +32 -0
  142. package/templates/dataif/services/web/src/components/base/tooltip/tooltip.tsx +107 -0
  143. package/templates/dataif/services/web/src/components/foundations/dot-icon.tsx +22 -0
  144. package/templates/dataif/services/web/src/components/foundations/logo/untitledui-logo-minimal.tsx +170 -0
  145. package/templates/dataif/services/web/src/components/foundations/logo/untitledui-logo.tsx +58 -0
  146. package/templates/dataif/services/web/src/hooks/use-breakpoint.ts +34 -0
  147. package/templates/dataif/services/web/src/hooks/use-resize-observer.ts +67 -0
  148. package/templates/dataif/services/web/src/main.jsx +14 -0
  149. package/templates/dataif/services/web/src/providers/theme-provider.jsx +62 -0
  150. package/templates/dataif/services/web/src/styles/globals.css +60 -0
  151. package/templates/dataif/services/web/src/styles/theme.css +1326 -0
  152. package/templates/dataif/services/web/src/styles/typography.css +430 -0
  153. package/templates/dataif/services/web/src/styles.css +1287 -0
  154. package/templates/dataif/services/web/src/utils/cx.ts +24 -0
  155. package/templates/dataif/services/web/src/utils/is-react-component.ts +33 -0
  156. package/templates/dataif/services/web/vite.config.js +14 -0
  157. package/templates/dataif/sql/ddl/001_schemas.sql +6 -0
  158. package/templates/dataif/sql/ddl/003_pnp_raw_staging_curated.sql +699 -0
  159. package/templates/dataif/sql/migrations/001_pnp_phase1_backfill.sql +3 -0
  160. package/templates/dataif/sql/migrations/002_pnp_phase2_admin_config_backfill.sql +184 -0
  161. package/templates/dataif/sql/migrations/003_pnp_phase3_raw_tabular_backfill.sql +3 -0
  162. package/templates/dataif/sql/migrations/004_pnp_phase3_raw_backfill_support_index.sql +3 -0
  163. package/templates/dataif/sql/migrations/005_pnp_phase7_staging_support_indexes.sql +2 -0
  164. package/templates/dataif/sql/migrations/006_pnp_phase7_staging_autovacuum_tuning.sql +2 -0
  165. package/templates/dataif/sql/migrations/007_pnp_phase7b_run_packages.sql +20 -0
  166. package/templates/dataif/sql/migrations/008_pnp_phase7a_pipeline_endpoints.sql +169 -0
  167. package/templates/dataif/sql/migrations/009_pnp_phase8_curated.sql +35 -0
  168. package/templates/dataif/sql/migrations/010_pnp_phase10_staging_incremental_upsert.sql +3 -0
  169. package/templates/dataif/sql/migrations/010_pnp_pipeline_uuid.sql +51 -0
  170. package/templates/dataif/sql/migrations/011_app_settings.sql +7 -0
  171. package/templates/dataif/sql/staging/020_pnp_matriculas.sql +112 -0
  172. package/templates/dataif/sql/staging/030_pnp_eficiencia_academica.sql +83 -0
  173. package/templates/dataif/sql/staging/040_pnp_servidores.sql +90 -0
  174. package/templates/dataif/sql/staging/050_pnp_financeiro.sql +72 -0
  175. package/templates/dataif/sql/views_curated/003_vw_pnp_microdados_admin.sql +160 -0
  176. package/templates/dataif/sql/views_curated/004_mv_pnp_dashboard_fast.sql +204 -0
  177. package/templates/dataif/sql/views_curated/010_vw_pnp_admin_ingestao.sql +51 -0
  178. package/templates/dataif/sql/views_curated/020_vw_pnp_qualidade_dados.sql +114 -0
  179. package/templates/dataif/sql/views_curated/030_vw_pnp_matriculas.sql +67 -0
  180. package/templates/dataif/sql/views_curated/040_vw_pnp_eficiencia.sql +33 -0
  181. package/templates/dataif/sql/views_curated/050_vw_pnp_servidores.sql +30 -0
  182. package/templates/dataif/sql/views_curated/060_vw_pnp_financeiro.sql +22 -0
  183. package/templates/dataif/sql/views_curated/070_vw_pnp_vanna.sql +115 -0
@@ -0,0 +1,19 @@
1
+ """Service layer for pipeline orchestration and extraction steps."""
2
+
3
# Importing the submodules here binds each of them as an attribute of the
# package namespace.
from dataif_pipelines.services import (
    pnp_curated_service,
    pnp_download_service,
    pnp_quality_service,
    pnp_raw_ingestion_service,
    pnp_staging_service,
    powerbi_catalog_service,
)

# Public names of the package, listed in the same order as the import above.
__all__ = [
    "pnp_curated_service",
    "pnp_download_service",
    "pnp_quality_service",
    "pnp_raw_ingestion_service",
    "pnp_staging_service",
    "powerbi_catalog_service",
]
@@ -0,0 +1,66 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+ from typing import Any
5
+
6
+ import psycopg2
7
+ from psycopg2.extras import RealDictCursor
8
+
9
+ _CURATED_SQL_FILES = (
10
+ "010_vw_pnp_admin_ingestao.sql",
11
+ "020_vw_pnp_qualidade_dados.sql",
12
+ "030_vw_pnp_matriculas.sql",
13
+ "040_vw_pnp_eficiencia.sql",
14
+ "050_vw_pnp_servidores.sql",
15
+ "060_vw_pnp_financeiro.sql",
16
+ "070_vw_pnp_vanna.sql",
17
+ "004_mv_pnp_dashboard_fast.sql",
18
+ )
19
+
20
+
21
+ def _resolve_sql_dir() -> Path:
22
+ candidates = (
23
+ Path(__file__).resolve().parents[2] / "sql" / "views_curated",
24
+ Path(__file__).resolve().parents[3] / "sql" / "views_curated",
25
+ )
26
+ for candidate in candidates:
27
+ if candidate.exists():
28
+ return candidate
29
+ raise FileNotFoundError("curated SQL directory not found in expected locations")
30
+
31
+
32
def _read_sql_file(filename: str) -> str:
    """Return the UTF-8 text of *filename* inside the curated SQL directory."""
    sql_path = _resolve_sql_dir() / filename
    return sql_path.read_text(encoding="utf-8")
34
+
35
+
36
def materialize_instance_curated(dsn: str, *, run_id: str, instance_key: str | None) -> dict[str, Any]:
    """Run the curated SQL scripts and collect row counts for one pipeline run.

    Every file named in ``_CURATED_SQL_FILES`` is executed in order, then a
    fixed set of ``COUNT(*)`` queries is issued against the ``curated`` schema.

    Args:
        dsn: Connection string handed to ``psycopg2.connect``.
        run_id: Bound to the ``run_id = %s`` filter of the run-scoped counts.
        instance_key: Echoed back in the result; it is not used in any query.

    Returns:
        A dict holding ``run_id``, ``instance_key`` and one integer per count
        query (``0`` when a query yields no row).
    """
    # Counts restricted to the given run.
    run_scoped_queries = {
        "admin_ingestao_count": "SELECT COUNT(*) FROM curated.vw_pnp_admin_ingestao WHERE run_id = %s",
        "qualidade_count": "SELECT COUNT(*) FROM curated.vw_pnp_qualidade_dados WHERE run_id = %s",
        "matriculas_perfil_count": "SELECT COUNT(*) FROM curated.vw_pnp_matriculas_perfil WHERE run_id = %s",
        "matriculas_oferta_count": "SELECT COUNT(*) FROM curated.vw_pnp_matriculas_oferta WHERE run_id = %s",
        "eficiencia_situacao_count": "SELECT COUNT(*) FROM curated.vw_pnp_eficiencia_situacao WHERE run_id = %s",
        "servidores_quadro_count": "SELECT COUNT(*) FROM curated.vw_pnp_servidores_quadro WHERE run_id = %s",
        "financeiro_execucao_count": "SELECT COUNT(*) FROM curated.vw_pnp_financeiro_execucao WHERE run_id = %s",
        "vanna_resumo_count": "SELECT COUNT(*) FROM curated.vw_pnp_vanna_resumo WHERE run_id = %s",
    }
    # Counts over whole relations (no run filter).
    global_queries = {
        "vanna_catalogo_count": "SELECT COUNT(*) FROM curated.vw_pnp_vanna_catalogo",
        "mv_matriculas_count": "SELECT COUNT(*) FROM curated.mv_pnp_dashboard_matriculas",
        "mv_eficiencia_count": "SELECT COUNT(*) FROM curated.mv_pnp_dashboard_eficiencia",
        "mv_servidores_count": "SELECT COUNT(*) FROM curated.mv_pnp_dashboard_servidores",
        "mv_financeiro_count": "SELECT COUNT(*) FROM curated.mv_pnp_dashboard_financeiro",
        "mv_qualidade_count": "SELECT COUNT(*) FROM curated.mv_pnp_dashboard_qualidade",
        "mv_ingestao_count": "SELECT COUNT(*) FROM curated.mv_pnp_dashboard_ingestao",
    }

    result: dict[str, Any] = {"run_id": run_id, "instance_key": instance_key}
    conn = psycopg2.connect(dsn, cursor_factory=RealDictCursor)
    try:
        # ``with conn`` commits on success and rolls back on error, but it does
        # NOT close the connection -- that is what the ``finally`` below is for.
        with conn, conn.cursor() as cur:
            for filename in _CURATED_SQL_FILES:
                cur.execute(_read_sql_file(filename))

            for queries, params in ((run_scoped_queries, (run_id,)), (global_queries, None)):
                for key, query in queries.items():
                    cur.execute(query, params)
                    row = cur.fetchone()
                    # RealDictCursor rows are mappings; take the single COUNT column.
                    result[key] = int(next(iter(row.values()))) if row else 0
    finally:
        conn.close()
    return result
@@ -0,0 +1,534 @@
1
+ from __future__ import annotations
2
+
3
+ import csv
4
+ import gzip
5
+ import hashlib
6
+ import io
7
+ import json
8
+ import re
9
+ import unicodedata
10
+ from collections.abc import Iterator
11
+ from dataclasses import dataclass
12
+ from typing import Any
13
+ from urllib.parse import urlparse
14
+
15
+ from dataif_pipelines.connectors.nilo_pecanha.powerbi_microdados import MicrodadosCatalogEntry, PowerBIMicrodadosClient
16
+ from dataif_pipelines.services.powerbi_catalog_service import CatalogSelectionResult
17
+
18
+
19
@dataclass(frozen=True)
class ParsedCsvContent:
    """Result of parsing a whole CSV document held in memory."""

    # Header names with surrounding whitespace stripped.
    headers: tuple[str, ...]
    # One dict per data row.
    rows: list[dict[str, Any]]
    # Delimiter detected from the first line.
    delimiter: str
    # Descriptions of rows that carried more cells than the header.
    invalid_rows: list[dict[str, Any]]
25
+
26
+
27
@dataclass(frozen=True)
class DownloadBatch:
    """Everything produced by one ``build_download_batch`` call."""

    # Row payloads, one per accepted CSV row across all downloaded entries.
    records: list[dict[str, Any]]
    # One download summary per selected catalog entry.
    downloads: list[dict[str, Any]]
    # Asset rows: one per download plus a final manifest.
    assets: list[dict[str, Any]]
    # Rows describing CSV lines flagged as invalid.
    quarantine_rows: list[dict[str, Any]]
33
+
34
+
35
@dataclass(frozen=True)
class CsvRowResult:
    """Outcome of reading a single CSV data row from a stream."""

    # 1-based position of the row among the rows yielded by the CSV reader.
    source_row_number: int
    # Cleaned row, or None when nothing usable was left after cleaning.
    row: dict[str, Any] | None
    # Error description when the row had surplus cells, otherwise None.
    invalid_row: dict[str, Any] | None
40
+
41
+
42
@dataclass(frozen=True)
class StreamedCsvContent:
    """Header metadata plus a lazy iterator over the rows of a CSV stream."""

    # Header names with surrounding whitespace stripped.
    headers: tuple[str, ...]
    # Delimiter detected from the first line.
    delimiter: str
    # Single-pass iterator yielding one CsvRowResult per data row.
    row_results: Iterator[CsvRowResult]
47
+
48
+
49
def normalize_column_name(value: str) -> str:
    """Turn an arbitrary header into a lowercase ``snake_case`` ASCII identifier.

    Accents are removed via NFKD decomposition, characters that cannot be
    represented in ASCII are dropped, and every run of characters outside
    ``a-z0-9`` becomes a single underscore. ``"coluna"`` is returned when
    nothing usable remains.
    """
    decomposed = unicodedata.normalize("NFKD", value)
    ascii_only = decomposed.encode("ascii", "ignore").decode("ascii").lower().strip()
    # Splitting on the separator runs and re-joining the non-empty pieces also
    # takes care of leading/trailing separators.
    tokens = [token for token in re.split(r"[^a-z0-9]+", ascii_only) if token]
    if not tokens:
        return "coluna"
    return "_".join(tokens)
54
+
55
+
56
def parse_csv_content(content: str) -> ParsedCsvContent:
    """Parse an in-memory CSV document into cleaned rows.

    The delimiter is ``;`` when the first line holds more semicolons than
    commas, otherwise ``,``. Header names, keys and string cells are stripped
    of surrounding whitespace. Rows with more cells than the header are
    reported in ``invalid_rows`` and are still kept in ``rows``, minus the
    surplus cells.

    Args:
        content: Full CSV text; an empty string yields an empty result.

    Returns:
        The headers, cleaned rows, detected delimiter and invalid-row reports.
    """
    if not content:
        return ParsedCsvContent(headers=(), rows=[], delimiter=",", invalid_rows=[])

    header_line = next(iter(content.splitlines()), "")
    delimiter = "," if header_line.count(";") <= header_line.count(",") else ";"
    reader = csv.DictReader(io.StringIO(content), delimiter=delimiter)
    headers = tuple(str(name).strip() for name in (reader.fieldnames or []) if name is not None)

    parsed_rows: list[dict[str, Any]] = []
    rejected: list[dict[str, Any]] = []
    row_number = 0  # counts rows yielded by the reader; the header is not included
    for record in reader:
        row_number += 1

        # DictReader collects cells beyond the header under the key None.
        overflow = record.get(None)
        if overflow:
            rejected.append(
                {
                    "source_row_number": row_number,
                    "error_type": "csv_extra_columns",
                    "error_message": "row has extra columns beyond the detected header",
                    "raw_line_text": json.dumps(overflow, ensure_ascii=True),
                }
            )

        stripped: dict[str, Any] = {}
        for column, cell in record.items():
            if column is None:
                continue
            stripped[str(column).strip()] = cell.strip() if isinstance(cell, str) else cell
        if stripped:
            parsed_rows.append(stripped)

    return ParsedCsvContent(headers=headers, rows=parsed_rows, delimiter=delimiter, invalid_rows=rejected)
90
+
91
+
92
def decode_content_bytes(content_bytes: bytes, source_url: str) -> str:
    """Decode downloaded bytes into text, gunzipping ``.gz`` sources first.

    The payload is treated as gzip when the URL, or its path component with
    query string and fragment ignored, ends in ``.gz``. Decoding then tries
    UTF-8 (dropping a BOM if present), then cp1252, then latin-1.

    Args:
        content_bytes: Raw bytes as downloaded.
        source_url: URL the bytes came from; only used to detect gzip.

    Returns:
        The decoded text.

    Raises:
        gzip.BadGzipFile: If the URL indicates gzip but the payload is not.
    """
    # Looking at the parsed path keeps URLs such as ".../file.csv.gz?sig=..."
    # working; the plain suffix test is kept for backward compatibility.
    is_gzip = source_url.endswith(".gz") or urlparse(source_url).path.lower().endswith(".gz")
    content = gzip.decompress(content_bytes) if is_gzip else content_bytes

    # "utf-8-sig" accepts everything plain "utf-8" accepts, so one attempt is enough.
    for encoding in ("utf-8-sig", "cp1252"):
        try:
            return content.decode(encoding)
        except UnicodeDecodeError:
            continue
    # latin-1 maps every byte value to a code point, so this cannot fail.
    return content.decode("latin-1")
102
+
103
+
104
def stream_csv_content(content_bytes: bytes, source_url: str) -> StreamedCsvContent:
    """Parse downloaded CSV bytes lazily, yielding one result per data row.

    The byte source, encoding detection and line re-assembly are delegated to
    helpers defined elsewhere in this module (``_build_binary_factory``,
    ``_detect_content_encoding``, ``_iter_text_lines``). The delimiter is
    ``;`` when the first line holds more semicolons than commas, else ``,``.

    Args:
        content_bytes: Raw downloaded bytes; empty input yields an empty result.
        source_url: URL the bytes came from, forwarded to the binary factory.

    Returns:
        Headers, delimiter and a single-pass row iterator. The underlying text
        stream is closed when that iterator is exhausted or closed.
    """
    if not content_bytes:
        return StreamedCsvContent(headers=(), delimiter=",", row_results=iter(()))

    open_binary = _build_binary_factory(content_bytes, source_url)
    detected_encoding = _detect_content_encoding(open_binary)
    text_stream = io.TextIOWrapper(open_binary(), encoding=detected_encoding, errors="replace", newline="")

    # The first line is consumed for delimiter sniffing and handed back to the
    # line iterator so the CSV reader still sees it as the header.
    header_line = text_stream.readline()
    delimiter = "," if header_line.count(";") <= header_line.count(",") else ";"
    reader = csv.DictReader(_iter_text_lines(header_line, text_stream), delimiter=delimiter)
    headers = tuple(str(name).strip() for name in (reader.fieldnames or []) if name is not None)

    def _generate_rows() -> Iterator[CsvRowResult]:
        try:
            row_number = 0
            for record in reader:
                row_number += 1

                # DictReader collects cells beyond the header under the key None.
                overflow = record.get(None)
                problem = None
                if overflow:
                    problem = {
                        "source_row_number": row_number,
                        "error_type": "csv_extra_columns",
                        "error_message": "row has extra columns beyond the detected header",
                        "raw_line_text": json.dumps(overflow, ensure_ascii=True),
                    }

                stripped = {}
                for column, cell in record.items():
                    if column is None:
                        continue
                    stripped[str(column).strip()] = cell.strip() if isinstance(cell, str) else cell

                yield CsvRowResult(
                    source_row_number=row_number,
                    row=stripped or None,
                    invalid_row=problem,
                )
        finally:
            text_stream.close()

    return StreamedCsvContent(headers=headers, delimiter=delimiter, row_results=_generate_rows())
143
+
144
+
145
def stream_csv_binary_stream(binary_stream, source_url: str) -> StreamedCsvContent:
    """Parse CSV rows lazily from an open binary stream.

    The stream is gunzipped on the fly when the URL, or its path component
    with query string and fragment ignored, ends in ``.gz``. A leading sample
    of the decoded bytes is peeked, without being consumed, to pick the text
    encoding via ``_detect_bytes_encoding`` (defined elsewhere in this
    module). The delimiter is ``;`` when the first line holds more semicolons
    than commas, else ``,``.

    Args:
        binary_stream: Readable binary stream; it must be acceptable to
            ``io.BufferedReader`` (this is not checked here).
        source_url: URL the stream came from; only used to detect gzip.

    Returns:
        Headers, delimiter and a single-pass row iterator. The text wrapper
        is closed when that iterator is exhausted or closed.
    """
    sample_size = 65536

    buffered_raw = io.BufferedReader(binary_stream)
    # Looking at the parsed path keeps URLs such as ".../file.csv.gz?sig=..."
    # working; the plain suffix test is kept for backward compatibility.
    is_gzip = source_url.endswith(".gz") or urlparse(source_url).path.lower().endswith(".gz")
    decoded_binary_stream = gzip.GzipFile(fileobj=buffered_raw) if is_gzip else buffered_raw

    # peek() never returns more than the reader's buffer can hold, so the
    # buffer must be at least as large as the sample we want to inspect.
    buffered_decoded_stream = io.BufferedReader(decoded_binary_stream, buffer_size=sample_size)
    sample = buffered_decoded_stream.peek(sample_size)[:sample_size]
    encoding = _detect_bytes_encoding(sample)
    text_stream = io.TextIOWrapper(buffered_decoded_stream, encoding=encoding, errors="replace", newline="")

    # The first line is consumed for delimiter sniffing and handed back to the
    # line iterator so the CSV reader still sees it as the header.
    first_line = text_stream.readline()
    delimiter = ";" if first_line.count(";") > first_line.count(",") else ","
    reader = csv.DictReader(_iter_text_lines(first_line, text_stream), delimiter=delimiter)
    headers = tuple(str(item).strip() for item in (reader.fieldnames or []) if item is not None)

    def _row_iter() -> Iterator[CsvRowResult]:
        try:
            for row_number, row in enumerate(reader, start=1):
                # DictReader collects cells beyond the header under the key None.
                extras = row.get(None)
                invalid_row = None
                if extras:
                    invalid_row = {
                        "source_row_number": row_number,
                        "error_type": "csv_extra_columns",
                        "error_message": "row has extra columns beyond the detected header",
                        "raw_line_text": json.dumps(extras, ensure_ascii=True),
                    }

                cleaned = {
                    str(key).strip(): (value.strip() if isinstance(value, str) else value)
                    for key, value in row.items()
                    if key is not None
                }
                yield CsvRowResult(
                    source_row_number=row_number,
                    row=cleaned or None,
                    invalid_row=invalid_row,
                )
        finally:
            text_stream.close()

    return StreamedCsvContent(headers=headers, delimiter=delimiter, row_results=_row_iter())
184
+
185
+
186
def build_download_batch(
    *,
    client: PowerBIMicrodadosClient,
    run_id: str,
    endpoint_id: int,
    endpoint_key: str,
    selection: CatalogSelectionResult,
) -> DownloadBatch:
    """Download every selected catalog entry and assemble the rows to persist.

    For each entry in ``selection.selected_entries`` the content is fetched
    through ``client``, parsed as CSV, and turned into record payloads,
    quarantine rows, one download summary and one download asset. A manifest
    asset covering the whole selection is appended last.

    Args:
        client: Used to fetch the content of each entry.
        run_id: Identifier of the current run, copied into the produced rows.
        endpoint_id: Copied into the asset rows.
        endpoint_key: Copied into the asset rows.
        selection: Catalog selection naming the entries to download.

    Returns:
        The accumulated records, downloads, assets and quarantine rows.
        Exceptions raised while fetching or parsing are not handled here.
    """
    batch_records: list[dict[str, Any]] = []
    batch_downloads: list[dict[str, Any]] = []
    batch_assets: list[dict[str, Any]] = []
    batch_quarantine: list[dict[str, Any]] = []

    rank = 0
    for entry in selection.selected_entries:
        rank += 1
        fetched = client.fetch_entry_content(entry)
        parsed = stream_csv_content(fetched.content_bytes, entry.microdados_url)
        file_name = resolve_source_file_name(entry)
        records_before = len(batch_records)

        for outcome in parsed.row_results:
            # A flagged row is quarantined and, if it still has data, also kept.
            if outcome.invalid_row:
                quarantined = build_quarantine_row(
                    run_id=run_id,
                    instance_key=selection.instance_key,
                    entry=entry,
                    invalid_row=outcome.invalid_row,
                )
                batch_quarantine.append(quarantined)

            if outcome.row:
                payload = build_record_payload(
                    row=outcome.row,
                    row_number=outcome.source_row_number,
                    entry=entry,
                    selection=selection,
                    source_file_name=file_name,
                    source_file_sha256=fetched.sha256,
                )
                batch_records.append(payload)

        # Exactly one record is appended per accepted row of this entry.
        entry_row_count = len(batch_records) - records_before

        batch_downloads.append(
            build_download_row(
                run_id=run_id,
                instance_key=selection.instance_key,
                entry=entry,
                source_file_name=file_name,
                source_file_sha256=fetched.sha256,
                content_type=fetched.content_type,
                size_bytes=fetched.size_bytes,
                row_count_raw=entry_row_count,
                delimiter=parsed.delimiter,
                selection_source=selection.selection_source,
                selection_rank=rank,
                headers=parsed.headers,
                status="success",
                error_message=None,
            )
        )
        batch_assets.append(
            build_download_asset_row(
                run_id=run_id,
                endpoint_id=endpoint_id,
                endpoint_key=endpoint_key,
                entry=entry,
                source_file_name=file_name,
                content_type=fetched.content_type,
                size_bytes=fetched.size_bytes,
                sha256=fetched.sha256,
                row_count=entry_row_count,
                headers=parsed.headers,
            )
        )

    manifest_asset = build_manifest_asset_row(
        run_id=run_id,
        endpoint_id=endpoint_id,
        endpoint_key=endpoint_key,
        selection=selection,
        downloads=batch_downloads,
        raw_record_count=len(batch_records),
    )
    batch_assets.append(manifest_asset)

    return DownloadBatch(
        records=batch_records,
        downloads=batch_downloads,
        assets=batch_assets,
        quarantine_rows=batch_quarantine,
    )
276
+
277
+
278
def resolve_source_file_name(entry: MicrodadosCatalogEntry) -> str:
    """Return the last path segment of the entry URL as the file name.

    Falls back to ``"<ano_base>.csv"`` when the URL path has no final segment.
    """
    url_path = urlparse(entry.microdados_url).path
    last_segment = url_path.rpartition("/")[2]
    if last_segment:
        return last_segment
    return f"{entry.ano_base}.csv"
280
+
281
+
282
def build_download_row(
    *,
    run_id: str,
    instance_key: str | None,
    entry: MicrodadosCatalogEntry,
    source_file_name: str,
    source_file_sha256: str,
    content_type: str | None,
    size_bytes: int,
    row_count_raw: int,
    delimiter: str,
    selection_source: str,
    selection_rank: int,
    headers: tuple[str, ...],
    status: str,
    error_message: str | None,
) -> dict[str, Any]:
    """Assemble the summary row describing one downloaded microdados file.

    Besides copying the arguments through, the row carries a ``details_json``
    sub-dict (delimiter, selection info, header count) and a mapping from
    each original header to its normalized column name.
    """
    details = {
        "delimiter": delimiter,
        "selection_source": selection_source,
        "selection_rank": selection_rank,
        "header_count": len(headers),
    }

    normalized: dict[str, str] = {}
    for header in headers:
        normalized[header] = normalize_column_name(header)

    download_row: dict[str, Any] = {"run_id": run_id, "instance_key": instance_key}
    download_row["ano_base"] = entry.ano_base
    download_row["tipo_microdados"] = entry.tipo_microdados
    download_row["microdados_url"] = entry.microdados_url
    download_row["source_file_name"] = source_file_name
    download_row["source_file_sha256"] = source_file_sha256
    download_row["content_type"] = content_type
    download_row["size_bytes"] = size_bytes
    download_row["row_count_raw"] = row_count_raw
    download_row["status"] = status
    download_row["error_message"] = error_message
    download_row["details_json"] = details
    download_row["headers"] = headers
    download_row["normalized_headers"] = normalized
    return download_row
321
+
322
+
323
def build_download_asset_row(
    *,
    run_id: str,
    endpoint_id: int,
    endpoint_key: str,
    entry: MicrodadosCatalogEntry,
    source_file_name: str,
    content_type: str | None,
    size_bytes: int,
    sha256: str,
    row_count: int,
    headers: tuple[str, ...],
) -> dict[str, Any]:
    """Build the asset row that records the metadata of one downloaded file.

    The metadata is serialized as ASCII-only JSON with sorted keys and passed
    to ``_build_asset_row`` with asset type ``powerbi_microdados_download``.
    """
    metadata = {
        "ano_base": entry.ano_base,
        "tipo_microdados": entry.tipo_microdados,
        "microdados_url": entry.microdados_url,
        "source_file_name": source_file_name,
        "content_type": content_type,
        "size_bytes": size_bytes,
        "sha256": sha256,
        "row_count": row_count,
        "headers": list(headers),  # tuple -> list for the JSON payload
    }
    serialized = json.dumps(metadata, ensure_ascii=True, sort_keys=True)
    return _build_asset_row(
        run_id=run_id,
        endpoint_id=endpoint_id,
        endpoint_key=endpoint_key,
        asset_type="powerbi_microdados_download",
        source_url=entry.microdados_url,
        content_text=serialized,
    )
358
+
359
+
360
def build_manifest_asset_row(
    *,
    run_id: str,
    endpoint_id: int,
    endpoint_key: str,
    selection: CatalogSelectionResult,
    downloads: list[dict[str, Any]],
    raw_record_count: int,
) -> dict[str, Any]:
    """Build the asset row holding the manifest of a whole download batch.

    The manifest lists the selection parameters, the Power BI context, every
    catalog entry (flagged as selected or not) and the download summaries. It
    is serialized as ASCII-only JSON with sorted keys and passed to
    ``_build_asset_row`` with asset type ``powerbi_microdados_manifest``.
    """
    ctx = selection.context

    # Each manifest context key is the attribute of the same name.
    context_fields = (
        "page_url",
        "resource_key",
        "tenant_id",
        "resolved_cluster_uri",
        "api_base_url",
        "model_id",
        "dataset_id",
        "report_id",
        "report_numeric_id",
        "section_name",
        "section_display_name",
        "visual_id",
        "visual_type",
    )
    context_payload = {field: getattr(ctx, field) for field in context_fields}

    entry_payloads = []
    for entry in selection.catalog_entries:
        entry_payloads.append(
            {
                "ano_base": entry.ano_base,
                "tipo_microdados": entry.tipo_microdados,
                "microdados_url": entry.microdados_url,
                "is_selected": entry in selection.selected_entries,
            }
        )

    manifest = {
        "status": "ok",
        "selected_years": list(selection.selected_years),
        "selected_microdados_types": list(selection.selected_microdados_types),
        "selection_source": selection.selection_source,
        "catalog_entry_count": len(selection.catalog_entries),
        "selected_entry_count": len(selection.selected_entries),
        "raw_record_count": raw_record_count,
        "context": context_payload,
        "entries": entry_payloads,
        "downloads": downloads,
    }
    serialized = json.dumps(manifest, ensure_ascii=True, sort_keys=True)
    return _build_asset_row(
        run_id=run_id,
        endpoint_id=endpoint_id,
        endpoint_key=endpoint_key,
        asset_type="powerbi_microdados_manifest",
        source_url=ctx.page_url,
        content_text=serialized,
    )
411
+
412
+
413
def build_quarantine_row(
    *,
    run_id: str,
    instance_key: str | None,
    entry: MicrodadosCatalogEntry,
    invalid_row: dict[str, Any],
) -> dict[str, Any]:
    """Build the quarantine record for a CSV row that failed parsing.

    Missing or falsy ``error_type``, ``error_message`` and ``raw_line_text``
    values in ``invalid_row`` are replaced by fixed defaults; whatever is
    used is coerced to ``str``. ``source_row_number`` is passed through as
    found (``None`` when absent).
    """

    def _text_or(key: str, fallback: str) -> str:
        # Falsy values (None, "", 0, ...) count as missing.
        return str(invalid_row.get(key) or fallback)

    quarantine_row: dict[str, Any] = {
        "run_id": run_id,
        "instance_key": instance_key,
        "source_url": entry.microdados_url,
        "source_row_number": invalid_row.get("source_row_number"),
    }
    quarantine_row["error_type"] = _text_or("error_type", "csv_parse_error")
    quarantine_row["error_message"] = _text_or(
        "error_message", "CSV parsing produced an invalid row"
    )
    quarantine_row["raw_line_text"] = _text_or("raw_line_text", "")
    quarantine_row["details_json"] = {"tipo_microdados": entry.tipo_microdados}
    return quarantine_row
430
+
431
+
432
def build_record_payload(
    *,
    row: dict[str, Any],
    row_number: int,
    entry: MicrodadosCatalogEntry,
    selection: CatalogSelectionResult,
    source_file_name: str,
    source_file_sha256: str,
) -> dict[str, Any]:
    """Return a CSV row enriched with catalog and provenance fields.

    Only string-keyed columns of ``row`` are kept. The enrichment keys are
    written after the copy, so they overwrite same-named source columns.
    The ``id`` is the first truthy value among the known enrolment-code
    columns, falling back to a composite of year, type, file name and row
    number; it is always stored as ``str``.
    """
    tipo = entry.tipo_microdados

    payload: dict[str, Any] = {}
    for column, cell in row.items():
        if isinstance(column, str):
            payload[str(column)] = cell

    dataset_name = f"pnp_microdados_{_slugify(tipo)}"

    # Candidate identifier columns, in priority order.
    id_columns = (
        "Código da Matricula",
        "Código da Matrícula",
        "Código do Ciclo Matricula",
        "Matrícula",
    )
    for id_column in id_columns:
        record_id = payload.get(id_column)
        if record_id:
            break
    else:
        record_id = f"{entry.ano_base}|{tipo}|{source_file_name}|{row_number}"

    payload.update(
        {
            "id": str(record_id),
            "dataset": dataset_name,
            "entidade": tipo,
            "ano": entry.ano_base,
            "tipo": tipo,
            "indicador": tipo,
            "tipo_microdados": tipo,
            "microdados_url": entry.microdados_url,
            "source_file_name": source_file_name,
            "source_file_sha256": source_file_sha256,
            "source_method": "powerbi_microdados",
            "source_row_number": row_number,
            "instance_key": selection.instance_key,
        }
    )
    return payload
467
+
468
+
469
+ def _build_asset_row(
470
+ *,
471
+ run_id: str,
472
+ endpoint_id: int,
473
+ endpoint_key: str,
474
+ asset_type: str,
475
+ source_url: str,
476
+ content_text: str,
477
+ ) -> dict[str, Any]:
478
+ return {
479
+ "run_id": run_id,
480
+ "endpoint_id": endpoint_id,
481
+ "endpoint_key": endpoint_key,
482
+ "asset_type": asset_type,
483
+ "source_url": source_url,
484
+ "content_text": content_text,
485
+ "content_hash": hashlib.sha256(content_text.encode("utf-8")).hexdigest(),
486
+ }
487
+
488
+
489
+ def _build_binary_factory(content_bytes: bytes, source_url: str):
490
+ def _factory():
491
+ raw = io.BytesIO(content_bytes)
492
+ if source_url.endswith(".gz"):
493
+ return gzip.GzipFile(fileobj=raw)
494
+ return raw
495
+
496
+ return _factory
497
+
498
+
499
+ def _detect_content_encoding(binary_factory) -> str:
500
+ stream = binary_factory()
501
+ try:
502
+ sample = stream.read(65536)
503
+ finally:
504
+ stream.close()
505
+
506
+ for encoding in ("utf-8-sig", "utf-8", "cp1252", "latin-1"):
507
+ try:
508
+ sample.decode(encoding)
509
+ return encoding
510
+ except UnicodeDecodeError:
511
+ continue
512
+ return "utf-8"
513
+
514
+
515
+ def _detect_bytes_encoding(sample: bytes) -> str:
516
+ for encoding in ("utf-8-sig", "utf-8", "cp1252", "latin-1"):
517
+ try:
518
+ sample.decode(encoding)
519
+ return encoding
520
+ except UnicodeDecodeError:
521
+ continue
522
+ return "utf-8"
523
+
524
+
525
+ def _iter_text_lines(first_line: str, text_stream: io.TextIOWrapper) -> Iterator[str]:
526
+ if first_line:
527
+ yield first_line
528
+ yield from text_stream
529
+
530
+
531
+ def _slugify(value: str) -> str:
532
+ ascii_text = unicodedata.normalize("NFKD", value).encode("ascii", "ignore").decode("ascii").lower()
533
+ collapsed = re.sub(r"[^a-z0-9]+", "_", ascii_text).strip("_")
534
+ return collapsed or "arquivo"
@@ -0,0 +1,9 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any
4
+
5
+ from dataif_pipelines.repositories import pnp_raw_repository
6
+
7
+
8
def collect_run_checks(dsn: str, run_id: str) -> dict[str, Any]:
    """Return the checks for ``run_id`` as reported by the raw repository.

    Thin delegate: forwards both arguments unchanged to
    ``pnp_raw_repository.collect_run_checks`` and returns its result.
    """
    run_checks = pnp_raw_repository.collect_run_checks(dsn, run_id)
    return run_checks