@dataif/cli 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (183) hide show
  1. package/README.md +16 -0
  2. package/bin/dataif.js +623 -0
  3. package/package.json +26 -0
  4. package/scripts/build-template.mjs +72 -0
  5. package/templates/dataif/README.md +157 -0
  6. package/templates/dataif/infra/.env.example +119 -0
  7. package/templates/dataif/infra/.env.stg.example +119 -0
  8. package/templates/dataif/infra/airflow/Dockerfile +11 -0
  9. package/templates/dataif/infra/airflow/Dockerfile.release +17 -0
  10. package/templates/dataif/infra/airflow/requirements.txt +3 -0
  11. package/templates/dataif/infra/docker-compose.yml +306 -0
  12. package/templates/dataif/infra/init-db/01-init-dataif.sh +129 -0
  13. package/templates/dataif/infra/init-db/pnp-curated-views.sqlinc +444 -0
  14. package/templates/dataif/infra/init-db/pnp-raw-staging-curated.sqlinc +701 -0
  15. package/templates/dataif/infra/keycloak/Dockerfile +4 -0
  16. package/templates/dataif/infra/keycloak/realm-dataif.json +73 -0
  17. package/templates/dataif/infra/ollama/Dockerfile +9 -0
  18. package/templates/dataif/infra/ollama/bootstrap-model.sh +100 -0
  19. package/templates/dataif/infra/ollama/sabia-7b.Modelfile +14 -0
  20. package/templates/dataif/infra/postgres/Dockerfile +4 -0
  21. package/templates/dataif/pipelines/airflow/dags/generated/.gitkeep +1 -0
  22. package/templates/dataif/pipelines/airflow/dags/generated/2020_financeiro_fcc6f1f3_sync.py +9 -0
  23. package/templates/dataif/pipelines/dataif_pipelines/__init__.py +1 -0
  24. package/templates/dataif/pipelines/dataif_pipelines/airflow/__init__.py +1 -0
  25. package/templates/dataif/pipelines/dataif_pipelines/airflow/pnp_pipeline_factory.py +167 -0
  26. package/templates/dataif/pipelines/dataif_pipelines/connectors/__init__.py +1 -0
  27. package/templates/dataif/pipelines/dataif_pipelines/connectors/base/__init__.py +1 -0
  28. package/templates/dataif/pipelines/dataif_pipelines/connectors/base/connector.py +28 -0
  29. package/templates/dataif/pipelines/dataif_pipelines/connectors/base/types.py +14 -0
  30. package/templates/dataif/pipelines/dataif_pipelines/connectors/nilo_pecanha/__init__.py +1 -0
  31. package/templates/dataif/pipelines/dataif_pipelines/connectors/nilo_pecanha/config.py +19 -0
  32. package/templates/dataif/pipelines/dataif_pipelines/connectors/nilo_pecanha/connector.py +558 -0
  33. package/templates/dataif/pipelines/dataif_pipelines/connectors/nilo_pecanha/powerbi_microdados.py +728 -0
  34. package/templates/dataif/pipelines/dataif_pipelines/connectors/nilo_pecanha/transform.py +296 -0
  35. package/templates/dataif/pipelines/dataif_pipelines/jobs/__init__.py +1 -0
  36. package/templates/dataif/pipelines/dataif_pipelines/jobs/nilo_pipeline.py +112 -0
  37. package/templates/dataif/pipelines/dataif_pipelines/orchestration/__init__.py +21 -0
  38. package/templates/dataif/pipelines/dataif_pipelines/orchestration/pnp_workflow.py +783 -0
  39. package/templates/dataif/pipelines/dataif_pipelines/repositories/__init__.py +1 -0
  40. package/templates/dataif/pipelines/dataif_pipelines/repositories/pnp_raw_repository.py +860 -0
  41. package/templates/dataif/pipelines/dataif_pipelines/services/__init__.py +19 -0
  42. package/templates/dataif/pipelines/dataif_pipelines/services/pnp_curated_service.py +66 -0
  43. package/templates/dataif/pipelines/dataif_pipelines/services/pnp_download_service.py +534 -0
  44. package/templates/dataif/pipelines/dataif_pipelines/services/pnp_quality_service.py +9 -0
  45. package/templates/dataif/pipelines/dataif_pipelines/services/pnp_raw_ingestion_service.py +124 -0
  46. package/templates/dataif/pipelines/dataif_pipelines/services/pnp_staging_service.py +271 -0
  47. package/templates/dataif/pipelines/dataif_pipelines/services/powerbi_catalog_service.py +159 -0
  48. package/templates/dataif/pipelines/sql/staging/020_pnp_matriculas.sql +112 -0
  49. package/templates/dataif/pipelines/sql/staging/030_pnp_eficiencia_academica.sql +83 -0
  50. package/templates/dataif/pipelines/sql/staging/040_pnp_servidores.sql +90 -0
  51. package/templates/dataif/pipelines/sql/staging/050_pnp_financeiro.sql +72 -0
  52. package/templates/dataif/pipelines/sql/views_curated/004_mv_pnp_dashboard_fast.sql +204 -0
  53. package/templates/dataif/pipelines/sql/views_curated/010_vw_pnp_admin_ingestao.sql +51 -0
  54. package/templates/dataif/pipelines/sql/views_curated/020_vw_pnp_qualidade_dados.sql +114 -0
  55. package/templates/dataif/pipelines/sql/views_curated/030_vw_pnp_matriculas.sql +67 -0
  56. package/templates/dataif/pipelines/sql/views_curated/040_vw_pnp_eficiencia.sql +33 -0
  57. package/templates/dataif/pipelines/sql/views_curated/050_vw_pnp_servidores.sql +30 -0
  58. package/templates/dataif/pipelines/sql/views_curated/060_vw_pnp_financeiro.sql +22 -0
  59. package/templates/dataif/pipelines/sql/views_curated/070_vw_pnp_vanna.sql +115 -0
  60. package/templates/dataif/scripts/configure-env.sh +149 -0
  61. package/templates/dataif/scripts/create_metabase_pnp_dashboard.py +943 -0
  62. package/templates/dataif/scripts/create_metabase_pnp_matriculas_dashboard.py +580 -0
  63. package/templates/dataif/scripts/deploy.sh +79 -0
  64. package/templates/dataif/scripts/fix_metabase_template_tag_ids.py +91 -0
  65. package/templates/dataif/scripts/pnp_powerbi_microdados_probe.py +14 -0
  66. package/templates/dataif/scripts/pnp_validate_raw_run.py +330 -0
  67. package/templates/dataif/scripts/publish-images.sh +31 -0
  68. package/templates/dataif/scripts/sync_metabase_dashboard_field_filters.py +241 -0
  69. package/templates/dataif/scripts/use-vanna-ollama.sh +139 -0
  70. package/templates/dataif/services/api/.dockerignore +18 -0
  71. package/templates/dataif/services/api/Dockerfile +12 -0
  72. package/templates/dataif/services/api/app/__init__.py +1 -0
  73. package/templates/dataif/services/api/app/auth.py +48 -0
  74. package/templates/dataif/services/api/app/config.py +59 -0
  75. package/templates/dataif/services/api/app/keycloak_admin.py +215 -0
  76. package/templates/dataif/services/api/app/main.py +2432 -0
  77. package/templates/dataif/services/api/app/metabase_admin.py +191 -0
  78. package/templates/dataif/services/api/app/metabase_bootstrap.py +44 -0
  79. package/templates/dataif/services/api/app/metabase_embed.py +15 -0
  80. package/templates/dataif/services/api/app/pnp_dag_provisioner.py +113 -0
  81. package/templates/dataif/services/api/app/pnp_instance_repository.py +951 -0
  82. package/templates/dataif/services/api/app/pnp_powerbi.py +438 -0
  83. package/templates/dataif/services/api/app/vanna_client.py +32 -0
  84. package/templates/dataif/services/api/requirements.txt +9 -0
  85. package/templates/dataif/services/vanna/.dockerignore +18 -0
  86. package/templates/dataif/services/vanna/Dockerfile +12 -0
  87. package/templates/dataif/services/vanna/app/config.py +57 -0
  88. package/templates/dataif/services/vanna/app/main.py +108 -0
  89. package/templates/dataif/services/vanna/app/runtime_config.py +114 -0
  90. package/templates/dataif/services/vanna/app/sql_guard.py +123 -0
  91. package/templates/dataif/services/vanna/app/vanna_engine.py +382 -0
  92. package/templates/dataif/services/vanna/requirements.txt +8 -0
  93. package/templates/dataif/services/web/.dockerignore +13 -0
  94. package/templates/dataif/services/web/Dockerfile +16 -0
  95. package/templates/dataif/services/web/index.html +12 -0
  96. package/templates/dataif/services/web/nginx.conf +74 -0
  97. package/templates/dataif/services/web/package-lock.json +4397 -0
  98. package/templates/dataif/services/web/package.json +32 -0
  99. package/templates/dataif/services/web/postcss.config.mjs +5 -0
  100. package/templates/dataif/services/web/src/App.jsx +2817 -0
  101. package/templates/dataif/services/web/src/adminAuth.js +245 -0
  102. package/templates/dataif/services/web/src/assets/avatar_placeholder.png +0 -0
  103. package/templates/dataif/services/web/src/assets/github_logo_icon_229278.svg +1 -0
  104. package/templates/dataif/services/web/src/assets/if-logo.png +0 -0
  105. package/templates/dataif/services/web/src/assets/if.svg +0 -0
  106. package/templates/dataif/services/web/src/assets/pnp-horizontal.svg +1 -0
  107. package/templates/dataif/services/web/src/components/AppHeader.jsx +233 -0
  108. package/templates/dataif/services/web/src/components/application/app-navigation/base-components/mobile-header.tsx +56 -0
  109. package/templates/dataif/services/web/src/components/application/app-navigation/base-components/nav-account-card.tsx +209 -0
  110. package/templates/dataif/services/web/src/components/application/app-navigation/base-components/nav-item-button.tsx +67 -0
  111. package/templates/dataif/services/web/src/components/application/app-navigation/base-components/nav-item.tsx +108 -0
  112. package/templates/dataif/services/web/src/components/application/app-navigation/base-components/nav-list.tsx +83 -0
  113. package/templates/dataif/services/web/src/components/application/app-navigation/config.ts +23 -0
  114. package/templates/dataif/services/web/src/components/application/app-navigation/header-navigation.tsx +240 -0
  115. package/templates/dataif/services/web/src/components/application/pagination/pagination-base.tsx +376 -0
  116. package/templates/dataif/services/web/src/components/application/pagination/pagination-dot.tsx +52 -0
  117. package/templates/dataif/services/web/src/components/application/pagination/pagination-line.tsx +48 -0
  118. package/templates/dataif/services/web/src/components/application/pagination/pagination.tsx +328 -0
  119. package/templates/dataif/services/web/src/components/application/tabs/tabs.tsx +223 -0
  120. package/templates/dataif/services/web/src/components/base/avatar/avatar-label-group.tsx +28 -0
  121. package/templates/dataif/services/web/src/components/base/avatar/avatar.tsx +129 -0
  122. package/templates/dataif/services/web/src/components/base/avatar/base-components/avatar-add-button.tsx +32 -0
  123. package/templates/dataif/services/web/src/components/base/avatar/base-components/avatar-company-icon.tsx +24 -0
  124. package/templates/dataif/services/web/src/components/base/avatar/base-components/avatar-online-indicator.tsx +29 -0
  125. package/templates/dataif/services/web/src/components/base/avatar/base-components/index.tsx +4 -0
  126. package/templates/dataif/services/web/src/components/base/avatar/base-components/verified-tick.tsx +32 -0
  127. package/templates/dataif/services/web/src/components/base/badges/badge-types.ts +264 -0
  128. package/templates/dataif/services/web/src/components/base/badges/badges.tsx +415 -0
  129. package/templates/dataif/services/web/src/components/base/button-group/button-group.tsx +104 -0
  130. package/templates/dataif/services/web/src/components/base/buttons/button.tsx +267 -0
  131. package/templates/dataif/services/web/src/components/base/input/hint-text.tsx +31 -0
  132. package/templates/dataif/services/web/src/components/base/input/input.tsx +269 -0
  133. package/templates/dataif/services/web/src/components/base/input/label.tsx +48 -0
  134. package/templates/dataif/services/web/src/components/base/radio-buttons/radio-buttons.tsx +127 -0
  135. package/templates/dataif/services/web/src/components/base/select/combobox.tsx +150 -0
  136. package/templates/dataif/services/web/src/components/base/select/multi-select.tsx +361 -0
  137. package/templates/dataif/services/web/src/components/base/select/popover.tsx +32 -0
  138. package/templates/dataif/services/web/src/components/base/select/select-item.tsx +95 -0
  139. package/templates/dataif/services/web/src/components/base/select/select-native.tsx +67 -0
  140. package/templates/dataif/services/web/src/components/base/select/select.tsx +144 -0
  141. package/templates/dataif/services/web/src/components/base/tags/base-components/tag-close-x.tsx +32 -0
  142. package/templates/dataif/services/web/src/components/base/tooltip/tooltip.tsx +107 -0
  143. package/templates/dataif/services/web/src/components/foundations/dot-icon.tsx +22 -0
  144. package/templates/dataif/services/web/src/components/foundations/logo/untitledui-logo-minimal.tsx +170 -0
  145. package/templates/dataif/services/web/src/components/foundations/logo/untitledui-logo.tsx +58 -0
  146. package/templates/dataif/services/web/src/hooks/use-breakpoint.ts +34 -0
  147. package/templates/dataif/services/web/src/hooks/use-resize-observer.ts +67 -0
  148. package/templates/dataif/services/web/src/main.jsx +14 -0
  149. package/templates/dataif/services/web/src/providers/theme-provider.jsx +62 -0
  150. package/templates/dataif/services/web/src/styles/globals.css +60 -0
  151. package/templates/dataif/services/web/src/styles/theme.css +1326 -0
  152. package/templates/dataif/services/web/src/styles/typography.css +430 -0
  153. package/templates/dataif/services/web/src/styles.css +1287 -0
  154. package/templates/dataif/services/web/src/utils/cx.ts +24 -0
  155. package/templates/dataif/services/web/src/utils/is-react-component.ts +33 -0
  156. package/templates/dataif/services/web/vite.config.js +14 -0
  157. package/templates/dataif/sql/ddl/001_schemas.sql +6 -0
  158. package/templates/dataif/sql/ddl/003_pnp_raw_staging_curated.sql +699 -0
  159. package/templates/dataif/sql/migrations/001_pnp_phase1_backfill.sql +3 -0
  160. package/templates/dataif/sql/migrations/002_pnp_phase2_admin_config_backfill.sql +184 -0
  161. package/templates/dataif/sql/migrations/003_pnp_phase3_raw_tabular_backfill.sql +3 -0
  162. package/templates/dataif/sql/migrations/004_pnp_phase3_raw_backfill_support_index.sql +3 -0
  163. package/templates/dataif/sql/migrations/005_pnp_phase7_staging_support_indexes.sql +2 -0
  164. package/templates/dataif/sql/migrations/006_pnp_phase7_staging_autovacuum_tuning.sql +2 -0
  165. package/templates/dataif/sql/migrations/007_pnp_phase7b_run_packages.sql +20 -0
  166. package/templates/dataif/sql/migrations/008_pnp_phase7a_pipeline_endpoints.sql +169 -0
  167. package/templates/dataif/sql/migrations/009_pnp_phase8_curated.sql +35 -0
  168. package/templates/dataif/sql/migrations/010_pnp_phase10_staging_incremental_upsert.sql +3 -0
  169. package/templates/dataif/sql/migrations/010_pnp_pipeline_uuid.sql +51 -0
  170. package/templates/dataif/sql/migrations/011_app_settings.sql +7 -0
  171. package/templates/dataif/sql/staging/020_pnp_matriculas.sql +112 -0
  172. package/templates/dataif/sql/staging/030_pnp_eficiencia_academica.sql +83 -0
  173. package/templates/dataif/sql/staging/040_pnp_servidores.sql +90 -0
  174. package/templates/dataif/sql/staging/050_pnp_financeiro.sql +72 -0
  175. package/templates/dataif/sql/views_curated/003_vw_pnp_microdados_admin.sql +160 -0
  176. package/templates/dataif/sql/views_curated/004_mv_pnp_dashboard_fast.sql +204 -0
  177. package/templates/dataif/sql/views_curated/010_vw_pnp_admin_ingestao.sql +51 -0
  178. package/templates/dataif/sql/views_curated/020_vw_pnp_qualidade_dados.sql +114 -0
  179. package/templates/dataif/sql/views_curated/030_vw_pnp_matriculas.sql +67 -0
  180. package/templates/dataif/sql/views_curated/040_vw_pnp_eficiencia.sql +33 -0
  181. package/templates/dataif/sql/views_curated/050_vw_pnp_servidores.sql +30 -0
  182. package/templates/dataif/sql/views_curated/060_vw_pnp_financeiro.sql +22 -0
  183. package/templates/dataif/sql/views_curated/070_vw_pnp_vanna.sql +115 -0
@@ -0,0 +1,728 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import base64
5
+ import contextlib
6
+ import gzip
7
+ import hashlib
8
+ import io
9
+ import json
10
+ import uuid
11
+ from dataclasses import asdict, dataclass
12
+ from pathlib import Path
13
+ from typing import Any, Iterator
14
+ from urllib.parse import parse_qs, unquote, urlparse
15
+
16
+ import requests
17
+
18
# Report page used when the client is constructed without an explicit URL.
DEFAULT_POWERBI_MICRODADOS_URL = (
    "https://app.powerbi.com/view?"
    "r=eyJrIjoiZDhkNGNiYzgtMjQ0My00OGVlLWJjNzYtZWQwYjI2OThhYWM1IiwidCI6IjllNjgyMzU5LWQxMjgtNGVkYi1iYjU4LTgyYjJhMTUzNDBmZiJ9"
)

# Display name of the report section searched for the catalog visual.
MICRODADOS_SECTION_DISPLAY_NAME = "Microdados da PNP"

# Entity and property names used when a fallback query has to be built.
MICRODADOS_ENTITY_NAME = "Microdados"
MICRODADOS_ANO_PROPERTY = "Ano Base"
MICRODADOS_TIPO_PROPERTY = "Tipo de microdados"
MICRODADOS_URL_PROPERTY = "MicrodadosURL"

# Query references expected in the visual's Rows / Columns / Values projections.
MICRODADOS_ROWS_QUERY_REF = "Microdados.Ano Base"
MICRODADOS_COLUMNS_QUERY_REF = "Microdados.Tipo de microdados"
MICRODADOS_VALUES_QUERY_REF = "Microdados.MicrodadosURL"
30
+
31
+
32
@dataclass(frozen=True)
class PowerBIMicrodadosContext:
    """Immutable set of identifiers needed to query the catalog visual."""

    # Where the report was found and how it is addressed.
    page_url: str
    resource_key: str  # "k" field of the resource descriptor
    tenant_id: str  # "t" field of the resource descriptor
    resolved_cluster_uri: str
    api_base_url: str  # API host derived from the cluster URI

    # Model / report identifiers read from the report metadata.
    model_id: int
    dataset_id: str
    report_id: str
    report_numeric_id: int

    # Section and visual that expose the catalog.
    section_name: str
    section_display_name: str
    visual_id: str
    visual_type: str
    prototype_query: dict[str, Any]  # query sent back in the querydata request
48
+
49
+
50
@dataclass(frozen=True)
class MicrodadosCatalogEntry:
    """One downloadable item of the microdata catalog."""

    ano_base: str
    tipo_microdados: str
    microdados_url: str  # URL the download helpers fetch
55
+
56
+
57
@dataclass(frozen=True)
class MicrodadosDownloadResult:
    """Outcome of saving one catalog entry to disk."""

    # Which entry was downloaded.
    ano_base: str
    tipo_microdados: str
    source_url: str

    # What was written and how to verify it.
    output_path: str
    size_bytes: int
    sha256: str
    content_type: str | None  # value of the response "content-type" header, if any
    preview_lines: tuple[str, ...]
67
+
68
+
69
@dataclass(frozen=True)
class MicrodadosContentResult:
    """Fully buffered content of one catalog entry plus its digest."""

    source_url: str
    content_bytes: bytes  # complete response body held in memory
    size_bytes: int
    sha256: str  # hex digest of content_bytes
    content_type: str | None
76
+
77
+
78
@dataclass(frozen=True)
class MicrodadosContentStream:
    """Handle on the not-yet-consumed content of one catalog entry."""

    source_url: str
    raw_stream: Any  # file-like object the caller reads the body from
    content_type: str | None
    size_bytes: int  # taken from the Content-Length header; 0 when unknown
    sha256: str | None  # None when no digest has been computed
85
+
86
+
87
class _IterContentStream(io.RawIOBase):
    """Read-only raw stream backed by an iterator of byte chunks."""

    def __init__(self, chunks: Iterator[bytes]) -> None:
        self._chunks = iter(chunks)
        self._buffer = bytearray()
        self._closed = False

    def readable(self) -> bool:
        return True

    def readinto(self, buffer) -> int:
        """Copy up to ``len(buffer)`` bytes into ``buffer``; return the count (0 at EOF)."""
        if self._closed:
            return 0

        wanted = len(buffer)
        pending = self._buffer
        exhausted = object()  # sentinel distinguishing "no more chunks" from falsy chunks

        # Pull chunks only while the carry-over buffer cannot satisfy the request.
        while len(pending) < wanted:
            piece = next(self._chunks, exhausted)
            if piece is exhausted:
                break
            if piece:
                pending.extend(piece)

        if not pending:
            return 0

        size = min(wanted, len(pending))
        buffer[:size] = pending[:size]
        # Keep whatever was not consumed for the next call.
        del pending[:size]
        return size

    def close(self) -> None:
        self._closed = True
        super().close()
120
+
121
+
122
+ class PowerBIMicrodadosClient:
123
def __init__(
    self,
    page_url: str = DEFAULT_POWERBI_MICRODADOS_URL,
    timeout_seconds: int = 60,
    session: requests.Session | None = None,
) -> None:
    """Store the report URL, the per-request timeout and the HTTP session.

    A new ``requests.Session`` is created when ``session`` is not supplied.
    """
    http_session = session or requests.Session()
    self.session = http_session
    self.timeout_seconds = timeout_seconds
    self.page_url = page_url
132
+
133
def discover_context(self) -> PowerBIMicrodadosContext:
    """Collect the identifiers needed to query the microdata catalog visual.

    Fetches the report page, reads the resource descriptor (falling back to
    the ``r`` query parameter of the page URL), resolves the cluster and API
    host, downloads the report metadata and locates the catalog visual.

    Raises:
        RuntimeError: when the resource key, the tenant id, or the
            model/report identifiers cannot be determined.
    """
    timeout = self.timeout_seconds

    page = self.session.get(self.page_url, timeout=timeout)
    page.raise_for_status()
    html = page.text

    descriptor = self._extract_resource_descriptor(html)
    if not descriptor:
        descriptor = self._decode_resource_descriptor_from_url(self.page_url)
    resource_key = str(descriptor.get("k") or "").strip()
    tenant_id = str(descriptor.get("t") or "").strip()
    if not resource_key:
        raise RuntimeError("Power BI page did not expose a resource key")
    if not tenant_id:
        raise RuntimeError("Power BI page did not expose a tenant id")

    # Prefer the cluster URI embedded in the page; ask the routing API otherwise.
    resolved_cluster_uri = self._extract_resolved_cluster_uri(html) or self._resolve_cluster_uri(
        resource_key=resource_key,
        tenant_id=tenant_id,
    )
    api_base_url = self._build_apim_url(resolved_cluster_uri)

    metadata_url = f"{api_base_url}/public/reports/{resource_key}/modelsAndExploration?preferReadOnlySession=true"
    metadata_response = self.session.get(
        metadata_url,
        headers=self._powerbi_headers(resource_key),
        timeout=timeout,
    )
    metadata_response.raise_for_status()
    metadata = metadata_response.json()

    exploration = metadata.get("exploration") or {}
    report = dict(exploration.get("report") or {})
    model = dict(report.get("model") or {})
    # The first entry of "models" is the backup source for ids missing on the report.
    models = metadata.get("models") or [{}]
    model_fallback = dict(models[0] or {})
    section, visual = self._find_microdados_visual(metadata)

    model_id = int(report.get("modelId") or model_fallback.get("id") or 0)
    dataset_id = str(model.get("dbName") or model_fallback.get("dbName") or "").strip()
    report_id = str(report.get("objectId") or "").strip()
    report_numeric_id = int(report.get("id") or 0)
    if not (model_id and dataset_id and report_id):
        raise RuntimeError("Power BI metadata did not expose model/report identifiers for microdados")

    single_visual = visual.get("singleVisual") or {}
    return PowerBIMicrodadosContext(
        page_url=self.page_url,
        resource_key=resource_key,
        tenant_id=tenant_id,
        resolved_cluster_uri=resolved_cluster_uri,
        api_base_url=api_base_url,
        model_id=model_id,
        dataset_id=dataset_id,
        report_id=report_id,
        report_numeric_id=report_numeric_id,
        section_name=str(section.get("name") or ""),
        section_display_name=str(section.get("displayName") or ""),
        visual_id=str(visual.get("name") or ""),
        visual_type=str(single_visual.get("visualType") or ""),
        prototype_query=dict(single_visual.get("prototypeQuery") or {}),
    )
187
+
188
def fetch_catalog(self) -> tuple[PowerBIMicrodadosContext, list[MicrodadosCatalogEntry]]:
    """Discover the report context, run the catalog query and decode its rows.

    Returns:
        The discovered context together with the decoded catalog entries.
    """
    context = self.discover_context()
    query_url = f"{context.api_base_url}/public/reports/querydata?synchronous=true"
    request_headers = self._powerbi_headers(context.resource_key, json_request=True)
    request_body = self._build_querydata_body(context)

    response = self.session.post(
        query_url,
        headers=request_headers,
        json=request_body,
        timeout=self.timeout_seconds,
    )
    response.raise_for_status()
    return context, self._decode_microdados_catalog(response.json())
199
+
200
def download_entry(
    self,
    entry: MicrodadosCatalogEntry,
    output_dir: str | Path,
    preview_line_count: int = 0,
) -> MicrodadosDownloadResult:
    """Download ``entry`` into ``output_dir`` and describe the saved file.

    The file is named after the last path component of the entry URL; when
    the URL has none, ``<ano_base>_<slug of tipo_microdados>.bin`` is used.
    ``output_dir`` is created if it does not exist.
    """
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    filename = Path(urlparse(entry.microdados_url).path).name
    if not filename:
        filename = f"{entry.ano_base}_{self._slugify(entry.tipo_microdados)}.bin"
    output_path = target_dir / filename

    content = self.fetch_entry_content(entry)
    output_path.write_bytes(content.content_bytes)

    return MicrodadosDownloadResult(
        ano_base=entry.ano_base,
        tipo_microdados=entry.tipo_microdados,
        source_url=entry.microdados_url,
        output_path=str(output_path),
        size_bytes=content.size_bytes,
        sha256=content.sha256,
        content_type=content.content_type,
        preview_lines=self._read_preview_lines(output_path, preview_line_count),
    )
228
+
229
def fetch_entry_content(self, entry: MicrodadosCatalogEntry) -> MicrodadosContentResult:
    """Download the whole entry into memory and compute its SHA-256 digest."""
    digest = hashlib.sha256()
    body = io.BytesIO()

    with self.session.get(entry.microdados_url, stream=True, timeout=self.timeout_seconds) as response:
        response.raise_for_status()
        content_type = response.headers.get("content-type")
        for piece in response.iter_content(65536):
            if piece:
                body.write(piece)
                digest.update(piece)

    payload = body.getvalue()
    return MicrodadosContentResult(
        source_url=entry.microdados_url,
        content_bytes=payload,
        size_bytes=len(payload),
        sha256=digest.hexdigest(),
        content_type=content_type,
    )
251
+
252
@contextlib.contextmanager
def open_entry_content_stream(self, entry: MicrodadosCatalogEntry) -> Iterator[MicrodadosContentStream]:
    """Yield a stream over the entry content; the response stays open for the ``with`` body.

    ``size_bytes`` comes from the ``content-length`` header (0 when missing
    or not an integer) and ``sha256`` is always ``None``.
    """
    with self.session.get(entry.microdados_url, stream=True, timeout=self.timeout_seconds) as response:
        response.raise_for_status()
        response_headers = response.headers

        try:
            declared_size = int(response_headers.get("content-length") or 0)
        except (TypeError, ValueError):
            declared_size = 0

        stream = MicrodadosContentStream(
            source_url=entry.microdados_url,
            raw_stream=_IterContentStream(response.iter_content(65536)),
            content_type=response_headers.get("content-type"),
            size_bytes=declared_size,
            sha256=None,
        )
        yield stream
268
+
269
def _resolve_cluster_uri(self, resource_key: str, tenant_id: str) -> str:
    """Ask the public routing endpoint for the tenant's ``FixedClusterUri``.

    Raises:
        RuntimeError: when the response carries no ``FixedClusterUri``.
    """
    routing_url = f"https://api.powerbi.com/public/routing/cluster/{tenant_id}"
    response = self.session.get(
        routing_url,
        headers=self._powerbi_headers(resource_key),
        timeout=self.timeout_seconds,
    )
    response.raise_for_status()

    routing = response.json() or {}
    fixed_cluster_uri = str(routing.get("FixedClusterUri") or "").strip()
    if fixed_cluster_uri:
        return fixed_cluster_uri
    raise RuntimeError("Power BI cluster routing did not return FixedClusterUri")
280
+
281
+ @staticmethod
282
+ def _powerbi_headers(resource_key: str, json_request: bool = False) -> dict[str, str]:
283
+ headers = {
284
+ "Accept": "application/json",
285
+ "ActivityId": str(uuid.uuid4()),
286
+ "RequestId": str(uuid.uuid4()),
287
+ "X-PowerBI-ResourceKey": resource_key,
288
+ }
289
+ if json_request:
290
+ headers["Content-Type"] = "application/json"
291
+ return headers
292
+
293
+ @staticmethod
294
+ def _extract_resource_descriptor(html: str) -> dict[str, str]:
295
+ marker = "resourceDescriptor = JSON.parse('"
296
+ start = html.find(marker)
297
+ if start < 0:
298
+ return {}
299
+ start += len(marker)
300
+ end = html.find("');", start)
301
+ if end < 0:
302
+ return {}
303
+ payload = html[start:end]
304
+ decoded = bytes(payload, "utf-8").decode("unicode_escape")
305
+ try:
306
+ descriptor = json.loads(decoded)
307
+ except json.JSONDecodeError:
308
+ return {}
309
+ return {
310
+ "k": str(descriptor.get("k") or "").strip(),
311
+ "t": str(descriptor.get("t") or "").strip(),
312
+ }
313
+
314
+ @staticmethod
315
+ def _decode_resource_descriptor_from_url(page_url: str) -> dict[str, str]:
316
+ parsed = urlparse(page_url)
317
+ encoded = parse_qs(parsed.query).get("r", [])
318
+ if not encoded:
319
+ return {}
320
+ token = unquote(encoded[0])
321
+ padding = "=" * (-len(token) % 4)
322
+ decoded = base64.urlsafe_b64decode(token + padding).decode("utf-8")
323
+ payload = json.loads(decoded)
324
+ return {
325
+ "k": str(payload.get("k") or "").strip(),
326
+ "t": str(payload.get("t") or "").strip(),
327
+ }
328
+
329
+ @staticmethod
330
+ def _extract_resolved_cluster_uri(html: str) -> str:
331
+ marker = "var resolvedClusterUri = '"
332
+ start = html.find(marker)
333
+ if start < 0:
334
+ return ""
335
+ start += len(marker)
336
+ end = html.find("';", start)
337
+ if end < 0:
338
+ return ""
339
+ return html[start:end].strip()
340
+
341
+ @staticmethod
342
+ def _build_apim_url(cluster_uri: str) -> str:
343
+ parsed = urlparse(cluster_uri)
344
+ hostname = parsed.hostname or ""
345
+ if not hostname:
346
+ raise RuntimeError("Invalid Power BI cluster uri")
347
+ host_tokens = hostname.split(".")
348
+ host_tokens[0] = host_tokens[0].replace("-redirect", "")
349
+ host_tokens[0] = host_tokens[0].replace("global-", "")
350
+ host_tokens[0] = f"{host_tokens[0]}-api"
351
+ scheme = parsed.scheme or "https"
352
+ return f"{scheme}://{'.'.join(host_tokens)}"
353
+
354
@classmethod
def _find_microdados_visual(cls, metadata: dict[str, Any]) -> tuple[dict[str, Any], dict[str, Any]]:
    """Locate the section and visual that expose the microdata catalog.

    Search order: a matching visual in the preferred section, a synthesized
    fallback visual for that section, then a matching visual in any section.

    Raises:
        RuntimeError: when nothing usable is found; the message says whether
            the preferred section itself was missing.
    """
    exploration = metadata.get("exploration") or {}
    sections = list(exploration.get("sections") or [])

    preferred_section = cls._find_microdados_section(sections)
    if preferred_section is not None:
        visual = cls._select_microdados_visual(preferred_section)
        if visual:
            return preferred_section, visual

        fallback_visual = cls._build_fallback_visual(preferred_section)
        if fallback_visual is not None:
            return preferred_section, fallback_visual

    for section in sections:
        visual = cls._select_microdados_visual(section)
        if visual:
            return section, visual

    if preferred_section is not None:
        raise RuntimeError("Power BI metadata exposed the Microdados da PNP section, but no compatible visual or fallback query context was found")

    # List the section names that do exist to help diagnose a renamed page.
    display_names = {str(section.get("displayName") or "").strip() for section in sections}
    display_names.discard("")
    section_names = ", ".join(sorted(display_names))
    suffix = f"; available sections: {section_names}" if section_names else ""
    raise RuntimeError("Power BI metadata did not expose the Microdados da PNP section" + suffix)
389
+
390
@staticmethod
def _find_microdados_section(sections: list[dict[str, Any]]) -> dict[str, Any] | None:
    """Return the first section whose stripped display name is the catalog page name, else ``None``."""
    matching = (
        candidate
        for candidate in sections
        if str(candidate.get("displayName") or "").strip() == MICRODADOS_SECTION_DISPLAY_NAME
    )
    return next(matching, None)
396
+
397
@classmethod
def _select_microdados_visual(cls, section: dict[str, Any]) -> dict[str, Any] | None:
    """Return the first visual in ``section`` that matches the catalog layout, else ``None``.

    Containers whose ``config`` is not a non-blank JSON string, or that have
    no ``singleVisual``, are skipped.
    """
    for container in section.get("visualContainers") or []:
        raw_config = container.get("config")
        if not (isinstance(raw_config, str) and raw_config.strip()):
            continue

        try:
            visual = json.loads(raw_config)
        except json.JSONDecodeError:
            continue

        single_visual = dict(visual.get("singleVisual") or {})
        if not single_visual:
            continue

        projections = dict(single_visual.get("projections") or {})
        if cls._visual_matches_microdados_catalog(single_visual, projections):
            return visual

    return None
416
+
417
@classmethod
def _visual_matches_microdados_catalog(cls, single_visual: dict[str, Any], projections: dict[str, Any]) -> bool:
    """Tell whether a visual exposes the year / type / URL catalog fields.

    A visual matches when its active Rows, Columns and Values projections are
    exactly the three expected query references, or when its prototype query
    selects all three of them by name.
    """

    def active_refs(role: str) -> list[Any]:
        # A projection without an "active" key counts as active.
        return [item.get("queryRef") for item in projections.get(role) or [] if item.get("active", True)]

    if (
        active_refs("Rows") == [MICRODADOS_ROWS_QUERY_REF]
        and active_refs("Columns") == [MICRODADOS_COLUMNS_QUERY_REF]
        and active_refs("Values") == [MICRODADOS_VALUES_QUERY_REF]
    ):
        return True

    prototype_query = dict(single_visual.get("prototypeQuery") or {})
    select_names = {
        str(item.get("Name") or "").strip()
        for item in prototype_query.get("Select") or []
        if isinstance(item, dict)
    }
    required_names = {
        MICRODADOS_ROWS_QUERY_REF,
        MICRODADOS_COLUMNS_QUERY_REF,
        MICRODADOS_VALUES_QUERY_REF,
    }
    return required_names <= select_names
445
+
446
@classmethod
def _build_fallback_visual(cls, section: dict[str, Any]) -> dict[str, Any] | None:
    """Synthesize a catalog visual for ``section``; ``None`` when it has no visual containers.

    The visual is named after the first container's ``objectName`` or ``id``,
    defaulting to ``"microdados_catalog"``.
    """
    containers = list(section.get("visualContainers") or [])
    if not containers:
        return None

    first_container = containers[0]
    name_source = first_container.get("objectName") or first_container.get("id") or "microdados_catalog"
    single_visual = {
        "visualType": "microdados_catalog_fallback",
        "prototypeQuery": cls._build_fallback_prototype_query(),
    }
    return {"name": str(name_source), "singleVisual": single_visual}
460
+
461
@staticmethod
def _build_fallback_prototype_query() -> dict[str, Any]:
    """Build the hard-coded prototype query used by the fallback visual.

    The query reads from the microdados entity under the alias ``"m"`` and
    selects, in this order: the URL property as a measure, then the year and
    type properties as columns.

    Returns:
        A new prototype query mapping on every call.
    """

    def select_item(kind: str, property_name: str, query_ref: str) -> dict[str, Any]:
        # Each item gets its own nested dicts so no structure is shared.
        return {
            kind: {
                "Expression": {"SourceRef": {"Source": "m"}},
                "Property": property_name,
            },
            "Name": query_ref,
        }

    selections = [
        select_item("Measure", MICRODADOS_URL_PROPERTY, MICRODADOS_VALUES_QUERY_REF),
        select_item("Column", MICRODADOS_ANO_PROPERTY, MICRODADOS_ROWS_QUERY_REF),
        select_item("Column", MICRODADOS_TIPO_PROPERTY, MICRODADOS_COLUMNS_QUERY_REF),
    ]
    sources = [{"Name": "m", "Entity": MICRODADOS_ENTITY_NAME, "Type": 0}]
    return {"Version": 2, "From": sources, "Select": selections}
490
+
491
+ @staticmethod
492
+ def _build_querydata_body(context: PowerBIMicrodadosContext) -> dict[str, Any]:
493
+ return {
494
+ "version": "1.0.0",
495
+ "queries": [
496
+ {
497
+ "Query": {
498
+ "Commands": [
499
+ {
500
+ "SemanticQueryDataShapeCommand": {
501
+ "Query": context.prototype_query,
502
+ "Binding": {
503
+ "Primary": {"Groupings": [{"Projections": [0, 1, 2]}]},
504
+ "DataReduction": {"DataVolume": 3, "Primary": {"Top": {"Count": 500}}},
505
+ "Version": 1,
506
+ },
507
+ "ExecutionMetricsKind": 1,
508
+ }
509
+ }
510
+ ]
511
+ },
512
+ "ApplicationContext": {
513
+ "DatasetId": context.dataset_id,
514
+ "Sources": [{"ReportId": context.report_id, "VisualId": context.visual_id}],
515
+ },
516
+ }
517
+ ],
518
+ "modelId": context.model_id,
519
+ }
520
+
521
+ @classmethod
522
+ def _decode_microdados_catalog(cls, payload: dict[str, Any]) -> list[MicrodadosCatalogEntry]:
523
+ seen: set[tuple[str, str, str]] = set()
524
+ entries: list[MicrodadosCatalogEntry] = []
525
+
526
+ results = payload.get("results") or []
527
+ for result in results:
528
+ data = dict((result.get("result") or {}).get("data") or {})
529
+ dsr = dict(data.get("dsr") or {})
530
+ for dataset in dsr.get("DS") or []:
531
+ for row_values in cls._decode_dsr_rows(dict(dataset or {})):
532
+ if len(row_values) < 3:
533
+ continue
534
+ ano_base = str(row_values[0] or "").strip()
535
+ tipo_microdados = str(row_values[1] or "").strip()
536
+ microdados_url = str(row_values[2] or "").strip()
537
+ if not ano_base or not tipo_microdados or not microdados_url:
538
+ continue
539
+ key = (ano_base, tipo_microdados, microdados_url)
540
+ if key in seen:
541
+ continue
542
+ seen.add(key)
543
+ entries.append(
544
+ MicrodadosCatalogEntry(
545
+ ano_base=ano_base,
546
+ tipo_microdados=tipo_microdados,
547
+ microdados_url=microdados_url,
548
+ )
549
+ )
550
+
551
+ if not entries:
552
+ raise RuntimeError("Power BI querydata did not expose any microdados download entries")
553
+ return entries
554
+
555
+ @classmethod
556
+ def _decode_dsr_rows(cls, dataset: dict[str, Any]) -> list[list[Any]]:
557
+ value_dicts = dict(dataset.get("ValueDicts") or {})
558
+ decoded_rows: list[list[Any]] = []
559
+
560
+ for placeholder in dataset.get("PH") or []:
561
+ if not isinstance(placeholder, dict):
562
+ continue
563
+ for member_name, rows in placeholder.items():
564
+ if not isinstance(member_name, str) or not member_name.startswith("DM"):
565
+ continue
566
+ if not isinstance(rows, list):
567
+ continue
568
+
569
+ schema: list[dict[str, Any]] = []
570
+ previous_values: list[Any] = []
571
+ for row in rows:
572
+ if not isinstance(row, dict):
573
+ continue
574
+ if isinstance(row.get("S"), list):
575
+ schema = [dict(item or {}) for item in row["S"]]
576
+ previous_values = [None] * len(schema)
577
+ if not schema:
578
+ continue
579
+
580
+ inflated = cls._inflate_dsr_row(row=row, schema=schema, previous_values=previous_values)
581
+ if inflated is None:
582
+ continue
583
+
584
+ previous_values = list(inflated)
585
+ decoded_rows.append(
586
+ [
587
+ cls._resolve_dsr_value(schema_item, raw_value, value_dicts)
588
+ for schema_item, raw_value in zip(schema, inflated)
589
+ ]
590
+ )
591
+
592
+ return decoded_rows
593
+
594
+ @staticmethod
595
+ def _inflate_dsr_row(
596
+ row: dict[str, Any],
597
+ schema: list[dict[str, Any]],
598
+ previous_values: list[Any],
599
+ ) -> list[Any] | None:
600
+ compressed = row.get("C")
601
+ if isinstance(compressed, list):
602
+ values = list(previous_values) if previous_values else [None] * len(schema)
603
+ start_index = len(schema) - len(compressed)
604
+ repeated_prefix = row.get("R")
605
+ if isinstance(repeated_prefix, int):
606
+ start_index = repeated_prefix
607
+ start_index = max(0, min(start_index, len(schema)))
608
+ for offset, value in enumerate(compressed):
609
+ index = start_index + offset
610
+ if index >= len(schema):
611
+ break
612
+ values[index] = value
613
+ return values
614
+
615
+ named_values = [row.get(str(column.get("N") or "")) for column in schema]
616
+ if any(value is not None for value in named_values):
617
+ return named_values
618
+ return None
619
+
620
+ @staticmethod
621
+ def _resolve_dsr_value(schema_item: dict[str, Any], raw_value: Any, value_dicts: dict[str, Any]) -> Any:
622
+ dictionary_name = schema_item.get("DN")
623
+ if isinstance(raw_value, int) and isinstance(dictionary_name, str):
624
+ dictionary = value_dicts.get(dictionary_name)
625
+ if isinstance(dictionary, list) and 0 <= raw_value < len(dictionary):
626
+ raw_value = dictionary[raw_value]
627
+
628
+ if isinstance(raw_value, str):
629
+ value = raw_value.strip()
630
+ if len(value) >= 2 and value[0] == value[-1] and value[0] in {"'", '"'}:
631
+ value = value[1:-1]
632
+ return value
633
+ return raw_value
634
+
635
+ @staticmethod
636
+ def _slugify(value: str) -> str:
637
+ safe = "".join(char.lower() if char.isalnum() else "_" for char in value.strip())
638
+ collapsed = "_".join(part for part in safe.split("_") if part)
639
+ return collapsed or "arquivo"
640
+
641
+ @staticmethod
642
+ def _read_preview_lines(path: Path, line_count: int) -> tuple[str, ...]:
643
+ if line_count <= 0:
644
+ return ()
645
+
646
+ opener = gzip.open if path.suffix == ".gz" else open
647
+ lines: list[str] = []
648
+ with opener(path, "rt", encoding="utf-8", errors="replace") as handle:
649
+ for _ in range(line_count):
650
+ line = handle.readline()
651
+ if not line:
652
+ break
653
+ lines.append(line.rstrip("\n"))
654
+ return tuple(lines)
655
+
656
+ @staticmethod
657
+ def decode_content_bytes(content_bytes: bytes, source_url: str) -> str:
658
+ raw_bytes = gzip.decompress(content_bytes) if source_url.lower().endswith(".gz") else content_bytes
659
+ for encoding in ("utf-8-sig", "utf-8", "cp1252", "latin-1"):
660
+ try:
661
+ return raw_bytes.decode(encoding)
662
+ except UnicodeDecodeError:
663
+ continue
664
+ return raw_bytes.decode("utf-8", errors="replace")
665
+
666
+
667
def build_cli_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the microdados catalog tool.

    Options: ``--page-url``, ``--timeout-seconds``, ``--output-json``,
    ``--download-dir``, ``--download-limit`` and ``--preview-lines``.

    Returns:
        The configured parser.
    """
    cli = argparse.ArgumentParser(description="Consulta o catálogo de microdados públicos da PNP via Power BI.")

    # (flag, add_argument keyword options), in the order shown by --help.
    option_specs = (
        ("--page-url", {"default": DEFAULT_POWERBI_MICRODADOS_URL, "help": "URL pública do relatório Power BI."}),
        ("--timeout-seconds", {"type": int, "default": 60, "help": "Timeout HTTP em segundos."}),
        ("--output-json", {"help": "Caminho para salvar o catálogo completo em JSON."}),
        ("--download-dir", {"help": "Diretório para baixar arquivos de microdados."}),
        (
            "--download-limit",
            {
                "type": int,
                "default": 0,
                "help": "Quantidade de arquivos a baixar. Use 0 para apenas listar o catálogo.",
            },
        ),
        (
            "--preview-lines",
            {
                "type": int,
                "default": 0,
                "help": "Quantidade de linhas de prévia para imprimir por arquivo baixado.",
            },
        ),
    )
    for flag, options in option_specs:
        cli.add_argument(flag, **options)
    return cli
686
+
687
+
688
def main(argv: list[str] | None = None) -> int:
    """Run the CLI: fetch the catalog, report it, and optionally download.

    Steps:
        1. Fetch the catalog through ``PowerBIMicrodadosClient``.
        2. If ``--output-json`` is given, write the full manifest there,
           creating parent directories as needed.
        3. Print a one-line JSON summary to stdout.
        4. If ``--download-dir`` is given and ``--download-limit`` is
           positive, download the first entries and print their results.

    Args:
        argv: Argument list; ``None`` lets argparse read ``sys.argv``.

    Returns:
        ``0`` on completion.
    """
    args = build_cli_parser().parse_args(argv)

    client = PowerBIMicrodadosClient(page_url=args.page_url, timeout_seconds=args.timeout_seconds)
    context, entries = client.fetch_catalog()

    manifest = {
        "context": asdict(context),
        "entries": [asdict(entry) for entry in entries],
    }

    if args.output_json:
        destination = Path(args.output_json)
        destination.parent.mkdir(parents=True, exist_ok=True)
        serialized = json.dumps(manifest, ensure_ascii=False, indent=2)
        destination.write_text(serialized, encoding="utf-8")

    summary = {
        "entry_count": len(entries),
        "first_entry": asdict(entries[0]) if entries else None,
        "resource_key": context.resource_key,
        "visual_id": context.visual_id,
    }
    print(json.dumps(summary, ensure_ascii=False))

    wants_downloads = bool(args.download_dir) and args.download_limit > 0
    if wants_downloads:
        download_results = []
        for entry in entries[: args.download_limit]:
            outcome = client.download_entry(entry, args.download_dir, preview_line_count=args.preview_lines)
            download_results.append(asdict(outcome))
        print(json.dumps({"downloads": download_results}, ensure_ascii=False, indent=2))

    return 0
725
+
726
+
727
# Script entry point: run the CLI and use main()'s return value as the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())