@dataif/cli 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +16 -0
- package/bin/dataif.js +623 -0
- package/package.json +26 -0
- package/scripts/build-template.mjs +72 -0
- package/templates/dataif/README.md +157 -0
- package/templates/dataif/infra/.env.example +119 -0
- package/templates/dataif/infra/.env.stg.example +119 -0
- package/templates/dataif/infra/airflow/Dockerfile +11 -0
- package/templates/dataif/infra/airflow/Dockerfile.release +17 -0
- package/templates/dataif/infra/airflow/requirements.txt +3 -0
- package/templates/dataif/infra/docker-compose.yml +306 -0
- package/templates/dataif/infra/init-db/01-init-dataif.sh +129 -0
- package/templates/dataif/infra/init-db/pnp-curated-views.sqlinc +444 -0
- package/templates/dataif/infra/init-db/pnp-raw-staging-curated.sqlinc +701 -0
- package/templates/dataif/infra/keycloak/Dockerfile +4 -0
- package/templates/dataif/infra/keycloak/realm-dataif.json +73 -0
- package/templates/dataif/infra/ollama/Dockerfile +9 -0
- package/templates/dataif/infra/ollama/bootstrap-model.sh +100 -0
- package/templates/dataif/infra/ollama/sabia-7b.Modelfile +14 -0
- package/templates/dataif/infra/postgres/Dockerfile +4 -0
- package/templates/dataif/pipelines/airflow/dags/generated/.gitkeep +1 -0
- package/templates/dataif/pipelines/airflow/dags/generated/2020_financeiro_fcc6f1f3_sync.py +9 -0
- package/templates/dataif/pipelines/dataif_pipelines/__init__.py +1 -0
- package/templates/dataif/pipelines/dataif_pipelines/airflow/__init__.py +1 -0
- package/templates/dataif/pipelines/dataif_pipelines/airflow/pnp_pipeline_factory.py +167 -0
- package/templates/dataif/pipelines/dataif_pipelines/connectors/__init__.py +1 -0
- package/templates/dataif/pipelines/dataif_pipelines/connectors/base/__init__.py +1 -0
- package/templates/dataif/pipelines/dataif_pipelines/connectors/base/connector.py +28 -0
- package/templates/dataif/pipelines/dataif_pipelines/connectors/base/types.py +14 -0
- package/templates/dataif/pipelines/dataif_pipelines/connectors/nilo_pecanha/__init__.py +1 -0
- package/templates/dataif/pipelines/dataif_pipelines/connectors/nilo_pecanha/config.py +19 -0
- package/templates/dataif/pipelines/dataif_pipelines/connectors/nilo_pecanha/connector.py +558 -0
- package/templates/dataif/pipelines/dataif_pipelines/connectors/nilo_pecanha/powerbi_microdados.py +728 -0
- package/templates/dataif/pipelines/dataif_pipelines/connectors/nilo_pecanha/transform.py +296 -0
- package/templates/dataif/pipelines/dataif_pipelines/jobs/__init__.py +1 -0
- package/templates/dataif/pipelines/dataif_pipelines/jobs/nilo_pipeline.py +112 -0
- package/templates/dataif/pipelines/dataif_pipelines/orchestration/__init__.py +21 -0
- package/templates/dataif/pipelines/dataif_pipelines/orchestration/pnp_workflow.py +783 -0
- package/templates/dataif/pipelines/dataif_pipelines/repositories/__init__.py +1 -0
- package/templates/dataif/pipelines/dataif_pipelines/repositories/pnp_raw_repository.py +860 -0
- package/templates/dataif/pipelines/dataif_pipelines/services/__init__.py +19 -0
- package/templates/dataif/pipelines/dataif_pipelines/services/pnp_curated_service.py +66 -0
- package/templates/dataif/pipelines/dataif_pipelines/services/pnp_download_service.py +534 -0
- package/templates/dataif/pipelines/dataif_pipelines/services/pnp_quality_service.py +9 -0
- package/templates/dataif/pipelines/dataif_pipelines/services/pnp_raw_ingestion_service.py +124 -0
- package/templates/dataif/pipelines/dataif_pipelines/services/pnp_staging_service.py +271 -0
- package/templates/dataif/pipelines/dataif_pipelines/services/powerbi_catalog_service.py +159 -0
- package/templates/dataif/pipelines/sql/staging/020_pnp_matriculas.sql +112 -0
- package/templates/dataif/pipelines/sql/staging/030_pnp_eficiencia_academica.sql +83 -0
- package/templates/dataif/pipelines/sql/staging/040_pnp_servidores.sql +90 -0
- package/templates/dataif/pipelines/sql/staging/050_pnp_financeiro.sql +72 -0
- package/templates/dataif/pipelines/sql/views_curated/004_mv_pnp_dashboard_fast.sql +204 -0
- package/templates/dataif/pipelines/sql/views_curated/010_vw_pnp_admin_ingestao.sql +51 -0
- package/templates/dataif/pipelines/sql/views_curated/020_vw_pnp_qualidade_dados.sql +114 -0
- package/templates/dataif/pipelines/sql/views_curated/030_vw_pnp_matriculas.sql +67 -0
- package/templates/dataif/pipelines/sql/views_curated/040_vw_pnp_eficiencia.sql +33 -0
- package/templates/dataif/pipelines/sql/views_curated/050_vw_pnp_servidores.sql +30 -0
- package/templates/dataif/pipelines/sql/views_curated/060_vw_pnp_financeiro.sql +22 -0
- package/templates/dataif/pipelines/sql/views_curated/070_vw_pnp_vanna.sql +115 -0
- package/templates/dataif/scripts/configure-env.sh +149 -0
- package/templates/dataif/scripts/create_metabase_pnp_dashboard.py +943 -0
- package/templates/dataif/scripts/create_metabase_pnp_matriculas_dashboard.py +580 -0
- package/templates/dataif/scripts/deploy.sh +79 -0
- package/templates/dataif/scripts/fix_metabase_template_tag_ids.py +91 -0
- package/templates/dataif/scripts/pnp_powerbi_microdados_probe.py +14 -0
- package/templates/dataif/scripts/pnp_validate_raw_run.py +330 -0
- package/templates/dataif/scripts/publish-images.sh +31 -0
- package/templates/dataif/scripts/sync_metabase_dashboard_field_filters.py +241 -0
- package/templates/dataif/scripts/use-vanna-ollama.sh +139 -0
- package/templates/dataif/services/api/.dockerignore +18 -0
- package/templates/dataif/services/api/Dockerfile +12 -0
- package/templates/dataif/services/api/app/__init__.py +1 -0
- package/templates/dataif/services/api/app/auth.py +48 -0
- package/templates/dataif/services/api/app/config.py +59 -0
- package/templates/dataif/services/api/app/keycloak_admin.py +215 -0
- package/templates/dataif/services/api/app/main.py +2432 -0
- package/templates/dataif/services/api/app/metabase_admin.py +191 -0
- package/templates/dataif/services/api/app/metabase_bootstrap.py +44 -0
- package/templates/dataif/services/api/app/metabase_embed.py +15 -0
- package/templates/dataif/services/api/app/pnp_dag_provisioner.py +113 -0
- package/templates/dataif/services/api/app/pnp_instance_repository.py +951 -0
- package/templates/dataif/services/api/app/pnp_powerbi.py +438 -0
- package/templates/dataif/services/api/app/vanna_client.py +32 -0
- package/templates/dataif/services/api/requirements.txt +9 -0
- package/templates/dataif/services/vanna/.dockerignore +18 -0
- package/templates/dataif/services/vanna/Dockerfile +12 -0
- package/templates/dataif/services/vanna/app/config.py +57 -0
- package/templates/dataif/services/vanna/app/main.py +108 -0
- package/templates/dataif/services/vanna/app/runtime_config.py +114 -0
- package/templates/dataif/services/vanna/app/sql_guard.py +123 -0
- package/templates/dataif/services/vanna/app/vanna_engine.py +382 -0
- package/templates/dataif/services/vanna/requirements.txt +8 -0
- package/templates/dataif/services/web/.dockerignore +13 -0
- package/templates/dataif/services/web/Dockerfile +16 -0
- package/templates/dataif/services/web/index.html +12 -0
- package/templates/dataif/services/web/nginx.conf +74 -0
- package/templates/dataif/services/web/package-lock.json +4397 -0
- package/templates/dataif/services/web/package.json +32 -0
- package/templates/dataif/services/web/postcss.config.mjs +5 -0
- package/templates/dataif/services/web/src/App.jsx +2817 -0
- package/templates/dataif/services/web/src/adminAuth.js +245 -0
- package/templates/dataif/services/web/src/assets/avatar_placeholder.png +0 -0
- package/templates/dataif/services/web/src/assets/github_logo_icon_229278.svg +1 -0
- package/templates/dataif/services/web/src/assets/if-logo.png +0 -0
- package/templates/dataif/services/web/src/assets/if.svg +0 -0
- package/templates/dataif/services/web/src/assets/pnp-horizontal.svg +1 -0
- package/templates/dataif/services/web/src/components/AppHeader.jsx +233 -0
- package/templates/dataif/services/web/src/components/application/app-navigation/base-components/mobile-header.tsx +56 -0
- package/templates/dataif/services/web/src/components/application/app-navigation/base-components/nav-account-card.tsx +209 -0
- package/templates/dataif/services/web/src/components/application/app-navigation/base-components/nav-item-button.tsx +67 -0
- package/templates/dataif/services/web/src/components/application/app-navigation/base-components/nav-item.tsx +108 -0
- package/templates/dataif/services/web/src/components/application/app-navigation/base-components/nav-list.tsx +83 -0
- package/templates/dataif/services/web/src/components/application/app-navigation/config.ts +23 -0
- package/templates/dataif/services/web/src/components/application/app-navigation/header-navigation.tsx +240 -0
- package/templates/dataif/services/web/src/components/application/pagination/pagination-base.tsx +376 -0
- package/templates/dataif/services/web/src/components/application/pagination/pagination-dot.tsx +52 -0
- package/templates/dataif/services/web/src/components/application/pagination/pagination-line.tsx +48 -0
- package/templates/dataif/services/web/src/components/application/pagination/pagination.tsx +328 -0
- package/templates/dataif/services/web/src/components/application/tabs/tabs.tsx +223 -0
- package/templates/dataif/services/web/src/components/base/avatar/avatar-label-group.tsx +28 -0
- package/templates/dataif/services/web/src/components/base/avatar/avatar.tsx +129 -0
- package/templates/dataif/services/web/src/components/base/avatar/base-components/avatar-add-button.tsx +32 -0
- package/templates/dataif/services/web/src/components/base/avatar/base-components/avatar-company-icon.tsx +24 -0
- package/templates/dataif/services/web/src/components/base/avatar/base-components/avatar-online-indicator.tsx +29 -0
- package/templates/dataif/services/web/src/components/base/avatar/base-components/index.tsx +4 -0
- package/templates/dataif/services/web/src/components/base/avatar/base-components/verified-tick.tsx +32 -0
- package/templates/dataif/services/web/src/components/base/badges/badge-types.ts +264 -0
- package/templates/dataif/services/web/src/components/base/badges/badges.tsx +415 -0
- package/templates/dataif/services/web/src/components/base/button-group/button-group.tsx +104 -0
- package/templates/dataif/services/web/src/components/base/buttons/button.tsx +267 -0
- package/templates/dataif/services/web/src/components/base/input/hint-text.tsx +31 -0
- package/templates/dataif/services/web/src/components/base/input/input.tsx +269 -0
- package/templates/dataif/services/web/src/components/base/input/label.tsx +48 -0
- package/templates/dataif/services/web/src/components/base/radio-buttons/radio-buttons.tsx +127 -0
- package/templates/dataif/services/web/src/components/base/select/combobox.tsx +150 -0
- package/templates/dataif/services/web/src/components/base/select/multi-select.tsx +361 -0
- package/templates/dataif/services/web/src/components/base/select/popover.tsx +32 -0
- package/templates/dataif/services/web/src/components/base/select/select-item.tsx +95 -0
- package/templates/dataif/services/web/src/components/base/select/select-native.tsx +67 -0
- package/templates/dataif/services/web/src/components/base/select/select.tsx +144 -0
- package/templates/dataif/services/web/src/components/base/tags/base-components/tag-close-x.tsx +32 -0
- package/templates/dataif/services/web/src/components/base/tooltip/tooltip.tsx +107 -0
- package/templates/dataif/services/web/src/components/foundations/dot-icon.tsx +22 -0
- package/templates/dataif/services/web/src/components/foundations/logo/untitledui-logo-minimal.tsx +170 -0
- package/templates/dataif/services/web/src/components/foundations/logo/untitledui-logo.tsx +58 -0
- package/templates/dataif/services/web/src/hooks/use-breakpoint.ts +34 -0
- package/templates/dataif/services/web/src/hooks/use-resize-observer.ts +67 -0
- package/templates/dataif/services/web/src/main.jsx +14 -0
- package/templates/dataif/services/web/src/providers/theme-provider.jsx +62 -0
- package/templates/dataif/services/web/src/styles/globals.css +60 -0
- package/templates/dataif/services/web/src/styles/theme.css +1326 -0
- package/templates/dataif/services/web/src/styles/typography.css +430 -0
- package/templates/dataif/services/web/src/styles.css +1287 -0
- package/templates/dataif/services/web/src/utils/cx.ts +24 -0
- package/templates/dataif/services/web/src/utils/is-react-component.ts +33 -0
- package/templates/dataif/services/web/vite.config.js +14 -0
- package/templates/dataif/sql/ddl/001_schemas.sql +6 -0
- package/templates/dataif/sql/ddl/003_pnp_raw_staging_curated.sql +699 -0
- package/templates/dataif/sql/migrations/001_pnp_phase1_backfill.sql +3 -0
- package/templates/dataif/sql/migrations/002_pnp_phase2_admin_config_backfill.sql +184 -0
- package/templates/dataif/sql/migrations/003_pnp_phase3_raw_tabular_backfill.sql +3 -0
- package/templates/dataif/sql/migrations/004_pnp_phase3_raw_backfill_support_index.sql +3 -0
- package/templates/dataif/sql/migrations/005_pnp_phase7_staging_support_indexes.sql +2 -0
- package/templates/dataif/sql/migrations/006_pnp_phase7_staging_autovacuum_tuning.sql +2 -0
- package/templates/dataif/sql/migrations/007_pnp_phase7b_run_packages.sql +20 -0
- package/templates/dataif/sql/migrations/008_pnp_phase7a_pipeline_endpoints.sql +169 -0
- package/templates/dataif/sql/migrations/009_pnp_phase8_curated.sql +35 -0
- package/templates/dataif/sql/migrations/010_pnp_phase10_staging_incremental_upsert.sql +3 -0
- package/templates/dataif/sql/migrations/010_pnp_pipeline_uuid.sql +51 -0
- package/templates/dataif/sql/migrations/011_app_settings.sql +7 -0
- package/templates/dataif/sql/staging/020_pnp_matriculas.sql +112 -0
- package/templates/dataif/sql/staging/030_pnp_eficiencia_academica.sql +83 -0
- package/templates/dataif/sql/staging/040_pnp_servidores.sql +90 -0
- package/templates/dataif/sql/staging/050_pnp_financeiro.sql +72 -0
- package/templates/dataif/sql/views_curated/003_vw_pnp_microdados_admin.sql +160 -0
- package/templates/dataif/sql/views_curated/004_mv_pnp_dashboard_fast.sql +204 -0
- package/templates/dataif/sql/views_curated/010_vw_pnp_admin_ingestao.sql +51 -0
- package/templates/dataif/sql/views_curated/020_vw_pnp_qualidade_dados.sql +114 -0
- package/templates/dataif/sql/views_curated/030_vw_pnp_matriculas.sql +67 -0
- package/templates/dataif/sql/views_curated/040_vw_pnp_eficiencia.sql +33 -0
- package/templates/dataif/sql/views_curated/050_vw_pnp_servidores.sql +30 -0
- package/templates/dataif/sql/views_curated/060_vw_pnp_financeiro.sql +22 -0
- package/templates/dataif/sql/views_curated/070_vw_pnp_vanna.sql +115 -0
package/templates/dataif/pipelines/dataif_pipelines/connectors/nilo_pecanha/powerbi_microdados.py
ADDED
|
@@ -0,0 +1,728 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import base64
|
|
5
|
+
import contextlib
|
|
6
|
+
import gzip
|
|
7
|
+
import hashlib
|
|
8
|
+
import io
|
|
9
|
+
import json
|
|
10
|
+
import uuid
|
|
11
|
+
from dataclasses import asdict, dataclass
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from typing import Any, Iterator
|
|
14
|
+
from urllib.parse import parse_qs, unquote, urlparse
|
|
15
|
+
|
|
16
|
+
import requests
|
|
17
|
+
|
|
18
|
+
# Default public Power BI page to scrape. Its ``r`` query parameter is a
# base64-encoded JSON descriptor; the client reads the keys "k" (resource key)
# and "t" (tenant id) from it.
DEFAULT_POWERBI_MICRODADOS_URL = (
    "https://app.powerbi.com/view"
    "?r=eyJrIjoiZDhkNGNiYzgtMjQ0My00OGVlLWJjNzYtZWQwYjI2OThhYWM1IiwidCI6IjllNjgyMzU5LWQxMjgtNGVkYi1iYjU4LTgyYjJhMTUzNDBmZiJ9"
)

# Display name of the report section that is searched first for the catalog visual.
MICRODADOS_SECTION_DISPLAY_NAME = "Microdados da PNP"

# Entity and property names of the catalog table.
# NOTE(review): presumably used when building the fallback query; that code is
# outside this view.
MICRODADOS_ENTITY_NAME = "Microdados"
MICRODADOS_ANO_PROPERTY = "Ano Base"
MICRODADOS_TIPO_PROPERTY = "Tipo de microdados"
MICRODADOS_URL_PROPERTY = "MicrodadosURL"

# "<entity>.<property>" query references compared against a visual's
# Rows / Columns / Values projections to recognise the catalog visual.
MICRODADOS_ROWS_QUERY_REF = f"{MICRODADOS_ENTITY_NAME}.{MICRODADOS_ANO_PROPERTY}"
MICRODADOS_COLUMNS_QUERY_REF = f"{MICRODADOS_ENTITY_NAME}.{MICRODADOS_TIPO_PROPERTY}"
MICRODADOS_VALUES_QUERY_REF = f"{MICRODADOS_ENTITY_NAME}.{MICRODADOS_URL_PROPERTY}"
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass(frozen=True)
class PowerBIMicrodadosContext:
    """Immutable bundle of identifiers discovered for one public Power BI report.

    Instances are produced by ``PowerBIMicrodadosClient.discover_context`` and
    carry everything the client later needs to query the catalog visual.
    """

    # URL of the public report page that was scraped.
    page_url: str
    # "k" value of the page's resource descriptor; sent as X-PowerBI-ResourceKey.
    resource_key: str
    # "t" value of the page's resource descriptor.
    tenant_id: str
    # Cluster URI read from the page HTML or, failing that, from the routing API.
    resolved_cluster_uri: str
    # API base URL derived from the cluster URI.
    api_base_url: str
    # Report "modelId", or the "id" of the first listed model.
    model_id: int
    # Model "dbName".
    dataset_id: str
    # Report "objectId".
    report_id: str
    # Report "id" (0 when the metadata has none).
    report_numeric_id: int
    # "name" / "displayName" of the section that holds the visual.
    section_name: str
    section_display_name: str
    # "name" of the selected (or synthesised fallback) visual.
    visual_id: str
    # "singleVisual.visualType" of that visual.
    visual_type: str
    # Copy of "singleVisual.prototypeQuery" of that visual.
    prototype_query: dict[str, Any]
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
@dataclass(frozen=True)
class MicrodadosCatalogEntry:
    """Immutable catalog row pointing at one downloadable microdata file."""

    # NOTE(review): presumably the "Ano Base" value of the catalog row — confirm
    # against the catalog decoder, which is outside this view.
    ano_base: str
    # NOTE(review): presumably the "Tipo de microdados" value — confirm likewise.
    tipo_microdados: str
    # URL the client requests to download the file.
    microdados_url: str
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
@dataclass(frozen=True)
class MicrodadosDownloadResult:
    """Immutable summary of a catalog entry that was saved to disk."""

    # Copied from the catalog entry that was downloaded.
    ano_base: str
    tipo_microdados: str
    # URL the file was fetched from.
    source_url: str
    # Path of the written file, as a string.
    output_path: str
    # Number of payload bytes received.
    size_bytes: int
    # Hex SHA-256 digest of the payload.
    sha256: str
    # Value of the response's content-type header, if any.
    content_type: str | None
    # Leading lines of the saved file kept for inspection.
    preview_lines: tuple[str, ...]
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
@dataclass(frozen=True)
class MicrodadosContentResult:
    """Immutable in-memory payload of one downloaded microdata file."""

    # URL the payload was fetched from.
    source_url: str
    # Complete response body.
    content_bytes: bytes
    # Total number of bytes received.
    size_bytes: int
    # Hex SHA-256 digest of the body.
    sha256: str
    # Value of the response's content-type header, if any.
    content_type: str | None
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
@dataclass(frozen=True)
class MicrodadosContentStream:
    """Immutable handle on a microdata download that is still being streamed."""

    # URL the stream was opened from.
    source_url: str
    # File-like object yielding the response body.
    raw_stream: Any
    # Value of the response's content-type header, if any.
    content_type: str | None
    # Size announced by the content-length header; 0 when missing or unparsable.
    size_bytes: int
    # Digest of the body; None when it has not been computed.
    sha256: str | None
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
class _IterContentStream(io.RawIOBase):
|
|
88
|
+
def __init__(self, chunks: Iterator[bytes]) -> None:
|
|
89
|
+
self._chunks = iter(chunks)
|
|
90
|
+
self._buffer = bytearray()
|
|
91
|
+
self._closed = False
|
|
92
|
+
|
|
93
|
+
def readable(self) -> bool:
|
|
94
|
+
return True
|
|
95
|
+
|
|
96
|
+
def readinto(self, buffer) -> int:
|
|
97
|
+
if self._closed:
|
|
98
|
+
return 0
|
|
99
|
+
|
|
100
|
+
requested = len(buffer)
|
|
101
|
+
while len(self._buffer) < requested:
|
|
102
|
+
try:
|
|
103
|
+
chunk = next(self._chunks)
|
|
104
|
+
except StopIteration:
|
|
105
|
+
break
|
|
106
|
+
if chunk:
|
|
107
|
+
self._buffer.extend(chunk)
|
|
108
|
+
|
|
109
|
+
if not self._buffer:
|
|
110
|
+
return 0
|
|
111
|
+
|
|
112
|
+
count = min(requested, len(self._buffer))
|
|
113
|
+
buffer[:count] = self._buffer[:count]
|
|
114
|
+
del self._buffer[:count]
|
|
115
|
+
return count
|
|
116
|
+
|
|
117
|
+
def close(self) -> None:
|
|
118
|
+
self._closed = True
|
|
119
|
+
super().close()
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
class PowerBIMicrodadosClient:
|
|
123
|
+
def __init__(
    self,
    page_url: str = DEFAULT_POWERBI_MICRODADOS_URL,
    timeout_seconds: int = 60,
    session: requests.Session | None = None,
) -> None:
    """Configure the client.

    Args:
        page_url: Public Power BI page to inspect.
        timeout_seconds: Timeout passed to every HTTP request.
        session: HTTP session to reuse; a new ``requests.Session`` is
            created when none (or a falsy one) is supplied.
    """
    self.page_url = page_url
    self.timeout_seconds = timeout_seconds
    self.session = session if session else requests.Session()
|
|
132
|
+
|
|
133
|
+
def discover_context(self) -> PowerBIMicrodadosContext:
    """Load the report page and collect the identifiers needed to query it.

    The resource descriptor is taken from the page HTML, falling back to the
    ``r`` parameter of the page URL. The cluster URI is taken from the HTML,
    falling back to the public routing endpoint. Report metadata is then
    fetched and searched for the microdata catalog visual.

    Returns:
        The populated context.

    Raises:
        RuntimeError: When the resource key, tenant id, or the model/report
            identifiers cannot be determined.
        requests.HTTPError: When an HTTP request returns an error status.
    """
    page = self.session.get(self.page_url, timeout=self.timeout_seconds)
    page.raise_for_status()
    html = page.text

    descriptor = self._extract_resource_descriptor(html)
    if not descriptor:
        descriptor = self._decode_resource_descriptor_from_url(self.page_url)
    resource_key = str(descriptor.get("k") or "").strip()
    tenant_id = str(descriptor.get("t") or "").strip()
    if not resource_key:
        raise RuntimeError("Power BI page did not expose a resource key")
    if not tenant_id:
        raise RuntimeError("Power BI page did not expose a tenant id")

    # Prefer the cluster URI embedded in the page; ask the routing API otherwise.
    resolved_cluster_uri = self._extract_resolved_cluster_uri(html) or self._resolve_cluster_uri(
        resource_key=resource_key,
        tenant_id=tenant_id,
    )
    api_base_url = self._build_apim_url(resolved_cluster_uri)

    metadata_url = f"{api_base_url}/public/reports/{resource_key}/modelsAndExploration?preferReadOnlySession=true"
    metadata_response = self.session.get(
        metadata_url,
        headers=self._powerbi_headers(resource_key),
        timeout=self.timeout_seconds,
    )
    metadata_response.raise_for_status()
    metadata = metadata_response.json()

    exploration = metadata.get("exploration") or {}
    report = dict(exploration.get("report") or {})
    model = dict(report.get("model") or {})
    listed_models = metadata.get("models") or [{}]
    model_fallback = dict(listed_models[0] or {})
    section, visual = self._find_microdados_visual(metadata)

    model_id = int(report.get("modelId") or model_fallback.get("id") or 0)
    dataset_id = str(model.get("dbName") or model_fallback.get("dbName") or "").strip()
    report_id = str(report.get("objectId") or "").strip()
    report_numeric_id = int(report.get("id") or 0)
    if not (model_id and dataset_id and report_id):
        raise RuntimeError("Power BI metadata did not expose model/report identifiers for microdados")

    section_name = str(section.get("name") or "")
    section_display_name = str(section.get("displayName") or "")
    visual_id = str(visual.get("name") or "")
    single_visual = visual.get("singleVisual") or {}

    return PowerBIMicrodadosContext(
        page_url=self.page_url,
        resource_key=resource_key,
        tenant_id=tenant_id,
        resolved_cluster_uri=resolved_cluster_uri,
        api_base_url=api_base_url,
        model_id=model_id,
        dataset_id=dataset_id,
        report_id=report_id,
        report_numeric_id=report_numeric_id,
        section_name=section_name,
        section_display_name=section_display_name,
        visual_id=visual_id,
        visual_type=str(single_visual.get("visualType") or ""),
        prototype_query=dict(single_visual.get("prototypeQuery") or {}),
    )
|
|
187
|
+
|
|
188
|
+
def fetch_catalog(self) -> tuple[PowerBIMicrodadosContext, list[MicrodadosCatalogEntry]]:
    """Discover the report context and query the microdata catalog.

    Returns:
        The discovered context together with the decoded catalog entries.

    Raises:
        requests.HTTPError: When the query endpoint returns an error status.
    """
    context = self.discover_context()
    endpoint = f"{context.api_base_url}/public/reports/querydata?synchronous=true"
    response = self.session.post(
        endpoint,
        headers=self._powerbi_headers(context.resource_key, json_request=True),
        json=self._build_querydata_body(context),
        timeout=self.timeout_seconds,
    )
    response.raise_for_status()
    return context, self._decode_microdados_catalog(response.json())
|
|
199
|
+
|
|
200
|
+
def download_entry(
    self,
    entry: MicrodadosCatalogEntry,
    output_dir: str | Path,
    preview_line_count: int = 0,
) -> MicrodadosDownloadResult:
    """Download ``entry`` into ``output_dir`` and describe the saved file.

    The file is named after the last path component of the entry URL; when
    the URL has none, ``<ano_base>_<slugified tipo>.bin`` is used. The whole
    payload is held in memory before it is written.

    Args:
        entry: Catalog entry to download.
        output_dir: Destination directory; created when missing.
        preview_line_count: Number of preview lines requested from the saved file.

    Returns:
        A summary of the download (path, size, digest, content type, preview).
    """
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    filename = Path(urlparse(entry.microdados_url).path).name
    if not filename:
        filename = f"{entry.ano_base}_{self._slugify(entry.tipo_microdados)}.bin"
    output_path = target_dir / filename

    content = self.fetch_entry_content(entry)
    with output_path.open("wb") as sink:
        sink.write(content.content_bytes)

    return MicrodadosDownloadResult(
        ano_base=entry.ano_base,
        tipo_microdados=entry.tipo_microdados,
        source_url=entry.microdados_url,
        output_path=str(output_path),
        size_bytes=content.size_bytes,
        sha256=content.sha256,
        content_type=content.content_type,
        preview_lines=self._read_preview_lines(output_path, preview_line_count),
    )
|
|
228
|
+
|
|
229
|
+
def fetch_entry_content(self, entry: MicrodadosCatalogEntry) -> MicrodadosContentResult:
    """Download the entry's file fully into memory.

    The body is read in 64 KiB chunks while a SHA-256 digest is computed.

    Returns:
        The payload with its size, hex digest, and content-type header.

    Raises:
        requests.HTTPError: When the download returns an error status.
    """
    digest = hashlib.sha256()
    parts: list[bytes] = []

    with self.session.get(entry.microdados_url, stream=True, timeout=self.timeout_seconds) as response:
        response.raise_for_status()
        content_type = response.headers.get("content-type")
        # Empty chunks are skipped.
        for piece in filter(None, response.iter_content(65536)):
            parts.append(piece)
            digest.update(piece)

    payload = b"".join(parts)
    return MicrodadosContentResult(
        source_url=entry.microdados_url,
        content_bytes=payload,
        size_bytes=len(payload),
        sha256=digest.hexdigest(),
        content_type=content_type,
    )
|
|
251
|
+
|
|
252
|
+
@contextlib.contextmanager
def open_entry_content_stream(self, entry: MicrodadosCatalogEntry) -> Iterator[MicrodadosContentStream]:
    """Open the entry's file as a stream for the duration of a ``with`` block.

    The HTTP response stays open while the caller consumes the yielded
    stream and is released when the block exits. No digest is computed, so
    ``sha256`` is ``None``; ``size_bytes`` comes from the content-length
    header and is 0 when that header is missing or not an integer.

    Raises:
        requests.HTTPError: When the download returns an error status.
    """
    with self.session.get(entry.microdados_url, stream=True, timeout=self.timeout_seconds) as response:
        response.raise_for_status()

        announced = response.headers.get("content-length")
        try:
            size_bytes = int(announced or 0)
        except (TypeError, ValueError):
            size_bytes = 0

        body = _IterContentStream(response.iter_content(65536))
        content_type = response.headers.get("content-type")
        yield MicrodadosContentStream(
            source_url=entry.microdados_url,
            raw_stream=body,
            content_type=content_type,
            size_bytes=size_bytes,
            sha256=None,
        )
|
|
268
|
+
|
|
269
|
+
def _resolve_cluster_uri(self, resource_key: str, tenant_id: str) -> str:
    """Ask the public routing endpoint for the tenant's cluster URI.

    Raises:
        RuntimeError: When the response carries no ``FixedClusterUri``.
        requests.HTTPError: When the endpoint returns an error status.
    """
    routing_url = f"https://api.powerbi.com/public/routing/cluster/{tenant_id}"
    response = self.session.get(
        routing_url,
        headers=self._powerbi_headers(resource_key),
        timeout=self.timeout_seconds,
    )
    response.raise_for_status()

    routing = response.json() or {}
    fixed_cluster_uri = str(routing.get("FixedClusterUri") or "").strip()
    if fixed_cluster_uri:
        return fixed_cluster_uri
    raise RuntimeError("Power BI cluster routing did not return FixedClusterUri")
|
|
280
|
+
|
|
281
|
+
@staticmethod
|
|
282
|
+
def _powerbi_headers(resource_key: str, json_request: bool = False) -> dict[str, str]:
|
|
283
|
+
headers = {
|
|
284
|
+
"Accept": "application/json",
|
|
285
|
+
"ActivityId": str(uuid.uuid4()),
|
|
286
|
+
"RequestId": str(uuid.uuid4()),
|
|
287
|
+
"X-PowerBI-ResourceKey": resource_key,
|
|
288
|
+
}
|
|
289
|
+
if json_request:
|
|
290
|
+
headers["Content-Type"] = "application/json"
|
|
291
|
+
return headers
|
|
292
|
+
|
|
293
|
+
@staticmethod
|
|
294
|
+
def _extract_resource_descriptor(html: str) -> dict[str, str]:
|
|
295
|
+
marker = "resourceDescriptor = JSON.parse('"
|
|
296
|
+
start = html.find(marker)
|
|
297
|
+
if start < 0:
|
|
298
|
+
return {}
|
|
299
|
+
start += len(marker)
|
|
300
|
+
end = html.find("');", start)
|
|
301
|
+
if end < 0:
|
|
302
|
+
return {}
|
|
303
|
+
payload = html[start:end]
|
|
304
|
+
decoded = bytes(payload, "utf-8").decode("unicode_escape")
|
|
305
|
+
try:
|
|
306
|
+
descriptor = json.loads(decoded)
|
|
307
|
+
except json.JSONDecodeError:
|
|
308
|
+
return {}
|
|
309
|
+
return {
|
|
310
|
+
"k": str(descriptor.get("k") or "").strip(),
|
|
311
|
+
"t": str(descriptor.get("t") or "").strip(),
|
|
312
|
+
}
|
|
313
|
+
|
|
314
|
+
@staticmethod
|
|
315
|
+
def _decode_resource_descriptor_from_url(page_url: str) -> dict[str, str]:
|
|
316
|
+
parsed = urlparse(page_url)
|
|
317
|
+
encoded = parse_qs(parsed.query).get("r", [])
|
|
318
|
+
if not encoded:
|
|
319
|
+
return {}
|
|
320
|
+
token = unquote(encoded[0])
|
|
321
|
+
padding = "=" * (-len(token) % 4)
|
|
322
|
+
decoded = base64.urlsafe_b64decode(token + padding).decode("utf-8")
|
|
323
|
+
payload = json.loads(decoded)
|
|
324
|
+
return {
|
|
325
|
+
"k": str(payload.get("k") or "").strip(),
|
|
326
|
+
"t": str(payload.get("t") or "").strip(),
|
|
327
|
+
}
|
|
328
|
+
|
|
329
|
+
@staticmethod
|
|
330
|
+
def _extract_resolved_cluster_uri(html: str) -> str:
|
|
331
|
+
marker = "var resolvedClusterUri = '"
|
|
332
|
+
start = html.find(marker)
|
|
333
|
+
if start < 0:
|
|
334
|
+
return ""
|
|
335
|
+
start += len(marker)
|
|
336
|
+
end = html.find("';", start)
|
|
337
|
+
if end < 0:
|
|
338
|
+
return ""
|
|
339
|
+
return html[start:end].strip()
|
|
340
|
+
|
|
341
|
+
@staticmethod
|
|
342
|
+
def _build_apim_url(cluster_uri: str) -> str:
|
|
343
|
+
parsed = urlparse(cluster_uri)
|
|
344
|
+
hostname = parsed.hostname or ""
|
|
345
|
+
if not hostname:
|
|
346
|
+
raise RuntimeError("Invalid Power BI cluster uri")
|
|
347
|
+
host_tokens = hostname.split(".")
|
|
348
|
+
host_tokens[0] = host_tokens[0].replace("-redirect", "")
|
|
349
|
+
host_tokens[0] = host_tokens[0].replace("global-", "")
|
|
350
|
+
host_tokens[0] = f"{host_tokens[0]}-api"
|
|
351
|
+
scheme = parsed.scheme or "https"
|
|
352
|
+
return f"{scheme}://{'.'.join(host_tokens)}"
|
|
353
|
+
|
|
354
|
+
@classmethod
def _find_microdados_visual(cls, metadata: dict[str, Any]) -> tuple[dict[str, Any], dict[str, Any]]:
    """Locate the section and visual that expose the microdata catalog.

    Search order:
      1. a matching visual in the section named "Microdados da PNP";
      2. a synthesised fallback visual for that section;
      3. a matching visual in any section.

    Returns:
        ``(section, visual)``.

    Raises:
        RuntimeError: When nothing usable is found. The message lists the
            available section names when the preferred section is missing.
    """
    exploration = metadata.get("exploration") or {}
    sections = list(exploration.get("sections") or [])
    preferred_section = cls._find_microdados_section(sections)

    if preferred_section is not None:
        matched = cls._select_microdados_visual(preferred_section)
        if matched:
            return preferred_section, matched

        synthesised = cls._build_fallback_visual(preferred_section)
        if synthesised is not None:
            return preferred_section, synthesised

    for candidate in sections:
        matched = cls._select_microdados_visual(candidate)
        if matched:
            return candidate, matched

    if preferred_section is not None:
        raise RuntimeError("Power BI metadata exposed the Microdados da PNP section, but no compatible visual or fallback query context was found")

    # Preferred section missing: list the sections that are present.
    display_names = (str(section.get("displayName") or "").strip() for section in sections)
    section_names = ", ".join(sorted({name for name in display_names if name}))
    suffix = f"; available sections: {section_names}" if section_names else ""
    raise RuntimeError("Power BI metadata did not expose the Microdados da PNP section" + suffix)
|
|
389
|
+
|
|
390
|
+
@staticmethod
def _find_microdados_section(sections: list[dict[str, Any]]) -> dict[str, Any] | None:
    """Return the first section whose stripped display name is the microdata one.

    Returns ``None`` when no section matches.
    """
    return next(
        (
            candidate
            for candidate in sections
            if str(candidate.get("displayName") or "").strip() == MICRODADOS_SECTION_DISPLAY_NAME
        ),
        None,
    )
|
|
396
|
+
|
|
397
|
+
@classmethod
def _select_microdados_visual(cls, section: dict[str, Any]) -> dict[str, Any] | None:
    """Return the first visual in ``section`` that looks like the catalog.

    Each container's ``config`` is a JSON string. Containers with a missing,
    blank, non-string or unparsable config, or without a ``singleVisual``,
    are skipped.

    Returns:
        The parsed visual config, or ``None`` when no container matches.
    """
    for container in section.get("visualContainers") or []:
        raw_config = container.get("config")
        if not (isinstance(raw_config, str) and raw_config.strip()):
            continue

        try:
            visual = json.loads(raw_config)
        except json.JSONDecodeError:
            continue

        single_visual = dict(visual.get("singleVisual") or {})
        if single_visual and cls._visual_matches_microdados_catalog(
            single_visual,
            dict(single_visual.get("projections") or {}),
        ):
            return visual

    return None
|
|
416
|
+
|
|
417
|
+
@classmethod
|
|
418
|
+
def _visual_matches_microdados_catalog(cls, single_visual: dict[str, Any], projections: dict[str, Any]) -> bool:
    """Tell whether a visual exposes the microdados rows/columns/values query refs.

    A visual matches when either:

    * its active ``Rows``, ``Columns`` and ``Values`` projections are exactly
      the three expected query refs (one per role), or
    * the ``Select`` list of its ``prototypeQuery`` contains all three
      expected query refs among its ``Name`` values.

    Args:
        single_visual: The ``singleVisual`` object of a visual config.
        projections: The ``projections`` object of that visual.
    """

    def active_query_refs(role: str) -> list[Any]:
        # A projection counts as active unless it carries a falsy "active" flag.
        return [entry.get("queryRef") for entry in projections.get(role) or [] if entry.get("active", True)]

    # All three roles are read before any comparison takes place.
    rows, columns, values = (active_query_refs(role) for role in ("Rows", "Columns", "Values"))
    expected_projection = (
        [MICRODADOS_ROWS_QUERY_REF],
        [MICRODADOS_COLUMNS_QUERY_REF],
        [MICRODADOS_VALUES_QUERY_REF],
    )
    if (rows, columns, values) == expected_projection:
        return True

    select_entries = dict(single_visual.get("prototypeQuery") or {}).get("Select") or []
    selected_names: set[str] = set()
    for entry in select_entries:
        if isinstance(entry, dict):
            selected_names.add(str(entry.get("Name") or "").strip())

    required_names = {
        MICRODADOS_ROWS_QUERY_REF,
        MICRODADOS_COLUMNS_QUERY_REF,
        MICRODADOS_VALUES_QUERY_REF,
    }
    return required_names <= selected_names
|
|
445
|
+
|
|
446
|
+
@classmethod
|
|
447
|
+
def _build_fallback_visual(cls, section: dict[str, Any]) -> dict[str, Any] | None:
    """Synthesize a visual definition for a section with no matching visual.

    The synthetic visual is named after the first visual container
    (``objectName``, then ``id``, then the literal ``"microdados_catalog"``)
    and carries the prototype query built by
    ``_build_fallback_prototype_query``.

    Returns:
        The synthetic visual dict, or ``None`` when the section has no
        visual containers.
    """
    containers = list(section.get("visualContainers") or [])
    if len(containers) == 0:
        return None

    first_container = containers[0]
    name_source = first_container.get("objectName") or first_container.get("id") or "microdados_catalog"
    fallback_name = str(name_source)

    single_visual = {
        "visualType": "microdados_catalog_fallback",
        "prototypeQuery": cls._build_fallback_prototype_query(),
    }
    return {"name": fallback_name, "singleVisual": single_visual}
|
|
460
|
+
|
|
461
|
+
@staticmethod
|
|
462
|
+
def _build_fallback_prototype_query() -> dict[str, Any]:
    """Build the prototype query used when no report visual can be reused.

    The query reads from the microdados entity under the alias ``"m"`` and
    selects, in this order: the URL measure (named with the values query
    ref), the year column (rows query ref) and the type column (columns
    query ref).
    """
    source_alias = "m"

    def select_item(kind: str, property_name: Any, query_ref: Any) -> dict[str, Any]:
        # ``kind`` is the expression kind key ("Measure" or "Column").
        return {
            kind: {
                "Expression": {"SourceRef": {"Source": source_alias}},
                "Property": property_name,
            },
            "Name": query_ref,
        }

    return {
        "Version": 2,
        "From": [{"Name": source_alias, "Entity": MICRODADOS_ENTITY_NAME, "Type": 0}],
        "Select": [
            select_item("Measure", MICRODADOS_URL_PROPERTY, MICRODADOS_VALUES_QUERY_REF),
            select_item("Column", MICRODADOS_ANO_PROPERTY, MICRODADOS_ROWS_QUERY_REF),
            select_item("Column", MICRODADOS_TIPO_PROPERTY, MICRODADOS_COLUMNS_QUERY_REF),
        ],
    }
|
|
490
|
+
|
|
491
|
+
@staticmethod
|
|
492
|
+
def _build_querydata_body(context: PowerBIMicrodadosContext) -> dict[str, Any]:
|
|
493
|
+
return {
|
|
494
|
+
"version": "1.0.0",
|
|
495
|
+
"queries": [
|
|
496
|
+
{
|
|
497
|
+
"Query": {
|
|
498
|
+
"Commands": [
|
|
499
|
+
{
|
|
500
|
+
"SemanticQueryDataShapeCommand": {
|
|
501
|
+
"Query": context.prototype_query,
|
|
502
|
+
"Binding": {
|
|
503
|
+
"Primary": {"Groupings": [{"Projections": [0, 1, 2]}]},
|
|
504
|
+
"DataReduction": {"DataVolume": 3, "Primary": {"Top": {"Count": 500}}},
|
|
505
|
+
"Version": 1,
|
|
506
|
+
},
|
|
507
|
+
"ExecutionMetricsKind": 1,
|
|
508
|
+
}
|
|
509
|
+
}
|
|
510
|
+
]
|
|
511
|
+
},
|
|
512
|
+
"ApplicationContext": {
|
|
513
|
+
"DatasetId": context.dataset_id,
|
|
514
|
+
"Sources": [{"ReportId": context.report_id, "VisualId": context.visual_id}],
|
|
515
|
+
},
|
|
516
|
+
}
|
|
517
|
+
],
|
|
518
|
+
"modelId": context.model_id,
|
|
519
|
+
}
|
|
520
|
+
|
|
521
|
+
@classmethod
|
|
522
|
+
def _decode_microdados_catalog(cls, payload: dict[str, Any]) -> list[MicrodadosCatalogEntry]:
    """Turn a querydata response into a de-duplicated list of catalog entries.

    Walks ``payload["results"][*]["result"]["data"]["dsr"]["DS"]``, decodes
    each dataset with ``_decode_dsr_rows`` and reads the first three cells of
    every row as base year, microdados type and download URL. Rows with fewer
    than three cells, or with any of the three cells empty after stripping,
    are ignored. Entries keep the order in which they are first seen.

    Raises:
        RuntimeError: If no entry could be decoded.
    """
    # Insertion-ordered mapping: the keys de-duplicate, the values keep order.
    catalog: dict[tuple[str, str, str], MicrodadosCatalogEntry] = {}

    for result in payload.get("results") or []:
        data = dict((result.get("result") or {}).get("data") or {})
        dsr = dict(data.get("dsr") or {})
        for dataset in dsr.get("DS") or []:
            for cells in cls._decode_dsr_rows(dict(dataset or {})):
                if len(cells) < 3:
                    continue
                ano_base, tipo_microdados, microdados_url = (str(cell or "").strip() for cell in cells[:3])
                if not (ano_base and tipo_microdados and microdados_url):
                    continue
                identity = (ano_base, tipo_microdados, microdados_url)
                if identity not in catalog:
                    catalog[identity] = MicrodadosCatalogEntry(
                        ano_base=ano_base,
                        tipo_microdados=tipo_microdados,
                        microdados_url=microdados_url,
                    )

    if not catalog:
        raise RuntimeError("Power BI querydata did not expose any microdados download entries")
    return list(catalog.values())
|
|
554
|
+
|
|
555
|
+
@classmethod
|
|
556
|
+
def _decode_dsr_rows(cls, dataset: dict[str, Any]) -> list[list[Any]]:
    """Decode every data-member row of a DSR dataset into resolved values.

    For each dict in ``dataset["PH"]``, every member whose key starts with
    ``"DM"`` and whose value is a list is treated as a sequence of rows. A
    row carrying an ``"S"`` list (re)defines the column schema and resets the
    carried-over values. Each row is expanded with ``_inflate_dsr_row`` and
    its cells are resolved with ``_resolve_dsr_value`` against
    ``dataset["ValueDicts"]``.

    Returns:
        One list of resolved cell values per decodable row, in payload order.
    """
    dictionaries = dict(dataset.get("ValueDicts") or {})
    output: list[list[Any]] = []

    placeholders = [item for item in dataset.get("PH") or [] if isinstance(item, dict)]
    for placeholder in placeholders:
        for member, member_rows in placeholder.items():
            is_data_member = isinstance(member, str) and member.startswith("DM")
            if not (is_data_member and isinstance(member_rows, list)):
                continue

            columns: list[dict[str, Any]] = []
            carried: list[Any] = []
            for entry in member_rows:
                if not isinstance(entry, dict):
                    continue

                declared_schema = entry.get("S")
                if isinstance(declared_schema, list):
                    columns = [dict(column or {}) for column in declared_schema]
                    carried = [None] * len(columns)
                if not columns:
                    # No schema seen yet for this member; the row cannot be decoded.
                    continue

                raw_cells = cls._inflate_dsr_row(row=entry, schema=columns, previous_values=carried)
                if raw_cells is None:
                    continue

                # Carry the raw (unresolved) cells forward for the next row.
                carried = list(raw_cells)
                resolved = [
                    cls._resolve_dsr_value(column, cell, dictionaries)
                    for column, cell in zip(columns, raw_cells)
                ]
                output.append(resolved)

    return output
|
|
593
|
+
|
|
594
|
+
@staticmethod
|
|
595
|
+
def _inflate_dsr_row(
|
|
596
|
+
row: dict[str, Any],
|
|
597
|
+
schema: list[dict[str, Any]],
|
|
598
|
+
previous_values: list[Any],
|
|
599
|
+
) -> list[Any] | None:
|
|
600
|
+
compressed = row.get("C")
|
|
601
|
+
if isinstance(compressed, list):
|
|
602
|
+
values = list(previous_values) if previous_values else [None] * len(schema)
|
|
603
|
+
start_index = len(schema) - len(compressed)
|
|
604
|
+
repeated_prefix = row.get("R")
|
|
605
|
+
if isinstance(repeated_prefix, int):
|
|
606
|
+
start_index = repeated_prefix
|
|
607
|
+
start_index = max(0, min(start_index, len(schema)))
|
|
608
|
+
for offset, value in enumerate(compressed):
|
|
609
|
+
index = start_index + offset
|
|
610
|
+
if index >= len(schema):
|
|
611
|
+
break
|
|
612
|
+
values[index] = value
|
|
613
|
+
return values
|
|
614
|
+
|
|
615
|
+
named_values = [row.get(str(column.get("N") or "")) for column in schema]
|
|
616
|
+
if any(value is not None for value in named_values):
|
|
617
|
+
return named_values
|
|
618
|
+
return None
|
|
619
|
+
|
|
620
|
+
@staticmethod
|
|
621
|
+
def _resolve_dsr_value(schema_item: dict[str, Any], raw_value: Any, value_dicts: dict[str, Any]) -> Any:
|
|
622
|
+
dictionary_name = schema_item.get("DN")
|
|
623
|
+
if isinstance(raw_value, int) and isinstance(dictionary_name, str):
|
|
624
|
+
dictionary = value_dicts.get(dictionary_name)
|
|
625
|
+
if isinstance(dictionary, list) and 0 <= raw_value < len(dictionary):
|
|
626
|
+
raw_value = dictionary[raw_value]
|
|
627
|
+
|
|
628
|
+
if isinstance(raw_value, str):
|
|
629
|
+
value = raw_value.strip()
|
|
630
|
+
if len(value) >= 2 and value[0] == value[-1] and value[0] in {"'", '"'}:
|
|
631
|
+
value = value[1:-1]
|
|
632
|
+
return value
|
|
633
|
+
return raw_value
|
|
634
|
+
|
|
635
|
+
@staticmethod
|
|
636
|
+
def _slugify(value: str) -> str:
|
|
637
|
+
safe = "".join(char.lower() if char.isalnum() else "_" for char in value.strip())
|
|
638
|
+
collapsed = "_".join(part for part in safe.split("_") if part)
|
|
639
|
+
return collapsed or "arquivo"
|
|
640
|
+
|
|
641
|
+
@staticmethod
|
|
642
|
+
def _read_preview_lines(path: Path, line_count: int) -> tuple[str, ...]:
|
|
643
|
+
if line_count <= 0:
|
|
644
|
+
return ()
|
|
645
|
+
|
|
646
|
+
opener = gzip.open if path.suffix == ".gz" else open
|
|
647
|
+
lines: list[str] = []
|
|
648
|
+
with opener(path, "rt", encoding="utf-8", errors="replace") as handle:
|
|
649
|
+
for _ in range(line_count):
|
|
650
|
+
line = handle.readline()
|
|
651
|
+
if not line:
|
|
652
|
+
break
|
|
653
|
+
lines.append(line.rstrip("\n"))
|
|
654
|
+
return tuple(lines)
|
|
655
|
+
|
|
656
|
+
@staticmethod
|
|
657
|
+
def decode_content_bytes(content_bytes: bytes, source_url: str) -> str:
    """Decode downloaded file content into text.

    The content is gunzipped first when the URL (with or without its query
    string / fragment) ends in ``.gz``, or when the bytes start with the gzip
    magic number. The result is decoded with the first encoding that
    succeeds: UTF-8 (a leading BOM is dropped), then cp1252, then latin-1,
    which accepts any byte sequence.

    Args:
        content_bytes: Raw bytes as downloaded.
        source_url: URL the bytes came from; only used to detect gzip.

    Returns:
        The decoded text.

    Raises:
        OSError: If the content is expected to be gzip but cannot be
            decompressed (raised by ``gzip.decompress``).
    """
    url_lower = source_url.lower()
    # Signed/parameterized links keep the ".gz" before the "?" or "#".
    url_path = url_lower.split("#", 1)[0].split("?", 1)[0]
    named_gzip = url_lower.endswith(".gz") or url_path.endswith(".gz")
    has_gzip_magic = content_bytes[:2] == b"\x1f\x8b"

    raw_bytes = gzip.decompress(content_bytes) if named_gzip or has_gzip_magic else content_bytes

    # "utf-8-sig" also decodes BOM-less UTF-8, so a separate "utf-8" attempt
    # would never succeed where this one failed.
    for encoding in ("utf-8-sig", "cp1252"):
        try:
            return raw_bytes.decode(encoding)
        except UnicodeDecodeError:
            continue
    # latin-1 maps every byte to a code point and therefore cannot fail.
    return raw_bytes.decode("latin-1")
|
|
665
|
+
|
|
666
|
+
|
|
667
|
+
def build_cli_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the microdados catalog tool.

    Options: ``--page-url`` (defaults to ``DEFAULT_POWERBI_MICRODADOS_URL``),
    ``--timeout-seconds`` (int, default 60), ``--output-json``,
    ``--download-dir``, ``--download-limit`` (int, default 0) and
    ``--preview-lines`` (int, default 0).
    """
    cli = argparse.ArgumentParser(description="Consulta o catálogo de microdados públicos da PNP via Power BI.")

    # (flag, keyword arguments) pairs, registered in this order.
    option_specs: tuple[tuple[str, dict[str, Any]], ...] = (
        ("--page-url", {"default": DEFAULT_POWERBI_MICRODADOS_URL, "help": "URL pública do relatório Power BI."}),
        ("--timeout-seconds", {"type": int, "default": 60, "help": "Timeout HTTP em segundos."}),
        ("--output-json", {"help": "Caminho para salvar o catálogo completo em JSON."}),
        ("--download-dir", {"help": "Diretório para baixar arquivos de microdados."}),
        (
            "--download-limit",
            {
                "type": int,
                "default": 0,
                "help": "Quantidade de arquivos a baixar. Use 0 para apenas listar o catálogo.",
            },
        ),
        (
            "--preview-lines",
            {
                "type": int,
                "default": 0,
                "help": "Quantidade de linhas de prévia para imprimir por arquivo baixado.",
            },
        ),
    )
    for flag, options in option_specs:
        cli.add_argument(flag, **options)
    return cli
|
|
686
|
+
|
|
687
|
+
|
|
688
|
+
def main(argv: list[str] | None = None) -> int:
    """Run the CLI: fetch the catalog, report it and optionally download files.

    Steps, in order:

    1. Parse ``argv`` with ``build_cli_parser``.
    2. Fetch the catalog through ``PowerBIMicrodadosClient.fetch_catalog``.
    3. When ``--output-json`` is given, write the full manifest (context and
       entries) to that path, creating parent directories as needed.
    4. Print a one-line JSON summary to stdout.
    5. When ``--download-dir`` is given and ``--download-limit`` is positive,
       download the first ``--download-limit`` entries and print the results
       as indented JSON.

    Returns:
        ``0`` on completion.
    """
    options = build_cli_parser().parse_args(argv)

    client = PowerBIMicrodadosClient(page_url=options.page_url, timeout_seconds=options.timeout_seconds)
    context, entries = client.fetch_catalog()

    manifest = {
        "context": asdict(context),
        "entries": [asdict(item) for item in entries],
    }
    if options.output_json:
        target = Path(options.output_json)
        target.parent.mkdir(parents=True, exist_ok=True)
        serialized_manifest = json.dumps(manifest, ensure_ascii=False, indent=2)
        target.write_text(serialized_manifest, encoding="utf-8")

    summary = {
        "entry_count": len(entries),
        "first_entry": asdict(entries[0]) if entries else None,
        "resource_key": context.resource_key,
        "visual_id": context.visual_id,
    }
    print(json.dumps(summary, ensure_ascii=False))

    if options.download_dir and options.download_limit > 0:
        download_results = []
        for item in entries[: options.download_limit]:
            outcome = client.download_entry(item, options.download_dir, preview_line_count=options.preview_lines)
            download_results.append(asdict(outcome))
        print(json.dumps({"downloads": download_results}, ensure_ascii=False, indent=2))

    return 0
|
|
725
|
+
|
|
726
|
+
|
|
727
|
+
# Script entry point: run the CLI and exit with the status returned by main().
if __name__ == "__main__":
    raise SystemExit(main())
|