@dataif/cli 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (183) hide show
  1. package/README.md +16 -0
  2. package/bin/dataif.js +623 -0
  3. package/package.json +26 -0
  4. package/scripts/build-template.mjs +72 -0
  5. package/templates/dataif/README.md +157 -0
  6. package/templates/dataif/infra/.env.example +119 -0
  7. package/templates/dataif/infra/.env.stg.example +119 -0
  8. package/templates/dataif/infra/airflow/Dockerfile +11 -0
  9. package/templates/dataif/infra/airflow/Dockerfile.release +17 -0
  10. package/templates/dataif/infra/airflow/requirements.txt +3 -0
  11. package/templates/dataif/infra/docker-compose.yml +306 -0
  12. package/templates/dataif/infra/init-db/01-init-dataif.sh +129 -0
  13. package/templates/dataif/infra/init-db/pnp-curated-views.sqlinc +444 -0
  14. package/templates/dataif/infra/init-db/pnp-raw-staging-curated.sqlinc +701 -0
  15. package/templates/dataif/infra/keycloak/Dockerfile +4 -0
  16. package/templates/dataif/infra/keycloak/realm-dataif.json +73 -0
  17. package/templates/dataif/infra/ollama/Dockerfile +9 -0
  18. package/templates/dataif/infra/ollama/bootstrap-model.sh +100 -0
  19. package/templates/dataif/infra/ollama/sabia-7b.Modelfile +14 -0
  20. package/templates/dataif/infra/postgres/Dockerfile +4 -0
  21. package/templates/dataif/pipelines/airflow/dags/generated/.gitkeep +1 -0
  22. package/templates/dataif/pipelines/airflow/dags/generated/2020_financeiro_fcc6f1f3_sync.py +9 -0
  23. package/templates/dataif/pipelines/dataif_pipelines/__init__.py +1 -0
  24. package/templates/dataif/pipelines/dataif_pipelines/airflow/__init__.py +1 -0
  25. package/templates/dataif/pipelines/dataif_pipelines/airflow/pnp_pipeline_factory.py +167 -0
  26. package/templates/dataif/pipelines/dataif_pipelines/connectors/__init__.py +1 -0
  27. package/templates/dataif/pipelines/dataif_pipelines/connectors/base/__init__.py +1 -0
  28. package/templates/dataif/pipelines/dataif_pipelines/connectors/base/connector.py +28 -0
  29. package/templates/dataif/pipelines/dataif_pipelines/connectors/base/types.py +14 -0
  30. package/templates/dataif/pipelines/dataif_pipelines/connectors/nilo_pecanha/__init__.py +1 -0
  31. package/templates/dataif/pipelines/dataif_pipelines/connectors/nilo_pecanha/config.py +19 -0
  32. package/templates/dataif/pipelines/dataif_pipelines/connectors/nilo_pecanha/connector.py +558 -0
  33. package/templates/dataif/pipelines/dataif_pipelines/connectors/nilo_pecanha/powerbi_microdados.py +728 -0
  34. package/templates/dataif/pipelines/dataif_pipelines/connectors/nilo_pecanha/transform.py +296 -0
  35. package/templates/dataif/pipelines/dataif_pipelines/jobs/__init__.py +1 -0
  36. package/templates/dataif/pipelines/dataif_pipelines/jobs/nilo_pipeline.py +112 -0
  37. package/templates/dataif/pipelines/dataif_pipelines/orchestration/__init__.py +21 -0
  38. package/templates/dataif/pipelines/dataif_pipelines/orchestration/pnp_workflow.py +783 -0
  39. package/templates/dataif/pipelines/dataif_pipelines/repositories/__init__.py +1 -0
  40. package/templates/dataif/pipelines/dataif_pipelines/repositories/pnp_raw_repository.py +860 -0
  41. package/templates/dataif/pipelines/dataif_pipelines/services/__init__.py +19 -0
  42. package/templates/dataif/pipelines/dataif_pipelines/services/pnp_curated_service.py +66 -0
  43. package/templates/dataif/pipelines/dataif_pipelines/services/pnp_download_service.py +534 -0
  44. package/templates/dataif/pipelines/dataif_pipelines/services/pnp_quality_service.py +9 -0
  45. package/templates/dataif/pipelines/dataif_pipelines/services/pnp_raw_ingestion_service.py +124 -0
  46. package/templates/dataif/pipelines/dataif_pipelines/services/pnp_staging_service.py +271 -0
  47. package/templates/dataif/pipelines/dataif_pipelines/services/powerbi_catalog_service.py +159 -0
  48. package/templates/dataif/pipelines/sql/staging/020_pnp_matriculas.sql +112 -0
  49. package/templates/dataif/pipelines/sql/staging/030_pnp_eficiencia_academica.sql +83 -0
  50. package/templates/dataif/pipelines/sql/staging/040_pnp_servidores.sql +90 -0
  51. package/templates/dataif/pipelines/sql/staging/050_pnp_financeiro.sql +72 -0
  52. package/templates/dataif/pipelines/sql/views_curated/004_mv_pnp_dashboard_fast.sql +204 -0
  53. package/templates/dataif/pipelines/sql/views_curated/010_vw_pnp_admin_ingestao.sql +51 -0
  54. package/templates/dataif/pipelines/sql/views_curated/020_vw_pnp_qualidade_dados.sql +114 -0
  55. package/templates/dataif/pipelines/sql/views_curated/030_vw_pnp_matriculas.sql +67 -0
  56. package/templates/dataif/pipelines/sql/views_curated/040_vw_pnp_eficiencia.sql +33 -0
  57. package/templates/dataif/pipelines/sql/views_curated/050_vw_pnp_servidores.sql +30 -0
  58. package/templates/dataif/pipelines/sql/views_curated/060_vw_pnp_financeiro.sql +22 -0
  59. package/templates/dataif/pipelines/sql/views_curated/070_vw_pnp_vanna.sql +115 -0
  60. package/templates/dataif/scripts/configure-env.sh +149 -0
  61. package/templates/dataif/scripts/create_metabase_pnp_dashboard.py +943 -0
  62. package/templates/dataif/scripts/create_metabase_pnp_matriculas_dashboard.py +580 -0
  63. package/templates/dataif/scripts/deploy.sh +79 -0
  64. package/templates/dataif/scripts/fix_metabase_template_tag_ids.py +91 -0
  65. package/templates/dataif/scripts/pnp_powerbi_microdados_probe.py +14 -0
  66. package/templates/dataif/scripts/pnp_validate_raw_run.py +330 -0
  67. package/templates/dataif/scripts/publish-images.sh +31 -0
  68. package/templates/dataif/scripts/sync_metabase_dashboard_field_filters.py +241 -0
  69. package/templates/dataif/scripts/use-vanna-ollama.sh +139 -0
  70. package/templates/dataif/services/api/.dockerignore +18 -0
  71. package/templates/dataif/services/api/Dockerfile +12 -0
  72. package/templates/dataif/services/api/app/__init__.py +1 -0
  73. package/templates/dataif/services/api/app/auth.py +48 -0
  74. package/templates/dataif/services/api/app/config.py +59 -0
  75. package/templates/dataif/services/api/app/keycloak_admin.py +215 -0
  76. package/templates/dataif/services/api/app/main.py +2432 -0
  77. package/templates/dataif/services/api/app/metabase_admin.py +191 -0
  78. package/templates/dataif/services/api/app/metabase_bootstrap.py +44 -0
  79. package/templates/dataif/services/api/app/metabase_embed.py +15 -0
  80. package/templates/dataif/services/api/app/pnp_dag_provisioner.py +113 -0
  81. package/templates/dataif/services/api/app/pnp_instance_repository.py +951 -0
  82. package/templates/dataif/services/api/app/pnp_powerbi.py +438 -0
  83. package/templates/dataif/services/api/app/vanna_client.py +32 -0
  84. package/templates/dataif/services/api/requirements.txt +9 -0
  85. package/templates/dataif/services/vanna/.dockerignore +18 -0
  86. package/templates/dataif/services/vanna/Dockerfile +12 -0
  87. package/templates/dataif/services/vanna/app/config.py +57 -0
  88. package/templates/dataif/services/vanna/app/main.py +108 -0
  89. package/templates/dataif/services/vanna/app/runtime_config.py +114 -0
  90. package/templates/dataif/services/vanna/app/sql_guard.py +123 -0
  91. package/templates/dataif/services/vanna/app/vanna_engine.py +382 -0
  92. package/templates/dataif/services/vanna/requirements.txt +8 -0
  93. package/templates/dataif/services/web/.dockerignore +13 -0
  94. package/templates/dataif/services/web/Dockerfile +16 -0
  95. package/templates/dataif/services/web/index.html +12 -0
  96. package/templates/dataif/services/web/nginx.conf +74 -0
  97. package/templates/dataif/services/web/package-lock.json +4397 -0
  98. package/templates/dataif/services/web/package.json +32 -0
  99. package/templates/dataif/services/web/postcss.config.mjs +5 -0
  100. package/templates/dataif/services/web/src/App.jsx +2817 -0
  101. package/templates/dataif/services/web/src/adminAuth.js +245 -0
  102. package/templates/dataif/services/web/src/assets/avatar_placeholder.png +0 -0
  103. package/templates/dataif/services/web/src/assets/github_logo_icon_229278.svg +1 -0
  104. package/templates/dataif/services/web/src/assets/if-logo.png +0 -0
  105. package/templates/dataif/services/web/src/assets/if.svg +0 -0
  106. package/templates/dataif/services/web/src/assets/pnp-horizontal.svg +1 -0
  107. package/templates/dataif/services/web/src/components/AppHeader.jsx +233 -0
  108. package/templates/dataif/services/web/src/components/application/app-navigation/base-components/mobile-header.tsx +56 -0
  109. package/templates/dataif/services/web/src/components/application/app-navigation/base-components/nav-account-card.tsx +209 -0
  110. package/templates/dataif/services/web/src/components/application/app-navigation/base-components/nav-item-button.tsx +67 -0
  111. package/templates/dataif/services/web/src/components/application/app-navigation/base-components/nav-item.tsx +108 -0
  112. package/templates/dataif/services/web/src/components/application/app-navigation/base-components/nav-list.tsx +83 -0
  113. package/templates/dataif/services/web/src/components/application/app-navigation/config.ts +23 -0
  114. package/templates/dataif/services/web/src/components/application/app-navigation/header-navigation.tsx +240 -0
  115. package/templates/dataif/services/web/src/components/application/pagination/pagination-base.tsx +376 -0
  116. package/templates/dataif/services/web/src/components/application/pagination/pagination-dot.tsx +52 -0
  117. package/templates/dataif/services/web/src/components/application/pagination/pagination-line.tsx +48 -0
  118. package/templates/dataif/services/web/src/components/application/pagination/pagination.tsx +328 -0
  119. package/templates/dataif/services/web/src/components/application/tabs/tabs.tsx +223 -0
  120. package/templates/dataif/services/web/src/components/base/avatar/avatar-label-group.tsx +28 -0
  121. package/templates/dataif/services/web/src/components/base/avatar/avatar.tsx +129 -0
  122. package/templates/dataif/services/web/src/components/base/avatar/base-components/avatar-add-button.tsx +32 -0
  123. package/templates/dataif/services/web/src/components/base/avatar/base-components/avatar-company-icon.tsx +24 -0
  124. package/templates/dataif/services/web/src/components/base/avatar/base-components/avatar-online-indicator.tsx +29 -0
  125. package/templates/dataif/services/web/src/components/base/avatar/base-components/index.tsx +4 -0
  126. package/templates/dataif/services/web/src/components/base/avatar/base-components/verified-tick.tsx +32 -0
  127. package/templates/dataif/services/web/src/components/base/badges/badge-types.ts +264 -0
  128. package/templates/dataif/services/web/src/components/base/badges/badges.tsx +415 -0
  129. package/templates/dataif/services/web/src/components/base/button-group/button-group.tsx +104 -0
  130. package/templates/dataif/services/web/src/components/base/buttons/button.tsx +267 -0
  131. package/templates/dataif/services/web/src/components/base/input/hint-text.tsx +31 -0
  132. package/templates/dataif/services/web/src/components/base/input/input.tsx +269 -0
  133. package/templates/dataif/services/web/src/components/base/input/label.tsx +48 -0
  134. package/templates/dataif/services/web/src/components/base/radio-buttons/radio-buttons.tsx +127 -0
  135. package/templates/dataif/services/web/src/components/base/select/combobox.tsx +150 -0
  136. package/templates/dataif/services/web/src/components/base/select/multi-select.tsx +361 -0
  137. package/templates/dataif/services/web/src/components/base/select/popover.tsx +32 -0
  138. package/templates/dataif/services/web/src/components/base/select/select-item.tsx +95 -0
  139. package/templates/dataif/services/web/src/components/base/select/select-native.tsx +67 -0
  140. package/templates/dataif/services/web/src/components/base/select/select.tsx +144 -0
  141. package/templates/dataif/services/web/src/components/base/tags/base-components/tag-close-x.tsx +32 -0
  142. package/templates/dataif/services/web/src/components/base/tooltip/tooltip.tsx +107 -0
  143. package/templates/dataif/services/web/src/components/foundations/dot-icon.tsx +22 -0
  144. package/templates/dataif/services/web/src/components/foundations/logo/untitledui-logo-minimal.tsx +170 -0
  145. package/templates/dataif/services/web/src/components/foundations/logo/untitledui-logo.tsx +58 -0
  146. package/templates/dataif/services/web/src/hooks/use-breakpoint.ts +34 -0
  147. package/templates/dataif/services/web/src/hooks/use-resize-observer.ts +67 -0
  148. package/templates/dataif/services/web/src/main.jsx +14 -0
  149. package/templates/dataif/services/web/src/providers/theme-provider.jsx +62 -0
  150. package/templates/dataif/services/web/src/styles/globals.css +60 -0
  151. package/templates/dataif/services/web/src/styles/theme.css +1326 -0
  152. package/templates/dataif/services/web/src/styles/typography.css +430 -0
  153. package/templates/dataif/services/web/src/styles.css +1287 -0
  154. package/templates/dataif/services/web/src/utils/cx.ts +24 -0
  155. package/templates/dataif/services/web/src/utils/is-react-component.ts +33 -0
  156. package/templates/dataif/services/web/vite.config.js +14 -0
  157. package/templates/dataif/sql/ddl/001_schemas.sql +6 -0
  158. package/templates/dataif/sql/ddl/003_pnp_raw_staging_curated.sql +699 -0
  159. package/templates/dataif/sql/migrations/001_pnp_phase1_backfill.sql +3 -0
  160. package/templates/dataif/sql/migrations/002_pnp_phase2_admin_config_backfill.sql +184 -0
  161. package/templates/dataif/sql/migrations/003_pnp_phase3_raw_tabular_backfill.sql +3 -0
  162. package/templates/dataif/sql/migrations/004_pnp_phase3_raw_backfill_support_index.sql +3 -0
  163. package/templates/dataif/sql/migrations/005_pnp_phase7_staging_support_indexes.sql +2 -0
  164. package/templates/dataif/sql/migrations/006_pnp_phase7_staging_autovacuum_tuning.sql +2 -0
  165. package/templates/dataif/sql/migrations/007_pnp_phase7b_run_packages.sql +20 -0
  166. package/templates/dataif/sql/migrations/008_pnp_phase7a_pipeline_endpoints.sql +169 -0
  167. package/templates/dataif/sql/migrations/009_pnp_phase8_curated.sql +35 -0
  168. package/templates/dataif/sql/migrations/010_pnp_phase10_staging_incremental_upsert.sql +3 -0
  169. package/templates/dataif/sql/migrations/010_pnp_pipeline_uuid.sql +51 -0
  170. package/templates/dataif/sql/migrations/011_app_settings.sql +7 -0
  171. package/templates/dataif/sql/staging/020_pnp_matriculas.sql +112 -0
  172. package/templates/dataif/sql/staging/030_pnp_eficiencia_academica.sql +83 -0
  173. package/templates/dataif/sql/staging/040_pnp_servidores.sql +90 -0
  174. package/templates/dataif/sql/staging/050_pnp_financeiro.sql +72 -0
  175. package/templates/dataif/sql/views_curated/003_vw_pnp_microdados_admin.sql +160 -0
  176. package/templates/dataif/sql/views_curated/004_mv_pnp_dashboard_fast.sql +204 -0
  177. package/templates/dataif/sql/views_curated/010_vw_pnp_admin_ingestao.sql +51 -0
  178. package/templates/dataif/sql/views_curated/020_vw_pnp_qualidade_dados.sql +114 -0
  179. package/templates/dataif/sql/views_curated/030_vw_pnp_matriculas.sql +67 -0
  180. package/templates/dataif/sql/views_curated/040_vw_pnp_eficiencia.sql +33 -0
  181. package/templates/dataif/sql/views_curated/050_vw_pnp_servidores.sql +30 -0
  182. package/templates/dataif/sql/views_curated/060_vw_pnp_financeiro.sql +22 -0
  183. package/templates/dataif/sql/views_curated/070_vw_pnp_vanna.sql +115 -0
@@ -0,0 +1,438 @@
1
+ from __future__ import annotations
2
+
3
+ import base64
4
+ import json
5
+ import uuid
6
+ from typing import Any
7
+ from urllib.parse import parse_qs, unquote, urlparse
8
+
9
+ import httpx
10
+
11
# Default public Power BI report page used when the caller supplies no URL.
DEFAULT_PNP_POWERBI_REPORT_URL = (
    "https://app.powerbi.com/view?"
    "r=eyJrIjoiZDhkNGNiYzgtMjQ0My00OGVlLWJjNzYtZWQwYjI2OThhYWM1IiwidCI6IjllNjgyMzU5LWQxMjgtNGVkYi1iYjU4LTgyYjJhMTUzNDBmZiJ9"
)

# Display name of the report section searched first for the catalog visual.
MICRODADOS_SECTION_DISPLAY_NAME = "Microdados da PNP"

# Query references a visual must project (rows / columns / values) to be
# accepted as the microdata catalog.
MICRODADOS_ROWS_QUERY_REF = "Microdados.Ano Base"
MICRODADOS_COLUMNS_QUERY_REF = "Microdados.Tipo de microdados"
MICRODADOS_VALUES_QUERY_REF = "Microdados.MicrodadosURL"

# Entity and property names used to build the fallback prototype query.
MICRODADOS_ENTITY_NAME = "Microdados"
MICRODADOS_ANO_PROPERTY = "Ano Base"
MICRODADOS_TIPO_PROPERTY = "Tipo de microdados"
MICRODADOS_URL_PROPERTY = "MicrodadosURL"

# Known microdata types; the tuple order is the preferred display order.
PNP_MICRODADOS_TYPES = (
    "Eficiência Acadêmica",
    "Financeiro",
    "Matrículas",
    "Servidores",
)
29
+
30
+
31
def load_public_microdados_catalog(
    *,
    timeout_seconds: float,
    page_url: str = DEFAULT_PNP_POWERBI_REPORT_URL,
) -> dict[str, Any]:
    """Load the microdata catalog published by a public Power BI report.

    The function downloads the report page, resolves the resource key, tenant
    and cluster, fetches the report metadata, locates the catalog visual and
    runs its query through the public ``querydata`` endpoint.

    Args:
        timeout_seconds: Timeout applied to every HTTP request.
        page_url: Public report URL; defaults to the PNP report.

    Returns:
        A dict with ``page_url``, ``resource_key``, ``available_years``
        (descending), ``available_microdados_types`` (in
        ``PNP_MICRODADOS_TYPES`` order), ``types_by_year`` and ``items``
        (dicts with ``ano_base``, ``tipo_microdados``, ``microdados_url``).

    Raises:
        RuntimeError: If the resource descriptor, the cluster URI or a
            compatible visual cannot be resolved.
        httpx.HTTPStatusError: If any request returns an error status.
    """
    with httpx.Client(timeout=timeout_seconds, follow_redirects=True) as client:
        html_response = client.get(page_url)
        html_response.raise_for_status()
        html = html_response.text

        # Prefer the descriptor embedded in the page; fall back to the `r` query parameter.
        resource_descriptor = _extract_resource_descriptor(html) or _decode_resource_descriptor_from_url(page_url)
        resource_key = str(resource_descriptor.get("k") or "").strip()
        tenant_id = str(resource_descriptor.get("t") or "").strip()
        if not resource_key or not tenant_id:
            raise RuntimeError("Power BI page did not expose the public resource descriptor")

        resolved_cluster_uri = _extract_resolved_cluster_uri(html)
        if not resolved_cluster_uri:
            # The page did not inline the cluster; ask the public routing endpoint.
            route_response = client.get(
                f"https://api.powerbi.com/public/routing/cluster/{tenant_id}",
                headers=_powerbi_headers(resource_key),
            )
            route_response.raise_for_status()
            resolved_cluster_uri = str((route_response.json() or {}).get("FixedClusterUri") or "").strip()
        if not resolved_cluster_uri:
            raise RuntimeError("Power BI routing did not return FixedClusterUri")

        api_base_url = _build_apim_url(resolved_cluster_uri)
        metadata_response = client.get(
            f"{api_base_url}/public/reports/{resource_key}/modelsAndExploration?preferReadOnlySession=true",
            headers=_powerbi_headers(resource_key),
        )
        metadata_response.raise_for_status()
        metadata = metadata_response.json()
        visual = _find_microdados_visual(metadata)
        report = dict((metadata.get("exploration") or {}).get("report") or {})
        model = dict(report.get("model") or {})
        model_fallback = dict((metadata.get("models") or [{}])[0] or {})

        query_body = {
            "version": "1.0.0",
            "queries": [
                {
                    "Query": {
                        "Commands": [
                            {
                                "SemanticQueryDataShapeCommand": {
                                    "Query": dict((visual.get("singleVisual") or {}).get("prototypeQuery") or {}),
                                    "Binding": {
                                        "Primary": {"Groupings": [{"Projections": [0, 1, 2]}]},
                                        "DataReduction": {"DataVolume": 3, "Primary": {"Top": {"Count": 500}}},
                                        "Version": 1,
                                    },
                                    "ExecutionMetricsKind": 1,
                                }
                            }
                        ]
                    },
                    "ApplicationContext": {
                        "DatasetId": str(model.get("dbName") or model_fallback.get("dbName") or ""),
                        "Sources": [
                            {
                                "ReportId": str(report.get("objectId") or ""),
                                "VisualId": str(visual.get("name") or ""),
                            }
                        ],
                    },
                }
            ],
            "modelId": int(report.get("modelId") or model_fallback.get("id") or 0),
        }
        query_response = client.post(
            f"{api_base_url}/public/reports/querydata?synchronous=true",
            headers=_powerbi_headers(resource_key, json_request=True),
            json=query_body,
        )
        query_response.raise_for_status()
        query_payload = query_response.json()

    items = _decode_microdados_catalog(query_payload)
    type_rank = {name: index for index, name in enumerate(PNP_MICRODADOS_TYPES)}

    def _catalog_sort_key(item: dict[str, str]) -> tuple[int, int, str]:
        year = str(item["ano_base"])
        # isdecimal() (unlike isdigit()) only accepts characters int() can parse.
        year_order = -int(year) if year.isdecimal() else 0
        return (year_order, type_rank.get(item["tipo_microdados"], 999), item["microdados_url"])

    # Newest year first, then known types in their canonical order, then URL.
    items.sort(key=_catalog_sort_key)

    years = sorted({item["ano_base"] for item in items}, reverse=True)
    types_seen_by_year: dict[str, set[str]] = {}
    for item in items:
        types_seen_by_year.setdefault(item["ano_base"], set()).add(item["tipo_microdados"])
    by_year: dict[str, list[str]] = {
        year: sorted(types_seen_by_year[year], key=lambda name: (type_rank.get(name, 999), name))
        for year in years
    }
    present_types = {item["tipo_microdados"] for item in items}

    return {
        "page_url": page_url,
        "resource_key": resource_key,
        "available_years": years,
        "available_microdados_types": [name for name in PNP_MICRODADOS_TYPES if name in present_types],
        "types_by_year": by_year,
        "items": items,
    }
134
+
135
+
136
def _powerbi_headers(resource_key: str, json_request: bool = False) -> dict[str, str]:
    """Build the request headers sent to the public Power BI endpoints.

    Every call generates fresh ``ActivityId`` and ``RequestId`` UUIDs.
    ``Content-Type: application/json`` is added only when ``json_request``
    is true.
    """
    content_type = {"Content-Type": "application/json"} if json_request else {}
    return {
        "Accept": "application/json",
        "ActivityId": str(uuid.uuid4()),
        "RequestId": str(uuid.uuid4()),
        "X-PowerBI-ResourceKey": resource_key,
        **content_type,
    }
146
+
147
+
148
def _extract_resource_descriptor(html: str) -> dict[str, str]:
    """Extract the resource descriptor embedded in the report page HTML.

    Looks for ``resourceDescriptor = JSON.parse('...')``, un-escapes the
    JavaScript string literal and parses it as JSON.

    Returns:
        ``{"k": <resource key>, "t": <tenant id>}``, or an empty dict when
        the marker is missing, the payload is malformed, or either value is
        empty. An empty dict lets the caller fall back to the URL token.
    """
    marker = "resourceDescriptor = JSON.parse('"
    start = html.find(marker)
    if start < 0:
        return {}
    start += len(marker)
    end = html.find("');", start)
    if end < 0:
        return {}

    try:
        # unicode_escape decodes bytes as latin-1, so a utf-8 round trip
        # corrupts non-ASCII text. Encoding as latin-1 with backslashreplace
        # keeps every character intact while still resolving \xNN / \uNNNN.
        payload = html[start:end].encode("latin-1", "backslashreplace").decode("unicode_escape")
        descriptor = json.loads(payload)
    except ValueError:
        # Covers UnicodeDecodeError (bad escape) and json.JSONDecodeError.
        return {}
    if not isinstance(descriptor, dict):
        return {}

    resource_key = str(descriptor.get("k") or "").strip()
    tenant_id = str(descriptor.get("t") or "").strip()
    if not resource_key or not tenant_id:
        # A partial descriptor would be a truthy dict and block the caller's fallback.
        return {}
    return {"k": resource_key, "t": tenant_id}
163
+
164
+
165
def _decode_resource_descriptor_from_url(page_url: str) -> dict[str, str]:
    """Decode the resource descriptor from the report URL's ``r`` parameter.

    The parameter is treated as base64-encoded JSON with ``k`` and ``t``
    keys; missing padding is restored before decoding.

    Returns:
        ``{"k": ..., "t": ...}`` (values may be empty strings), or an empty
        dict when the parameter is absent or cannot be decoded.
    """
    encoded = parse_qs(urlparse(page_url).query).get("r", [])
    if not encoded:
        return {}

    # parse_qs turns "+" into a space. A space is never valid base64 and the
    # lenient decoder would silently drop it, so restore the "+".
    token = unquote(encoded[0]).strip().replace(" ", "+")
    padding = "=" * (-len(token) % 4)
    try:
        payload = json.loads(base64.urlsafe_b64decode(token + padding).decode("utf-8"))
    except ValueError:
        # binascii.Error, UnicodeDecodeError and JSONDecodeError are all ValueErrors.
        return {}
    if not isinstance(payload, dict):
        return {}
    return {
        "k": str(payload.get("k") or "").strip(),
        "t": str(payload.get("t") or "").strip(),
    }
177
+
178
+
179
def _extract_resolved_cluster_uri(html: str) -> str:
    """Return the value of ``var resolvedClusterUri = '...'`` found in the page.

    Returns an empty string when the assignment or its closing ``';`` is
    not present.
    """
    _, marker_found, remainder = html.partition("var resolvedClusterUri = '")
    if not marker_found:
        return ""
    cluster_uri, terminator_found, _ = remainder.partition("';")
    return cluster_uri.strip() if terminator_found else ""
189
+
190
+
191
def _build_apim_url(cluster_uri: str) -> str:
    """Derive the API base URL from a Power BI cluster URI.

    The first host label has ``-redirect`` and ``global-`` removed and
    ``-api`` appended; the remaining labels are kept. Path and port are
    dropped, and the scheme defaults to ``https``.

    Raises:
        RuntimeError: If ``cluster_uri`` has no hostname.
    """
    parts = urlparse(cluster_uri)
    if not parts.hostname:
        raise RuntimeError("Invalid Power BI cluster uri")

    first_label, *other_labels = parts.hostname.split(".")
    api_label = first_label.replace("-redirect", "").replace("global-", "") + "-api"
    api_host = ".".join([api_label, *other_labels])
    return f"{parts.scheme or 'https'}://{api_host}"
202
+
203
+
204
def _find_microdados_visual(metadata: dict[str, Any]) -> dict[str, Any]:
    """Locate the visual whose query yields the microdata catalog.

    Resolution order:
      1. a matching visual inside the preferred section;
      2. a synthetic fallback visual built from the preferred section;
      3. a matching visual in any section.

    Raises:
        RuntimeError: If nothing usable is found. The message differs
            depending on whether the preferred section existed.
    """
    sections = list((metadata.get("exploration") or {}).get("sections") or [])
    preferred_section = _find_microdados_section(sections)

    if preferred_section is not None:
        # Try the real visual first, then the synthetic fallback.
        for resolver in (_select_microdados_visual, _build_fallback_visual):
            candidate = resolver(preferred_section)
            if candidate is not None:
                return candidate

    for section in sections:
        candidate = _select_microdados_visual(section)
        if candidate is not None:
            return candidate

    if preferred_section is not None:
        raise RuntimeError(
            "Power BI metadata exposed the Microdados da PNP section, "
            "but no compatible visual or fallback query context was found"
        )

    # List the section names that were present to help diagnose the failure.
    section_names = {
        name
        for section in sections
        if (name := str(section.get("displayName") or "").strip())
    }
    available_sections = ", ".join(sorted(section_names))
    suffix = f"; available sections: {available_sections}" if available_sections else ""
    raise RuntimeError("Power BI metadata did not expose the Microdados da PNP section" + suffix)
240
+
241
+
242
def _find_microdados_section(sections: list[dict[str, Any]]) -> dict[str, Any] | None:
    """Return the first section whose trimmed ``displayName`` equals
    ``MICRODADOS_SECTION_DISPLAY_NAME``, or ``None`` when there is none."""
    return next(
        (
            candidate
            for candidate in sections
            if str(candidate.get("displayName") or "").strip() == MICRODADOS_SECTION_DISPLAY_NAME
        ),
        None,
    )
247
+
248
+
249
def _select_microdados_visual(section: dict[str, Any]) -> dict[str, Any] | None:
    """Return the first visual in ``section`` that matches the catalog.

    Each container's ``config`` is a JSON string. Containers are skipped
    when the config is missing, is not valid JSON, does not parse to an
    object, or has no usable ``singleVisual`` object.

    Returns:
        The parsed config dict of the matching visual, or ``None``.
    """
    for container in section.get("visualContainers") or []:
        if not isinstance(container, dict):
            continue
        raw_config = container.get("config")
        if not isinstance(raw_config, str) or not raw_config.strip():
            continue
        try:
            visual = json.loads(raw_config)
        except json.JSONDecodeError:
            continue
        # Valid JSON is not necessarily an object (could be a list or scalar).
        if not isinstance(visual, dict):
            continue

        single_visual = visual.get("singleVisual")
        if not isinstance(single_visual, dict) or not single_visual:
            continue

        projections = single_visual.get("projections")
        if not isinstance(projections, dict):
            projections = {}
        if _visual_matches_microdados_catalog(single_visual, projections):
            return visual
    return None
267
+
268
+
269
def _visual_matches_microdados_catalog(single_visual: dict[str, Any], projections: dict[str, Any]) -> bool:
    """Tell whether a visual projects the microdata catalog fields.

    A visual matches when either:
      * it is a ``pivotTable`` whose active Rows / Columns / Values
        projections are exactly the three catalog query references; or
      * its ``prototypeQuery`` selects all three references by name.
    """

    def active_refs(role: str) -> list[Any]:
        # A projection without an "active" flag counts as active.
        return [
            item.get("queryRef")
            for item in projections.get(role) or []
            if isinstance(item, dict) and item.get("active", True)
        ]

    exact_projection_match = (
        single_visual.get("visualType") == "pivotTable"
        and active_refs("Rows") == [MICRODADOS_ROWS_QUERY_REF]
        and active_refs("Columns") == [MICRODADOS_COLUMNS_QUERY_REF]
        and active_refs("Values") == [MICRODADOS_VALUES_QUERY_REF]
    )
    if exact_projection_match:
        return True

    prototype_query = dict(single_visual.get("prototypeQuery") or {})
    select_names = {
        str(item.get("Name") or "").strip()
        for item in prototype_query.get("Select") or []
        if isinstance(item, dict)
    }
    required_refs = {
        MICRODADOS_ROWS_QUERY_REF,
        MICRODADOS_COLUMNS_QUERY_REF,
        MICRODADOS_VALUES_QUERY_REF,
    }
    return required_refs.issubset(select_names)
298
+
299
+
300
def _build_fallback_visual(section: dict[str, Any]) -> dict[str, Any] | None:
    """Build a synthetic visual for a section that has no matching visual.

    The visual takes its name from the first container (``objectName``,
    then ``id``, then ``"microdados_catalog"``) and carries the fallback
    prototype query. Returns ``None`` when the section has no containers.
    """
    containers = list(section.get("visualContainers") or [])
    if not containers:
        return None

    first_container = containers[0]
    visual_name = str(
        first_container.get("objectName")
        or first_container.get("id")
        or "microdados_catalog"
    )
    single_visual = {
        "visualType": "microdados_catalog_fallback",
        "prototypeQuery": _build_fallback_prototype_query(),
    }
    return {"name": visual_name, "singleVisual": single_visual}
313
+
314
+
315
def _build_fallback_prototype_query() -> dict[str, Any]:
    """Return a prototype query selecting URL, year and type from the
    microdata entity, used when the report exposes no matching visual."""
    source_alias = "m"

    def source_expression() -> dict[str, Any]:
        # Each select item gets its own expression object.
        return {"SourceRef": {"Source": source_alias}}

    select_items = [
        {
            "Measure": {
                "Expression": source_expression(),
                "Property": MICRODADOS_URL_PROPERTY,
            },
            "Name": MICRODADOS_VALUES_QUERY_REF,
        },
        {
            "Column": {
                "Expression": source_expression(),
                "Property": MICRODADOS_ANO_PROPERTY,
            },
            "Name": MICRODADOS_ROWS_QUERY_REF,
        },
        {
            "Column": {
                "Expression": source_expression(),
                "Property": MICRODADOS_TIPO_PROPERTY,
            },
            "Name": MICRODADOS_COLUMNS_QUERY_REF,
        },
    ]
    return {
        "Version": 2,
        "From": [{"Name": source_alias, "Entity": MICRODADOS_ENTITY_NAME, "Type": 0}],
        "Select": select_items,
    }
343
+
344
+
345
def _decode_microdados_catalog(payload: dict[str, Any]) -> list[dict[str, str]]:
    """Turn a ``querydata`` response into a de-duplicated list of entries.

    The first three cells of each decoded row are read as year, microdata
    type and download URL. Rows with fewer than three cells or with an
    empty cell are dropped. Duplicates are removed, keeping first-seen order.
    """
    catalog: list[dict[str, str]] = []
    emitted: set[tuple[str, str, str]] = set()

    for result in payload.get("results") or []:
        data = dict((result.get("result") or {}).get("data") or {})
        dsr = dict(data.get("dsr") or {})
        for dataset in dsr.get("DS") or []:
            for row in _decode_dsr_rows(dict(dataset or {})):
                if len(row) < 3:
                    continue
                year, microdata_type, download_url = (str(cell or "").strip() for cell in row[:3])
                if not (year and microdata_type and download_url):
                    continue
                identity = (year, microdata_type, download_url)
                if identity in emitted:
                    continue
                emitted.add(identity)
                catalog.append(
                    {
                        "ano_base": year,
                        "tipo_microdados": microdata_type,
                        "microdados_url": download_url,
                    }
                )
    return catalog
372
+
373
+
374
def _decode_dsr_rows(dataset: dict[str, Any]) -> list[list[Any]]:
    """Expand the rows of one dataset into plain value lists.

    Only placeholder members whose key starts with ``"DM"`` and whose value
    is a list are read. A row carrying an ``"S"`` list (re)defines the column
    schema. Each row is inflated against the previous one, then every cell
    is resolved through the dataset's ``ValueDicts``.
    """
    dictionaries = dict(dataset.get("ValueDicts") or {})
    decoded: list[list[Any]] = []

    for placeholder in dataset.get("PH") or []:
        if not isinstance(placeholder, dict):
            continue
        for key, entries in placeholder.items():
            is_data_member = isinstance(key, str) and key.startswith("DM") and isinstance(entries, list)
            if not is_data_member:
                continue

            columns: list[dict[str, Any]] = []
            carried: list[Any] = []
            for entry in entries:
                if not isinstance(entry, dict):
                    continue
                declared = entry.get("S")
                if isinstance(declared, list):
                    # A new schema resets the values carried from earlier rows.
                    columns = [dict(column or {}) for column in declared]
                    carried = [None] * len(columns)
                if not columns:
                    continue
                expanded = _inflate_dsr_row(entry, columns, carried)
                if expanded is None:
                    continue
                carried = list(expanded)
                decoded.append(
                    [_resolve_dsr_value(column, dictionaries, cell) for column, cell in zip(columns, expanded)]
                )
    return decoded
399
+
400
+
401
def _inflate_dsr_row(
    row: dict[str, Any],
    schema: list[dict[str, Any]],
    previous_values: list[Any],
) -> list[Any] | None:
    """Expand one compressed row into a full list of column values.

    Rows come in two shapes:

    * ``{"C": [...], "R": mask, "Ø": mask}``: ``C`` holds only the cells
      actually sent. Bit *i* of ``R`` means column *i* repeats the previous
      row's value; bit *i* of ``Ø`` means column *i* is null. The remaining
      columns consume ``C`` left to right.
      NOTE(review): this wire format is not officially documented; the
      bitmask reading follows what public ``querydata`` responses show.
    * ``{<column name>: value, ...}``: cells keyed by the schema's ``N``.

    When ``C`` is shorter than the schema and neither mask is present, the
    values are right-aligned and leading columns keep the previous row's
    values (the earlier heuristic, preserved).

    Returns:
        One value per schema column, or ``None`` if the row has no data.
    """
    width = len(schema)
    compressed = row.get("C")
    if isinstance(compressed, list):
        repeat_mask = row.get("R")
        null_mask = row.get("\u00d8")  # "Ø"

        if not isinstance(repeat_mask, int) and not isinstance(null_mask, int):
            # No masks: right-align the cells over the carried-forward values.
            values = list(previous_values) if previous_values else [None] * width
            start_index = max(0, width - len(compressed))
            for offset, value in enumerate(compressed):
                index = start_index + offset
                if index >= width:
                    break
                values[index] = value
            return values

        repeat_bits = repeat_mask if isinstance(repeat_mask, int) else 0
        null_bits = null_mask if isinstance(null_mask, int) else 0
        pending = iter(compressed)
        values = []
        for index in range(width):
            bit = 1 << index
            if repeat_bits & bit:
                values.append(previous_values[index] if index < len(previous_values) else None)
            elif null_bits & bit:
                values.append(None)
            else:
                values.append(next(pending, None))
        return values

    named_values = [row.get(str(column.get("N") or "")) for column in schema]
    if any(value is not None for value in named_values):
        return named_values
    return None
424
+
425
+
426
def _resolve_dsr_value(schema_item: dict[str, Any], value_dicts: dict[str, Any], raw_value: Any) -> Any:
    """Resolve one cell to its final value.

    If the column names a dictionary (``DN``) and the cell is an in-range
    integer index into it, the indexed entry replaces the cell. String
    results are trimmed and lose one pair of matching surrounding quotes.
    Any other value is returned unchanged.
    """
    value = raw_value
    lookup_name = schema_item.get("DN")
    if isinstance(lookup_name, str) and isinstance(value, int):
        table = value_dicts.get(lookup_name)
        if isinstance(table, list) and 0 <= value < len(table):
            value = table[value]

    if not isinstance(value, str):
        return value

    text = value.strip()
    is_quoted = len(text) >= 2 and text[0] in ("'", '"') and text[-1] == text[0]
    return text[1:-1] if is_quoted else text
@@ -0,0 +1,32 @@
1
+ from __future__ import annotations
2
+
3
+ import httpx
4
+ from fastapi import HTTPException
5
+
6
+
7
async def ask_vanna(
    vanna_service_url: str,
    question: str,
    llm_override: dict[str, object] | None = None,
) -> dict[str, object]:
    """Forward a question to the Vanna service and return its JSON answer.

    Args:
        vanna_service_url: Base URL of the Vanna service; a trailing slash
            is tolerated.
        question: Question sent as the ``question`` field.
        llm_override: Optional settings sent as ``llm_override`` when truthy.

    Returns:
        The decoded JSON body of the ``/ask`` response.

    Raises:
        HTTPException: 503 when the service cannot be reached, the upstream
            status code when it answers with an error, and 502 when a
            successful response is not valid JSON.
    """
    payload: dict[str, object] = {"question": question}
    if llm_override:
        payload["llm_override"] = llm_override

    # Strip a trailing slash so the URL never becomes ".../​/ask".
    endpoint = vanna_service_url.rstrip("/") + "/ask"
    try:
        async with httpx.AsyncClient(timeout=60) as client:
            response = await client.post(endpoint, json=payload)
    except httpx.RequestError as exc:
        raise HTTPException(status_code=503, detail=f"Vanna service unavailable: {exc}") from exc

    if response.status_code >= 400:
        try:
            error_body = response.json()
        except ValueError:
            error_body = None
        # Only a JSON object can carry "detail"; otherwise use the raw text.
        if isinstance(error_body, dict):
            detail = error_body.get("detail", response.text)
        else:
            detail = response.text
        raise HTTPException(status_code=response.status_code, detail=detail)

    try:
        return response.json()
    except ValueError as exc:
        raise HTTPException(status_code=502, detail="Vanna service returned a non-JSON response") from exc
@@ -0,0 +1,9 @@
1
+ fastapi==0.115.6
2
+ uvicorn[standard]==0.32.1
3
+ httpx==0.28.1
4
+ PyJWT==2.10.1
5
+ cryptography==44.0.1
6
+ pydantic==2.10.4
7
+ pydantic-settings==2.7.0
8
+ psycopg2-binary==2.9.9
9
+ croniter==6.0.0
@@ -0,0 +1,18 @@
1
+ .env
2
+ .venv
3
+ venv/
4
+
5
+ __pycache__/
6
+ **/__pycache__/
7
+ *.pyc
8
+ *.pyo
9
+ .pytest_cache/
10
+ .mypy_cache/
11
+ .ruff_cache/
12
+
13
+ *.db
14
+ *.sqlite
15
+ *.sqlite3
16
+ *.log
17
+ tmp/
18
+ temp/
@@ -0,0 +1,12 @@
1
+ FROM python:3.11-slim
2
+
3
+ ENV PYTHONDONTWRITEBYTECODE=1
4
+ ENV PYTHONUNBUFFERED=1
5
+
6
+ WORKDIR /app
7
+ COPY requirements.txt /app/requirements.txt
8
+ RUN pip install --no-cache-dir -r /app/requirements.txt
9
+
10
+ COPY app /app/app
11
+
12
+ CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "9000"]
@@ -0,0 +1,57 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+
5
+ from pydantic import Field
6
+ from pydantic_settings import BaseSettings
7
+
8
+
9
class Settings(BaseSettings):
    """Configuration of the Vanna service, loaded from the environment.

    Each field is populated from the variable named by its ``alias``; values
    may also come from a ``.env`` file, and names are matched
    case-insensitively.
    """

    # Pydantic v2 configuration. Replaces the deprecated inner ``class Config``
    # with the same two options.
    model_config = {"env_file": ".env", "case_sensitive": False}

    vanna_host: str = Field(default="0.0.0.0", alias="VANNA_HOST")
    vanna_port: int = Field(default=9000, alias="VANNA_PORT")
    # Required: there is no default DSN.
    vanna_dsn: str = Field(..., alias="VANNA_DSN")
    # model_name() treats "maritaca" specially; any other value selects Ollama.
    vanna_llm_provider: str = Field(default="ollama", alias="VANNA_LLM_PROVIDER")
    vanna_ollama_base_url: str = Field(default="http://ollama:11434", alias="VANNA_OLLAMA_BASE_URL")
    vanna_ollama_model: str = Field(default="sabia-7b", alias="VANNA_OLLAMA_MODEL")
    vanna_maritaca_api_url: str = Field(
        default="https://chat.maritaca.ai/api/chat/completions",
        alias="VANNA_MARITACA_API_URL",
    )
    vanna_maritaca_api_key: str = Field(default="", alias="VANNA_MARITACA_API_KEY")
    vanna_maritaca_model: str = Field(default="sabia-4", alias="VANNA_MARITACA_MODEL")
    vanna_maritaca_timeout_seconds: int = Field(default=60, alias="VANNA_MARITACA_TIMEOUT_SECONDS")
    vanna_vectorstore_path: str = Field(default="/data/vanna/chroma", alias="VANNA_VECTORSTORE_PATH")
    vanna_auto_train: bool = Field(default=True, alias="VANNA_AUTO_TRAIN")
    vanna_allowed_schema: str = Field(default="curated", alias="VANNA_ALLOWED_SCHEMA")
    # Comma-separated "schema.view" entries; read only by
    # effective_allowed_schema() as a fallback source for the schema name.
    allowed_curated_views: str = Field(
        default="",
        alias="ALLOWED_CURATED_VIEWS",
    )
    vanna_max_rows: int = Field(default=200, alias="VANNA_MAX_ROWS")

    def effective_allowed_schema(self) -> str:
        """Return the lower-cased schema name the service should use.

        The configured ``vanna_allowed_schema`` wins whenever it is non-blank
        or the ``VANNA_ALLOWED_SCHEMA`` environment variable is non-empty.
        Otherwise the schema prefixes of ``allowed_curated_views`` are
        inspected: if they all name the same schema, that schema is returned.
        In every remaining case the result is ``"curated"``.
        """
        configured_schema = self.vanna_allowed_schema.strip().lower()
        if os.getenv("VANNA_ALLOWED_SCHEMA") or configured_schema:
            return configured_schema or "curated"

        # Distinct schema prefixes of the dotted entries; entries without a
        # dot are ignored.
        legacy_schemas = {
            item.strip().split(".", 1)[0].lower()
            for item in self.allowed_curated_views.split(",")
            if "." in item.strip()
        }
        if len(legacy_schemas) == 1:
            return next(iter(legacy_schemas))
        return "curated"

    def model_name(self) -> str:
        """Return the model configured for the selected LLM provider."""
        provider = self.vanna_llm_provider.strip().lower()
        if provider == "maritaca":
            return self.vanna_maritaca_model
        return self.vanna_ollama_model


# Shared settings instance, built once when the module is imported.
settings = Settings()
@@ -0,0 +1,108 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from typing import Any
5
+
6
+ from fastapi import FastAPI, HTTPException
7
+ from pydantic import BaseModel, Field
8
+ from sqlalchemy import create_engine, text
9
+
10
+ from .config import settings
11
+ from .sql_guard import SQLGuard
12
+ from .vanna_engine import DataifVannaEngine
13
+
14
+
15
class AskRequest(BaseModel):
    """Request body of the POST /ask endpoint."""

    # Required question text, between 3 and 1000 characters long.
    question: str = Field(min_length=3, max_length=1000)
    # Optional mapping forwarded to the engine as its runtime override.
    llm_override: dict[str, Any] | None = Field(default=None)
18
+
19
+
20
def _allowed_schema() -> str:
    """Return the schema name reported by ``settings.effective_allowed_schema()``."""
    schema_name = settings.effective_allowed_schema()
    return schema_name
22
+
23
+
24
+ def _extract_sql(candidate: str) -> str:
25
+ fenced = re.search(r"```(?:sql)?\s*(.*?)```", candidate, flags=re.IGNORECASE | re.DOTALL)
26
+ return (fenced.group(1) if fenced else candidate).strip()
27
+
28
+
29
+ def _fallback_sql(question: str) -> str:
30
+ lower = question.lower()
31
+ if "cat" in lower or "catálogo" in lower or "catalogo" in lower or "view" in lower:
32
+ return (
33
+ "SELECT relation_group, relation_name, relation_description "
34
+ "FROM curated.vw_pnp_vanna_catalogo ORDER BY relation_group, relation_name LIMIT 50"
35
+ )
36
+ if "total" in lower or "quant" in lower or "matricula" in lower or "matrícula" in lower:
37
+ return (
38
+ "SELECT ano, SUM(matriculas) AS total_matriculas "
39
+ "FROM curated.mv_pnp_dashboard_matriculas "
40
+ "GROUP BY ano ORDER BY ano DESC LIMIT 50"
41
+ )
42
+ if "indicador" in lower or "resumo" in lower or "média" in lower or "media" in lower:
43
+ return (
44
+ "SELECT dominio, indicador, ano, COUNT(*) AS registros, AVG(valor) AS media_valor "
45
+ "FROM curated.vw_pnp_vanna_resumo "
46
+ "GROUP BY dominio, indicador, ano ORDER BY ano DESC, dominio, indicador LIMIT 50"
47
+ )
48
+ return (
49
+ "SELECT run_id, instance_key, dominio, indicador, ano, instituicao, regiao, uf, municipio, valor "
50
+ "FROM curated.vw_pnp_vanna_resumo "
51
+ "ORDER BY ano DESC NULLS LAST, dominio, indicador LIMIT 50"
52
+ )
53
+
54
+
55
# Process-wide objects, created once when the module is imported.
app = FastAPI(
    title="dataif-vanna",
    version="0.1.0",
)

# Engine built from the configured DSN, with pool pre-ping enabled.
engine = create_engine(settings.vanna_dsn, pool_pre_ping=True)

# The schema is computed once here and handed to both the guard and the
# Vanna engine.
allowed_schema = _allowed_schema()
guard = SQLGuard({allowed_schema})
vanna_engine = DataifVannaEngine(settings, engine, allowed_schema)
60
+
61
+
62
@app.get("/health")
def health() -> dict[str, object]:
    """Report service status together with the active LLM configuration."""
    runtime = vanna_engine.runtime_config()

    report: dict[str, object] = {"status": "ok"}
    report["llm_provider"] = runtime.provider
    report["model"] = runtime.model_name()
    report["allowed_schema"] = allowed_schema
    report["llm_available"] = vanna_engine.is_llm_available()
    report["llm_status"] = vanna_engine.provider_status()
    return report
73
+
74
+
75
@app.post("/train")
def train() -> dict[str, object]:
    """Force a training run and report the schema in use.

    Any exception raised by the training call is converted into a 503
    response carrying the exception text.
    """
    try:
        vanna_engine.train_once(force=True)
    except Exception as exc:
        message = f"Vanna unavailable for training: {exc}"
        raise HTTPException(status_code=503, detail=message) from exc
    else:
        return {"status": "ok", "allowed_schema": allowed_schema}
82
+
83
+
84
@app.post("/ask")
def ask(req: AskRequest) -> dict[str, Any]:
    """Turn a question into SQL, run it, and return the rows.

    SQL is requested from the Vanna engine; if that raises for any reason, a
    canned query chosen by ``_fallback_sql`` is used and the failure is
    recorded in ``generation_mode``. The statement is passed through
    ``guard.enforce_limit`` before being executed.

    Returns:
        A dict with the question, the executed SQL, the rows as dicts, their
        count, and the generation mode.

    Raises:
        HTTPException: 400 when the guard rejects the statement or the
            database refuses to execute it; 503 when the database reports an
            operational error.
    """
    # Imported here because only this handler needs the exception types.
    from sqlalchemy.exc import OperationalError, SQLAlchemyError

    generation_mode = "vanna"
    try:
        sql = _extract_sql(vanna_engine.generate_sql(req.question, runtime_override=req.llm_override))
    except Exception as exc:
        # Deliberate best-effort: degrade to a canned query instead of failing.
        generation_mode = f"fallback: {exc}"
        sql = _fallback_sql(req.question)

    try:
        sql = guard.enforce_limit(sql, settings.vanna_max_rows)
    except ValueError as exc:
        raise HTTPException(status_code=400, detail=str(exc)) from exc

    # Generated SQL can be invalid; report database errors as HTTP errors
    # with a readable detail instead of an unhandled 500.
    try:
        with engine.begin() as conn:
            rows = conn.execute(text(sql)).fetchmany(settings.vanna_max_rows)
    except OperationalError as exc:
        raise HTTPException(status_code=503, detail=f"Database operation failed: {exc}") from exc
    except SQLAlchemyError as exc:
        raise HTTPException(status_code=400, detail=f"Generated SQL could not be executed: {exc}") from exc

    items = [dict(row._mapping) for row in rows]
    return {
        "question": req.question,
        "sql": sql,
        "rows": items,
        "row_count": len(items),
        "generation_mode": generation_mode,
    }