@dataif/cli 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (183) hide show
  1. package/README.md +16 -0
  2. package/bin/dataif.js +623 -0
  3. package/package.json +26 -0
  4. package/scripts/build-template.mjs +72 -0
  5. package/templates/dataif/README.md +157 -0
  6. package/templates/dataif/infra/.env.example +119 -0
  7. package/templates/dataif/infra/.env.stg.example +119 -0
  8. package/templates/dataif/infra/airflow/Dockerfile +11 -0
  9. package/templates/dataif/infra/airflow/Dockerfile.release +17 -0
  10. package/templates/dataif/infra/airflow/requirements.txt +3 -0
  11. package/templates/dataif/infra/docker-compose.yml +306 -0
  12. package/templates/dataif/infra/init-db/01-init-dataif.sh +129 -0
  13. package/templates/dataif/infra/init-db/pnp-curated-views.sqlinc +444 -0
  14. package/templates/dataif/infra/init-db/pnp-raw-staging-curated.sqlinc +701 -0
  15. package/templates/dataif/infra/keycloak/Dockerfile +4 -0
  16. package/templates/dataif/infra/keycloak/realm-dataif.json +73 -0
  17. package/templates/dataif/infra/ollama/Dockerfile +9 -0
  18. package/templates/dataif/infra/ollama/bootstrap-model.sh +100 -0
  19. package/templates/dataif/infra/ollama/sabia-7b.Modelfile +14 -0
  20. package/templates/dataif/infra/postgres/Dockerfile +4 -0
  21. package/templates/dataif/pipelines/airflow/dags/generated/.gitkeep +1 -0
  22. package/templates/dataif/pipelines/airflow/dags/generated/2020_financeiro_fcc6f1f3_sync.py +9 -0
  23. package/templates/dataif/pipelines/dataif_pipelines/__init__.py +1 -0
  24. package/templates/dataif/pipelines/dataif_pipelines/airflow/__init__.py +1 -0
  25. package/templates/dataif/pipelines/dataif_pipelines/airflow/pnp_pipeline_factory.py +167 -0
  26. package/templates/dataif/pipelines/dataif_pipelines/connectors/__init__.py +1 -0
  27. package/templates/dataif/pipelines/dataif_pipelines/connectors/base/__init__.py +1 -0
  28. package/templates/dataif/pipelines/dataif_pipelines/connectors/base/connector.py +28 -0
  29. package/templates/dataif/pipelines/dataif_pipelines/connectors/base/types.py +14 -0
  30. package/templates/dataif/pipelines/dataif_pipelines/connectors/nilo_pecanha/__init__.py +1 -0
  31. package/templates/dataif/pipelines/dataif_pipelines/connectors/nilo_pecanha/config.py +19 -0
  32. package/templates/dataif/pipelines/dataif_pipelines/connectors/nilo_pecanha/connector.py +558 -0
  33. package/templates/dataif/pipelines/dataif_pipelines/connectors/nilo_pecanha/powerbi_microdados.py +728 -0
  34. package/templates/dataif/pipelines/dataif_pipelines/connectors/nilo_pecanha/transform.py +296 -0
  35. package/templates/dataif/pipelines/dataif_pipelines/jobs/__init__.py +1 -0
  36. package/templates/dataif/pipelines/dataif_pipelines/jobs/nilo_pipeline.py +112 -0
  37. package/templates/dataif/pipelines/dataif_pipelines/orchestration/__init__.py +21 -0
  38. package/templates/dataif/pipelines/dataif_pipelines/orchestration/pnp_workflow.py +783 -0
  39. package/templates/dataif/pipelines/dataif_pipelines/repositories/__init__.py +1 -0
  40. package/templates/dataif/pipelines/dataif_pipelines/repositories/pnp_raw_repository.py +860 -0
  41. package/templates/dataif/pipelines/dataif_pipelines/services/__init__.py +19 -0
  42. package/templates/dataif/pipelines/dataif_pipelines/services/pnp_curated_service.py +66 -0
  43. package/templates/dataif/pipelines/dataif_pipelines/services/pnp_download_service.py +534 -0
  44. package/templates/dataif/pipelines/dataif_pipelines/services/pnp_quality_service.py +9 -0
  45. package/templates/dataif/pipelines/dataif_pipelines/services/pnp_raw_ingestion_service.py +124 -0
  46. package/templates/dataif/pipelines/dataif_pipelines/services/pnp_staging_service.py +271 -0
  47. package/templates/dataif/pipelines/dataif_pipelines/services/powerbi_catalog_service.py +159 -0
  48. package/templates/dataif/pipelines/sql/staging/020_pnp_matriculas.sql +112 -0
  49. package/templates/dataif/pipelines/sql/staging/030_pnp_eficiencia_academica.sql +83 -0
  50. package/templates/dataif/pipelines/sql/staging/040_pnp_servidores.sql +90 -0
  51. package/templates/dataif/pipelines/sql/staging/050_pnp_financeiro.sql +72 -0
  52. package/templates/dataif/pipelines/sql/views_curated/004_mv_pnp_dashboard_fast.sql +204 -0
  53. package/templates/dataif/pipelines/sql/views_curated/010_vw_pnp_admin_ingestao.sql +51 -0
  54. package/templates/dataif/pipelines/sql/views_curated/020_vw_pnp_qualidade_dados.sql +114 -0
  55. package/templates/dataif/pipelines/sql/views_curated/030_vw_pnp_matriculas.sql +67 -0
  56. package/templates/dataif/pipelines/sql/views_curated/040_vw_pnp_eficiencia.sql +33 -0
  57. package/templates/dataif/pipelines/sql/views_curated/050_vw_pnp_servidores.sql +30 -0
  58. package/templates/dataif/pipelines/sql/views_curated/060_vw_pnp_financeiro.sql +22 -0
  59. package/templates/dataif/pipelines/sql/views_curated/070_vw_pnp_vanna.sql +115 -0
  60. package/templates/dataif/scripts/configure-env.sh +149 -0
  61. package/templates/dataif/scripts/create_metabase_pnp_dashboard.py +943 -0
  62. package/templates/dataif/scripts/create_metabase_pnp_matriculas_dashboard.py +580 -0
  63. package/templates/dataif/scripts/deploy.sh +79 -0
  64. package/templates/dataif/scripts/fix_metabase_template_tag_ids.py +91 -0
  65. package/templates/dataif/scripts/pnp_powerbi_microdados_probe.py +14 -0
  66. package/templates/dataif/scripts/pnp_validate_raw_run.py +330 -0
  67. package/templates/dataif/scripts/publish-images.sh +31 -0
  68. package/templates/dataif/scripts/sync_metabase_dashboard_field_filters.py +241 -0
  69. package/templates/dataif/scripts/use-vanna-ollama.sh +139 -0
  70. package/templates/dataif/services/api/.dockerignore +18 -0
  71. package/templates/dataif/services/api/Dockerfile +12 -0
  72. package/templates/dataif/services/api/app/__init__.py +1 -0
  73. package/templates/dataif/services/api/app/auth.py +48 -0
  74. package/templates/dataif/services/api/app/config.py +59 -0
  75. package/templates/dataif/services/api/app/keycloak_admin.py +215 -0
  76. package/templates/dataif/services/api/app/main.py +2432 -0
  77. package/templates/dataif/services/api/app/metabase_admin.py +191 -0
  78. package/templates/dataif/services/api/app/metabase_bootstrap.py +44 -0
  79. package/templates/dataif/services/api/app/metabase_embed.py +15 -0
  80. package/templates/dataif/services/api/app/pnp_dag_provisioner.py +113 -0
  81. package/templates/dataif/services/api/app/pnp_instance_repository.py +951 -0
  82. package/templates/dataif/services/api/app/pnp_powerbi.py +438 -0
  83. package/templates/dataif/services/api/app/vanna_client.py +32 -0
  84. package/templates/dataif/services/api/requirements.txt +9 -0
  85. package/templates/dataif/services/vanna/.dockerignore +18 -0
  86. package/templates/dataif/services/vanna/Dockerfile +12 -0
  87. package/templates/dataif/services/vanna/app/config.py +57 -0
  88. package/templates/dataif/services/vanna/app/main.py +108 -0
  89. package/templates/dataif/services/vanna/app/runtime_config.py +114 -0
  90. package/templates/dataif/services/vanna/app/sql_guard.py +123 -0
  91. package/templates/dataif/services/vanna/app/vanna_engine.py +382 -0
  92. package/templates/dataif/services/vanna/requirements.txt +8 -0
  93. package/templates/dataif/services/web/.dockerignore +13 -0
  94. package/templates/dataif/services/web/Dockerfile +16 -0
  95. package/templates/dataif/services/web/index.html +12 -0
  96. package/templates/dataif/services/web/nginx.conf +74 -0
  97. package/templates/dataif/services/web/package-lock.json +4397 -0
  98. package/templates/dataif/services/web/package.json +32 -0
  99. package/templates/dataif/services/web/postcss.config.mjs +5 -0
  100. package/templates/dataif/services/web/src/App.jsx +2817 -0
  101. package/templates/dataif/services/web/src/adminAuth.js +245 -0
  102. package/templates/dataif/services/web/src/assets/avatar_placeholder.png +0 -0
  103. package/templates/dataif/services/web/src/assets/github_logo_icon_229278.svg +1 -0
  104. package/templates/dataif/services/web/src/assets/if-logo.png +0 -0
  105. package/templates/dataif/services/web/src/assets/if.svg +0 -0
  106. package/templates/dataif/services/web/src/assets/pnp-horizontal.svg +1 -0
  107. package/templates/dataif/services/web/src/components/AppHeader.jsx +233 -0
  108. package/templates/dataif/services/web/src/components/application/app-navigation/base-components/mobile-header.tsx +56 -0
  109. package/templates/dataif/services/web/src/components/application/app-navigation/base-components/nav-account-card.tsx +209 -0
  110. package/templates/dataif/services/web/src/components/application/app-navigation/base-components/nav-item-button.tsx +67 -0
  111. package/templates/dataif/services/web/src/components/application/app-navigation/base-components/nav-item.tsx +108 -0
  112. package/templates/dataif/services/web/src/components/application/app-navigation/base-components/nav-list.tsx +83 -0
  113. package/templates/dataif/services/web/src/components/application/app-navigation/config.ts +23 -0
  114. package/templates/dataif/services/web/src/components/application/app-navigation/header-navigation.tsx +240 -0
  115. package/templates/dataif/services/web/src/components/application/pagination/pagination-base.tsx +376 -0
  116. package/templates/dataif/services/web/src/components/application/pagination/pagination-dot.tsx +52 -0
  117. package/templates/dataif/services/web/src/components/application/pagination/pagination-line.tsx +48 -0
  118. package/templates/dataif/services/web/src/components/application/pagination/pagination.tsx +328 -0
  119. package/templates/dataif/services/web/src/components/application/tabs/tabs.tsx +223 -0
  120. package/templates/dataif/services/web/src/components/base/avatar/avatar-label-group.tsx +28 -0
  121. package/templates/dataif/services/web/src/components/base/avatar/avatar.tsx +129 -0
  122. package/templates/dataif/services/web/src/components/base/avatar/base-components/avatar-add-button.tsx +32 -0
  123. package/templates/dataif/services/web/src/components/base/avatar/base-components/avatar-company-icon.tsx +24 -0
  124. package/templates/dataif/services/web/src/components/base/avatar/base-components/avatar-online-indicator.tsx +29 -0
  125. package/templates/dataif/services/web/src/components/base/avatar/base-components/index.tsx +4 -0
  126. package/templates/dataif/services/web/src/components/base/avatar/base-components/verified-tick.tsx +32 -0
  127. package/templates/dataif/services/web/src/components/base/badges/badge-types.ts +264 -0
  128. package/templates/dataif/services/web/src/components/base/badges/badges.tsx +415 -0
  129. package/templates/dataif/services/web/src/components/base/button-group/button-group.tsx +104 -0
  130. package/templates/dataif/services/web/src/components/base/buttons/button.tsx +267 -0
  131. package/templates/dataif/services/web/src/components/base/input/hint-text.tsx +31 -0
  132. package/templates/dataif/services/web/src/components/base/input/input.tsx +269 -0
  133. package/templates/dataif/services/web/src/components/base/input/label.tsx +48 -0
  134. package/templates/dataif/services/web/src/components/base/radio-buttons/radio-buttons.tsx +127 -0
  135. package/templates/dataif/services/web/src/components/base/select/combobox.tsx +150 -0
  136. package/templates/dataif/services/web/src/components/base/select/multi-select.tsx +361 -0
  137. package/templates/dataif/services/web/src/components/base/select/popover.tsx +32 -0
  138. package/templates/dataif/services/web/src/components/base/select/select-item.tsx +95 -0
  139. package/templates/dataif/services/web/src/components/base/select/select-native.tsx +67 -0
  140. package/templates/dataif/services/web/src/components/base/select/select.tsx +144 -0
  141. package/templates/dataif/services/web/src/components/base/tags/base-components/tag-close-x.tsx +32 -0
  142. package/templates/dataif/services/web/src/components/base/tooltip/tooltip.tsx +107 -0
  143. package/templates/dataif/services/web/src/components/foundations/dot-icon.tsx +22 -0
  144. package/templates/dataif/services/web/src/components/foundations/logo/untitledui-logo-minimal.tsx +170 -0
  145. package/templates/dataif/services/web/src/components/foundations/logo/untitledui-logo.tsx +58 -0
  146. package/templates/dataif/services/web/src/hooks/use-breakpoint.ts +34 -0
  147. package/templates/dataif/services/web/src/hooks/use-resize-observer.ts +67 -0
  148. package/templates/dataif/services/web/src/main.jsx +14 -0
  149. package/templates/dataif/services/web/src/providers/theme-provider.jsx +62 -0
  150. package/templates/dataif/services/web/src/styles/globals.css +60 -0
  151. package/templates/dataif/services/web/src/styles/theme.css +1326 -0
  152. package/templates/dataif/services/web/src/styles/typography.css +430 -0
  153. package/templates/dataif/services/web/src/styles.css +1287 -0
  154. package/templates/dataif/services/web/src/utils/cx.ts +24 -0
  155. package/templates/dataif/services/web/src/utils/is-react-component.ts +33 -0
  156. package/templates/dataif/services/web/vite.config.js +14 -0
  157. package/templates/dataif/sql/ddl/001_schemas.sql +6 -0
  158. package/templates/dataif/sql/ddl/003_pnp_raw_staging_curated.sql +699 -0
  159. package/templates/dataif/sql/migrations/001_pnp_phase1_backfill.sql +3 -0
  160. package/templates/dataif/sql/migrations/002_pnp_phase2_admin_config_backfill.sql +184 -0
  161. package/templates/dataif/sql/migrations/003_pnp_phase3_raw_tabular_backfill.sql +3 -0
  162. package/templates/dataif/sql/migrations/004_pnp_phase3_raw_backfill_support_index.sql +3 -0
  163. package/templates/dataif/sql/migrations/005_pnp_phase7_staging_support_indexes.sql +2 -0
  164. package/templates/dataif/sql/migrations/006_pnp_phase7_staging_autovacuum_tuning.sql +2 -0
  165. package/templates/dataif/sql/migrations/007_pnp_phase7b_run_packages.sql +20 -0
  166. package/templates/dataif/sql/migrations/008_pnp_phase7a_pipeline_endpoints.sql +169 -0
  167. package/templates/dataif/sql/migrations/009_pnp_phase8_curated.sql +35 -0
  168. package/templates/dataif/sql/migrations/010_pnp_phase10_staging_incremental_upsert.sql +3 -0
  169. package/templates/dataif/sql/migrations/010_pnp_pipeline_uuid.sql +51 -0
  170. package/templates/dataif/sql/migrations/011_app_settings.sql +7 -0
  171. package/templates/dataif/sql/staging/020_pnp_matriculas.sql +112 -0
  172. package/templates/dataif/sql/staging/030_pnp_eficiencia_academica.sql +83 -0
  173. package/templates/dataif/sql/staging/040_pnp_servidores.sql +90 -0
  174. package/templates/dataif/sql/staging/050_pnp_financeiro.sql +72 -0
  175. package/templates/dataif/sql/views_curated/003_vw_pnp_microdados_admin.sql +160 -0
  176. package/templates/dataif/sql/views_curated/004_mv_pnp_dashboard_fast.sql +204 -0
  177. package/templates/dataif/sql/views_curated/010_vw_pnp_admin_ingestao.sql +51 -0
  178. package/templates/dataif/sql/views_curated/020_vw_pnp_qualidade_dados.sql +114 -0
  179. package/templates/dataif/sql/views_curated/030_vw_pnp_matriculas.sql +67 -0
  180. package/templates/dataif/sql/views_curated/040_vw_pnp_eficiencia.sql +33 -0
  181. package/templates/dataif/sql/views_curated/050_vw_pnp_servidores.sql +30 -0
  182. package/templates/dataif/sql/views_curated/060_vw_pnp_financeiro.sql +22 -0
  183. package/templates/dataif/sql/views_curated/070_vw_pnp_vanna.sql +115 -0
@@ -0,0 +1,114 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Any
5
+
6
+ from sqlalchemy import Engine, text
7
+ from sqlalchemy.exc import SQLAlchemyError
8
+
9
+
10
+ VANNA_LLM_SETTING_KEY = "vanna.llm_config"
11
+
12
+
13
@dataclass(frozen=True)
class RuntimeVannaConfig:
    """Immutable snapshot of the settings used to run the Vanna LLM layer.

    The field order is significant: it is the positional order of the
    generated constructor and the order of the values in ``signature()``.
    """

    # Active LLM backend; ``model_name`` special-cases the value "maritaca".
    provider: str
    ollama_base_url: str
    ollama_model: str
    maritaca_api_url: str
    maritaca_api_key: str
    maritaca_model: str
    maritaca_timeout_seconds: int
    allowed_schema: str
    vectorstore_path: str
    auto_train: bool
    max_rows: int

    def signature(self) -> tuple[object, ...]:
        """Return all field values, in declaration order, as a single tuple.

        Two configurations with equal signatures are field-for-field equal.
        """
        return tuple(getattr(self, field_name) for field_name in self.__dataclass_fields__)

    def model_name(self) -> str:
        """Return the model of the active provider.

        The Maritaca model is returned only when ``provider`` is exactly
        ``"maritaca"``; every other provider value yields the Ollama model.
        """
        return self.maritaca_model if self.provider == "maritaca" else self.ollama_model
46
+
47
+
48
def load_runtime_vanna_config(base_settings: Any, engine: Engine) -> RuntimeVannaConfig:
    """Build the effective Vanna configuration.

    Values are first taken from ``base_settings`` (with hard-coded fallbacks
    for blank strings) and then overlaid with the LLM settings document read
    through ``_read_persisted_llm_settings``. Only the provider and the
    Ollama/Maritaca connection fields can be overridden by the persisted
    document; schema, vector-store path, auto-train and row limit always come
    from ``base_settings``.

    Args:
        base_settings: Object exposing the ``vanna_*`` attributes and
            ``effective_allowed_schema()`` read below.
        engine: SQLAlchemy engine used to read the persisted settings.

    Returns:
        The merged, immutable runtime configuration.
    """

    def _clean(raw: object, fallback: str = "") -> str:
        # NOTE(review): str(None) is the truthy text "None"; this assumes the
        # settings attributes are never None -- confirm against Settings.
        return str(raw).strip() or fallback

    resolved: dict[str, Any] = {
        "provider": _clean(base_settings.vanna_llm_provider).lower() or "ollama",
        "ollama_base_url": _clean(base_settings.vanna_ollama_base_url, "http://ollama:11434"),
        "ollama_model": _clean(base_settings.vanna_ollama_model, "sabia-7b"),
        "maritaca_api_url": _clean(base_settings.vanna_maritaca_api_url),
        # The API key is deliberately not stripped.
        "maritaca_api_key": str(base_settings.vanna_maritaca_api_key),
        "maritaca_model": _clean(base_settings.vanna_maritaca_model, "sabia-4"),
        "maritaca_timeout_seconds": int(base_settings.vanna_maritaca_timeout_seconds),
        "allowed_schema": _clean(base_settings.effective_allowed_schema()).lower() or "curated",
        "vectorstore_path": _clean(base_settings.vanna_vectorstore_path, "/data/vanna/chroma"),
        "auto_train": bool(base_settings.vanna_auto_train),
        "max_rows": int(base_settings.vanna_max_rows),
    }

    stored = _read_persisted_llm_settings(engine)
    if not isinstance(stored, dict):
        return RuntimeVannaConfig(**resolved)

    def _section(name: str) -> dict[str, Any]:
        # A missing or non-dict section behaves like an empty one.
        candidate = stored.get(name)
        return candidate if isinstance(candidate, dict) else {}

    def _overlay(section: dict[str, Any], key: str, field_name: str) -> str:
        # A falsy or blank persisted value keeps the settings-derived value.
        current = resolved[field_name]
        return str(section.get(key) or current).strip() or current

    provider = str(stored.get("provider") or resolved["provider"]).strip().lower() or "ollama"
    ollama_section = _section("ollama")
    maritaca_section = _section("maritaca")

    # Every keyword value is computed from the pre-overlay ``resolved`` values
    # before ``update`` mutates the dict.
    resolved.update(
        provider=provider,
        ollama_base_url=_overlay(ollama_section, "base_url", "ollama_base_url"),
        ollama_model=_overlay(ollama_section, "model", "ollama_model"),
        maritaca_api_url=_overlay(maritaca_section, "api_url", "maritaca_api_url"),
        maritaca_api_key=str(maritaca_section.get("api_key") or resolved["maritaca_api_key"]),
        maritaca_model=_overlay(maritaca_section, "model", "maritaca_model"),
        maritaca_timeout_seconds=_coerce_positive_int(
            maritaca_section.get("timeout_seconds"), resolved["maritaca_timeout_seconds"]
        ),
    )
    return RuntimeVannaConfig(**resolved)
86
+
87
+
88
def _read_persisted_llm_settings(engine: Engine) -> dict[str, Any] | None:
    """Fetch the persisted Vanna LLM settings document.

    Reads the ``setting_value`` of the ``VANNA_LLM_SETTING_KEY`` row in
    ``config.app_settings``.

    Returns:
        The stored value when it is a ``dict``; ``None`` when the query
        fails with a ``SQLAlchemyError``, when no row exists, or when the
        stored value is not a ``dict``.
    """
    try:
        query = text("SELECT setting_value FROM config.app_settings WHERE setting_key = :setting_key")
        with engine.begin() as connection:
            record = connection.execute(query, {"setting_key": VANNA_LLM_SETTING_KEY}).mappings().first()
    except SQLAlchemyError:
        # Best effort: a database failure is treated as "nothing persisted".
        return None

    if record:
        stored = record.get("setting_value")
        if isinstance(stored, dict):
            return stored
    return None
102
+
103
+
104
+ def _coerce_positive_int(value: object, default: int) -> int:
105
+ if isinstance(value, bool):
106
+ return int(value) or default
107
+ if isinstance(value, int):
108
+ return value if value > 0 else default
109
+ if isinstance(value, str):
110
+ normalized = value.strip()
111
+ if normalized.isdigit():
112
+ parsed = int(normalized)
113
+ return parsed if parsed > 0 else default
114
+ return default
@@ -0,0 +1,123 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+
5
+
6
class SQLGuard:
    """Conservative validator for generated, read-only SQL.

    A statement is accepted only when, after comments are removed and
    whitespace is collapsed, it:

    * starts with ``select`` and contains a single statement;
    * contains none of the forbidden DML/DDL keywords (the check also looks
      inside string literals, so it errs on the side of rejecting);
    * references no schema listed in ``FORBIDDEN_SCHEMAS``;
    * references at least one relation after ``FROM``/``JOIN`` (or after a
      comma inside a ``FROM`` clause), and every such relation is written as
      ``schema.relation`` with ``schema`` in the allowed set.

    The checks are regex based, not a SQL parser.
    """

    FORBIDDEN_SCHEMAS = {
        "audit",
        "config",
        "information_schema",
        "mart",
        "pg_catalog",
        "public",
        "raw",
        "staging",
    }

    # Applied to the lower-cased statement, hence no IGNORECASE flag.
    FORBIDDEN_KEYWORD_PATTERN = re.compile(
        r"\b(?:insert|update|delete|drop|alter|truncate|create|grant|revoke)\b"
    )

    # Identifier (optionally quoted, optionally schema-qualified) that
    # directly follows FROM or JOIN.
    RELATION_PATTERN = re.compile(
        r"\b(?:from|join)\s+("
        r"(?:\"[^\"]+\"|[a-zA-Z_][a-zA-Z0-9_]*)"
        r"(?:\s*\.\s*(?:\"[^\"]+\"|[a-zA-Z_][a-zA-Z0-9_]*))?"
        r")",
        flags=re.IGNORECASE,
    )
    # Text between FROM and the next clause keyword (or end of statement).
    FROM_CLAUSE_PATTERN = re.compile(
        r"\bfrom\b\s+(.*?)(?="
        r"\bwhere\b|\bgroup\s+by\b|\border\s+by\b|\blimit\b|\boffset\b|"
        r"\bunion\b|\bexcept\b|\bintersect\b|\bhaving\b|$"
        r")",
        flags=re.IGNORECASE,
    )
    # Identifier that follows a comma; used inside a FROM clause to catch
    # comma-separated relation lists.
    COMMA_RELATION_PATTERN = re.compile(
        r",\s*("
        r"(?:\"[^\"]+\"|[a-zA-Z_][a-zA-Z0-9_]*)"
        r"(?:\s*\.\s*(?:\"[^\"]+\"|[a-zA-Z_][a-zA-Z0-9_]*))?"
        r")",
        flags=re.IGNORECASE,
    )
    # First component of a three-part name ``a.b.c``.
    QUALIFIED_SCHEMA_PATTERN = re.compile(
        r"\b([a-zA-Z_][a-zA-Z0-9_]*)\s*\.\s*[a-zA-Z_][a-zA-Z0-9_]*\s*\.",
        flags=re.IGNORECASE,
    )
    # ``LIMIT n`` that closes the whole statement, optionally followed by
    # ``OFFSET m [ROW|ROWS]``. A LIMIT inside a sub-query is followed by ")"
    # and therefore does not match.
    TRAILING_LIMIT_PATTERN = re.compile(
        r"\blimit\s+(\d+)(?=(?:\s+offset\s+\d+(?:\s+rows?)?)?\s*$)",
        flags=re.IGNORECASE,
    )

    def __init__(self, allowed_schemas: set[str] | list[str] | tuple[str, ...] | None = None) -> None:
        """Create a guard for the given schemas (default: ``{"curated"}``).

        Names are lower-cased and stripped; blank names are dropped. A single
        schema passed as a bare string is accepted and treated as a one-item
        collection instead of being iterated character by character.
        """
        if isinstance(allowed_schemas, str):
            allowed_schemas = {allowed_schemas}
        schemas = allowed_schemas or {"curated"}
        self.allowed_schemas = {schema.lower().strip() for schema in schemas if schema.strip()}

    def validate(self, sql: str) -> None:
        """Raise ``ValueError`` when *sql* violates any rule of the guard.

        Returns ``None`` when the statement is acceptable.
        """
        stripped = self._normalize(sql)
        if not stripped.startswith("select"):
            raise ValueError("Only SELECT statements are allowed")
        # Trailing semicolons are tolerated; one anywhere else is rejected.
        if ";" in stripped.rstrip(";"):
            raise ValueError("Only one SQL statement is allowed")

        if self.FORBIDDEN_KEYWORD_PATTERN.search(stripped):
            raise ValueError("Forbidden SQL keyword")

        # Schema checks ignore the contents of string literals.
        sql_without_literals = self._strip_string_literals(stripped)
        schema_refs = set(re.findall(r"\b([a-zA-Z_][a-zA-Z0-9_]*)\s*\.", sql_without_literals))
        blocked_schemas = schema_refs.intersection(self.FORBIDDEN_SCHEMAS)
        if blocked_schemas:
            raise ValueError(f"Schema not allowed: {sorted(blocked_schemas)[0]}")

        for schema in self.QUALIFIED_SCHEMA_PATTERN.findall(sql_without_literals):
            if schema.lower() not in self.allowed_schemas:
                raise ValueError(f"Schema not allowed: {schema.lower()}")

        matched_relations = self._relation_references(sql_without_literals)
        if not matched_relations:
            raise ValueError("SQL must reference at least one allowed relation")

        for relation in matched_relations:
            relation_clean = self._clean_identifier(relation)
            parts = [part for part in relation_clean.split(".") if part]
            if len(parts) != 2:
                raise ValueError(f"Relation must be schema-qualified: {relation_clean}")
            schema = parts[0]
            if schema not in self.allowed_schemas:
                raise ValueError(f"Schema not allowed: {schema}")

    def enforce_limit(self, sql: str, max_rows: int) -> str:
        """Validate *sql* and return it with at most *max_rows* rows requested.

        The returned statement has comments removed, whitespace collapsed and
        trailing semicolons dropped.

        * No statement-level ``LIMIT n``: ``LIMIT max_rows`` is appended. A
          ``LIMIT`` that appears only inside a sub-query does not count.
        * Statement-level ``LIMIT n`` with ``n <= max_rows``: kept unchanged.
        * Statement-level ``LIMIT n`` with ``n > max_rows``: ``n`` is replaced
          by ``max_rows`` (any trailing ``OFFSET`` is preserved).

        Raises:
            ValueError: propagated from ``validate``.
        """
        self.validate(sql)
        statement = self._compact_original(sql).rstrip(";")
        trailing = self.TRAILING_LIMIT_PATTERN.search(statement)
        if trailing is None:
            return f"{statement} LIMIT {max_rows}"
        if int(trailing.group(1)) <= max_rows:
            return statement
        # Cap only the number, leaving the keyword's casing and OFFSET intact.
        return f"{statement[:trailing.start(1)]}{max_rows}{statement[trailing.end(1):]}"

    @staticmethod
    def _normalize(sql: str) -> str:
        """Return the comment-free, whitespace-collapsed, lower-cased SQL."""
        return SQLGuard._compact_original(sql).lower()

    @staticmethod
    def _compact_original(sql: str) -> str:
        """Remove ``/* */`` and ``--`` comments and collapse whitespace.

        Letter case is preserved. Comment markers are removed textually, so
        markers that occur inside string literals are stripped too.
        """
        without_block_comments = re.sub(r"/\*.*?\*/", " ", sql, flags=re.DOTALL)
        without_line_comments = re.sub(r"--.*?$", " ", without_block_comments, flags=re.MULTILINE)
        return re.sub(r"\s+", " ", without_line_comments).strip()

    @classmethod
    def _relation_references(cls, sql: str) -> list[str]:
        """Collect identifiers used as relations in *sql*.

        Includes the identifier after every FROM/JOIN plus every identifier
        that follows a comma inside a FROM clause.
        """
        relations = [match.group(1) for match in cls.RELATION_PATTERN.finditer(sql)]
        for clause_match in cls.FROM_CLAUSE_PATTERN.finditer(sql):
            relations.extend(match.group(1) for match in cls.COMMA_RELATION_PATTERN.finditer(clause_match.group(1)))
        return relations

    @staticmethod
    def _clean_identifier(identifier: str) -> str:
        """Drop whitespace, double quotes and trailing ``;``; lower-case."""
        return re.sub(r'\s+', "", identifier.strip().rstrip(";")).replace('"', "").lower()

    @staticmethod
    def _strip_string_literals(sql: str) -> str:
        """Replace every single-quoted literal with an empty literal."""
        return re.sub(r"'(?:''|[^'])*'", "''", sql)
@@ -0,0 +1,382 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import re
5
+ from dataclasses import dataclass, replace
6
+ from pathlib import Path
7
+ from threading import Lock
8
+ from typing import TYPE_CHECKING, Any
9
+ from urllib.error import HTTPError, URLError
10
+ from urllib.request import Request, urlopen
11
+
12
+ if TYPE_CHECKING:
13
+ from .config import Settings
14
+ from sqlalchemy import Engine
15
+
16
+ from .runtime_config import RuntimeVannaConfig, load_runtime_vanna_config
17
+
18
+
19
@dataclass(frozen=True)
class CuratedRelation:
    """Immutable description of one relation and its columns."""

    schema_name: str
    relation_name: str
    relation_type: str
    # Pairs of strings, one pair per column.
    columns: tuple[tuple[str, str], ...]

    @property
    def full_name(self) -> str:
        """Return the relation name qualified as ``schema.relation``."""
        return "{}.{}".format(self.schema_name, self.relation_name)
29
+
30
+
31
class MaritacaChat:
    """Small chat client that posts prompts to the Maritaca completions API.

    Configuration keys read from ``config``: ``maritaca_api_key`` (required
    when a prompt is submitted), ``maritaca_api_url``, ``model`` and
    ``maritaca_timeout_seconds``.
    """

    def __init__(self, config: dict[str, Any] | None = None):
        self.config = config or {}

    def system_message(self, message: str) -> dict[str, str]:
        """Wrap *message* as a ``system`` chat message."""
        return {"role": "system", "content": message}

    def user_message(self, message: str) -> dict[str, str]:
        """Wrap *message* as a ``user`` chat message."""
        return {"role": "user", "content": message}

    def assistant_message(self, message: str) -> dict[str, str]:
        """Wrap *message* as an ``assistant`` chat message."""
        return {"role": "assistant", "content": message}

    def submit_prompt(self, prompt: Any, **_: Any) -> str:
        """Send *prompt* to the API and return the first choice's text.

        Args:
            prompt: A list of chat messages (or arbitrary items) or any other
                value; see ``_messages`` for the normalisation rules.
            **_: Extra keyword arguments are accepted and ignored.

        Raises:
            RuntimeError: when the API key is missing, the HTTP request
                fails, or the response body is not a valid completion.
        """
        api_key = str(self.config.get("maritaca_api_key") or "").strip()
        if not api_key:
            raise RuntimeError("VANNA_MARITACA_API_KEY is required for VANNA_LLM_PROVIDER=maritaca")

        api_url = str(
            self.config.get("maritaca_api_url") or "https://chat.maritaca.ai/api/chat/completions"
        ).strip()
        model = str(self.config.get("model") or "sabia-4").strip()
        timeout = int(self.config.get("maritaca_timeout_seconds") or 60)
        payload = json.dumps({"model": model, "messages": self._messages(prompt)}).encode("utf-8")
        request = Request(
            api_url,
            data=payload,
            headers={
                "Authorization": f"Bearer {api_key}",
                "Content-Type": "application/json",
                "Accept": "application/json",
            },
            method="POST",
        )

        try:
            with urlopen(request, timeout=timeout) as response:
                raw = response.read()
        except HTTPError as exc:
            # HTTPError must be handled first: it is a URLError subclass and
            # carries the response body with the server's explanation.
            detail = exc.read().decode("utf-8", errors="replace")
            raise RuntimeError(f"Maritaca API returned HTTP {exc.code}: {detail}") from exc
        except (OSError, TimeoutError, URLError) as exc:
            raise RuntimeError(f"Maritaca API request failed: {exc}") from exc

        return self._parse_response(raw)

    @staticmethod
    def _parse_response(raw: bytes) -> str:
        """Extract ``choices[0].message.content`` from a raw response body.

        Raises:
            RuntimeError: when the body is not UTF-8, not JSON, lacks the
                expected structure, or the content is not a string.
        """
        try:
            data = json.loads(raw.decode("utf-8"))
            content = data["choices"][0]["message"]["content"]
        except (KeyError, IndexError, TypeError, UnicodeDecodeError, json.JSONDecodeError) as exc:
            raise RuntimeError("Maritaca API returned an invalid response") from exc
        if not isinstance(content, str):
            # A null or structured content would break the ``-> str`` contract.
            raise RuntimeError("Maritaca API returned an invalid response")
        return content

    @staticmethod
    def _messages(prompt: Any) -> list[dict[str, str]]:
        """Normalise *prompt* into a list of ``{"role", "content"}`` dicts.

        List items that are dicts with both keys are kept (values coerced to
        ``str``); any other item becomes a ``user`` message. An empty list
        yields a single empty ``user`` message, and a non-list prompt becomes
        one ``user`` message holding ``str(prompt)``.
        """
        if isinstance(prompt, list):
            messages: list[dict[str, str]] = []
            for item in prompt:
                if isinstance(item, dict) and "role" in item and "content" in item:
                    messages.append({"role": str(item["role"]), "content": str(item["content"])})
                else:
                    messages.append({"role": "user", "content": str(item)})
            return messages or [{"role": "user", "content": ""}]
        return [{"role": "user", "content": str(prompt)}]
92
+
93
+
94
+ class DataifVannaEngine:
95
def __init__(self, settings: Settings, engine: Engine, allowed_schema: str) -> None:
    """Store collaborators, reset lazy state and load the runtime config.

    Args:
        settings: Application settings; ``vanna_vectorstore_path`` is read
            here to create the vector-store directory.
        engine: SQLAlchemy engine kept for later queries.
        allowed_schema: Schema name; stored stripped and lower-cased.
    """
    self.settings = settings
    self.engine = engine
    self.allowed_schema = allowed_schema.strip().lower()

    # State filled in later by the runtime-config / client machinery.
    self._lock = Lock()
    self._trained = False
    self.vn: Any | None = None
    self._vanna_class: type | None = None
    self._runtime_config: RuntimeVannaConfig | None = None
    self._config_signature: tuple[object, ...] | None = None

    # The directory (and any missing parents) is created eagerly.
    vectorstore_dir = Path(settings.vanna_vectorstore_path)
    vectorstore_dir.mkdir(parents=True, exist_ok=True)

    # Must run last: every attribute above is already set at this point.
    self._ensure_runtime_config()
107
+
108
+ def _client(self) -> Any:
109
+ self._ensure_runtime_config()
110
+ if self.vn is not None:
111
+ return self.vn
112
+ if self._vanna_class is None or self._runtime_config is None:
113
+ raise RuntimeError("Vanna runtime configuration is unavailable")
114
+ self.vn = self._vanna_class(config=self._provider_config(self._runtime_config))
115
+ self.vn.run_sql = self._run_sql_dataframe
116
+ self.vn.run_sql_is_set = True
117
+ return self.vn
118
+
119
def generate_sql(self, question: str, runtime_override: dict[str, Any] | None = None) -> str:
    """Ask the LLM client to produce SQL for *question*.

    Args:
        question: Natural-language question forwarded to the client.
        runtime_override: Optional overrides, resolved into a runtime
            configuration by ``_runtime_from_override``.

    Returns:
        Whatever the client's ``generate_sql`` returns; the client is always
        called with ``allow_llm_to_see_data=False``.

    Raises:
        RuntimeError: when the resolved runtime is not available.
    """
    runtime = self._runtime_from_override(runtime_override)
    if not self._is_runtime_available(runtime):
        raise RuntimeError(self._unavailable_message(runtime))
    if runtime.auto_train:
        self.train_once()

    # The shared client is reused only when the resolved runtime is identical
    # to the engine's current configuration.
    uses_current_config = runtime.signature() == self.runtime_config().signature()
    if uses_current_config:
        client = self._client()
    else:
        client = self._client_for_runtime(runtime)
    return client.generate_sql(question=question, allow_llm_to_see_data=False)
127
+
128
def is_llm_available(self) -> bool:
    """Return the truthiness of the ``available`` entry of ``provider_status()``."""
    status = self.provider_status()
    return bool(status["available"])
130
+
131
def provider_status(self) -> dict[str, object]:
    """Return the provider status for the engine's current runtime config."""
    return self._provider_status(self.runtime_config())
134
+
135
+ def _provider_status(self, runtime: RuntimeVannaConfig) -> dict[str, object]:
136
+ provider = runtime.provider
137
+ if provider == "maritaca":
138
+ if runtime.maritaca_api_key.strip():
139
+ return {"available": True, "detail": "Maritaca API key configured"}
140
+ return {"available": False, "detail": "Maritaca API key is not configured"}
141
+
142
+ try:
143
+ with urlopen(f"{runtime.ollama_base_url.rstrip('/')}/api/tags", timeout=2) as response:
144
+ available = 200 <= response.status < 500
145
+ return {
146
+ "available": available,
147
+ "detail": f"Ollama responded at {runtime.ollama_base_url}" if available else "Ollama returned an error",
148
+ }
149
+ except (OSError, TimeoutError, URLError) as exc:
150
+ return {"available": False, "detail": f"Ollama is not reachable at {runtime.ollama_base_url}: {exc}"}
151
+
152
+ def _is_runtime_available(self, runtime: RuntimeVannaConfig) -> bool:
153
+ return bool(self._provider_status(runtime)["available"])
154
+
155
def train_once(self, force: bool = False) -> None:
    """Train the Vanna client, normally only the first time it is called.

    While holding ``self._lock`` the runtime configuration is refreshed and,
    unless training already happened (and *force* is false), the client is
    trained in this order: DDL of every allowed relation, catalog
    documentation entries, approved example queries.

    Args:
        force: Train again even when a previous run completed.
    """
    with self._lock:
        self._ensure_runtime_config()
        if force or not self._trained:
            client = self._client()
            # Each source is loaded right before it is used, so a failure in
            # a later source leaves the earlier training calls already made.
            for relation in self._load_allowed_relations():
                client.train(ddl=self._build_ddl(relation))
            for documentation in self._load_catalog_documentation():
                client.train(documentation=documentation)
            for example_sql in self._approved_examples():
                client.train(sql=example_sql)
            self._trained = True
168
+
169
def _run_sql_dataframe(self, sql: str) -> Any:
    """Execute *sql* on the engine and return the rows as a pandas DataFrame.

    The statement runs inside an ``engine.begin()`` block.
    """
    import pandas as pd
    from sqlalchemy import text

    with self.engine.begin() as connection:
        frame = pd.read_sql_query(sql=text(sql), con=connection)
    return frame
175
+
176
def _load_allowed_relations(self) -> list[CuratedRelation]:
    """Describe every relation of the allowed schema, with its columns.

    Tables and views are read from ``information_schema``. Materialized
    views are read from ``pg_catalog`` because PostgreSQL's
    ``information_schema.columns`` does not list them; only materialized
    views the current role may SELECT from are included.

    Returns:
        One ``CuratedRelation`` per relation, ordered by schema and name,
        each carrying its ``(column_name, data_type)`` pairs in column order.
    """
    from sqlalchemy import text

    # Explicit casts keep both UNION branches on the same column types.
    query = text(
        """
        SELECT
            CAST(c.table_schema AS text) AS table_schema,
            CAST(c.table_name AS text) AS table_name,
            CAST(c.column_name AS text) AS column_name,
            CAST(c.data_type AS text) AS data_type,
            CAST(COALESCE(t.table_type, 'RELATION') AS text) AS relation_type,
            CAST(c.ordinal_position AS integer) AS ordinal_position
        FROM information_schema.columns c
        LEFT JOIN information_schema.tables t
            ON t.table_schema = c.table_schema
            AND t.table_name = c.table_name
        WHERE c.table_schema = :allowed_schema

        UNION ALL

        SELECT
            CAST(ns.nspname AS text),
            CAST(rel.relname AS text),
            CAST(att.attname AS text),
            pg_catalog.format_type(att.atttypid, NULL),
            'MATERIALIZED VIEW',
            CAST(att.attnum AS integer)
        FROM pg_catalog.pg_class rel
        JOIN pg_catalog.pg_namespace ns
            ON ns.oid = rel.relnamespace
        JOIN pg_catalog.pg_attribute att
            ON att.attrelid = rel.oid
        WHERE rel.relkind = 'm'
            AND ns.nspname = :allowed_schema
            AND att.attnum > 0
            AND NOT att.attisdropped
            AND pg_catalog.has_table_privilege(rel.oid, 'SELECT')

        ORDER BY table_schema, table_name, ordinal_position
        """
    )

    grouped: dict[tuple[str, str, str], list[tuple[str, str]]] = {}
    with self.engine.begin() as conn:
        rows = conn.execute(query, {"allowed_schema": self.allowed_schema}).mappings()
        # Rows arrive ordered, so dict insertion order is the relation order.
        for row in rows:
            key = (row["table_schema"], row["table_name"], row["relation_type"])
            grouped.setdefault(key, []).append((row["column_name"], row["data_type"]))

    return [
        CuratedRelation(
            schema_name=schema,
            relation_name=name,
            relation_type=relation_type,
            columns=tuple(columns),
        )
        for (schema, name, relation_type), columns in grouped.items()
    ]
219
+
220
def _load_catalog_documentation(self) -> list[str]:
    """Return one documentation sentence per row of the Vanna catalog relation.

    The relation ``vw_pnp_vanna_catalogo`` is looked up in
    ``self.allowed_schema``. When it does not exist an empty list is
    returned; otherwise each row is rendered into a sentence naming the
    relation, its group and its description.

    Raises:
        ValueError: if ``self.allowed_schema`` is not a plain SQL identifier
            (raised by ``_quote_identifier`` before any query is issued).
    """
    from sqlalchemy import text

    schema_sql = self._quote_identifier(self.allowed_schema)
    # Probe and query the *same* quoted name. ``to_regclass`` parses its
    # argument like SQL, so an unquoted schema would be case-folded and could
    # resolve to a different relation than the quoted one selected below.
    catalog_sql = f"{schema_sql}.vw_pnp_vanna_catalogo"
    with self.engine.begin() as conn:
        exists = conn.execute(
            text("SELECT to_regclass(:catalog_name)"),
            {"catalog_name": catalog_sql},
        ).scalar()
        if not exists:
            return []

        rows = conn.execute(
            text(
                f"""
                SELECT relation_group, relation_name, relation_description
                FROM {catalog_sql}
                ORDER BY relation_group, relation_name
                """
            )
        ).mappings()
        # Materialise the sentences while the connection is still open.
        return [
            (
                f"Relacao {self.allowed_schema}.{row['relation_name']} pertence ao dominio "
                f"{row['relation_group']}: {row['relation_description']}"
            )
            for row in rows
        ]
249
+
250
+ def _build_vanna_class(self, provider: str | None = None) -> type:
251
+ from vanna.chromadb import ChromaDB_VectorStore
252
+
253
+ provider = provider or self.runtime_config().provider
254
+ if provider == "ollama":
255
+ from vanna.ollama import Ollama
256
+
257
+ class OllamaVanna(ChromaDB_VectorStore, Ollama):
258
+ def __init__(self, config=None):
259
+ ChromaDB_VectorStore.__init__(self, config=config)
260
+ Ollama.__init__(self, config=config)
261
+
262
+ return OllamaVanna
263
+
264
+ if provider == "maritaca":
265
+ class MaritacaVanna(MaritacaChat, ChromaDB_VectorStore):
266
+ def __init__(self, config=None):
267
+ ChromaDB_VectorStore.__init__(self, config=config)
268
+ MaritacaChat.__init__(self, config=config)
269
+
270
+ return MaritacaVanna
271
+
272
+ raise RuntimeError(f"Unsupported VANNA_LLM_PROVIDER: {self.settings.vanna_llm_provider}")
273
+
274
def _provider_config(self, runtime: RuntimeVannaConfig | None = None) -> dict[str, Any]:
    """Assemble the ``config`` dict passed to the Vanna client class.

    ``model`` and ``path`` are always present; provider-specific keys are
    added for ``"ollama"`` and ``"maritaca"``. A falsy ``runtime`` falls back
    to the current runtime configuration.
    """
    if not runtime:
        runtime = self.runtime_config()

    settings: dict[str, Any] = {}
    settings["model"] = runtime.model_name()
    settings["path"] = runtime.vectorstore_path

    kind = runtime.provider
    if kind == "maritaca":
        settings.update(
            {
                "maritaca_api_url": runtime.maritaca_api_url,
                "maritaca_api_key": runtime.maritaca_api_key,
                "maritaca_timeout_seconds": runtime.maritaca_timeout_seconds,
            }
        )
    elif kind == "ollama":
        settings["ollama_host"] = runtime.ollama_base_url
    return settings
288
+
289
def _client_for_runtime(self, runtime: RuntimeVannaConfig) -> Any:
    """Instantiate a Vanna client for ``runtime`` and wire in SQL execution.

    The client class comes from ``_build_vanna_class`` and its config from
    ``_provider_config``. ``run_sql`` is pointed at ``_run_sql_dataframe``
    and ``run_sql_is_set`` is set to ``True``.
    """
    vanna_cls = self._build_vanna_class(runtime.provider)
    settings = self._provider_config(runtime)
    instance = vanna_cls(config=settings)
    setattr(instance, "run_sql", self._run_sql_dataframe)
    setattr(instance, "run_sql_is_set", True)
    return instance
295
+
296
def _runtime_from_override(self, override: dict[str, Any] | None) -> RuntimeVannaConfig:
    """Return the runtime configuration with ``override`` values applied.

    A falsy ``override`` yields the current runtime configuration unchanged.
    Otherwise the optional ``provider`` key and the ``ollama`` / ``maritaca``
    sub-dicts are read; any missing, falsy or blank value keeps the current
    setting. Non-dict sections are ignored.
    """
    current = self.runtime_config()
    if not override:
        return current

    def section(name):
        # Only dict-shaped sections are honoured; anything else is ignored.
        candidate = override.get(name)
        return candidate if isinstance(candidate, dict) else {}

    def text_or(candidate, fallback):
        # Stringify, trim, and fall back when nothing usable remains.
        return str(candidate or fallback).strip() or fallback

    provider = str(override.get("provider") or current.provider).strip().lower() or current.provider
    ollama = section("ollama")
    maritaca = section("maritaca")

    changes = {
        "provider": provider,
        "ollama_base_url": text_or(ollama.get("base_url"), current.ollama_base_url),
        "ollama_model": text_or(ollama.get("model"), current.ollama_model),
        "maritaca_api_url": text_or(maritaca.get("api_url"), current.maritaca_api_url),
        # The API key is stringified but intentionally left untrimmed.
        "maritaca_api_key": str(maritaca.get("api_key") or current.maritaca_api_key),
        "maritaca_model": text_or(maritaca.get("model"), current.maritaca_model),
        "maritaca_timeout_seconds": _coerce_positive_int(
            maritaca.get("timeout_seconds"),
            current.maritaca_timeout_seconds,
        ),
    }
    return replace(current, **changes)
317
+
318
def _unavailable_message(self, runtime: RuntimeVannaConfig | None = None) -> str:
    """Return the message explaining why the provider is unavailable.

    For the ``"maritaca"`` provider the message points at the API key; for
    any other provider it names the Ollama base URL. A falsy ``runtime``
    falls back to the current runtime configuration.
    """
    if not runtime:
        runtime = self.runtime_config()
    if runtime.provider != "maritaca":
        return f"Ollama is not reachable at {runtime.ollama_base_url}"
    return "Maritaca API key is not configured"
324
+
325
def runtime_config(self) -> RuntimeVannaConfig:
    """Return the current runtime configuration, refreshing it first.

    Raises:
        RuntimeError: if no configuration is set after the refresh.
    """
    self._ensure_runtime_config()
    if self._runtime_config is not None:
        return self._runtime_config
    raise RuntimeError("Vanna runtime configuration is unavailable")
330
+
331
def _ensure_runtime_config(self) -> None:
    """Reload the runtime configuration and reset cached state when it changed.

    The configuration is loaded on every call; if its signature equals the
    cached one nothing else happens. Otherwise the vector-store directory is
    created, the Vanna class is rebuilt and the cached client and training
    flag are cleared.

    If building the Vanna class fails, the previously cached configuration
    and signature are restored before the exception propagates, so the next
    call retries instead of short-circuiting on a half-applied change.
    """
    runtime = load_runtime_vanna_config(self.settings, self.engine)
    signature = runtime.signature()
    if self._config_signature == signature:
        return

    Path(runtime.vectorstore_path).mkdir(parents=True, exist_ok=True)

    previous_runtime = getattr(self, "_runtime_config", None)
    previous_signature = self._config_signature
    # Publish the new config *before* building the class: the builder falls
    # back to runtime_config() for an empty provider, which re-enters this
    # method and must see the matching signature to terminate.
    self._runtime_config = runtime
    self._config_signature = signature
    try:
        self._vanna_class = self._build_vanna_class(runtime.provider)
    except Exception:
        # Roll back so a failed switch is not mistaken for an applied one.
        self._runtime_config = previous_runtime
        self._config_signature = previous_signature
        raise
    self.vn = None
    self._trained = False
343
+
344
@staticmethod
def _quote_identifier(identifier: str) -> str:
    """Wrap ``identifier`` in double quotes after validating its shape.

    Only names made of ASCII letters, digits and underscores that do not
    start with a digit are accepted.

    Raises:
        ValueError: if ``identifier`` does not match that shape.
    """
    is_plain = re.fullmatch(r"[A-Za-z_][A-Za-z0-9_]*", identifier) is not None
    if is_plain:
        return f'"{identifier}"'
    raise ValueError(f"Invalid SQL identifier: {identifier}")
349
+
350
@staticmethod
def _build_ddl(relation: CuratedRelation) -> str:
    """Render ``relation`` as a ``CREATE TABLE`` statement for training.

    The statement is preceded by a SQL comment carrying the relation type
    and lists every ``(name, data_type)`` pair from ``relation.columns``.
    """
    column_defs = []
    for name, data_type in relation.columns:
        column_defs.append(f"{name} {data_type}")
    columns = ",\n ".join(column_defs)
    return f"-- RELATION TYPE: {relation.relation_type}\nCREATE TABLE {relation.full_name} (\n {columns}\n);"
354
+
355
def _approved_examples(self) -> list[str]:
    """Return the hand-written example queries used as SQL training data.

    The examples reference relations in the ``curated`` schema, so an empty
    list is returned whenever ``self.allowed_schema`` is anything else.
    """
    if self.allowed_schema == "curated":
        fragments_per_query = (
            (
                "SELECT dominio, indicador, ano, SUM(valor) AS total ",
                "FROM curated.vw_pnp_vanna_resumo ",
                "GROUP BY dominio, indicador, ano ",
                "ORDER BY ano DESC, dominio, indicador LIMIT 50",
            ),
            (
                "SELECT relation_group, relation_name, relation_description ",
                "FROM curated.vw_pnp_vanna_catalogo ",
                "ORDER BY relation_group, relation_name LIMIT 50",
            ),
            (
                "SELECT instituicao, ano, SUM(valor) AS total_matriculas ",
                "FROM curated.vw_pnp_vanna_resumo ",
                "WHERE dominio = 'matriculas' ",
                "GROUP BY instituicao, ano ",
                "ORDER BY ano DESC, total_matriculas DESC LIMIT 50",
            ),
            (
                "SELECT ano, SUM(matriculas) AS total_matriculas ",
                "FROM curated.mv_pnp_dashboard_matriculas ",
                "GROUP BY ano ORDER BY ano DESC LIMIT 50",
            ),
        )
        # Each query is stored as clause fragments and glued back together.
        return ["".join(fragments) for fragments in fragments_per_query]
    return []
@@ -0,0 +1,8 @@
1
+ fastapi==0.115.6
2
+ uvicorn[standard]==0.32.1
3
+ SQLAlchemy==2.0.36
4
+ psycopg2-binary==2.9.9
5
+ pydantic==2.10.4
6
+ pydantic-settings==2.7.0
7
+ pandas==2.2.3
8
+ vanna[chromadb,ollama]==0.7.9
@@ -0,0 +1,13 @@
1
+ .env
2
+ node_modules/
3
+ dist/
4
+ coverage/
5
+
6
+ npm-debug.log*
7
+ yarn-debug.log*
8
+ yarn-error.log*
9
+ *.log
10
+
11
+ .DS_Store
12
+ tmp/
13
+ temp/
@@ -0,0 +1,16 @@
1
+ FROM node:20-alpine AS builder
2
+
3
+ ARG VITE_API_BASE_URL=
4
+ ENV VITE_API_BASE_URL=$VITE_API_BASE_URL
5
+
6
+ WORKDIR /app
7
+ COPY package.json package-lock.json* ./
8
+ RUN npm install
9
+ COPY . .
10
+ RUN npm run build
11
+
12
+ FROM nginx:1.27-alpine
13
+ COPY nginx.conf /etc/nginx/nginx.conf
14
+ COPY --from=builder /app/dist /usr/share/nginx/html
15
+ EXPOSE 80
16
+ CMD ["nginx", "-g", "daemon off;"]
@@ -0,0 +1,12 @@
1
+ <!doctype html>
2
+ <html lang="pt-BR">
3
+ <head>
4
+ <meta charset="UTF-8" />
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0" />
6
+ <title>dataif</title>
7
+ </head>
8
+ <body>
9
+ <div id="root"></div>
10
+ <script type="module" src="/src/main.jsx"></script>
11
+ </body>
12
+ </html>