dataface 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- d3_format/__init__.py +14 -0
- d3_format/errors.py +19 -0
- d3_format/format.py +551 -0
- d3_format/spec.py +159 -0
- dataface/DATAFACE_SYNTAX.md +1135 -0
- dataface/__init__.py +93 -0
- dataface/_docs_site.py +20 -0
- dataface/_install_hint.py +26 -0
- dataface/agent_api/__init__.py +79 -0
- dataface/agent_api/_init_templates/__init__.py +0 -0
- dataface/agent_api/_init_templates/agents_dft_snippet.md +26 -0
- dataface/agent_api/_init_templates/dataface.yml +15 -0
- dataface/agent_api/_init_templates/faces-dataface.yml +144 -0
- dataface/agent_api/_init_templates/index.md +24 -0
- dataface/agent_api/_paths.py +118 -0
- dataface/agent_api/_project_agents_md.py +43 -0
- dataface/agent_api/_session_store.py +486 -0
- dataface/agent_api/_state.py +28 -0
- dataface/agent_api/chat.py +221 -0
- dataface/agent_api/dashboards.py +257 -0
- dataface/agent_api/describe.py +366 -0
- dataface/agent_api/describe_query.py +120 -0
- dataface/agent_api/docs/__init__.py +25 -0
- dataface/agent_api/docs/_loader.py +292 -0
- dataface/agent_api/docs/yaml-reference.md +2757 -0
- dataface/agent_api/file_refs.py +118 -0
- dataface/agent_api/init.py +126 -0
- dataface/agent_api/inspect.py +128 -0
- dataface/agent_api/mcp_install.py +170 -0
- dataface/agent_api/query.py +274 -0
- dataface/agent_api/schema.py +658 -0
- dataface/agent_api/schema_search.py +284 -0
- dataface/agent_api/search.py +270 -0
- dataface/agent_api/skill_install.py +141 -0
- dataface/agent_api/skill_render.py +90 -0
- dataface/agent_api/skills.py +293 -0
- dataface/agent_api/surface_aliases.yaml +128 -0
- dataface/agent_api/validate.py +175 -0
- dataface/agent_api/validate_query.py +84 -0
- dataface/ai/__init__.py +39 -0
- dataface/ai/agent.py +139 -0
- dataface/ai/context.py +45 -0
- dataface/ai/events.py +62 -0
- dataface/ai/external_mcp.py +610 -0
- dataface/ai/generate_sql.py +96 -0
- dataface/ai/llm.py +403 -0
- dataface/ai/mcp/__init__.py +51 -0
- dataface/ai/mcp/server.py +289 -0
- dataface/ai/memories.py +85 -0
- dataface/ai/prompts.py +177 -0
- dataface/ai/schema_context.py +138 -0
- dataface/ai/skills/before-after-comparison/SKILL.md +102 -0
- dataface/ai/skills/before-after-comparison/examples/before-after-comparison.yml +24 -0
- dataface/ai/skills/dashboard-build/SKILL.md +212 -0
- dataface/ai/skills/dashboard-build/examples/_smoke.yml +15 -0
- dataface/ai/skills/dashboard-design/SKILL.md +182 -0
- dataface/ai/skills/dashboard-review/SKILL.md +113 -0
- dataface/ai/skills/dashboard-structural-review/SKILL.md +173 -0
- dataface/ai/skills/dashboard-visual-review/SKILL.md +139 -0
- dataface/ai/skills/dataface-mcp-setup/SKILL.md +177 -0
- dataface/ai/skills/dataface-troubleshooting/SKILL.md +225 -0
- dataface/ai/skills/drill-down-link/SKILL.md +112 -0
- dataface/ai/skills/drill-down-link/examples/drill-down-link.yml +27 -0
- dataface/ai/skills/faceted-small-multiples/SKILL.md +116 -0
- dataface/ai/skills/faceted-small-multiples/examples/faceted-small-multiples.yml +33 -0
- dataface/ai/skills/filter-bar-with-variables/SKILL.md +105 -0
- dataface/ai/skills/filter-bar-with-variables/examples/filter-bar-with-variables.yml +49 -0
- dataface/ai/skills/kpi-row/SKILL.md +101 -0
- dataface/ai/skills/kpi-row/examples/kpi-row.yml +55 -0
- dataface/ai/skills/report-design/SKILL.md +184 -0
- dataface/ai/skills/single-metric-bignum/SKILL.md +90 -0
- dataface/ai/skills/single-metric-bignum/examples/single-metric-bignum.yml +27 -0
- dataface/ai/skills/table-heavy-ops-dashboard/SKILL.md +114 -0
- dataface/ai/skills/table-heavy-ops-dashboard/examples/table-heavy-ops-dashboard.yml +48 -0
- dataface/ai/skills/time-series-trend/SKILL.md +93 -0
- dataface/ai/skills/time-series-trend/examples/time-series-trend.yml +26 -0
- dataface/ai/skills/top-n-with-detail/SKILL.md +98 -0
- dataface/ai/skills/top-n-with-detail/examples/top-n-with-detail.yml +45 -0
- dataface/ai/skills/two-by-two-grid-overview/SKILL.md +78 -0
- dataface/ai/skills/two-by-two-grid-overview/examples/two-by-two-grid-overview.yml +64 -0
- dataface/ai/tool_schemas.py +132 -0
- dataface/ai/tools/__init__.py +312 -0
- dataface/ai/yaml_utils.py +57 -0
- dataface/cli/__init__.py +3 -0
- dataface/cli/_console.py +48 -0
- dataface/cli/_error_format.py +83 -0
- dataface/cli/_extras.py +190 -0
- dataface/cli/_json_output.py +8 -0
- dataface/cli/_parsing.py +17 -0
- dataface/cli/_version_info.py +56 -0
- dataface/cli/commands/__init__.py +3 -0
- dataface/cli/commands/_agent_input.py +205 -0
- dataface/cli/commands/_agent_server.py +115 -0
- dataface/cli/commands/chat.py +645 -0
- dataface/cli/commands/describe.py +107 -0
- dataface/cli/commands/docs.py +131 -0
- dataface/cli/commands/extension.py +179 -0
- dataface/cli/commands/init.py +240 -0
- dataface/cli/commands/inspect.py +94 -0
- dataface/cli/commands/mcp_init.py +167 -0
- dataface/cli/commands/query.py +386 -0
- dataface/cli/commands/render.py +291 -0
- dataface/cli/commands/schema.py +411 -0
- dataface/cli/commands/search.py +49 -0
- dataface/cli/commands/serve.py +114 -0
- dataface/cli/commands/skills.py +133 -0
- dataface/cli/commands/skills_init.py +161 -0
- dataface/cli/commands/validate.py +63 -0
- dataface/cli/main.py +1501 -0
- dataface/core/__init__.py +75 -0
- dataface/core/compile/__init__.py +244 -0
- dataface/core/compile/_jinja_helpers.py +78 -0
- dataface/core/compile/channel.py +222 -0
- dataface/core/compile/chart_focus.py +101 -0
- dataface/core/compile/chart_resolved.py +169 -0
- dataface/core/compile/chart_type_detection.py +489 -0
- dataface/core/compile/chart_update.py +261 -0
- dataface/core/compile/colors.py +64 -0
- dataface/core/compile/compiler.py +904 -0
- dataface/core/compile/config.py +823 -0
- dataface/core/compile/custom_chart_types.py +208 -0
- dataface/core/compile/data_table_attachment.py +1287 -0
- dataface/core/compile/detect.py +110 -0
- dataface/core/compile/errors.py +302 -0
- dataface/core/compile/filter_injection.py +319 -0
- dataface/core/compile/introspection.py +527 -0
- dataface/core/compile/jinja.py +511 -0
- dataface/core/compile/labels_env.py +52 -0
- dataface/core/compile/markdown.py +154 -0
- dataface/core/compile/meta.py +388 -0
- dataface/core/compile/models/__init__.py +0 -0
- dataface/core/compile/models/chart/__init__.py +0 -0
- dataface/core/compile/models/chart/authored.py +2137 -0
- dataface/core/compile/models/chart/compiled.py +398 -0
- dataface/core/compile/models/config.py +347 -0
- dataface/core/compile/models/face/__init__.py +0 -0
- dataface/core/compile/models/face/authored.py +659 -0
- dataface/core/compile/models/face/compiled.py +522 -0
- dataface/core/compile/models/factories.py +201 -0
- dataface/core/compile/models/markers.py +40 -0
- dataface/core/compile/models/palette.py +36 -0
- dataface/core/compile/models/primitives.py +415 -0
- dataface/core/compile/models/query/__init__.py +0 -0
- dataface/core/compile/models/query/authored.py +246 -0
- dataface/core/compile/models/query/compiled.py +710 -0
- dataface/core/compile/models/refs.py +137 -0
- dataface/core/compile/models/source.py +611 -0
- dataface/core/compile/models/style/__init__.py +0 -0
- dataface/core/compile/models/style/authored.py +481 -0
- dataface/core/compile/models/style/compiled.py +3399 -0
- dataface/core/compile/models/style/merged.py +1682 -0
- dataface/core/compile/models/theme.py +362 -0
- dataface/core/compile/models/variable/__init__.py +0 -0
- dataface/core/compile/models/variable/authored.py +254 -0
- dataface/core/compile/models/vega_lite/__init__.py +0 -0
- dataface/core/compile/models/vega_lite/config.py +510 -0
- dataface/core/compile/models/vega_lite/contracts.py +171 -0
- dataface/core/compile/normalize_charts.py +494 -0
- dataface/core/compile/normalize_layout.py +1000 -0
- dataface/core/compile/normalize_queries.py +297 -0
- dataface/core/compile/normalize_variables.py +489 -0
- dataface/core/compile/normalizer.py +543 -0
- dataface/core/compile/palette.py +1100 -0
- dataface/core/compile/parameterized.py +658 -0
- dataface/core/compile/parser.py +228 -0
- dataface/core/compile/schema.py +20 -0
- dataface/core/compile/schema_renderers/__init__.py +0 -0
- dataface/core/compile/schema_renderers/json_schema.py +163 -0
- dataface/core/compile/schema_renderers/prompt.py +152 -0
- dataface/core/compile/schema_renderers/vscode_schema.py +301 -0
- dataface/core/compile/sizing.py +2126 -0
- dataface/core/compile/sources.py +518 -0
- dataface/core/compile/sql_authoring_lint.py +56 -0
- dataface/core/compile/style_cascade.py +471 -0
- dataface/core/compile/typography.py +299 -0
- dataface/core/compile/validator.py +301 -0
- dataface/core/compile/variables.py +53 -0
- dataface/core/compile/vega_config.py +98 -0
- dataface/core/compile/vega_lite/__init__.py +6 -0
- dataface/core/compile/vega_lite/validation.py +95 -0
- dataface/core/compile/yaml_error_formatter.py +838 -0
- dataface/core/connections.py +38 -0
- dataface/core/dashboard.py +358 -0
- dataface/core/defaults/default_config.yml +101 -0
- dataface/core/defaults/palettes/categorical/category-10-dark.yml +32 -0
- dataface/core/defaults/palettes/categorical/category-10-light.yml +43 -0
- dataface/core/defaults/palettes/categorical/category-10.yml +31 -0
- dataface/core/defaults/palettes/categorical/category-6-tonal-blue.yml +22 -0
- dataface/core/defaults/palettes/categorical/category-6-tonal-brown.yml +29 -0
- dataface/core/defaults/palettes/categorical/category-6-tonal-green.yml +20 -0
- dataface/core/defaults/palettes/categorical/category-6-tonal-orange.yml +21 -0
- dataface/core/defaults/palettes/categorical/category-6-tonal-purple.yml +20 -0
- dataface/core/defaults/palettes/categorical/editorial-10-dark.yml +32 -0
- dataface/core/defaults/palettes/categorical/editorial-10.yml +40 -0
- dataface/core/defaults/palettes/categorical/hero-6.yml +17 -0
- dataface/core/defaults/palettes/categorical/single-blue.yml +11 -0
- dataface/core/defaults/palettes/categorical/tableau.yml +20 -0
- dataface/core/defaults/palettes/data/xkcd_colors.json +3803 -0
- dataface/core/defaults/palettes/diverging/blue-red.yml +25 -0
- dataface/core/defaults/palettes/diverging/coolwarm.yml +24 -0
- dataface/core/defaults/palettes/diverging/crimson-green.yml +23 -0
- dataface/core/defaults/palettes/diverging/orange-teal.yml +23 -0
- dataface/core/defaults/palettes/diverging/sunset.yml +24 -0
- dataface/core/defaults/palettes/scaffold/dft-creams.yml +38 -0
- dataface/core/defaults/palettes/scaffold/dft-grays.yml +53 -0
- dataface/core/defaults/palettes/sequential/amber.yml +22 -0
- dataface/core/defaults/palettes/sequential/blue.yml +22 -0
- dataface/core/defaults/palettes/sequential/brown.yml +22 -0
- dataface/core/defaults/palettes/sequential/gray.yml +22 -0
- dataface/core/defaults/palettes/sequential/green.yml +22 -0
- dataface/core/defaults/palettes/sequential/purple.yml +22 -0
- dataface/core/defaults/palettes/sequential/rust.yml +22 -0
- dataface/core/defaults/palettes/sequential/teal.yml +22 -0
- dataface/core/defaults/palettes/tone/negative.yml +32 -0
- dataface/core/defaults/palettes/tone/positive.yml +22 -0
- dataface/core/defaults/palettes/tone/warning.yml +22 -0
- dataface/core/defaults/themes/_base.yaml +786 -0
- dataface/core/defaults/themes/bi.yaml +16 -0
- dataface/core/defaults/themes/carbong100.yaml +41 -0
- dataface/core/defaults/themes/cream.yaml +122 -0
- dataface/core/defaults/themes/dark.yaml +40 -0
- dataface/core/defaults/themes/diagnostics-title-angle-extreme.yaml +9 -0
- dataface/core/defaults/themes/diagnostics-title-baseline-extreme.yaml +9 -0
- dataface/core/defaults/themes/diagnostics-title-baseline.yaml +24 -0
- dataface/core/defaults/themes/diagnostics-title-center.yaml +8 -0
- dataface/core/defaults/themes/diagnostics-title-color-extreme.yaml +24 -0
- dataface/core/defaults/themes/diagnostics-title-font-extreme.yaml +25 -0
- dataface/core/defaults/themes/diagnostics-title-left.yaml +8 -0
- dataface/core/defaults/themes/diagnostics-title-offset-extreme.yaml +9 -0
- dataface/core/defaults/themes/diagnostics-title-size-extreme.yaml +24 -0
- dataface/core/defaults/themes/diagnostics-title-weight-extreme.yaml +24 -0
- dataface/core/defaults/themes/editorial.yaml +147 -0
- dataface/core/defaults/themes/light.yaml +30 -0
- dataface/core/defaults/themes/looker.yaml +17 -0
- dataface/core/defaults/themes/stark.yaml +134 -0
- dataface/core/errors/__init__.py +67 -0
- dataface/core/errors/codes_compile.py +56 -0
- dataface/core/errors/codes_execute.py +177 -0
- dataface/core/errors/codes_render.py +106 -0
- dataface/core/errors/codes_unknown.py +15 -0
- dataface/core/errors/hints.py +74 -0
- dataface/core/errors/registry.py +42 -0
- dataface/core/errors/structured.py +92 -0
- dataface/core/execute/__init__.py +91 -0
- dataface/core/execute/adapters/__init__.py +49 -0
- dataface/core/execute/adapters/adapter_registry.py +400 -0
- dataface/core/execute/adapters/base.py +245 -0
- dataface/core/execute/adapters/csv_adapter.py +239 -0
- dataface/core/execute/adapters/dbt_adapter.py +283 -0
- dataface/core/execute/adapters/dbt_adapter_factory.py +212 -0
- dataface/core/execute/adapters/dbt_macro_loader.py +95 -0
- dataface/core/execute/adapters/dbt_utils.py +150 -0
- dataface/core/execute/adapters/http_adapter.py +224 -0
- dataface/core/execute/adapters/metricflow_adapter.py +94 -0
- dataface/core/execute/adapters/schema_resolver_adapter.py +144 -0
- dataface/core/execute/adapters/sql_adapter.py +710 -0
- dataface/core/execute/adapters/values_adapter.py +58 -0
- dataface/core/execute/batch.py +744 -0
- dataface/core/execute/cache_backend.py +135 -0
- dataface/core/execute/cache_keys.py +66 -0
- dataface/core/execute/dbt_jinja.py +21 -0
- dataface/core/execute/dialects/__init__.py +121 -0
- dataface/core/execute/dialects/athena.py +75 -0
- dataface/core/execute/dialects/base.py +302 -0
- dataface/core/execute/dialects/bigquery.py +38 -0
- dataface/core/execute/dialects/databricks.py +68 -0
- dataface/core/execute/dialects/duckdb.py +35 -0
- dataface/core/execute/dialects/mysql.py +68 -0
- dataface/core/execute/dialects/postgres.py +39 -0
- dataface/core/execute/dialects/redshift.py +12 -0
- dataface/core/execute/dialects/snowflake.py +51 -0
- dataface/core/execute/dialects/sqlserver.py +92 -0
- dataface/core/execute/duckdb_cache.py +712 -0
- dataface/core/execute/duckdb_config.py +26 -0
- dataface/core/execute/errors.py +213 -0
- dataface/core/execute/executor.py +1249 -0
- dataface/core/execute/parallel.py +162 -0
- dataface/core/execute/setup_sql.py +58 -0
- dataface/core/execute/source_registry.py +72 -0
- dataface/core/execute/source_resolver.py +255 -0
- dataface/core/execute/sql_guard.py +387 -0
- dataface/core/execute/sql_literals.py +199 -0
- dataface/core/fonts.py +52 -0
- dataface/core/inspect/__init__.py +32 -0
- dataface/core/inspect/cache_factory.py +98 -0
- dataface/core/inspect/db_types.py +162 -0
- dataface/core/inspect/dbt_schema.py +96 -0
- dataface/core/inspect/defaults.yml +37 -0
- dataface/core/inspect/fanout_risk.py +109 -0
- dataface/core/inspect/manifest_utils.py +77 -0
- dataface/core/inspect/partials/categorical.yml +40 -0
- dataface/core/inspect/partials/date.yml +40 -0
- dataface/core/inspect/partials/numeric.yml +55 -0
- dataface/core/inspect/partition_types.py +38 -0
- dataface/core/inspect/query_validator.py +975 -0
- dataface/core/inspect/renderer.py +354 -0
- dataface/core/inspect/resolver.py +808 -0
- dataface/core/inspect/search.py +461 -0
- dataface/core/inspect/sources/__init__.py +32 -0
- dataface/core/inspect/sources/dbt.py +738 -0
- dataface/core/inspect/sources/duckdb_utils.py +66 -0
- dataface/core/inspect/templates/__init__.py +1 -0
- dataface/core/inspect/templates/categorical_column.yml +196 -0
- dataface/core/inspect/templates/charts.yml +109 -0
- dataface/core/inspect/templates/date_column.yml +248 -0
- dataface/core/inspect/templates/model.yml +138 -0
- dataface/core/inspect/templates/numeric_column.yml +261 -0
- dataface/core/inspect/templates/quality.yml +80 -0
- dataface/core/inspect/templates/string_column.yml +263 -0
- dataface/core/project_roots.py +165 -0
- dataface/core/render/__init__.py +87 -0
- dataface/core/render/board_links.py +176 -0
- dataface/core/render/chart/__init__.py +27 -0
- dataface/core/render/chart/arc_attached_table.py +251 -0
- dataface/core/render/chart/artifacts.py +16 -0
- dataface/core/render/chart/callout.py +225 -0
- dataface/core/render/chart/decisions.py +358 -0
- dataface/core/render/chart/geo.py +700 -0
- dataface/core/render/chart/kpi.py +916 -0
- dataface/core/render/chart/labels.py +76 -0
- dataface/core/render/chart/pipeline.py +818 -0
- dataface/core/render/chart/presentation.py +36 -0
- dataface/core/render/chart/profile.py +3438 -0
- dataface/core/render/chart/render_single.py +347 -0
- dataface/core/render/chart/renderers.py +193 -0
- dataface/core/render/chart/rendering.py +565 -0
- dataface/core/render/chart/serialization.py +90 -0
- dataface/core/render/chart/spark.py +496 -0
- dataface/core/render/chart/spark_bar.py +370 -0
- dataface/core/render/chart/spec_builders.py +154 -0
- dataface/core/render/chart/standard_renderer.py +2645 -0
- dataface/core/render/chart/table.py +2957 -0
- dataface/core/render/chart/table_support.py +1452 -0
- dataface/core/render/chart/tick_values.py +66 -0
- dataface/core/render/chart/time_unit_detect.py +809 -0
- dataface/core/render/chart/title_overflow.py +157 -0
- dataface/core/render/chart/type_inference.py +122 -0
- dataface/core/render/chart/validation.py +99 -0
- dataface/core/render/chart/vega_lite.py +125 -0
- dataface/core/render/chart/vega_lite_types.py +268 -0
- dataface/core/render/chart/vl_field_maps.py +346 -0
- dataface/core/render/chart_interactivity.py +24 -0
- dataface/core/render/control_registry.py +287 -0
- dataface/core/render/converters/__init__.py +24 -0
- dataface/core/render/converters/chart.py +276 -0
- dataface/core/render/converters/html.py +98 -0
- dataface/core/render/converters/pdf.py +40 -0
- dataface/core/render/converters/png.py +41 -0
- dataface/core/render/errors.py +144 -0
- dataface/core/render/face_api.py +160 -0
- dataface/core/render/faces.py +1194 -0
- dataface/core/render/font_measurement.py +48 -0
- dataface/core/render/font_support.py +197 -0
- dataface/core/render/fonts/DFTSansTabular-Regular.ttf +0 -0
- dataface/core/render/fonts/DFTSansTabular-Regular.woff2 +0 -0
- dataface/core/render/fonts/DFTSerifOldstyleProportional-Regular.ttf +0 -0
- dataface/core/render/fonts/DFTSerifOldstyleTabular-Regular.ttf +0 -0
- dataface/core/render/fonts/InterVariable.ttf +0 -0
- dataface/core/render/fonts/InterVariable.woff2 +0 -0
- dataface/core/render/fonts/NOTO_COLOR_EMOJI_LICENSE.txt +93 -0
- dataface/core/render/fonts/NOTO_EMOJI_LICENSE.txt +93 -0
- dataface/core/render/fonts/NotoColorEmoji-Regular.ttf +0 -0
- dataface/core/render/fonts/NotoColorEmoji-Regular.woff2 +0 -0
- dataface/core/render/fonts/NotoEmoji-Regular.ttf +0 -0
- dataface/core/render/fonts/NotoEmoji-Regular.woff2 +0 -0
- dataface/core/render/fonts/SOURCE_CODE_PRO_LICENSE.txt +93 -0
- dataface/core/render/fonts/SOURCE_SERIF_4_LICENSE.txt +98 -0
- dataface/core/render/fonts/SourceCodePro-Regular.ttf +0 -0
- dataface/core/render/fonts/SourceSerif4-Regular.ttf +0 -0
- dataface/core/render/fonts/_emoji_font_face.css +43 -0
- dataface/core/render/fonts/source-serif-4-variable-latin.woff2 +0 -0
- dataface/core/render/format_utils.py +329 -0
- dataface/core/render/geo_defaults.yml +28 -0
- dataface/core/render/json_format.py +146 -0
- dataface/core/render/layout_sizing.py +865 -0
- dataface/core/render/layouts.py +541 -0
- dataface/core/render/markdown_defaults.yml +16 -0
- dataface/core/render/missing_vars_prompt.py +79 -0
- dataface/core/render/placeholder.py +389 -0
- dataface/core/render/render_result.py +14 -0
- dataface/core/render/renderer.py +467 -0
- dataface/core/render/script_embedding.py +16 -0
- dataface/core/render/svg_utils.py +212 -0
- dataface/core/render/template_loader.py +69 -0
- dataface/core/render/templates/controls/_styles.css +606 -0
- dataface/core/render/templates/controls/checkbox.html +16 -0
- dataface/core/render/templates/controls/date.html +16 -0
- dataface/core/render/templates/controls/number.html +19 -0
- dataface/core/render/templates/controls/readonly.html +9 -0
- dataface/core/render/templates/controls/select.html +21 -0
- dataface/core/render/templates/controls/slider.html +22 -0
- dataface/core/render/templates/controls/text.html +16 -0
- dataface/core/render/templates/scripts/chart_interactivity.js +191 -0
- dataface/core/render/templates/scripts/variables.js +976 -0
- dataface/core/render/templates/svg/grid_pattern.svg +3 -0
- dataface/core/render/templates/svg/styles.css +51 -0
- dataface/core/render/terminal.py +311 -0
- dataface/core/render/terminal_charts.py +563 -0
- dataface/core/render/terminal_defaults.yml +2 -0
- dataface/core/render/terminal_layouts.py +299 -0
- dataface/core/render/terminal_text.py +31 -0
- dataface/core/render/text/__init__.py +1 -0
- dataface/core/render/text/case.py +113 -0
- dataface/core/render/text_format.py +129 -0
- dataface/core/render/utils.py +106 -0
- dataface/core/render/variable_controls.py +946 -0
- dataface/core/render/variable_input_refinement.py +140 -0
- dataface/core/render/warnings/__init__.py +15 -0
- dataface/core/render/warnings/bar_color_1_to_1_with_x.py +80 -0
- dataface/core/render/warnings/base.py +44 -0
- dataface/core/render/warnings/fanout_risk.py +15 -0
- dataface/core/render/warnings/from_query_diagnostic.py +56 -0
- dataface/core/render/warnings/missing_join_predicate.py +13 -0
- dataface/core/render/warnings/query_parse_error.py +14 -0
- dataface/core/render/warnings/query_returned_zero_rows.py +42 -0
- dataface/core/render/warnings/reaggregation.py +14 -0
- dataface/core/render/warnings/registry.py +45 -0
- dataface/core/render/warnings/suppression.py +46 -0
- dataface/core/render/warnings/temporal_single_point.py +63 -0
- dataface/core/render/warnings/unreferenced_chart.py +15 -0
- dataface/core/render/warnings/y_encoding_mostly_null.py +76 -0
- dataface/core/render/yaml_format.py +167 -0
- dataface/core/resolve_face.py +195 -0
- dataface/core/schema/__init__.py +0 -0
- dataface/core/schema/guidance.py +151 -0
- dataface/core/scoped_paths.py +59 -0
- dataface/core/serve/__init__.py +14 -0
- dataface/core/serve/bootstrap.py +39 -0
- dataface/core/serve/embedded.py +57 -0
- dataface/core/serve/port.py +129 -0
- dataface/core/serve/server.py +938 -0
- dataface/core/serve/templates/__init__.py +0 -0
- dataface/core/serve/templates/directory.yml +6 -0
- dataface/core/serve/templates/error.html.j2 +217 -0
- dataface/core/utils.py +121 -0
- dataface/core/validate.py +64 -0
- dataface/integrations/__init__.py +0 -0
- dataface/integrations/highlighting.py +351 -0
- dataface/integrations/markdown.py +537 -0
- dataface/py.typed +0 -0
- dataface-0.1.2.dist-info/METADATA +375 -0
- dataface-0.1.2.dist-info/RECORD +455 -0
- dataface-0.1.2.dist-info/WHEEL +4 -0
- dataface-0.1.2.dist-info/entry_points.txt +2 -0
- dataface-0.1.2.dist-info/licenses/LICENSE +202 -0
- mdsvg/__init__.py +168 -0
- mdsvg/fonts.py +656 -0
- mdsvg/images.py +299 -0
- mdsvg/parser.py +629 -0
- mdsvg/playground.py +284 -0
- mdsvg/py.typed +2 -0
- mdsvg/renderer.py +1623 -0
- mdsvg/style.py +355 -0
- mdsvg/types.py +200 -0
- mdsvg/utils.py +86 -0
|
@@ -0,0 +1,975 @@
|
|
|
1
|
+
"""Deterministic query validator using SQLGlot AST analysis.
|
|
2
|
+
|
|
3
|
+
Parses SQL and detects structural issues that indicate likely query bugs:
|
|
4
|
+
|
|
5
|
+
- **missing_join_predicate**: Cross joins or comma-separated tables without
|
|
6
|
+
an explicit join predicate — usually an accidental cartesian product.
|
|
7
|
+
- **fanout_risk**: Aggregation over a joined query where aggregate expressions
|
|
8
|
+
reference columns from multiple tables, use unqualified columns with 2+
|
|
9
|
+
tables in scope, or COUNT(*) with joins — the structural signal for
|
|
10
|
+
double-counting / aggregate inflation.
|
|
11
|
+
- **reaggregation**: Outer query applies an aggregate function to a column
|
|
12
|
+
that is already aggregate-derived in a subquery or CTE — e.g. SUM of a
|
|
13
|
+
SUM, AVG of an AVG. Uses propagation of aggregate lineage through nested
|
|
14
|
+
scopes to detect these patterns.
|
|
15
|
+
- **parse_error**: SQL that SQLGlot cannot parse.
|
|
16
|
+
|
|
17
|
+
Schema context (grain, primary keys) is optional. Structural checks work
|
|
18
|
+
without it; metadata refines severity and adds repair guidance.
|
|
19
|
+
|
|
20
|
+
Relationship context (multiplicity, fanout factor) is optional. When
|
|
21
|
+
available, it calibrates fanout_risk severity and grounds recommendations
|
|
22
|
+
in known join metadata.
|
|
23
|
+
|
|
24
|
+
Pure functions — no DB queries, no side effects.
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
from __future__ import annotations
|
|
28
|
+
|
|
29
|
+
import re
|
|
30
|
+
from dataclasses import dataclass, field
|
|
31
|
+
from typing import Literal, overload
|
|
32
|
+
|
|
33
|
+
import sqlglot
|
|
34
|
+
from sqlglot import exp
|
|
35
|
+
|
|
36
|
+
from dataface.core.inspect.fanout_risk import HIGH_FANOUT_THRESHOLD

# Matches an inline suppression comment of the form
# ``-- dft:ignore [code1 code2 ...]`` (case-insensitive). Group 1 captures
# whatever follows the marker on the same line.
_DFT_IGNORE_RE = re.compile(r"--\s*dft:ignore\b\s*(.*)", re.IGNORECASE)

# Codes that must never be suppressed.
_UNSUPPRESSIBLE_CODES = frozenset({"parse_error"})

# Minimum confidence to trust a relationship hint for severity changes.
_MIN_CALIBRATION_CONFIDENCE = 0.75

# Multiplicity flip table for direction normalization: maps a multiplicity
# to the multiplicity seen from the opposite side of the relationship.
# Symmetric multiplicities map to themselves.
_FLIP_MULTIPLICITY = {
    "one-to-many": "many-to-one",
    "many-to-one": "one-to-many",
    "one-to-one": "one-to-one",
    "many-to-many": "many-to-many",
}
|
|
54
|
+
|
|
55
|
+
# ---------------------------------------------------------------------------
|
|
56
|
+
# Data model
|
|
57
|
+
# ---------------------------------------------------------------------------
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
@dataclass(frozen=True)
class QueryDiagnostic:
    """One immutable finding produced by query validation.

    ``code``, ``severity`` and ``message`` are required; the remaining
    fields are optional and default to ``None`` (or an empty tuple for
    ``evidence``).
    """

    # Identifier of the check that produced this finding.
    code: Literal[
        "missing_join_predicate",
        "fanout_risk",
        "parse_error",
        "reaggregation",
    ]
    severity: Literal["error", "warning", "info"]
    message: str
    detail: str | None = None
    recommendation: str | None = None
    confidence: float | None = None
    evidence: tuple[str, ...] = ()

    def to_dict(self) -> dict[str, str | float | list[str] | None]:
        """Return the diagnostic as a plain dict; ``evidence`` becomes a list."""
        scalar_fields = (
            "code",
            "severity",
            "message",
            "detail",
            "recommendation",
            "confidence",
        )
        payload = {name: getattr(self, name) for name in scalar_fields}
        # Added last so the key order matches the field declaration order.
        payload["evidence"] = list(self.evidence)
        return payload
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
@dataclass(frozen=True)
class TableContext:
    """Per-table schema context.

    Both fields default to a fresh empty list per instance.
    """

    # Column names describing the table's grain.
    grain_columns: list[str] = field(default_factory=list)
    # Column names making up the table's primary key.
    primary_key_columns: list[str] = field(default_factory=list)
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
@dataclass(frozen=True)
class SchemaContext:
    """Optional schema information used to enrich diagnostics.

    Defaults to an empty mapping when no table context is supplied.
    """

    # Table name -> its TableContext.
    tables: dict[str, TableContext] = field(default_factory=dict)
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
@dataclass(frozen=True)
class RelationshipHint:
    """A known relationship between two tables, used to calibrate severity.

    All five fields are required and the instance is immutable.
    """

    left_table: str
    right_table: str
    # One of: "one-to-one" | "one-to-many" | "many-to-one" | "many-to-many"
    multiplicity: str
    fanout_factor: float
    # Expected range: 0.0–1.0
    confidence: float
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
@dataclass(frozen=True)
class RelationshipContext:
    """Relationship metadata used by the query validator to calibrate severity.

    Holds zero or more relationship hints; empty by default.
    """

    # Known table-to-table relationships.
    hints: tuple[RelationshipHint, ...] = ()
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
# ---------------------------------------------------------------------------
|
|
120
|
+
# AST helpers
|
|
121
|
+
# ---------------------------------------------------------------------------
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def _source_name(node: exp.Table | exp.Subquery) -> str:
    """Return the name a source is referred to by in the query.

    The alias wins when one is present. Without an alias, a table falls back
    to its own name and a subquery to the placeholder ``"<subquery>"``.
    """
    alias = node.alias
    if alias:
        return alias
    return "<subquery>" if isinstance(node, exp.Subquery) else node.name
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def _direct_from_sources(select: exp.Select) -> list[exp.Table | exp.Subquery]:
    """Collect the table/subquery nodes that are direct children of FROM.

    Returns an empty list when no FROM clause is found. Subqueries are
    returned as-is and are not descended into.
    """
    from_clause = select.find(exp.From)
    if not from_clause:
        return []

    # The FROM clause's direct children, computed once for the membership
    # test below.
    siblings = list(from_clause.iter_expressions())

    collected: list[exp.Table | exp.Subquery] = []
    for node in siblings:
        if isinstance(node, exp.Subquery):
            collected.append(node)
            continue
        if not isinstance(node, exp.Table):
            continue
        # Keep the table unless it sits inside a subquery that is itself one
        # of this FROM clause's direct children.
        enclosing = node.find_ancestor(exp.Subquery)
        if enclosing is None or enclosing not in siblings:
            collected.append(node)
    return collected
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def _direct_join_sources(select: exp.Select) -> list[exp.Table | exp.Subquery]:
    """Collect the table/subquery nodes joined directly by this SELECT.

    Joins that have any ``Subquery`` ancestor are ignored, so sources joined
    inside nested subqueries are not reported.
    """
    # NOTE(review): the ancestor search is not bounded by ``select``, so if
    # ``select`` is itself nested in a subquery all of its joins are skipped
    # — confirm callers only pass top-level / CTE selects.
    outer_joins = [
        join
        for join in select.find_all(exp.Join)
        if not join.find_ancestor(exp.Subquery)
    ]

    collected: list[exp.Table | exp.Subquery] = []
    for join in outer_joins:
        for node in join.iter_expressions():
            if isinstance(node, exp.Subquery):
                collected.append(node)
                continue
            if isinstance(node, exp.Table):
                if node.find_ancestor(exp.Subquery) is None:
                    collected.append(node)
    return collected
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def _build_alias_map(select: exp.Select) -> dict[str, str]:
    """Build a lookup from reference name (alias, else table name) to table name.

    Only direct table sources from FROM and JOIN are included; subquery
    sources are left out. When two sources share a reference name, the later
    one (JOIN sources come after FROM sources) wins.
    """
    direct_sources = _direct_from_sources(select) + _direct_join_sources(select)
    return {
        _source_name(source): source.name
        for source in direct_sources
        if isinstance(source, exp.Table)
    }
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def _has_join(select: exp.Select) -> bool:
    """Return True when the SELECT joins at least one direct source.

    Joins located inside subqueries do not count.
    """
    joined_sources = _direct_join_sources(select)
    return len(joined_sources) > 0
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
def _has_comma_join(select: exp.Select) -> bool:
    """Return True when FROM lists two or more direct sources (comma join)."""
    from_sources = _direct_from_sources(select)
    return len(from_sources) >= 2
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def _find_cross_joins(select: exp.Select) -> list[tuple[str, str]]:
    """Find predicate-less JOINs of ``select`` and return (left_name, right_name).

    A JOIN is reported when it has neither ``ON`` nor ``USING``, is not a
    ``NATURAL`` join, and its kind is ``CROSS`` or unspecified (a bare
    ``JOIN``).  Only the JOINs attached to ``select`` itself are considered;
    a tree-wide search would also report JOINs that live inside CTE bodies,
    which are not wrapped in an ``exp.Subquery``.

    The left name is always the first direct FROM source, so for a chain of
    joins the pair names the query's base table rather than the immediately
    preceding join target.

    Returns:
        One ``(left_name, right_name)`` tuple per offending JOIN; an empty
        list when the SELECT has no direct FROM source.
    """
    pairs: list[tuple[str, str]] = []
    from_sources = _direct_from_sources(select)
    if not from_sources:
        return pairs

    left_name = _source_name(from_sources[0])

    for join in select.args.get("joins") or []:
        if join.args.get("on") or join.args.get("using"):
            continue
        # NATURAL JOIN has implicit predicates — not a cross join
        if (join.args.get("method") or "").upper() == "NATURAL":
            continue
        kind = (join.args.get("kind") or "").upper()
        if kind not in ("CROSS", ""):
            continue
        # The joined table/subquery is an immediate child of the Join node;
        # report the first one found.
        for child in join.iter_expressions():
            if isinstance(child, (exp.Table, exp.Subquery)):
                pairs.append((left_name, _source_name(child)))
                break
    return pairs
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
def _get_aggregate_functions(select: exp.Select) -> list[exp.AggFunc]:
    """Get all aggregate function calls in SELECT and HAVING clauses.

    The HAVING clause is read from ``select``'s own ``having`` arg.  Searching
    the whole tree for an ``exp.Having`` node would also match the HAVING of
    a nested subquery or CTE and wrongly attribute its aggregates to this
    SELECT.

    Note: aggregates are collected with ``find_all`` on each projection, so
    any ``exp.AggFunc`` nested anywhere inside a projection is included.

    Returns:
        Aggregate nodes from the projection list first, then from HAVING.
    """
    aggs: list[exp.AggFunc] = []
    for expr in select.expressions:
        aggs.extend(expr.find_all(exp.AggFunc))
    # Direct arg lookup keeps this scoped to the current SELECT.
    having = select.args.get("having")
    if having:
        aggs.extend(having.find_all(exp.AggFunc))
    return aggs
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
def _column_table_ref(col: exp.Column) -> str | None:
|
|
231
|
+
"""Get the table reference (alias or name) from a Column expression."""
|
|
232
|
+
return col.table if col.table else None
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
def _resolve_table(ref: str, alias_map: dict[str, str]) -> str:
|
|
236
|
+
"""Resolve a table reference through the alias map."""
|
|
237
|
+
return alias_map.get(ref, ref)
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
# ---------------------------------------------------------------------------
|
|
241
|
+
# Relationship hint matching and calibration
|
|
242
|
+
# ---------------------------------------------------------------------------
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
def _find_matching_hint(
|
|
246
|
+
query_tables: set[str],
|
|
247
|
+
from_tables: set[str],
|
|
248
|
+
hints: tuple[RelationshipHint, ...],
|
|
249
|
+
) -> tuple[RelationshipHint, str, bool] | None:
|
|
250
|
+
"""Find the riskiest matching hint, direction-normalized multiplicity, and flip flag.
|
|
251
|
+
|
|
252
|
+
MVP limitation: returns only one hint. For multi-join queries (3+ tables),
|
|
253
|
+
only one table pair gets calibrated. Selects the riskiest multiplicity
|
|
254
|
+
first (error > warning > info), then highest confidence as tiebreaker.
|
|
255
|
+
|
|
256
|
+
Matches when both sides of a hint appear in the query's resolved table set.
|
|
257
|
+
Flips multiplicity when the hint's left_table is not in FROM (i.e. tables
|
|
258
|
+
are reversed relative to the query's join direction).
|
|
259
|
+
|
|
260
|
+
When both or neither hint sides are in FROM (e.g. comma joins), no flip
|
|
261
|
+
occurs — there's no clear directionality to normalize against.
|
|
262
|
+
|
|
263
|
+
Returns (hint, normalized_multiplicity, was_flipped) or None.
|
|
264
|
+
"""
|
|
265
|
+
tables_lower = {t.lower() for t in query_tables}
|
|
266
|
+
from_lower = {t.lower() for t in from_tables}
|
|
267
|
+
best: tuple[RelationshipHint, str, bool] | None = None
|
|
268
|
+
best_risk: int = -1 # Higher = riskier
|
|
269
|
+
for hint in hints:
|
|
270
|
+
left = hint.left_table.lower()
|
|
271
|
+
right = hint.right_table.lower()
|
|
272
|
+
if left not in tables_lower or right not in tables_lower:
|
|
273
|
+
continue
|
|
274
|
+
mult = hint.multiplicity
|
|
275
|
+
flipped = False
|
|
276
|
+
if right in from_lower and left not in from_lower:
|
|
277
|
+
mult = _FLIP_MULTIPLICITY.get(mult, mult)
|
|
278
|
+
flipped = True
|
|
279
|
+
severity = _severity_for_multiplicity(mult, hint.confidence, hint.fanout_factor)
|
|
280
|
+
risk = {"error": 2, "warning": 1, "info": 0}.get(severity, 1)
|
|
281
|
+
# Prefer riskiest; break ties by confidence.
|
|
282
|
+
if risk > best_risk or (
|
|
283
|
+
risk == best_risk and (best is None or hint.confidence > best[0].confidence)
|
|
284
|
+
):
|
|
285
|
+
best = (hint, mult, flipped)
|
|
286
|
+
best_risk = risk
|
|
287
|
+
return best
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
def _severity_for_multiplicity(
    mult: str,
    confidence: float,
    fanout_factor: float,
) -> Literal["error", "warning", "info"]:
    """Map a relationship's multiplicity, confidence and fanout to a severity.

    Rules, evaluated in order:
      * confidence below ``_MIN_CALIBRATION_CONFIDENCE`` → "warning"
      * one-to-one or many-to-one → "info"
      * many-to-many → "error"
      * one-to-many with fanout above ``HIGH_FANOUT_THRESHOLD`` → "error"
      * anything else → "warning"
    """
    if confidence < _MIN_CALIBRATION_CONFIDENCE:
        return "warning"
    if mult in ("one-to-one", "many-to-one"):
        return "info"
    if mult == "many-to-many":
        return "error"
    high_fanout = mult == "one-to-many" and fanout_factor > HIGH_FANOUT_THRESHOLD
    return "error" if high_fanout else "warning"
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
def _message_for_multiplicity(
|
|
310
|
+
mult: str,
|
|
311
|
+
from_table: str,
|
|
312
|
+
join_table: str,
|
|
313
|
+
fanout_factor: float,
|
|
314
|
+
severity: Literal["error", "warning", "info"],
|
|
315
|
+
) -> tuple[str, str]:
|
|
316
|
+
"""Return (message, recommendation) grounded in known multiplicity.
|
|
317
|
+
|
|
318
|
+
``from_table`` / ``join_table`` are query-oriented: the FROM-clause side
|
|
319
|
+
and the JOIN-clause side after direction normalization.
|
|
320
|
+
"""
|
|
321
|
+
lt, rt = from_table, join_table
|
|
322
|
+
ff = fanout_factor
|
|
323
|
+
if mult == "one-to-one":
|
|
324
|
+
return (
|
|
325
|
+
"Aggregation over joined tables — 1:1 join confirmed, no inflation risk",
|
|
326
|
+
"Join is 1:1 — aggregation is safe. "
|
|
327
|
+
"Verify the 1:1 assumption holds as data evolves.",
|
|
328
|
+
)
|
|
329
|
+
if mult == "many-to-one":
|
|
330
|
+
return (
|
|
331
|
+
"Aggregation over joined tables — N:1 join, no row multiplication",
|
|
332
|
+
f"Join {lt} → {rt} is N:1 (dimension lookup). "
|
|
333
|
+
f"Aggregates on {lt} are safe. "
|
|
334
|
+
f"Aggregates on {rt} columns may repeat values — "
|
|
335
|
+
f"verify they are grouping keys, not measures.",
|
|
336
|
+
)
|
|
337
|
+
if mult == "many-to-many":
|
|
338
|
+
if severity == "error":
|
|
339
|
+
return (
|
|
340
|
+
"N:M join with aggregation — results almost certainly inflated",
|
|
341
|
+
f"Use a bridge table or pre-aggregate each side to the correct "
|
|
342
|
+
f"grain before joining {lt} ↔ {rt}.",
|
|
343
|
+
)
|
|
344
|
+
return (
|
|
345
|
+
"Possible N:M join with aggregation — review join relationship",
|
|
346
|
+
f"Verify the join relationship between {lt} and {rt}. "
|
|
347
|
+
f"If N:M, pre-aggregate each side before joining.",
|
|
348
|
+
)
|
|
349
|
+
if mult == "one-to-many":
|
|
350
|
+
if severity == "error":
|
|
351
|
+
return (
|
|
352
|
+
f"1:N join with aggregation and high fanout "
|
|
353
|
+
f"({ff:.1f}x) — results likely inflated",
|
|
354
|
+
f"Pre-aggregate {rt} to the {lt} grain before joining "
|
|
355
|
+
f"({ff:.1f}x average row multiplication).",
|
|
356
|
+
)
|
|
357
|
+
return (
|
|
358
|
+
f"1:N join with aggregation ({ff:.1f}x fanout) "
|
|
359
|
+
f"— verify aggregate correctness",
|
|
360
|
+
f"Pre-aggregate {rt} to the {lt} join key grain before joining, "
|
|
361
|
+
f"or verify aggregation handles the {ff:.1f}x row expansion correctly.",
|
|
362
|
+
)
|
|
363
|
+
# Unknown multiplicity
|
|
364
|
+
return (
|
|
365
|
+
"Aggregation over joined tables may inflate results",
|
|
366
|
+
"Pre-aggregate each table to the join key grain "
|
|
367
|
+
"before joining, or verify the join is 1:1 / N:1.",
|
|
368
|
+
)
|
|
369
|
+
|
|
370
|
+
|
|
371
|
+
def _calibrate_fanout(
    hint: RelationshipHint,
    mult: str,
    flipped: bool,
    detail_parts: list[str],
) -> QueryDiagnostic:
    """Build the ``fanout_risk`` diagnostic for a matched relationship hint.

    The hint's table names are swapped when ``flipped`` is set so that the
    wording follows the query's FROM → JOIN direction.  Severity, message and
    recommendation are derived from ``mult`` together with the hint's
    confidence and fanout factor; ``detail_parts`` are joined with ". " (or
    become None when empty).
    """
    if flipped:
        from_table, join_table = hint.right_table, hint.left_table
    else:
        from_table, join_table = hint.left_table, hint.right_table

    relationship_summary = (
        f"Relationship: {from_table} ↔ {join_table} "
        f"({mult}, {hint.fanout_factor:.1f}x fanout, "
        f"confidence {hint.confidence:.0%})"
    )
    level = _severity_for_multiplicity(mult, hint.confidence, hint.fanout_factor)
    headline, advice = _message_for_multiplicity(
        mult, from_table, join_table, hint.fanout_factor, level
    )
    joined_detail = ". ".join(detail_parts) if detail_parts else None
    return QueryDiagnostic(
        code="fanout_risk",
        severity=level,
        message=headline,
        detail=joined_detail,
        recommendation=advice,
        confidence=hint.confidence,
        # Evidence is passed as a one-element tuple.
        # NOTE(review): confirm QueryDiagnostic.evidence expects a tuple, not a str.
        evidence=(relationship_summary,),
    )
|
|
399
|
+
|
|
400
|
+
|
|
401
|
+
# ---------------------------------------------------------------------------
|
|
402
|
+
# Diagnostic detectors
|
|
403
|
+
# ---------------------------------------------------------------------------
|
|
404
|
+
|
|
405
|
+
|
|
406
|
+
def _detect_missing_join_predicates(
    select: exp.Select,
) -> list[QueryDiagnostic]:
    """Report joins that carry no predicate.

    Emits one ``missing_join_predicate`` error when the FROM clause lists
    several direct sources (comma join), followed by one error per pair
    returned by ``_find_cross_joins``.
    """
    findings: list[QueryDiagnostic] = []

    # Comma joins: more than one direct source in FROM.
    if _has_comma_join(select):
        joined_names = ", ".join(
            _source_name(source) for source in _direct_from_sources(select)
        )
        findings.append(
            QueryDiagnostic(
                code="missing_join_predicate",
                severity="error",
                message=(
                    f"Implicit cross join between {joined_names} — "
                    f"no explicit join predicate"
                ),
                detail=(
                    f"Tables {joined_names} appear in FROM without "
                    f"a JOIN ... ON clause. This produces a cartesian product."
                ),
                recommendation=(
                    "Use explicit JOIN with ON clause to specify "
                    "the join relationship."
                ),
            )
        )

    # Predicate-less JOIN clauses.
    findings.extend(
        QueryDiagnostic(
            code="missing_join_predicate",
            severity="error",
            message=f"CROSS JOIN between {left} and {right} — no join predicate",
            detail=f"CROSS JOIN produces a cartesian product of {left} × {right}.",
            recommendation=(
                "If intentional, document why. Otherwise, use JOIN with ON clause."
            ),
        )
        for left, right in _find_cross_joins(select)
    )
    return findings
|
|
443
|
+
|
|
444
|
+
|
|
445
|
+
def _detect_fanout_risk(
    select: exp.Select,
    schema_context: SchemaContext | None,
    relationship_context: RelationshipContext | None,
) -> list[QueryDiagnostic]:
    """Flag aggregation performed over joined tables (structural fanout risk).

    Nothing is reported unless the query both joins (explicit JOIN or comma
    join) and aggregates (SELECT or HAVING).  Given that, a finding is
    produced when one of these structural signals is present:

      a. aggregates reference columns through 2+ distinct source refs
         (aliases are compared, so self-joins count),
      b. ``COUNT(*)`` is used, or
      c. an aggregate uses an unqualified column while 2+ sources are in scope.

    Without a structural signal, a finding is still produced when a matching
    relationship hint normalises to something other than 1:1 / N:1.

    When a hint matches, the finding is calibrated from it:
      - 1:1 → info
      - N:1 → info
      - 1:N + high fanout → error
      - N:M → error (with sufficient confidence)
    Otherwise a generic warning is returned.
    """
    if not (_has_join(select) or _has_comma_join(select)):
        return []

    aggregates = _get_aggregate_functions(select)
    if not aggregates:
        return []

    alias_map = _build_alias_map(select)
    source_count = len(_direct_from_sources(select)) + len(
        _direct_join_sources(select)
    )

    # COUNT(*) is always risky once a join is involved.
    count_star_used = False
    for agg in aggregates:
        if isinstance(agg, exp.Count) and isinstance(agg.this, exp.Star):
            count_star_used = True
            break

    # Source refs (aliases) drive multi-table detection so that self-joins
    # such as e1.salary / e2.salary are caught; resolved table names are kept
    # only for the detail text.
    referenced_aliases: set[str] = set()
    referenced_tables: set[str] = set()
    saw_unqualified_column = False
    for agg in aggregates:
        for column in agg.find_all(exp.Column):
            qualifier = _column_table_ref(column)
            if not qualifier:
                saw_unqualified_column = True
                continue
            referenced_aliases.add(qualifier)
            referenced_tables.add(_resolve_table(qualifier, alias_map))

    spans_multiple_sources = len(referenced_aliases) > 1
    # Unqualified aggregate columns with 2+ sources in scope: unclear ownership.
    ownership_ambiguous = saw_unqualified_column and source_count >= 2

    def lookup_hint() -> tuple[RelationshipHint, str, bool] | None:
        """Match hints against resolved table names (never aliases)."""
        if not relationship_context or not relationship_context.hints:
            return None
        resolved_tables = set(alias_map.values())
        # FROM-side tables are needed for direction normalization.
        from_side = {
            source.name
            for source in _direct_from_sources(select)
            if isinstance(source, exp.Table)
        }
        return _find_matching_hint(
            resolved_tables, from_side, relationship_context.hints
        )

    # --- Relationship-triggered detection path ---
    # No structural signal, but a known risky relationship (1:N or N:M) still
    # inflates a single-table aggregate, e.g. SUM(o.amount) with
    # JOIN line_items.
    if not (count_star_used or spans_multiple_sources or ownership_ambiguous):
        matched = lookup_hint()
        if matched is None:
            return []
        hint, mult, flipped = matched
        # 1:1 and N:1 are safe, so stay silent.
        if mult in ("one-to-one", "many-to-one"):
            return []
        return [
            _calibrate_fanout(
                hint,
                mult,
                flipped,
                [
                    "Single-table aggregation after row-multiplying join — "
                    "aggregate values are inflated by the join fanout"
                ],
            )
        ]

    # --- Structural path: assemble the detail text ---
    details: list[str] = []
    if count_star_used:
        details.append("COUNT(*) is inflated by row multiplication from JOIN")
    if spans_multiple_sources:
        if len(referenced_tables) == 1:
            (only_table,) = referenced_tables
            alias_list = ", ".join(sorted(referenced_aliases))
            details.append(
                f"Aggregate expressions reference columns from "
                f"aliases {alias_list} (all from table {only_table})"
            )
        else:
            table_list = ", ".join(sorted(referenced_tables))
            details.append(
                f"Aggregate expressions reference columns from tables: {table_list}"
            )
    if ownership_ambiguous:
        details.append(
            "Unqualified columns in aggregate expressions with multiple "
            "tables in scope — column ownership is ambiguous"
        )
    if schema_context:
        for table_name in sorted(referenced_tables):
            table_ctx = schema_context.tables.get(table_name)
            if table_ctx and table_ctx.grain_columns:
                grain = ", ".join(table_ctx.grain_columns)
                details.append(f"Table '{table_name}' has grain: {grain}")

    # --- Relationship-based severity calibration ---
    matched = lookup_hint()
    if matched is not None:
        hint, mult, flipped = matched
        # The calibrated finding replaces the generic structural warning
        # and carries the collected details with it.
        return [_calibrate_fanout(hint, mult, flipped, details)]

    # No relationship context or no matching hint: generic structural finding.
    return [
        QueryDiagnostic(
            code="fanout_risk",
            severity="warning",
            message="Aggregation over joined tables may inflate results",
            detail=". ".join(details) if details else None,
            recommendation=(
                "Pre-aggregate each table to the join key grain "
                "before joining, or verify the join is 1:1 / N:1."
            ),
        )
    ]
|
|
599
|
+
|
|
600
|
+
|
|
601
|
+
# ---------------------------------------------------------------------------
|
|
602
|
+
# Propagation-backed re-aggregation detection
|
|
603
|
+
# ---------------------------------------------------------------------------
|
|
604
|
+
|
|
605
|
+
|
|
606
|
+
def _select_output_columns(select: exp.Select) -> dict[str, bool]:
    """Describe which output columns of ``select`` come from an aggregate.

    Returns a dict keyed by lower-cased output name (the alias, or the column
    name for a bare column) whose value is True when the projection contains
    an aggregate function that is not part of a window expression.
    Projections that are neither aliased nor bare columns (``*`` included)
    are not tracked.  Any output whose name matches a GROUP BY column is
    forced to False.
    """
    # Names of plain columns used in GROUP BY.
    grouped: set[str] = set()
    group_clause = select.args.get("group")
    if group_clause:
        grouped = {
            item.name.lower()
            for item in group_clause.expressions
            if isinstance(item, exp.Column)
        }

    result: dict[str, bool] = {}
    for projection in select.expressions:
        alias_name = projection.alias if isinstance(projection, exp.Alias) else None
        if alias_name:
            body = projection.this
            if body.find(exp.AggFunc):
                # An aggregate under a Window node is evaluated per row, so
                # it does not collapse rows and is not aggregate-derived.
                result[alias_name.lower()] = not body.find(exp.Window)
            else:
                result[alias_name.lower()] = False
        elif isinstance(projection, exp.Column):
            result[projection.name.lower()] = False
        # Everything else, SELECT * included, cannot be tracked here.

    # GROUP BY keys are never treated as aggregate-derived.
    for key in grouped & result.keys():
        result[key] = False
    return result
|
|
641
|
+
|
|
642
|
+
|
|
643
|
+
def _propagate_aggregate_columns(
    parsed: exp.Expression,
) -> dict[str, dict[str, bool]]:
    """Compute, per named source, which output columns are aggregate-derived.

    CTEs are visited in declaration order so that a later CTE can inherit
    lineage from an earlier one; afterwards the inline subqueries in the
    FROM / JOIN clauses of the main SELECT are added.  Keys are lower-cased
    source names (an unaliased subquery is stored as ``"<subquery>"``) and
    values are the ``{column_name: is_aggregate_derived}`` maps.
    """
    lineage: dict[str, dict[str, bool]] = {}

    with_clause = parsed.find(exp.With)
    for cte in with_clause.expressions if with_clause else []:
        if not isinstance(cte, exp.CTE):
            continue
        cte_name = cte.alias
        body = cte.find(exp.Select)
        if not (cte_name and body):
            continue
        columns = _select_output_columns(body)
        # Pass-through columns pick up aggregate status from sources that
        # are already recorded.
        _inherit_aggregate_lineage(body, columns, lineage)
        lineage[cte_name.lower()] = columns

    if not isinstance(parsed, exp.Select):
        return lineage

    # Inline subqueries used directly by the main SELECT.
    for source in _direct_from_sources(parsed) + _direct_join_sources(parsed):
        if not isinstance(source, exp.Subquery):
            continue
        key = (source.alias or "<subquery>").lower()
        body = source.find(exp.Select)
        if not body:
            continue
        columns = _select_output_columns(body)
        _inherit_aggregate_lineage(body, columns, lineage)
        lineage[key] = columns

    return lineage
|
|
682
|
+
|
|
683
|
+
|
|
684
|
+
def _inherit_aggregate_lineage(
    select: exp.Select,
    outputs: dict[str, bool],
    agg_map: dict[str, dict[str, bool]],
) -> None:
    """Mark pass-through columns as aggregate-derived, updating ``outputs`` in place.

    For every projection that is a bare column, or an alias wrapping a bare
    column, the column is looked up in the upstream sources of ``select``
    that already appear in ``agg_map``.  If an upstream source marks it as
    aggregate-derived, the corresponding entry in ``outputs`` becomes True.
    A qualified column is checked against its own source only; otherwise
    every known upstream source is tried.

    Limitations:
    - ``SELECT *`` pass-throughs are not tracked (same as
      ``_select_output_columns``).
    """
    # Reference name (alias or table name) → agg_map key, for table sources
    # whose name is already present in agg_map.
    upstream: dict[str, str] = {}
    for source in _direct_from_sources(select) + _direct_join_sources(select):
        if not isinstance(source, exp.Table):
            continue
        target = source.name.lower()
        if target in agg_map:
            upstream[(source.alias or source.name).lower()] = target

    if not upstream:
        return

    def is_aggregate_upstream(column: exp.Column) -> bool:
        """Return True when any candidate upstream source marks the column."""
        column_name = column.name.lower()
        qualifier = (column.table or "").lower()
        if qualifier and qualifier in upstream:
            candidates = [upstream[qualifier]]
        else:
            candidates = list(upstream.values())
        return any(agg_map.get(key, {}).get(column_name) for key in candidates)

    for projection in select.expressions:
        if isinstance(projection, exp.Column):
            output_name, origin = projection.name.lower(), projection
        elif isinstance(projection, exp.Alias):
            output_name, origin = projection.alias.lower(), projection.this
        else:
            continue
        # Already flagged as aggregate-derived: nothing to inherit.
        if outputs.get(output_name):
            continue
        # Only simple column references count as pass-throughs.
        if isinstance(origin, exp.Column) and is_aggregate_upstream(origin):
            outputs[output_name] = True
|
|
744
|
+
|
|
745
|
+
|
|
746
|
+
def _detect_reaggregation(
    select: exp.Select, parsed: exp.Expression
) -> list[QueryDiagnostic]:
    """Warn when the outer query aggregates a column that is already aggregated.

    Aggregate lineage for CTEs and inline subqueries is obtained from
    ``_propagate_aggregate_columns(parsed)``.  Each column referenced inside
    an aggregate of ``select`` (SELECT or HAVING) is then resolved to its
    source; if that source marks the column as aggregate-derived, a
    ``reaggregation`` warning is produced.  Each (source, column) pair is
    reported once.
    """
    lineage = _propagate_aggregate_columns(parsed)
    if not lineage:
        return []

    # Reference name used in the query → key in ``lineage``.  CTE references
    # show up as Table nodes, so they are resolved by table name.
    resolved: dict[str, str] = {}
    for source in _direct_from_sources(select) + _direct_join_sources(select):
        label = _source_name(source).lower()
        if isinstance(source, exp.Table):
            target = source.name.lower()
        elif isinstance(source, exp.Subquery):
            target = label
        else:
            continue
        if target in lineage:
            resolved[label] = target

    if not resolved:
        return []

    aggregates = _get_aggregate_functions(select)
    if not aggregates:
        return []

    findings: list[QueryDiagnostic] = []
    reported: set[tuple[str, str]] = set()

    for agg in aggregates:
        for column in agg.find_all(exp.Column):
            column_key = column.name.lower()
            qualifier = (column.table or "").lower()
            if qualifier and qualifier in resolved:
                # Qualified column: exactly one source to check.
                owners = [resolved[qualifier]]
            else:
                # Unqualified column: flag only when every source exposing
                # the column marks it aggregate-derived; otherwise ownership
                # is ambiguous and the column is skipped.
                owners = [
                    name
                    for name in resolved.values()
                    if column_key in lineage.get(name, {})
                ]
                if not owners or not all(lineage[name][column_key] for name in owners):
                    continue

            for owner in owners:
                if not lineage.get(owner, {}).get(column_key):
                    continue
                marker = (owner, column_key)
                if marker in reported:
                    continue
                reported.add(marker)
                func_label = type(agg).__name__.upper()
                findings.append(
                    QueryDiagnostic(
                        code="reaggregation",
                        severity="warning",
                        message=(
                            f"{func_label}({column.name}) re-aggregates an "
                            f"already aggregate-derived column"
                        ),
                        detail=(
                            f"Column '{column.name}' is produced by an aggregate "
                            f"in source '{owner}'. Applying {func_label} "
                            f"on top is likely a statistical error."
                        ),
                        recommendation=(
                            "Review whether the outer aggregation is correct. "
                            "Summing a pre-summed column or averaging an "
                            "already-averaged column usually produces "
                            "incorrect results."
                        ),
                    )
                )
    return findings
|
|
831
|
+
|
|
832
|
+
|
|
833
|
+
# ---------------------------------------------------------------------------
|
|
834
|
+
# Public API
|
|
835
|
+
# ---------------------------------------------------------------------------
|
|
836
|
+
|
|
837
|
+
|
|
838
|
+
def parse_inline_suppressions(sql: str) -> set[str]:
    """Collect diagnostic codes suppressed via ``-- dft:ignore`` comments.

    Recognised forms:
    - ``-- dft:ignore fanout_risk`` — suppress one code
    - ``-- dft:ignore fanout_risk reaggregation`` — suppress several codes
    - ``-- dft:ignore`` — suppress everything (recorded as ``"*"``)

    Note: each line is scanned with ``_DFT_IGNORE_RE`` without any SQL
    parsing, so the marker is also honoured when it appears inside a string
    literal.  This is consistent with how sqlfluff and other linters handle
    noqa.
    """
    found: set[str] = set()
    for raw_line in sql.splitlines():
        hit = _DFT_IGNORE_RE.search(raw_line.strip())
        if not hit:
            continue
        listed = hit.group(1).strip().split()
        if listed:
            found.update(listed)
        else:
            # A bare marker with no codes means "suppress all".
            found.add("*")
    return found
|
|
860
|
+
|
|
861
|
+
|
|
862
|
+
def _apply_suppression(
|
|
863
|
+
diags: list[QueryDiagnostic],
|
|
864
|
+
suppressed_codes: set[str],
|
|
865
|
+
) -> tuple[list[QueryDiagnostic], list[QueryDiagnostic]]:
|
|
866
|
+
"""Split diagnostics into active and suppressed lists.
|
|
867
|
+
|
|
868
|
+
Unsuppressible codes (e.g. parse_error) are never suppressed.
|
|
869
|
+
"""
|
|
870
|
+
if not suppressed_codes:
|
|
871
|
+
return diags, []
|
|
872
|
+
blanket = "*" in suppressed_codes
|
|
873
|
+
active: list[QueryDiagnostic] = []
|
|
874
|
+
suppressed: list[QueryDiagnostic] = []
|
|
875
|
+
for d in diags:
|
|
876
|
+
if d.code in _UNSUPPRESSIBLE_CODES:
|
|
877
|
+
active.append(d)
|
|
878
|
+
elif blanket or d.code in suppressed_codes:
|
|
879
|
+
suppressed.append(d)
|
|
880
|
+
else:
|
|
881
|
+
active.append(d)
|
|
882
|
+
return active, suppressed
|
|
883
|
+
|
|
884
|
+
|
|
885
|
+
# Typing overloads: ``return_suppressed`` selects the shape of the result.
# Omitted or ``False`` -> a single list of active diagnostics.
@overload
def validate_query(
    sql: str,
    *,
    dialect: str | None = ...,
    schema_context: SchemaContext | None = ...,
    relationship_context: RelationshipContext | None = ...,
    suppress: set[str] | None = ...,
    return_suppressed: Literal[False] = ...,
) -> list[QueryDiagnostic]: ...


# ``return_suppressed=True`` -> an ``(active, suppressed)`` pair of lists.
@overload
def validate_query(
    sql: str,
    *,
    dialect: str | None = ...,
    schema_context: SchemaContext | None = ...,
    relationship_context: RelationshipContext | None = ...,
    suppress: set[str] | None = ...,
    return_suppressed: Literal[True],
) -> tuple[list[QueryDiagnostic], list[QueryDiagnostic]]: ...


def validate_query(
    sql: str,
    *,
    dialect: str | None = None,
    schema_context: SchemaContext | None = None,
    relationship_context: RelationshipContext | None = None,
    suppress: set[str] | None = None,
    return_suppressed: bool = False,
) -> list[QueryDiagnostic] | tuple[list[QueryDiagnostic], list[QueryDiagnostic]]:
    """Validate a SQL query and return structural diagnostics.

    Empty input and SQL that sqlglot cannot tokenize or parse are reported
    as a single ``parse_error`` diagnostic instead of raising. These early
    results are returned before any suppression is applied, so their
    suppressed list is always empty.

    Args:
        sql: SQL query string to validate.
        dialect: Optional SQLGlot dialect name (e.g. "duckdb", "bigquery").
        schema_context: Optional schema metadata to enrich diagnostics.
        relationship_context: Optional relationship metadata for severity
            calibration. When provided, fanout_risk findings are refined
            using known multiplicity, fanout factor, and confidence.
        suppress: Optional set of diagnostic codes to suppress externally
            (e.g. from YAML ``ignore`` or ``meta.yaml`` lint config).
            These are merged with the codes found in SQL-inline
            suppression comments.
        return_suppressed: If True, return a tuple of
            (active_diagnostics, suppressed_diagnostics).

    Returns:
        List of QueryDiagnostic findings (or tuple if return_suppressed=True).
        Empty list means no issues found. Statements that do not parse to
        a ``SELECT`` expression are not analysed and yield no findings.
    """
    if not sql or not sql.strip():
        result = [
            QueryDiagnostic(
                code="parse_error",
                severity="error",
                message="Empty SQL query",
            )
        ]
        return (result, []) if return_suppressed else result

    # Collect suppression codes from SQL-inline comments + external callers
    suppressed_codes = parse_inline_suppressions(sql)
    if suppress:
        suppressed_codes.update(suppress)

    try:
        parsed = sqlglot.parse_one(sql, read=dialect)
    except (sqlglot.errors.ParseError, sqlglot.errors.TokenError) as e:
        # TokenError (e.g. an unterminated string literal) is raised by the
        # tokenizer and is not a ParseError subclass; catch it as well so
        # malformed SQL always surfaces as a diagnostic, never an exception.
        result = [
            QueryDiagnostic(
                code="parse_error",
                severity="error",
                message=f"SQL parse error: {e}",
            )
        ]
        return (result, []) if return_suppressed else result

    if not isinstance(parsed, exp.Select):
        # Only validate SELECT statements for now
        return ([], []) if return_suppressed else []

    # Run each structural detector over the parsed statement.
    diags: list[QueryDiagnostic] = []
    diags.extend(_detect_missing_join_predicates(parsed))
    diags.extend(_detect_fanout_risk(parsed, schema_context, relationship_context))
    # NOTE(review): the statement is passed for both arguments; presumably
    # they are the root and the current node - confirm against the helper.
    diags.extend(_detect_reaggregation(parsed, parsed))

    # Partition findings into those still reported and those silenced.
    active, suppressed_diags = _apply_suppression(diags, suppressed_codes)
    if return_suppressed:
        return active, suppressed_diags
    return active
|