sql-chatbot-rails 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE +21 -0
  3. data/README.md +20 -0
  4. data/app/controllers/sql_chatbot/chatbot_controller.rb +158 -0
  5. data/config/routes.rb +11 -0
  6. data/lib/generators/sql_chatbot/install_generator.rb +25 -0
  7. data/lib/generators/sql_chatbot/templates/initializer.rb +22 -0
  8. data/lib/sql_chatbot/auth/cors.rb +35 -0
  9. data/lib/sql_chatbot/auth/jwt.rb +34 -0
  10. data/lib/sql_chatbot/configuration.rb +58 -0
  11. data/lib/sql_chatbot/engine.rb +23 -0
  12. data/lib/sql_chatbot/grammar/count_renderer.rb +113 -0
  13. data/lib/sql_chatbot/grammar/entity_candidates.rb +210 -0
  14. data/lib/sql_chatbot/grammar/intent_extractor.rb +191 -0
  15. data/lib/sql_chatbot/grammar/list_renderer.rb +50 -0
  16. data/lib/sql_chatbot/grammar/miss_logger.rb +17 -0
  17. data/lib/sql_chatbot/grammar/modifiers.rb +145 -0
  18. data/lib/sql_chatbot/grammar/primitives.rb +69 -0
  19. data/lib/sql_chatbot/grammar/programmatic_renderer.rb +258 -0
  20. data/lib/sql_chatbot/grammar/registry.rb +66 -0
  21. data/lib/sql_chatbot/grammar/sanity_check.rb +37 -0
  22. data/lib/sql_chatbot/grammar/template_compiler.rb +179 -0
  23. data/lib/sql_chatbot/llm/client.rb +87 -0
  24. data/lib/sql_chatbot/prompts/answer.rb +157 -0
  25. data/lib/sql_chatbot/prompts/classify.rb +59 -0
  26. data/lib/sql_chatbot/prompts/generate_sql.rb +88 -0
  27. data/lib/sql_chatbot/services/code_indexer.rb +337 -0
  28. data/lib/sql_chatbot/services/grammar_pipeline.rb +45 -0
  29. data/lib/sql_chatbot/services/model_introspector.rb +152 -0
  30. data/lib/sql_chatbot/services/orchestrator.rb +635 -0
  31. data/lib/sql_chatbot/services/registry_builder.rb +385 -0
  32. data/lib/sql_chatbot/services/route_introspector.rb +118 -0
  33. data/lib/sql_chatbot/services/schema_service.rb +884 -0
  34. data/lib/sql_chatbot/services/sql_executor.rb +81 -0
  35. data/lib/sql_chatbot/version.rb +5 -0
  36. data/lib/sql_chatbot_rails.rb +91 -0
  37. data/vendor/assets/widget.js +53 -0
  38. metadata +180 -0
@@ -0,0 +1,69 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "sql_chatbot/grammar/registry"
4
+
5
module SqlChatbot
  module Grammar
    # Builds the base SELECT statement for each grammar primitive (COUNT, LIST,
    # SUM, AVG, MIN_MAX, TOP_N, RANK). Identifiers are always emitted through
    # `q`/`qc`; the only non-identifier values interpolated into the SQL text
    # (sort direction and LIMIT) are validated before use.
    module Primitives
      PREFERRED_DISPLAY_FIELDS = %w[id name title label email].freeze
      SORT_DIRECTIONS = %w[ASC DESC].freeze
      MIN_MAX_FUNCTIONS = %w[MIN MAX].freeze
      DEFAULT_TOP_N = 10

      # Quote a single SQL identifier — wraps in double quotes and escapes
      # embedded quotes. Prevents PG reserved-word collisions ("user", "order"
      # etc.) which silently resolve to functions and corrupt counts.
      def self.q(name)
        %("#{name.to_s.gsub('"', '""')}")
      end

      # Qualified column reference: "table"."column"
      def self.qc(table, col)
        "#{q(table)}.#{q(col)}"
      end

      # Returns the SQL string for `primitive` against `entity`.
      #
      # primitive  - Symbol or String primitive name (:COUNT, "TOP_N", ...).
      # entity     - object responding to `table`, `fields` and (for TOP_N)
      #              `ranking_candidates`.
      # field      - field name for SUM / AVG / MIN_MAX; must be a key of
      #              `entity.fields`.
      # which      - MIN or MAX (Symbol or String, any case) for MIN_MAX.
      # n          - row limit for TOP_N; defaults to DEFAULT_TOP_N.
      # rank_field - ordering column for TOP_N / RANK.
      # direction  - "asc" or "desc" for TOP_N; defaults to "desc".
      # group_by   - partition column for RANK.
      #
      # Raises RuntimeError when a required argument is missing or invalid, or
      # when the primitive is unknown.
      def self.build(primitive:, entity:, field: nil, which: nil, n: nil, rank_field: nil, direction: nil, group_by: nil)
        t = q(entity.table)
        case primitive.to_s
        when "COUNT"
          "SELECT COUNT(*) FROM #{t}"
        when "LIST"
          "SELECT #{display_column_list(entity)} FROM #{t}"
        when "SUM"
          require_field!(entity, field, "SUM")
          "SELECT SUM(#{qc(entity.table, field)}) FROM #{t}"
        when "AVG"
          require_field!(entity, field, "AVG")
          "SELECT ROUND(AVG(#{qc(entity.table, field)}), 2) FROM #{t}"
        when "MIN_MAX"
          require_field!(entity, field, "MIN_MAX")
          fn = which.to_s.upcase
          raise "MIN_MAX requires which" unless MIN_MAX_FUNCTIONS.include?(fn)
          "SELECT #{fn}(#{qc(entity.table, field)}) FROM #{t}"
        when "TOP_N"
          rank = rank_field || entity.ranking_candidates.first
          raise "TOP_N requires rankField" unless rank
          limit = top_n_limit(n)
          dir = sort_direction(direction)
          ranked = qc(entity.table, rank)
          # V1.3-V: NULLS LAST in both directions. PG default for DESC is
          # NULLS FIRST, which surfaced a NULL-rank row for 2BN's
          # "biggest review by rating" — answer LLM rendered "N/A".
          # NULL rank is never the answer the user wants.
          "SELECT #{display_column_list(entity)}, #{ranked} FROM #{t} ORDER BY #{ranked} #{dir} NULLS LAST LIMIT #{limit}"
        when "RANK"
          raise "RANK requires rankField and groupBy" unless rank_field && group_by
          "SELECT #{t}.*, DENSE_RANK() OVER (PARTITION BY #{qc(entity.table, group_by)} ORDER BY #{qc(entity.table, rank_field)} DESC) AS rank FROM #{t}"
        else
          raise "unknown primitive #{primitive}"
        end
      end

      # Column names to show for a row: the preferred display fields that the
      # entity has, otherwise its first four fields. Both branches return the
      # field's `column` (falling back to the field key when a field carries no
      # column), so the result can always be quoted as SQL identifiers.
      def self.pick_display_fields(entity)
        present = PREFERRED_DISPLAY_FIELDS.select { |p| entity.fields.key?(p) }
        keys = present.any? ? present : entity.fields.keys.first(4)
        keys.map { |key| column_for(entity.fields[key], key) }
      end

      # Raises unless `field` is given and is a key of `entity.fields`.
      def self.require_field!(entity, field, name)
        raise "#{name} requires field" unless field
        raise "#{name} field '#{field}' not in entity" unless entity.fields.key?(field.to_s)
      end

      # Quoted, comma-separated display columns for a SELECT list.
      def self.display_column_list(entity)
        pick_display_fields(entity).map { |c| q(c) }.join(", ")
      end

      # Column name of a field definition, or the field key when unavailable.
      def self.column_for(field_def, key)
        column = field_def.respond_to?(:column) ? field_def.column : nil
        column.nil? || column.to_s.empty? ? key : column
      end

      # Normalizes the TOP_N sort direction to "ASC"/"DESC". The value is
      # interpolated into SQL unquoted, so anything else is rejected.
      def self.sort_direction(direction)
        dir = direction.to_s.strip.upcase
        return "DESC" if dir.empty?
        raise "TOP_N direction must be asc or desc" unless SORT_DIRECTIONS.include?(dir)
        dir
      end

      # Normalizes the TOP_N limit to a positive Integer. The value is
      # interpolated into SQL unquoted, so non-integers are rejected.
      def self.top_n_limit(n)
        return DEFAULT_TOP_N if n.nil?
        limit = Integer(n, exception: false)
        raise "TOP_N n must be a positive integer" unless limit && limit.positive?
        limit
      end
    end
  end
end
@@ -0,0 +1,258 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "sql_chatbot/grammar/count_renderer"
4
+ require "sql_chatbot/grammar/list_renderer"
5
+
6
module SqlChatbot
  module Grammar
    # Unified programmatic-renderer registry. Mirror of npm's
    # programmatic-renderer.ts. The grammar's intent extractor emits a
    # primitive (COUNT, LIST, TOP_N, …); this module renders the result
    # deterministically when it can — bypassing the answer-stream LLM
    # which truncates lists, mis-narrates count=0 as "no records", and
    # hallucinates empty TOP_N results when the metric is all zero.
    #
    # Adding a new primitive renderer means: write a pure handler and add it
    # to HANDLERS. No new files.
    #
    # Every handler returns `{ ok: true, text: String }` when it rendered the
    # rows, or `{ ok: false }` when the caller should fall back.
    module ProgrammaticRenderer
      # Dispatches to the handler registered for `primitive` (Symbol or
      # String). Returns `{ ok: false }` for nil or unregistered primitives.
      def self.try_render(primitive, entity_display_label, rows, rank_field: nil, field: nil, which: nil)
        return { ok: false } if primitive.nil?
        handler = HANDLERS[primitive.to_s]
        return { ok: false } unless handler
        handler.call(entity_display_label, rows, rank_field, field, which)
      end

      # COUNT — delegates to CountRenderer.
      HANDLE_COUNT = ->(label, rows, _rank, _field, _which) {
        CountRenderer.try_render("COUNT", label, rows)
      }

      # LIST — delegates to ListRenderer.
      HANDLE_LIST = ->(label, rows, _rank, _field, _which) {
        ListRenderer.try_render("LIST", label, rows)
      }

      TOPN_THRESHOLD = 10
      PREFERRED_LABEL_KEYS = %w[title name label subject email username].freeze

      # TOP_N — small ordered result, label + rank value per row. The
      # Gitea sweep failure: "biggest repo by stars" returned 5 rows with
      # all num_stars=0; the answer LLM rendered "No matching records."
      # Programmatic render shows the data deterministically.
      #
      # Declines (`{ ok: false }`) when any row is not a Hash or has no usable
      # label, so a partial list is never presented as the full result.
      HANDLE_TOPN = ->(label, rows, rank_field, _field, _which) {
        next { ok: false } unless rows.is_a?(Array)
        next { ok: true, text: ProgrammaticRenderer.empty_text(label) } if rows.empty?
        next { ok: false } if rows.length > TOPN_THRESHOLD

        formatted = ProgrammaticRenderer.format_topn_rows(rows, rank_field)
        next { ok: false } unless formatted

        label_or_item = label.to_s.empty? ? "item" : label.to_s
        noun = rows.length == 1 ? CountRenderer.to_singular_label(label_or_item) : CountRenderer.to_plural_label(label_or_item)
        by_part = rank_field ? " by #{rank_field}" : ""
        intro = rows.length == 1 ? "Top #{noun}#{by_part}:" : "Top #{rows.length} #{noun}#{by_part}:"
        lines = formatted.map { |f| f[:rank] ? "- #{f[:label]} (#{f[:rank]})" : "- #{f[:label]}" }.join("\n")
        { ok: true, text: "#{intro}\n#{lines}" }
      }

      # SUM / AVG / MIN_MAX — one row, one numeric value. Mirror of TS
      # handlers in programmatic-renderer.ts (V1.3-P / Fix G).
      MONEY_HINTS = %w[amount price cost revenue total gross net fee].freeze

      HANDLE_SUM = ->(label, rows, _rank, field, _which) {
        value = ProgrammaticRenderer.first_aggregate_value(rows)
        next { ok: false } if value == :no_value
        formatted = ProgrammaticRenderer.format_aggregate_value(field, value)
        next { ok: true, text: "No data available to sum." } if formatted.nil?
        plural = CountRenderer.to_plural_label(label.to_s.empty? ? "item" : label.to_s)
        { ok: true, text: "Total #{ProgrammaticRenderer.humanize_field(field)} across #{plural}: #{formatted}" }
      }

      HANDLE_AVG = ->(label, rows, _rank, field, _which) {
        value = ProgrammaticRenderer.first_aggregate_value(rows)
        next { ok: false } if value == :no_value
        formatted = ProgrammaticRenderer.format_aggregate_value(field, value)
        next { ok: true, text: "No data available to average." } if formatted.nil?
        plural = CountRenderer.to_plural_label(label.to_s.empty? ? "item" : label.to_s)
        { ok: true, text: "Average #{ProgrammaticRenderer.humanize_field(field)} across #{plural}: #{formatted}" }
      }

      HANDLE_MIN_MAX = ->(label, rows, _rank, field, which) {
        value = ProgrammaticRenderer.first_aggregate_value(rows)
        next { ok: false } if value == :no_value
        formatted = ProgrammaticRenderer.format_aggregate_value(field, value)
        next { ok: true, text: "No data available." } if formatted.nil?
        plural = CountRenderer.to_plural_label(label.to_s.empty? ? "item" : label.to_s)
        adj = which.to_s == "MIN" ? "Lowest" : "Highest"
        { ok: true, text: "#{adj} #{ProgrammaticRenderer.humanize_field(field)} across #{plural}: #{formatted}" }
      }

      HANDLERS = {
        "COUNT" => HANDLE_COUNT,
        "LIST" => HANDLE_LIST,
        "TOP_N" => HANDLE_TOPN,
        "SUM" => HANDLE_SUM,
        "AVG" => HANDLE_AVG,
        "MIN_MAX" => HANDLE_MIN_MAX,
      }.freeze

      # Builds `{ label:, rank: }` for every TOP_N row. Returns nil as soon as
      # a row is not a Hash or has no usable label.
      def self.format_topn_rows(rows, rank_field)
        rows.map do |row|
          return nil unless row.is_a?(Hash)
          lbl = pick_row_label(row)
          return nil unless lbl
          { label: lbl, rank: pick_rank_value(row, rank_field) }
        end
      end

      # The single value of a one-row, one-column result, or :no_value when
      # the rows do not have that shape. The value itself may be nil.
      def self.first_aggregate_value(rows)
        return :no_value unless rows.is_a?(Array) && rows.length == 1
        row = rows.first
        return :no_value unless row.is_a?(Hash)
        return :no_value if row.empty? || row.size > 1
        row.values.first
      end

      # Formats an aggregate for display, rounded to two decimals with
      # thousands separators. Money-looking fields get a "$" prefix and always
      # two decimals; other fields drop trailing zeros. Negative values keep
      # their sign. Returns nil for nil, non-numeric, or non-finite input.
      def self.format_aggregate_value(field, value)
        n =
          case value
          when Numeric then value
          when String then value.match?(/\A-?\d+(\.\d+)?\z/) ? value.to_f : nil
          end
        return nil if n.nil?
        return nil if n.respond_to?(:finite?) && !n.finite?

        # Work in whole hundredths so rounding carries into the integer part
        # (1.999 → 2.00) instead of producing a three-digit fraction.
        hundredths = (n.abs * 100).round
        whole, frac = hundredths.divmod(100)
        sign = n < 0 && hundredths.positive? ? "-" : ""
        grouped = group_thousands(whole)
        frac_text = frac.to_s.rjust(2, "0")

        if looks_like_money?(field)
          "#{sign}$#{grouped}.#{frac_text}"
        elsif frac.zero?
          "#{sign}#{grouped}"
        else
          "#{sign}#{grouped}.#{frac_text.sub(/0\z/, "")}"
        end
      end

      # Inserts "," between groups of three digits of a non-negative Integer.
      def self.group_thousands(int)
        int.to_s.gsub(/(\d)(?=(\d{3})+\z)/, '\1,')
      end

      # True when the field name contains one of MONEY_HINTS.
      def self.looks_like_money?(field)
        return false if field.nil?
        f = field.to_s.downcase
        MONEY_HINTS.any? { |h| f.include?(h) }
      end

      # Field name with underscores replaced by spaces; "value" when blank.
      def self.humanize_field(field)
        return "value" if field.nil? || field.to_s.empty?
        field.to_s.tr("_", " ")
      end

      # First non-blank String among PREFERRED_LABEL_KEYS (string or symbol
      # keys), else the first non-blank String column other than "id".
      def self.pick_row_label(row)
        PREFERRED_LABEL_KEYS.each do |k|
          v = row[k] || row[k.to_sym]
          return v.strip if v.is_a?(String) && !v.strip.empty?
        end
        row.each do |k, v|
          next if k.to_s == "id"
          return v.strip if v.is_a?(String) && !v.strip.empty?
        end
        nil
      end

      # Value of `rank_field` in the row when present; otherwise the first
      # numeric-looking column that is not "id" or "count".
      def self.pick_rank_value(row, rank_field)
        if rank_field
          v = row[rank_field] || row[rank_field.to_s] || row[rank_field.to_sym]
          return format_scalar(v) if v
        end
        row.each do |k, v|
          next if %w[id count].include?(k.to_s)
          if v.is_a?(Numeric) || (v.is_a?(String) && v =~ /\A-?\d+(\.\d+)?\z/)
            return format_scalar(v)
          end
        end
        nil
      end

      # String form of a Numeric or String value; nil for anything else.
      def self.format_scalar(v)
        return nil if v.nil?
        return v.to_s if v.is_a?(Numeric)
        return v if v.is_a?(String)
        nil
      end

      # =========================================================
      # Bug D guard for the LLM SQL path. Mirror of TS
      # renderEmptyForLlmSql. Hard sweep on 2026-04-28 surfaced
      # five questions where grammar missed → LLM SQL → 0 rows /
      # COUNT(*)=0 / NULL aggregate → answer LLM emitted
      # "No matching records found." even when a shape-aware
      # programmatic rendering ("There are 0 X.", "No Xs found.")
      # would be more honest. Returns nil when there *is* data —
      # caller falls through to the answer LLM.
      # =========================================================

      COUNT_RE = /SELECT\s+COUNT\s*\(/i
      AGGREGATE_RE = /SELECT\s+(SUM|AVG|MIN|MAX)\s*\(/i
      FROM_TABLE_RE = /\bFROM\s+(?:"?[\w$]+"?\s*\.\s*)?"?([\w$]+)"?/i

      def self.render_empty_for_llm_sql(_question, sql, rows)
        # A missing SQL string is treated as "no recognizable shape" rather
        # than raising while looking up the table name.
        sql = sql.to_s
        is_count = COUNT_RE.match?(sql)
        is_aggregate = AGGREGATE_RE.match?(sql)

        if is_count && rows.is_a?(Array) && rows.length == 1
          val = first_scalar(rows[0])
          n = as_number(val)
          if n == 0
            label = entity_from_from(sql)
            return label ? "There are 0 #{CountRenderer.to_plural_label(label)}." : "There are 0 matching items."
          end
          return nil
        end

        if is_aggregate && rows.is_a?(Array) && rows.length == 1
          val = first_scalar(rows[0])
          return "No data available for that question." if val.nil?
          return nil
        end

        if rows.is_a?(Array) && rows.empty?
          label = entity_from_from(sql)
          return label ? "No #{CountRenderer.to_plural_label(label)} found." : "I didn't find anything matching that — could you rephrase or be more specific?"
        end

        nil
      end

      # First column value of a non-empty Hash row, else nil.
      def self.first_scalar(row)
        return nil unless row.is_a?(Hash) && !row.empty?
        row.values.first
      end

      # Numeric as-is, integer-looking String as Integer, otherwise nil.
      def self.as_number(v)
        return v if v.is_a?(Numeric)
        return v.to_i if v.is_a?(String) && v =~ /\A-?\d+\z/
        nil
      end

      # Title-cased table name taken from the first FROM clause
      # ("order_items" → "Order Items"), or nil when none is found.
      def self.entity_from_from(sql)
        m = sql.to_s.match(FROM_TABLE_RE)
        return nil unless m && m[1]
        m[1].split("_").reject(&:empty?).map(&:capitalize).join(" ")
      end

      # Shared empty-text helper so the user-facing string is consistent:
      # "No <Plural> found." instead of the V1.2 Bug D phrase
      # "No matching records found." which the user has flagged on the
      # 2026-04-28 hard sweep as misleading.
      def self.empty_text(entity_display_label)
        if entity_display_label && !entity_display_label.to_s.strip.empty?
          "No #{CountRenderer.to_plural_label(entity_display_label.to_s)} found."
        else
          "I didn't find anything matching that — could you rephrase or be more specific?"
        end
      end
    end
  end
end
@@ -0,0 +1,66 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "time"
4
+
5
module SqlChatbot
  module Grammar
    # One queryable entity. Attributes that are not supplied (or are supplied
    # as nil) get defaults: `display_label` falls back to the capitalized
    # name, `row_count` to 0, `primary_key` to "id", hash-valued attributes
    # to `{}` and list-valued attributes to `[]`.
    Entity = Struct.new(
      :name, :table, :display_label, :row_count, :primary_key, :timestamps,
      :fields, :scopes, :associations, :ranking_candidates, :implicit_filters,
      keyword_init: true
    ) do
      def initialize(**kwargs)
        given = ->(key, fallback) { kwargs[key] || fallback }
        super(
          name: kwargs[:name],
          table: kwargs[:table],
          display_label: given.call(:display_label, kwargs[:name]&.capitalize),
          row_count: given.call(:row_count, 0),
          primary_key: given.call(:primary_key, "id"),
          timestamps: given.call(:timestamps, {}),
          fields: given.call(:fields, {}),
          scopes: given.call(:scopes, {}),
          associations: given.call(:associations, {}),
          ranking_candidates: given.call(:ranking_candidates, []),
          implicit_filters: given.call(:implicit_filters, [])
        )
      end
    end

    # WHERE clauses appended to every SELECT for an entity. Two sources feed
    # this list:
    #   - Schema detection: e.g. `deleted_at` → `{ column: "deleted_at", expr: "IS NULL", source: :soft_delete }`
    #   - Developer config (`default_filters`): e.g. MSP convention `*.status != 3`.
    # The compiler emits each entry as `"<table>"."<column>" <expr>` and skips
    # any entry whose column is already referenced in the generated SQL.
    ImplicitFilter = Struct.new(:column, :expr, :source, keyword_init: true)

    # Column metadata for one entity field.
    Field = Struct.new(
      :column, :type, :nullable, :enum_values, :fk_to, :user_facing_label, :searchable,
      keyword_init: true
    )

    # A named scope with its WHERE clause and parameter slots.
    Scope = Struct.new(:name, :where_clause, :param_slots, keyword_init: true)

    # A relation from one entity to another.
    Association = Struct.new(
      :name, :kind, :target_entity, :join_clause, :through_entity,
      keyword_init: true
    )

    # Holds the entities and aliases known to the grammar, keyed by name.
    class Registry
      attr_reader :entities, :aliases, :version, :generated_at, :framework

      def initialize(framework:, entities: {}, aliases: {})
        @framework = framework
        @entities = entities
        @aliases = aliases
        @version = 1
        @generated_at = Time.now.utc.iso8601
      end

      # Entity stored under the string form of `name`, or nil.
      def find_entity(name)
        @entities[name.to_s]
      end

      # Canonical entity name for `term`: the alias target when `term` is an
      # alias, `term` itself when it names an entity, otherwise nil.
      def resolve_alias(term)
        key = term.to_s
        if @aliases.key?(key)
          @aliases[key]
        elsif @entities.key?(key)
          key
        end
      end
    end
  end
end
@@ -0,0 +1,37 @@
1
+ # frozen_string_literal: true
2
+
3
module SqlChatbot
  module Grammar
    # Post-execution sanity check for grammar-generated SQL.
    #
    # Catches "plausible but wrong" results where the SQL ran without error
    # but the value disagrees with the registry's known row count. Concrete
    # case: Gitea's `user` reserved-word table returned 1 instead of 9.
    module SanityCheck
      # Registry counts at or below this are not trusted (they may just be
      # stale statistics), so no comparison is made.
      MIN_TRUSTED_ROW_COUNT = 5

      # Compares a COUNT result against `entity.row_count`.
      #
      # Returns { ok: true } when the result is within a factor of three of
      # the registry count, or whenever the check does not apply (primitive
      # is not COUNT, result is not a single row, value is not a whole
      # number, registry count too small to trust).
      # Returns { ok: false, reason: "..." } on a mismatch.
      def self.check_count(primitive, entity, result_rows)
        return { ok: true } unless primitive.to_s == "COUNT"
        return { ok: true } unless result_rows.is_a?(Array) && result_rows.length == 1

        row = result_rows.first
        got = coerce_count(row.is_a?(Hash) ? row.values.first : nil)
        return { ok: true } if got.nil?

        expected = entity.row_count.to_i
        # Trust the registry only when it has a non-trivial value.
        # Tables with reltuples == 0 might just be stats-stale.
        return { ok: true } if expected <= MIN_TRUSTED_ROW_COUNT

        if got < expected / 3 || got > expected * 3
          return {
            ok: false,
            reason: "count_mismatch: SQL returned #{got}, registry has ~#{expected} rows in #{entity.table}",
          }
        end
        { ok: true }
      end

      # Converts a driver-returned count to an Integer, or nil when it is not
      # a whole number. Accepts Integers, whole-valued Floats/BigDecimals and
      # base-10 digit strings; nothing is rescued.
      def self.coerce_count(value)
        case value
        when Integer then value
        when Numeric then value.finite? && value == value.to_i ? value.to_i : nil
        when String then Integer(value.strip, 10, exception: false)
        end
      end
    end
  end
end
@@ -0,0 +1,179 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "sql_chatbot/grammar/registry"
4
+ require "sql_chatbot/grammar/primitives"
5
+ require "sql_chatbot/grammar/modifiers"
6
+
7
module SqlChatbot
  module Grammar
    # Turns an extracted intent (primitive + entity + modifiers) into SQL:
    # repairs known intent-shape problems, builds the primitive's SELECT,
    # applies modifiers, appends the entity's implicit filters and a default
    # LIMIT.
    module TemplateCompiler
      # Placeholder written over quoted / parenthesized text when looking for
      # top-level SQL keywords. Not a word or space character, so `\b` and
      # `\s` behave as they would around the original punctuation.
      MASK = "#"
      TOP_LEVEL_WHERE_RE = /\bWHERE\b/i
      TOP_LEVEL_TAIL_RE = /\s+(?:GROUP\s+BY|ORDER\s+BY|LIMIT)\b/i

      # Compiles `intent` against `registry`.
      #
      # Returns { ok: true, sql: String } on success, or
      # { ok: false, reason: String } when the intent is unmatched, the entity
      # is unknown, or building the SQL raised.
      #
      # Side effect: for TOP_N / MIN_MAX the lifted rank_field / field /
      # which / direction / n and the remaining modifiers are written back to
      # `intent`.
      def self.compile(intent, registry)
        return { ok: false, reason: "unmatched: #{intent[:reason]}" } if intent[:status].to_s == "unmatched"

        entity_name = registry.aliases[intent[:entity]] || intent[:entity]
        entity = registry.entities[entity_name]
        return { ok: false, reason: "entity '#{intent[:entity]}' not in registry" } unless entity

        begin
          modifiers = Array(intent[:modifiers])
          primitive_name = intent[:primitive].to_s
          rank_field = intent[:rank_field]
          limit_n = intent[:n]

          # V1.3-K shape-repair pass — three real-world LLM emit-fidelity issues
          # captured during the 2026-04-28 hard sweep. Mirror of TS
          # `normalizeIntentShape` in template-compiler.ts.
          modifiers = normalize_intent_shape(modifiers, entity)

          # TOP_N / MIN_MAX express the rank/field through an `order_by`
          # modifier when the LLM phrases it that way ("biggest repo by stars"
          # → primitive=TOP_N + order_by(num_stars,desc)). Lift the
          # modifier's field into the primitive slot and drop it so the
          # compiler doesn't emit ORDER BY twice (which produces a Postgres
          # syntax error and silently pushes the question to the LLM
          # fallback path). Write the lifted values back to the intent so
          # downstream consumers (programmatic renderer) can read the
          # canonical rank_field/field regardless of which surface the LLM
          # emitted.
          field = intent[:field]
          which = intent[:which]
          direction = intent[:direction]
          if primitive_name == "TOP_N" || primitive_name == "MIN_MAX"
            order_mod = modifiers.find { |m| (m[:kind] || m["kind"]).to_s == "order_by" }
            if order_mod
              lifted_field = order_mod[:field] || order_mod["field"]
              raw_dir = order_mod[:direction] || order_mod["direction"] || order_mod[:op] || order_mod["op"] || "desc"
              if primitive_name == "TOP_N"
                rank_field ||= lifted_field
                # V1.3-U: pass through the lifted direction so "smallest X"
                # becomes ASC, mirroring TS liftOrderByForRank.
                direction ||= raw_dir.to_s.downcase
              else
                field ||= lifted_field
                which ||= (raw_dir.to_s.downcase == "asc" ? :MIN : :MAX)
              end
              modifiers = modifiers.reject { |m| m.equal?(order_mod) }
            end
            limit_mod = modifiers.find { |m| (m[:kind] || m["kind"]).to_s == "limit" }
            if limit_mod && primitive_name == "TOP_N"
              limit_n ||= limit_mod[:value] || limit_mod["value"]
              modifiers = modifiers.reject { |m| m.equal?(limit_mod) }
            end
            intent[:rank_field] = rank_field
            intent[:field] = field
            intent[:which] = which
            intent[:direction] = direction
            intent[:modifiers] = modifiers
            intent[:n] = limit_n
          end

          sql = Primitives.build(
            # Primitives.build dispatches on Symbols; this method already
            # treats the primitive as a string, so normalize before passing.
            primitive: primitive_name.to_sym,
            entity: entity,
            field: field,
            which: which,
            n: limit_n,
            rank_field: rank_field,
            direction: direction,
            group_by: intent[:group_by]
          )

          sql = modifiers.reduce(sql) { |acc, m| Modifiers.apply(acc, m, entity) }
          sql = with_implicit_filters(sql, entity)

          has_limit = /LIMIT \d+/i.match?(sql)
          is_count = primitive_name == "COUNT" || /COUNT\(/i.match?(sql)
          sql = "#{sql} LIMIT 100" unless has_limit || is_count

          { ok: true, sql: sql }
        rescue => e
          { ok: false, reason: e.message }
        end
      end

      # Append every implicit WHERE clause for this entity. Two sources land
      # in the same list: schema-detected soft-delete columns and developer-
      # declared `default_filters`. A clause is skipped when its column is
      # already referenced in the generated SQL — explicit user filters always
      # win over the implicit default.
      #
      # Only top-level keywords are considered, so an ORDER BY inside
      # `OVER (...)` or a keyword inside a quoted value is never used as the
      # insertion point. An existing WHERE condition is wrapped in parentheses
      # so a top-level OR in it cannot bypass the implicit filters.
      def self.with_implicit_filters(sql, entity)
        filters = Array(entity.respond_to?(:implicit_filters) ? entity.implicit_filters : nil)
        return sql if filters.empty?

        to_apply = filters.reject { |f| column_already_filtered?(sql, entity.table, f.column) }
        return sql if to_apply.empty?

        combined = to_apply.map { |f| "#{Primitives.qc(entity.table, f.column)} #{f.expr}" }.join(" AND ")
        masked = mask_nested(sql)

        where_at = masked.index(TOP_LEVEL_WHERE_RE)
        if where_at
          cond_from = where_at + "WHERE".length
          tail_at = masked.index(TOP_LEVEL_TAIL_RE, cond_from) || sql.length
          existing = sql[cond_from...tail_at].strip
          clause = existing.empty? ? "WHERE #{combined}" : "WHERE #{combined} AND (#{existing})"
          return "#{sql[0...where_at]}#{clause}#{sql[tail_at..]}"
        end

        tail_at = masked.index(TOP_LEVEL_TAIL_RE)
        return "#{sql} WHERE #{combined}" unless tail_at
        "#{sql[0...tail_at]} WHERE #{combined}#{sql[tail_at..]}"
      end

      # Copy of `sql`, same length, with every character that is inside
      # parentheses, a '...' literal or a "..." identifier (and the delimiters
      # themselves) replaced by MASK. Indexes found in the copy are valid in
      # the original.
      def self.mask_nested(sql)
        depth = 0
        quote = nil
        sql.each_char.map do |ch|
          if quote
            quote = nil if ch == quote
            MASK
          elsif ch == "'" || ch == '"'
            quote = ch
            MASK
          elsif ch == "("
            depth += 1
            MASK
          elsif ch == ")"
            depth -= 1 if depth.positive?
            MASK
          else
            depth.zero? ? ch : MASK
          end
        end.join
      end

      # True when the SQL already mentions `table.col`, quoted or unquoted.
      def self.column_already_filtered?(sql, table, col)
        return true if sql.include?(Primitives.qc(table, col))
        /\b#{Regexp.escape(table)}\.#{Regexp.escape(col)}\b/i.match?(sql)
      end

      # =====================================================================
      # V1.3-K shape-repair pass — mirror of TS normalizeIntentShape.
      #
      #   1. `where` modifier whose value is a WINDOWS keyword
      #      → coerce to a `time` modifier with that window.
      #   2. `time` modifier where the LLM used `value` for the keyword
      #      instead of `window` → alias the key.
      #   3. `time` modifier missing field entirely → default to created_at
      #      if it exists, else the lone timestamp column.
      #
      # In all three rules a nil or empty-string field counts as missing.
      # =====================================================================
      def self.normalize_intent_shape(modifiers, entity)
        modifiers.map { |m| repair_one(m, entity) }
      end

      # Applies the first matching repair rule to one modifier; returns the
      # modifier unchanged when no rule applies.
      def self.repair_one(modifier, entity)
        kind = (modifier[:kind] || modifier["kind"]).to_s
        value = modifier[:value] || modifier["value"]
        field = modifier[:field] || modifier["field"]
        field = nil if field.to_s.empty?
        window = modifier[:window] || modifier["window"]
        window_keyword = value.is_a?(String) && SqlChatbot::Grammar::Modifiers::WINDOWS.key?(value.downcase)

        # Rule 1: where with WINDOWS keyword → time
        # Rule 2: time with `value` instead of `window`
        if window_keyword && (kind == "where" || (kind == "time" && (window.nil? || window.to_s.empty?)))
          return {
            kind: "time",
            field: field || default_timestamp_field(entity) || "",
            window: value.downcase,
          }
        end

        # Rule 3: time missing field → default
        if kind == "time" && field.nil? && window
          fallback = default_timestamp_field(entity)
          return { kind: "time", field: fallback, window: window } if fallback
        end

        modifier
      end

      # "created_at" when the entity has that field, else the column of its
      # only "timestamp"-typed field, else nil.
      def self.default_timestamp_field(entity)
        return "created_at" if entity.fields["created_at"]
        ts_fields = entity.fields.values.select { |f| f.type.to_s == "timestamp" }
        ts_fields.length == 1 ? ts_fields.first.column : nil
      end
    end
  end
end