sql-chatbot-rails 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE +21 -0
  3. data/README.md +20 -0
  4. data/app/controllers/sql_chatbot/chatbot_controller.rb +158 -0
  5. data/config/routes.rb +11 -0
  6. data/lib/generators/sql_chatbot/install_generator.rb +25 -0
  7. data/lib/generators/sql_chatbot/templates/initializer.rb +22 -0
  8. data/lib/sql_chatbot/auth/cors.rb +35 -0
  9. data/lib/sql_chatbot/auth/jwt.rb +34 -0
  10. data/lib/sql_chatbot/configuration.rb +58 -0
  11. data/lib/sql_chatbot/engine.rb +23 -0
  12. data/lib/sql_chatbot/grammar/count_renderer.rb +113 -0
  13. data/lib/sql_chatbot/grammar/entity_candidates.rb +210 -0
  14. data/lib/sql_chatbot/grammar/intent_extractor.rb +191 -0
  15. data/lib/sql_chatbot/grammar/list_renderer.rb +50 -0
  16. data/lib/sql_chatbot/grammar/miss_logger.rb +17 -0
  17. data/lib/sql_chatbot/grammar/modifiers.rb +145 -0
  18. data/lib/sql_chatbot/grammar/primitives.rb +69 -0
  19. data/lib/sql_chatbot/grammar/programmatic_renderer.rb +258 -0
  20. data/lib/sql_chatbot/grammar/registry.rb +66 -0
  21. data/lib/sql_chatbot/grammar/sanity_check.rb +37 -0
  22. data/lib/sql_chatbot/grammar/template_compiler.rb +179 -0
  23. data/lib/sql_chatbot/llm/client.rb +87 -0
  24. data/lib/sql_chatbot/prompts/answer.rb +157 -0
  25. data/lib/sql_chatbot/prompts/classify.rb +59 -0
  26. data/lib/sql_chatbot/prompts/generate_sql.rb +88 -0
  27. data/lib/sql_chatbot/services/code_indexer.rb +337 -0
  28. data/lib/sql_chatbot/services/grammar_pipeline.rb +45 -0
  29. data/lib/sql_chatbot/services/model_introspector.rb +152 -0
  30. data/lib/sql_chatbot/services/orchestrator.rb +635 -0
  31. data/lib/sql_chatbot/services/registry_builder.rb +385 -0
  32. data/lib/sql_chatbot/services/route_introspector.rb +118 -0
  33. data/lib/sql_chatbot/services/schema_service.rb +884 -0
  34. data/lib/sql_chatbot/services/sql_executor.rb +81 -0
  35. data/lib/sql_chatbot/version.rb +5 -0
  36. data/lib/sql_chatbot_rails.rb +91 -0
  37. data/vendor/assets/widget.js +53 -0
  38. metadata +180 -0
@@ -0,0 +1,210 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "sql_chatbot/grammar/registry"
4
+
5
module SqlChatbot
  module Grammar
    # Ranks registry entities by how well they match a natural-language
    # question. The ranked list (with optional typo annotations) is what the
    # intent extractor shows to the LLM as "entity candidates".
    #
    # Entities are duck-typed: they must respond to `name`, `table` and
    # `row_count`. The registry must respond to `entities` (a Hash) and
    # `aliases` (alias term => entity name pairs).
    module EntityCandidates
      # Words/targets shorter than this never take part in fuzzy matching.
      # 4 catches "usrs" -> "users" while rejecting 3-char noise ("lon" -> "log").
      FUZZY_MIN_LEN = 4
      # Score reported for a fuzzy hit on one of the entity's aliases.
      FUZZY_ALIAS_SCORE = 5
      # Score reported for a fuzzy hit on the entity name, table or name tokens.
      FUZZY_NAME_SCORE = 3
      # A top candidate at or above this score is considered dominant ...
      PRIMARY_DOMINANCE_FLOOR = 8
      # ... and alternates scoring at most this fraction of it are dropped.
      ALTERNATE_KEEP_RATIO = 0.5

      # Score an entity against the question.
      # Tokenizes name on '_' so e.g. `projects_project` token "project" matches
      # the question "how many projects".
      #
      # @return [Array(Integer, Hash|nil)] `[score, fuzzy]` where `fuzzy` is
      #   `{ typed:, corrected: }` when the score came from a typo match, else nil.
      def self.score_entity(question, entity, registry)
        q = question.to_s.downcase
        singular = entity.name.to_s.downcase
        plural = entity.table.to_s.downcase
        score = 0

        # Blank names are skipped: `include?("")` is always true and would
        # hand such an entity points on every question.
        unless singular.empty?
          whole_word = q.include?(" #{singular} ") || q.start_with?("#{singular} ") || q.end_with?(" #{singular}")
          score += 12 if whole_word
          score += 5 if q.include?(singular)
        end
        score += 10 if !plural.empty? && q.include?(plural)

        # Alias terms registered for this entity (kept in registry order).
        own_aliases = registry.aliases.each_with_object([]) do |(alias_term, target), acc|
          acc << alias_term.to_s if target == entity.name
        end
        own_aliases.each { |term| score += 8 if q.include?(term.downcase) }

        # Token-level matching for compound names like `projects_project`.
        tokens = singular.split("_").select { |tok| tok.length >= 3 }
        tokens.each do |tok|
          pattern = /\b(?:#{Regexp.escape(tok)}|#{Regexp.escape(pluralize_simple(tok))})\b/
          score += 4 if q.match?(pattern)
        end

        # Whitespace-collapsed match, length-weighted so longer matches win.
        q_compact = q.gsub(/\s+/, "")
        compact_hits = tokens
          .reject { |tok| tok.length < 5 }
          .flat_map { |tok| [tok, pluralize_simple(tok)] }
          .select { |form| q_compact.include?(form) }
        score += compact_hits.map(&:length).max || 0

        return [score, nil] unless score.zero?

        # Fuzzy match for typos, only when nothing matched exactly. Two tiers:
        # aliases first (strongest signal), then name/table/tokens (weaker,
        # because tokens are shared across many entities in compound-named
        # schemas such as `product_product` / `product_category`).
        alias_targets = own_aliases.select { |term| term.length >= FUZZY_MIN_LEN }.map(&:downcase)
        name_targets = []
        name_targets << singular if singular.length >= FUZZY_MIN_LEN
        name_targets << plural if plural.length >= FUZZY_MIN_LEN
        tokens.each do |tok|
          next if tok.length < FUZZY_MIN_LEN
          name_targets << tok
          tok_plural = pluralize_simple(tok)
          name_targets << tok_plural if tok_plural.length >= FUZZY_MIN_LEN
        end

        words = q.split(/\W+/).select { |word| word.length >= FUZZY_MIN_LEN }
        hit = fuzzy_hit(words, alias_targets)
        return [FUZZY_ALIAS_SCORE, hit] if hit
        hit = fuzzy_hit(words, name_targets)
        return [FUZZY_NAME_SCORE, hit] if hit

        [score, nil]
      end

      # First (word, target) pair that is a plausible typo: edit distance of
      # 1-2 and at most 25% of the longer string's length.
      # Words are scanned in question order, targets in the given order.
      #
      # @return [Hash, nil] `{ typed:, corrected: }` or nil when nothing is close.
      def self.fuzzy_hit(words, targets)
        words.each do |word|
          targets.each do |target|
            distance = levenshtein(word, target)
            next if distance.zero?
            longest = [word.length, target.length].max
            return { typed: word, corrected: target } if distance <= 2 && distance.to_f / longest <= 0.25
          end
        end
        nil
      end
      private_class_method :fuzzy_hit

      # Damerau-Levenshtein edit distance: insertions, deletions, substitutions,
      # and adjacent transpositions. Transposition counted as 1 (vs 2 in plain
      # Levenshtein) because keyboard typos like "lables" <-> "labels" are
      # extremely common and should match at distance 1.
      def self.levenshtein(a, b)
        return 0 if a == b
        return b.length if a.empty?
        return a.length if b.empty?

        width = b.length
        two_back = Array.new(width + 1, 0) # DP row i-2 (only read once i > 1)
        one_back = (0..width).to_a         # DP row i-1
        current = Array.new(width + 1, 0)  # DP row i, overwritten each pass

        a.each_char.with_index(1) do |char_a, i|
          current[0] = i
          (1..width).each do |j|
            substitution = one_back[j - 1] + (char_a == b[j - 1] ? 0 : 1)
            best = [one_back[j] + 1, current[j - 1] + 1, substitution].min
            # Adjacent transposition: the last two characters are swapped.
            if i > 1 && j > 1 && char_a == b[j - 2] && a[i - 2] == b[j - 1]
              best = [best, two_back[j - 2] + 1].min
            end
            current[j] = best
          end
          # Rotate the three row buffers instead of allocating new ones.
          two_back, one_back, current = one_back, current, two_back
        end
        one_back[width]
      end

      # Naive English pluralization: s/x/ch/sh -> "es", consonant+y -> "ies",
      # everything else -> "s".
      def self.pluralize_simple(word)
        return word + "es" if word.end_with?("s", "x", "ch", "sh")
        return word[0..-2] + "ies" if word.end_with?("y") && !%w[a e i o u].include?(word[-2])
        word + "s"
      end

      # Number of '_'-separated segments in the entity name (ranking tie-breaker).
      def self.name_segments(entity)
        entity.name.to_s.split("_").length
      end

      # Same ranking as `select_with_meta`, returning only the entities.
      def self.select(question:, registry:, top_n:)
        select_with_meta(question: question, registry: registry, top_n: top_n).map { |row| row[:entity] }
      end

      # Returns rows of `{ entity:, score:, fuzzy_match: nil|{typed:,corrected:} }`.
      # Used by the intent-extractor prompt to tell the LLM "the user word
      # `<typo>` is likely a typo of `<entity>`" so it commits to the
      # candidate instead of returning unmatched on a stray typo.
      #
      # Ordering: higher score, then fewer name segments, then higher row
      # count. A nil row count is ranked as 0 rather than raising.
      def self.select_with_meta(question:, registry:, top_n:)
        entities = registry.entities.values
        rows = entities.map do |entity|
          score, fuzzy = score_entity(question, entity, registry)
          {
            entity: entity,
            score: score,
            fuzzy_match: fuzzy,
            segments: name_segments(entity),
            row_count: entity.row_count || 0,
          }
        end
        rows.sort_by! { |row| [-row[:score], row[:segments], -row[:row_count]] }

        # Nothing matched at all: fall back to the largest tables.
        if rows.first && rows.first[:score].zero?
          largest = entities.sort_by { |entity| -(entity.row_count || 0) }.first(top_n)
          return largest.map { |entity| { entity: entity, score: 0, fuzzy_match: nil } }
        end

        # When a typed word has a strong alias-fuzzy resolution, drop OTHER
        # candidates that scored only via a weaker token-fuzzy match on the
        # same typed word; their presence in the prompt tempts the LLM to
        # override the resolution.
        strongly_resolved = rows
          .select { |row| row[:fuzzy_match] && row[:score] == FUZZY_ALIAS_SCORE }
          .map { |row| row[:fuzzy_match][:typed] }
        filtered =
          if strongly_resolved.empty?
            rows
          else
            rows.reject do |row|
              row[:fuzzy_match] && row[:score] == FUZZY_NAME_SCORE &&
                strongly_resolved.include?(row[:fuzzy_match][:typed])
            end
          end

        # V1.3-R: alternate-suppression when a primary clearly dominates.
        # Mirror of TS selectEntityCandidatesWithMeta. Keeps the LLM focused
        # on the right binding instead of weighing distractor token matches.
        top = filtered.first
        if top && top[:score] >= PRIMARY_DOMINANCE_FLOOR
          cutoff = top[:score] * ALTERNATE_KEEP_RATIO
          filtered = [top] + filtered.drop(1).reject { |row| row[:score] <= cutoff }
        end

        # Dedup fuzzy_match annotation per typed word: only the first
        # (best-ranked) candidate keeps the note.
        claimed = {}
        filtered.first(top_n).map do |row|
          fuzzy = row[:fuzzy_match]
          if fuzzy
            if claimed[fuzzy[:typed]]
              fuzzy = nil
            else
              claimed[fuzzy[:typed]] = true
            end
          end
          { entity: row[:entity], score: row[:score], fuzzy_match: fuzzy }
        end
      end
    end
  end
end
@@ -0,0 +1,191 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "json"
4
+ require "set"
5
+ require "sql_chatbot/grammar/entity_candidates"
6
+
7
module SqlChatbot
  module Grammar
    # Turns a user question into a structured intent by prompting an LLM with
    # the grammar's primitives/modifiers and a short list of entity candidates,
    # then validating the LLM's JSON reply.
    module IntentExtractor
      PRIMITIVE_DESCRIPTIONS = <<~TEXT.strip
        COUNT — how many rows of X
        LIST — show/list rows of X
        SUM — total of numeric field on X
        AVG — average of numeric field on X
        MIN_MAX — lowest/highest value of field on X
        TOP_N — top N rows of X ordered by a ranking field
        RANK — window-ranked rows of X within groups
      TEXT

      MODIFIER_DESCRIPTIONS = <<~TEXT.strip
        where — filter by a field value (op: eq/neq/lt/lte/gt/gte/like/in)
        shape: {"kind":"where","field":"<col>","op":"<op>","value":<lit>}
        time — filter by time window on a timestamp field
        shape: {"kind":"time","field":"<timestamp_col>","window":"<keyword>"}
        windows: today | yesterday | last_7_days | last_30_days | this_week | this_month | this_year
        join — include a related entity via association
        shape: {"kind":"join","association":"<assoc_name>"}
        group_by — group results by field
        shape: {"kind":"group_by","field":"<col>"}
        having — filter grouped results by aggregate op+value
        order_by — order results by field
        shape: {"kind":"order_by","field":"<col>","direction":"asc|desc"}
        limit — cap result count
        shape: {"kind":"limit","value":<int>}
        distinct — deduplicate rows
        shape: {"kind":"distinct"}
        scope — apply a named scope from the entity. Use this when the user's word
        matches an entity scope name; the grammar compiles the filter for you.
        shape: {"kind":"scope","name":"<scope_name>"}
      TEXT

      # Extract a structured intent for `question`.
      #
      # @param question [String] the user's question
      # @param registry [#entities, #aliases] grammar registry
      # @param history [Array<Hash>, nil] prior chat messages (`role`/`content`,
      #   symbol or string keys); only the last two are sent
      # @param call_llm [#call] takes Array<Hash{role:, content:}> and returns
      #   the raw LLM reply string
      # @param confidence_threshold [Numeric] minimum confidence for a match
      # @return [Hash] `{ status: "matched", ... }` with symbol keys, or
      #   `{ status: "unmatched", confidence:, reason: }`. Mirrors the TS Intent
      #   discriminated union. Never raises for a bad LLM reply: LLM errors,
      #   non-string replies, non-object JSON and malformed modifiers all come
      #   back as "unmatched".
      def self.extract(question:, registry:, history:, call_llm:, confidence_threshold: 0.7)
        candidates = EntityCandidates.select_with_meta(question: question, registry: registry, top_n: 5)
        messages = [
          { role: "system", content: build_system_prompt },
          { role: "user", content: build_user_prompt(question, candidates, history) },
        ]

        raw =
          begin
            call_llm.call(messages)
          rescue => e
            return unmatched(0, "llm_error: #{e.message}")
          end

        # Anything that is not a JSON object (nil reply, array, scalar, junk)
        # is treated exactly like unparseable JSON.
        parsed = parse_reply(raw)
        return unmatched(0, "malformed_json") unless parsed.is_a?(Hash)

        if parsed["status"] == "unmatched"
          return unmatched(parsed["confidence"] || 0, parsed["reason"] || "unmatched")
        end

        conf = parsed["confidence"]
        if !conf.is_a?(Numeric) || conf < confidence_threshold
          return unmatched(conf || 0, "low_confidence:#{conf}")
        end

        # V1.3-R: validate that the LLM's chosen entity is actually in the
        # candidate set we showed it (canonical names plus their aliases).
        # See TS extractIntent for rationale.
        if parsed["entity"].is_a?(String)
          candidate_names = Set.new
          candidates.each do |candidate|
            canonical = candidate[:entity].name
            candidate_names << canonical
            registry.aliases.each do |alias_term, target|
              candidate_names << alias_term if target == canonical
            end
          end
          unless candidate_names.include?(parsed["entity"])
            shown = candidate_names.first(5).to_a.join(", ")
            return unmatched(conf, "entity_not_in_candidates: '#{parsed["entity"]}' not among #{shown}")
          end
        end

        # Every modifier must be a JSON object. Silently dropping a malformed
        # one could remove a filter and change the answer, so reject instead.
        modifiers = parsed["modifiers"] || []
        unless modifiers.is_a?(Array) && modifiers.all? { |m| m.is_a?(Hash) }
          return unmatched(conf, "malformed_modifiers")
        end

        # Normalize keys to symbols for consumers; absent slots are dropped.
        {
          status: "matched",
          primitive: parsed["primitive"]&.to_sym,
          entity: parsed["entity"],
          modifiers: modifiers.map { |m| m.transform_keys(&:to_sym) },
          field: parsed["field"],
          which: parsed["which"]&.to_sym,
          n: parsed["n"],
          rank_field: parsed["rankField"] || parsed["rank_field"],
          group_by: parsed["groupBy"] || parsed["group_by"],
          confidence: conf,
        }.compact
      end

      # Builds the "unmatched" intent hash.
      def self.unmatched(confidence, reason)
        { status: "unmatched", confidence: confidence, reason: reason }
      end
      private_class_method :unmatched

      # Parses the raw LLM reply; returns nil when it is not parseable JSON
      # text (including a nil or otherwise non-String reply).
      def self.parse_reply(raw)
        JSON.parse(raw)
      rescue JSON::ParserError, TypeError
        nil
      end
      private_class_method :parse_reply

      # The static system prompt: primitives, modifiers, output shapes and rules.
      def self.build_system_prompt
        <<~PROMPT.strip
          You are an intent classifier for a SQL chatbot. Given a user question and a list of available entities, extract a structured intent.

          Primitives:
          #{PRIMITIVE_DESCRIPTIONS}

          Modifiers:
          #{MODIFIER_DESCRIPTIONS}

          You MUST output JSON. If the question doesn't fit any primitive cleanly, set status=unmatched.

          Output shape — generic placeholders, fill in real values from the candidates:
          - COUNT: {"status":"matched","primitive":"COUNT","entity":"<E>","modifiers":[],"confidence":0.9}
          - LIST: {"status":"matched","primitive":"LIST","entity":"<E>","modifiers":[],"confidence":0.9}
          - SUM: {"status":"matched","primitive":"SUM","entity":"<E>","field":"<numeric_col>","modifiers":[],"confidence":0.9}
          - AVG: {"status":"matched","primitive":"AVG","entity":"<E>","field":"<numeric_col>","modifiers":[],"confidence":0.9}
          - MIN_MAX: {"status":"matched","primitive":"MIN_MAX","entity":"<E>","field":"<numeric_col>","which":"MAX","modifiers":[],"confidence":0.9}
          - TOP_N: {"status":"matched","primitive":"TOP_N","entity":"<E>","rankField":"<numeric_col>","n":5,"modifiers":[],"confidence":0.9}
          - unmatched: {"status":"unmatched","confidence":0.3,"reason":"<short reason>"}

          SUM/AVG/MIN_MAX REQUIRE a "field" slot (a top-level "field" key, not inside modifiers).
          TOP_N REQUIRES a "rankField" (a top-level "rankField" key, not an order_by modifier).
          Pick "<E>" by reading the entity candidates — never invent a name. Pluralized
          forms in the user's question ("orders", "users") map to the singular entity
          name from the candidates list.

          Casual phrasing maps to primitives like this (use the same patterns when the user speaks informally):
          - "stuff we got" / "what we have" / "show me X" → LIST X
          - "how many", "count of", "number of" → COUNT
          - "biggest" / "largest" / "top" / "smallest" / "lowest" / "least" → TOP_N (returns the row with name + value). Use TOP_N for ANY question asking for an entity (a row) ranked by some field — even when n=1 ("smallest repo", "biggest order"). For ASC ranking, set order_by direction=asc; for DESC, direction=desc.
          - MIN_MAX is ONLY for questions asking for the VALUE of an aggregate, not the row. "what's the highest price?" → MIN_MAX of price. "biggest order" → TOP_N of orders (the user wants the row, not just a number).
          - "how's the team doing" / "how are things looking" → unmatched (vague — no clear primitive)

          Rules:
          - Only reference entities from the provided candidates. Never invent.
          - Only use fields listed under the chosen entity.
          - For enum filters, pass the enum key (e.g. "active"), NOT the underlying integer.
          - When the user's word matches an enum-value KEY of any field on the chosen entity (case-insensitive), bind it via a "where" modifier with op:"eq" and value:"<the-key>". Example: user says "urgent" and field priority_id has enum={"Urgent":4,...} → emit {"kind":"where","field":"priority_id","op":"eq","value":"Urgent"}.
          - When the user's word matches a scope name listed under the entity, prefer a {"kind":"scope","name":"<name>"} modifier over reconstructing the scope's filter manually. The grammar already knows how to compile the scope.
          - Set confidence honestly: 0.9+ only when every slot has a clear mapping. If the question is vague and you're guessing the entity or aggregation, return unmatched.
          - ENTITY-BINDING RULE: If a user word matches the canonical name OR any listed alias of exactly one candidate entity, the entity IS bound. Commit to a primitive — default to COUNT for "any/how many/got any" phrasings and LIST for "show me/list/which". Do NOT return unmatched just because the question is casual; the alias resolved the ambiguity. The "When in doubt, return unmatched" rule applies to ambiguous PRIMITIVES (e.g., "what's up with X"), not to ambiguous ENTITIES — entity ambiguity is already resolved by the candidate set.
          - PRIMARY MATCH RULE: When a candidate is annotated "[PRIMARY MATCH — prefer this entity ...]", use that entity unless the question explicitly names something only a different candidate has. Do NOT pick a non-primary candidate just because it has more rows or a more specific token match — the scorer already weighed those factors. Example: "smallest repo" with the "repository" entity marked PRIMARY (alias match) and "repo_unit" as alternate (token match) → pick "repository". The user said "repo", and "repo" is the alias of "repository".
          - When a candidate is annotated with "fuzzy-matched from '<typo>' → '<entity>'", the entity has already been chosen as the resolution for that typed word. Use this entity at confidence ≥ 0.85. Do NOT pick a different candidate just because its name appears to contain the user's word — unrelated tables (audit logs, bot tables, event tables) can have similar names. The fuzzy resolution was done before you saw these candidates.
          - For any time phrasing ("today", "yesterday", "last 7 days", "last week", "this week", "last month", "this year", etc.), use a "time" modifier with a named window. NEVER put a literal date string OR a window keyword into a "where" modifier value — both produce broken SQL. Supported windows: today, yesterday, last_7_days, last_30_days, this_week, this_month, this_year. The "time" modifier shape is {"kind":"time","field":"<timestamp_col>","window":"<keyword>"} — do NOT use "value" instead of "window", and ALWAYS include "field". If the user names a window not in this list, return unmatched.
          - When in doubt, return unmatched.
        PROMPT
      end

      # The per-question user prompt: last two history messages, one brief per
      # candidate entity, then the question itself.
      def self.build_user_prompt(question, candidates, history)
        history_text = (history || []).last(2).map do |message|
          "#{message[:role] || message['role']}: #{message[:content] || message['content']}"
        end.join("\n")
        # Mark the top candidate PRIMARY when there's more than one. Mirror
        # of TS buildUserPrompt — see comments there for the rationale.
        mark_primary = candidates.length > 1
        entity_blocks = candidates.each_with_index.map do |candidate, index|
          format_entity_brief(candidate, index.zero? && mark_primary)
        end.join("\n\n")
        "History:\n#{history_text}\n\nEntity candidates:\n#{entity_blocks}\n\nQuestion: #{question}"
      end

      # Renders one candidate for the prompt: name/table/row count, optional
      # fuzzy-match note and PRIMARY marker, then up to 15 fields (with enum
      # values), up to 8 associations and up to 8 scope names.
      def self.format_entity_brief(candidate, is_primary = false)
        e = candidate[:entity]
        fields = e.fields.first(15).map do |n, f|
          enum_part = f.enum_values ? " enum=#{JSON.generate(f.enum_values)}" : ""
          " #{n}: #{f.type}#{enum_part}"
        end.join("\n")
        assocs = e.associations.keys.first(8).join(", ")
        assocs = "(none)" if assocs.empty?
        scope_names = e.scopes.keys.first(8)
        scopes_text =
          if scope_names.any?
            %(Scopes (when a user word matches one of these names, emit {"kind":"scope","name":"<name>"} as a modifier): #{scope_names.join(", ")})
          else
            "Scopes: (none)"
          end
        fuzzy = candidate[:fuzzy_match]
        fuzzy_hint =
          if fuzzy
            %(\nNOTE: fuzzy-matched from "#{fuzzy[:typed]}" → "#{fuzzy[:corrected]}" (treat the user word as a typo of this entity).)
          else
            ""
          end
        primary_marker = is_primary ? " [PRIMARY MATCH — prefer this entity unless the question explicitly names a different one]\n" : ""
        "Entity: #{e.name} (table=#{e.table}, rows=#{e.row_count})#{fuzzy_hint}\n#{primary_marker}Fields:\n#{fields}\nAssociations: #{assocs}\n#{scopes_text}"
      end
    end
  end
end
@@ -0,0 +1,50 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "sql_chatbot/grammar/count_renderer"
4
+
5
module SqlChatbot
  module Grammar
    # Renders small LIST results as a plain bulleted list without going
    # through the answer-stream LLM, so no row can be dropped or truncated.
    module ListRenderer
      # Largest row count that is still rendered programmatically.
      THRESHOLD = 10

      # Column names tried first, in this order, when choosing a row's label.
      PREFERRED_LABEL_KEYS = %w[title name label subject email username].freeze

      # Attempt a programmatic rendering of a LIST result.
      #
      # @param primitive [#to_s] grammar primitive; only "LIST" is handled
      # @param entity_display_label [#to_s] human label for the entity
      # @param rows [Array<Hash>] result rows (string or symbol keys)
      # @return [Hash] `{ ok: true, text: "..." }` when rendered, otherwise
      #   `{ ok: false }` (wrong primitive, rows not an Array, more than
      #   THRESHOLD rows, or a row without a usable text label).
      def self.try_render(primitive, entity_display_label, rows)
        declined = { ok: false }
        return declined if primitive.to_s != "LIST" || !rows.is_a?(Array)
        return { ok: true, text: ProgrammaticRenderer.empty_text(entity_display_label) } if rows.empty?
        return declined if rows.length > THRESHOLD

        # Every row needs a label; a single unlabeled row declines the lot.
        bullets = rows.map do |row|
          label = row.is_a?(Hash) ? pick_label(row) : nil
          return declined unless label
          "- #{label}"
        end

        base_label = entity_display_label.to_s
        base_label = "item" if base_label.empty?
        heading =
          if rows.length == 1
            "Here is the #{CountRenderer.to_singular_label(base_label)}:"
          else
            "Here are the #{rows.length} #{CountRenderer.to_plural_label(base_label)}:"
          end

        { ok: true, text: "#{heading}\n#{bullets.join("\n")}" }
      end

      # Choose the display text for one row: the first non-blank String under
      # a preferred key, otherwise the first non-blank String value of any
      # column other than "id". The result is stripped.
      #
      # @return [String, nil] nil when the row has no usable text value
      def self.pick_label(row)
        usable = ->(value) { value.is_a?(String) && !value.strip.empty? }

        PREFERRED_LABEL_KEYS.each do |key|
          candidate = row[key] || row[key.to_sym]
          return candidate.strip if usable.call(candidate)
        end

        _key, fallback = row.find { |key, value| key.to_s != "id" && usable.call(value) }
        fallback ? fallback.strip : nil
      end
    end
  end
end
@@ -0,0 +1,17 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "json"
4
+ require "fileutils"
5
+ require "time"
6
+
7
module SqlChatbot
  module Grammar
    # Appends grammar "miss" records to a JSON-lines log file.
    module MissLogger
      # Append one record to `log_path`, creating parent directories as needed.
      # The record is `entry` merged over a `ts` key holding the current UTC
      # time in ISO-8601, serialized as a single JSON line.
      #
      # @param log_path [String] path of the log file
      # @param entry [Hash] fields to record
      # @return [Integer] number of bytes written
      def self.log(log_path, entry)
        FileUtils.mkdir_p(File.dirname(log_path))
        record = { ts: Time.now.utc.iso8601 }.merge(entry)
        File.write(log_path, "#{JSON.generate(record)}\n", mode: "a")
      end
    end
  end
end
@@ -0,0 +1,145 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "sql_chatbot/grammar/registry"
4
+ require "sql_chatbot/grammar/primitives"
5
+
6
module SqlChatbot
  module Grammar
    # Applies intent modifiers (where / time / join / group_by / having /
    # order_by / limit / distinct) to a SQL string by appending or rewriting
    # clauses.
    #
    # Modifier hashes come from LLM output, so every piece that ends up in the
    # SQL text is either validated against the entity, chosen from a fixed
    # whitelist, or rendered as an escaped literal. Invalid input raises
    # RuntimeError, like the other validation failures in this module.
    module Modifiers
      # Identifier quoting is delegated to Primitives (defined elsewhere).
      def self.q(name)
        Primitives.q(name)
      end

      def self.qc(table, col)
        Primitives.qc(table, col)
      end

      # Inclusive lower bound of each named time window.
      WINDOWS = {
        "today" => "DATE_TRUNC('day', NOW())",
        "yesterday" => "DATE_TRUNC('day', NOW() - INTERVAL '1 day')",
        "last_7_days" => "NOW() - INTERVAL '7 days'",
        "last_30_days" => "NOW() - INTERVAL '30 days'",
        "this_week" => "DATE_TRUNC('week', NOW())",
        "this_month" => "DATE_TRUNC('month', NOW())",
        "this_year" => "DATE_TRUNC('year', NOW())",
      }.freeze

      # Exclusive upper bound for windows that end before "now". Without it
      # "yesterday" would also include everything from today.
      WINDOW_UPPER_BOUNDS = {
        "yesterday" => "DATE_TRUNC('day', NOW())",
      }.freeze

      # Comparison operators shared by `where` and `having`.
      OPS = {
        "eq" => "=",
        "neq" => "!=",
        "lt" => "<",
        "lte" => "<=",
        "gt" => ">",
        "gte" => ">=",
      }.freeze

      NUMERIC_LITERAL = /\A-?\d+(\.\d+)?\z/.freeze
      # `FUNC(col)`, `FUNC(DISTINCT col)` or `FUNC(*)` for the five standard aggregates.
      AGGREGATE_CALL =
        /\A(?<func>COUNT|SUM|AVG|MIN|MAX)\(\s*(?<distinct>DISTINCT\s+)?(?<arg>\*|[A-Za-z_][A-Za-z0-9_]*)\s*\)\z/i.freeze
      ASCENDING = /\Aasc(ending)?\z/i.freeze
      DESCENDING = /\Adesc(ending)?\z/i.freeze

      # Apply one modifier to `sql` and return the new SQL string.
      #
      # @param sql [String] SQL built so far
      # @param modifier [Hash] symbol-keyed modifier with at least `:kind`
      # @param entity [#name, #table, #fields, #associations] registry entity
      # @raise [RuntimeError] on an unknown kind or an invalid modifier
      def self.apply(sql, modifier, entity)
        kind = modifier[:kind].to_s
        case kind
        when "where" then apply_where(sql, modifier, entity)
        when "time" then apply_time(sql, modifier, entity)
        when "join" then apply_join(sql, modifier, entity)
        when "group_by" then apply_group_by(sql, modifier, entity)
        when "having" then apply_having(sql, modifier, entity)
        when "order_by" then apply_order_by(sql, modifier, entity)
        when "limit" then apply_limit(sql, modifier)
        # The lookahead keeps a repeated `distinct` from producing
        # "SELECT DISTINCT DISTINCT".
        when "distinct" then sql.sub(/^SELECT (?!DISTINCT )/, "SELECT DISTINCT ")
        else
          raise "unknown modifier kind #{kind}"
        end
      end

      # Append a predicate with AND when the SQL already has a WHERE, else
      # start a WHERE clause.
      def self.append_clause(sql, clause)
        if /\bWHERE\b/i.match?(sql)
          "#{sql} AND #{clause}"
        else
          "#{sql} WHERE #{clause}"
        end
      end

      # Filter on a field of the entity.
      #
      # Supports the ops in OPS plus "like" and "in" (both advertised to the
      # LLM). A missing or unrecognized op keeps the historical default "=".
      # Enum keys are translated to their stored values; a nil value becomes
      # IS NULL / IS NOT NULL.
      #
      # @raise [RuntimeError] unknown field, unknown enum key, type mismatch,
      #   empty IN list, or a value that cannot be rendered as a SQL literal
      def self.apply_where(sql, modifier, entity)
        field_name = modifier[:field].to_s
        field = entity.fields[field_name]
        raise "field '#{field_name}' not on entity #{entity.name}" unless field

        op_key = modifier[:op].to_s
        column = qc(entity.table, field_name)
        raw = modifier[:value]

        if op_key == "in"
          items = raw.is_a?(Array) ? raw : [raw]
          raise "IN filter on #{entity.name}.#{field_name} needs at least one value" if items.empty?
          literals = items.map { |item| sql_literal(coerce_where_value(item, field, field_name, entity)) }
          return append_clause(sql, "#{column} IN (#{literals.join(", ")})")
        end

        value = coerce_where_value(raw, field, field_name, entity)
        if value.nil?
          # `col = NULL` never matches; use the NULL predicates instead.
          if %w[lt lte gt gte like].include?(op_key)
            raise "cannot apply '#{op_key}' to a null value on #{entity.name}.#{field_name}"
          end
          return append_clause(sql, "#{column} IS #{op_key == "neq" ? "NOT " : ""}NULL")
        end

        operator = op_key == "like" ? "LIKE" : (OPS[op_key] || "=")
        append_clause(sql, "#{column} #{operator} #{sql_literal(value)}")
      end

      # Validate one filter value against the field type and translate enum
      # keys (string or symbol keyed) to their stored value.
      def self.coerce_where_value(value, field, field_name, entity)
        ftype = field.type.to_s
        if ftype == "enum"
          enum_values = field.enum_values || {}
          key = value.to_s
          unless enum_values.key?(key) || enum_values.key?(key.to_sym)
            raise "enum value '#{value}' not in registry for #{entity.name}.#{field_name}"
          end
          return enum_values[key] || enum_values[key.to_sym]
        end

        if (ftype == "int" || ftype == "decimal") && value.is_a?(String) && value !~ NUMERIC_LITERAL
          raise "type mismatch: #{ftype} column #{entity.name}.#{field_name} cannot equal string '#{value}'"
        end
        if ftype == "bool" && value.is_a?(String) && value !~ /\A(true|false|t|f|0|1)\z/i
          raise "type mismatch: bool column #{entity.name}.#{field_name} cannot equal string '#{value}'"
        end
        value
      end
      private_class_method :coerce_where_value

      # Render a Ruby value as a SQL literal. Strings are single-quoted with
      # embedded quotes doubled; anything that is not a scalar is rejected
      # rather than interpolated.
      def self.sql_literal(value)
        case value
        when String then "'#{value.gsub("'", "''")}'"
        when Numeric, true, false then value.to_s
        when nil then "NULL"
        else raise "unsupported literal #{value.inspect}"
        end
      end
      private_class_method :sql_literal

      # Filter a timestamp field by a named window from WINDOWS. Windows with
      # an entry in WINDOW_UPPER_BOUNDS also get an exclusive upper bound.
      #
      # @raise [RuntimeError] unknown window, or field not on the entity
      def self.apply_time(sql, modifier, entity)
        window_key = modifier[:window].to_s
        lower = WINDOWS[window_key]
        raise "unknown time window #{window_key}" unless lower

        field_name = modifier[:field].to_s
        raise "time field '#{field_name}' not on entity #{entity.name}" unless entity.fields[field_name]

        column = qc(entity.table, field_name)
        clause = "#{column} >= #{lower}"
        upper = WINDOW_UPPER_BOUNDS[window_key]
        clause = "#{clause} AND #{column} < #{upper}" if upper
        append_clause(sql, clause)
      end

      # Join the table behind one of the entity's associations. The JOIN is
      # inserted before an existing WHERE, otherwise appended.
      #
      # @raise [RuntimeError] unknown association or unparseable join clause
      def self.apply_join(sql, modifier, entity)
        assoc_name = modifier[:association].to_s
        assoc = entity.associations[assoc_name]
        raise "association '#{assoc_name}' not on entity #{entity.name}" unless assoc

        # Re-emit the join clause with quoted identifiers.
        # joinClause format: "src_table.src_col = tgt_table.tgt_col"
        lhs, rhs = assoc.join_clause.to_s.split("=").map(&:strip)
        lt, lc = lhs.to_s.split(".")
        rt, rc = rhs.to_s.split(".")
        if [lt, lc, rt, rc].any? { |part| part.nil? || part.empty? }
          raise "malformed join clause for association '#{assoc_name}' on entity #{entity.name}"
        end

        join_sql = " JOIN #{q(rt)} ON #{qc(lt, lc)} = #{qc(rt, rc)}"
        if /\bWHERE\b/i.match?(sql)
          sql.sub(/\bWHERE\b/i) { "#{join_sql} WHERE " }
        else
          "#{sql}#{join_sql}"
        end
      end

      # Group by a field of the entity.
      def self.apply_group_by(sql, modifier, entity)
        field_name = modifier[:field].to_s
        raise "group_by field '#{field_name}' not on entity #{entity.name}" unless entity.fields[field_name]
        "#{sql} GROUP BY #{qc(entity.table, field_name)}"
      end

      # Filter grouped results. `field` must be an entity column or an
      # aggregate call such as COUNT(*) or SUM(total); `value` must be a
      # scalar. Both used to be interpolated verbatim.
      #
      # @raise [RuntimeError] no GROUP BY present, or invalid field/value
      def self.apply_having(sql, modifier, entity)
        raise "HAVING requires GROUP BY" unless /GROUP BY/i.match?(sql)
        op = OPS[modifier[:op].to_s] || "="
        "#{sql} HAVING #{having_expression(modifier[:field], entity)} #{op} #{having_value(modifier[:value])}"
      end

      # Left-hand side of a HAVING predicate, with column names checked
      # against the entity and quoted.
      def self.having_expression(field, entity)
        text = field.to_s.strip
        return qc(entity.table, text) if entity.fields[text]

        call = AGGREGATE_CALL.match(text)
        raise "having field '#{text}' is not a column or aggregate on entity #{entity.name}" unless call
        return "#{call[:func]}(*)" if call[:arg] == "*" && call[:distinct].nil?
        raise "having field '#{text}' is not a column or aggregate on entity #{entity.name}" unless entity.fields[call[:arg]]

        "#{call[:func]}(#{call[:distinct] ? "DISTINCT " : ""}#{qc(entity.table, call[:arg])})"
      end
      private_class_method :having_expression

      # Right-hand side of a HAVING predicate. Numeric strings stay bare (as
      # before); any other string is quoted.
      def self.having_value(value)
        return value if value.is_a?(String) && NUMERIC_LITERAL.match?(value)
        raise "having requires a value" if value.nil?
        sql_literal(value)
      end
      private_class_method :having_value

      # Order by a field of the entity.
      #
      # The intent extractor LLM emits direction as either `direction` or
      # `op` (it's prompted with the where-modifier shape and reuses `op`).
      # Only ASC/DESC ever reach the SQL; anything unrecognized falls back to
      # the default DESC.
      def self.apply_order_by(sql, modifier, entity)
        field_name = modifier[:field].to_s
        raise "order_by field '#{field_name}' not on entity #{entity.name}" unless entity.fields[field_name]
        "#{sql} ORDER BY #{qc(entity.table, field_name)} #{order_direction(modifier)}"
      end

      # First of direction/op (symbol or string key) that names a direction.
      def self.order_direction(modifier)
        [modifier[:direction], modifier["direction"], modifier[:op], modifier["op"]].each do |candidate|
          text = candidate.to_s.strip
          return "ASC" if ASCENDING.match?(text)
          return "DESC" if DESCENDING.match?(text)
        end
        "DESC"
      end
      private_class_method :order_direction

      # Set the row limit, replacing an existing `LIMIT n` if present.
      #
      # @raise [RuntimeError] when the value is not a non-negative integer
      def self.apply_limit(sql, modifier)
        limit_val = parse_limit(modifier[:value])
        if /LIMIT \d+/i.match?(sql)
          sql.sub(/LIMIT \d+/i, "LIMIT #{limit_val}")
        else
          "#{sql} LIMIT #{limit_val}"
        end
      end

      # Accepts an Integer, a whole-number Float (JSON `5.0`) or a digit-only
      # String.
      def self.parse_limit(raw)
        parsed =
          case raw
          when Integer then raw
          when Float then raw.finite? && raw == raw.floor ? raw.to_i : nil
          when String then /\A\d+\z/.match?(raw.strip) ? raw.strip.to_i : nil
          end
        raise "limit must be a non-negative integer, got #{raw.inspect}" if parsed.nil? || parsed.negative?
        parsed
      end
      private_class_method :parse_limit
    end
  end
end