sql-chatbot-rails 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +21 -0
- data/README.md +20 -0
- data/app/controllers/sql_chatbot/chatbot_controller.rb +158 -0
- data/config/routes.rb +11 -0
- data/lib/generators/sql_chatbot/install_generator.rb +25 -0
- data/lib/generators/sql_chatbot/templates/initializer.rb +22 -0
- data/lib/sql_chatbot/auth/cors.rb +35 -0
- data/lib/sql_chatbot/auth/jwt.rb +34 -0
- data/lib/sql_chatbot/configuration.rb +58 -0
- data/lib/sql_chatbot/engine.rb +23 -0
- data/lib/sql_chatbot/grammar/count_renderer.rb +113 -0
- data/lib/sql_chatbot/grammar/entity_candidates.rb +210 -0
- data/lib/sql_chatbot/grammar/intent_extractor.rb +191 -0
- data/lib/sql_chatbot/grammar/list_renderer.rb +50 -0
- data/lib/sql_chatbot/grammar/miss_logger.rb +17 -0
- data/lib/sql_chatbot/grammar/modifiers.rb +145 -0
- data/lib/sql_chatbot/grammar/primitives.rb +69 -0
- data/lib/sql_chatbot/grammar/programmatic_renderer.rb +258 -0
- data/lib/sql_chatbot/grammar/registry.rb +66 -0
- data/lib/sql_chatbot/grammar/sanity_check.rb +37 -0
- data/lib/sql_chatbot/grammar/template_compiler.rb +179 -0
- data/lib/sql_chatbot/llm/client.rb +87 -0
- data/lib/sql_chatbot/prompts/answer.rb +157 -0
- data/lib/sql_chatbot/prompts/classify.rb +59 -0
- data/lib/sql_chatbot/prompts/generate_sql.rb +88 -0
- data/lib/sql_chatbot/services/code_indexer.rb +337 -0
- data/lib/sql_chatbot/services/grammar_pipeline.rb +45 -0
- data/lib/sql_chatbot/services/model_introspector.rb +152 -0
- data/lib/sql_chatbot/services/orchestrator.rb +635 -0
- data/lib/sql_chatbot/services/registry_builder.rb +385 -0
- data/lib/sql_chatbot/services/route_introspector.rb +118 -0
- data/lib/sql_chatbot/services/schema_service.rb +884 -0
- data/lib/sql_chatbot/services/sql_executor.rb +81 -0
- data/lib/sql_chatbot/version.rb +5 -0
- data/lib/sql_chatbot_rails.rb +91 -0
- data/vendor/assets/widget.js +53 -0
- metadata +180 -0
|
@@ -0,0 +1,210 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "sql_chatbot/grammar/registry"
|
|
4
|
+
|
|
5
|
+
module SqlChatbot
  module Grammar
    # Heuristic scorer that picks which registry entities a user question is
    # most likely about, so the intent-extractor prompt only shows the LLM a
    # short, relevant candidate list.
    module EntityCandidates
      # Score an entity against the question.
      # Tokenizes name on '_' so e.g. `projects_project` token "project" matches
      # the question "how many projects". Tie-breakers: fewer name segments,
      # then higher row count.
      #
      # Returns [score, fuzzy_info] where fuzzy_info is nil or
      # {typed:, corrected:} describing a typo resolution.
      def self.score_entity(question, entity, registry)
        text = question.to_s.downcase
        singular = entity.name.to_s.downcase
        plural = entity.table.to_s.downcase
        score = 0

        # Strongest signal: the singular name appears as its own word.
        standalone = text.include?(" #{singular} ") ||
                     text.start_with?("#{singular} ") ||
                     text.end_with?(" #{singular}")
        score += 12 if standalone
        score += 10 if text.include?(plural)
        score += 5 if text.include?(singular)

        # Registered aliases that point at this entity.
        registry.aliases.each do |term, target|
          score += 8 if target == entity.name && text.include?(term.to_s.downcase)
        end

        # Token-level matching for compound names like `projects_project`.
        name_tokens = singular.split("_").select { |tok| tok.length >= 3 }
        name_tokens.each do |tok|
          pattern = /\b(#{Regexp.escape(tok)}|#{Regexp.escape(pluralize_simple(tok))})\b/
          score += 4 if pattern.match?(text)
        end

        # Whitespace-collapsed match — length-weighted so longer matches win.
        compact = text.gsub(/\s+/, "")
        longest = 0
        name_tokens.each do |tok|
          next if tok.length < 5
          [tok, pluralize_simple(tok)].each do |cand|
            longest = cand.length if cand.length > longest && compact.include?(cand)
          end
        end
        score += longest

        # Fuzzy match for typos. Two-tier — aliases first (strongest signal,
        # same tier as exact-alias just approximate), then name/plural/tokens
        # (weaker — tokens are shared across many entities in compound-named
        # schemas like Saleor's `product_product` / `product_category`).
        # 4-char threshold catches "usrs" → "users" while rejecting 3-char
        # noise ("lon" → "log"). Distance ≤ 25% of length further filters.
        fuzzy_min_len = 4
        if score == 0
          alias_targets = registry.aliases.filter_map do |term, target|
            term.to_s.downcase if target == entity.name && term.to_s.length >= fuzzy_min_len
          end

          name_targets = []
          name_targets << singular if singular.length >= fuzzy_min_len
          name_targets << plural if plural.length >= fuzzy_min_len
          name_tokens.each do |tok|
            next if tok.length < fuzzy_min_len
            name_targets << tok
            tok_plural = pluralize_simple(tok)
            name_targets << tok_plural if tok_plural.length >= fuzzy_min_len
          end

          words = text.split(/\W+/).select { |w| w.length >= fuzzy_min_len }
          # Aliases first — score 5
          alias_hit = fuzzy_hit(words, alias_targets)
          return [5, alias_hit] if alias_hit
          # Then name/tokens — score 3
          name_hit = fuzzy_hit(words, name_targets)
          return [3, name_hit] if name_hit
        end

        [score, nil]
      end

      # First (word, target) pair within edit distance 2 AND at most 25% of the
      # longer string's length; exact matches (distance 0) are skipped because
      # they would have scored in the exact tiers above. Returns
      # {typed:, corrected:} or nil.
      def self.fuzzy_hit(words, targets)
        words.each do |word|
          targets.each do |target|
            dist = levenshtein(word, target)
            next if dist.zero?
            longer = [word.length, target.length].max
            return { typed: word, corrected: target } if dist <= 2 && dist.to_f / longer <= 0.25
          end
        end
        nil
      end

      # Damerau-Levenshtein edit distance: insertions, deletions, substitutions,
      # and adjacent transpositions. Transposition counted as 1 (vs 2 in plain
      # Levenshtein) because keyboard typos like "lables" ↔ "labels" are
      # extremely common and should match at distance 1.
      def self.levenshtein(a, b)
        return 0 if a == b
        return b.length if a.empty?
        return a.length if b.empty?

        rows = a.length
        cols = b.length
        # Full DP table: d[i][j] = distance between the i-prefix of a and the
        # j-prefix of b. First row/column are the trivial insert/delete costs.
        d = Array.new(rows + 1) do |i|
          Array.new(cols + 1) { |j| i.zero? ? j : (j.zero? ? i : 0) }
        end

        (1..rows).each do |i|
          (1..cols).each do |j|
            sub_cost = a[i - 1] == b[j - 1] ? 0 : 1
            best = [d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1] + sub_cost].min
            # Adjacent transposition ("lables" → "labels") counts as one edit.
            if i > 1 && j > 1 && a[i - 1] == b[j - 2] && a[i - 2] == b[j - 1]
              best = [best, d[i - 2][j - 2] + 1].min
            end
            d[i][j] = best
          end
        end
        d[rows][cols]
      end

      # Naive English pluralizer; intentionally simple (no irregulars).
      def self.pluralize_simple(word)
        if word.end_with?("s", "x", "ch", "sh")
          "#{word}es"
        elsif word.end_with?("y") && !%w[a e i o u].include?(word[-2])
          "#{word[0..-2]}ies"
        else
          "#{word}s"
        end
      end

      # Number of underscore-separated segments in the entity's name; used as
      # a tie-breaker (fewer segments = more "primary" table).
      def self.name_segments(entity)
        entity.name.to_s.split("_").length
      end

      # Convenience wrapper: candidates only, no scoring metadata.
      def self.select(question:, registry:, top_n:)
        scored = select_with_meta(question: question, registry: registry, top_n: top_n)
        scored.map { |row| row[:entity] }
      end

      # Returns rows of `{ entity:, score:, fuzzy_match: nil|{typed:,corrected:} }`.
      # Used by the intent-extractor prompt to tell the LLM "the user word
      # `<typo>` is likely a typo of `<entity>`" so it commits to the
      # candidate instead of returning unmatched on a stray typo.
      def self.select_with_meta(question:, registry:, top_n:)
        scored = registry.entities.values.map do |entity|
          score, fuzzy = score_entity(question, entity, registry)
          {
            entity: entity,
            score: score,
            fuzzy_match: fuzzy,
            segments: name_segments(entity),
            row_count: entity.row_count,
          }
        end

        scored.sort_by! { |row| [-row[:score], row[:segments], -row[:row_count]] }

        # Nothing matched at all: fall back to the biggest tables so the LLM
        # still sees something plausible.
        if scored.first && scored.first[:score].zero?
          fallback = registry.entities.values.sort_by { |e| -e.row_count }.first(top_n)
          return fallback.map { |entity| { entity: entity, score: 0, fuzzy_match: nil } }
        end

        # When a typed word has a strong alias-fuzzy resolution (score 5),
        # drop OTHER candidates that scored only via a weaker token-fuzzy
        # match on the same typed word — they represent unrelated tables
        # whose presence in the prompt tempts the LLM to override the
        # resolution.
        strong_typed = {}
        scored.each do |row|
          fz = row[:fuzzy_match]
          strong_typed[fz[:typed]] = true if fz && row[:score] == 5
        end
        filtered =
          if strong_typed.empty?
            scored
          else
            scored.reject do |row|
              row[:fuzzy_match] && row[:score] == 3 && strong_typed[row[:fuzzy_match][:typed]]
            end
          end

        # V1.3-R: alternate-suppression when a primary clearly dominates.
        # Mirror of TS selectEntityCandidatesWithMeta. When the top entity
        # has a strong alias/name match (score ≥ 8), drop alternates whose
        # score is at most half the primary's. Keeps the LLM focused on the
        # right binding instead of weighing distractor token matches.
        primary_dominance_floor = 8
        alternate_keep_ratio = 0.5
        top = filtered.first
        if top && top[:score] >= primary_dominance_floor
          cutoff = top[:score] * alternate_keep_ratio
          survivors = [top]
          filtered.drop(1).each { |row| survivors << row if row[:score] > cutoff }
          filtered = survivors
        end

        # Dedup fuzzy_match annotation per typed word.
        claimed = {}
        filtered.first(top_n).map do |row|
          fuzzy = row[:fuzzy_match]
          if fuzzy && claimed[fuzzy[:typed]]
            fuzzy = nil
          elsif fuzzy
            claimed[fuzzy[:typed]] = true
          end
          { entity: row[:entity], score: row[:score], fuzzy_match: fuzzy }
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "json"
|
|
4
|
+
require "set"
|
|
5
|
+
require "sql_chatbot/grammar/entity_candidates"
|
|
6
|
+
|
|
7
|
+
module SqlChatbot
  module Grammar
    # LLM-backed intent extraction: given a user question and the pre-scored
    # entity candidates from EntityCandidates, asks an LLM (via a caller-
    # supplied proc) to emit a structured JSON intent, then validates and
    # normalizes the response. Mirrors the TS implementation referenced in
    # the inline comments.
    module IntentExtractor
      # One line per grammar primitive; interpolated into the system prompt.
      PRIMITIVE_DESCRIPTIONS = <<~TEXT.strip
        COUNT — how many rows of X
        LIST — show/list rows of X
        SUM — total of numeric field on X
        AVG — average of numeric field on X
        MIN_MAX — lowest/highest value of field on X
        TOP_N — top N rows of X ordered by a ranking field
        RANK — window-ranked rows of X within groups
      TEXT

      # Modifier vocabulary plus the exact JSON shape the LLM must emit for
      # each; interpolated into the system prompt.
      MODIFIER_DESCRIPTIONS = <<~TEXT.strip
        where — filter by a field value (op: eq/neq/lt/lte/gt/gte/like/in)
        shape: {"kind":"where","field":"<col>","op":"<op>","value":<lit>}
        time — filter by time window on a timestamp field
        shape: {"kind":"time","field":"<timestamp_col>","window":"<keyword>"}
        windows: today | yesterday | last_7_days | last_30_days | this_week | this_month | this_year
        join — include a related entity via association
        shape: {"kind":"join","association":"<assoc_name>"}
        group_by — group results by field
        shape: {"kind":"group_by","field":"<col>"}
        having — filter grouped results by aggregate op+value
        order_by — order results by field
        shape: {"kind":"order_by","field":"<col>","direction":"asc|desc"}
        limit — cap result count
        shape: {"kind":"limit","value":<int>}
        distinct — deduplicate rows
        shape: {"kind":"distinct"}
        scope — apply a named scope from the entity. Use this when the user's word
        matches an entity scope name; the grammar compiles the filter for you.
        shape: {"kind":"scope","name":"<scope_name>"}
      TEXT

      # call_llm: a proc/lambda taking Array<Hash{role:, content:}> and returning the raw LLM string.
      # Returns a Hash with :status and related keys (mirrors TS Intent discriminated union).
      #
      # Failure modes all collapse to {status: "unmatched", confidence:, reason:}:
      # - the LLM call raises            -> reason "llm_error: ..."
      # - response is not valid JSON     -> reason "malformed_json"
      # - LLM itself says unmatched      -> its reason passed through
      # - confidence missing/low         -> reason "low_confidence:..."
      # - entity not in shown candidates -> reason "entity_not_in_candidates: ..."
      def self.extract(question:, registry:, history:, call_llm:, confidence_threshold: 0.7)
        candidates = EntityCandidates.select_with_meta(question: question, registry: registry, top_n: 5)
        messages = [
          { role: "system", content: build_system_prompt },
          { role: "user", content: build_user_prompt(question, candidates, history) },
        ]
        raw =
          begin
            call_llm.call(messages)
          rescue => e
            return { status: "unmatched", confidence: 0, reason: "llm_error: #{e.message}" }
          end

        parsed =
          begin
            JSON.parse(raw)
          rescue JSON::ParserError
            return { status: "unmatched", confidence: 0, reason: "malformed_json" }
          end

        if parsed["status"] == "unmatched"
          return {
            status: "unmatched",
            confidence: parsed["confidence"] || 0,
            reason: parsed["reason"] || "unmatched",
          }
        end

        # Reject non-numeric confidence too, not just low values.
        conf = parsed["confidence"]
        if !conf.is_a?(Numeric) || conf < confidence_threshold
          return { status: "unmatched", confidence: conf || 0, reason: "low_confidence:#{conf}" }
        end

        # V1.3-R: validate that the LLM's chosen entity is actually in the
        # candidate set we showed it. See TS extractIntent for rationale.
        if parsed["entity"].is_a?(String)
          candidate_names = Set.new
          candidates.each do |c|
            candidate_names << c[:entity].name
            # Aliases of shown candidates are also acceptable spellings.
            registry.aliases.each do |alias_term, target|
              candidate_names << alias_term if target == c[:entity].name
            end
          end
          unless candidate_names.include?(parsed["entity"])
            return {
              status: "unmatched",
              confidence: conf,
              reason: "entity_not_in_candidates: '#{parsed["entity"]}' not among #{candidate_names.first(5).to_a.join(", ")}",
            }
          end
        end

        # Normalize keys to symbols for consumers
        # (accepts both camelCase and snake_case slot names from the LLM;
        # .compact drops slots the primitive didn't use).
        {
          status: "matched",
          primitive: parsed["primitive"]&.to_sym,
          entity: parsed["entity"],
          modifiers: (parsed["modifiers"] || []).map { |m| m.transform_keys(&:to_sym) },
          field: parsed["field"],
          which: parsed["which"]&.to_sym,
          n: parsed["n"],
          rank_field: parsed["rankField"] || parsed["rank_field"],
          group_by: parsed["groupBy"] || parsed["group_by"],
          confidence: conf,
        }.compact
      end

      # Static system prompt: primitive/modifier vocabulary, output shapes,
      # and binding rules. Kept as one heredoc so the prompt text is the
      # single source of truth.
      def self.build_system_prompt
        <<~PROMPT.strip
          You are an intent classifier for a SQL chatbot. Given a user question and a list of available entities, extract a structured intent.

          Primitives:
          #{PRIMITIVE_DESCRIPTIONS}

          Modifiers:
          #{MODIFIER_DESCRIPTIONS}

          You MUST output JSON. If the question doesn't fit any primitive cleanly, set status=unmatched.

          Output shape — generic placeholders, fill in real values from the candidates:
          - COUNT: {"status":"matched","primitive":"COUNT","entity":"<E>","modifiers":[],"confidence":0.9}
          - LIST: {"status":"matched","primitive":"LIST","entity":"<E>","modifiers":[],"confidence":0.9}
          - SUM: {"status":"matched","primitive":"SUM","entity":"<E>","field":"<numeric_col>","modifiers":[],"confidence":0.9}
          - AVG: {"status":"matched","primitive":"AVG","entity":"<E>","field":"<numeric_col>","modifiers":[],"confidence":0.9}
          - MIN_MAX: {"status":"matched","primitive":"MIN_MAX","entity":"<E>","field":"<numeric_col>","which":"MAX","modifiers":[],"confidence":0.9}
          - TOP_N: {"status":"matched","primitive":"TOP_N","entity":"<E>","rankField":"<numeric_col>","n":5,"modifiers":[],"confidence":0.9}
          - unmatched: {"status":"unmatched","confidence":0.3,"reason":"<short reason>"}

          SUM/AVG/MIN_MAX REQUIRE a "field" slot (a top-level "field" key, not inside modifiers).
          TOP_N REQUIRES a "rankField" (a top-level "rankField" key, not an order_by modifier).
          Pick "<E>" by reading the entity candidates — never invent a name. Pluralized
          forms in the user's question ("orders", "users") map to the singular entity
          name from the candidates list.

          Casual phrasing maps to primitives like this (use the same patterns when the user speaks informally):
          - "stuff we got" / "what we have" / "show me X" → LIST X
          - "how many", "count of", "number of" → COUNT
          - "biggest" / "largest" / "top" / "smallest" / "lowest" / "least" → TOP_N (returns the row with name + value). Use TOP_N for ANY question asking for an entity (a row) ranked by some field — even when n=1 ("smallest repo", "biggest order"). For ASC ranking, set order_by direction=asc; for DESC, direction=desc.
          - MIN_MAX is ONLY for questions asking for the VALUE of an aggregate, not the row. "what's the highest price?" → MIN_MAX of price. "biggest order" → TOP_N of orders (the user wants the row, not just a number).
          - "how's the team doing" / "how are things looking" → unmatched (vague — no clear primitive)

          Rules:
          - Only reference entities from the provided candidates. Never invent.
          - Only use fields listed under the chosen entity.
          - For enum filters, pass the enum key (e.g. "active"), NOT the underlying integer.
          - When the user's word matches an enum-value KEY of any field on the chosen entity (case-insensitive), bind it via a "where" modifier with op:"eq" and value:"<the-key>". Example: user says "urgent" and field priority_id has enum={"Urgent":4,...} → emit {"kind":"where","field":"priority_id","op":"eq","value":"Urgent"}.
          - When the user's word matches a scope name listed under the entity, prefer a {"kind":"scope","name":"<name>"} modifier over reconstructing the scope's filter manually. The grammar already knows how to compile the scope.
          - Set confidence honestly: 0.9+ only when every slot has a clear mapping. If the question is vague and you're guessing the entity or aggregation, return unmatched.
          - ENTITY-BINDING RULE: If a user word matches the canonical name OR any listed alias of exactly one candidate entity, the entity IS bound. Commit to a primitive — default to COUNT for "any/how many/got any" phrasings and LIST for "show me/list/which". Do NOT return unmatched just because the question is casual; the alias resolved the ambiguity. The "When in doubt, return unmatched" rule applies to ambiguous PRIMITIVES (e.g., "what's up with X"), not to ambiguous ENTITIES — entity ambiguity is already resolved by the candidate set.
          - PRIMARY MATCH RULE: When a candidate is annotated "[PRIMARY MATCH — prefer this entity ...]", use that entity unless the question explicitly names something only a different candidate has. Do NOT pick a non-primary candidate just because it has more rows or a more specific token match — the scorer already weighed those factors. Example: "smallest repo" with the "repository" entity marked PRIMARY (alias match) and "repo_unit" as alternate (token match) → pick "repository". The user said "repo", and "repo" is the alias of "repository".
          - When a candidate is annotated with "fuzzy-matched from '<typo>' → '<entity>'", the entity has already been chosen as the resolution for that typed word. Use this entity at confidence ≥ 0.85. Do NOT pick a different candidate just because its name appears to contain the user's word — unrelated tables (audit logs, bot tables, event tables) can have similar names. The fuzzy resolution was done before you saw these candidates.
          - For any time phrasing ("today", "yesterday", "last 7 days", "last week", "this week", "last month", "this year", etc.), use a "time" modifier with a named window. NEVER put a literal date string OR a window keyword into a "where" modifier value — both produce broken SQL. Supported windows: today, yesterday, last_7_days, last_30_days, this_week, this_month, this_year. The "time" modifier shape is {"kind":"time","field":"<timestamp_col>","window":"<keyword>"} — do NOT use "value" instead of "window", and ALWAYS include "field". If the user names a window not in this list, return unmatched.
          - When in doubt, return unmatched.
        PROMPT
      end

      # Per-request prompt: last two history turns, the formatted candidate
      # blocks, then the question itself.
      def self.build_user_prompt(question, candidates, history)
        history_text = (history || []).last(2).map { |m| "#{m[:role] || m['role']}: #{m[:content] || m['content']}" }.join("\n")
        # Mark the top candidate PRIMARY when there's more than one. Mirror
        # of TS buildUserPrompt — see comments there for the rationale.
        entity_blocks = candidates.each_with_index.map { |c, i| format_entity_brief(c, i.zero? && candidates.length > 1) }.join("\n\n")
        "History:\n#{history_text}\n\nEntity candidates:\n#{entity_blocks}\n\nQuestion: #{question}"
      end

      # Renders one candidate as a prompt block: name/table/row-count header
      # (optionally with a fuzzy-typo note and PRIMARY marker), then up to 15
      # fields (with enum maps), up to 8 associations, and up to 8 scopes.
      def self.format_entity_brief(candidate, is_primary = false)
        e = candidate[:entity]
        fields = e.fields.first(15).map do |n, f|
          enum_part = f.enum_values ? " enum=#{JSON.generate(f.enum_values)}" : ""
          " #{n}: #{f.type}#{enum_part}"
        end.join("\n")
        assocs = e.associations.keys.first(8).join(", ")
        assocs = "(none)" if assocs.empty?
        scope_names = e.scopes.keys.first(8)
        scopes_text =
          if scope_names.any?
            %(Scopes (when a user word matches one of these names, emit {"kind":"scope","name":"<name>"} as a modifier): #{scope_names.join(", ")})
          else
            "Scopes: (none)"
          end
        fuzzy_hint = candidate[:fuzzy_match] ?
          %(\nNOTE: fuzzy-matched from "#{candidate[:fuzzy_match][:typed]}" → "#{candidate[:fuzzy_match][:corrected]}" (treat the user word as a typo of this entity).) :
          ""
        primary_marker = is_primary ? " [PRIMARY MATCH — prefer this entity unless the question explicitly names a different one]\n" : ""
        "Entity: #{e.name} (table=#{e.table}, rows=#{e.row_count})#{fuzzy_hint}\n#{primary_marker}Fields:\n#{fields}\nAssociations: #{assocs}\n#{scopes_text}"
      end
    end
  end
end
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "sql_chatbot/grammar/count_renderer"
|
|
4
|
+
|
|
5
|
+
module SqlChatbot
  module Grammar
    # Programmatic renderer for the grammar's LIST primitive when the result
    # is small. Bypasses the answer-stream LLM so it can't drop or truncate.
    module ListRenderer
      # Only render programmatically for result sets up to this many rows.
      THRESHOLD = 10

      # Columns tried, in order, when choosing a display label for a row.
      PREFERRED_LABEL_KEYS = %w[title name label subject email username].freeze

      # Returns { ok: true, text: "..." } when conditions met, else { ok: false }.
      def self.try_render(primitive, entity_display_label, rows)
        return { ok: false } unless primitive.to_s == "LIST" && rows.is_a?(Array)
        return { ok: true, text: ProgrammaticRenderer.empty_text(entity_display_label) } if rows.empty?
        return { ok: false } if rows.length > THRESHOLD

        # Bail out entirely if any row is not a Hash or has no usable label.
        labels = rows.map do |row|
          return { ok: false } unless row.is_a?(Hash)
          label = pick_label(row)
          return { ok: false } unless label
          label
        end

        display = entity_display_label.to_s
        label_or_item = display.empty? ? "item" : display
        single = rows.length == 1
        noun = single ? CountRenderer.to_singular_label(label_or_item) : CountRenderer.to_plural_label(label_or_item)

        intro = single ? "Here is the #{noun}:" : "Here are the #{rows.length} #{noun}:"
        bullet_lines = labels.map { |label| "- #{label}" }.join("\n")
        { ok: true, text: "#{intro}\n#{bullet_lines}" }
      end

      # Picks a human-readable label: first non-blank String under a preferred
      # key (string or symbol form), else the first non-blank String value of
      # any column other than "id", else nil.
      def self.pick_label(row)
        PREFERRED_LABEL_KEYS.each do |key|
          candidate = row[key] || row[key.to_sym]
          return candidate.strip if candidate.is_a?(String) && !candidate.strip.empty?
        end
        row.each do |key, candidate|
          next if key.to_s == "id"
          return candidate.strip if candidate.is_a?(String) && !candidate.strip.empty?
        end
        nil
      end
    end
  end
end
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "json"
|
|
4
|
+
require "fileutils"
|
|
5
|
+
require "time"
|
|
6
|
+
|
|
7
|
+
module SqlChatbot
  module Grammar
    # Append-only JSONL logger for grammar misses (questions the grammar
    # could not handle), so templates can be improved offline.
    module MissLogger
      # Appends `entry` as one JSON line to log_path, stamped with a UTC
      # ISO-8601 `ts` field. Creates parent directories on demand.
      def self.log(log_path, entry)
        FileUtils.mkdir_p(File.dirname(log_path))
        payload = { ts: Time.now.utc.iso8601 }.merge(entry)
        File.open(log_path, "a") { |f| f.puts(JSON.generate(payload)) }
      end
    end
  end
end
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "sql_chatbot/grammar/registry"
|
|
4
|
+
require "sql_chatbot/grammar/primitives"
|
|
5
|
+
|
|
6
|
+
module SqlChatbot
  module Grammar
    # Applies intent "modifier" hashes (where/time/join/group_by/having/
    # order_by/limit/distinct) to a base SQL string produced by a primitive
    # renderer. Identifiers are quoted via Primitives; literal values are
    # escaped here before interpolation.
    module Modifiers
      # Quote a bare identifier (delegates to Primitives).
      def self.q(name)
        Primitives.q(name)
      end

      # Quote a table.column pair (delegates to Primitives).
      def self.qc(table, col)
        Primitives.qc(table, col)
      end

      # Named time windows → Postgres lower-bound expressions. A "time"
      # modifier compiles to `col >= <expr>` (see apply_time).
      WINDOWS = {
        "today" => "DATE_TRUNC('day', NOW())",
        "yesterday" => "DATE_TRUNC('day', NOW() - INTERVAL '1 day')",
        "last_7_days" => "NOW() - INTERVAL '7 days'",
        "last_30_days" => "NOW() - INTERVAL '30 days'",
        "this_week" => "DATE_TRUNC('week', NOW())",
        "this_month" => "DATE_TRUNC('month', NOW())",
        "this_year" => "DATE_TRUNC('year', NOW())",
      }.freeze

      # Comparison-operator keywords the intent extractor may emit.
      # FIX: the modifier grammar advertises op like/in to the LLM, but these
      # were missing here and silently compiled to "=". "like" maps directly;
      # "in" needs list formatting and is handled specially in apply_where.
      OPS = {
        "eq" => "=",
        "neq" => "!=",
        "lt" => "<",
        "lte" => "<=",
        "gt" => ">",
        "gte" => ">=",
        "like" => "LIKE",
      }.freeze

      # Dispatch a single modifier hash onto `sql`. Raises on unknown kinds.
      def self.apply(sql, modifier, entity)
        kind = modifier[:kind].to_s
        case kind
        when "where" then apply_where(sql, modifier, entity)
        when "time" then apply_time(sql, modifier, entity)
        when "join" then apply_join(sql, modifier, entity)
        when "group_by" then apply_group_by(sql, modifier, entity)
        when "having" then apply_having(sql, modifier, entity)
        when "order_by" then apply_order_by(sql, modifier, entity)
        when "limit" then apply_limit(sql, modifier)
        when "distinct" then sql.sub(/^SELECT /, "SELECT DISTINCT ")
        else
          raise "unknown modifier kind #{kind}"
        end
      end

      # Appends `clause` with WHERE (first condition) or AND (subsequent).
      def self.append_clause(sql, clause)
        if /\bWHERE\b/i.match?(sql)
          "#{sql} AND #{clause}"
        else
          "#{sql} WHERE #{clause}"
        end
      end

      # SQL literal formatting: strings are single-quoted with '' escaping,
      # everything else (numbers, enum ints, booleans) interpolates as-is.
      def self.sql_literal(value)
        value.is_a?(String) ? "'#{value.gsub("'", "''")}'" : value.to_s
      end

      # Compiles a "where" modifier. Validates the field against the entity,
      # maps enum keys to their stored values, and type-checks obviously
      # mismatched string literals against int/decimal/bool columns.
      def self.apply_where(sql, modifier, entity)
        field_name = modifier[:field].to_s
        field = entity.fields[field_name]
        raise "field '#{field_name}' not on entity #{entity.name}" unless field

        value = modifier[:value]
        op_key = modifier[:op].to_s
        column = qc(entity.table, field_name)

        # FIX: "in" takes a list value; quote each element individually and
        # emit `col IN (a, b, ...)`. Previously it fell through to "=".
        if op_key == "in"
          list = Array(value).map { |v| sql_literal(v) }.join(", ")
          return append_clause(sql, "#{column} IN (#{list})")
        end

        ftype = field.type.to_s
        if ftype == "enum"
          enum_values = field.enum_values || {}
          str_value = value.to_s
          unless enum_values.key?(str_value) || enum_values.key?(str_value.to_sym)
            raise "enum value '#{value}' not in registry for #{entity.name}.#{field_name}"
          end
          # Replace the enum KEY the LLM emitted with the stored value.
          value = enum_values[str_value] || enum_values[str_value.to_sym]
        elsif (ftype == "int" || ftype == "decimal") && value.is_a?(String) && value !~ /\A-?\d+(\.\d+)?\z/
          raise "type mismatch: #{ftype} column #{entity.name}.#{field_name} cannot equal string '#{value}'"
        elsif ftype == "bool" && value.is_a?(String) && value !~ /\A(true|false|t|f|0|1)\z/i
          raise "type mismatch: bool column #{entity.name}.#{field_name} cannot equal string '#{value}'"
        end

        op = OPS[op_key] || "="
        append_clause(sql, "#{column} #{op} #{sql_literal(value)}")
      end

      # Compiles a "time" modifier as `timestamp_col >= <window lower bound>`.
      def self.apply_time(sql, modifier, entity)
        window_key = modifier[:window].to_s
        expr = WINDOWS[window_key]
        raise "unknown time window #{window_key}" unless expr
        append_clause(sql, "#{qc(entity.table, modifier[:field])} >= #{expr}")
      end

      # Compiles a "join" modifier from the association's recorded
      # join_clause, re-emitted with quoted identifiers.
      # joinClause format: "src_table.src_col = tgt_table.tgt_col"
      def self.apply_join(sql, modifier, entity)
        assoc_name = modifier[:association].to_s
        assoc = entity.associations[assoc_name]
        raise "association '#{assoc_name}' not on entity #{entity.name}" unless assoc

        # FIX: raise a descriptive error instead of NoMethodError when the
        # clause doesn't match the expected "a.b = c.d" shape.
        lhs, rhs = assoc.join_clause.to_s.split("=").map(&:strip)
        raise "malformed join clause for '#{assoc_name}': #{assoc.join_clause.inspect}" if lhs.nil? || rhs.nil?
        lt, lc = lhs.split(".")
        rt, rc = rhs.split(".")
        raise "malformed join clause for '#{assoc_name}': #{assoc.join_clause.inspect}" if lc.nil? || rc.nil?

        join_snippet = "JOIN #{q(rt)} ON #{qc(lt, lc)} = #{qc(rt, rc)}"

        # Insert the JOIN before any existing WHERE so the SQL stays valid.
        # FIX: previous version produced doubled spaces around the spliced
        # JOIN and a trailing space after WHERE.
        if /\bWHERE\b/i.match?(sql)
          sql.sub(/\s*\bWHERE\b/i) { " #{join_snippet} WHERE" }
        else
          "#{sql} #{join_snippet}"
        end
      end

      # Compiles a "group_by" modifier; field must exist on the entity.
      def self.apply_group_by(sql, modifier, entity)
        field_name = modifier[:field].to_s
        raise "group_by field '#{field_name}' not on entity #{entity.name}" unless entity.fields[field_name]
        "#{sql} GROUP BY #{qc(entity.table, field_name)}"
      end

      # Compiles a "having" modifier. The field is typically an aggregate
      # expression (e.g. COUNT(*)) so it cannot be validated against
      # entity.fields; NOTE(review): it is interpolated verbatim and remains
      # an injection surface if ever fed untrusted field text.
      # FIX: the value is now escaped like any other literal (string values
      # were previously interpolated unquoted).
      def self.apply_having(sql, modifier, entity)
        raise "HAVING requires GROUP BY" unless /GROUP BY/i.match?(sql)
        op = OPS[modifier[:op].to_s] || "="
        "#{sql} HAVING #{modifier[:field]} #{op} #{sql_literal(modifier[:value])}"
      end

      # Compiles an "order_by" modifier; field must exist on the entity.
      def self.apply_order_by(sql, modifier, entity)
        field_name = modifier[:field].to_s
        raise "order_by field '#{field_name}' not on entity #{entity.name}" unless entity.fields[field_name]
        # The intent extractor LLM emits direction as either `direction` or
        # `op` (it's prompted with the where-modifier shape and reuses `op`).
        raw = (modifier[:direction] || modifier["direction"] || modifier[:op] || modifier["op"] || "desc").to_s.downcase
        # FIX: anything other than asc/desc used to be interpolated verbatim
        # (upcased) into the SQL; normalize unrecognized values to DESC.
        direction = raw == "asc" ? "ASC" : "DESC"
        "#{sql} ORDER BY #{qc(entity.table, field_name)} #{direction}"
      end

      # Compiles a "limit" modifier, replacing any existing LIMIT.
      # FIX: Integer() rejects non-numeric input; the raw value used to be
      # interpolated directly, letting junk (or injected text) reach the SQL.
      def self.apply_limit(sql, modifier)
        limit_val = Integer(modifier[:value])
        if /LIMIT \d+/i.match?(sql)
          sql.sub(/LIMIT \d+/i, "LIMIT #{limit_val}")
        else
          "#{sql} LIMIT #{limit_val}"
        end
      end
    end
  end
end
|