code_to_query 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,581 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'net/http'
4
+ require 'json'
5
+ require 'uri'
6
+ require 'erb'
7
+
8
+ module CodeToQuery
9
+ module Providers
10
+ class OpenAI < Base
11
+ API_BASE = 'https://api.openai.com/v1'
12
+
13
+ def extract_intent(prompt:, schema:, allow_tables:)
14
+ @schema = schema || {}
15
+ @glossary = begin
16
+ @schema['glossary'] || {}
17
+ rescue StandardError
18
+ {}
19
+ end
20
+ candidate_tables = select_context_tables(prompt, allow_tables)
21
+ context = build_system_context(schema, candidate_tables)
22
+
23
+ started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
24
+ response = make_api_request(
25
+ messages: build_messages(prompt, context),
26
+ functions: [intent_extraction_function],
27
+ function_call: { name: 'extract_query_intent' }
28
+ )
29
+ elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at
30
+
31
+ function_call = response.dig('choices', 0, 'message', 'function_call')
32
+ intent_json = JSON.parse(function_call['arguments'])
33
+
34
+ normalized = enhance_with_schema(intent_json, allow_tables: allow_tables, prompt_text: prompt)
35
+
36
+ normalized['limit'] = @config.default_limit if @config.default_limit
37
+ result = validate_and_enhance_intent(normalized, allow_tables)
38
+
39
+ usage = response['usage'] || {}
40
+ @metrics[:prompt_tokens] = usage['prompt_tokens']
41
+ @metrics[:completion_tokens] = usage['completion_tokens']
42
+ @metrics[:total_tokens] = usage['total_tokens']
43
+ @metrics[:elapsed_s] = elapsed
44
+
45
+ result
46
+ end
47
+
48
+ private
49
+
50
+ def select_context_tables(prompt, allow_tables)
51
+ top_k = (@config.context_rag_top_k || 0).to_i
52
+ allow = Array(allow_tables).compact
53
+
54
+ return allow if top_k <= 0
55
+
56
+ ranked = rank_tables_for_prompt(prompt)
57
+ chosen = ranked.take(top_k)
58
+ if allow.any?
59
+ chosen &= allow
60
+ chosen = allow if chosen.empty?
61
+ end
62
+ chosen
63
+ rescue StandardError
64
+ Array(allow_tables).compact
65
+ end
66
+
67
+ def rank_tables_for_prompt(prompt)
68
+ candidates = extract_schema_tables
69
+ return [] unless candidates.any?
70
+
71
+ text = prompt.to_s
72
+ tokens = text.scan(/[a-z0-9_]+/i).map { |t| normalize_token(t) }.uniq
73
+
74
+ candidates.map do |t|
75
+ name = t[:name].to_s
76
+ base = name_match_score(name, text)
77
+ column_score = column_overlap_score(name, tokens)
78
+ { table: name, score: (0.7 * base) + (0.3 * column_score) }
79
+ end.sort_by { |h| -h[:score] }.map { |h| h[:table] }
80
+ rescue StandardError
81
+ []
82
+ end
83
+
84
+ def make_api_request(messages:, functions:, function_call:)
85
+ base = (@config.llm_api_base || API_BASE).to_s
86
+ uri = URI("#{base.chomp('/')}/chat/completions")
87
+
88
+ payload = {
89
+ model: @config.openai_model,
90
+ messages: messages,
91
+ temperature: @config.llm_temperature,
92
+ functions: functions,
93
+ function_call: function_call
94
+ }.merge(@config.provider_options || {})
95
+
96
+ response = http_request(uri, payload)
97
+
98
+ raise "OpenAI API error: #{response.code} #{response.message}" unless response.is_a?(Net::HTTPSuccess)
99
+
100
+ JSON.parse(response.body)
101
+ end
102
+
103
      # Post-processes the raw intent returned by the model against the known
      # schema: resolves the table name, canonicalizes column names in
      # order/filters/columns, and backfills or normalizes parameter values
      # using model enums and glossary synonyms. Best-effort: any error
      # returns the intent unchanged (blanket rescue at the bottom).
      def enhance_with_schema(intent, allow_tables: [], prompt_text: nil)
        original_table = intent['table']
        # If an allow-list exists and the model picked a table outside it,
        # leave the intent untouched; downstream validation handles rejection.
        if Array(allow_tables).any? && !Array(allow_tables).include?(original_table)
          return intent
        end

        resolved_table = pick_best_table(original_table)
        # Only accept the fuzzy-resolved table when it stays in the allow-list.
        table_name = if Array(allow_tables).any?
                       Array(allow_tables).include?(resolved_table) ? resolved_table : original_table
                     else
                       resolved_table
                     end
        return intent unless table_name

        intent['table'] = table_name

        # Canonicalize ORDER BY entries (accepts string or symbol keys,
        # defaults direction to 'desc').
        if intent['order'].is_a?(Array)
          intent['order'] = intent['order'].map do |ord|
            col = ord['column'] || ord[:column]
            dir = (ord['dir'] || ord[:dir] || 'desc').to_s
            resolved = resolve_column_name(col, table_name) || col
            { 'column' => resolved, 'dir' => dir }
          end
        end

        # Canonicalize filter columns; EXISTS/NOT EXISTS related_filters are
        # normalized to string 'column' keys but NOT resolved against the
        # main table (they belong to the related table).
        if intent['filters'].is_a?(Array)
          intent['filters'] = intent['filters'].map do |f|
            col = f['column'] || f[:column]
            resolved = resolve_column_name(col, table_name) || col
            f = f.merge('column' => resolved)
            if %w[exists not_exists].include?(f['op'].to_s) && f['related_filters'].is_a?(Array)
              f['related_filters'] = f['related_filters'].map do |rf|
                rcol = rf['column'] || rf[:column]
                rf.merge('column' => rcol)
              end
            end
            f
          end
        end

        # Canonicalize selected columns ('*' passes through untouched because
        # resolve_column_name returns nil for it).
        if intent['columns'].is_a?(Array)
          intent['columns'] = intent['columns'].map do |c|
            resolve_column_name(c, table_name) || c
          end
        end

        # Backfill missing params for '=' filters using enum labels from prompt tokens
        if prompt_text.is_a?(String) && intent['params'].is_a?(Hash)
          tokens = prompt_text.scan(/[a-z0-9_]+/i).map { |t| normalize_token(t) }
          model_name = find_model_name_for_table(table_name)
          enums = model_enums(model_name)
          if enums.is_a?(Hash) && enums.any?
            intent['filters'].to_a.each do |f|
              next unless f.is_a?(Hash) && f['op'].to_s == '='

              col = f['column']
              next unless col

              pkey = (f['param'] || col).to_s
              # Do not overwrite params the model already extracted.
              next if intent['params'].key?(pkey)

              mapping = enums[col] || enums[col.to_s]
              next unless mapping.is_a?(Hash) && mapping.any?

              # Match normalized enum labels against prompt tokens; store the
              # ORIGINAL label so later enum mapping can convert it.
              normalized_labels = mapping.keys.map { |k| normalize_token(k) }
              match_idx = normalized_labels.index { |lab| tokens.include?(lab) }
              if match_idx
                original_label = mapping.keys[match_idx]
                intent['params'][pkey] = original_label
              end
            end
          end

          # Fallback using glossary synonyms when enums are missing
          intent['filters'].to_a.each do |f|
            next unless f.is_a?(Hash) && f['op'].to_s == '='

            col = f['column']
            next unless col

            pkey = (f['param'] || col).to_s
            next if intent['params'].key?(pkey)

            # Column-scoped glossary entries take precedence over global ones.
            syns = Array(@glossary["#{table_name}.#{col}"]) + Array(@glossary[col.to_s])
            next if syns.empty?

            norm_syns = syns.map { |s| normalize_token(s) }
            match_idx = norm_syns.index { |s| tokens.include?(s) }
            intent['params'][pkey] = syns[match_idx] if match_idx
          end
        end

        # First enum-normalization pass: map any param value (looked up under
        # its original, string, or symbol key) through the model's enums.
        if intent['params'].is_a?(Hash)
          model_name = find_model_name_for_table(table_name)
          intent['filters'].to_a.each do |f|
            param_key = f['param'] || f['column']
            next unless param_key

            raw_value = intent['params'][param_key] || intent['params'][param_key.to_s] || intent['params'][param_key.to_sym]
            next if raw_value.nil?

            normalized = map_enum_like_value(model_name, f['column'], raw_value)
            # equal? (identity) check: only write back when mapping produced
            # a different object.
            unless normalized.equal?(raw_value)
              intent['params'][param_key] = normalized
            end
          end
        end

        # Second pass, restricted to String values under the exact param_key.
        # NOTE(review): largely overlaps the pass above — presumably kept as a
        # safety net for string values; confirm before consolidating.
        if intent['params'].is_a?(Hash)
          model_name = find_model_name_for_table(table_name)
          intent['filters'].to_a.each do |f|
            col = f['column']
            param_key = f['param'] || col
            next unless param_key && col

            raw_value = intent['params'][param_key]
            next unless raw_value.is_a?(String)

            prev = intent['params'][param_key]
            mapped = map_enum_like_value(model_name, col, raw_value)
            intent['params'][param_key] = mapped if mapped != prev
          end
        end

        intent
      rescue StandardError
        # Enhancement is best-effort; never let it break intent extraction.
        intent
      end
231
+
232
+ def pick_best_table(requested)
233
+ candidates = extract_schema_tables
234
+ return requested unless candidates.any?
235
+
236
+ @schema['prompt_normalized'] # not present; fallback will compute below
237
+ prompt_text = [requested.to_s].join(' ').downcase
238
+ tokens = prompt_text.scan(/[a-z0-9_]+/).map { |t| normalize_token(t) }.uniq
239
+
240
+ ranked = candidates.map do |t|
241
+ name = t[:name].to_s
242
+ base = name_match_score(name, requested.to_s)
243
+ column_score = column_overlap_score(name, tokens)
244
+ { table: name, score: (0.7 * base) + (0.3 * column_score) }
245
+ end.sort_by { |h| -h[:score] }
246
+
247
+ ranked.first[:table]
248
+ rescue StandardError
249
+ requested
250
+ end
251
+
252
+ def name_match_score(table_name, requested)
253
+ p = requested.to_s.downcase
254
+ return 0.0 if p.empty?
255
+ return 1.0 if p.include?(table_name.to_s.downcase)
256
+
257
+ singular = table_name.to_s.chomp('s')
258
+ return 0.9 if p.include?(singular)
259
+
260
+ # glossary table synonyms
261
+ syns = Array(@glossary[table_name.to_s])
262
+ return 0.8 if syns.any? { |s| p.include?(s.to_s.downcase) }
263
+
264
+ # partial
265
+ chunks = table_name.to_s.downcase.chars.each_cons(3).map(&:join)
266
+ overlap = chunks.count { |c| p.include?(c) }
267
+ [overlap.to_f / [table_name.length, 1].max * 0.6, 0.1].max
268
+ end
269
+
270
+ def column_overlap_score(table_name, tokens)
271
+ return 0.0 if tokens.empty?
272
+
273
+ cols = table_columns(table_name)
274
+ terms = []
275
+ cols.each do |c|
276
+ pieces = c.to_s.downcase.split('_')
277
+ terms.concat(pieces)
278
+ terms.concat(Array(@glossary["#{table_name}.#{c}"]))
279
+ pieces.each { |p| terms.concat(generic_token_synonyms(p)) }
280
+ end
281
+ terms = terms.compact.map { |t| normalize_token(t) }.uniq
282
+ overlap = tokens & terms
283
+ [(overlap.length.to_f / tokens.length), 1.0].min
284
+ end
285
+
286
+ def generic_token_synonyms(token)
287
+ dict = {
288
+ 'score' => %w[rating grade ability level points],
289
+ 'ability' => %w[skill proficiency competency level score]
290
+ }
291
+ Array(dict[normalize_token(token)])
292
+ end
293
+
294
+ def extract_schema_tables
295
+ return [] unless @schema.is_a?(Hash)
296
+
297
+ raw_tables = if @schema['tables'].is_a?(Array)
298
+ @schema['tables']
299
+ elsif @schema['schema'].is_a?(Hash) && @schema['schema']['tables'].is_a?(Array)
300
+ @schema['schema']['tables']
301
+ else
302
+ []
303
+ end
304
+
305
+ Array(raw_tables).map do |table|
306
+ next unless table.is_a?(Hash)
307
+
308
+ {
309
+ name: table['name'] || table[:name],
310
+ columns: Array(table['columns'] || table[:columns])
311
+ }
312
+ end.compact
313
+ end
314
+
315
+ def table_columns(table_name)
316
+ tables = extract_schema_tables
317
+ table = tables.find { |t| t[:name].to_s == table_name.to_s }
318
+ return [] unless table
319
+
320
+ Array(table[:columns]).map { |c| c['name'] || c[:name] }.compact
321
+ end
322
+
323
      # Resolves a (possibly fuzzy) column reference to a real column of
      # `table_name`, trying progressively looser matches:
      #   1. exact column name,
      #   2. normalized-token equality (case/punctuation-insensitive),
      #   3. column-scoped glossary synonyms ("table.column" keys),
      #   4. requester-side glossary synonyms with substring overlap,
      #   5. plain substring containment (only for tokens of length >= 3,
      #      to avoid spurious short-string matches).
      # Returns nil when nothing matches, the input is nil, or it is '*'
      # (callers keep '*' as-is via `resolve_column_name(c, t) || c`).
      def resolve_column_name(requested_name, table_name)
        return nil if requested_name.nil? || table_name.nil?
        return nil if requested_name == '*'

        requested_norm = normalize_token(requested_name)
        return nil if requested_norm.empty?

        available = table_columns(table_name)
        # Fast path: already a real column.
        return requested_name if available.include?(requested_name)

        # Pass 2+3: normalized equality, then per-column glossary synonyms.
        available.each do |col|
          return col if normalize_token(col) == requested_norm

          glossary_key = "#{table_name}.#{col}"
          synonyms = Array(@glossary[glossary_key])
          synonyms.each do |syn|
            return col if normalize_token(syn) == requested_norm
          end
        end

        # Pass 4: synonyms keyed by the REQUESTED name (raw, normalized, and
        # lowercase forms), matched against columns by substring either way.
        general_synonyms = Array(@glossary[requested_name]) + Array(@glossary[requested_norm]) + Array(@glossary[requested_name.to_s.downcase])
        general_synonyms.map { |s| normalize_token(s) }.uniq.each do |cand|
          available.each do |col|
            col_norm = normalize_token(col)
            return col if col_norm == cand || col_norm.include?(cand) || cand.include?(col_norm)
          end
        end

        # Pass 5: last-resort substring containment in either direction.
        if requested_norm.length >= 3
          available.each do |col|
            col_norm = normalize_token(col)
            return col if col_norm.include?(requested_norm) || requested_norm.include?(col_norm)
          end
        end

        nil
      end
360
+
361
+ def normalize_token(str)
362
+ str.to_s.downcase.gsub(/[^a-z0-9]/, '')
363
+ end
364
+
365
+ def find_model_name_for_table(table_name)
366
+ models = @schema.dig('models', 'models')
367
+ return nil unless models.is_a?(Hash)
368
+
369
+ entry = models.find { |_mn, md| (md['table_name'] || md[:table_name]).to_s == table_name.to_s }
370
+ entry&.first
371
+ end
372
+
373
+ def model_enums(model_name)
374
+ @schema.dig('models', 'models', model_name, 'enums') || {}
375
+ end
376
+
377
+ def model_scopes(model_name)
378
+ @schema.dig('models', 'scopes', model_name) || {}
379
+ end
380
+
381
+ def map_enum_like_value(model_name, column_name, raw_value)
382
+ return raw_value if raw_value.is_a?(Numeric)
383
+ return raw_value unless model_name && column_name
384
+
385
+ str = raw_value.to_s
386
+ return raw_value if str.empty?
387
+
388
+ enums = model_enums(model_name)
389
+ if enums.is_a?(Hash)
390
+ mapping = enums[column_name] || enums[column_name.to_s]
391
+ if mapping.is_a?(Hash)
392
+ # Rails defined_enums use string keys and integer values
393
+ mapped = mapping[str] || mapping[str.downcase]
394
+ return Integer(mapped) if mapped
395
+ end
396
+ end
397
+
398
+ # Fallback: infer from scopes where clause, e.g. scopes like with_videos -> WHERE attachment_type = 0
399
+ scopes = model_scopes(model_name)
400
+ if scopes.is_a?(Hash)
401
+ token = normalize_token(str)
402
+ scopes.each do |scope_name, meta|
403
+ next unless meta.is_a?(Hash)
404
+
405
+ where = meta['where'] || meta[:where]
406
+ next unless where.is_a?(String)
407
+ # Require the where to mention the target column
408
+ next unless where.match?(/\b#{Regexp.escape(column_name)}\b\s*=\s*(\d+)/)
409
+
410
+ # Heuristic match of scope name to token
411
+ sn = normalize_token(scope_name.to_s)
412
+ if sn.include?(token) || token.include?(sn)
413
+ m = where.match(/\b#{Regexp.escape(column_name)}\b\s*=\s*(\d+)/)
414
+ return Integer(m[1]) if m
415
+ end
416
+ end
417
+ end
418
+
419
+ raw_value
420
+ end
421
+
422
+ def http_request(uri, payload)
423
+ # If a custom client is provided, use it (must respond to :chat and return content)
424
+ if @config.llm_client.respond_to?(:chat)
425
+ content = @config.llm_client.chat(messages: payload[:messages], options: payload.except(:messages))
426
+ # Emulate Net::HTTP success + body with content stitched into OpenAI-like response
427
+ fake = Struct.new(:code, :message, :body)
428
+ return fake.new('200', 'OK', { choices: [{ message: { function_call: { arguments: content.to_s } } }] }.to_json)
429
+ end
430
+
431
+ http = Net::HTTP.new(uri.host, uri.port)
432
+ http.use_ssl = (uri.scheme == 'https')
433
+ http.read_timeout = @config.llm_timeout
434
+
435
+ request = Net::HTTP::Post.new(uri)
436
+ request['Authorization'] = "Bearer #{@config.openai_api_key}"
437
+ request['Content-Type'] = 'application/json'
438
+ request.body = payload.to_json
439
+
440
+ http.request(request)
441
+ end
442
+
443
+ def build_messages(prompt, context)
444
+ [
445
+ {
446
+ role: 'system',
447
+ content: build_system_prompt(context)
448
+ },
449
+ {
450
+ role: 'user',
451
+ content: "Convert this natural language query into a structured intent: \"#{prompt}\""
452
+ }
453
+ ]
454
+ end
455
+
456
      # Builds the system prompt for the planner. A user-supplied ERB
      # template (config.system_prompt_template) takes precedence; otherwise
      # a built-in prompt is rendered from the schema context:
      #   context[:available_tables] — table whitelist shown to the model
      #   context[:schema_info]      — column/FK summary text
      #   context[:constraints]      — bullet-listed hard requirements
      def build_system_prompt(context)
        if @config.system_prompt_template
          return render_template(@config.system_prompt_template, context)
        end

        app_kind = defined?(Rails) ? 'a Rails application' : 'a Ruby application'
        <<~PROMPT
          You are an expert SQL query planner for #{app_kind}. Convert natural language queries into structured JSON intent objects that can be safely compiled into parameterized SQL.

          Available tables: #{context[:available_tables].any? ? context[:available_tables].join(', ') : 'Any table in the schema'}

          Schema summary (columns and foreign keys):
          #{context[:schema_info]}

          Requirements:
          #{context[:constraints].map { |c| "- #{c}" }.join("\n")}

          Guidance:
          - Infer relationships from foreign key columns (e.g., *_id patterns)
          - Use the business glossary to understand domain terminology and map user language to database concepts
          - For relationship queries (e.g., "X by Y with id Z"), create EXISTS filters with proper related_table and related_filters
          - CRITICAL: Extract specific entity IDs from natural language - put descriptive names in "param" fields and actual values in "params" object
          - When users mention relationships, look for junction/bridge tables that connect entities
          - Use bound parameters and avoid string/numeric literals in WHERE clauses
          - Map business domain terms using the glossary provided in the schema
          - Do not invent tables or columns not present in the schema summary
          - For complex relationships, prefer EXISTS/NOT EXISTS over JOINs for better performance

          CRITICAL: For EXISTS/NOT EXISTS operations, analyze the schema to find the relationship and provide:
          - related_table: The bridge/junction table that connects the entities
          - fk_column: The foreign key column in that table pointing back to main table
          - base_column: Column in main table to join on (usually "id")
          - related_filters: Array of conditions to apply in the EXISTS subquery (use descriptive param names, not literal values)
          - column: MUST be "id" (never null) - this is a dummy value required by the schema

          Parameter Naming: Use descriptive names for "param" fields based on the schema context and put the actual values in the "params" object.
          CRITICAL: param fields must contain parameter names (like column names or descriptive identifiers), never literal values from the prompt.

          Generate queries that are safe, performant, and match user intent precisely.
        PROMPT
      end
497
+
498
+ def render_template(template_str, context)
499
+ ERB.new(template_str).result(binding)
500
+ end
501
+
502
      # OpenAI function-calling schema that constrains the model's output to
      # a safe, structured query intent (SELECT-only, bound parameters).
      # Pure data — no logic; every string here is sent to the API verbatim.
      def intent_extraction_function
        {
          name: 'extract_query_intent',
          description: 'Extract structured query intent from natural language, paying special attention to entity IDs and relationships',
          parameters: {
            type: 'object',
            properties: {
              # Only SELECT queries are ever allowed.
              type: {
                type: 'string',
                enum: ['select'],
                description: 'Type of query (only SELECT allowed)'
              },
              table: {
                type: 'string',
                description: 'Primary table to query'
              },
              columns: {
                type: 'array',
                items: { type: 'string' },
                description: 'Columns to select (* for all)'
              },
              # Filters double as both simple predicates and EXISTS subquery
              # descriptors; EXISTS entries carry related_table/fk_column/etc.
              filters: {
                type: 'array',
                description: 'Array of filter conditions. For relationship queries, create EXISTS filters. Each filter must have either a simple column condition OR a complete EXISTS structure.',
                items: {
                  type: 'object',
                  properties: {
                    column: { type: 'string', description: 'Column name for simple filters. For EXISTS/NOT EXISTS filters, use "id" as a dummy value (never null)' },
                    op: { type: 'string', enum: ['=', '!=', '<>', '>', '<', '>=', '<=', 'between', 'in', 'like', 'ilike', 'exists', 'not_exists'] },
                    param: { type: 'string', description: 'Parameter name (NEVER the literal value) - derive from column names or context' },
                    # param_start/param_end are used by the 'between' operator.
                    param_start: { type: 'string' },
                    param_end: { type: 'string' },
                    related_table: { type: 'string', description: 'REQUIRED for exists/not_exists: the table to check in the subquery' },
                    fk_column: { type: 'string', description: 'REQUIRED for exists/not_exists: foreign key column in related_table' },
                    base_column: { type: 'string', description: 'Column in main table to join on (use "id" if not specified)' },
                    related_filters: {
                      type: 'array',
                      description: 'REQUIRED for exists/not_exists: Additional conditions in the EXISTS subquery',
                      items: {
                        type: 'object',
                        properties: {
                          column: { type: 'string', description: 'Column name in the related table' },
                          op: { type: 'string', description: 'Operator (usually "=")' },
                          param: { type: 'string', description: 'Parameter name (NEVER literal value) - derive from column names or context' }
                        },
                        required: %w[column op param]
                      }
                    }
                  },
                  required: %w[column op]
                }
              },
              order: {
                type: 'array',
                items: {
                  type: 'object',
                  properties: {
                    column: { type: 'string' },
                    dir: { type: 'string', enum: %w[asc desc] }
                  },
                  required: %w[column dir]
                }
              },
              limit: {
                type: 'integer',
                minimum: 1,
                maximum: 10_000
              },
              params: {
                type: 'object',
                description: 'REQUIRED: Extract and include ALL literal values from the prompt. Map each param name to its actual value'
              }
            },
            required: %w[type table columns limit params]
          }
        }
      end
579
+ end
580
+ end
581
+ end