sql-chatbot-rails 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38):
  1. checksums.yaml +7 -0
  2. data/LICENSE +21 -0
  3. data/README.md +20 -0
  4. data/app/controllers/sql_chatbot/chatbot_controller.rb +158 -0
  5. data/config/routes.rb +11 -0
  6. data/lib/generators/sql_chatbot/install_generator.rb +25 -0
  7. data/lib/generators/sql_chatbot/templates/initializer.rb +22 -0
  8. data/lib/sql_chatbot/auth/cors.rb +35 -0
  9. data/lib/sql_chatbot/auth/jwt.rb +34 -0
  10. data/lib/sql_chatbot/configuration.rb +58 -0
  11. data/lib/sql_chatbot/engine.rb +23 -0
  12. data/lib/sql_chatbot/grammar/count_renderer.rb +113 -0
  13. data/lib/sql_chatbot/grammar/entity_candidates.rb +210 -0
  14. data/lib/sql_chatbot/grammar/intent_extractor.rb +191 -0
  15. data/lib/sql_chatbot/grammar/list_renderer.rb +50 -0
  16. data/lib/sql_chatbot/grammar/miss_logger.rb +17 -0
  17. data/lib/sql_chatbot/grammar/modifiers.rb +145 -0
  18. data/lib/sql_chatbot/grammar/primitives.rb +69 -0
  19. data/lib/sql_chatbot/grammar/programmatic_renderer.rb +258 -0
  20. data/lib/sql_chatbot/grammar/registry.rb +66 -0
  21. data/lib/sql_chatbot/grammar/sanity_check.rb +37 -0
  22. data/lib/sql_chatbot/grammar/template_compiler.rb +179 -0
  23. data/lib/sql_chatbot/llm/client.rb +87 -0
  24. data/lib/sql_chatbot/prompts/answer.rb +157 -0
  25. data/lib/sql_chatbot/prompts/classify.rb +59 -0
  26. data/lib/sql_chatbot/prompts/generate_sql.rb +88 -0
  27. data/lib/sql_chatbot/services/code_indexer.rb +337 -0
  28. data/lib/sql_chatbot/services/grammar_pipeline.rb +45 -0
  29. data/lib/sql_chatbot/services/model_introspector.rb +152 -0
  30. data/lib/sql_chatbot/services/orchestrator.rb +635 -0
  31. data/lib/sql_chatbot/services/registry_builder.rb +385 -0
  32. data/lib/sql_chatbot/services/route_introspector.rb +118 -0
  33. data/lib/sql_chatbot/services/schema_service.rb +884 -0
  34. data/lib/sql_chatbot/services/sql_executor.rb +81 -0
  35. data/lib/sql_chatbot/version.rb +5 -0
  36. data/lib/sql_chatbot_rails.rb +91 -0
  37. data/vendor/assets/widget.js +53 -0
  38. metadata +180 -0
@@ -0,0 +1,884 @@
# frozen_string_literal: true

module SqlChatbot
  module Services
    # Introspects a PostgreSQL database (via ActiveRecord) and builds a compact,
    # LLM-friendly text summary of the schema: one "TABLE ..." line per table
    # plus annotation lines for enums, lookup values, soft deletes, etc.
    class SchemaService
      # Columns that indicate soft-delete patterns
      SOFT_DELETE_COLUMNS = %w[deleted_at discarded_at archived_at removed_at].freeze

      # Word-boundary patterns for sensitive columns — must match as whole "word segments"
      # separated by underscores or string boundaries, to avoid false positives like
      # "pinned_at" matching "pin". Columns matching any pattern are omitted from the
      # summary and from the column index entirely.
      SENSITIVE_PATTERNS = %w[
        password passwd secret token ssn social_security
        credit_card card_number cvv pin encrypted hash
        salt private_key api_key auth_key access_key
      ].freeze

      # Maps PostgreSQL data_type strings to concise labels used in the schema summary.
      # Unknown types fall through to `map_type`, which uppercases them as-is.
      TYPE_MAP = {
        "character varying" => "VARCHAR",
        "integer" => "INT",
        "bigint" => "BIGINT",
        "smallint" => "SMALLINT",
        "timestamp without time zone" => "TIMESTAMP",
        "timestamp with time zone" => "TIMESTAMPTZ",
        "numeric" => "DECIMAL",
        "boolean" => "BOOL",
        "text" => "TEXT",
        "date" => "DATE",
        "double precision" => "DOUBLE",
        "real" => "REAL",
        "uuid" => "UUID",
        "jsonb" => "JSONB",
        "json" => "JSON",
      }.freeze
# -------------------------------------------------------------------
# Class-level helpers
# -------------------------------------------------------------------

# True when +column_name+ contains any SENSITIVE_PATTERNS entry as a whole
# underscore-delimited segment (start/end of string also count as boundaries).
# The boundary anchoring keeps e.g. "pinned_at" from matching the "pin" pattern.
def self.sensitive?(column_name)
  candidate = column_name.downcase
  SENSITIVE_PATTERNS.any? { |p| candidate.match?(/(?:^|_)#{Regexp.escape(p)}(?:$|_)/) }
end

# Translate a PostgreSQL data_type into the short label used in the schema
# summary; types without a TYPE_MAP entry are simply uppercased.
def self.map_type(pg_type)
  TYPE_MAP.fetch(pg_type) { pg_type.upcase }
end
# -------------------------------------------------------------------
# Instance
# -------------------------------------------------------------------
# NOTE: a dead `attr_reader :table_count` was removed here — @table_count is
# never assigned anywhere, and the generated reader was immediately shadowed
# by the explicit #table_count method below.

def initialize
  @summary_text = ""
  @tables = []
  @per_table_schemas = {}
  @table_index = {}
  @fk_graph = {}
  # Populated during #discover and consumed by #apply_soft_delete_annotations;
  # initialized here so the object is fully formed before discovery runs.
  @deferred_soft_deletes = {}
end

# The full schema summary string ("" until #discover has run).
def summary
  @summary_text
end

# Number of discovered tables.
def table_count
  @tables.length
end
# Scan FK LOOKUP and RAILS ENUM annotations for values that match words in the question.
# Returns array of hint strings like:
#   "The user mentions 'movies'. In the titles table, use WHERE category_id = 2 (Movie)."
#   "The user mentions 'active'. In the contractors table, use WHERE status = 1 (Active)."
#
# Refactor: the FK LOOKUP and RAILS ENUM branches previously duplicated the
# pair-parsing and word-matching logic verbatim; it now lives in the private
# helpers #lookup_pair_hints and #matching_question_word. Behavior unchanged.
def find_lookup_hints(question)
  return [] if @summary_text.empty?

  # Filter out stop words that would match too broadly
  stop_words = Set.new(%w[a an the is are was were be been being have has had do does did will would shall should may might can could how what when where who which why not and or but if then else for from by with at in on to of it its this that these those])
  words = question.downcase.split(/\W+/).reject { |w| w.empty? || w.length < 2 || stop_words.include?(w) }
  hints = []
  current_table = nil

  @summary_text.split("\n").each do |line|
    if line.start_with?("TABLE ")
      current_table = line.match(/^TABLE (\S+)/)[1]
    elsif line.include?("FK LOOKUP:") && current_table
      match = line.match(/FK LOOKUP:\s+(\S+).*?values:\s+(.+)/)
      next unless match

      # FK LOOKUP pairs are "id=name"
      hints.concat(lookup_pair_hints(match[2], words, current_table, match[1], id_first: true))
    elsif line.include?("RAILS ENUM:") && current_table
      match = line.match(/RAILS ENUM:\s+(\S+)\s+values:\s+(.+)/)
      next unless match

      # RAILS ENUM pairs are "label=number"
      hints.concat(lookup_pair_hints(match[2], words, current_table, match[1], id_first: false))
    end
  end

  hints.uniq.first(15) # Cap at 15 hints to avoid drowning the LLM
end

# Parse a "x=y, x=y, ..." pair list and build one hint per pair whose label
# matches a question word. id_first: true means pairs are "id=label",
# false means "label=id".
def lookup_pair_hints(pairs_str, words, table, column, id_first:)
  pairs_str.split(",").map(&:strip).map do |pair|
    left, right = pair.split("=", 2)
    next unless left && right

    id, label = id_first ? [left, right] : [right, left]
    clean = label.strip
    next if clean.empty? || clean.length < 2 # Skip empty/tiny names

    word = matching_question_word(words, clean)
    next unless word

    "The user mentions \"#{word}\". In the #{table} table, use WHERE #{column} = #{id.strip} (#{clean})."
  end.compact
end

# First question word that matches +clean+ (a lookup label): exact word,
# whole-label equality, or a >=3-char prefix relationship in either direction.
def matching_question_word(words, clean)
  lower = clean.downcase
  label_words = lower.split(/\W+/).reject(&:empty?)
  words.find do |w|
    label_words.include?(w) ||
      lower == w ||
      (clean.length >= 3 && lower.start_with?(w)) ||
      (w.length >= 3 && w.start_with?(lower))
  end
end
# Extract RAILS ENUM annotations from a schema string for the answer prompt.
# Returns a string like:
#   "contractors.status: Active=1, Inactive=2, Deleted=3\njobs.status: Active=1, ..."
# Defaults to the discovered summary when no schema text is given.
def extract_enum_context(schema_text = nil)
  source = schema_text || @summary_text
  return "" if source.empty?

  current_table = nil
  source.split("\n").each_with_object([]) do |line, acc|
    if line.start_with?("TABLE ")
      current_table = line.match(/^TABLE (\S+)/)[1]
    elsif current_table && line.include?("RAILS ENUM:")
      m = line.match(/RAILS ENUM:\s+(\S+)\s+values:\s+(.+)/)
      acc << "#{current_table}.#{m[1]}: #{m[2]}" if m
    end
  end.join("\n")
end
# Returns a short string listing all known table names — cheap context for
# the classify prompt, far smaller than shipping the full schema summary.
def table_names
  @tables.empty? ? "" : "Available tables: #{@tables.join(', ')}"
end
# Given an array of search terms (e.g. ["customers"] or ["jobs", "job_types"]),
# returns a schema string containing ONLY the matching tables plus any tables
# needed to join them (bridge tables via FK paths, max depth 2).
#
# Falls back to hub tables (top tables by FK edge count) when no terms match.
def select_schema(terms)
  # Without the per-table index (discover not yet run) we can only return everything.
  return @summary_text if @per_table_schemas.empty?

  max_primary = 8
  max_total = 12

  # Rank tables by relevance; hub tables act as a score-1 fallback.
  scores = score_tables(terms)
  hub_tables(8).each { |t| scores[t] = 1 } if scores.empty?

  primary = scores.sort_by { |_, s| -s }.first(max_primary).map(&:first)

  # Pull in any single-bridge tables needed to join pairs of primaries.
  chosen = Set.new(primary)
  primary.combination(2) do |from, to|
    bridge = find_join_path(from, to)
    chosen.merge(bridge) if bridge
  end

  # Enforce the overall cap, always preferring the primary tables.
  if chosen.size > max_total
    extras = chosen.to_a - primary
    chosen = Set.new((primary + extras).first(max_total))
  end

  # Emit chunks in the original discovery order.
  @tables.select { |t| chosen.include?(t) }
         .map { |t| @per_table_schemas[t] }
         .compact
         .join("\n")
end
# Inject model-level annotations (from ModelIntrospector) into the schema summary.
# annotations_by_table: Hash of table_name => [annotation_strings]
# Each table's annotations are emitted at the end of its section, i.e. just
# before the next "TABLE " header (or at end of text for the last table).
# Also updates @per_table_schemas so #select_schema sees the annotations.
def append_model_annotations(annotations_by_table)
  return if annotations_by_table.nil? || annotations_by_table.empty?

  rebuilt = []
  open_table = nil

  emit_pending = lambda do
    rebuilt.concat(annotations_by_table[open_table]) if open_table && annotations_by_table.key?(open_table)
  end

  @summary_text.split("\n").each do |line|
    if line.start_with?("TABLE ")
      emit_pending.call
      open_table = line.match(/^TABLE (\S+)/)[1]
    end
    rebuilt << line
  end
  emit_pending.call

  @summary_text = rebuilt.join("\n")

  annotations_by_table.each do |table, anns|
    next unless @per_table_schemas.key?(table)

    @per_table_schemas[table] = ([@per_table_schemas[table]] + anns).join("\n")
  end
end
# Introspect the database and build a schema summary string with enrichment
# annotations (soft delete, polymorphic, lookup values, enums, check constraints).
# Requires ActiveRecord::Base.connection to be available.
#
# Side effects: rebuilds @tables, @summary_text, @per_table_schemas,
# @table_index, @fk_graph, and appends to @deferred_soft_deletes.
def discover
  conn = ActiveRecord::Base.connection

  # Run all introspection queries up front.
  # NOTE: this local `table_names` shadows the #table_names method for the
  # rest of this method body.
  table_names = query_tables(conn)
  columns_rows = query_columns(conn)
  pk_rows = query_primary_keys(conn)
  fk_rows = query_foreign_keys(conn)
  enum_rows = query_enums(conn)
  check_rows = query_check_constraints(conn)

  # Index primary keys: Set of "table.column"
  pk_set = Set.new(pk_rows.map { |r| "#{r['table_name']}.#{r['column_name']}" })

  # Index foreign keys: Hash of "from_table.from_column" => "to_table.to_column"
  fk_map = fk_rows.each_with_object({}) do |r, h|
    h["#{r['from_table']}.#{r['from_column']}"] = "#{r['to_table']}.#{r['to_column']}"
  end

  # Index native PG enum types: enum_name => [ordered values]
  enum_map = enum_rows.each_with_object({}) do |r, h|
    (h[r["enum_name"]] ||= []) << r["enum_value"]
  end

  # Parse check constraints for IN (...) or ANY(ARRAY[...]) patterns;
  # keyed by "table.column" like pk_set/fk_map.
  check_enum_map = {}
  check_rows.each do |r|
    result = parse_check_constraint(r["check_def"])
    next unless result

    col_name, values = result
    check_enum_map["#{r['table_name']}.#{col_name}"] = values
  end

  # Group columns by table
  columns_by_table = columns_rows.each_with_object({}) do |col, h|
    (h[col["table_name"]] ||= []) << col
  end

  # Collect FK target tables for lookup value detection
  fk_target_tables = Set.new(fk_rows.map { |r| r["to_table"] })

  # Convention-based references: non-PK *_id columns whose base name pluralizes
  # to an existing table also count as FK targets (first matching candidate wins).
  table_name_set = Set.new(table_names)
  columns_by_table.each_value do |cols|
    cols.each do |col|
      col_name = col["column_name"]
      next unless col_name.end_with?("_id")
      next if pk_set.include?("#{col['table_name']}.#{col_name}")

      base = col_name[0..-4] # remove '_id'
      # Naive pluralization candidates, e.g. category_id => categories.
      candidates = [
        "#{base}s",
        "#{base.sub(/y$/, 'ie')}s",
        "#{base}es",
        base,
      ]
      candidates.each do |candidate|
        if table_name_set.include?(candidate)
          fk_target_tables.add(candidate)
          break
        end
      end
    end
  end

  # Discover lookup values for small referenced tables
  lookup_values = discover_lookup_values(conn, fk_target_tables, columns_by_table, pk_set)

  # Get approximate row counts for all tables (helps LLM distinguish data vs config tables)
  row_counts = query_row_counts(conn)

  # Build summary lines: one "TABLE ..." line per table followed by its annotations.
  lines = []
  table_names.each do |table|
    columns = columns_by_table[table] || []
    col_parts = []
    annotations = []
    col_name_types = {} # column_name => mapped_type (for polymorphic detection)

    columns.each do |col|
      # Sensitive columns are omitted entirely from the summary.
      next if self.class.sensitive?(col["column_name"])

      key = "#{table}.#{col['column_name']}"

      # Resolve enum values: PG native enum first, else check-constraint enum.
      enum_values = if col["data_type"] == "USER-DEFINED" && col["udt_name"]
                      enum_map[col["udt_name"]]
                    end
      enum_values ||= check_enum_map[key]

      mapped_type = if enum_values
                      "ENUM(#{enum_values.join(',')})"
                    else
                      self.class.map_type(col["data_type"])
                    end

      part = "#{col['column_name']} #{mapped_type}"
      part += " PK" if pk_set.include?(key)
      part += " FK=>#{fk_map[key]}" if fk_map.key?(key)

      col_parts << part
      col_name_types[col["column_name"]] = mapped_type

      # Defer soft delete annotation (applied after model introspection,
      # see #apply_soft_delete_annotations).
      # NOTE(review): @deferred_soft_deletes is never cleared, so repeated
      # discover/refresh calls accumulate entries — confirm intended.
      if SOFT_DELETE_COLUMNS.include?(col["column_name"])
        (@deferred_soft_deletes ||= {})[table] ||= []
        @deferred_soft_deletes[table] << col["column_name"]
      end

      # Enum value annotation
      if enum_values
        annotations << " -- ENUM: #{col['column_name']} values: #{enum_values.join(', ')}"
      end
    end

    # Polymorphic association detection: "<p>_type" VARCHAR/TEXT paired with
    # "<p>_id" INT/BIGINT (sensitive columns never reach col_name_types).
    col_name_types.each do |col_name, col_type|
      next unless col_name.end_with?("_type") && %w[VARCHAR TEXT].include?(col_type)

      prefix = col_name[0..-6] # remove '_type'
      id_col = "#{prefix}_id"
      id_type = col_name_types[id_col]
      if id_type && %w[INT BIGINT].include?(id_type)
        annotations << " -- POLYMORPHIC: #{col_name} + #{id_col} (join target depends on type value)"
      end
    end

    # Lookup values annotation (later relocated by #relocate_lookup_annotations)
    if lookup_values.key?(table)
      annotations << " -- VALUES: #{lookup_values[table]}"
    end

    count = row_counts[table]
    count_hint = count ? " (~#{count} rows)" : ""
    lines << "TABLE #{table}#{count_hint} (#{col_parts.join(', ')})"
    annotations.each { |ann| lines << ann }
  end

  @tables = table_names
  @summary_text = lines.join("\n")

  # Build the retrieval indexes used by #select_schema.
  build_per_table_schemas(lines)
  build_table_index(columns_by_table)
  build_fk_graph(fk_rows, columns_by_table, table_name_set)
end
# Re-discover schema (alias for discover).
# NOTE(review): #discover does not reset @deferred_soft_deletes, so refreshing
# after a table is dropped may leave stale soft-delete entries — confirm.
def refresh
  discover
end
# Move "-- VALUES:" annotations from lookup tables to the FK columns that reference them.
# After this, LLMs see lookup values next to the FK column (e.g., category_id) instead of
# on the lookup table itself, preventing confusion between unrelated integer columns.
def relocate_lookup_annotations
  lines = @summary_text.split("\n")

  # Step 1: Extract VALUES annotations and remember which lookup table owned each.
  lookup_values = {} # table_name => values_string
  lines_without_values = []
  current_table = nil

  lines.each do |line|
    current_table = line.match(/^TABLE (\S+)/)[1] if line.start_with?("TABLE ")

    if line.strip.start_with?("-- VALUES:")
      lookup_values[current_table] = line.strip.sub("-- VALUES: ", "") if current_table
    else
      lines_without_values << line
    end
  end

  return if lookup_values.empty?

  # Step 2: Build convention-based column names for matching
  # (naive singularization: categories => category_id, statuses => status_id).
  convention_map = {} # "singular_id" => lookup_table
  lookup_values.each_key do |table|
    singular = if table.end_with?("ies")
                 table[0..-4] + "y"
               elsif table.end_with?("ses")
                 table[0..-3]
               elsif table.end_with?("s")
                 table[0..-2]
               else
                 table
               end
    convention_map["#{singular}_id"] = table
  end

  # Step 3: Find FK columns and inject FK LOOKUP annotations after each TABLE line.
  result = []
  lines_without_values.each do |line|
    result << line
    next unless line.start_with?("TABLE ")

    # Match explicit FK references: "column_name INT FK=>target_table.target_column".
    # (Limitation: ENUM(...) typed columns won't match the \w+ type token.)
    lookup_values.each do |lookup_table, values|
      line.scan(/(\w+)\s+\w+\s+FK=>#{Regexp.escape(lookup_table)}\.(\w+)/).each do |fk_col, _target_col|
        result << " -- FK LOOKUP: #{fk_col} values: #{values}"
      end
    end

    # Match convention-based references: "category_id INT" (no FK=> marker).
    convention_map.each do |fk_col_name, lookup_table|
      # Skip if already matched by explicit FK above
      next if line.include?("#{fk_col_name} ") && line.include?("FK=>#{lookup_table}")

      # BUG FIX: the original pattern /\bcol\s+\w+(?!\s+FK)/ allowed \w+ to
      # backtrack past the negative lookahead, so a column with an explicit FK
      # to a DIFFERENT table (e.g. "owner_id INT FK=>users.id" alongside an
      # "owners" lookup table) still matched and got a wrong hint. The extra
      # (?!\w) alternative pins \w+ to the full type token so the FK check
      # actually applies.
      if line.match?(/\b#{Regexp.escape(fk_col_name)}\s+\w+(?!\w|\s+FK)/)
        result << " -- FK LOOKUP: #{fk_col_name} values: #{lookup_values[lookup_table]}"
      end
    end
  end

  @summary_text = result.join("\n")
end
# Apply soft delete annotations conditionally based on model introspection results.
# - Tables using a soft delete gem (paranoia, discard): always add SOFT DELETE annotation
# - Tables with enum soft delete but no gem: suppress SOFT DELETE (enum is the real mechanism)
# - Tables with neither: add SOFT DELETE annotation (assume column is used)
#
# Refactor: the "gem-managed" and "no mechanism" branches previously built the
# exact same annotations; they are collapsed into one path behind a single
# suppression check. Behavior unchanged.
def apply_soft_delete_annotations(soft_delete_tables:, enum_soft_delete_tables:)
  return if @deferred_soft_deletes.nil? || @deferred_soft_deletes.empty?

  new_annotations = {}
  @deferred_soft_deletes.each do |table, columns|
    # Suppress only when the enum is the real soft-delete mechanism and no gem
    # manages the timestamp column.
    next if enum_soft_delete_tables.include?(table) && !soft_delete_tables.include?(table)

    columns.each do |col|
      (new_annotations[table] ||= []) << " -- SOFT DELETE: filter #{col} IS NULL for active records"
    end
  end

  append_model_annotations(new_annotations) unless new_annotations.empty?
end
508
+
509
+ private
# -------------------------------------------------------------------
# Smart schema index builders (called at end of discover)
# -------------------------------------------------------------------

# Parse the lines array produced by discover() and split into per-table chunks.
# Each @per_table_schemas entry is the full multi-line string for one table,
# including its TABLE header line and all annotation lines. Lines appearing
# before the first TABLE header are discarded.
def build_per_table_schemas(lines)
  @per_table_schemas = {}
  pending_name = nil
  pending = []

  store = -> { @per_table_schemas[pending_name] = pending.join("\n") if pending_name }

  lines.each do |line|
    if line.start_with?("TABLE ")
      store.call
      pending_name = line.match(/^TABLE (\S+)/)[1]
      pending = [line]
    elsif pending_name
      pending << line
    end
  end
  store.call
end
# Build an inverted index: column_name => [table_names].
# Sensitive columns are skipped so they can never be used as search hints.
def build_table_index(columns_by_table)
  @table_index = columns_by_table.each_with_object({}) do |(table, columns), index|
    columns.each do |col|
      name = col["column_name"]
      (index[name] ||= []) << table unless self.class.sensitive?(name)
    end
  end
end
# Build a bidirectional FK graph: table_name => [{from_col:, to_table:, to_col:}]
# Edges come from explicit FK constraints (added in both directions) plus
# convention-based *_id columns whose base name pluralizes to a known table.
def build_fk_graph(fk_rows, columns_by_table, table_name_set)
  @fk_graph = {}

  add_edge = lambda do |table, from_col, to_table, to_col|
    (@fk_graph[table] ||= []) << { from_col: from_col, to_table: to_table, to_col: to_col }
  end

  # Explicit FK constraints, forward and reverse.
  fk_rows.each do |row|
    add_edge.call(row["from_table"], row["from_column"], row["to_table"], row["to_column"])
    add_edge.call(row["to_table"], row["to_column"], row["from_table"], row["from_column"])
  end

  # Convention-based: *_id columns that resolve to an existing table name.
  columns_by_table.each do |table, columns|
    columns.each do |col|
      name = col["column_name"]
      next unless name.end_with?("_id")

      stem = name[0..-4] # strip '_id'
      target = ["#{stem}s", "#{stem.sub(/y$/, 'ie')}s", "#{stem}es", stem]
               .find { |candidate| table_name_set.include?(candidate) }
      next unless target

      # Only add if the explicit FKs didn't already cover this edge.
      known = (@fk_graph[table] ||= []).any? { |e| e[:from_col] == name && e[:to_table] == target }
      unless known
        add_edge.call(table, name, target, "id")
        add_edge.call(target, "id", table, name)
      end
    end
  end
end
# Score tables by relevance to an array of free-text search terms.
#
# Heuristics (all matches accumulate per term; first hit per table wins
# within a term's table scan):
#   +10 exact table name
#   +8  singular/plural variant of the term
#   +7  Django-style "app_app" where app == term (else +4 for "app_model")
#   +6  model part (or its singular) of "app_model" == term
#   +3  column-name match (term or "term_id") via @table_index
#   +2  substring match on the table name (term >= 3 chars)
#
# Returns a Hash of table_name => positive score.
#
# Fixes vs. original: removed the unused local `table_set`, and the redundant
# double check of the variants list (`variants.include?(table) ||
# variants.any? { |v| v == table }` tested the same thing twice).
def score_tables(terms)
  scores = Hash.new(0)

  terms.each do |term|
    t = term.to_s.downcase.strip
    next if t.empty?

    @tables.each do |table|
      # Exact match (highest priority)
      if table == t
        scores[table] += 10
        next
      end

      # Singular/plural match
      variants = ["#{t}s", "#{t.sub(/y$/, 'ie')}s", "#{t}es", t.sub(/ies$/, 'y'), t.sub(/s$/, '')]
      if variants.include?(table)
        scores[table] += 8
        next
      end

      # Django/prefix match: "order" matches "order_order" (primary=7) vs "order_orderevent" (secondary=4)
      app_name, model_name = table.split("_", 2)
      if model_name
        if app_name == t
          scores[table] += (model_name == t || model_name == app_name) ? 7 : 4
          next
        end
        if model_name == t || model_name.sub(/s$/, "") == t
          scores[table] += 6
          next
        end
      end

      # General substring (low priority, min 3 chars)
      scores[table] += 2 if t.length >= 3 && table.include?(t)
    end

    # Column name match
    @table_index.each do |col_name, tables|
      if col_name == t || col_name == "#{t}_id"
        tables.each { |tbl| scores[tbl] += 3 }
      end
    end
  end

  scores.select { |_, v| v > 0 }
end
# Backward-compatible wrapper over #score_tables (kept because tests use it).
def match_tables(terms)
  Set.new(score_tables(terms).keys)
end

# Top +limit+ tables ranked by FK edge count (most-connected = hub tables).
# Used as the fallback when no search term matches any table.
def hub_tables(limit)
  ranked = @tables.sort_by { |t| -@fk_graph.fetch(t, []).length }
  Set.new(ranked.take(limit))
end
# BFS to find the shortest FK join path between two tables (max depth 2).
# Returns:
#   []  — tables are directly connected or identical (no bridge needed)
#   [t] — one bridge table t is needed
#   nil — no path found within max depth
def find_join_path(from, to)
  return [] if from == to

  # Depth 1: direct edge
  direct = (@fk_graph[from] || []).map { |edge| edge[:to_table] }
  return [] if direct.include?(to)

  # Depth 2: first neighbor that can reach the target
  bridge = direct.find do |mid|
    (@fk_graph[mid] || []).any? { |edge| edge[:to_table] == to }
  end

  bridge ? [bridge] : nil
end
# -------------------------------------------------------------------
# Legacy detection helpers (also called from discover loop above)
# -------------------------------------------------------------------

# Detect polymorphic associations: a VARCHAR/TEXT "<prefix>_type" column paired
# with an INT/BIGINT "<prefix>_id" column.
# Returns array of prefix strings (e.g., ["commentable", "taggable"]).
def detect_polymorphic(columns)
  types = columns.each_with_object({}) do |col, h|
    h[col["column_name"]] = self.class.map_type(col["data_type"])
  end

  types.each_with_object([]) do |(name, type), found|
    next unless name.end_with?("_type") && %w[VARCHAR TEXT].include?(type)

    prefix = name[0..-6] # strip '_type'
    found << prefix if %w[INT BIGINT].include?(types["#{prefix}_id"])
  end
end

# Detect soft-delete column. Returns the first matching column name or nil.
def detect_soft_delete(columns)
  hit = columns.find { |col| SOFT_DELETE_COLUMNS.include?(col["column_name"]) }
  hit && hit["column_name"]
end
# Parse a check constraint definition for IN (...) or ANY(ARRAY[...]) enum patterns.
# Handles the two shapes Postgres emits:
#   ((col)::text = ANY ((ARRAY['a'::varchar, 'b'::varchar])::text[]))
#   (col IN ('a', 'b', 'c'))
# Returns [column_name, [values]] or nil.
def parse_check_constraint(check_def)
  match = check_def.match(/\(\((\w+)\)::\w+\s*=\s*ANY\s*\(\(?ARRAY\[([^\]]+)\]/i) ||
          check_def.match(/\((\w+)\s+IN\s*\(([^)]+)\)/i)
  return nil unless match

  col_name, raw = match.captures
  return nil if raw.nil? || raw.empty?

  # Strip quoting and any "::type" cast suffix from each value.
  values = raw.split(",").map do |piece|
    piece.strip.sub(/^'([^']*)'(?:::\w+.*)?$/, '\1').strip
  end
  values = values.reject(&:empty?)

  values.empty? ? nil : [col_name, values]
end
# -------------------------------------------------------------------
# SQL query helpers (require ActiveRecord::Base.connection)
# All queries are PostgreSQL-specific (information_schema / pg_catalog)
# and hard-coded to the 'public' schema.
# -------------------------------------------------------------------

# All base table names (no views), alphabetically ordered.
def query_tables(conn)
  rows = conn.exec_query(<<~SQL)
    SELECT table_name FROM information_schema.tables
    WHERE table_schema = 'public' AND table_type = 'BASE TABLE'
    ORDER BY table_name
  SQL
  rows.map { |r| r["table_name"] }
end

# Every column of every public table, in table/ordinal order.
# Returns an array of row hashes (table_name, column_name, data_type,
# udt_name, is_nullable, column_default).
def query_columns(conn)
  conn.exec_query(<<~SQL).to_a
    SELECT table_name, column_name, data_type, udt_name, is_nullable, column_default
    FROM information_schema.columns
    WHERE table_schema = 'public'
    ORDER BY table_name, ordinal_position
  SQL
end

# Primary-key (table_name, column_name) pairs for all public tables.
def query_primary_keys(conn)
  conn.exec_query(<<~SQL).to_a
    SELECT kcu.table_name, kcu.column_name
    FROM information_schema.table_constraints tc
    JOIN information_schema.key_column_usage kcu
      ON tc.constraint_name = kcu.constraint_name
      AND tc.constraint_schema = kcu.constraint_schema
    WHERE tc.constraint_type = 'PRIMARY KEY' AND tc.table_schema = 'public'
  SQL
end

# Foreign-key edges as (from_table, from_column, to_table, to_column) rows.
def query_foreign_keys(conn)
  conn.exec_query(<<~SQL).to_a
    SELECT
      kcu.table_name AS from_table,
      kcu.column_name AS from_column,
      ccu.table_name AS to_table,
      ccu.column_name AS to_column
    FROM information_schema.table_constraints tc
    JOIN information_schema.key_column_usage kcu
      ON tc.constraint_name = kcu.constraint_name
      AND tc.constraint_schema = kcu.constraint_schema
    JOIN information_schema.constraint_column_usage ccu
      ON tc.constraint_name = ccu.constraint_name
      AND tc.constraint_schema = ccu.constraint_schema
    WHERE tc.constraint_type = 'FOREIGN KEY' AND tc.table_schema = 'public'
  SQL
end

# Native PG enum labels as (enum_name, enum_value) rows, in declared order.
def query_enums(conn)
  conn.exec_query(<<~SQL).to_a
    SELECT t.typname AS enum_name, e.enumlabel AS enum_value
    FROM pg_enum e
    JOIN pg_type t ON e.enumtypid = t.oid
    ORDER BY t.typname, e.enumsortorder
  SQL
end

# CHECK constraint definitions as (table_name, check_def) rows; parsed later
# by #parse_check_constraint for enum-like value lists.
def query_check_constraints(conn)
  conn.exec_query(<<~SQL).to_a
    SELECT conrelid::regclass AS table_name, pg_get_constraintdef(oid) AS check_def
    FROM pg_constraint
    WHERE contype = 'c' AND connamespace = 'public'::regnamespace
  SQL
end

# Get approximate row counts for all tables from pg_stat_user_tables
# (planner statistics, not exact COUNT(*)).
# Returns Hash of table_name => integer count.
def query_row_counts(conn)
  # NOTE: the heredoc body sits between the call line and the block body —
  # unusual-looking but valid Ruby; the SQL lines are not part of the block.
  conn.exec_query(<<~SQL).to_a.each_with_object({}) do |r, h|
    SELECT relname, n_live_tup FROM pg_stat_user_tables WHERE schemaname = 'public'
  SQL
    h[r["relname"]] = r["n_live_tup"].to_i
  end
end
# Query lookup values for small FK-target tables (fewer than ~50 rows per
# pg_stat_user_tables statistics — approximate, so a freshly ANALYZEd table
# may be miscounted).
# Returns Hash of table_name => "id1=name1, id2=name2, ..."
def discover_lookup_values(conn, fk_target_tables, columns_by_table, pk_set)
  result = {}
  return result if fk_target_tables.empty?

  # Get approximate row counts
  stats_rows = conn.exec_query(<<~SQL).to_a
    SELECT relname, n_live_tup FROM pg_stat_user_tables WHERE schemaname = 'public'
  SQL

  row_counts = stats_rows.each_with_object({}) do |r, h|
    h[r["relname"]] = r["n_live_tup"].to_i
  end

  fk_target_tables.each do |table|
    # Only small tables qualify as lookup tables; tables with no stats row
    # (count nil) are skipped too.
    count = row_counts[table]
    next if count.nil? || count >= 50

    columns = columns_by_table[table]
    next unless columns

    # Find PK column — needed both for ordering and as the pair's id part.
    pk_col = columns.find { |c| pk_set.include?("#{table}.#{c['column_name']}") }
    next unless pk_col

    # Find first VARCHAR/TEXT non-PK column as display value
    display_col = columns.find do |c|
      next false if pk_set.include?("#{table}.#{c['column_name']}")

      mapped = self.class.map_type(c["data_type"])
      %w[VARCHAR TEXT].include?(mapped)
    end
    next unless display_col

    pk_name = pk_col["column_name"]
    display_name = display_col["column_name"]

    begin
      # Identifiers are quoted via the adapter; LIMIT 50 keeps annotations bounded.
      values_rows = conn.exec_query(
        "SELECT #{conn.quote_column_name(pk_name)}, #{conn.quote_column_name(display_name)} " \
        "FROM #{conn.quote_table_name(table)} " \
        "ORDER BY #{conn.quote_column_name(pk_name)} LIMIT 50"
      ).to_a

      if values_rows.any?
        pairs = values_rows.map { |r| "#{r[pk_name]}=#{r[display_name]}" }
        result[table] = pairs.join(", ")
      end
    rescue StandardError
      # Skip tables that fail (e.g., permission issues) — lookup values are a
      # best-effort enrichment, never fatal.
    end
  end

  result
end
882
+ end
883
+ end
884
+ end