contsql 0.3.1__tar.gz → 0.3.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: contsql
3
- Version: 0.3.1
3
+ Version: 0.3.6
4
4
  Requires-Python: >=3.10
5
5
  Requires-Dist: duckdb
6
6
  Requires-Dist: requests
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: contsql
3
- Version: 0.3.1
3
+ Version: 0.3.6
4
4
  Requires-Python: >=3.10
5
5
  Requires-Dist: duckdb
6
6
  Requires-Dist: requests
@@ -1,5 +1,5 @@
1
1
  #!/usr/bin/env python3
2
- # v0.2.9 | 2026-04-13 | JSONL session loglama her şey kaydedilir, default açık
2
+ # v0.3.5 | 2026-04-14 | warmup + DB health check + 0-satır teşhis
3
3
  """contsql — Minimal DuckDB SQL agent. Soru sor, SQL üret, çalıştır, göster."""
4
4
 
5
5
  import argparse
@@ -7,6 +7,7 @@ import json
7
7
  import os
8
8
  import re
9
9
  import sys
10
+ import threading
10
11
  import time
11
12
  from pathlib import Path
12
13
 
@@ -22,6 +23,66 @@ TIMEOUT = int(os.environ.get("CONTSQL_TIMEOUT", "120"))
22
23
 
23
24
  BANNED_SQL = ["INSERT", "UPDATE", "DELETE", "DROP", "ALTER", "CREATE", "TRUNCATE", "EXEC"]
24
25
 
26
+ # ── Warmup + Health Check ──
27
+
28
+ CRITICAL_TABLES = [
29
+ ("fact_periodic", "Dönemsel risk verileri"),
30
+ ("map_identity", "Firma kimlik bilgileri"),
31
+ ("dim_entity", "Firma boyut bilgileri"),
32
+ ]
33
+
34
+
35
def _warmup_model():
    """Fire a one-token generation request at Ollama to preload the model.

    Intended to run on a background thread. The request is best-effort:
    every failure is ignored on purpose so that startup is never blocked
    by an unavailable or slow model server.
    """
    try:
        endpoint = f"{OLLAMA_URL}/api/generate"
        payload = {
            "model": MODEL,
            "prompt": " ",
            "options": {"num_predict": 1},
        }
        requests.post(endpoint, json=payload, timeout=60)
    except Exception:
        # Warmup is only an optimisation; nothing to report or recover.
        return
43
+
44
+
45
def _db_health_check(conn):
    """Verify that every table listed in CRITICAL_TABLES can be counted and is non-empty.

    A warning block is printed when at least one table is empty or the
    counting query fails.

    Args:
        conn: Connection whose ``execute(sql).fetchone()`` yields a count row.

    Returns:
        True when no warning was produced, False otherwise.
    """
    problems = []
    for table, desc in CRITICAL_TABLES:
        try:
            row = conn.execute(f"SELECT COUNT(*) FROM {table}").fetchone()
            is_empty = row[0] == 0
        except Exception:
            # Any failure of the probe query is reported as a missing table.
            problems.append(f" ⚠ {table} tablosu bulunamadı.")
            continue
        if is_empty:
            problems.append(f" ⚠ {table} boş — {desc} yüklenmemiş.")

    if not problems:
        return True

    print("⚠ DB UYARI:")
    print("\n".join(problems))
    print()
    return False
61
+
62
+
63
def _diagnose_empty_result(sql, conn):
    """Suggest a likely reason why *sql* returned zero rows.

    The filters found in the query text are probed against ``map_identity``
    in this order: a ``muta = '<digits>'`` literal, an ``entity_id``
    equality / IN filter (first id only), then an ``ILIKE '<pattern>'``
    filter, which is checked against the ``unvan`` column. The first filter
    value that matches no row produces the hint.

    Args:
        sql: The SQL text that was executed.
        conn: Connection whose ``execute(...)`` result supports ``fetchone()``.

    Returns:
        A user-facing hint string, or None when no filter could be blamed
        or a probe query itself failed.
    """
    def _count(query, params=None):
        # Only pass a parameter list when there is one, so plain queries
        # are issued exactly as before.
        result = conn.execute(query) if params is None else conn.execute(query, params)
        return result.fetchone()[0]

    try:
        # Keywords and identifiers are matched case-insensitively: generated
        # SQL may spell them as MUTA / Entity_Id / in (...).
        muta_match = re.search(r"muta\s*=\s*'(\d+)'", sql, re.IGNORECASE)
        if muta_match:
            tid = muta_match.group(1)
            # tid is digits only (enforced by the regex), so inlining is safe
            # and keeps the literal's implicit typing unchanged.
            if _count(f"SELECT COUNT(*) FROM map_identity WHERE muta='{tid}'") == 0:
                return f"💡 MUTA {tid} veritabanında yok."

        entity_match = re.search(r"entity_id\s*(?:=|IN)\s*\(?'?(\d+)", sql, re.IGNORECASE)
        if entity_match:
            tid = entity_match.group(1)
            if _count(f"SELECT COUNT(*) FROM map_identity WHERE entity_id='{tid}'") == 0:
                return f"💡 Entity {tid} veritabanında yok."

        ilike_match = re.search(r"ILIKE\s+'([^']+)'", sql, re.IGNORECASE)
        if ilike_match:
            pattern = ilike_match.group(1)
            # Free text taken from model output: bind it instead of splicing
            # it into the statement.
            if _count("SELECT COUNT(*) FROM map_identity WHERE unvan ILIKE ?", [pattern]) == 0:
                return f"💡 '{pattern}' ile eşleşen firma yok."
    except Exception:
        # Diagnosis is best-effort. A failing probe (e.g. map_identity is
        # missing) must not raise into the caller, which only expects a
        # hint or None.
        return None

    return None
84
+
85
+
25
86
  # ── Loglama ──
26
87
 
27
88
  _LOG_FILE = None
@@ -125,16 +186,18 @@ def has_reference_trigger(question):
125
186
 
126
187
 
127
188
  SORGU_TRIGGERS = [
128
- "yanına ekle", "yanına da ekle", "yanına", "buna ekle", "buna da ekle",
189
+ "yanına ekle", "yanına da ekle", "yanına", "yanlarına",
190
+ "buna ekle", "buna da ekle",
129
191
  "kolonu da ekle", "kolonunu da ekle", "bir de", "aynısına",
130
192
  "aynı sorguya", "aynı sorgu", "üstüne ekle", "ekle yanına",
131
193
  "göster yanında", "da göster", "da getir", "de göster", "de getir",
132
194
  "tablodan", "tablodaki", "tabloyu", "tabloya",
195
+ "altına ekle", "altına", "altına da ekle",
133
196
  "çıkart", "çıkar", "kaldır", "at şunu", "filtrele", "daralt",
134
197
  "sadece", "hariç", "hariç tut",
135
198
  ]
136
199
 
137
- MAX_SQL_CONTEXT_LENGTH = 500
200
+ MAX_SQL_CONTEXT_LENGTH = 1000
138
201
 
139
202
 
140
203
  def has_query_trigger(question):
@@ -145,6 +208,38 @@ def has_query_trigger(question):
145
208
 
146
209
  # ── System prompt ──
147
210
 
211
def _split_schema_columns(cols):
    """Split a column list on commas that are not nested inside parentheses."""
    items, depth, start = [], 0, 0
    for pos, ch in enumerate(cols):
        if ch == "(":
            depth += 1
        elif ch == ")":
            depth = max(depth - 1, 0)
        elif ch == "," and depth == 0:
            items.append(cols[start:pos].strip())
            start = pos + 1
    items.append(cols[start:].strip())
    return items


def _compact_schema(schema_text):
    """Convert the schema listing into a compact one-line-per-table form.

    A table line such as ``tablo (N satır): col1 (TYPE), col2 (TYPE)``
    becomes ``tablo(col1 TYPE, col2)``: the type is kept only for VARCHAR,
    BIGINT and DOUBLE columns and dropped for everything else. Blank lines
    are removed; any other line is passed through stripped.

    Args:
        schema_text: Multi-line schema description.

    Returns:
        The compacted schema as a single newline-joined string.
    """
    kept_types = ("VARCHAR", "BIGINT", "DOUBLE")
    compact = []
    for raw in schema_text.strip().split("\n"):
        line = raw.strip()
        if not line:
            continue
        # Split on the FIRST colon only: type names such as STRUCT(a: INT)
        # may contain further colons that belong to the column list.
        head, sep, cols = line.partition(":")
        if "satır" not in line or not sep:
            # Not a parsable table line: keep it rather than lose content.
            compact.append(line)
            continue

        table = head.split("(")[0].strip()
        rendered = []
        # Parenthesis-aware split so "x (DECIMAL(18, 2))" stays one column.
        for col in _split_schema_columns(cols.strip()):
            name, paren, rest = col.partition("(")
            if not paren:
                rendered.append(col)
                continue
            typ = rest.strip()
            if typ.endswith(")"):
                typ = typ[:-1]  # drop only the parenthesis closing "(TYPE)"
            name = name.strip()
            rendered.append(f"{name} {typ}" if typ in kept_types else name)
        compact.append(f"{table}({', '.join(rendered)})")
    return "\n".join(compact)
241
+
242
+
148
243
  def build_system_prompt(schema_text, domain_text="", last_result_entities=None,
149
244
  question=None, last_sql=None, column_hints=""):
150
245
  prompt = f"""Sen bir SQL asistanısın. Kullanıcının sorusuna uygun SQL yaz.
@@ -161,10 +256,10 @@ Kurallar:
161
256
  Veritabanı şeması:
162
257
  {schema_text}
163
258
  """
164
- if domain_text:
165
- prompt += f"\nDomain bilgisi:\n{domain_text}\n"
166
259
  if column_hints:
167
260
  prompt += f"\n{column_hints}\n"
261
+ if domain_text:
262
+ prompt += f"\nDomain:\n{domain_text}\n"
168
263
  if last_sql and question and has_query_trigger(question):
169
264
  if len(last_sql) <= MAX_SQL_CONTEXT_LENGTH:
170
265
  prompt += f"""\nÖNCEKİ SQL'İ MODİFİYE ET:
@@ -232,6 +327,49 @@ def _like_to_ilike(sql):
232
327
  )
233
328
 
234
329
 
330
VARCHAR_SCORE_COLUMNS = ["ews_skor", "yis_skor", "eus_skor"]

_ORDER_BY_RE = re.compile(r"\bORDER\s+BY\b", re.IGNORECASE)
# Keywords that terminate an ORDER BY sort-key list when met outside parentheses.
_SORT_LIST_END_RE = re.compile(
    r"(?:LIMIT|OFFSET|FETCH|UNION|INTERSECT|EXCEPT)\b", re.IGNORECASE)
# A sort key may safely take a trailing NULLS LAST only if it ends like this.
_SORT_KEY_TAIL_RE = re.compile(r"(?:\)|\b(?:ASC|DESC))$", re.IGNORECASE)


def _sort_key_spans(sql, start):
    """Return (begin, end) index pairs of the sort keys of the ORDER BY list at *start*."""
    spans = []
    depth = 0
    begin = pos = start
    size = len(sql)
    while pos < size:
        ch = sql[pos]
        if ch in "'\"":
            # Skip quoted text so commas/keywords inside it are ignored.
            close = sql.find(ch, pos + 1)
            pos = size if close == -1 else close + 1
            continue
        if ch == "(":
            depth += 1
        elif ch == ")":
            if depth == 0:
                break  # closes the enclosing subquery / window spec
            depth -= 1
        elif depth == 0:
            if ch == ";":
                break
            if ch == ",":
                spans.append((begin, pos))
                begin = pos + 1
            elif _SORT_LIST_END_RE.match(sql, pos):
                prev = sql[pos - 1] if pos else " "
                if not (prev.isalnum() or prev in "_."):
                    break
        pos += 1
    spans.append((begin, min(pos, size)))
    return spans


def _cast_sort_key(key, col_re):
    """Wrap score columns of one sort key in TRY_CAST and add NULLS LAST when safe."""
    def _wrap(m):
        # Leave the key alone once a TRY_CAST already precedes the column.
        if "TRY_CAST" in key[:m.start()].upper():
            return m.group(0)
        return f"TRY_CAST({m.group(1)} AS DOUBLE)"

    fixed = col_re.sub(_wrap, key)
    body = fixed.rstrip()
    if ("TRY_CAST" in body.upper()
            and not re.search(r"\bNULLS\b", body, re.IGNORECASE)
            and _SORT_KEY_TAIL_RE.search(body)):
        # Failed casts become NULL; keep those rows at the end.
        fixed = f"{body} NULLS LAST{fixed[len(body):]}"
    return fixed


def _fix_varchar_sort(sql):
    """Wrap VARCHAR score columns used in ORDER BY with TRY_CAST(... AS DOUBLE).

    Only text inside an ORDER BY sort-key list is rewritten. ``NULLS LAST``
    is attached to the individual sort key that contains a TRY_CAST, never
    to the end of the statement, so a following LIMIT / OFFSET / ``;`` stays
    valid. Columns that merely contain a score-column name (``my_ews_skor``)
    are left untouched.

    Args:
        sql: SQL statement text.

    Returns:
        The rewritten SQL, or *sql* unchanged when nothing applies.
    """
    if not VARCHAR_SCORE_COLUMNS:
        return sql
    names = "|".join(re.escape(col) for col in VARCHAR_SCORE_COLUMNS)
    col_re = re.compile(
        rf"(?<![\w.])((?:\w+\.)*(?:{names}))\b(?!\s*AS\b)", re.IGNORECASE)

    pieces = []
    pos = 0
    for match in _ORDER_BY_RE.finditer(sql):
        if match.start() < pos:
            continue  # nested inside a sort list that was already rewritten
        pieces.append(sql[pos:match.end()])
        pos = match.end()
        for begin, end in _sort_key_spans(sql, match.end()):
            pieces.append(sql[pos:begin])  # the separating comma, if any
            pieces.append(_cast_sort_key(sql[begin:end], col_re))
            pos = end
    pieces.append(sql[pos:])
    return "".join(pieces)
350
+
351
+
352
def _fix_varchar_comparison(sql, columns=None):
    """Make numeric comparisons on VARCHAR score columns compare as numbers.

    ``ews_skor > 500`` becomes ``TRY_CAST(ews_skor AS DOUBLE) > 500``. An
    optional table qualifier is kept inside the cast. Only whole column
    names are matched, so ``my_ews_skor > 5`` is left alone, and a column
    that is already cast (followed by ``AS``) never matches.

    Args:
        sql: SQL statement text.
        columns: Column names to treat as VARCHAR scores. Defaults to the
            module-level VARCHAR_SCORE_COLUMNS.

    Returns:
        The rewritten SQL, or *sql* unchanged when nothing applies.
    """
    score_columns = VARCHAR_SCORE_COLUMNS if columns is None else columns
    if not score_columns:
        return sql

    names = "|".join(re.escape(col) for col in score_columns)
    comparison = re.compile(
        # (?<![\w.]) is the left word boundary; the literal may be negative
        # and/or have a fractional part.
        rf"(?<![\w.])((?:\w+\.)*(?:{names}))\s*([><=!]+)\s*(-?\d+(?:\.\d+)?)",
        re.IGNORECASE,
    )

    def _cast(m):
        return f"TRY_CAST({m.group(1)} AS DOUBLE) {m.group(2)} {m.group(3)}"

    return comparison.sub(_cast, sql)
363
+
364
+
365
def _apply_guardrails(sql):
    """Run every SQL post-processing guardrail over *sql*, in a fixed order.

    Order: LIKE → ILIKE rewrite, then the VARCHAR sort fix, then the
    VARCHAR comparison fix. Each step receives the previous step's output.
    """
    pipeline = (_like_to_ilike, _fix_varchar_sort, _fix_varchar_comparison)
    for guardrail in pipeline:
        sql = guardrail(sql)
    return sql
371
+
372
+
235
373
  def check_sql_safety(sql):
236
374
  """Sadece SELECT/WITH izinli. Tehlikeli keyword varsa hata döndür."""
237
375
  sql_upper = sql.strip().upper()
@@ -246,7 +384,7 @@ def check_sql_safety(sql):
246
384
  # ── LLM call ──
247
385
 
248
386
  def ask_model(system_prompt, question):
249
- """Ollama'ya soru gönder, yanıt al."""
387
+ """Ollama'ya soru gönder, yanıt + timing döndür."""
250
388
  t0 = time.time()
251
389
  try:
252
390
  resp = requests.post(
@@ -266,9 +404,17 @@ def ask_model(system_prompt, question):
266
404
  content = data.get("message", {}).get("content", "")
267
405
  elapsed = time.time() - t0
268
406
  tokens = data.get("eval_count", 0)
269
- return content, elapsed, tokens
407
+ # Ollama timing metrikleri (nanosecond → ms)
408
+ timing = {
409
+ "prompt_eval_ms": data.get("prompt_eval_duration", 0) / 1e6,
410
+ "generation_ms": data.get("eval_duration", 0) / 1e6,
411
+ "prompt_tokens": data.get("prompt_eval_count", 0),
412
+ "gen_tokens": tokens,
413
+ "prompt_chars": len(system_prompt),
414
+ }
415
+ return content, elapsed, tokens, timing
270
416
  except Exception as e:
271
- return f"LLM HATA: {e}", time.time() - t0, 0
417
+ return f"LLM HATA: {e}", time.time() - t0, 0, {}
272
418
 
273
419
 
274
420
  def _short_error(msg):
@@ -331,21 +477,21 @@ def generate_sql(conn, question, last_result_entities=None, domain_text="",
331
477
  column_hints = format_column_hints(col_map) if col_map else ""
332
478
  system_prompt = build_system_prompt(schema_text, domain_text, last_result_entities,
333
479
  question=question, column_hints=column_hints)
334
- response, _, _ = ask_model(system_prompt, question)
480
+ response, _, _, _ = ask_model(system_prompt, question)
335
481
  sql = extract_sql(response)
336
482
  if not sql or check_sql_safety(sql):
337
483
  return None
338
- sql = _like_to_ilike(sql)
484
+ sql = _apply_guardrails(sql)
339
485
 
340
486
  # Genel SQL hata retry: EXPLAIN ile ön kontrol
341
487
  try:
342
488
  conn.execute(f"EXPLAIN {sql}")
343
489
  except Exception as e:
344
490
  retry_q = _build_retry_prompt(question, sql, e, col_map)
345
- resp2, _, _ = ask_model(system_prompt, retry_q)
491
+ resp2, _, _, _ = ask_model(system_prompt, retry_q)
346
492
  sql2 = extract_sql(resp2)
347
493
  if sql2 and not check_sql_safety(sql2):
348
- return _like_to_ilike(sql2)
494
+ return _apply_guardrails(sql2)
349
495
  return None
350
496
 
351
497
  return sql
@@ -408,11 +554,17 @@ def format_table(columns, rows, max_rows=50):
408
554
  # ── Main loop ──
409
555
 
410
556
  def _extract_entity_ids(columns, rows, max_entities=100):
411
- """Sorgu sonucundan entity_id listesini çıkar. Yoksa None döner."""
557
+ """Sorgu sonucundan entity_id veya muta listesini çıkar. Yoksa None döner."""
412
558
  col_lower = [c.lower() for c in columns]
413
- if "entity_id" not in col_lower:
559
+ # entity_id veya muta — ikisi de firma kimliği
560
+ id_col = None
561
+ for name in ("entity_id", "muta"):
562
+ if name in col_lower:
563
+ id_col = name
564
+ break
565
+ if id_col is None:
414
566
  return None
415
- idx = col_lower.index("entity_id")
567
+ idx = col_lower.index(id_col)
416
568
  ids = list(dict.fromkeys(row[idx] for row in rows if row[idx] is not None))
417
569
  if len(ids) > max_entities:
418
570
  return None # çok geniş, context'e ekleme
@@ -422,11 +574,17 @@ def _extract_entity_ids(columns, rows, max_entities=100):
422
574
  def run_query(conn, system_prompt, question, col_map=None):
423
575
  """Tek soru → SQL → çalıştır → sonuç. (entity_id listesi, sql) tuple döndürür."""
424
576
  # 1. Model'e sor
425
- response, elapsed, tokens = ask_model(system_prompt, question)
577
+ response, elapsed, tokens, timing = ask_model(system_prompt, question)
426
578
  _log("model", question=question, elapsed=round(elapsed, 1), tokens=tokens,
427
- response=response)
579
+ response=response, timing=timing)
428
580
 
429
581
  print(f"\n💭 MODEL ({elapsed:.1f}s, ~{tokens} tok)")
582
+ if timing.get("prompt_eval_ms"):
583
+ pe = timing["prompt_eval_ms"]
584
+ ge = timing["generation_ms"]
585
+ pt = timing["prompt_tokens"]
586
+ pc = timing["prompt_chars"]
587
+ print(f"⏱ Prompt: {pc} char ({pt} tok) → eval: {pe:.0f}ms | gen: {ge:.0f}ms")
430
588
  if not response.startswith("LLM HATA"):
431
589
  thought = re.sub(r'```sql.*?```', '', response, flags=re.DOTALL).strip()
432
590
  if thought:
@@ -449,7 +607,7 @@ def run_query(conn, system_prompt, question, col_map=None):
449
607
  return None, None
450
608
 
451
609
  # 3b. LIKE → ILIKE guardrail
452
- sql = _like_to_ilike(sql)
610
+ sql = _apply_guardrails(sql)
453
611
 
454
612
  print(f"🔍 SQL: {sql}")
455
613
 
@@ -463,7 +621,11 @@ def run_query(conn, system_prompt, question, col_map=None):
463
621
 
464
622
  print(f"\n📊 SONUÇ ({len(rows)} satır, {query_ms:.0f}ms)")
465
623
  print(format_table(columns, rows))
466
- if len(rows) > 50:
624
+ if len(rows) == 0:
625
+ hint = _diagnose_empty_result(sql, conn)
626
+ if hint:
627
+ print(f" {hint}")
628
+ elif len(rows) > 50:
467
629
  print(f" ⚠ {len(rows)} satır döndü, ilk 50 gösteriliyor. Soruyu daraltın.")
468
630
 
469
631
  # Entity context çıkar
@@ -478,10 +640,10 @@ def run_query(conn, system_prompt, question, col_map=None):
478
640
  # Genel SQL hata retry — tek retry, her hata tipinde
479
641
  print(f"🔄 Retry ({_short_error(e)})...")
480
642
  retry_q = _build_retry_prompt(question, sql, e, col_map)
481
- resp2, _, _ = ask_model(system_prompt, retry_q)
643
+ resp2, _, _, _ = ask_model(system_prompt, retry_q)
482
644
  sql2 = extract_sql(resp2)
483
645
  if sql2 and not check_sql_safety(sql2):
484
- sql2 = _like_to_ilike(sql2)
646
+ sql2 = _apply_guardrails(sql2)
485
647
  print(f"🔍 Retry SQL: {sql2}")
486
648
  try:
487
649
  result = conn.execute(sql2)
@@ -587,6 +749,64 @@ def interactive_loop(conn, schema_text, domain_text, col_map):
587
749
  print()
588
750
 
589
751
 
752
BENCH_QUESTIONS = [
    "toplam kaç firma var",
    "en büyük firma hangisi",
    "kobi segmentinde kaç firma var",
    "en riskli 5 firma",
    "10000041 mutanın bilgileri",
    "son dönemde riski artan firmalar",
    "ews skoru 500den büyük firmalar",
    "en büyük 3 kobi firması",
    "2602 döneminde kaç firma var",
    "kombine riski 10 milyonun üstünde olan firmalar",
]


def run_benchmark(conn, schema_text, domain_text, col_map):
    """Ask every benchmark question twice and print latency statistics.

    Each pass sends all BENCH_QUESTIONS to the model; the generated SQL is
    not executed. After both passes a summary (mean, P50, P95, mean prompt
    evaluation and generation time) is printed per pass, followed by a
    per-question table comparing both passes.

    Args:
        conn: Database connection (not used by the benchmark itself).
        schema_text: Schema description placed in the system prompt.
        domain_text: Domain notes placed in the system prompt.
        col_map: Column-owner map used to build the column hints.
    """
    column_hints = format_column_hints(col_map)

    def _measure(question):
        # One model round-trip; the answer text itself is discarded.
        prompt = build_system_prompt(schema_text, domain_text,
                                     question=question, column_hints=column_hints)
        _, elapsed, tokens, timing = ask_model(prompt, question)
        return {"question": question, "elapsed": elapsed, "tokens": tokens, **timing}

    def _run_pass():
        return [_measure(question) for question in BENCH_QUESTIONS]

    base_prompt = build_system_prompt(schema_text, domain_text, column_hints=column_hints)
    print(f"contsql benchmark — {len(BENCH_QUESTIONS)} soru × 2 tur\n")
    print(f"Prompt boyutu: {len(base_prompt)} char\n")

    cold = _run_pass()
    warm = _run_pass()

    for label, results in (("TUR 1 (cold)", cold), ("TUR 2 (warm)", warm)):
        ordered = sorted(r["elapsed"] for r in results)
        prompt_eval = [r.get("prompt_eval_ms", 0) for r in results]
        generation = [r.get("generation_ms", 0) for r in results]
        count = len(ordered)
        print(f"{label}:")
        print(f" Ort: {sum(ordered)/count:.1f}s | "
              f"P50: {ordered[count//2]:.1f}s | "
              f"P95: {ordered[int(count*0.95)]:.1f}s")
        print(f" Prompt eval ort: {sum(prompt_eval)/len(prompt_eval):.0f}ms | "
              f"Gen ort: {sum(generation)/len(generation):.0f}ms")
        print()

    # Per-question detail; the eval/gen columns come from the second pass.
    print(f"{'#':>2} | {'Soru':<45} | {'Cold':>5} | {'Warm':>5} | {'P.Eval':>7} | {'Gen':>5}")
    print("-" * 85)
    for number, (first, second) in enumerate(zip(cold, warm), start=1):
        print(f"{number:2d} | {first['question'][:45]:<45} | "
              f"{first['elapsed']:5.1f} | {second['elapsed']:5.1f} | "
              f"{second.get('prompt_eval_ms',0):7.0f} | {second.get('generation_ms',0):5.0f}")
808
+
809
+
590
810
  def main():
591
811
  parser = argparse.ArgumentParser(
592
812
  description="contsql — DuckDB SQL agent",
@@ -596,6 +816,7 @@ def main():
596
816
  parser.add_argument("question", nargs="?", help="Tek soru (opsiyonel)")
597
817
  global MODEL
598
818
  parser.add_argument("--model", default=MODEL, help="Ollama model adı (default: cont-local)")
819
+ parser.add_argument("--bench", action="store_true", help="Benchmark modu (10 soru × 2 tur)")
599
820
  args = parser.parse_args()
600
821
  MODEL = args.model
601
822
 
@@ -610,16 +831,31 @@ def main():
610
831
  # Loglama başlat
611
832
  log_path = _init_log(db_path)
612
833
 
613
- # Schema + domain + column map
834
+ # Warmup background thread
835
+ warmup_t = threading.Thread(target=_warmup_model, daemon=True)
836
+ t0_warmup = time.time()
837
+ warmup_t.start()
838
+
839
+ # Schema + domain + column map (warmup parallel çalışır)
614
840
  schema_text = read_schema(conn)
615
841
  domain_text = read_domain_notes(str(db_path))
616
842
  col_map = build_column_owner_map(conn)
617
843
 
844
+ # Warmup bitmesini bekle
845
+ warmup_t.join(timeout=60)
846
+ warmup_ms = (time.time() - t0_warmup) * 1000
847
+
848
+ # DB health check
849
+ db_ok = _db_health_check(conn)
850
+
618
851
  print(f"DB: {db_path} | Model: {MODEL} | Log: {log_path}")
619
- _log("session_start", db=str(db_path), model=MODEL)
852
+ print(f"🔥 Warmup: {warmup_ms:.0f}ms")
853
+ _log("session_start", db=str(db_path), model=MODEL, warmup_ms=round(warmup_ms))
620
854
 
621
- # Tek soru veya interaktif
622
- if args.question:
855
+ # Benchmark, tek soru veya interaktif
856
+ if args.bench:
857
+ run_benchmark(conn, schema_text, domain_text, col_map)
858
+ elif args.question:
623
859
  column_hints = format_column_hints(col_map)
624
860
  system_prompt = build_system_prompt(schema_text, domain_text,
625
861
  question=args.question,
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "contsql"
7
- version = "0.3.1"
7
+ version = "0.3.6"
8
8
  requires-python = ">=3.10"
9
9
  dependencies = ["duckdb", "requests"]
10
10
 
File without changes
File without changes
File without changes
File without changes