PyPI - contsql - Versions diffs - 0.3.1__tar.gz → 0.3.6__tar.gz - Mend

contsql 0.3.1 tar.gz → 0.3.6 tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

{contsql-0.3.1 → contsql-0.3.6}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: contsql
-Version: 0.3.1
+Version: 0.3.6
 Requires-Python: >=3.10
 Requires-Dist: duckdb
 Requires-Dist: requests

{contsql-0.3.1 → contsql-0.3.6}/contsql.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: contsql
-Version: 0.3.1
+Version: 0.3.6
 Requires-Python: >=3.10
 Requires-Dist: duckdb
 Requires-Dist: requests

{contsql-0.3.1 → contsql-0.3.6}/contsql.py RENAMED Viewed

@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-# v0.2.9 | 2026-04-13 | JSONL session loglama — her şey kaydedilir, default açık
+# v0.3.5 | 2026-04-14 | warmup + DB health check + 0-satır teşhis
 """contsql — Minimal DuckDB SQL agent. Soru sor, SQL üret, çalıştır, göster."""
 import argparse
@@ -7,6 +7,7 @@ import json
 import os
 import re
 import sys
+import threading
 import time
 from pathlib import Path
@@ -22,6 +23,66 @@ TIMEOUT = int(os.environ.get("CONTSQL_TIMEOUT", "120"))
 BANNED_SQL = ["INSERT", "UPDATE", "DELETE", "DROP", "ALTER", "CREATE", "TRUNCATE", "EXEC"]
+# ── Warmup + Health Check ──
+CRITICAL_TABLES = [
+    ("fact_periodic", "Dönemsel risk verileri"),
+    ("map_identity", "Firma kimlik bilgileri"),
+    ("dim_entity", "Firma boyut bilgileri"),
+]
+def _warmup_model():
+    """Model'i GPU'ya yükle — background thread."""
+    try:
+        requests.post(f"{OLLAMA_URL}/api/generate", json={
+            "model": MODEL, "prompt": " ", "options": {"num_predict": 1}
+        }, timeout=60)
+    except Exception:
+        pass
+def _db_health_check(conn):
+    """Ana tabloların doluluk kontrolü."""
+    warnings = []
+    for table, desc in CRITICAL_TABLES:
+        try:
+            count = conn.execute(f"SELECT COUNT(*) FROM {table}").fetchone()[0]
+            if count == 0:
+                warnings.append(f"  ⚠ {table} boş — {desc} yüklenmemiş.")
+        except Exception:
+            warnings.append(f"  ⚠ {table} tablosu bulunamadı.")
+    if warnings:
+        print("⚠ DB UYARI:")
+        for w in warnings:
+            print(w)
+        print()
+    return len(warnings) == 0
+def _diagnose_empty_result(sql, conn):
+    """0 satır dönen sorguda olası nedeni bul."""
+    muta_match = re.search(r"muta\s*=\s*'(\d+)'", sql)
+    if muta_match:
+        tid = muta_match.group(1)
+        if conn.execute(f"SELECT COUNT(*) FROM map_identity WHERE muta='{tid}'").fetchone()[0] == 0:
+            return f"💡 MUTA {tid} veritabanında yok."
+    entity_match = re.search(r"entity_id\s*(?:=|IN)\s*\(?'?(\d+)", sql)
+    if entity_match:
+        tid = entity_match.group(1)
+        if conn.execute(f"SELECT COUNT(*) FROM map_identity WHERE entity_id='{tid}'").fetchone()[0] == 0:
+            return f"💡 Entity {tid} veritabanında yok."
+    ilike_match = re.search(r"ILIKE\s+'([^']+)'", sql, re.IGNORECASE)
+    if ilike_match:
+        pattern = ilike_match.group(1)
+        if conn.execute(f"SELECT COUNT(*) FROM map_identity WHERE unvan ILIKE '{pattern}'").fetchone()[0] == 0:
+            return f"💡 '{pattern}' ile eşleşen firma yok."
+    return None
 # ── Loglama ──
 _LOG_FILE = None
@@ -125,16 +186,18 @@ def has_reference_trigger(question):
 SORGU_TRIGGERS = [
-    "yanına ekle", "yanına da ekle", "yanına", "buna ekle", "buna da ekle",
+    "yanına ekle", "yanına da ekle", "yanına", "yanlarına",
+    "buna ekle", "buna da ekle",
     "kolonu da ekle", "kolonunu da ekle", "bir de", "aynısına",
     "aynı sorguya", "aynı sorgu", "üstüne ekle", "ekle yanına",
     "göster yanında", "da göster", "da getir", "de göster", "de getir",
     "tablodan", "tablodaki", "tabloyu", "tabloya",
+    "altına ekle", "altına", "altına da ekle",
     "çıkart", "çıkar", "kaldır", "at şunu", "filtrele", "daralt",
     "sadece", "hariç", "hariç tut",
 ]
-MAX_SQL_CONTEXT_LENGTH = 500
+MAX_SQL_CONTEXT_LENGTH = 1000
 def has_query_trigger(question):
@@ -145,6 +208,38 @@ def has_query_trigger(question):
 # ── System prompt ──
+def _compact_schema(schema_text):
+    """Schema'yı kompakt tek-satır formata dönüştür."""
+    lines = []
+    for line in schema_text.strip().split("\n"):
+        line = line.strip()
+        if not line:
+            continue
+        # "  tablo (N satır): col1 (TYPE), col2 (TYPE)" → "tablo(col1 TYPE, col2 TYPE)"
+        if "satır" in line:
+            parts = line.split(":")
+            if len(parts) >= 2:
+                tbl = parts[0].split("(")[0].strip()
+                cols = parts[1].strip()
+                # "col (TYPE)" → "col TYPE" — sadece sorunlu tipleri koru
+                compact_cols = []
+                for c in cols.split(", "):
+                    c = c.strip()
+                    if "(" in c:
+                        name = c.split("(")[0].strip()
+                        typ = c.split("(")[1].rstrip(")")
+                        if typ in ("VARCHAR", "BIGINT", "DOUBLE"):
+                            compact_cols.append(f"{name} {typ}")
+                        else:
+                            compact_cols.append(name)
+                    else:
+                        compact_cols.append(c)
+                lines.append(f"{tbl}({', '.join(compact_cols)})")
+        else:
+            lines.append(line)
+    return "\n".join(lines)
 def build_system_prompt(schema_text, domain_text="", last_result_entities=None,
                         question=None, last_sql=None, column_hints=""):
     prompt = f"""Sen bir SQL asistanısın. Kullanıcının sorusuna uygun SQL yaz.
@@ -161,10 +256,10 @@ Kurallar:
 Veritabanı şeması:
 {schema_text}
 """
-    if domain_text:
-        prompt += f"\nDomain bilgisi:\n{domain_text}\n"
     if column_hints:
         prompt += f"\n{column_hints}\n"
+    if domain_text:
+        prompt += f"\nDomain:\n{domain_text}\n"
     if last_sql and question and has_query_trigger(question):
         if len(last_sql) <= MAX_SQL_CONTEXT_LENGTH:
             prompt += f"""\nÖNCEKİ SQL'İ MODİFİYE ET:
@@ -232,6 +327,49 @@ def _like_to_ilike(sql):
     )
+VARCHAR_SCORE_COLUMNS = ["ews_skor", "yis_skor", "eus_skor"]
+def _fix_varchar_sort(sql):
+    """ORDER BY'da VARCHAR skor kolonlarını TRY_CAST ile sar."""
+    for col in VARCHAR_SCORE_COLUMNS:
+        pattern = rf'(ORDER\s+BY\s+.*?)(\w+\.)?({re.escape(col)})\b(?!\s*AS\b)'
+        def _repl(m, _col=col):
+            prefix, alias, column = m.group(1), m.group(2) or "", m.group(3)
+            if "TRY_CAST" in prefix.split(",")[-1]:
+                return m.group(0)
+            return f"{prefix}TRY_CAST({alias}{column} AS DOUBLE)"
+        sql = re.sub(pattern, _repl, sql, flags=re.IGNORECASE)
+    if "TRY_CAST" in sql and "ORDER BY" in sql.upper() and "NULLS LAST" not in sql.upper():
+        sql = re.sub(r'(ORDER\s+BY\s+.+?)(\s*;?\s*$)', r'\1 NULLS LAST\2',
+                     sql, flags=re.IGNORECASE)
+    return sql
+def _fix_varchar_comparison(sql):
+    """WHERE'de VARCHAR skor kolonlarının sayısal karşılaştırmasını düzelt."""
+    for col in VARCHAR_SCORE_COLUMNS:
+        pattern = rf'(?<!TRY_CAST\()(\w+\.)?({re.escape(col)})\s*([><=!]+)\s*(\d+)'
+        def _repl(m, _col=col):
+            alias = m.group(1) or ""
+            return f"TRY_CAST({alias}{m.group(2)} AS DOUBLE) {m.group(3)} {m.group(4)}"
+        sql = re.sub(pattern, _repl, sql, flags=re.IGNORECASE)
+    return sql
+def _apply_guardrails(sql):
+    """Tüm SQL post-processing guardrail'leri uygula."""
+    sql = _like_to_ilike(sql)
+    sql = _fix_varchar_sort(sql)
+    sql = _fix_varchar_comparison(sql)
+    return sql
 def check_sql_safety(sql):
     """Sadece SELECT/WITH izinli. Tehlikeli keyword varsa hata döndür."""
     sql_upper = sql.strip().upper()
@@ -246,7 +384,7 @@ def check_sql_safety(sql):
 # ── LLM call ──
 def ask_model(system_prompt, question):
-    """Ollama'ya soru gönder, yanıt al."""
+    """Ollama'ya soru gönder, yanıt + timing döndür."""
     t0 = time.time()
     try:
         resp = requests.post(
@@ -266,9 +404,17 @@ def ask_model(system_prompt, question):
         content = data.get("message", {}).get("content", "")
         elapsed = time.time() - t0
         tokens = data.get("eval_count", 0)
-        return content, elapsed, tokens
+        # Ollama timing metrikleri (nanosecond → ms)
+        timing = {
+            "prompt_eval_ms": data.get("prompt_eval_duration", 0) / 1e6,
+            "generation_ms": data.get("eval_duration", 0) / 1e6,
+            "prompt_tokens": data.get("prompt_eval_count", 0),
+            "gen_tokens": tokens,
+            "prompt_chars": len(system_prompt),
+        }
+        return content, elapsed, tokens, timing
     except Exception as e:
-        return f"LLM HATA: {e}", time.time() - t0, 0
+        return f"LLM HATA: {e}", time.time() - t0, 0, {}
 def _short_error(msg):
@@ -331,21 +477,21 @@ def generate_sql(conn, question, last_result_entities=None, domain_text="",
     column_hints = format_column_hints(col_map) if col_map else ""
     system_prompt = build_system_prompt(schema_text, domain_text, last_result_entities,
                                        question=question, column_hints=column_hints)
-    response, _, _ = ask_model(system_prompt, question)
+    response, _, _, _ = ask_model(system_prompt, question)
     sql = extract_sql(response)
     if not sql or check_sql_safety(sql):
         return None
-    sql = _like_to_ilike(sql)
+    sql = _apply_guardrails(sql)
     # Genel SQL hata retry: EXPLAIN ile ön kontrol
     try:
         conn.execute(f"EXPLAIN {sql}")
     except Exception as e:
         retry_q = _build_retry_prompt(question, sql, e, col_map)
-        resp2, _, _ = ask_model(system_prompt, retry_q)
+        resp2, _, _, _ = ask_model(system_prompt, retry_q)
         sql2 = extract_sql(resp2)
         if sql2 and not check_sql_safety(sql2):
-            return _like_to_ilike(sql2)
+            return _apply_guardrails(sql2)
         return None
     return sql
@@ -408,11 +554,17 @@ def format_table(columns, rows, max_rows=50):
 # ── Main loop ──
 def _extract_entity_ids(columns, rows, max_entities=100):
-    """Sorgu sonucundan entity_id listesini çıkar. Yoksa None döner."""
+    """Sorgu sonucundan entity_id veya muta listesini çıkar. Yoksa None döner."""
     col_lower = [c.lower() for c in columns]
-    if "entity_id" not in col_lower:
+    # entity_id veya muta — ikisi de firma kimliği
+    id_col = None
+    for name in ("entity_id", "muta"):
+        if name in col_lower:
+            id_col = name
+            break
+    if id_col is None:
         return None
-    idx = col_lower.index("entity_id")
+    idx = col_lower.index(id_col)
     ids = list(dict.fromkeys(row[idx] for row in rows if row[idx] is not None))
     if len(ids) > max_entities:
         return None  # çok geniş, context'e ekleme
@@ -422,11 +574,17 @@ def _extract_entity_ids(columns, rows, max_entities=100):
 def run_query(conn, system_prompt, question, col_map=None):
     """Tek soru → SQL → çalıştır → sonuç. (entity_id listesi, sql) tuple döndürür."""
     # 1. Model'e sor
-    response, elapsed, tokens = ask_model(system_prompt, question)
+    response, elapsed, tokens, timing = ask_model(system_prompt, question)
     _log("model", question=question, elapsed=round(elapsed, 1), tokens=tokens,
-         response=response)
+         response=response, timing=timing)
     print(f"\n💭 MODEL ({elapsed:.1f}s, ~{tokens} tok)")
+    if timing.get("prompt_eval_ms"):
+        pe = timing["prompt_eval_ms"]
+        ge = timing["generation_ms"]
+        pt = timing["prompt_tokens"]
+        pc = timing["prompt_chars"]
+        print(f"⏱ Prompt: {pc} char ({pt} tok) → eval: {pe:.0f}ms | gen: {ge:.0f}ms")
     if not response.startswith("LLM HATA"):
         thought = re.sub(r'```sql.*?```', '', response, flags=re.DOTALL).strip()
         if thought:
@@ -449,7 +607,7 @@ def run_query(conn, system_prompt, question, col_map=None):
         return None, None
     # 3b. LIKE → ILIKE guardrail
-    sql = _like_to_ilike(sql)
+    sql = _apply_guardrails(sql)
     print(f"🔍 SQL: {sql}")
@@ -463,7 +621,11 @@ def run_query(conn, system_prompt, question, col_map=None):
         print(f"\n📊 SONUÇ ({len(rows)} satır, {query_ms:.0f}ms)")
         print(format_table(columns, rows))
-        if len(rows) > 50:
+        if len(rows) == 0:
+            hint = _diagnose_empty_result(sql, conn)
+            if hint:
+                print(f"  {hint}")
+        elif len(rows) > 50:
             print(f"  ⚠ {len(rows)} satır döndü, ilk 50 gösteriliyor. Soruyu daraltın.")
         # Entity context çıkar
@@ -478,10 +640,10 @@ def run_query(conn, system_prompt, question, col_map=None):
         # Genel SQL hata retry — tek retry, her hata tipinde
         print(f"🔄 Retry ({_short_error(e)})...")
         retry_q = _build_retry_prompt(question, sql, e, col_map)
-        resp2, _, _ = ask_model(system_prompt, retry_q)
+        resp2, _, _, _ = ask_model(system_prompt, retry_q)
         sql2 = extract_sql(resp2)
         if sql2 and not check_sql_safety(sql2):
-            sql2 = _like_to_ilike(sql2)
+            sql2 = _apply_guardrails(sql2)
             print(f"🔍 Retry SQL: {sql2}")
             try:
                 result = conn.execute(sql2)
@@ -587,6 +749,64 @@ def interactive_loop(conn, schema_text, domain_text, col_map):
         print()
+BENCH_QUESTIONS = [
+    "toplam kaç firma var",
+    "en büyük firma hangisi",
+    "kobi segmentinde kaç firma var",
+    "en riskli 5 firma",
+    "10000041 mutanın bilgileri",
+    "son dönemde riski artan firmalar",
+    "ews skoru 500den büyük firmalar",
+    "en büyük 3 kobi firması",
+    "2602 döneminde kaç firma var",
+    "kombine riski 10 milyonun üstünde olan firmalar",
+]
+def run_benchmark(conn, schema_text, domain_text, col_map):
+    """10 soru × 2 tur benchmark."""
+    column_hints = format_column_hints(col_map)
+    def run_one_pass(label):
+        results = []
+        for q in BENCH_QUESTIONS:
+            sp = build_system_prompt(schema_text, domain_text,
+                                     question=q, column_hints=column_hints)
+            _, elapsed, tokens, timing = ask_model(sp, q)
+            results.append({
+                "question": q, "elapsed": elapsed,
+                "tokens": tokens, **timing,
+            })
+        return results
+    print(f"contsql benchmark — {len(BENCH_QUESTIONS)} soru × 2 tur\n")
+    print(f"Prompt boyutu: {len(build_system_prompt(schema_text, domain_text, column_hints=column_hints))} char\n")
+    cold = run_one_pass("cold")
+    warm = run_one_pass("warm")
+    for label, results in [("TUR 1 (cold)", cold), ("TUR 2 (warm)", warm)]:
+        times = [r["elapsed"] for r in results]
+        pe = [r.get("prompt_eval_ms", 0) for r in results]
+        ge = [r.get("generation_ms", 0) for r in results]
+        times.sort()
+        print(f"{label}:")
+        print(f"  Ort: {sum(times)/len(times):.1f}s | "
+              f"P50: {times[len(times)//2]:.1f}s | "
+              f"P95: {times[int(len(times)*0.95)]:.1f}s")
+        print(f"  Prompt eval ort: {sum(pe)/len(pe):.0f}ms | "
+              f"Gen ort: {sum(ge)/len(ge):.0f}ms")
+        print()
+    # Soru bazlı detay
+    print(f"{'#':>2} | {'Soru':<45} | {'Cold':>5} | {'Warm':>5} | {'P.Eval':>7} | {'Gen':>5}")
+    print("-" * 85)
+    for i, (c, w) in enumerate(zip(cold, warm)):
+        print(f"{i+1:2d} | {c['question'][:45]:<45} | "
+              f"{c['elapsed']:5.1f} | {w['elapsed']:5.1f} | "
+              f"{w.get('prompt_eval_ms',0):7.0f} | {w.get('generation_ms',0):5.0f}")
 def main():
     parser = argparse.ArgumentParser(
         description="contsql — DuckDB SQL agent",
@@ -596,6 +816,7 @@ def main():
     parser.add_argument("question", nargs="?", help="Tek soru (opsiyonel)")
     global MODEL
     parser.add_argument("--model", default=MODEL, help="Ollama model adı (default: cont-local)")
+    parser.add_argument("--bench", action="store_true", help="Benchmark modu (10 soru × 2 tur)")
     args = parser.parse_args()
     MODEL = args.model
@@ -610,16 +831,31 @@ def main():
     # Loglama başlat
     log_path = _init_log(db_path)
-    # Schema + domain + column map
+    # Warmup — background thread
+    warmup_t = threading.Thread(target=_warmup_model, daemon=True)
+    t0_warmup = time.time()
+    warmup_t.start()
+    # Schema + domain + column map (warmup parallel çalışır)
     schema_text = read_schema(conn)
     domain_text = read_domain_notes(str(db_path))
     col_map = build_column_owner_map(conn)
+    # Warmup bitmesini bekle
+    warmup_t.join(timeout=60)
+    warmup_ms = (time.time() - t0_warmup) * 1000
+    # DB health check
+    db_ok = _db_health_check(conn)
     print(f"DB: {db_path} | Model: {MODEL} | Log: {log_path}")
-    _log("session_start", db=str(db_path), model=MODEL)
+    print(f"🔥 Warmup: {warmup_ms:.0f}ms")
+    _log("session_start", db=str(db_path), model=MODEL, warmup_ms=round(warmup_ms))
-    # Tek soru veya interaktif
-    if args.question:
+    # Benchmark, tek soru veya interaktif
+    if args.bench:
+        run_benchmark(conn, schema_text, domain_text, col_map)
+    elif args.question:
         column_hints = format_column_hints(col_map)
         system_prompt = build_system_prompt(schema_text, domain_text,
                                            question=args.question,

{contsql-0.3.1 → contsql-0.3.6}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "contsql"
-version = "0.3.1"
+version = "0.3.6"
 requires-python = ">=3.10"
 dependencies = ["duckdb", "requests"]