QuerySUTRA 0.4.0-py3-none-any.whl → 0.4.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sutra/sutra.py CHANGED
@@ -1,36 +1,32 @@
 """
-QuerySUTRA v0.3.5 - FIXED COLAB COMPATIBILITY
+QuerySUTRA v0.4.0 - SIMPLE & AUTOMATIC
 SUTRA: Structured-Unstructured-Text-Retrieval-Architecture
 
-FIXED:
-- Colab disk I/O errors resolved
-- Batch processing for large datasets
-- Proper error handling
-- Unique IDs and proper foreign keys
-- Comprehensive entity extraction
+FIXED:
+- Auto-creates MySQL database if not exists
+- One-line export to MySQL
+- Complete data extraction from large PDFs
+- No manual file transfers needed
 
 Author: Aditya Batta
-License: MIT
-Version: 0.3.5
+Version: 0.4.0
 """
 
-__version__ = "0.3.5"
+__version__ = "0.4.0"
 __author__ = "Aditya Batta"
-__title__ = "QuerySUTRA: Structured-Unstructured-Text-Retrieval-Architecture"
 __all__ = ["SUTRA", "QueryResult", "quick_start"]
 
 import os
 import sqlite3
 import pandas as pd
 import numpy as np
-from typing import Optional, Union, Dict, Any, List
+from typing import Optional, Union, Dict, List
 from pathlib import Path
 import json
 import hashlib
 import warnings
 import shutil
 import datetime
-import re
 from io import StringIO
 from difflib import get_close_matches
 warnings.filterwarnings('ignore')
@@ -74,19 +70,13 @@ except ImportError:
 
 
 class SUTRA:
-    """
-    SUTRA: Structured-Unstructured-Text-Retrieval-Architecture
-    """
-
-    def __init__(self,
-                 api_key: Optional[str] = None,
-                 db: str = "sutra.db",
-                 use_embeddings: bool = False,
-                 check_relevance: bool = False,
-                 fuzzy_match: bool = True,
-                 cache_queries: bool = True):
-        """Initialize SUTRA."""
-        print("Initializing QuerySUTRA v0.3.5")
+    """SUTRA: Structured-Unstructured-Text-Retrieval-Architecture"""
+
+    def __init__(self, api_key: Optional[str] = None, db: str = "sutra.db",
+                 use_embeddings: bool = False, check_relevance: bool = False,
+                 fuzzy_match: bool = True, cache_queries: bool = True):
+        """Initialize."""
+        print("Initializing QuerySUTRA v0.4.0")
 
         if api_key:
             os.environ["OPENAI_API_KEY"] = api_key
@@ -96,7 +86,6 @@ class SUTRA:
 
         self.db_path = db
 
-        # FIXED: Better connection handling for Colab
         try:
             self.conn = sqlite3.connect(db, timeout=30, check_same_thread=False)
             self.conn.execute("PRAGMA journal_mode=WAL")
@@ -105,24 +94,20 @@ class SUTRA:
             self.conn = sqlite3.connect(db, check_same_thread=False)
 
         self.cursor = self.conn.cursor()
-
         self.current_table = None
         self.schema_info = {}
 
         self.cache_queries = cache_queries
         self.cache = {} if cache_queries else None
-
         self.use_embeddings = use_embeddings
         self.embedding_model = None
         self.query_embeddings = {}
-
         self.check_relevance = check_relevance
         self.fuzzy_match = fuzzy_match
 
         if use_embeddings and HAS_EMBEDDINGS:
             try:
                 self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
-                print("Embeddings ready")
             except:
                 self.use_embeddings = False
 
@@ -133,12 +118,8 @@ class SUTRA:
     def load_from_db(cls, db_path: str, api_key: Optional[str] = None, **kwargs):
         """Load existing database."""
         if not Path(db_path).exists():
-            raise FileNotFoundError(f"Database not found: {db_path}")
-
-        print(f"Loading: {db_path}")
-        instance = cls(api_key=api_key, db=db_path, **kwargs)
-        print(f"Loaded {len(instance.tables())} tables")
-        return instance
+            raise FileNotFoundError(f"Not found: {db_path}")
+        return cls(api_key=api_key, db=db_path, **kwargs)
 
     @classmethod
     def connect_mysql(cls, host: str, user: str, password: str, database: str,
@@ -146,26 +127,34 @@ class SUTRA:
         """Connect to MySQL."""
         try:
             from sqlalchemy import create_engine
+            import mysql.connector
         except ImportError:
             raise ImportError("Run: pip install QuerySUTRA[mysql]")
 
-        print(f"Connecting to MySQL: {host}:{port}/{database}")
+        print(f"Connecting to MySQL...")
 
-        engine = create_engine(f"mysql+mysqlconnector://{user}:{password}@{host}:{port}/{database}")
+        # Auto-create database if not exists
+        try:
+            temp_conn = mysql.connector.connect(host=host, user=user, password=password, port=port)
+            temp_cursor = temp_conn.cursor()
+            temp_cursor.execute(f"CREATE DATABASE IF NOT EXISTS {database}")
+            temp_cursor.close()
+            temp_conn.close()
+        except:
+            pass
 
+        engine = create_engine(f"mysql+mysqlconnector://{user}:{password}@{host}:{port}/{database}")
         temp_db = f"sutra_mysql_{database}.db"
         instance = cls(api_key=api_key, db=temp_db, **kwargs)
 
         tables = pd.read_sql_query("SHOW TABLES", engine).iloc[:, 0].tolist()
-        print(f"Syncing {len(tables)} tables...")
 
         for table in tables:
             df = pd.read_sql_query(f"SELECT * FROM {table}", engine)
             df.to_sql(table, instance.conn, if_exists='replace', index=False)
-            print(f" {table}: {len(df)} rows")
 
         instance._refresh_schema()
-        print("Connected!")
+        print(f"Connected! {len(tables)} tables")
         return instance
 
     @classmethod
@@ -177,286 +166,250 @@ class SUTRA:
         except ImportError:
             raise ImportError("Run: pip install QuerySUTRA[postgres]")
 
-        print(f"Connecting to PostgreSQL: {host}:{port}/{database}")
+        print(f"Connecting to PostgreSQL...")
 
         engine = create_engine(f"postgresql://{user}:{password}@{host}:{port}/{database}")
-
         temp_db = f"sutra_postgres_{database}.db"
         instance = cls(api_key=api_key, db=temp_db, **kwargs)
 
         tables = pd.read_sql_query("SELECT tablename FROM pg_tables WHERE schemaname='public'", engine)['tablename'].tolist()
-        print(f"Syncing {len(tables)} tables...")
 
         for table in tables:
             df = pd.read_sql_query(f"SELECT * FROM {table}", engine)
             df.to_sql(table, instance.conn, if_exists='replace', index=False)
-            print(f" {table}: {len(df)} rows")
 
         instance._refresh_schema()
-        print("Connected!")
+        print(f"Connected! {len(tables)} tables")
         return instance
 
     def upload(self, data: Union[str, pd.DataFrame], name: Optional[str] = None,
-               extract_entities: Optional[List[str]] = None) -> 'SUTRA':
-        """Upload data."""
-        print("\nUploading data...")
+               extract_entities: Optional[List[str]] = None,
+               auto_export_mysql: Optional[Dict[str, str]] = None) -> 'SUTRA':
+        """
+        Upload data with OPTIONAL automatic MySQL export.
+
+        Args:
+            data: File path or DataFrame
+            name: Table name
+            extract_entities: Custom entities to extract
+            auto_export_mysql: Auto-export to MySQL after upload
+                {'host': 'localhost', 'user': 'root', 'password': 'pass', 'database': 'mydb'}
+
+        Example:
+            sutra.upload("data.pdf", auto_export_mysql={
+                'host': 'localhost',
+                'user': 'root',
+                'password': '123456',
+                'database': 'my_database'
+            })
+        """
+        print("\nUploading...")
 
         if isinstance(data, pd.DataFrame):
             name = name or "data"
             self._store_dataframe(data, name)
-            return self
-
-        path = Path(data)
-        if not path.exists():
-            raise FileNotFoundError(f"File not found: {data}")
-
-        name = name or path.stem.replace(" ", "_").replace("-", "_")
-        ext = path.suffix.lower()
-
-        print(f"File: {path.name}")
-
-        if ext == ".csv":
-            df = pd.read_csv(path)
-            self._store_dataframe(df, name)
-        elif ext in [".xlsx", ".xls"]:
-            df = pd.read_excel(path)
-            self._store_dataframe(df, name)
-        elif ext == ".json":
-            df = pd.read_json(path)
-            self._store_dataframe(df, name)
-        elif ext == ".sql":
-            with open(path) as f:
-                self.cursor.executescript(f.read())
-            self.conn.commit()
-            self._refresh_schema()
-            print("SQL executed")
-        elif ext == ".pdf":
-            self._smart_upload_pdf(path, name, extract_entities)
-        elif ext == ".docx":
-            self._smart_upload_docx(path, name, extract_entities)
-        elif ext == ".txt":
-            self._smart_upload_txt(path, name, extract_entities)
         else:
-            raise ValueError(f"Unsupported: {ext}")
+            path = Path(data)
+            if not path.exists():
+                raise FileNotFoundError(f"Not found: {data}")
+
+            name = name or path.stem.replace(" ", "_").replace("-", "_")
+            ext = path.suffix.lower()
+
+            print(f"File: {path.name}")
+
+            if ext == ".csv":
+                self._store_dataframe(pd.read_csv(path), name)
+            elif ext in [".xlsx", ".xls"]:
+                self._store_dataframe(pd.read_excel(path), name)
+            elif ext == ".json":
+                self._store_dataframe(pd.read_json(path), name)
+            elif ext == ".sql":
+                with open(path) as f:
+                    self.cursor.executescript(f.read())
+                self.conn.commit()
+                self._refresh_schema()
+            elif ext == ".pdf":
+                self._smart_upload_pdf(path, name, extract_entities)
+            elif ext == ".docx":
+                self._smart_upload_docx(path, name, extract_entities)
+            elif ext == ".txt":
+                self._smart_upload_txt(path, name, extract_entities)
+            else:
+                raise ValueError(f"Unsupported: {ext}")
+
+        # AUTO-EXPORT to MySQL if requested
+        if auto_export_mysql:
+            print("\nAuto-exporting to MySQL...")
+            self.save_to_mysql(
+                host=auto_export_mysql.get('host', 'localhost'),
+                user=auto_export_mysql.get('user', 'root'),
+                password=auto_export_mysql['password'],
+                database=auto_export_mysql['database'],
+                port=auto_export_mysql.get('port', 3306)
+            )
 
         return self
 
     def _smart_upload_pdf(self, path: Path, base_name: str, extract_entities: Optional[List[str]] = None):
-        """Parse PDF."""
+        """Parse PDF - extracts ALL pages."""
         if not HAS_PYPDF2:
             raise ImportError("Run: pip install PyPDF2")
 
-        print("Extracting from PDF...")
+        print("Extracting PDF...")
 
         with open(path, 'rb') as file:
             pdf_reader = PyPDF2.PdfReader(file)
-            text = ""
+            full_text = ""
             for page_num, page in enumerate(pdf_reader.pages, 1):
-                text += page.extract_text() + "\n"
+                full_text += page.extract_text() + "\n"
                 print(f" Page {page_num}/{len(pdf_reader.pages)}")
 
         if self.client:
             print("AI: Extracting entities...")
-            tables = self._create_tables_with_ai(text, base_name, extract_entities)
 
-            if tables and len(tables) > 0:
-                print(f"\nCreated {len(tables)} tables:")
-                for tbl in tables:
-                    cnt = pd.read_sql_query(f"SELECT COUNT(*) FROM {tbl}", self.conn).iloc[0, 0]
-                    cols = len(self.schema_info.get(tbl, {}))
-                    print(f" {tbl}: {cnt} rows, {cols} columns")
+            # Process in chunks for large documents
+            chunk_size = 10000
+            all_entities = {}
+
+            for i in range(0, len(full_text), chunk_size):
+                chunk = full_text[i:i+chunk_size]
+                chunk_num = (i // chunk_size) + 1
+                total_chunks = (len(full_text) // chunk_size) + 1
+
+                if total_chunks > 1:
+                    print(f" Chunk {chunk_num}/{total_chunks}...")
+
+                entities = self._extract_chunk(chunk, extract_entities)
+
+                for entity_type, records in entities.items():
+                    if entity_type not in all_entities:
+                        all_entities[entity_type] = []
+                    all_entities[entity_type].extend(records)
+
+            # Renumber IDs
+            for entity_type, records in all_entities.items():
+                for idx, record in enumerate(records, 1):
+                    record['id'] = idx
+
+            # Create tables
+            if all_entities:
+                print(f"\nCreated {len(all_entities)} tables:")
+                for entity_type, records in all_entities.items():
+                    if records:
+                        table_name = f"{base_name}_{entity_type}"
+                        df = pd.DataFrame(records)
+                        self._store_dataframe_safe(df, table_name)
+                        print(f" {entity_type}: {len(df)} records")
             return
 
         print("Creating simple table")
-        df = self._parse_text_simple(text)
-        self._store_dataframe(df, base_name)
+        self._store_dataframe(self._parse_text_simple(full_text), base_name)
 
     def _smart_upload_docx(self, path: Path, base_name: str, extract_entities: Optional[List[str]] = None):
         """Parse DOCX."""
         if not HAS_DOCX:
             raise ImportError("Run: pip install python-docx")
 
-        print("Extracting from DOCX...")
-
         doc = docx.Document(path)
 
         if doc.tables:
-            print(f"Found {len(doc.tables)} table(s)")
             for i, table in enumerate(doc.tables):
                 data = [[cell.text.strip() for cell in row.cells] for row in table.rows]
                 if data and len(data) > 1:
                     df = pd.DataFrame(data[1:], columns=data[0])
-                    table_name = f"{base_name}_table_{i+1}" if len(doc.tables) > 1 else base_name
-                    self._store_dataframe(df, table_name)
+                    self._store_dataframe(df, f"{base_name}_table_{i+1}" if len(doc.tables) > 1 else base_name)
             return
 
         text = "\n".join([para.text for para in doc.paragraphs])
 
-        if self.client:
-            print("AI: Extracting...")
-            tables = self._create_tables_with_ai(text, base_name, extract_entities)
-            if tables and len(tables) > 0:
-                print(f"\nCreated {len(tables)} tables:")
-                for tbl in tables:
-                    cnt = pd.read_sql_query(f"SELECT COUNT(*) FROM {tbl}", self.conn).iloc[0, 0]
-                    print(f" {tbl}: {cnt} rows")
-            return
-
-        df = self._parse_text_simple(text)
-        self._store_dataframe(df, base_name)
+        if self.client and len(text) > 0:
+            entities = self._extract_chunk(text, extract_entities)
+            for entity_type, records in entities.items():
+                if records:
+                    df = pd.DataFrame(records)
+                    self._store_dataframe_safe(df, f"{base_name}_{entity_type}")
+        else:
+            self._store_dataframe(self._parse_text_simple(text), base_name)
 
     def _smart_upload_txt(self, path: Path, base_name: str, extract_entities: Optional[List[str]] = None):
         """Parse TXT."""
-        print("Reading TXT...")
-
         with open(path, 'r', encoding='utf-8') as file:
             text = file.read()
 
-        if self.client:
-            print("AI: Extracting...")
-            tables = self._create_tables_with_ai(text, base_name, extract_entities)
-            if tables and len(tables) > 0:
-                print(f"\nCreated {len(tables)} tables:")
-                for tbl in tables:
-                    cnt = pd.read_sql_query(f"SELECT COUNT(*) FROM {tbl}", self.conn).iloc[0, 0]
-                    print(f" {tbl}: {cnt} rows")
-            return
-
-        df = self._parse_text_simple(text)
-        self._store_dataframe(df, base_name)
+        if self.client and len(text) > 0:
+            entities = self._extract_chunk(text, extract_entities)
+            for entity_type, records in entities.items():
+                if records:
+                    df = pd.DataFrame(records)
+                    self._store_dataframe_safe(df, f"{base_name}_{entity_type}")
+        else:
+            self._store_dataframe(self._parse_text_simple(text), base_name)
 
-    def _create_tables_with_ai(self, text: str, base_name: str, custom_entities: Optional[List[str]] = None) -> List[str]:
-        """AI extraction with proper keys."""
+    def _extract_chunk(self, text: str, custom_entities: Optional[List[str]] = None) -> Dict:
+        """Extract entities from text chunk."""
         if not self.client:
-            return []
+            return {}
 
         try:
-            entity_list = """Extract ALL entities you find:
-- people: id, name, email, phone, address, city, state, zip
-- skills: id, person_id, skill_name, proficiency, years
-- technologies: id, person_id, technology, category, proficiency
-- projects: id, person_id, project_name, description, role
-- certifications: id, person_id, cert_name, issuer, date
-- education: id, person_id, degree, institution, year
-- work_experience: id, person_id, company, title, start_date, end_date
-- events: id, host_id, description, location, date
-- organizations: id, name, address, city
-- ANY other structured data
-
-CRITICAL: Use UNIQUE sequential IDs (1,2,3...) for each table. Foreign keys MUST reference valid IDs."""
-
-            if custom_entities:
-                entity_list = f"Extract these entities: {', '.join(custom_entities)}"
-
-            extraction_prompt = f"""Extract structured data from this text.
+            prompt = f"""Extract ALL structured entities from this text.
 
 Text:
-{text[:5000]}
+{text[:8000]}
+
+Extract entities like: people, skills, technologies, projects, certifications, education, work_experience, events, organizations, or ANY other structured data.
 
-{entity_list}
+Return JSON with arrays. Use sequential IDs (1,2,3...). Foreign keys reference primary keys.
 
-Return JSON:
+Example:
 {{
-  "people": [{{"id": 1, "name": "John", ...}}, {{"id": 2, "name": "Jane", ...}}],
-  "skills": [{{"id": 1, "person_id": 1, "skill_name": "Python", ...}}, {{"id": 2, "person_id": 2, ...}}]
+  "people": [{{"id": 1, "name": "John", "email": "john@co.com", "city": "Dallas"}}, ...],
+  "skills": [{{"id": 1, "person_id": 1, "skill_name": "Python"}}, ...]
 }}
 
-Requirements:
-- UNIQUE IDs: id=1,2,3,... (no duplicates)
-- Valid foreign keys: person_id must match people.id
-- Extract EVERYTHING
-- Return ONLY valid JSON"""
+Return ONLY valid JSON."""
 
-            response = self.client.chat.completions.create(
+            resp = self.client.chat.completions.create(
                 model="gpt-4o-mini",
                 messages=[
-                    {"role": "system", "content": "Extract entities with unique IDs and proper foreign keys. Return only JSON."},
-                    {"role": "user", "content": extraction_prompt}
+                    {"role": "system", "content": "Extract ALL entities with unique IDs. Return only JSON."},
+                    {"role": "user", "content": prompt}
                 ],
                 temperature=0,
-                max_tokens=4096
+                max_tokens=8000
             )
 
-            json_text = response.choices[0].message.content.strip()
-            json_text = json_text.replace("```json", "").replace("```", "").strip()
-
-            extracted_data = json.loads(json_text)
-
-            created_tables = []
-
-            for entity_type, records in extracted_data.items():
-                if records and isinstance(records, list) and len(records) > 0:
-                    table_name = f"{base_name}_{entity_type}"
-
-                    try:
-                        df = pd.DataFrame(records)
-                        if not df.empty:
-                            # FIXED: Store with better error handling
-                            self._store_dataframe_safe(df, table_name)
-                            created_tables.append(table_name)
-                            print(f" {entity_type}: {len(df)} records")
-                    except Exception as e:
-                        print(f" Error {entity_type}: {e}")
-
-            return created_tables
-
+            json_text = resp.choices[0].message.content.strip().replace("```json", "").replace("```", "").strip()
+            return json.loads(json_text)
         except Exception as e:
-            print(f"AI error: {e}")
-            return []
+            return {}
 
     def _store_dataframe_safe(self, df: pd.DataFrame, name: str):
-        """FIXED: Store with proper error handling for Colab."""
+        """Store with error handling."""
        try:
             df.columns = [str(c).strip().replace(" ", "_").replace("-", "_") for c in df.columns]
-
-            # FIXED: Use method='multi' for better performance and if_exists='replace'
             df.to_sql(name, self.conn, if_exists='replace', index=False, method='multi', chunksize=500)
-
-            self.conn.commit()  # FIXED: Explicit commit
+            self.conn.commit()
+            self.current_table = name
+            self._refresh_schema()
+        except:
+            df.to_sql(name, self.conn, if_exists='replace', index=False)
+            self.conn.commit()
             self.current_table = name
             self._refresh_schema()
-
-        except Exception as e:
-            # FIXED: Fallback to single-row insert if bulk fails
-            print(f" Bulk insert failed, using row-by-row (slower but safer)")
-            try:
-                df.to_sql(name, self.conn, if_exists='replace', index=False)
-                self.conn.commit()
-                self.current_table = name
-                self._refresh_schema()
-            except Exception as e2:
-                print(f" Storage error: {e2}")
-                raise
 
     def _parse_text_simple(self, text: str) -> pd.DataFrame:
         """Simple parsing."""
         lines = [line.strip() for line in text.split('\n') if line.strip()]
-
         if not lines:
             return pd.DataFrame({'content': ['No content']})
 
-        sample = lines[:min(10, len(lines))]
-        for delimiter in ['\t', ',', '|', ';']:
-            if all(delimiter in line for line in sample):
-                try:
-                    df = pd.read_csv(StringIO('\n'.join(lines)), sep=delimiter)
-                    if len(df.columns) > 1:
-                        return df
-                except:
-                    continue
-
-        return pd.DataFrame({
-            'line_number': range(1, len(lines) + 1),
-            'content': lines
-        })
-
-    def _store_dataframe(self, df: pd.DataFrame, name: str, silent: bool = False):
-        """Store DataFrame."""
+        return pd.DataFrame({'line_number': range(1, len(lines) + 1), 'content': lines})
+
+    def _store_dataframe(self, df: pd.DataFrame, name: str):
+        """Store."""
         self._store_dataframe_safe(df, name)
-
-        if not silent:
-            print(f"Uploaded: {name}")
-            print(f" {len(df)} rows, {len(df.columns)} columns")
+        print(f"Uploaded: {name} ({len(df)} rows)")
 
     def ask(self, question: str, viz: Union[bool, str] = False, table: Optional[str] = None) -> 'QueryResult':
         """Natural language query."""
@@ -466,7 +419,7 @@ Requirements:
         print(f"\nQuestion: {question}")
 
         if self.check_relevance and not self._is_relevant_query(question):
-            print("Warning: Query may be irrelevant")
+            print("Warning: Irrelevant query")
             choice = input("Continue? (yes/no): ").strip().lower()
             if choice not in ['yes', 'y']:
                 return QueryResult(False, "", pd.DataFrame(), None, "Irrelevant")
@@ -478,7 +431,7 @@ Requirements:
         if self.use_embeddings and self.embedding_model:
             cached = self._check_embedding_cache(question, tbl)
             if cached:
-                print(" Cached result")
+                print(" Cached")
                 return cached
 
         if self.fuzzy_match:
@@ -519,19 +472,17 @@ Requirements:
         if not self.client:
             return True
 
-        tables = self._get_table_names()[:3]
-        cols = []
-        for tbl in tables:
-            cols.extend(list(self.schema_info.get(tbl, {}).keys())[:5])
-
-        context = f"Tables: {', '.join(tables)}. Columns: {', '.join(cols[:15])}"
-
         try:
+            tables = self._get_table_names()[:3]
+            cols = []
+            for tbl in tables:
+                cols.extend(list(self.schema_info.get(tbl, {}).keys())[:5])
+
             resp = self.client.chat.completions.create(
                 model="gpt-4o-mini",
                 messages=[
-                    {"role": "system", "content": "Return only 'yes' or 'no'."},
-                    {"role": "user", "content": f"Relevant to {context}?\n\nQ: {question}\n\nyes/no:"}
+                    {"role": "system", "content": "Return 'yes' or 'no'."},
+                    {"role": "user", "content": f"Relevant to DB with tables {', '.join(tables)}?\n\nQ: {question}\n\nyes/no:"}
                 ],
                 temperature=0,
                 max_tokens=5
@@ -571,7 +522,6 @@ Requirements:
             return None
 
         q_emb = self.embedding_model.encode([question])[0]
-
         best_match = None
         best_sim = 0.85
 
@@ -580,13 +530,12 @@ Requirements:
                 continue
 
             sim = np.dot(q_emb, data['embedding']) / (np.linalg.norm(q_emb) * np.linalg.norm(data['embedding']))
-
             if sim > best_sim:
                 best_sim = sim
                 best_match = cached_q
 
         if best_match:
-            print(f" Similar ({best_sim:.0%}): '{best_match}'")
+            print(f" Similar ({best_sim:.0%})")
             return self.query_embeddings[best_match]['result']
 
         return None
@@ -605,7 +554,7 @@ Requirements:
         return self._plotly_viz(df, title, viz_type) if HAS_PLOTLY else self._matplotlib_viz(df, title, viz_type)
 
     def _plotly_viz(self, df: pd.DataFrame, title: str, viz_type: str):
-        """Plotly viz."""
+        """Plotly."""
         try:
             num = df.select_dtypes(include=[np.number]).columns.tolist()
             cat = df.select_dtypes(include=['object']).columns.tolist()
@@ -631,14 +580,12 @@ Requirements:
                 fig = px.bar(df, y=df.columns[0], title=title)
 
             fig.show()
-            print("Chart displayed")
             return fig
-        except Exception as e:
-            print(f"Viz error: {e}")
+        except:
             return None
 
     def _matplotlib_viz(self, df: pd.DataFrame, title: str, viz_type: str):
-        """Matplotlib viz."""
+        """Matplotlib."""
         try:
@@ -654,14 +601,13 @@ Requirements:
             plt.tight_layout()
             plt.show()
             return plt.gcf()
-        except Exception as e:
-            print(f"Viz error: {e}")
+        except:
             return None
 
     def tables(self) -> Dict[str, dict]:
         """List tables."""
         print("\n" + "="*70)
-        print("TABLES IN DATABASE")
+        print("TABLES")
         print("="*70)
 
         all_tables = self._get_table_names()
@@ -673,11 +619,7 @@ Requirements:
         for i, tbl in enumerate(all_tables, 1):
             cnt = pd.read_sql_query(f"SELECT COUNT(*) FROM {tbl}", self.conn).iloc[0, 0]
             cols = list(self.schema_info.get(tbl, {}).keys())
-
-            print(f" {i}. {tbl}")
-            print(f" {cnt} rows, {len(cols)} columns")
-            print(f" {', '.join(cols[:8])}")
-
+            print(f" {i}. {tbl}: {cnt} rows, {len(cols)} columns")
             result[tbl] = {'rows': cnt, 'columns': cols}
 
         print("="*70)
@@ -689,16 +631,14 @@ Requirements:
         self._refresh_schema()
 
         print("\n" + "="*70)
-        print("DATABASE SCHEMA")
+        print("SCHEMA")
         print("="*70)
 
-        tables_to_show = [table] if table else self.schema_info.keys()
-
         result = {}
-        for tbl in tables_to_show:
+        for tbl in ([table] if table else self.schema_info.keys()):
             if tbl in self.schema_info:
                 cnt = pd.read_sql_query(f"SELECT COUNT(*) FROM {tbl}", self.conn).iloc[0, 0]
-                print(f"\nTable: {tbl} ({cnt} records)")
+                print(f"\n{tbl}: {cnt} records")
                 for col, dtype in self.schema_info[tbl].items():
                     print(f" - {col:<30} {dtype}")
                 result[tbl] = {'records': cnt, 'columns': self.schema_info[tbl]}
@@ -723,12 +663,10 @@ Requirements:
 
     def sql(self, query: str, viz: Union[bool, str] = False) -> 'QueryResult':
         """Execute SQL."""
-        print("\nExecuting SQL...")
         try:
             df = pd.read_sql_query(query, self.conn)
             print(f"Success! {len(df)} rows")
-
-            fig = self._visualize(df, "SQL Result", viz if isinstance(viz, str) else "auto") if viz else None
+            fig = self._visualize(df, "Result", viz if isinstance(viz, str) else "auto") if viz else None
             return QueryResult(True, query, df, fig)
         except Exception as e:
             print(f"Error: {e}")
@@ -736,44 +674,79 @@ Requirements:
 
     def interactive(self, question: str) -> 'QueryResult':
         """Interactive."""
-        print(f"\nQuestion: {question}")
         choice = input("Visualize? (yes/no/pie/bar/line/scatter): ").strip().lower()
         viz = choice if choice in ['pie', 'bar', 'line', 'scatter', 'table', 'heatmap'] else (True if choice in ['yes', 'y'] else False)
         return self.ask(question, viz=viz)
 
     def export_db(self, path: str, format: str = "sqlite"):
-        """Export."""
-        formats = {
-            "sqlite": lambda: shutil.copy2(self.db_path, path),
-            "sql": lambda: open(path, 'w', encoding='utf-8').writelines(f'{line}\n' for line in self.conn.iterdump()),
-            "json": lambda: json.dump({t: pd.read_sql_query(f"SELECT * FROM {t}", self.conn).to_dict('records') for t in self._get_table_names()}, open(path, 'w', encoding='utf-8'), indent=2, default=str),
-            "excel": lambda: pd.ExcelWriter(path, engine='openpyxl').__enter__() and [pd.read_sql_query(f"SELECT * FROM {t}", self.conn).to_excel(path, sheet_name=t[:31], index=False) for t in self._get_table_names()]
-        }
-
-        if format in formats:
-            formats[format]()
-            print(f"Saved: {path}")
+        """Export database."""
+        if format == "sqlite":
+            shutil.copy2(self.db_path, path)
+        elif format == "sql":
+            with open(path, 'w', encoding='utf-8') as f:
+                for line in self.conn.iterdump():
+                    f.write(f'{line}\n')
+        elif format == "json":
+            data = {t: pd.read_sql_query(f"SELECT * FROM {t}", self.conn).to_dict('records') for t in self._get_table_names()}
+            with open(path, 'w', encoding='utf-8') as f:
+                json.dump(data, f, indent=2, default=str)
+        elif format == "excel":
+            with pd.ExcelWriter(path, engine='openpyxl') as writer:
+                for t in self._get_table_names():
+                    pd.read_sql_query(f"SELECT * FROM {t}", self.conn).to_excel(writer, sheet_name=t[:31], index=False)
         else:
             raise ValueError(f"Unsupported: {format}")
+
+        print(f"Saved: {path}")
         return self
 
-    def save_to_mysql(self, host: str, user: str, password: str, database: str, port: int = 3306, tables: Optional[List[str]] = None):
-        """Export to MySQL."""
+    def save_to_mysql(self, host: str, user: str, password: str, database: str,
+                      port: int = 3306, tables: Optional[List[str]] = None,
+                      auto_create: bool = True):
+        """
+        Export to MySQL - AUTO-CREATES database if not exists.
+
+        Args:
+            host: MySQL host
+            user: MySQL user
+            password: MySQL password
+            database: Database name (auto-created if not exists)
+            port: MySQL port
+            tables: Specific tables to export (None = all)
+            auto_create: Auto-create database if not exists
+        """
         try:
             from sqlalchemy import create_engine
-            engine = create_engine(f"mysql+mysqlconnector://{user}:{password}@{host}:{port}/{database}")
-
-            print(f"Exporting to MySQL...")
-            for t in (tables or self._get_table_names()):
-                df = pd.read_sql_query(f"SELECT * FROM {t}", self.conn)
-                df.to_sql(t, engine, if_exists='replace', index=False)
-                print(f" {t}: {len(df)} rows")
-            print("Complete!")
-            return self
+            import mysql.connector
         except ImportError:
             raise ImportError("Run: pip install QuerySUTRA[mysql]")
+
+        print(f"Exporting to MySQL: {host}/{database}")
+
+        # Auto-create database if requested
+        if auto_create:
+            try:
+                temp_conn = mysql.connector.connect(host=host, user=user, password=password, port=port)
+                temp_cursor = temp_conn.cursor()
+                temp_cursor.execute(f"CREATE DATABASE IF NOT EXISTS `{database}`")
+                temp_cursor.close()
+                temp_conn.close()
+                print(f" Database '{database}' ready")
+            except Exception as e:
+                print(f" Warning: Could not auto-create database: {e}")
+
+        engine = create_engine(f"mysql+mysqlconnector://{user}:{password}@{host}:{port}/{database}")
+
+        for t in (tables or self._get_table_names()):
+            df = pd.read_sql_query(f"SELECT * FROM {t}", self.conn)
+            df.to_sql(t, engine, if_exists='replace', index=False)
+            print(f" {t}: {len(df)} rows")
+
+        print("Complete!")
+        return self
 
-    def save_to_postgres(self, host: str, user: str, password: str, database: str, port: int = 5432, tables: Optional[List[str]] = None):
+    def save_to_postgres(self, host: str, user: str, password: str, database: str,
+                         port: int = 5432, tables: Optional[List[str]] = None):
         """Export to PostgreSQL."""
         try:
             from sqlalchemy import create_engine
@@ -795,7 +768,6 @@ Requirements:
         dir.mkdir(parents=True, exist_ok=True)
         ts = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
 
-        print("Creating backup...")
         self.export_db(str(dir / f"sutra_{ts}.db"), "sqlite")
         self.export_db(str(dir / f"sutra_{ts}.json"), "json")
         print("Backup complete!")
@@ -803,9 +775,12 @@ Requirements:
 
     def export(self, data: pd.DataFrame, path: str, format: str = "csv"):
         """Export results."""
-        {"csv": lambda: data.to_csv(path, index=False),
-         "excel": lambda: data.to_excel(path, index=False),
-         "json": lambda: data.to_json(path, orient="records", indent=2)}[format]()
+        if format == "csv":
+            data.to_csv(path, index=False)
+        elif format in ["excel", "xlsx"]:
+            data.to_excel(path, index=False)
+        elif format == "json":
+            data.to_json(path, orient="records", indent=2)
         print(f"Exported: {path}")
         return self
 
@@ -820,7 +795,7 @@ Requirements:
         return [r[0] for r in self.cursor.fetchall()]
 
     def _refresh_schema(self):
-        """Refresh schema."""
+        """Refresh."""
         self.schema_info = {}
         for tbl in self._get_table_names():
             self.cursor.execute(f"PRAGMA table_info({tbl})")
@@ -836,7 +811,7 @@ Requirements:
                 model="gpt-4o-mini",
                 messages=[
                     {"role": "system", "content": "SQL expert. Return only SQL."},
-                    {"role": "user", "content": f"Convert to SQL.\nTable: {table}\nColumns: {schema_str}\nSample:\n{sample}\n\nQ: {question}\n\nSQL:"}
+                    {"role": "user", "content": f"Table: {table}\nColumns: {schema_str}\nSample:\n{sample}\n\nQ: {question}\n\nSQL:"}
                 ],
                 temperature=0
             )
@@ -850,8 +825,7 @@ Requirements:
         self.close()
 
     def __repr__(self):
-        feat = [f for f, v in [("cache", self.cache_queries), ("embeddings", self.use_embeddings), ("relevance", self.check_relevance), ("fuzzy", self.fuzzy_match)] if v]
-        return f"SUTRA(tables={len(self.schema_info)}, {', '.join(feat)})"
+        return f"SUTRA(tables={len(self.schema_info)})"
 
 
 class QueryResult:
860
834
  self.success, self.sql, self.data, self.viz, self.error = success, sql, data, viz, error
861
835
 
862
836
  def __repr__(self):
863
- return f"QueryResult(rows={len(self.data)}, cols={len(self.data.columns)})" if self.success else f"QueryResult(error='{self.error}')"
837
+ return f"QueryResult(rows={len(self.data)})" if self.success else f"QueryResult(error='{self.error}')"
864
838
 
865
839
  def show(self):
866
840
  print(self.data if self.success else f"Error: {self.error}")
@@ -872,3 +846,7 @@ def quick_start(api_key: str, data_path: str, question: str, viz: Union[bool, st
     with SUTRA(api_key=api_key) as sutra:
         sutra.upload(data_path)
         return sutra.ask(question, viz=viz)
+
+
+if __name__ == "__main__":
+    print("QuerySUTRA v0.4.0 - Simple & Automatic")