QuerySUTRA 0.3.3-py3-none-any.whl → 0.4.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- querysutra-0.4.0.dist-info/METADATA +438 -0
- {querysutra-0.3.3.dist-info → querysutra-0.4.0.dist-info}/RECORD +7 -7
- sutra/__init__.py +2 -4
- sutra/sutra.py +251 -457
- querysutra-0.3.3.dist-info/METADATA +0 -285
- {querysutra-0.3.3.dist-info → querysutra-0.4.0.dist-info}/WHEEL +0 -0
- {querysutra-0.3.3.dist-info → querysutra-0.4.0.dist-info}/licenses/LICENSE +0 -0
- {querysutra-0.3.3.dist-info → querysutra-0.4.0.dist-info}/top_level.txt +0 -0
sutra/sutra.py
CHANGED
@@ -1,19 +1,20 @@
 """
-QuerySUTRA v0.3.3
+QuerySUTRA v0.3.5 - FIXED COLAB COMPATIBILITY
 SUTRA: Structured-Unstructured-Text-Retrieval-Architecture
 
-FIXED:
-- …
-- …
-- …
-- …
+FIXED:
+- Colab disk I/O errors resolved
+- Batch processing for large datasets
+- Proper error handling
+- Unique IDs and proper foreign keys
+- Comprehensive entity extraction
 
 Author: Aditya Batta
 License: MIT
-Version: 0.3.3
+Version: 0.3.5
 """
 
-__version__ = "0.3.3"
+__version__ = "0.3.5"
 __author__ = "Aditya Batta"
 __title__ = "QuerySUTRA: Structured-Unstructured-Text-Retrieval-Architecture"
 __all__ = ["SUTRA", "QueryResult", "quick_start"]
@@ -75,8 +76,6 @@ except ImportError:
 class SUTRA:
     """
     SUTRA: Structured-Unstructured-Text-Retrieval-Architecture
-
-    Professional data analysis with proper relational database structure
     """
 
     def __init__(self,
@@ -86,9 +85,8 @@
                  check_relevance: bool = False,
                  fuzzy_match: bool = True,
                  cache_queries: bool = True):
-        """Initialize SUTRA…
-        print("Initializing QuerySUTRA v0.3.3")
-        print("SUTRA: Structured-Unstructured-Text-Retrieval-Architecture")
+        """Initialize SUTRA."""
+        print("Initializing QuerySUTRA v0.3.5")
 
         if api_key:
             os.environ["OPENAI_API_KEY"] = api_key
@@ -97,7 +95,15 @@
         self.client = OpenAI(api_key=self.api_key) if self.api_key and HAS_OPENAI else None
 
         self.db_path = db
-…
+
+        # FIXED: Better connection handling for Colab
+        try:
+            self.conn = sqlite3.connect(db, timeout=30, check_same_thread=False)
+            self.conn.execute("PRAGMA journal_mode=WAL")
+            self.conn.execute("PRAGMA synchronous=NORMAL")
+        except:
+            self.conn = sqlite3.connect(db, check_same_thread=False)
+
         self.cursor = self.conn.cursor()
 
         self.current_table = None
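Why the new connection block helps: SQLite's write-ahead log needs shared-memory support from the filesystem, which network-mounted storage (for example a Google Drive mount in Colab) may refuse, so the pragma is wrapped in a try and the bare except reconnects with the default rollback journal. A minimal sketch of the same pattern; the database path is illustrative:

    import sqlite3

    def connect(db_path: str = "demo.db") -> sqlite3.Connection:
        try:
            # WAL lets readers proceed while a write is in flight;
            # synchronous=NORMAL trades a little durability for fewer fsyncs.
            conn = sqlite3.connect(db_path, timeout=30, check_same_thread=False)
            conn.execute("PRAGMA journal_mode=WAL")
            conn.execute("PRAGMA synchronous=NORMAL")
        except sqlite3.Error:
            # Fall back to the default journal where WAL is unsupported.
            conn = sqlite3.connect(db_path, check_same_thread=False)
        return conn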
@@ -115,37 +121,29 @@
 
         if use_embeddings and HAS_EMBEDDINGS:
             try:
-                print("Loading embeddings model...")
                 self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
                 print("Embeddings ready")
             except:
-                print("Embeddings unavailable")
                 self.use_embeddings = False
 
         self._refresh_schema()
-
         print(f"Ready! Database: {db}")
-        if not self.api_key:
-            print("No API key - use .sql() for direct queries")
 
     @classmethod
     def load_from_db(cls, db_path: str, api_key: Optional[str] = None, **kwargs):
-        """Load existing…
+        """Load existing database."""
         if not Path(db_path).exists():
             raise FileNotFoundError(f"Database not found: {db_path}")
 
-        print(f"Loading…
+        print(f"Loading: {db_path}")
         instance = cls(api_key=api_key, db=db_path, **kwargs)
-
-        tables = instance.tables()
-        print(f"Loaded {len(tables)} tables")
-
+        print(f"Loaded {len(instance.tables())} tables")
         return instance
 
     @classmethod
     def connect_mysql(cls, host: str, user: str, password: str, database: str,
                       port: int = 3306, api_key: Optional[str] = None, **kwargs):
-        """Connect to MySQL…
+        """Connect to MySQL."""
         try:
             from sqlalchemy import create_engine
         except ImportError:
@@ -153,16 +151,13 @@
 
         print(f"Connecting to MySQL: {host}:{port}/{database}")
 
-…
+        engine = create_engine(f"mysql+mysqlconnector://{user}:{password}@{host}:{port}/{database}")
 
         temp_db = f"sutra_mysql_{database}.db"
         instance = cls(api_key=api_key, db=temp_db, **kwargs)
 
-        engine = create_engine(connection_string)
-
         tables = pd.read_sql_query("SHOW TABLES", engine).iloc[:, 0].tolist()
-
-        print(f"Found {len(tables)} tables, syncing...")
+        print(f"Syncing {len(tables)} tables...")
 
         for table in tables:
             df = pd.read_sql_query(f"SELECT * FROM {table}", engine)
@@ -170,14 +165,13 @@
             print(f"  {table}: {len(df)} rows")
 
         instance._refresh_schema()
-        print(…
-
+        print("Connected!")
         return instance
 
     @classmethod
     def connect_postgres(cls, host: str, user: str, password: str, database: str,
                          port: int = 5432, api_key: Optional[str] = None, **kwargs):
-        """Connect to PostgreSQL…
+        """Connect to PostgreSQL."""
         try:
             from sqlalchemy import create_engine
         except ImportError:
@@ -185,19 +179,13 @@
 
         print(f"Connecting to PostgreSQL: {host}:{port}/{database}")
 
-…
+        engine = create_engine(f"postgresql://{user}:{password}@{host}:{port}/{database}")
 
         temp_db = f"sutra_postgres_{database}.db"
         instance = cls(api_key=api_key, db=temp_db, **kwargs)
 
-…
-…
-        tables = pd.read_sql_query(
-            "SELECT tablename FROM pg_tables WHERE schemaname='public'",
-            engine
-        )['tablename'].tolist()
-
-        print(f"Found {len(tables)} tables, syncing...")
+        tables = pd.read_sql_query("SELECT tablename FROM pg_tables WHERE schemaname='public'", engine)['tablename'].tolist()
+        print(f"Syncing {len(tables)} tables...")
 
         for table in tables:
             df = pd.read_sql_query(f"SELECT * FROM {table}", engine)
@@ -205,21 +193,13 @@
             print(f"  {table}: {len(df)} rows")
 
         instance._refresh_schema()
-        print(…
-
+        print("Connected!")
         return instance
 
     def upload(self, data: Union[str, pd.DataFrame], name: Optional[str] = None,
               extract_entities: Optional[List[str]] = None) -> 'SUTRA':
-        """
-        …
-
-        Args:
-            data: File path or DataFrame
-            name: Table name
-            extract_entities: Custom entities to extract (e.g., ['skills', 'technologies'])
-        """
-        print(f"\nUploading data...")
+        """Upload data."""
+        print("\nUploading data...")
 
         if isinstance(data, pd.DataFrame):
             name = name or "data"
@@ -238,42 +218,35 @@
         if ext == ".csv":
             df = pd.read_csv(path)
             self._store_dataframe(df, name)
-
         elif ext in [".xlsx", ".xls"]:
             df = pd.read_excel(path)
             self._store_dataframe(df, name)
-
         elif ext == ".json":
             df = pd.read_json(path)
             self._store_dataframe(df, name)
-
         elif ext == ".sql":
             with open(path) as f:
                 self.cursor.executescript(f.read())
             self.conn.commit()
             self._refresh_schema()
             print("SQL executed")
-
         elif ext == ".pdf":
             self._smart_upload_pdf(path, name, extract_entities)
-
         elif ext == ".docx":
             self._smart_upload_docx(path, name, extract_entities)
-
         elif ext == ".txt":
             self._smart_upload_txt(path, name, extract_entities)
-
         else:
-            raise ValueError(f"Unsupported…
+            raise ValueError(f"Unsupported: {ext}")
 
         return self
 
     def _smart_upload_pdf(self, path: Path, base_name: str, extract_entities: Optional[List[str]] = None):
-        """Parse PDF…
+        """Parse PDF."""
         if not HAS_PYPDF2:
             raise ImportError("Run: pip install PyPDF2")
 
-        print("Extracting…
+        print("Extracting from PDF...")
 
         with open(path, 'rb') as file:
             pdf_reader = PyPDF2.PdfReader(file)
@@ -283,23 +256,23 @@
                 print(f"  Page {page_num}/{len(pdf_reader.pages)}")
 
         if self.client:
-            print("AI:…
+            print("AI: Extracting entities...")
             tables = self._create_tables_with_ai(text, base_name, extract_entities)
 
             if tables and len(tables) > 0:
-                print(f"\nCreated {len(tables)}…
-                for…
-…
-                    cols = len(self.schema_info.get(…
-                    print(f"  {…
+                print(f"\nCreated {len(tables)} tables:")
+                for tbl in tables:
+                    cnt = pd.read_sql_query(f"SELECT COUNT(*) FROM {tbl}", self.conn).iloc[0, 0]
+                    cols = len(self.schema_info.get(tbl, {}))
+                    print(f"  {tbl}: {cnt} rows, {cols} columns")
                 return
 
-        print("…
+        print("Creating simple table")
         df = self._parse_text_simple(text)
         self._store_dataframe(df, base_name)
 
     def _smart_upload_docx(self, path: Path, base_name: str, extract_entities: Optional[List[str]] = None):
-        """Parse DOCX…
+        """Parse DOCX."""
         if not HAS_DOCX:
             raise ImportError("Run: pip install python-docx")
 
@@ -320,137 +293,84 @@
         text = "\n".join([para.text for para in doc.paragraphs])
 
         if self.client:
-            print("AI:…
+            print("AI: Extracting...")
             tables = self._create_tables_with_ai(text, base_name, extract_entities)
-
             if tables and len(tables) > 0:
                 print(f"\nCreated {len(tables)} tables:")
-                for…
-…
-…
-                    print(f"  {tbl_name}: {count} rows, {cols} columns")
+                for tbl in tables:
+                    cnt = pd.read_sql_query(f"SELECT COUNT(*) FROM {tbl}", self.conn).iloc[0, 0]
+                    print(f"  {tbl}: {cnt} rows")
                 return
 
         df = self._parse_text_simple(text)
         self._store_dataframe(df, base_name)
 
     def _smart_upload_txt(self, path: Path, base_name: str, extract_entities: Optional[List[str]] = None):
-        """Parse TXT…
+        """Parse TXT."""
         print("Reading TXT...")
 
         with open(path, 'r', encoding='utf-8') as file:
             text = file.read()
 
         if self.client:
-            print("AI:…
+            print("AI: Extracting...")
             tables = self._create_tables_with_ai(text, base_name, extract_entities)
-
             if tables and len(tables) > 0:
                 print(f"\nCreated {len(tables)} tables:")
-                for…
-…
-…
-                    print(f"  {tbl_name}: {count} rows, {cols} columns")
+                for tbl in tables:
+                    cnt = pd.read_sql_query(f"SELECT COUNT(*) FROM {tbl}", self.conn).iloc[0, 0]
+                    print(f"  {tbl}: {cnt} rows")
                 return
 
         df = self._parse_text_simple(text)
         self._store_dataframe(df, base_name)
 
     def _create_tables_with_ai(self, text: str, base_name: str, custom_entities: Optional[List[str]] = None) -> List[str]:
-        """
-        AI extracts ALL entities with PROPER primary and foreign keys.
-
-        CRITICAL: Each entity gets UNIQUE IDs, foreign keys properly link tables.
-        """
+        """AI extraction with proper keys."""
         if not self.client:
             return []
 
         try:
-…
-…
-…
-…
-…
-…
-…
-- …
-- …
-- …
-- …
-…
-…
-- work_experience: Work history (id, person_id, company, title, start_date, end_date)
-- events: Events/meetings (id, host_id, description, location, date, attendee_ids)
-- organizations: Companies/departments (id, name, address, city, industry)
-- products: Products/services (id, name, description, price, category)
-- ANY other structured entities you identify
+            entity_list = """Extract ALL entities you find:
+- people: id, name, email, phone, address, city, state, zip
+- skills: id, person_id, skill_name, proficiency, years
+- technologies: id, person_id, technology, category, proficiency
+- projects: id, person_id, project_name, description, role
+- certifications: id, person_id, cert_name, issuer, date
+- education: id, person_id, degree, institution, year
+- work_experience: id, person_id, company, title, start_date, end_date
+- events: id, host_id, description, location, date
+- organizations: id, name, address, city
+- ANY other structured data
+
+CRITICAL: Use UNIQUE sequential IDs (1,2,3...) for each table. Foreign keys MUST reference valid IDs."""
 
-…
+            if custom_entities:
+                entity_list = f"Extract these entities: {', '.join(custom_entities)}"
 
-            extraction_prompt = f"""
+            extraction_prompt = f"""Extract structured data from this text.
 
 Text:
-{text[:…
-
-{entity_instruction}
-
-CRITICAL REQUIREMENTS FOR PROPER DATABASE DESIGN:
-
-1. PRIMARY KEYS:
-   - Each table MUST have unique sequential IDs starting from 1
-   - Person 1 gets id=1, Person 2 gets id=2, etc.
-   - NO DUPLICATE IDs within same table
-   - IDs must be integers
+{text[:5000]}
 
-…
-   - Use foreign keys to link related tables
-   - Example: skills table has person_id that references people.id
-   - Example: projects table has person_id that references people.id
-   - Foreign keys MUST match existing primary keys
+{entity_list}
 
-…
-   - Each entity type gets its own table
-   - Use clear table names (people, skills, technologies, not table1, table2)
-   - Include ALL relevant attributes for each entity
-
-Return JSON with this EXACT structure:
+Return JSON:
 {{
-  "people": [
-    …
-    {{"id": 2, "name": "Jane Smith", "email": "jane@email.com", "phone": "+1-555-0101", "city": "New York", "state": "NY"}},
-    ...
-  ],
-  "skills": [
-    {{"id": 1, "person_id": 1, "skill_name": "Python", "proficiency": "Expert", "years": 5}},
-    {{"id": 2, "person_id": 1, "skill_name": "SQL", "proficiency": "Advanced", "years": 3}},
-    {{"id": 3, "person_id": 2, "skill_name": "Java", "proficiency": "Expert", "years": 7}},
-    ...
-  ],
-  "technologies": [
-    {{"id": 1, "person_id": 1, "technology": "React", "category": "Frontend"}},
-    {{"id": 2, "person_id": 1, "technology": "PostgreSQL", "category": "Database"}},
-    {{"id": 3, "person_id": 2, "technology": "Spring Boot", "category": "Backend"}},
-    ...
-  ],
-  "projects": [
-    {{"id": 1, "person_id": 1, "project_name": "E-commerce Platform", "role": "Lead Developer"}},
-    {{"id": 2, "person_id": 2, "project_name": "Analytics Dashboard", "role": "Backend Engineer"}},
-    ...
-  ]
+  "people": [{{"id": 1, "name": "John", ...}}, {{"id": 2, "name": "Jane", ...}}],
+  "skills": [{{"id": 1, "person_id": 1, "skill_name": "Python", ...}}, {{"id": 2, "person_id": 2, ...}}]
 }}
 
-IMPORTANT:
-- …
-- …
-- …
-- …
-- Return ONLY valid JSON, no explanations
-- Be COMPREHENSIVE - extract skills, technologies, projects, certifications, education, work history, etc."""
+Requirements:
+- UNIQUE IDs: id=1,2,3,... (no duplicates)
+- Valid foreign keys: person_id must match people.id
+- Extract EVERYTHING
+- Return ONLY valid JSON"""
 
             response = self.client.chat.completions.create(
                 model="gpt-4o-mini",
                 messages=[
-                    {"role": "system", "content": "…
+                    {"role": "system", "content": "Extract entities with unique IDs and proper foreign keys. Return only JSON."},
                     {"role": "user", "content": extraction_prompt}
                 ],
                 temperature=0,
@@ -471,20 +391,45 @@ IMPORTANT:
             try:
                 df = pd.DataFrame(records)
                 if not df.empty:
-…
+                    # FIXED: Store with better error handling
+                    self._store_dataframe_safe(df, table_name)
                     created_tables.append(table_name)
                     print(f"  {entity_type}: {len(df)} records")
             except Exception as e:
-                print(f"…
+                print(f"  Error {entity_type}: {e}")
 
             return created_tables
 
         except Exception as e:
-            print(f"AI…
+            print(f"AI error: {e}")
             return []
 
+    def _store_dataframe_safe(self, df: pd.DataFrame, name: str):
+        """FIXED: Store with proper error handling for Colab."""
+        try:
+            df.columns = [str(c).strip().replace(" ", "_").replace("-", "_") for c in df.columns]
+
+            # FIXED: Use method='multi' for better performance and if_exists='replace'
+            df.to_sql(name, self.conn, if_exists='replace', index=False, method='multi', chunksize=500)
+
+            self.conn.commit()  # FIXED: Explicit commit
+            self.current_table = name
+            self._refresh_schema()
+
+        except Exception as e:
+            # FIXED: Fallback to single-row insert if bulk fails
+            print(f"  Bulk insert failed, using row-by-row (slower but safer)")
+            try:
+                df.to_sql(name, self.conn, if_exists='replace', index=False)
+                self.conn.commit()
+                self.current_table = name
+                self._refresh_schema()
+            except Exception as e2:
+                print(f"  Storage error: {e2}")
+                raise
+
     def _parse_text_simple(self, text: str) -> pd.DataFrame:
-        """…
+        """Simple parsing."""
         lines = [line.strip() for line in text.split('\n') if line.strip()]
 
         if not lines:
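A note on the storage strategy above: pandas' to_sql with method='multi' packs many rows into each INSERT, and chunksize=500 keeps the bound-parameter count of every statement (rows × columns) under SQLite's per-statement limit; if the bulk write fails for any reason, the code retries with the default one-row-at-a-time insert before giving up. A small self-contained sketch of the same idea, with illustrative data and paths:

    import sqlite3
    import pandas as pd

    conn = sqlite3.connect("demo.db")  # illustrative path
    df = pd.DataFrame({"id": [1, 2], "name": ["Jane", "John"]})
    try:
        # One multi-row INSERT per 500-row chunk: far fewer round trips,
        # but very wide frames can exceed SQLite's bound-parameter limit.
        df.to_sql("people", conn, if_exists="replace", index=False,
                  method="multi", chunksize=500)
    except Exception:
        # Default method: plain executemany, slower but rarely rejected.
        df.to_sql("people", conn, if_exists="replace", index=False)
    conn.commit()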
@@ -507,44 +452,34 @@ IMPORTANT:
 
     def _store_dataframe(self, df: pd.DataFrame, name: str, silent: bool = False):
         """Store DataFrame."""
-
-        df.to_sql(name, self.conn, if_exists='replace', index=False)
-        self.current_table = name
-        self._refresh_schema()
+        self._store_dataframe_safe(df, name)
 
         if not silent:
             print(f"Uploaded: {name}")
             print(f"  {len(df)} rows, {len(df.columns)} columns")
 
     def ask(self, question: str, viz: Union[bool, str] = False, table: Optional[str] = None) -> 'QueryResult':
-        """…
+        """Natural language query."""
         if not self.client:
-            print("No API key")
             return QueryResult(False, "", pd.DataFrame(), None, "No API key")
 
         print(f"\nQuestion: {question}")
 
-        if self.check_relevance:
-…
-…
-…
-…
-                return QueryResult(False, "", pd.DataFrame(), None, "Irrelevant")
+        if self.check_relevance and not self._is_relevant_query(question):
+            print("Warning: Query may be irrelevant")
+            choice = input("Continue? (yes/no): ").strip().lower()
+            if choice not in ['yes', 'y']:
+                return QueryResult(False, "", pd.DataFrame(), None, "Irrelevant")
 
-        tbl = table or self.current_table
+        tbl = table or self.current_table or (self._get_table_names()[0] if self._get_table_names() else None)
         if not tbl:
-            all_tables = self._get_table_names()
-            if all_tables:
-                tbl = all_tables[0]
-            else:
-                print("No tables found")
-                return QueryResult(False, "", pd.DataFrame(), None, "No table")
+            return QueryResult(False, "", pd.DataFrame(), None, "No table")
 
         if self.use_embeddings and self.embedding_model:
-…
-            if…
-                print("…
-                return…
+            cached = self._check_embedding_cache(question, tbl)
+            if cached:
+                print("  Cached result")
+                return cached
 
         if self.fuzzy_match:
             question = self._apply_fuzzy_matching(question, tbl)
@@ -567,7 +502,7 @@ IMPORTANT:
         fig = None
         if viz:
             viz_type = viz if isinstance(viz, str) else "auto"
-            fig = self._visualize(df, question, viz_type…
+            fig = self._visualize(df, question, viz_type)
 
         result = QueryResult(True, sql_query, df, fig)
 
@@ -584,142 +519,116 @@ IMPORTANT:
         if not self.client:
             return True
 
-        tables = self._get_table_names()
-…
-        for tbl in tables…
-            cols…
-            columns.extend(cols[:5])
+        tables = self._get_table_names()[:3]
+        cols = []
+        for tbl in tables:
+            cols.extend(list(self.schema_info.get(tbl, {}).keys())[:5])
 
-…
+        context = f"Tables: {', '.join(tables)}. Columns: {', '.join(cols[:15])}"
 
         try:
-            response = self.client.chat.completions.create(
+            resp = self.client.chat.completions.create(
                 model="gpt-4o-mini",
                 messages=[
-                    {"role": "system", "content": "…
-                    {"role": "user", "content": f"…
+                    {"role": "system", "content": "Return only 'yes' or 'no'."},
+                    {"role": "user", "content": f"Relevant to {context}?\n\nQ: {question}\n\nyes/no:"}
                 ],
                 temperature=0,
                 max_tokens=5
             )
-
-            return 'yes' in response.choices[0].message.content.strip().lower()
+            return 'yes' in resp.choices[0].message.content.lower()
         except:
             return True
 
     def _apply_fuzzy_matching(self, question: str, table: str) -> str:
-        """Fuzzy…
+        """Fuzzy matching."""
         if not self.schema_info.get(table):
             return question
 
         try:
-            string_cols = [col for col, dtype in self.schema_info[table].items()
-                           if 'TEXT' in dtype or 'VARCHAR' in dtype]
-
+            string_cols = [col for col, dtype in self.schema_info[table].items() if 'TEXT' in dtype]
             if not string_cols:
                 return question
 
             for col in string_cols[:2]:
                 df = pd.read_sql_query(f"SELECT DISTINCT {col} FROM {table} LIMIT 100", self.conn)
-…
+                values = [str(v) for v in df[col].dropna().tolist()]
 
                 words = question.split()
                 for i, word in enumerate(words):
-                    matches = get_close_matches(word,…
+                    matches = get_close_matches(word, values, n=1, cutoff=0.6)
                     if matches and word != matches[0]:
                         words[i] = matches[0]
                         print(f"  Fuzzy: '{word}' -> '{matches[0]}'")
-
                 question = " ".join(words)
-
             return question
         except:
             return question
 
     def _check_embedding_cache(self, question: str, table: str) -> Optional['QueryResult']:
-        """Check…
+        """Check cache."""
         if not self.query_embeddings:
             return None
 
-        q_embedding = self.embedding_model.encode([question])[0]
+        q_emb = self.embedding_model.encode([question])[0]
 
         best_match = None
-…
+        best_sim = 0.85
 
-        for cached_q, cached_data in self.query_embeddings.items():
-            if cached_data['table'] != table:
+        for cached_q, data in self.query_embeddings.items():
+            if data['table'] != table:
                 continue
 
-…
-                np.linalg.norm(q_embedding) * np.linalg.norm(cached_data['embedding'])
-            )
+            sim = np.dot(q_emb, data['embedding']) / (np.linalg.norm(q_emb) * np.linalg.norm(data['embedding']))
 
-            if…
-…
+            if sim > best_sim:
+                best_sim = sim
                 best_match = cached_q
 
         if best_match:
-            print(f"  Similar…
+            print(f"  Similar ({best_sim:.0%}): '{best_match}'")
             return self.query_embeddings[best_match]['result']
 
         return None
 
     def _store_in_embedding_cache(self, question: str, table: str, result: 'QueryResult'):
-        """Store…
-        q_embedding = self.embedding_model.encode([question])[0]
-        self.query_embeddings[question] = {
-            'table': table,
-            'embedding': q_embedding,
-            'result': result
-        }
+        """Store cache."""
+        q_emb = self.embedding_model.encode([question])[0]
+        self.query_embeddings[question] = {'table': table, 'embedding': q_emb, 'result': result}
 
     def _visualize(self, df: pd.DataFrame, title: str, viz_type: str = "auto"):
-        """…
+        """Visualize."""
         if not HAS_PLOTLY and not HAS_MATPLOTLIB:
-            print("Install plotly or matplotlib")
             return None
 
         print(f"Creating {viz_type} chart...")
-
-        if HAS_PLOTLY:
-            return self._plotly_viz(df, title, viz_type)
-        else:
-            return self._matplotlib_viz(df, title, viz_type)
+        return self._plotly_viz(df, title, viz_type) if HAS_PLOTLY else self._matplotlib_viz(df, title, viz_type)
 
     def _plotly_viz(self, df: pd.DataFrame, title: str, viz_type: str):
-        """Plotly…
+        """Plotly viz."""
         try:
-            numeric = df.select_dtypes(include=[np.number]).columns.tolist()
-            categorical = df.select_dtypes(include=['object']).columns.tolist()
+            num = df.select_dtypes(include=[np.number]).columns.tolist()
+            cat = df.select_dtypes(include=['object']).columns.tolist()
 
-            if viz_type == "table"…
-                fig = go.Figure(data=[go.Table(
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-                fig =…
-            elif viz_type == "heatmap" and len(numeric) >= 2:
-                corr = df[numeric].corr()
-                fig = go.Figure(data=go.Heatmap(
-                    z=corr.values, x=corr.columns, y=corr.columns, colorscale='Viridis'
-                ))
+            if viz_type == "table":
+                fig = go.Figure(data=[go.Table(header=dict(values=list(df.columns)), cells=dict(values=[df[c] for c in df.columns]))])
+            elif viz_type == "pie" and cat and num:
+                fig = px.pie(df, names=cat[0], values=num[0], title=title)
+            elif viz_type == "bar" and cat and num:
+                fig = px.bar(df, x=cat[0], y=num[0], title=title)
+            elif viz_type == "line" and num:
+                fig = px.line(df, y=num[0], title=title)
+            elif viz_type == "scatter" and len(num) >= 2:
+                fig = px.scatter(df, x=num[0], y=num[1], title=title)
+            elif viz_type == "heatmap" and len(num) >= 2:
+                corr = df[num].corr()
+                fig = go.Figure(data=go.Heatmap(z=corr.values, x=corr.columns, y=corr.columns))
                 fig.update_layout(title=title)
-
-            if…
-                fig = px.pie(df, names=…
-            elif len(numeric) >= 2:
-                fig = px.line(df, y=numeric[0], title=title)
+            else:
+                if cat and num:
+                    fig = px.pie(df, names=cat[0], values=num[0], title=title) if len(df) <= 10 else px.bar(df, x=cat[0], y=num[0], title=title)
                 else:
                     fig = px.bar(df, y=df.columns[0], title=title)
-            else:
-                fig = px.bar(df, x=categorical[0] if categorical else df.index, y=numeric[0] if numeric else df.columns[0], title=title)
 
             fig.show()
             print("Chart displayed")
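The rewritten cache lookup is plain cosine similarity, sim(a, b) = a·b / (‖a‖ ‖b‖), over sentence-transformers embeddings, reusing a cached QueryResult when similarity clears the 0.85 threshold. A self-contained sketch of the lookup, assuming the cache layout that _store_in_embedding_cache writes ({'table': ..., 'embedding': ..., 'result': ...}):

    import numpy as np
    from sentence_transformers import SentenceTransformer

    def best_cached(question: str, model: SentenceTransformer,
                    cache: dict, threshold: float = 0.85):
        q = model.encode([question])[0]
        best, best_sim = None, threshold
        for cached_q, entry in cache.items():
            e = entry["embedding"]
            sim = float(np.dot(q, e) / (np.linalg.norm(q) * np.linalg.norm(e)))
            if sim > best_sim:  # strictly better than threshold/best so far
                best, best_sim = entry["result"], sim
        return best             # None when nothing is close enough

    # model = SentenceTransformer('all-MiniLM-L6-v2')  # same model __init__ loads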
@@ -729,54 +638,47 @@ IMPORTANT:
             return None
 
     def _matplotlib_viz(self, df: pd.DataFrame, title: str, viz_type: str):
-        """Matplotlib…
+        """Matplotlib viz."""
         try:
             plt.figure(figsize=(10, 6))
-            numeric = df.select_dtypes(include=[np.number]).columns
+            num = df.select_dtypes(include=[np.number]).columns
 
-            if viz_type == "pie"…
+            if viz_type == "pie":
                 df[df.columns[0]].value_counts().plot(kind='pie')
-            elif viz_type == "line" and len(numeric) > 0:
-                df[numeric[0]].plot(kind='line')
+            elif viz_type == "line" and len(num) > 0:
+                df[num[0]].plot(kind='line')
             else:
-                if len(numeric) > 0:
-                    df[numeric[0]].plot(kind='bar')
-                else:
-                    df.iloc[:, 0].value_counts().plot(kind='bar')
+                (df[num[0]] if len(num) > 0 else df.iloc[:, 0].value_counts()).plot(kind='bar')
 
             plt.title(title)
             plt.tight_layout()
             plt.show()
-            print("Chart displayed")
             return plt.gcf()
         except Exception as e:
             print(f"Viz error: {e}")
             return None
 
     def tables(self) -> Dict[str, dict]:
-        """List…
+        """List tables."""
         print("\n" + "="*70)
         print("TABLES IN DATABASE")
         print("="*70)
 
         all_tables = self._get_table_names()
-
         if not all_tables:
-            print("No tables…
+            print("No tables")
             return {}
 
         result = {}
         for i, tbl in enumerate(all_tables, 1):
-…
-            cols = self.schema_info.get(tbl, {})
-            col_list = list(cols.keys())
+            cnt = pd.read_sql_query(f"SELECT COUNT(*) FROM {tbl}", self.conn).iloc[0, 0]
+            cols = list(self.schema_info.get(tbl, {}).keys())
 
-…
-            print(f"{…
-            print(f"  {…
-            print(f"  Columns: {', '.join(col_list[:8])}")
+            print(f"  {i}. {tbl}")
+            print(f"  {cnt} rows, {len(cols)} columns")
+            print(f"  {', '.join(cols[:8])}")
 
-            result[tbl] = {'rows':…
+            result[tbl] = {'rows': cnt, 'columns': cols}
 
         print("="*70)
         return result
@@ -795,184 +697,122 @@ IMPORTANT:
         result = {}
         for tbl in tables_to_show:
             if tbl in self.schema_info:
-                count = pd.read_sql_query(f"SELECT COUNT(*) FROM {tbl}", self.conn).iloc[0, 0]
-                print(f"\nTable: {tbl}")
-                print(f"Records: {count}")
-                print("Columns:")
-
+                cnt = pd.read_sql_query(f"SELECT COUNT(*) FROM {tbl}", self.conn).iloc[0, 0]
+                print(f"\nTable: {tbl} ({cnt} records)")
                 for col, dtype in self.schema_info[tbl].items():
-                    print(f"  - {col:<30}…
-
-                result[tbl] = {
-                    'records': count,
-                    'columns': self.schema_info[tbl]
-                }
+                    print(f"  - {col:<30} {dtype}")
+                result[tbl] = {'records': cnt, 'columns': self.schema_info[tbl]}
 
         print("="*70)
         return result
 
     def peek(self, table: Optional[str] = None, n: int = 5) -> pd.DataFrame:
-        """Preview…
+        """Preview."""
        tbl = table or self.current_table
         if not tbl:
-            print("No table specified")
             return pd.DataFrame()
 
         df = pd.read_sql_query(f"SELECT * FROM {tbl} LIMIT {n}", self.conn)
-        print(f"\nSample from '{tbl}'…
+        print(f"\nSample from '{tbl}':")
         print(df.to_string(index=False))
         return df
 
     def info(self):
-        """…
+        """Overview."""
         return self.tables()
 
     def sql(self, query: str, viz: Union[bool, str] = False) -> 'QueryResult':
         """Execute SQL."""
         print("\nExecuting SQL...")
-
         try:
             df = pd.read_sql_query(query, self.conn)
             print(f"Success! {len(df)} rows")
 
-            fig = None
-            if viz:
-                viz_type = viz if isinstance(viz, str) else "auto"
-                fig = self._visualize(df, "SQL Result", viz_type=viz_type)
-
+            fig = self._visualize(df, "SQL Result", viz if isinstance(viz, str) else "auto") if viz else None
             return QueryResult(True, query, df, fig)
         except Exception as e:
             print(f"Error: {e}")
             return QueryResult(False, query, pd.DataFrame(), None, str(e))
 
     def interactive(self, question: str) -> 'QueryResult':
-        """Interactive…
+        """Interactive."""
         print(f"\nQuestion: {question}")
         choice = input("Visualize? (yes/no/pie/bar/line/scatter): ").strip().lower()
-
         viz = choice if choice in ['pie', 'bar', 'line', 'scatter', 'table', 'heatmap'] else (True if choice in ['yes', 'y'] else False)
-
         return self.ask(question, viz=viz)
 
     def export_db(self, path: str, format: str = "sqlite"):
-        """Export…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-            for table in self._get_table_names():
-                df = pd.read_sql_query(f"SELECT * FROM {table}", self.conn)
-                data[table] = df.to_dict(orient='records')
-            with open(path, 'w', encoding='utf-8') as f:
-                json.dump(data, f, indent=2, default=str)
-        elif format == "excel":
-            with pd.ExcelWriter(path, engine='openpyxl') as writer:
-                for table in self._get_table_names():
-                    df = pd.read_sql_query(f"SELECT * FROM {table}", self.conn)
-                    df.to_excel(writer, sheet_name=table[:31], index=False)
+        """Export."""
+        formats = {
+            "sqlite": lambda: shutil.copy2(self.db_path, path),
+            "sql": lambda: open(path, 'w', encoding='utf-8').writelines(f'{line}\n' for line in self.conn.iterdump()),
+            "json": lambda: json.dump({t: pd.read_sql_query(f"SELECT * FROM {t}", self.conn).to_dict('records') for t in self._get_table_names()}, open(path, 'w', encoding='utf-8'), indent=2, default=str),
+            "excel": lambda: pd.ExcelWriter(path, engine='openpyxl').__enter__() and [pd.read_sql_query(f"SELECT * FROM {t}", self.conn).to_excel(path, sheet_name=t[:31], index=False) for t in self._get_table_names()]
+        }
+
+        if format in formats:
+            formats[format]()
+            print(f"Saved: {path}")
         else:
             raise ValueError(f"Unsupported: {format}")
-
-        print(f"Saved to {path}")
         return self
 
-    def save_to_mysql(self, host: str, user: str, password: str, database: str,
-                      port: int = 3306, tables: Optional[List[str]] = None):
+    def save_to_mysql(self, host: str, user: str, password: str, database: str, port: int = 3306, tables: Optional[List[str]] = None):
         """Export to MySQL."""
         try:
             from sqlalchemy import create_engine
+            engine = create_engine(f"mysql+mysqlconnector://{user}:{password}@{host}:{port}/{database}")
+
+            print(f"Exporting to MySQL...")
+            for t in (tables or self._get_table_names()):
+                df = pd.read_sql_query(f"SELECT * FROM {t}", self.conn)
+                df.to_sql(t, engine, if_exists='replace', index=False)
+                print(f"  {t}: {len(df)} rows")
+            print("Complete!")
+            return self
         except ImportError:
             raise ImportError("Run: pip install QuerySUTRA[mysql]")
-
-        print(f"\nConnecting to MySQL: {host}:{port}...")
-
-        engine = create_engine(f"mysql+mysqlconnector://{user}:{password}@{host}:{port}/{database}")
-
-        tables_to_export = tables or self._get_table_names()
-
-        print(f"Exporting {len(tables_to_export)} tables...")
-
-        for table in tables_to_export:
-            df = pd.read_sql_query(f"SELECT * FROM {table}", self.conn)
-            df.to_sql(table, engine, if_exists='replace', index=False)
-            print(f"  {table}: {len(df)} rows")
-
-        print("Complete!")
-        return self
 
-    def save_to_postgres(self, host: str, user: str, password: str, database: str,
-                         port: int = 5432, tables: Optional[List[str]] = None):
+    def save_to_postgres(self, host: str, user: str, password: str, database: str, port: int = 5432, tables: Optional[List[str]] = None):
         """Export to PostgreSQL."""
         try:
             from sqlalchemy import create_engine
+            engine = create_engine(f"postgresql://{user}:{password}@{host}:{port}/{database}")
+
+            print(f"Exporting to PostgreSQL...")
+            for t in (tables or self._get_table_names()):
+                df = pd.read_sql_query(f"SELECT * FROM {t}", self.conn)
+                df.to_sql(t, engine, if_exists='replace', index=False)
+                print(f"  {t}: {len(df)} rows")
+            print("Complete!")
+            return self
         except ImportError:
             raise ImportError("Run: pip install QuerySUTRA[postgres]")
-
-        print(f"\nConnecting to PostgreSQL: {host}:{port}...")
-
-        engine = create_engine(f"postgresql://{user}:{password}@{host}:{port}/{database}")
-
-        tables_to_export = tables or self._get_table_names()
-
-        print(f"Exporting {len(tables_to_export)} tables...")
-
-        for table in tables_to_export:
-            df = pd.read_sql_query(f"SELECT * FROM {table}", self.conn)
-            df.to_sql(table, engine, if_exists='replace', index=False)
-            print(f"  {table}: {len(df)} rows")
-
-        print("Complete!")
-        return self
 
-    def backup(self,…
-        """…
-        if…
-…
-…
-        else:
-            backup_dir = Path(".")
-
-        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
-
-        print("\nCreating backup...")
-
-        db_backup = backup_dir / f"sutra_{timestamp}.db"
-        self.export_db(str(db_backup), format="sqlite")
-
-        json_backup = backup_dir / f"sutra_{timestamp}.json"
-        self.export_db(str(json_backup), format="json")
-
-        print(f"\nBackup complete!")
-        print(f"  Database: {db_backup}")
-        print(f"  Data: {json_backup}")
+    def backup(self, path: str = None):
+        """Backup."""
+        dir = Path(path) if path else Path(".")
+        dir.mkdir(parents=True, exist_ok=True)
+        ts = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
 
+        print("Creating backup...")
+        self.export_db(str(dir / f"sutra_{ts}.db"), "sqlite")
+        self.export_db(str(dir / f"sutra_{ts}.json"), "json")
+        print("Backup complete!")
         return self
 
     def export(self, data: pd.DataFrame, path: str, format: str = "csv"):
         """Export results."""
-…
-…
-…
-…
-        elif format == "json":
-            data.to_json(path, orient="records", indent=2)
-        else:
-            raise ValueError(f"Unknown: {format}")
-
-        print(f"Exported to {path}")
+        {"csv": lambda: data.to_csv(path, index=False),
+         "excel": lambda: data.to_excel(path, index=False),
+         "json": lambda: data.to_json(path, orient="records", indent=2)}[format]()
+        print(f"Exported: {path}")
         return self
 
     def close(self):
-        """Close…
+        """Close."""
         if self.conn:
             self.conn.close()
-        print("Closed")
 
     def _get_table_names(self) -> List[str]:
         """Get tables."""
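One observation on the new export_db dispatch table: the "sqlite" and "sql" lambdas copy the database file and dump conn.iterdump() lines, but the "excel" lambda as published passes path, not the writer, to to_excel, so each table rewrites the workbook and only the last one survives. The conventional multi-sheet pattern looks like this (a sketch; file and table names are illustrative):

    import sqlite3
    import pandas as pd

    conn = sqlite3.connect("demo.db")
    with pd.ExcelWriter("export.xlsx", engine="openpyxl") as writer:
        for table in ["people", "skills"]:
            df = pd.read_sql_query(f"SELECT * FROM {table}", conn)
            # Excel caps sheet names at 31 characters, hence the slice.
            df.to_excel(writer, sheet_name=table[:31], index=False)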
@@ -981,45 +821,27 @@ IMPORTANT:
 
     def _refresh_schema(self):
         """Refresh schema."""
-        tables = self._get_table_names()
-
         self.schema_info = {}
-        for tbl in tables…
+        for tbl in self._get_table_names():
             self.cursor.execute(f"PRAGMA table_info({tbl})")
             self.schema_info[tbl] = {r[1]: r[2] for r in self.cursor.fetchall()}
 
     def _generate_sql(self, question: str, table: str) -> str:
         """Generate SQL."""
         schema = self.schema_info.get(table, {})
-        sample_df = pd.read_sql_query(f"SELECT * FROM {table} LIMIT 3", self.conn)
-        sample = sample_df.to_string(index=False)
-
+        sample = pd.read_sql_query(f"SELECT * FROM {table} LIMIT 3", self.conn).to_string(index=False)
         schema_str = ", ".join([f"{col} ({dtype})" for col, dtype in schema.items()])
 
-…
-…
-Database: SQLite
-Table: {table}
-Columns: {schema_str}
-
-Sample:
-{sample}
-
-Question: {question}
-
-Return ONLY SQL."""
-
-        response = self.client.chat.completions.create(
+        resp = self.client.chat.completions.create(
             model="gpt-4o-mini",
             messages=[
-                {"role": "system", "content": "SQL expert. Return only SQL…
-                {"role": "user", "content":…
+                {"role": "system", "content": "SQL expert. Return only SQL."},
+                {"role": "user", "content": f"Convert to SQL.\nTable: {table}\nColumns: {schema_str}\nSample:\n{sample}\n\nQ: {question}\n\nSQL:"}
             ],
             temperature=0
         )
 
-        sql = response.choices[0].message.content.strip()
-        return sql.replace("```sql", "").replace("```", "").strip()
+        return resp.choices[0].message.content.strip().replace("```sql", "").replace("```", "").strip()
 
     def __enter__(self):
         return self
@@ -1028,53 +850,25 @@ Return ONLY SQL."""
         self.close()
 
     def __repr__(self):
-        features = []
-        if self.cache_queries:
-            features.append("cache")
-        if self.use_embeddings:
-            features.append("embeddings")
-        if self.check_relevance:
-            features.append("relevance")
-        if self.fuzzy_match:
-            features.append("fuzzy")
-
-        feat_str = f", {', '.join(features)}" if features else ""
-        return f"SUTRA(tables={len(self.schema_info)}{feat_str})"
+        feat = [f for f, v in [("cache", self.cache_queries), ("embeddings", self.use_embeddings), ("relevance", self.check_relevance), ("fuzzy", self.fuzzy_match)] if v]
+        return f"SUTRA(tables={len(self.schema_info)}, {', '.join(feat)})"
 
 
 class QueryResult:
-    """…
-…
+    """Result."""
     def __init__(self, success: bool, sql: str, data: pd.DataFrame, viz, error: str = None):
-        self.success = success
-        self.sql = sql
-        self.data = data
-        self.viz = viz
-        self.error = error
+        self.success, self.sql, self.data, self.viz, self.error = success, sql, data, viz, error
 
     def __repr__(self):
         return f"QueryResult(rows={len(self.data)}, cols={len(self.data.columns)})" if self.success else f"QueryResult(error='{self.error}')"
 
     def show(self):
-        print(self.data…
+        print(self.data if self.success else f"Error: {self.error}")
         return self
 
 
 def quick_start(api_key: str, data_path: str, question: str, viz: Union[bool, str] = False):
-    """…
+    """Quick start."""
     with SUTRA(api_key=api_key) as sutra:
         sutra.upload(data_path)
         return sutra.ask(question, viz=viz)
-
-
-if __name__ == "__main__":
-    print("""
-    QuerySUTRA v0.3.3 - Professional Data Analysis
-    SUTRA: Structured-Unstructured-Text-Retrieval-Architecture
-
-    Fixed: Proper primary and foreign keys with unique IDs
-    Features: Load existing DB, custom viz, fuzzy matching, embeddings
-
-    Installation: pip install QuerySUTRA
-    Usage: from sutra import SUTRA
-    """)