PyPI - QuerySUTRA - Versions diffs - 0.5.3__py3-none-any.whl → 0.6.1__py3-none-any.whl - Mend

QuerySUTRA 0.5.3__py3-none-any.whl → 0.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18)

{querysutra-0.5.3.dist-info → querysutra-0.6.1.dist-info}/METADATA +18 -2
querysutra-0.6.1.dist-info/RECORD +22 -0
{querysutra-0.5.3.dist-info → querysutra-0.6.1.dist-info}/WHEEL +1 -1
querysutra-0.6.1.dist-info/top_level.txt +1 -0
sutra/__init__.py +6 -4
sutra/database_manager.py +235 -195
sutra/nlp_processor.py +175 -143
sutra/schema_generator.py +56 -52
sutra/sutra.py +133 -16
querysutra-0.5.3.dist-info/RECORD +0 -28
querysutra-0.5.3.dist-info/top_level.txt +0 -3
tests/__init__.py +0 -0
tests/test_modules.py +0 -0
tests/test_sutra.py +0 -76
utils/__init__.py +0 -0
utils/file_utils.py +0 -0
utils/text_utils.py +0 -0
{querysutra-0.5.3.dist-info → querysutra-0.6.1.dist-info}/licenses/LICENSE +0 -0

sutra/sutra.py CHANGED Viewed

@@ -1,5 +1,5 @@
-"""QuerySUTRA v0.5.2 - FIXED: Smart table selection"""
-__version__ = "0.5.2"
+"""QuerySUTRA v0.6.1 - AI-powered data analysis for structured and unstructured data"""
+__version__ = "0.6.1"
 __author__ = "Aditya Batta"
 __all__ = ["SUTRA", "QueryResult"]
@@ -72,7 +72,7 @@ class SUTRA:
                 pass
         self._refresh_schema()
-        print(f"QuerySUTRA v0.5.2 Ready")
+        print(f"QuerySUTRA v{__version__} Ready")
     def upload(self, data: Union[str, pd.DataFrame], name: Optional[str] = None) -> 'SUTRA':
         """Upload."""
@@ -136,6 +136,8 @@ class SUTRA:
                         rec['id'] = idx
                     self._store(pd.DataFrame(recs), f"{name}_{etype}")
                     print(f"  {etype}: {len(recs)} rows")
+            # After all tables are created, detect and store foreign key relationships
+            self._create_foreign_keys()
             return
         print("Using regex fallback...")
@@ -253,8 +255,17 @@ JSON:"""
         self._store(pd.DataFrame({'line': range(1, len(lines)+1), 'text': lines}), name)
     def _store(self, df: pd.DataFrame, name: str):
-        """Store."""
+        """Store. Flattens any list/dict values to strings for SQLite compatibility."""
         df.columns = [str(c).strip().replace(" ", "_").replace("-", "_") for c in df.columns]
+        # Flatten list/dict values to strings — SQLite can't store Python objects
+        for col in df.columns:
+            df[col] = df[col].apply(
+                lambda x: ', '.join(str(i) for i in x) if isinstance(x, list)
+                else json.dumps(x) if isinstance(x, dict)
+                else x
+            )
         try:
             df.to_sql(name, self.conn, if_exists='replace', index=False, method='multi', chunksize=500)
         except:
@@ -264,6 +275,55 @@ JSON:"""
         self._refresh_schema()
         print(f"  {name}: {len(df)} rows")
+    def _create_foreign_keys(self, silent=False):
+        """Detect foreign key relationships between tables by matching column naming patterns.
+        e.g., 'person_id' in work_experience -> 'id' in people table."""
+        tables = self._get_tables()
+        # Build a map of potential parent tables by looking for 'id' columns
+        # e.g., employee_data_people has 'id' -> can be referenced as person_id, people_id
+        parent_map = {}  # Maps potential FK column names -> (parent_table, parent_pk)
+        for t in tables:
+            self.cursor.execute(f"PRAGMA table_info({t})")
+            cols = {r[1]: r[2] for r in self.cursor.fetchall()}
+            if 'id' in cols:
+                # Generate possible FK names from table name
+                # e.g., 'employee_data_people' -> 'person_id', 'people_id'
+                parts = t.split('_')
+                for part in parts:
+                    # singular form guesses
+                    fk_name = f"{part}_id"
+                    parent_map[fk_name] = (t, 'id')
+                    # Handle plural -> singular (people -> person)
+                    if part.endswith('ies'):
+                        parent_map[f"{part[:-3]}y_id"] = (t, 'id')
+                    elif part.endswith('es'):
+                        parent_map[f"{part[:-2]}_id"] = (t, 'id')
+                    elif part.endswith('s'):
+                        parent_map[f"{part[:-1]}_id"] = (t, 'id')
+                # Also try full table name as FK
+                parent_map[f"{t}_id"] = (t, 'id')
+        # Now scan all tables for columns matching FK patterns
+        self.foreign_keys = {}  # table -> [(fk_col, parent_table, parent_col)]
+        for t in tables:
+            self.cursor.execute(f"PRAGMA table_info({t})")
+            cols = [r[1] for r in self.cursor.fetchall()]
+            fks = []
+            for col in cols:
+                if col in parent_map:
+                    parent_table, parent_col = parent_map[col]
+                    if parent_table != t:  # Don't self-reference
+                        fks.append((col, parent_table, parent_col))
+            if fks:
+                self.foreign_keys[t] = fks
+        if self.foreign_keys and not silent:
+            print(f"\n🔗 Detected relationships:")
+            for t, fks in self.foreign_keys.items():
+                for fk_col, parent_table, parent_col in fks:
+                    print(f"   {t}.{fk_col} → {parent_table}.{parent_col}")
     def ask(self, q: str, viz: Union[bool, str] = False, table: Optional[str] = None) -> 'QueryResult':
         """
         Query - FIXED: Considers ALL tables, picks best one or joins multiple.
@@ -273,6 +333,10 @@ JSON:"""
         print(f"\nQuestion: {q}")
+        # Ensure foreign key relationships are detected
+        if not hasattr(self, 'foreign_keys') or not self.foreign_keys:
+            self._create_foreign_keys(silent=True)
         # FIXED: If no table specified, let AI pick the right one(s)
         if not table:
             # Get ALL table schemas
@@ -308,6 +372,24 @@ JSON:"""
             print(f"Error: {e}")
             return QueryResult(False, sql, pd.DataFrame(), None, str(e))
+    def _get_relationship_context(self) -> str:
+        """Build a clear relationship context string for the AI prompt."""
+        if not hasattr(self, 'foreign_keys') or not self.foreign_keys:
+            # Try to detect relationships if not already done
+            self._create_foreign_keys(silent=True)
+        if not hasattr(self, 'foreign_keys') or not self.foreign_keys:
+            return ""
+        lines = ["\n=== TABLE RELATIONSHIPS (FOREIGN KEYS) ==="]
+        lines.append("Use these to JOIN tables when a question needs data from multiple tables:")
+        for t, fks in self.foreign_keys.items():
+            for fk_col, parent_table, parent_col in fks:
+                lines.append(f"  {t}.{fk_col} → {parent_table}.{parent_col}")
+                lines.append(f"    JOIN syntax: JOIN {parent_table} ON {t}.{fk_col} = {parent_table}.{parent_col}")
+        lines.append("=" * 50)
+        return "\n".join(lines)
     def _gen_sql_smart(self, q: str, all_schemas: Dict) -> str:
         """
         FIXED: Generate SQL considering ALL tables and their relationships.
@@ -318,35 +400,70 @@ JSON:"""
             schema_context += f"\n{tbl} ({info['row_count']} rows):\n"
             schema_context += f"  Columns: {', '.join(info['columns'])}\n"
-        # Add sample data from key tables
+        # Add relationship context
+        relationship_context = self._get_relationship_context()
+        # Add sample data from ALL tables (not just first 3)
         samples = ""
-        for tbl in list(all_schemas.keys())[:3]:  # First 3 tables
+        for tbl in list(all_schemas.keys())[:6]:  # Show more tables
             try:
                 sample_df = pd.read_sql_query(f"SELECT * FROM {tbl} LIMIT 2", self.conn)
                 samples += f"\nSample from {tbl}:\n{sample_df.to_string(index=False)}\n"
             except:
                 pass
-        prompt = f"""You are an SQL expert. Generate a query for this question.
+        prompt = f"""You are an expert SQL query generator.
 {schema_context}
+{relationship_context}
 {samples}
 Question: {q}
-Rules:
-1. Use JOIN if question needs data from multiple tables
-2. If asking about "employee" or "person" info, always include employee_data_people table
-3. Use proper foreign key relationships (person_id references people.id)
-4. Return employee names/info when asked "which employee" or "who"
+CRITICAL INSTRUCTIONS - FOLLOW THESE STEPS:
+STEP 1: READ THE TABLE RELATIONSHIPS SECTION ABOVE.
+        Those show you exactly how tables connect via foreign keys.
+STEP 2: IDENTIFY WHICH TABLES HAVE THE DATA NEEDED.
+        - Person info (name, email, city, state) → look in *_people table
+        - Work info (company, position, start_date) → look in *_work_experience table
+        - Skills, education, etc. → look in their respective tables
+STEP 3: IF THE QUESTION NEEDS DATA FROM MULTIPLE TABLES, YOU MUST USE JOIN.
+        Use the foreign key relationships shown above.
+        Example: If work_experience has person_id and people has id:
+        JOIN people ON work_experience.person_id = people.id
+STEP 4: WRITE THE QUERY.
+        - Use table aliases for readability
+        - Qualify ALL column names with table alias to avoid ambiguity
+        - For "who" / "which person" questions, ALWAYS join to the people table to get names
+        - For "from <state>" or "in <city>" questions, the location is in the people table, JOIN to it
+        - For "count by state" or "group by state", the state column is in the people table, JOIN to it
+EXAMPLES:
+❌ WRONG: SELECT COUNT(*) FROM work_experience GROUP BY company
+   (when asked "count by state" - state is NOT in work_experience!)
+✅ CORRECT: SELECT p.state, COUNT(*) as employee_count
+            FROM work_experience w
+            JOIN people p ON w.person_id = p.id
+            GROUP BY p.state
+❌ WRONG: SELECT * FROM work_experience WHERE company LIKE '%FL%'
+   (when asked "how many from FL" - FL is a state, not a company!)
+✅ CORRECT: SELECT COUNT(*) as count
+            FROM people p
+            WHERE p.state = 'FL'
-Return ONLY the SQL query, no explanations:"""
+Return ONLY the executable SQL query. No explanations, no markdown, no code blocks:"""
         r = self.client.chat.completions.create(
             model="gpt-4o-mini",
             messages=[
-                {"role": "system", "content": "SQL expert. Generate queries using proper JOINs. Return only SQL."},
+                {"role": "system", "content": "You are an expert SQL query generator. ALWAYS use JOIN when data is spread across multiple tables. ALWAYS check which table a column belongs to before using it. State, city, name are typically in people tables. Position, company are in work_experience tables. Return ONLY executable SQL."},
                 {"role": "user", "content": prompt}
             ],
             temperature=0
@@ -562,7 +679,7 @@ Return ONLY the SQL query, no explanations:"""
         return [r[0] for r in self.cursor.fetchall()]
     def _refresh_schema(self):
-        """Refresh."""
+        """Refresh schema info."""
         self.schema_info = {}
         for t in self._get_tables():
             self.cursor.execute(f"PRAGMA table_info({t})")

querysutra-0.5.3.dist-info/RECORD DELETED Viewed

@@ -1,28 +0,0 @@
-querysutra-0.5.3.dist-info/licenses/LICENSE,sha256=F-4b93u0OVrVwGXgMwBRq6MlGyUT9zmre1oh5Gft5Ts,1066
-sutra/__init__.py,sha256=25HUMETpmA1tlMl5j-ajdo9MRXljSZBrirSTH7w7jIc,118
-sutra/cache_manager.py,sha256=e0AAeUqoR-aiqzZ3fB-IDvpJ4JA6-YBFyRJxusEnIrA,3082
-sutra/clear_cache.py,sha256=rVIz29p7V11Uh6oHXeaWpFtYXXv-2OED91cHMAWWxtQ,187
-sutra/core.py,sha256=R_JbOlZTukegP92Dr-WLsdr632_otFN7o9qSvcxyBtw,10497
-sutra/data_loader.py,sha256=_yPj-DS2qYtlCgaMACQtfXZfSuAdVVd4igNP7yzXolc,5781
-sutra/database_manager.py,sha256=L-QC_WwR3Pnl1BRh0rnEv5MNSTr4C7ZP-hIPfCHRK88,7672
-sutra/direct_query.py,sha256=X69I646zHIZlZjMmgn8O2xLS_7ww7miAkABTnJEPAAc,2724
-sutra/feedback.py,sha256=PHSffU_rfORjLkTW3-j2VSjQdw4ufROsTeBWaX6DZ00,1642
-sutra/feedback_matcher.py,sha256=WXYpGtFJnOyYQOzy-z8uBiUWH5vyJJOMS1NwEYzNfic,2865
-sutra/nlp_processor.py,sha256=wMS1hz1aGWjSwPUD7lSNBbQapFtLgF2l65j0QKXQOd0,5461
-sutra/schema_embeddings.py,sha256=bVPzpJOdYTyUdG2k3ZdgYJLrX2opHBx68RIjJcMlueo,9732
-sutra/schema_generator.py,sha256=BX_vXmnvSGc6nCBx40WLSoNL3WIYPDahd1cEYloyY4M,1925
-sutra/sutra.py,sha256=61juV3zlMau4UZJ-5IxjaN-Bc1XBP8w2vkYfum-aXlY,21979
-sutra/sutra_client.py,sha256=PYYDGqVbA9pB-Zcsm52i9KarwijCIGVZOThgONZP6Vs,14203
-sutra/sutra_core.py,sha256=diaWOXUHn1wrqCQrBhLKL612tMQioaqx-ILc3y9-CqM,11708
-sutra/sutra_simple.py,sha256=rnqzG7OAt4p64XtO0peMqHS1pG5tdA8U3EYTMVsq7BE,23201
-sutra/visualizer.py,sha256=YOKTmjQcY72smmx9KsZrQTdbAiE5GQDKofMFjpLIUfI,6996
-tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-tests/test_modules.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-tests/test_sutra.py,sha256=6Z4SoIuBzza101304I7plkyPVkUBbjIxR8uPs9z5ntg,2383
-utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-utils/file_utils.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-utils/text_utils.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-querysutra-0.5.3.dist-info/METADATA,sha256=yFffBSYGfbLrYnXA7OFGHk1mO37fpUV-0iglmHXbAVQ,7258
-querysutra-0.5.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-querysutra-0.5.3.dist-info/top_level.txt,sha256=9v0buw21eo5LaUU_3Cf9b9MqRyEvtM9cHaOuEXUKVqM,18
-querysutra-0.5.3.dist-info/RECORD,,

querysutra-0.5.3.dist-info/top_level.txt DELETED Viewed

@@ -1,3 +0,0 @@
-sutra
-tests
-utils

tests/__init__.py DELETED Viewed

File without changes

tests/test_modules.py DELETED Viewed

File without changes

tests/test_sutra.py DELETED Viewed

@@ -1,76 +0,0 @@
-"""
-Test suite for SUTRA library
-Run with: pytest test_sutra.py
-"""
-import pytest
-import pandas as pd
-import os
-from sutra import SutraClient
-class TestSutraClient:
-    """Test cases for SutraClient"""
-    @pytest.fixture
-    def client(self):
-        """Create a test client"""
-        # Use a test database
-        api_key = os.getenv('OPENAI_API_KEY', 'test-key')
-        client = SutraClient(api_key=api_key, db_path="test_db.db")
-        yield client
-        # Cleanup
-        client.close()
-        if os.path.exists("test_db.db"):
-            os.remove("test_db.db")
-    @pytest.fixture
-    def sample_data(self):
-        """Create sample DataFrame"""
-        return pd.DataFrame({
-            'name': ['Alice', 'Bob', 'Charlie'],
-            'age': [25, 30, 35],
-            'city': ['New York', 'London', 'Paris']
-        })
-    def test_client_initialization(self, client):
-        """Test client can be initialized"""
-        assert client is not None
-        assert client.db_path == "test_db.db"
-    def test_upload_dataframe(self, client, sample_data):
-        """Test uploading a DataFrame"""
-        result = client.upload_dataframe(sample_data, "test_table")
-        assert result['status'] == 'success'
-        assert result['table_name'] == 'test_table'
-        assert result['rows_inserted'] == 3
-    def test_list_tables(self, client, sample_data):
-        """Test listing tables"""
-        client.upload_dataframe(sample_data, "test_table")
-        tables = client.list_tables()
-        assert 'test_table' in tables
-    def test_execute_sql(self, client, sample_data):
-        """Test direct SQL execution"""
-        client.upload_dataframe(sample_data, "test_table")
-        result = client.execute_sql("SELECT * FROM test_table")
-        assert result['status'] == 'success'
-        assert len(result['results']) == 3
-    def test_get_table_info(self, client, sample_data):
-        """Test getting table information"""
-        client.upload_dataframe(sample_data, "test_table")
-        info = client.get_table_info("test_table")
-        assert info['table_name'] == 'test_table'
-        assert len(info['columns']) > 0
-def test_import():
-    """Test that the library can be imported"""
-    from sutra import SutraClient
-    assert SutraClient is not None
-if __name__ == "__main__":
-    pytest.main([__file__, "-v"])

utils/__init__.py DELETED Viewed

File without changes

utils/file_utils.py DELETED Viewed

File without changes

utils/text_utils.py DELETED Viewed

File without changes

{querysutra-0.5.3.dist-info → querysutra-0.6.1.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

QuerySUTRA 0.5.3__py3-none-any.whl → 0.6.1__py3-none-any.whl

QuerySUTRA 0.5.3__py3-none-any.whl → 0.6.1__py3-none-any.whl