PyPI - QuerySUTRA - Versions diffs - 0.5.2__py3-none-any.whl → 0.5.3__py3-none-any.whl - Mend

QuerySUTRA 0.5.2__py3-none-any.whl → 0.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6)

{querysutra-0.5.2.dist-info → querysutra-0.5.3.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: QuerySUTRA
-Version: 0.5.2
+Version: 0.5.3
 Summary: SUTRA
 Author: Aditya Batta
 License: MIT

{querysutra-0.5.2.dist-info → querysutra-0.5.3.dist-info}/RECORD RENAMED Viewed

@@ -1,4 +1,4 @@
-querysutra-0.5.2.dist-info/licenses/LICENSE,sha256=F-4b93u0OVrVwGXgMwBRq6MlGyUT9zmre1oh5Gft5Ts,1066
+querysutra-0.5.3.dist-info/licenses/LICENSE,sha256=F-4b93u0OVrVwGXgMwBRq6MlGyUT9zmre1oh5Gft5Ts,1066
 sutra/__init__.py,sha256=25HUMETpmA1tlMl5j-ajdo9MRXljSZBrirSTH7w7jIc,118
 sutra/cache_manager.py,sha256=e0AAeUqoR-aiqzZ3fB-IDvpJ4JA6-YBFyRJxusEnIrA,3082
 sutra/clear_cache.py,sha256=rVIz29p7V11Uh6oHXeaWpFtYXXv-2OED91cHMAWWxtQ,187
@@ -11,7 +11,7 @@ sutra/feedback_matcher.py,sha256=WXYpGtFJnOyYQOzy-z8uBiUWH5vyJJOMS1NwEYzNfic,286
 sutra/nlp_processor.py,sha256=wMS1hz1aGWjSwPUD7lSNBbQapFtLgF2l65j0QKXQOd0,5461
 sutra/schema_embeddings.py,sha256=bVPzpJOdYTyUdG2k3ZdgYJLrX2opHBx68RIjJcMlueo,9732
 sutra/schema_generator.py,sha256=BX_vXmnvSGc6nCBx40WLSoNL3WIYPDahd1cEYloyY4M,1925
-sutra/sutra.py,sha256=XgNCY8QPOod0-ymt6R50JMaHJetyfTsElzyvNHpYStw,20664
+sutra/sutra.py,sha256=61juV3zlMau4UZJ-5IxjaN-Bc1XBP8w2vkYfum-aXlY,21979
 sutra/sutra_client.py,sha256=PYYDGqVbA9pB-Zcsm52i9KarwijCIGVZOThgONZP6Vs,14203
 sutra/sutra_core.py,sha256=diaWOXUHn1wrqCQrBhLKL612tMQioaqx-ILc3y9-CqM,11708
 sutra/sutra_simple.py,sha256=rnqzG7OAt4p64XtO0peMqHS1pG5tdA8U3EYTMVsq7BE,23201
@@ -22,7 +22,7 @@ tests/test_sutra.py,sha256=6Z4SoIuBzza101304I7plkyPVkUBbjIxR8uPs9z5ntg,2383
 utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 utils/file_utils.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 utils/text_utils.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-querysutra-0.5.2.dist-info/METADATA,sha256=8brpcR8UxQwuz28hi8oUL8F5Dfug5AcFk_SdReJlWd0,7258
-querysutra-0.5.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-querysutra-0.5.2.dist-info/top_level.txt,sha256=9v0buw21eo5LaUU_3Cf9b9MqRyEvtM9cHaOuEXUKVqM,18
-querysutra-0.5.2.dist-info/RECORD,,
+querysutra-0.5.3.dist-info/METADATA,sha256=yFffBSYGfbLrYnXA7OFGHk1mO37fpUV-0iglmHXbAVQ,7258
+querysutra-0.5.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+querysutra-0.5.3.dist-info/top_level.txt,sha256=9v0buw21eo5LaUU_3Cf9b9MqRyEvtM9cHaOuEXUKVqM,18
+querysutra-0.5.3.dist-info/RECORD,,

sutra/sutra.py CHANGED Viewed

@@ -1,5 +1,5 @@
-"""QuerySUTRA v0.5.1 - BULLETPROOF & FIXED"""
-__version__ = "0.5.1"
+"""QuerySUTRA v0.5.2 - FIXED: Smart table selection"""
+__version__ = "0.5.2"
 __author__ = "Aditya Batta"
 __all__ = ["SUTRA", "QueryResult"]
@@ -41,7 +41,7 @@ except:
 class SUTRA:
-    """SUTRA - BULLETPROOF"""
+    """SUTRA - FIXED: Considers ALL tables"""
     def __init__(self, api_key: Optional[str] = None, db: str = "sutra.db",
                  use_embeddings: bool = False, fuzzy_match: bool = True,
@@ -72,7 +72,7 @@ class SUTRA:
                 pass
         self._refresh_schema()
-        print(f"QuerySUTRA v0.5.1 Ready")
+        print(f"QuerySUTRA v0.5.2 Ready")
     def upload(self, data: Union[str, pd.DataFrame], name: Optional[str] = None) -> 'SUTRA':
         """Upload."""
@@ -105,7 +105,7 @@ class SUTRA:
         return self
     def _pdf(self, path: Path, name: str):
-        """BULLETPROOF PDF - ALWAYS creates multiple tables."""
+        """PDF extraction."""
         if not HAS_PYPDF2:
             raise ImportError("pip install PyPDF2")
@@ -115,12 +115,11 @@ class SUTRA:
             text = "".join([p.extract_text() + "\n" for p in PyPDF2.PdfReader(f).pages])
         if not self.client:
-            print("ERROR: No API key! Set api_key parameter")
+            print("ERROR: No API key!")
             return
         print("AI: Extracting...")
-        # TRY 3 TIMES
         entities = None
         for attempt in [1, 2, 3]:
             entities = self._extract(text, attempt)
@@ -129,7 +128,6 @@ class SUTRA:
             if attempt < 3:
                 print(f"  Retry {attempt+1}/3...")
-        # Create tables from entities
         if entities and len(entities) > 0:
             print(f"Extracted {len(entities)} entity types:")
             for etype, recs in entities.items():
@@ -140,16 +138,12 @@ class SUTRA:
                     print(f"  {etype}: {len(recs)} rows")
             return
-        # REGEX FALLBACK - FIXED
         print("Using regex fallback...")
         people = []
         emails = re.findall(r'[\w\.-]+@[\w\.-]+\.\w+', text)
-        # Extract names from common patterns
         name_patterns = [
             r'(?:Employee|Name|Mr\.|Mrs\.|Ms\.|Dr\.)\s*[:\-]?\s*([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)',
             r'([A-Z][a-z]+\s+[A-Z][a-z]+)\s+(?:lives|resides|works|is based)',
-            r'\*\*([A-Z][a-z]+\s+[A-Z][a-z]+)\*\*'
         ]
         names = []
@@ -158,7 +152,6 @@ class SUTRA:
             if len(names) >= len(emails):
                 break
-        # Match emails to names
         max_people = min(len(emails), 50)
         for i in range(max_people):
             people.append({
@@ -169,55 +162,49 @@ class SUTRA:
         if people:
             self._store(pd.DataFrame(people), f"{name}_people")
-            print(f"  Extracted {len(people)} people via regex")
+            print(f"  Extracted {len(people)} people")
         else:
-            # Absolute last resort
             lines = [l.strip() for l in text.split('\n') if l.strip()][:100]
             self._store(pd.DataFrame({'line': range(1, len(lines)+1), 'text': lines}), name)
     def _extract(self, text: str, attempt: int) -> Dict:
-        """Extract with 3 different strategies."""
+        """Extract."""
         if not self.client:
             return {}
         try:
             if attempt == 1:
                 sys_msg = "Extract entities as JSON. Return ONLY valid JSON."
-                usr_msg = f"""Extract ALL entities from text.
+                usr_msg = f"""Extract ALL entities.
 Text:
 {text[:15000]}
-Return JSON with: people, skills, technologies, projects, certifications, education, work_experience
+JSON with: people, skills, technologies, projects, certifications, education, work_experience
 Example:
-{{"people":[{{"id":1,"name":"Sarah Johnson","email":"sarah@co.com","city":"New York","state":"NY"}},{{"id":2,"name":"Michael Chen","email":"michael@co.com","city":"SF","state":"CA"}}],"skills":[{{"id":1,"person_id":1,"skill_name":"Python","proficiency":"Expert"}}]}}
+{{"people":[{{"id":1,"name":"Sarah","email":"s@co.com","city":"NYC","state":"NY"}}],"skills":[{{"id":1,"person_id":1,"skill_name":"Python"}}]}}
-Rules: Unique IDs (1,2,3...), person_id references people.id
+Unique IDs (1,2,3...), person_id links to people.id
 JSON:"""
             elif attempt == 2:
                 sys_msg = "Return JSON."
                 usr_msg = f"""Text: {text[:10000]}
-Extract people as JSON:
-{{"people":[{{"id":1,"name":"...","email":"...","city":"..."}}]}}
+Extract people:
+{{"people":[{{"id":1,"name":"...","email":"..."}}]}}
 JSON:"""
             else:
-                sys_msg = "JSON only."
-                usr_msg = f"""Find names and emails in: {text[:8000]}
+                sys_msg = "JSON."
+                usr_msg = f"""Names/emails from: {text[:8000]}
-{{"people":[{{"id":1,"name":"John","email":"john@co.com"}}]}}"""
+{{"people":[{{"id":1,"name":"John","email":"j@co.com"}}]}}"""
             r = self.client.chat.completions.create(
                 model="gpt-4o-mini",
-                messages=[
-                    {"role": "system", "content": sys_msg},
-                    {"role": "user", "content": usr_msg}
-                ],
+                messages=[{"role": "system", "content": sys_msg}, {"role": "user", "content": usr_msg}],
                 temperature=0,
                 max_tokens=12000
             )
@@ -237,7 +224,6 @@ JSON:"""
                 has_data = any(isinstance(v, list) and len(v) > 0 for v in result.values())
                 if has_data:
                     return result
             return {}
         except Exception as e:
@@ -279,29 +265,32 @@ JSON:"""
         print(f"  {name}: {len(df)} rows")
     def ask(self, q: str, viz: Union[bool, str] = False, table: Optional[str] = None) -> 'QueryResult':
-        """Query."""
+        """
+        Query - FIXED: Considers ALL tables, picks best one or joins multiple.
+        """
         if not self.client:
             return QueryResult(False, "", pd.DataFrame(), None, "No API")
-        t = table or self.current_table or (self._get_tables()[0] if self._get_tables() else None)
-        if not t:
-            return QueryResult(False, "", pd.DataFrame(), None, "No table")
-        if self.use_embeddings and self.embedding_model:
-            cached = self._check_cache(q, t)
-            if cached:
-                return cached
+        print(f"\nQuestion: {q}")
+        # FIXED: If no table specified, let AI pick the right one(s)
+        if not table:
+            # Get ALL table schemas
+            all_schemas = {}
+            for tbl in self._get_tables():
+                all_schemas[tbl] = {
+                    'columns': list(self.schema_info.get(tbl, {}).keys()),
+                    'row_count': pd.read_sql_query(f"SELECT COUNT(*) FROM {tbl}", self.conn).iloc[0, 0]
+                }
+            # Let AI decide which table(s) to use
+            sql = self._gen_sql_smart(q, all_schemas)
+        else:
+            # Use specified table
+            sql = self._gen_sql(q, table)
         if self.fuzzy_match:
-            q = self._fuzzy(q, t)
-        key = hashlib.md5(f"{q}:{t}".encode()).hexdigest()
-        if self.cache_queries and self.cache and key in self.cache:
-            sql = self.cache[key]
-        else:
-            sql = self._gen_sql(q, t)
-            if self.cache_queries and self.cache:
-                self.cache[key] = sql
+            q = self._fuzzy(q, table or self._get_tables()[0])
         print(f"SQL: {sql}")
@@ -312,12 +301,58 @@ JSON:"""
             r = QueryResult(True, sql, df, fig)
             if self.use_embeddings and self.embedding_model:
-                self._store_cache(q, t, r)
+                self._store_cache(q, table or "all", r)
             return r
         except Exception as e:
+            print(f"Error: {e}")
             return QueryResult(False, sql, pd.DataFrame(), None, str(e))
+    def _gen_sql_smart(self, q: str, all_schemas: Dict) -> str:
+        """
+        FIXED: Generate SQL considering ALL tables and their relationships.
+        """
+        # Build context with ALL tables
+        schema_context = "Database has these tables:\n"
+        for tbl, info in all_schemas.items():
+            schema_context += f"\n{tbl} ({info['row_count']} rows):\n"
+            schema_context += f"  Columns: {', '.join(info['columns'])}\n"
+        # Add sample data from key tables
+        samples = ""
+        for tbl in list(all_schemas.keys())[:3]:  # First 3 tables
+            try:
+                sample_df = pd.read_sql_query(f"SELECT * FROM {tbl} LIMIT 2", self.conn)
+                samples += f"\nSample from {tbl}:\n{sample_df.to_string(index=False)}\n"
+            except:
+                pass
+        prompt = f"""You are an SQL expert. Generate a query for this question.
+{schema_context}
+{samples}
+Question: {q}
+Rules:
+1. Use JOIN if question needs data from multiple tables
+2. If asking about "employee" or "person" info, always include employee_data_people table
+3. Use proper foreign key relationships (person_id references people.id)
+4. Return employee names/info when asked "which employee" or "who"
+Return ONLY the SQL query, no explanations:"""
+        r = self.client.chat.completions.create(
+            model="gpt-4o-mini",
+            messages=[
+                {"role": "system", "content": "SQL expert. Generate queries using proper JOINs. Return only SQL."},
+                {"role": "user", "content": prompt}
+            ],
+            temperature=0
+        )
+        return r.choices[0].message.content.strip().replace("```sql", "").replace("```", "").strip()
     def _fuzzy(self, q: str, t: str) -> str:
         """Fuzzy."""
         try:
@@ -506,7 +541,7 @@ JSON:"""
         return instance
     def _gen_sql(self, q: str, t: str) -> str:
-        """SQL."""
+        """SQL for single table."""
         schema = self.schema_info.get(t, {})
         sample = pd.read_sql_query(f"SELECT * FROM {t} LIMIT 3", self.conn).to_string(index=False)
         cols = ", ".join([f"{c} ({d})" for c, d in schema.items()])

{querysutra-0.5.2.dist-info → querysutra-0.5.3.dist-info}/WHEEL RENAMED Viewed

File without changes

{querysutra-0.5.2.dist-info → querysutra-0.5.3.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{querysutra-0.5.2.dist-info → querysutra-0.5.3.dist-info}/top_level.txt RENAMED Viewed

File without changes

QuerySUTRA 0.5.2__py3-none-any.whl → 0.5.3__py3-none-any.whl

QuerySUTRA 0.5.2__py3-none-any.whl → 0.5.3__py3-none-any.whl