QuerySUTRA 0.5.1-py3-none-any.whl → 0.5.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {querysutra-0.5.1.dist-info → querysutra-0.5.3.dist-info}/METADATA +1 -1
- {querysutra-0.5.1.dist-info → querysutra-0.5.3.dist-info}/RECORD +7 -7
- sutra/__init__.py +2 -2
- sutra/sutra.py +141 -140
- {querysutra-0.5.1.dist-info → querysutra-0.5.3.dist-info}/WHEEL +0 -0
- {querysutra-0.5.1.dist-info → querysutra-0.5.3.dist-info}/licenses/LICENSE +0 -0
- {querysutra-0.5.1.dist-info → querysutra-0.5.3.dist-info}/top_level.txt +0 -0
{querysutra-0.5.1.dist-info → querysutra-0.5.3.dist-info}/RECORD
@@ -1,5 +1,5 @@
-querysutra-0.5.
-sutra/__init__.py,sha256=
+querysutra-0.5.3.dist-info/licenses/LICENSE,sha256=F-4b93u0OVrVwGXgMwBRq6MlGyUT9zmre1oh5Gft5Ts,1066
+sutra/__init__.py,sha256=25HUMETpmA1tlMl5j-ajdo9MRXljSZBrirSTH7w7jIc,118
 sutra/cache_manager.py,sha256=e0AAeUqoR-aiqzZ3fB-IDvpJ4JA6-YBFyRJxusEnIrA,3082
 sutra/clear_cache.py,sha256=rVIz29p7V11Uh6oHXeaWpFtYXXv-2OED91cHMAWWxtQ,187
 sutra/core.py,sha256=R_JbOlZTukegP92Dr-WLsdr632_otFN7o9qSvcxyBtw,10497
@@ -11,7 +11,7 @@ sutra/feedback_matcher.py,sha256=WXYpGtFJnOyYQOzy-z8uBiUWH5vyJJOMS1NwEYzNfic,286
 sutra/nlp_processor.py,sha256=wMS1hz1aGWjSwPUD7lSNBbQapFtLgF2l65j0QKXQOd0,5461
 sutra/schema_embeddings.py,sha256=bVPzpJOdYTyUdG2k3ZdgYJLrX2opHBx68RIjJcMlueo,9732
 sutra/schema_generator.py,sha256=BX_vXmnvSGc6nCBx40WLSoNL3WIYPDahd1cEYloyY4M,1925
-sutra/sutra.py,sha256=
+sutra/sutra.py,sha256=61juV3zlMau4UZJ-5IxjaN-Bc1XBP8w2vkYfum-aXlY,21979
 sutra/sutra_client.py,sha256=PYYDGqVbA9pB-Zcsm52i9KarwijCIGVZOThgONZP6Vs,14203
 sutra/sutra_core.py,sha256=diaWOXUHn1wrqCQrBhLKL612tMQioaqx-ILc3y9-CqM,11708
 sutra/sutra_simple.py,sha256=rnqzG7OAt4p64XtO0peMqHS1pG5tdA8U3EYTMVsq7BE,23201
@@ -22,7 +22,7 @@ tests/test_sutra.py,sha256=6Z4SoIuBzza101304I7plkyPVkUBbjIxR8uPs9z5ntg,2383
 utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 utils/file_utils.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 utils/text_utils.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-querysutra-0.5.
-querysutra-0.5.
-querysutra-0.5.
-querysutra-0.5.
+querysutra-0.5.3.dist-info/METADATA,sha256=yFffBSYGfbLrYnXA7OFGHk1mO37fpUV-0iglmHXbAVQ,7258
+querysutra-0.5.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+querysutra-0.5.3.dist-info/top_level.txt,sha256=9v0buw21eo5LaUU_3Cf9b9MqRyEvtM9cHaOuEXUKVqM,18
+querysutra-0.5.3.dist-info/RECORD,,
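Each RECORD row has the form path,sha256=<digest>,size, where the digest is the urlsafe-base64-encoded SHA-256 of the file with the trailing = padding stripped. A minimal sketch for recomputing one of these entries from an unpacked wheel (the path argument is a placeholder):

    import base64
    import hashlib
    from pathlib import Path

    def record_entry(path: str) -> str:
        # Rebuild a wheel RECORD row: path,sha256=<urlsafe b64 digest, unpadded>,size
        data = Path(path).read_bytes()
        digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=").decode()
        return f"{path},sha256={digest},{len(data)}"

    # record_entry("sutra/__init__.py") should reproduce the matching row above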
sutra/__init__.py CHANGED
@@ -1,10 +1,5 @@
-"""
-
-GUARANTEED to create multiple tables with proper keys
-NEVER falls back to single table
-"""
-
-__version__ = "0.5.0"
+"""QuerySUTRA v0.5.2 - FIXED: Smart table selection"""
+__version__ = "0.5.2"
 __author__ = "Aditya Batta"
 __all__ = ["SUTRA", "QueryResult"]
 
sutra/sutra.py CHANGED
@@ -46,7 +41,7 @@ except:
 
 
 class SUTRA:
-    """SUTRA -
+    """SUTRA - FIXED: Considers ALL tables"""
 
     def __init__(self, api_key: Optional[str] = None, db: str = "sutra.db",
                  use_embeddings: bool = False, fuzzy_match: bool = True,
@@ -77,10 +72,10 @@ class SUTRA:
             pass
 
         self._refresh_schema()
-        print(f"QuerySUTRA v0.5.
+        print(f"QuerySUTRA v0.5.2 Ready")
 
     def upload(self, data: Union[str, pd.DataFrame], name: Optional[str] = None) -> 'SUTRA':
-        """Upload
+        """Upload."""
         if isinstance(data, pd.DataFrame):
             self._store(data, name or "data")
             return self
@@ -110,7 +105,7 @@ class SUTRA:
         return self
 
     def _pdf(self, path: Path, name: str):
-        """
+        """PDF extraction."""
         if not HAS_PYPDF2:
             raise ImportError("pip install PyPDF2")
 
@@ -120,164 +115,119 @@ class SUTRA:
         text = "".join([p.extract_text() + "\n" for p in PyPDF2.PdfReader(f).pages])
 
         if not self.client:
-            print("No API key
-            self._store(pd.DataFrame({'line': range(1, len(text.split('\n'))), 'text': text.split('\n')}), name)
+            print("ERROR: No API key!")
             return
 
-        print("AI: Extracting
+        print("AI: Extracting...")
 
-        # TRY 3 TIMES with progressively simpler prompts
         entities = None
+        for attempt in [1, 2, 3]:
+            entities = self._extract(text, attempt)
+            if entities and len(entities) > 0:
+                break
+            if attempt < 3:
+                print(f"  Retry {attempt+1}/3...")
 
-        # ATTEMPT 1: Full extraction
-        entities = self._extract(text, attempt=1)
-
-        # ATTEMPT 2: Simpler prompt
-        if not entities or len(entities) == 0:
-            print("  Retry with simpler prompt...")
-            entities = self._extract(text, attempt=2)
-
-        # ATTEMPT 3: Basic extraction
-        if not entities or len(entities) == 0:
-            print("  Final retry with basic prompt...")
-            entities = self._extract(text, attempt=3)
-
-        # SUCCESS - Create tables
         if entities and len(entities) > 0:
-            print(f"
+            print(f"Extracted {len(entities)} entity types:")
             for etype, recs in entities.items():
                 if recs and len(recs) > 0:
-                    # Renumber IDs
                     for idx, rec in enumerate(recs, 1):
                         rec['id'] = idx
-
-
-                    self._store(df, f"{name}_{etype}")
-                    print(f"  {etype}: {len(df)} rows")
+                    self._store(pd.DataFrame(recs), f"{name}_{etype}")
+                    print(f"  {etype}: {len(recs)} rows")
             return
 
-
-        print("WARNING: AI extraction failed 3 times - using text analysis...")
-
-        # Try to extract at least names/emails with regex
+        print("Using regex fallback...")
         people = []
         emails = re.findall(r'[\w\.-]+@[\w\.-]+\.\w+', text)
-
-
-
-
+        name_patterns = [
+            r'(?:Employee|Name|Mr\.|Mrs\.|Ms\.|Dr\.)\s*[:\-]?\s*([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)',
+            r'([A-Z][a-z]+\s+[A-Z][a-z]+)\s+(?:lives|resides|works|is based)',
+        ]
+
+        names = []
+        for pattern in name_patterns:
+            names.extend(re.findall(pattern, text))
+            if len(names) >= len(emails):
+                break
+
+        max_people = min(len(emails), 50)
+        for i in range(max_people):
+            people.append({
+                'id': i + 1,
+                'name': names[i] if i < len(names) else f"Person {i+1}",
+                'email': emails[i] if i < len(emails) else f"person{i+1}@unknown.com"
+            })
 
         if people:
             self._store(pd.DataFrame(people), f"{name}_people")
-            print(f"  Extracted {len(people)} people
+            print(f"  Extracted {len(people)} people")
         else:
-
-            self._store(pd.DataFrame({'line': range(1,
+            lines = [l.strip() for l in text.split('\n') if l.strip()][:100]
+            self._store(pd.DataFrame({'line': range(1, len(lines)+1), 'text': lines}), name)
 
     def _extract(self, text: str, attempt: int) -> Dict:
-        """Extract
+        """Extract."""
         if not self.client:
             return {}
 
         try:
             if attempt == 1:
-
-
-                usr_msg = f"""Extract ALL structured entities from this text into a JSON object.
+                sys_msg = "Extract entities as JSON. Return ONLY valid JSON."
+                usr_msg = f"""Extract ALL entities.
 
-Text
+Text:
 {text[:15000]}
 
-
-- people: id (int), name (str), email (str), phone (str), address (str), city (str), state (str), zip (str)
-- skills: id (int), person_id (int), skill_name (str), proficiency (str), years (int)
-- technologies: id (int), person_id (int), technology (str), category (str), proficiency (str)
-- projects: id (int), person_id (int), project_name (str), description (str), start_date (str), end_date (str)
-- certifications: id (int), person_id (int), cert_name (str), issuer (str), date_obtained (str)
-- education: id (int), person_id (int), degree (str), institution (str), graduation_year (str)
-- work_experience: id (int), person_id (int), company (str), title (str), start_date (str), end_date (str)
+JSON with: people, skills, technologies, projects, certifications, education, work_experience
 
-
-1.
-2. person_id in related tables MUST reference valid people.id values
-3. Extract EVERY person, skill, technology, project you find
-4. Return ONLY the JSON object, no markdown, no explanations
+Example:
+{{"people":[{{"id":1,"name":"Sarah","email":"s@co.com","city":"NYC","state":"NY"}}],"skills":[{{"id":1,"person_id":1,"skill_name":"Python"}}]}}
 
-
-{{
-"people": [
-{{"id": 1, "name": "Sarah Johnson", "email": "sarah@company.com", "phone": "(212) 555-0147", "city": "New York", "state": "NY"}},
-{{"id": 2, "name": "Michael Chen", "email": "michael@company.com", "phone": "(415) 555-0283", "city": "San Francisco", "state": "CA"}}
-],
-"skills": [
-{{"id": 1, "person_id": 1, "skill_name": "Python", "proficiency": "Expert", "years": 5}},
-{{"id": 2, "person_id": 1, "skill_name": "SQL", "proficiency": "Advanced", "years": 3}},
-{{"id": 3, "person_id": 2, "skill_name": "Product Management", "proficiency": "Expert", "years": 7}}
-]
-}}
+Unique IDs (1,2,3...), person_id links to people.id
 
-
-
+JSON:"""
             elif attempt == 2:
-
-                sys_msg = "Extract entities as JSON. Return only JSON."
+                sys_msg = "Return JSON."
                 usr_msg = f"""Text: {text[:10000]}
 
-Extract people
-{{"people":[{{"id":1,"name":"...","email":"..."
-
-Rules: Unique IDs (1,2,3...), person_id links to people.id
+Extract people:
+{{"people":[{{"id":1,"name":"...","email":"..."}}]}}
 
-JSON
-
+JSON:"""
             else:
-
-
-                usr_msg = f"""Text: {text[:8000]}
+                sys_msg = "JSON."
+                usr_msg = f"""Names/emails from: {text[:8000]}
 
-
-{{"people":[{{"id":1,"name":"John","email":"john@co.com","city":"NYC"}}]}}
-
-JSON:"""
+{{"people":[{{"id":1,"name":"John","email":"j@co.com"}}]}}"""
 
-
+            r = self.client.chat.completions.create(
                 model="gpt-4o-mini",
-                messages=[
-                    {"role": "system", "content": sys_msg},
-                    {"role": "user", "content": usr_msg}
-                ],
+                messages=[{"role": "system", "content": sys_msg}, {"role": "user", "content": usr_msg}],
                 temperature=0,
                 max_tokens=12000
             )
 
-            raw =
-
-            # AGGRESSIVE JSON extraction
-            raw = raw.replace("```json", "").replace("```", "").replace("JSON:", "").replace("json", "").strip()
+            raw = r.choices[0].message.content.strip()
+            raw = raw.replace("```json", "").replace("```", "").replace("JSON:", "").strip()
 
-            # Find JSON object
             start = raw.find('{')
             end = raw.rfind('}') + 1
 
             if start < 0 or end <= start:
                 return {}
 
-
+            result = json.loads(raw[start:end])
 
-            # Parse
-            result = json.loads(json_str)
-
-            # Validate
             if isinstance(result, dict) and len(result) > 0:
-                # Check if at least one entity type has data
                 has_data = any(isinstance(v, list) and len(v) > 0 for v in result.values())
                 if has_data:
                     return result
-
             return {}
 
         except Exception as e:
-            print(f"  Attempt {attempt} failed: {e}")
+            print(f"  Attempt {attempt} failed: {str(e)[:100]}")
             return {}
 
     def _docx(self, path: Path, name: str):
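The cleanup that _extract performs on the model output above (strip code fences, slice from the first '{' to the last '}', then json.loads) is a reusable defensive-parsing technique. A standalone sketch of the same logic with an explicit JSONDecodeError guard added; the helper name is mine, not part of the package:

    import json
    from typing import Dict

    def parse_json_block(raw: str) -> Dict:
        # Drop markdown fences and any "JSON:" prefix, then keep only the
        # outermost {...} span before parsing, as _extract does.
        raw = raw.replace("```json", "").replace("```", "").replace("JSON:", "").strip()
        start, end = raw.find("{"), raw.rfind("}") + 1
        if start < 0 or end <= start:
            return {}
        try:
            return json.loads(raw[start:end])
        except json.JSONDecodeError:
            return {}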
@@ -292,13 +242,15 @@ JSON:"""
             self._store(pd.DataFrame(data[1:], columns=data[0]), f"{name}_t{i+1}")
         else:
             text = "\n".join([p.text for p in doc.paragraphs])
-
+            lines = [l.strip() for l in text.split('\n') if l.strip()]
+            self._store(pd.DataFrame({'line': range(1, len(lines)+1), 'text': lines}), name)
 
     def _txt(self, path: Path, name: str):
         """TXT."""
         with open(path, 'r', encoding='utf-8') as f:
             text = f.read()
-
+        lines = [l.strip() for l in text.split('\n') if l.strip()]
+        self._store(pd.DataFrame({'line': range(1, len(lines)+1), 'text': lines}), name)
 
     def _store(self, df: pd.DataFrame, name: str):
         """Store."""
@@ -313,29 +265,32 @@ JSON:"""
         print(f"  {name}: {len(df)} rows")
 
     def ask(self, q: str, viz: Union[bool, str] = False, table: Optional[str] = None) -> 'QueryResult':
-        """
+        """
+        Query - FIXED: Considers ALL tables, picks best one or joins multiple.
+        """
         if not self.client:
             return QueryResult(False, "", pd.DataFrame(), None, "No API")
 
-
-
-
-
-
-
-
-
+        print(f"\nQuestion: {q}")
+
+        # FIXED: If no table specified, let AI pick the right one(s)
+        if not table:
+            # Get ALL table schemas
+            all_schemas = {}
+            for tbl in self._get_tables():
+                all_schemas[tbl] = {
+                    'columns': list(self.schema_info.get(tbl, {}).keys()),
+                    'row_count': pd.read_sql_query(f"SELECT COUNT(*) FROM {tbl}", self.conn).iloc[0, 0]
+                }
+
+            # Let AI decide which table(s) to use
+            sql = self._gen_sql_smart(q, all_schemas)
+        else:
+            # Use specified table
+            sql = self._gen_sql(q, table)
 
         if self.fuzzy_match:
-            q = self._fuzzy(q,
-
-            key = hashlib.md5(f"{q}:{t}".encode()).hexdigest()
-            if self.cache_queries and self.cache and key in self.cache:
-                sql = self.cache[key]
-            else:
-                sql = self._gen_sql(q, t)
-                if self.cache_queries and self.cache:
-                    self.cache[key] = sql
+            q = self._fuzzy(q, table or self._get_tables()[0])
 
         print(f"SQL: {sql}")
 
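The new branch above iterates self._get_tables(), whose implementation is not part of this diff. For a SQLite connection it is presumably close to the following sketch (the function name and query are assumptions):

    import sqlite3

    def get_tables(conn: sqlite3.Connection) -> list:
        # List user tables from sqlite_master, skipping SQLite's internal tables.
        rows = conn.execute(
            "SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'"
        ).fetchall()
        return [r[0] for r in rows]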
@@ -346,14 +301,60 @@ JSON:"""
             r = QueryResult(True, sql, df, fig)
 
             if self.use_embeddings and self.embedding_model:
-                self._store_cache(q,
+                self._store_cache(q, table or "all", r)
 
             return r
         except Exception as e:
+            print(f"Error: {e}")
             return QueryResult(False, sql, pd.DataFrame(), None, str(e))
 
+    def _gen_sql_smart(self, q: str, all_schemas: Dict) -> str:
+        """
+        FIXED: Generate SQL considering ALL tables and their relationships.
+        """
+        # Build context with ALL tables
+        schema_context = "Database has these tables:\n"
+        for tbl, info in all_schemas.items():
+            schema_context += f"\n{tbl} ({info['row_count']} rows):\n"
+            schema_context += f"  Columns: {', '.join(info['columns'])}\n"
+
+        # Add sample data from key tables
+        samples = ""
+        for tbl in list(all_schemas.keys())[:3]:  # First 3 tables
+            try:
+                sample_df = pd.read_sql_query(f"SELECT * FROM {tbl} LIMIT 2", self.conn)
+                samples += f"\nSample from {tbl}:\n{sample_df.to_string(index=False)}\n"
+            except:
+                pass
+
+        prompt = f"""You are an SQL expert. Generate a query for this question.
+
+{schema_context}
+
+{samples}
+
+Question: {q}
+
+Rules:
+1. Use JOIN if question needs data from multiple tables
+2. If asking about "employee" or "person" info, always include employee_data_people table
+3. Use proper foreign key relationships (person_id references people.id)
+4. Return employee names/info when asked "which employee" or "who"
+
+Return ONLY the SQL query, no explanations:"""
+
+        r = self.client.chat.completions.create(
+            model="gpt-4o-mini",
+            messages=[
+                {"role": "system", "content": "SQL expert. Generate queries using proper JOINs. Return only SQL."},
+                {"role": "user", "content": prompt}
+            ],
+            temperature=0
+        )
+        return r.choices[0].message.content.strip().replace("```sql", "").replace("```", "").strip()
+
     def _fuzzy(self, q: str, t: str) -> str:
-        """Fuzzy
+        """Fuzzy."""
         try:
             cols = [c for c, d in self.schema_info.get(t, {}).items() if 'TEXT' in d]
             if not cols:
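From a caller's perspective, the two code paths above amount to something like this hypothetical session (the API key, file name, table name, and comments are placeholders, not taken from this diff):

    from sutra import SUTRA

    s = SUTRA(api_key="sk-...", db="demo.db")    # prints "QuerySUTRA v0.5.2 Ready"
    s.upload("employees.pdf")                    # AI extraction; regex fallback on failure

    r1 = s.ask("Which employees know Python?")   # no table given: _gen_sql_smart sees all tables
    r2 = s.ask("List every skill", table="employees_skills")  # explicit table: single-table _gen_sql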
@@ -372,7 +373,7 @@ JSON:"""
             return q
 
     def _check_cache(self, q: str, t: str) -> Optional['QueryResult']:
-        """
+        """Cache."""
         if not self.query_embeddings:
             return None
         emb = self.embedding_model.encode([q])[0]
@@ -386,7 +387,7 @@ JSON:"""
         return self.query_embeddings[best]['result'] if best else None
 
     def _store_cache(self, q: str, t: str, r: 'QueryResult'):
-        """Store
+        """Store."""
         emb = self.embedding_model.encode([q])[0]
         self.query_embeddings[q] = {'table': t, 'embedding': emb, 'result': r}
 
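The similarity scoring that produces best in _check_cache falls in the gap between the two hunks above. A plausible sketch of that lookup, assuming cosine similarity against every cached embedding with a fixed threshold (both the metric and the threshold are assumptions):

    import numpy as np

    def best_cached(query_emb, cache: dict, threshold: float = 0.9):
        # Compare the new query's embedding to each cached query's embedding and
        # return the stored result of the closest match above the threshold.
        best, best_sim = None, threshold
        for q, entry in cache.items():
            e = entry['embedding']
            sim = float(np.dot(query_emb, e) / (np.linalg.norm(query_emb) * np.linalg.norm(e)))
            if sim > best_sim:
                best, best_sim = q, sim
        return cache[best]['result'] if best is not None else None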
@@ -413,7 +414,7 @@ JSON:"""
         return None
 
     def tables(self) -> Dict:
-        """
+        """Tables."""
         t = self._get_tables()
         print("\n" + "="*70)
         print("TABLES")
@@ -469,7 +470,7 @@ JSON:"""
             return QueryResult(False, query, pd.DataFrame(), None, str(e))
 
     def save_to_mysql(self, host: str, user: str, password: str, database: str, port: int = 3306):
-        """MySQL
+        """MySQL."""
         try:
             from sqlalchemy import create_engine
             import mysql.connector
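The body of save_to_mysql is cut off right after its imports. The usual pandas/SQLAlchemy pattern for this kind of copy looks like the sketch below; it is a guess at the shape, not the package's actual code:

    import pandas as pd
    from sqlalchemy import create_engine

    def copy_sqlite_to_mysql(conn, tables, host, user, password, database, port=3306):
        # Read each SQLite table into a DataFrame and push it to MySQL via SQLAlchemy.
        engine = create_engine(f"mysql+mysqlconnector://{user}:{password}@{host}:{port}/{database}")
        for t in tables:
            df = pd.read_sql_query(f"SELECT * FROM {t}", conn)
            df.to_sql(t, engine, if_exists="replace", index=False)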
@@ -506,14 +507,14 @@ JSON:"""
 
     @classmethod
     def load_from_db(cls, db_path: str, api_key: Optional[str] = None, **kwargs):
-        """Load
+        """Load."""
         if not Path(db_path).exists():
             raise FileNotFoundError(f"Not found: {db_path}")
         return cls(api_key=api_key, db=db_path, **kwargs)
 
     @classmethod
     def connect_mysql(cls, host: str, user: str, password: str, database: str, port: int = 3306, api_key: Optional[str] = None, **kwargs):
-        """
+        """MySQL."""
         try:
             from sqlalchemy import create_engine
             import mysql.connector
@@ -540,7 +541,7 @@ JSON:"""
         return instance
 
     def _gen_sql(self, q: str, t: str) -> str:
-        """
+        """SQL for single table."""
         schema = self.schema_info.get(t, {})
         sample = pd.read_sql_query(f"SELECT * FROM {t} LIMIT 3", self.conn).to_string(index=False)
         cols = ", ".join([f"{c} ({d})" for c, d in schema.items()])
{querysutra-0.5.1.dist-info → querysutra-0.5.3.dist-info}/WHEEL: file without changes
{querysutra-0.5.1.dist-info → querysutra-0.5.3.dist-info}/licenses/LICENSE: file without changes
{querysutra-0.5.1.dist-info → querysutra-0.5.3.dist-info}/top_level.txt: file without changes