querysutra-0.5.1-py3-none-any.whl → querysutra-0.5.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
querysutra-0.5.2.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: QuerySUTRA
-Version: 0.5.1
+Version: 0.5.2
 Summary: SUTRA
 Author: Aditya Batta
 License: MIT
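
The only metadata change is the version field. For readers who want to confirm what pip actually resolved, a quick check via the standard library (assumes QuerySUTRA is installed locally):

    from importlib.metadata import metadata, version

    print(version("QuerySUTRA"))        # "0.5.2" for the new wheel
    md = metadata("QuerySUTRA")
    print(md["Author"], md["License"])  # Aditya Batta MIT (unchanged)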
querysutra-0.5.2.dist-info/RECORD CHANGED
@@ -1,5 +1,5 @@
-querysutra-0.5.1.dist-info/licenses/LICENSE,sha256=F-4b93u0OVrVwGXgMwBRq6MlGyUT9zmre1oh5Gft5Ts,1066
-sutra/__init__.py,sha256=fCBD8dtNCkIaglLrLPBC4UGJxYPUJ7GyCfBh7zj8bLg,118
+querysutra-0.5.2.dist-info/licenses/LICENSE,sha256=F-4b93u0OVrVwGXgMwBRq6MlGyUT9zmre1oh5Gft5Ts,1066
+sutra/__init__.py,sha256=25HUMETpmA1tlMl5j-ajdo9MRXljSZBrirSTH7w7jIc,118
 sutra/cache_manager.py,sha256=e0AAeUqoR-aiqzZ3fB-IDvpJ4JA6-YBFyRJxusEnIrA,3082
 sutra/clear_cache.py,sha256=rVIz29p7V11Uh6oHXeaWpFtYXXv-2OED91cHMAWWxtQ,187
 sutra/core.py,sha256=R_JbOlZTukegP92Dr-WLsdr632_otFN7o9qSvcxyBtw,10497
@@ -11,7 +11,7 @@ sutra/feedback_matcher.py,sha256=WXYpGtFJnOyYQOzy-z8uBiUWH5vyJJOMS1NwEYzNfic,286
 sutra/nlp_processor.py,sha256=wMS1hz1aGWjSwPUD7lSNBbQapFtLgF2l65j0QKXQOd0,5461
 sutra/schema_embeddings.py,sha256=bVPzpJOdYTyUdG2k3ZdgYJLrX2opHBx68RIjJcMlueo,9732
 sutra/schema_generator.py,sha256=BX_vXmnvSGc6nCBx40WLSoNL3WIYPDahd1cEYloyY4M,1925
-sutra/sutra.py,sha256=A2qX0tm2eaxVTU4yNKFk8v07suYaD86P1degwBhAyGk,22919
+sutra/sutra.py,sha256=XgNCY8QPOod0-ymt6R50JMaHJetyfTsElzyvNHpYStw,20664
 sutra/sutra_client.py,sha256=PYYDGqVbA9pB-Zcsm52i9KarwijCIGVZOThgONZP6Vs,14203
 sutra/sutra_core.py,sha256=diaWOXUHn1wrqCQrBhLKL612tMQioaqx-ILc3y9-CqM,11708
 sutra/sutra_simple.py,sha256=rnqzG7OAt4p64XtO0peMqHS1pG5tdA8U3EYTMVsq7BE,23201
@@ -22,7 +22,7 @@ tests/test_sutra.py,sha256=6Z4SoIuBzza101304I7plkyPVkUBbjIxR8uPs9z5ntg,2383
 utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 utils/file_utils.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 utils/text_utils.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-querysutra-0.5.1.dist-info/METADATA,sha256=uiNLBUFwgNkwo1NfMYkg7uZLzfgzoEnTncNwweRnenY,7258
-querysutra-0.5.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-querysutra-0.5.1.dist-info/top_level.txt,sha256=9v0buw21eo5LaUU_3Cf9b9MqRyEvtM9cHaOuEXUKVqM,18
-querysutra-0.5.1.dist-info/RECORD,,
+querysutra-0.5.2.dist-info/METADATA,sha256=8brpcR8UxQwuz28hi8oUL8F5Dfug5AcFk_SdReJlWd0,7258
+querysutra-0.5.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+querysutra-0.5.2.dist-info/top_level.txt,sha256=9v0buw21eo5LaUU_3Cf9b9MqRyEvtM9cHaOuEXUKVqM,18
+querysutra-0.5.2.dist-info/RECORD,,
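
Only three content hashes change in this release: sutra/__init__.py, sutra/sutra.py, and the regenerated METADATA; every other entry is byte-identical, with only the dist-info paths renamed. To spot-check a file against its RECORD line, a minimal sketch (wheel RECORDs use urlsafe-base64 SHA-256 with the '=' padding stripped, per PEP 376/PEP 427; the path below is illustrative):

    import base64
    import hashlib

    def record_hash(path: str) -> str:
        # RECORD hash field: "sha256=" + urlsafe-base64 digest, '=' padding removed
        with open(path, "rb") as f:
            digest = hashlib.sha256(f.read()).digest()
        return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")

    print(record_hash("sutra/sutra.py"))  # compare against the +sutra/sutra.py line above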
sutra/__init__.py CHANGED
@@ -1,4 +1,4 @@
-"""QuerySUTRA v0.5.1"""
-__version__="0.5.1"
+"""QuerySUTRA v0.5.2"""
+__version__="0.5.2"
 from sutra.sutra import SUTRA,QueryResult
 __all__=["SUTRA","QueryResult"]
sutra/sutra.py CHANGED
@@ -1,10 +1,5 @@
-"""
-QuerySUTRA v0.5.0 - BULLETPROOF
-GUARANTEED to create multiple tables with proper keys
-NEVER falls back to single table
-"""
-
-__version__ = "0.5.0"
+"""QuerySUTRA v0.5.1 - BULLETPROOF & FIXED"""
+__version__ = "0.5.1"
 __author__ = "Aditya Batta"
 __all__ = ["SUTRA", "QueryResult"]
 
@@ -46,7 +41,7 @@ except:
 
 
 class SUTRA:
-    """SUTRA - BULLETPROOF AI EXTRACTION"""
+    """SUTRA - BULLETPROOF"""
 
     def __init__(self, api_key: Optional[str] = None, db: str = "sutra.db",
                  use_embeddings: bool = False, fuzzy_match: bool = True,
@@ -77,10 +72,10 @@ class SUTRA:
            pass
 
        self._refresh_schema()
-        print(f"QuerySUTRA v0.5.0 Ready")
+        print(f"QuerySUTRA v0.5.1 Ready")
 
    def upload(self, data: Union[str, pd.DataFrame], name: Optional[str] = None) -> 'SUTRA':
-        """Upload data."""
+        """Upload."""
        if isinstance(data, pd.DataFrame):
            self._store(data, name or "data")
            return self
@@ -110,7 +105,7 @@ class SUTRA:
        return self
 
    def _pdf(self, path: Path, name: str):
-        """BULLETPROOF PDF extraction - GUARANTEED to create multiple tables."""
+        """BULLETPROOF PDF - ALWAYS creates multiple tables."""
        if not HAS_PYPDF2:
            raise ImportError("pip install PyPDF2")
 
@@ -120,127 +115,104 @@
            text = "".join([p.extract_text() + "\n" for p in PyPDF2.PdfReader(f).pages])
 
        if not self.client:
-            print("No API key - using simple extraction")
-            self._store(pd.DataFrame({'line': range(1, len(text.split('\n'))), 'text': text.split('\n')}), name)
+            print("ERROR: No API key! Set api_key parameter")
            return
 
-        print("AI: Extracting entities (BULLETPROOF mode)...")
+        print("AI: Extracting...")
 
-        # TRY 3 TIMES with progressively simpler prompts
+        # TRY 3 TIMES
        entities = None
-
-        # ATTEMPT 1: Full extraction
-        entities = self._extract(text, attempt=1)
-
-        # ATTEMPT 2: Simpler prompt
-        if not entities or len(entities) == 0:
-            print(" Retry with simpler prompt...")
-            entities = self._extract(text, attempt=2)
-
-        # ATTEMPT 3: Basic extraction
-        if not entities or len(entities) == 0:
-            print(" Final retry with basic prompt...")
-            entities = self._extract(text, attempt=3)
-
-        # SUCCESS - Create tables
+        for attempt in [1, 2, 3]:
+            entities = self._extract(text, attempt)
+            if entities and len(entities) > 0:
+                break
+            if attempt < 3:
+                print(f" Retry {attempt+1}/3...")
+
+        # Create tables from entities
        if entities and len(entities) > 0:
-            print(f"SUCCESS! Extracted {len(entities)} entity types:")
+            print(f"Extracted {len(entities)} entity types:")
            for etype, recs in entities.items():
                if recs and len(recs) > 0:
-                    # Renumber IDs
                    for idx, rec in enumerate(recs, 1):
                        rec['id'] = idx
-
-                    df = pd.DataFrame(recs)
-                    self._store(df, f"{name}_{etype}")
-                    print(f" {etype}: {len(df)} rows")
+                    self._store(pd.DataFrame(recs), f"{name}_{etype}")
+                    print(f" {etype}: {len(recs)} rows")
            return
 
-        # LAST RESORT - Force at least people table from text analysis
-        print("WARNING: AI extraction failed 3 times - using text analysis...")
-
-        # Try to extract at least names/emails with regex
+        # REGEX FALLBACK - FIXED
+        print("Using regex fallback...")
        people = []
        emails = re.findall(r'[\w\.-]+@[\w\.-]+\.\w+', text)
-        names = re.findall(r'(?:Employee|Mr\.|Mrs\.|Ms\.|Dr\.)\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)', text)
 
-        for i, (email, name_match) in enumerate(zip(emails[:50], names[:50] if names else [f"Person {i+1}" for i in range(len(emails))]), 1):
-            people.append({'id': i, 'name': name_match if isinstance(name_match, str) else f"Person {i}", 'email': email})
+        # Extract names from common patterns
+        name_patterns = [
+            r'(?:Employee|Name|Mr\.|Mrs\.|Ms\.|Dr\.)\s*[:\-]?\s*([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)',
+            r'([A-Z][a-z]+\s+[A-Z][a-z]+)\s+(?:lives|resides|works|is based)',
+            r'\*\*([A-Z][a-z]+\s+[A-Z][a-z]+)\*\*'
+        ]
+
+        names = []
+        for pattern in name_patterns:
+            names.extend(re.findall(pattern, text))
+            if len(names) >= len(emails):
+                break
+
+        # Match emails to names
+        max_people = min(len(emails), 50)
+        for i in range(max_people):
+            people.append({
+                'id': i + 1,
+                'name': names[i] if i < len(names) else f"Person {i+1}",
+                'email': emails[i] if i < len(emails) else f"person{i+1}@unknown.com"
+            })
 
        if people:
            self._store(pd.DataFrame(people), f"{name}_people")
            print(f" Extracted {len(people)} people via regex")
        else:
-            # Absolute fallback
-            self._store(pd.DataFrame({'line': range(1, min(100, len(text.split('\n')))), 'text': text.split('\n')[:100]}), name)
+            # Absolute last resort
+            lines = [l.strip() for l in text.split('\n') if l.strip()][:100]
+            self._store(pd.DataFrame({'line': range(1, len(lines)+1), 'text': lines}), name)
 
    def _extract(self, text: str, attempt: int) -> Dict:
-        """Extract with different strategies."""
+        """Extract with 3 different strategies."""
        if not self.client:
            return {}
 
        try:
            if attempt == 1:
-                # Detailed extraction
-                sys_msg = "You are a JSON extraction expert. Extract ALL entities with unique sequential IDs and proper foreign keys. Return ONLY valid JSON, absolutely no other text."
-                usr_msg = f"""Extract ALL structured entities from this text into a JSON object.
+                sys_msg = "Extract entities as JSON. Return ONLY valid JSON."
+                usr_msg = f"""Extract ALL entities from text.
 
-Text (first 15000 chars):
+Text:
 {text[:15000]}
 
-Create separate arrays for these entity types (only if data exists):
-- people: id (int), name (str), email (str), phone (str), address (str), city (str), state (str), zip (str)
-- skills: id (int), person_id (int), skill_name (str), proficiency (str), years (int)
-- technologies: id (int), person_id (int), technology (str), category (str), proficiency (str)
-- projects: id (int), person_id (int), project_name (str), description (str), start_date (str), end_date (str)
-- certifications: id (int), person_id (int), cert_name (str), issuer (str), date_obtained (str)
-- education: id (int), person_id (int), degree (str), institution (str), graduation_year (str)
-- work_experience: id (int), person_id (int), company (str), title (str), start_date (str), end_date (str)
+Return JSON with: people, skills, technologies, projects, certifications, education, work_experience
 
-CRITICAL RULES:
-1. IDs must be unique sequential integers: 1, 2, 3, 4...
-2. person_id in related tables MUST reference valid people.id values
-3. Extract EVERY person, skill, technology, project you find
-4. Return ONLY the JSON object, no markdown, no explanations
+Example:
+{{"people":[{{"id":1,"name":"Sarah Johnson","email":"sarah@co.com","city":"New York","state":"NY"}},{{"id":2,"name":"Michael Chen","email":"michael@co.com","city":"SF","state":"CA"}}],"skills":[{{"id":1,"person_id":1,"skill_name":"Python","proficiency":"Expert"}}]}}
 
-Example output format:
-{{
-  "people": [
-    {{"id": 1, "name": "Sarah Johnson", "email": "sarah@company.com", "phone": "(212) 555-0147", "city": "New York", "state": "NY"}},
-    {{"id": 2, "name": "Michael Chen", "email": "michael@company.com", "phone": "(415) 555-0283", "city": "San Francisco", "state": "CA"}}
-  ],
-  "skills": [
-    {{"id": 1, "person_id": 1, "skill_name": "Python", "proficiency": "Expert", "years": 5}},
-    {{"id": 2, "person_id": 1, "skill_name": "SQL", "proficiency": "Advanced", "years": 3}},
-    {{"id": 3, "person_id": 2, "skill_name": "Product Management", "proficiency": "Expert", "years": 7}}
-  ]
-}}
+Rules: Unique IDs (1,2,3...), person_id references people.id
 
-Now extract from the text above. Return ONLY valid JSON:"""
+JSON:"""
 
            elif attempt == 2:
-                # Simplified extraction
-                sys_msg = "Extract entities as JSON. Return only JSON."
+                sys_msg = "Return JSON."
                usr_msg = f"""Text: {text[:10000]}
 
-Extract people, skills, technologies as JSON:
-{{"people":[{{"id":1,"name":"...","email":"...","city":"..."}}],"skills":[{{"id":1,"person_id":1,"skill_name":"..."}}]}}
-
-Rules: Unique IDs (1,2,3...), person_id links to people.id
+Extract people as JSON:
+{{"people":[{{"id":1,"name":"...","email":"...","city":"..."}}]}}
 
-JSON only:"""
+JSON:"""
 
            else:
-                # Basic extraction
-                sys_msg = "Return JSON only."
-                usr_msg = f"""Text: {text[:8000]}
-
-Find people with names, emails, cities. Return as JSON:
-{{"people":[{{"id":1,"name":"John","email":"john@co.com","city":"NYC"}}]}}
+                sys_msg = "JSON only."
+                usr_msg = f"""Find names and emails in: {text[:8000]}
 
-JSON:"""
+{{"people":[{{"id":1,"name":"John","email":"john@co.com"}}]}}"""
 
-            resp = self.client.chat.completions.create(
+            r = self.client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {"role": "system", "content": sys_msg},
@@ -250,26 +222,18 @@ JSON:"""
                max_tokens=12000
            )
 
-            raw = resp.choices[0].message.content.strip()
-
-            # AGGRESSIVE JSON extraction
-            raw = raw.replace("```json", "").replace("```", "").replace("JSON:", "").replace("json", "").strip()
+            raw = r.choices[0].message.content.strip()
+            raw = raw.replace("```json", "").replace("```", "").replace("JSON:", "").strip()
 
-            # Find JSON object
            start = raw.find('{')
            end = raw.rfind('}') + 1
 
            if start < 0 or end <= start:
                return {}
 
-            json_str = raw[start:end]
+            result = json.loads(raw[start:end])
 
-            # Parse
-            result = json.loads(json_str)
-
-            # Validate
            if isinstance(result, dict) and len(result) > 0:
-                # Check if at least one entity type has data
                has_data = any(isinstance(v, list) and len(v) > 0 for v in result.values())
                if has_data:
                    return result
@@ -277,7 +241,7 @@ JSON:"""
            return {}
 
        except Exception as e:
-            print(f" Attempt {attempt} failed: {e}")
+            print(f" Attempt {attempt} failed: {str(e)[:100]}")
            return {}
 
    def _docx(self, path: Path, name: str):
@@ -292,13 +256,15 @@ JSON:"""
                self._store(pd.DataFrame(data[1:], columns=data[0]), f"{name}_t{i+1}")
        else:
            text = "\n".join([p.text for p in doc.paragraphs])
-            self._store(pd.DataFrame({'line': range(len(text.split('\n'))), 'text': text.split('\n')}), name)
+            lines = [l.strip() for l in text.split('\n') if l.strip()]
+            self._store(pd.DataFrame({'line': range(1, len(lines)+1), 'text': lines}), name)
 
    def _txt(self, path: Path, name: str):
        """TXT."""
        with open(path, 'r', encoding='utf-8') as f:
            text = f.read()
-        self._store(pd.DataFrame({'line': range(len(text.split('\n'))), 'text': text.split('\n')}), name)
+        lines = [l.strip() for l in text.split('\n') if l.strip()]
+        self._store(pd.DataFrame({'line': range(1, len(lines)+1), 'text': lines}), name)
 
    def _store(self, df: pd.DataFrame, name: str):
        """Store."""
@@ -353,7 +319,7 @@ JSON:"""
            return QueryResult(False, sql, pd.DataFrame(), None, str(e))
 
    def _fuzzy(self, q: str, t: str) -> str:
-        """Fuzzy match."""
+        """Fuzzy."""
        try:
            cols = [c for c, d in self.schema_info.get(t, {}).items() if 'TEXT' in d]
            if not cols:
@@ -372,7 +338,7 @@ JSON:"""
        return q
 
    def _check_cache(self, q: str, t: str) -> Optional['QueryResult']:
-        """Check cache."""
+        """Cache."""
        if not self.query_embeddings:
            return None
        emb = self.embedding_model.encode([q])[0]
@@ -386,7 +352,7 @@ JSON:"""
        return self.query_embeddings[best]['result'] if best else None
 
    def _store_cache(self, q: str, t: str, r: 'QueryResult'):
-        """Store cache."""
+        """Store."""
        emb = self.embedding_model.encode([q])[0]
        self.query_embeddings[q] = {'table': t, 'embedding': emb, 'result': r}
 
@@ -413,7 +379,7 @@ JSON:"""
        return None
 
    def tables(self) -> Dict:
-        """List tables."""
+        """Tables."""
        t = self._get_tables()
        print("\n" + "="*70)
        print("TABLES")
@@ -469,7 +435,7 @@ JSON:"""
            return QueryResult(False, query, pd.DataFrame(), None, str(e))
 
    def save_to_mysql(self, host: str, user: str, password: str, database: str, port: int = 3306):
-        """MySQL export."""
+        """MySQL."""
        try:
            from sqlalchemy import create_engine
            import mysql.connector
@@ -506,14 +472,14 @@ JSON:"""
 
    @classmethod
    def load_from_db(cls, db_path: str, api_key: Optional[str] = None, **kwargs):
-        """Load database."""
+        """Load."""
        if not Path(db_path).exists():
            raise FileNotFoundError(f"Not found: {db_path}")
        return cls(api_key=api_key, db=db_path, **kwargs)
 
    @classmethod
    def connect_mysql(cls, host: str, user: str, password: str, database: str, port: int = 3306, api_key: Optional[str] = None, **kwargs):
-        """Connect MySQL."""
+        """MySQL."""
        try:
            from sqlalchemy import create_engine
            import mysql.connector
@@ -540,7 +506,7 @@ JSON:"""
        return instance
 
    def _gen_sql(self, q: str, t: str) -> str:
-        """Generate SQL."""
+        """SQL."""
        schema = self.schema_info.get(t, {})
        sample = pd.read_sql_query(f"SELECT * FROM {t} LIMIT 3", self.conn).to_string(index=False)
        cols = ", ".join([f"{c} ({d})" for c, d in schema.items()])