QuerySUTRA 0.5.2__py3-none-any.whl → 0.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: QuerySUTRA
3
- Version: 0.5.2
3
+ Version: 0.5.3
4
4
  Summary: SUTRA
5
5
  Author: Aditya Batta
6
6
  License: MIT
@@ -1,4 +1,4 @@
1
- querysutra-0.5.2.dist-info/licenses/LICENSE,sha256=F-4b93u0OVrVwGXgMwBRq6MlGyUT9zmre1oh5Gft5Ts,1066
1
+ querysutra-0.5.3.dist-info/licenses/LICENSE,sha256=F-4b93u0OVrVwGXgMwBRq6MlGyUT9zmre1oh5Gft5Ts,1066
2
2
  sutra/__init__.py,sha256=25HUMETpmA1tlMl5j-ajdo9MRXljSZBrirSTH7w7jIc,118
3
3
  sutra/cache_manager.py,sha256=e0AAeUqoR-aiqzZ3fB-IDvpJ4JA6-YBFyRJxusEnIrA,3082
4
4
  sutra/clear_cache.py,sha256=rVIz29p7V11Uh6oHXeaWpFtYXXv-2OED91cHMAWWxtQ,187
@@ -11,7 +11,7 @@ sutra/feedback_matcher.py,sha256=WXYpGtFJnOyYQOzy-z8uBiUWH5vyJJOMS1NwEYzNfic,286
11
11
  sutra/nlp_processor.py,sha256=wMS1hz1aGWjSwPUD7lSNBbQapFtLgF2l65j0QKXQOd0,5461
12
12
  sutra/schema_embeddings.py,sha256=bVPzpJOdYTyUdG2k3ZdgYJLrX2opHBx68RIjJcMlueo,9732
13
13
  sutra/schema_generator.py,sha256=BX_vXmnvSGc6nCBx40WLSoNL3WIYPDahd1cEYloyY4M,1925
14
- sutra/sutra.py,sha256=XgNCY8QPOod0-ymt6R50JMaHJetyfTsElzyvNHpYStw,20664
14
+ sutra/sutra.py,sha256=61juV3zlMau4UZJ-5IxjaN-Bc1XBP8w2vkYfum-aXlY,21979
15
15
  sutra/sutra_client.py,sha256=PYYDGqVbA9pB-Zcsm52i9KarwijCIGVZOThgONZP6Vs,14203
16
16
  sutra/sutra_core.py,sha256=diaWOXUHn1wrqCQrBhLKL612tMQioaqx-ILc3y9-CqM,11708
17
17
  sutra/sutra_simple.py,sha256=rnqzG7OAt4p64XtO0peMqHS1pG5tdA8U3EYTMVsq7BE,23201
@@ -22,7 +22,7 @@ tests/test_sutra.py,sha256=6Z4SoIuBzza101304I7plkyPVkUBbjIxR8uPs9z5ntg,2383
22
22
  utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
23
23
  utils/file_utils.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
24
24
  utils/text_utils.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
25
- querysutra-0.5.2.dist-info/METADATA,sha256=8brpcR8UxQwuz28hi8oUL8F5Dfug5AcFk_SdReJlWd0,7258
26
- querysutra-0.5.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
27
- querysutra-0.5.2.dist-info/top_level.txt,sha256=9v0buw21eo5LaUU_3Cf9b9MqRyEvtM9cHaOuEXUKVqM,18
28
- querysutra-0.5.2.dist-info/RECORD,,
25
+ querysutra-0.5.3.dist-info/METADATA,sha256=yFffBSYGfbLrYnXA7OFGHk1mO37fpUV-0iglmHXbAVQ,7258
26
+ querysutra-0.5.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
27
+ querysutra-0.5.3.dist-info/top_level.txt,sha256=9v0buw21eo5LaUU_3Cf9b9MqRyEvtM9cHaOuEXUKVqM,18
28
+ querysutra-0.5.3.dist-info/RECORD,,
sutra/sutra.py CHANGED
@@ -1,5 +1,5 @@
1
- """QuerySUTRA v0.5.1 - BULLETPROOF & FIXED"""
2
- __version__ = "0.5.1"
1
+ """QuerySUTRA v0.5.2 - FIXED: Smart table selection"""
2
+ __version__ = "0.5.2"
3
3
  __author__ = "Aditya Batta"
4
4
  __all__ = ["SUTRA", "QueryResult"]
5
5
 
@@ -41,7 +41,7 @@ except:
41
41
 
42
42
 
43
43
  class SUTRA:
44
- """SUTRA - BULLETPROOF"""
44
+ """SUTRA - FIXED: Considers ALL tables"""
45
45
 
46
46
  def __init__(self, api_key: Optional[str] = None, db: str = "sutra.db",
47
47
  use_embeddings: bool = False, fuzzy_match: bool = True,
@@ -72,7 +72,7 @@ class SUTRA:
72
72
  pass
73
73
 
74
74
  self._refresh_schema()
75
- print(f"QuerySUTRA v0.5.1 Ready")
75
+ print(f"QuerySUTRA v0.5.2 Ready")
76
76
 
77
77
  def upload(self, data: Union[str, pd.DataFrame], name: Optional[str] = None) -> 'SUTRA':
78
78
  """Upload."""
@@ -105,7 +105,7 @@ class SUTRA:
105
105
  return self
106
106
 
107
107
  def _pdf(self, path: Path, name: str):
108
- """BULLETPROOF PDF - ALWAYS creates multiple tables."""
108
+ """PDF extraction."""
109
109
  if not HAS_PYPDF2:
110
110
  raise ImportError("pip install PyPDF2")
111
111
 
@@ -115,12 +115,11 @@ class SUTRA:
115
115
  text = "".join([p.extract_text() + "\n" for p in PyPDF2.PdfReader(f).pages])
116
116
 
117
117
  if not self.client:
118
- print("ERROR: No API key! Set api_key parameter")
118
+ print("ERROR: No API key!")
119
119
  return
120
120
 
121
121
  print("AI: Extracting...")
122
122
 
123
- # TRY 3 TIMES
124
123
  entities = None
125
124
  for attempt in [1, 2, 3]:
126
125
  entities = self._extract(text, attempt)
@@ -129,7 +128,6 @@ class SUTRA:
129
128
  if attempt < 3:
130
129
  print(f" Retry {attempt+1}/3...")
131
130
 
132
- # Create tables from entities
133
131
  if entities and len(entities) > 0:
134
132
  print(f"Extracted {len(entities)} entity types:")
135
133
  for etype, recs in entities.items():
@@ -140,16 +138,12 @@ class SUTRA:
140
138
  print(f" {etype}: {len(recs)} rows")
141
139
  return
142
140
 
143
- # REGEX FALLBACK - FIXED
144
141
  print("Using regex fallback...")
145
142
  people = []
146
143
  emails = re.findall(r'[\w\.-]+@[\w\.-]+\.\w+', text)
147
-
148
- # Extract names from common patterns
149
144
  name_patterns = [
150
145
  r'(?:Employee|Name|Mr\.|Mrs\.|Ms\.|Dr\.)\s*[:\-]?\s*([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)',
151
146
  r'([A-Z][a-z]+\s+[A-Z][a-z]+)\s+(?:lives|resides|works|is based)',
152
- r'\*\*([A-Z][a-z]+\s+[A-Z][a-z]+)\*\*'
153
147
  ]
154
148
 
155
149
  names = []
@@ -158,7 +152,6 @@ class SUTRA:
158
152
  if len(names) >= len(emails):
159
153
  break
160
154
 
161
- # Match emails to names
162
155
  max_people = min(len(emails), 50)
163
156
  for i in range(max_people):
164
157
  people.append({
@@ -169,55 +162,49 @@ class SUTRA:
169
162
 
170
163
  if people:
171
164
  self._store(pd.DataFrame(people), f"{name}_people")
172
- print(f" Extracted {len(people)} people via regex")
165
+ print(f" Extracted {len(people)} people")
173
166
  else:
174
- # Absolute last resort
175
167
  lines = [l.strip() for l in text.split('\n') if l.strip()][:100]
176
168
  self._store(pd.DataFrame({'line': range(1, len(lines)+1), 'text': lines}), name)
177
169
 
178
170
  def _extract(self, text: str, attempt: int) -> Dict:
179
- """Extract with 3 different strategies."""
171
+ """Extract."""
180
172
  if not self.client:
181
173
  return {}
182
174
 
183
175
  try:
184
176
  if attempt == 1:
185
177
  sys_msg = "Extract entities as JSON. Return ONLY valid JSON."
186
- usr_msg = f"""Extract ALL entities from text.
178
+ usr_msg = f"""Extract ALL entities.
187
179
 
188
180
  Text:
189
181
  {text[:15000]}
190
182
 
191
- Return JSON with: people, skills, technologies, projects, certifications, education, work_experience
183
+ JSON with: people, skills, technologies, projects, certifications, education, work_experience
192
184
 
193
185
  Example:
194
- {{"people":[{{"id":1,"name":"Sarah Johnson","email":"sarah@co.com","city":"New York","state":"NY"}},{{"id":2,"name":"Michael Chen","email":"michael@co.com","city":"SF","state":"CA"}}],"skills":[{{"id":1,"person_id":1,"skill_name":"Python","proficiency":"Expert"}}]}}
186
+ {{"people":[{{"id":1,"name":"Sarah","email":"s@co.com","city":"NYC","state":"NY"}}],"skills":[{{"id":1,"person_id":1,"skill_name":"Python"}}]}}
195
187
 
196
- Rules: Unique IDs (1,2,3...), person_id references people.id
188
+ Unique IDs (1,2,3...), person_id links to people.id
197
189
 
198
190
  JSON:"""
199
-
200
191
  elif attempt == 2:
201
192
  sys_msg = "Return JSON."
202
193
  usr_msg = f"""Text: {text[:10000]}
203
194
 
204
- Extract people as JSON:
205
- {{"people":[{{"id":1,"name":"...","email":"...","city":"..."}}]}}
195
+ Extract people:
196
+ {{"people":[{{"id":1,"name":"...","email":"..."}}]}}
206
197
 
207
198
  JSON:"""
208
-
209
199
  else:
210
- sys_msg = "JSON only."
211
- usr_msg = f"""Find names and emails in: {text[:8000]}
200
+ sys_msg = "JSON."
201
+ usr_msg = f"""Names/emails from: {text[:8000]}
212
202
 
213
- {{"people":[{{"id":1,"name":"John","email":"john@co.com"}}]}}"""
203
+ {{"people":[{{"id":1,"name":"John","email":"j@co.com"}}]}}"""
214
204
 
215
205
  r = self.client.chat.completions.create(
216
206
  model="gpt-4o-mini",
217
- messages=[
218
- {"role": "system", "content": sys_msg},
219
- {"role": "user", "content": usr_msg}
220
- ],
207
+ messages=[{"role": "system", "content": sys_msg}, {"role": "user", "content": usr_msg}],
221
208
  temperature=0,
222
209
  max_tokens=12000
223
210
  )
@@ -237,7 +224,6 @@ JSON:"""
237
224
  has_data = any(isinstance(v, list) and len(v) > 0 for v in result.values())
238
225
  if has_data:
239
226
  return result
240
-
241
227
  return {}
242
228
 
243
229
  except Exception as e:
@@ -279,29 +265,32 @@ JSON:"""
279
265
  print(f" {name}: {len(df)} rows")
280
266
 
281
267
  def ask(self, q: str, viz: Union[bool, str] = False, table: Optional[str] = None) -> 'QueryResult':
282
- """Query."""
268
+ """
269
+ Query - FIXED: Considers ALL tables, picks best one or joins multiple.
270
+ """
283
271
  if not self.client:
284
272
  return QueryResult(False, "", pd.DataFrame(), None, "No API")
285
273
 
286
- t = table or self.current_table or (self._get_tables()[0] if self._get_tables() else None)
287
- if not t:
288
- return QueryResult(False, "", pd.DataFrame(), None, "No table")
289
-
290
- if self.use_embeddings and self.embedding_model:
291
- cached = self._check_cache(q, t)
292
- if cached:
293
- return cached
274
+ print(f"\nQuestion: {q}")
275
+
276
+ # FIXED: If no table specified, let AI pick the right one(s)
277
+ if not table:
278
+ # Get ALL table schemas
279
+ all_schemas = {}
280
+ for tbl in self._get_tables():
281
+ all_schemas[tbl] = {
282
+ 'columns': list(self.schema_info.get(tbl, {}).keys()),
283
+ 'row_count': pd.read_sql_query(f"SELECT COUNT(*) FROM {tbl}", self.conn).iloc[0, 0]
284
+ }
285
+
286
+ # Let AI decide which table(s) to use
287
+ sql = self._gen_sql_smart(q, all_schemas)
288
+ else:
289
+ # Use specified table
290
+ sql = self._gen_sql(q, table)
294
291
 
295
292
  if self.fuzzy_match:
296
- q = self._fuzzy(q, t)
297
-
298
- key = hashlib.md5(f"{q}:{t}".encode()).hexdigest()
299
- if self.cache_queries and self.cache and key in self.cache:
300
- sql = self.cache[key]
301
- else:
302
- sql = self._gen_sql(q, t)
303
- if self.cache_queries and self.cache:
304
- self.cache[key] = sql
293
+ q = self._fuzzy(q, table or self._get_tables()[0])
305
294
 
306
295
  print(f"SQL: {sql}")
307
296
 
@@ -312,12 +301,58 @@ JSON:"""
312
301
  r = QueryResult(True, sql, df, fig)
313
302
 
314
303
  if self.use_embeddings and self.embedding_model:
315
- self._store_cache(q, t, r)
304
+ self._store_cache(q, table or "all", r)
316
305
 
317
306
  return r
318
307
  except Exception as e:
308
+ print(f"Error: {e}")
319
309
  return QueryResult(False, sql, pd.DataFrame(), None, str(e))
320
310
 
311
+ def _gen_sql_smart(self, q: str, all_schemas: Dict) -> str:
312
+ """
313
+ FIXED: Generate SQL considering ALL tables and their relationships.
314
+ """
315
+ # Build context with ALL tables
316
+ schema_context = "Database has these tables:\n"
317
+ for tbl, info in all_schemas.items():
318
+ schema_context += f"\n{tbl} ({info['row_count']} rows):\n"
319
+ schema_context += f" Columns: {', '.join(info['columns'])}\n"
320
+
321
+ # Add sample data from key tables
322
+ samples = ""
323
+ for tbl in list(all_schemas.keys())[:3]: # First 3 tables
324
+ try:
325
+ sample_df = pd.read_sql_query(f"SELECT * FROM {tbl} LIMIT 2", self.conn)
326
+ samples += f"\nSample from {tbl}:\n{sample_df.to_string(index=False)}\n"
327
+ except:
328
+ pass
329
+
330
+ prompt = f"""You are an SQL expert. Generate a query for this question.
331
+
332
+ {schema_context}
333
+
334
+ {samples}
335
+
336
+ Question: {q}
337
+
338
+ Rules:
339
+ 1. Use JOIN if question needs data from multiple tables
340
+ 2. If asking about "employee" or "person" info, always include employee_data_people table
341
+ 3. Use proper foreign key relationships (person_id references people.id)
342
+ 4. Return employee names/info when asked "which employee" or "who"
343
+
344
+ Return ONLY the SQL query, no explanations:"""
345
+
346
+ r = self.client.chat.completions.create(
347
+ model="gpt-4o-mini",
348
+ messages=[
349
+ {"role": "system", "content": "SQL expert. Generate queries using proper JOINs. Return only SQL."},
350
+ {"role": "user", "content": prompt}
351
+ ],
352
+ temperature=0
353
+ )
354
+ return r.choices[0].message.content.strip().replace("```sql", "").replace("```", "").strip()
355
+
321
356
  def _fuzzy(self, q: str, t: str) -> str:
322
357
  """Fuzzy."""
323
358
  try:
@@ -506,7 +541,7 @@ JSON:"""
506
541
  return instance
507
542
 
508
543
  def _gen_sql(self, q: str, t: str) -> str:
509
- """SQL."""
544
+ """SQL for single table."""
510
545
  schema = self.schema_info.get(t, {})
511
546
  sample = pd.read_sql_query(f"SELECT * FROM {t} LIMIT 3", self.conn).to_string(index=False)
512
547
  cols = ", ".join([f"{c} ({d})" for c, d in schema.items()])