QuerySUTRA 0.5.2__py3-none-any.whl → 0.5.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {querysutra-0.5.2.dist-info → querysutra-0.5.3.dist-info}/METADATA +1 -1
- {querysutra-0.5.2.dist-info → querysutra-0.5.3.dist-info}/RECORD +6 -6
- sutra/sutra.py +87 -52
- {querysutra-0.5.2.dist-info → querysutra-0.5.3.dist-info}/WHEEL +0 -0
- {querysutra-0.5.2.dist-info → querysutra-0.5.3.dist-info}/licenses/LICENSE +0 -0
- {querysutra-0.5.2.dist-info → querysutra-0.5.3.dist-info}/top_level.txt +0 -0
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
querysutra-0.5.2.dist-info/licenses/LICENSE,sha256=F-4b93u0OVrVwGXgMwBRq6MlGyUT9zmre1oh5Gft5Ts,1066
|
|
1
|
+
querysutra-0.5.3.dist-info/licenses/LICENSE,sha256=F-4b93u0OVrVwGXgMwBRq6MlGyUT9zmre1oh5Gft5Ts,1066
|
|
2
2
|
sutra/__init__.py,sha256=25HUMETpmA1tlMl5j-ajdo9MRXljSZBrirSTH7w7jIc,118
|
|
3
3
|
sutra/cache_manager.py,sha256=e0AAeUqoR-aiqzZ3fB-IDvpJ4JA6-YBFyRJxusEnIrA,3082
|
|
4
4
|
sutra/clear_cache.py,sha256=rVIz29p7V11Uh6oHXeaWpFtYXXv-2OED91cHMAWWxtQ,187
|
|
@@ -11,7 +11,7 @@ sutra/feedback_matcher.py,sha256=WXYpGtFJnOyYQOzy-z8uBiUWH5vyJJOMS1NwEYzNfic,286
|
|
|
11
11
|
sutra/nlp_processor.py,sha256=wMS1hz1aGWjSwPUD7lSNBbQapFtLgF2l65j0QKXQOd0,5461
|
|
12
12
|
sutra/schema_embeddings.py,sha256=bVPzpJOdYTyUdG2k3ZdgYJLrX2opHBx68RIjJcMlueo,9732
|
|
13
13
|
sutra/schema_generator.py,sha256=BX_vXmnvSGc6nCBx40WLSoNL3WIYPDahd1cEYloyY4M,1925
|
|
14
|
-
sutra/sutra.py,sha256=
|
|
14
|
+
sutra/sutra.py,sha256=61juV3zlMau4UZJ-5IxjaN-Bc1XBP8w2vkYfum-aXlY,21979
|
|
15
15
|
sutra/sutra_client.py,sha256=PYYDGqVbA9pB-Zcsm52i9KarwijCIGVZOThgONZP6Vs,14203
|
|
16
16
|
sutra/sutra_core.py,sha256=diaWOXUHn1wrqCQrBhLKL612tMQioaqx-ILc3y9-CqM,11708
|
|
17
17
|
sutra/sutra_simple.py,sha256=rnqzG7OAt4p64XtO0peMqHS1pG5tdA8U3EYTMVsq7BE,23201
|
|
@@ -22,7 +22,7 @@ tests/test_sutra.py,sha256=6Z4SoIuBzza101304I7plkyPVkUBbjIxR8uPs9z5ntg,2383
|
|
|
22
22
|
utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
23
23
|
utils/file_utils.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
24
24
|
utils/text_utils.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
25
|
-
querysutra-0.5.
|
|
26
|
-
querysutra-0.5.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
27
|
-
querysutra-0.5.2.dist-info/top_level.txt,sha256=9v0buw21eo5LaUU_3Cf9b9MqRyEvtM9cHaOuEXUKVqM,18
|
|
28
|
-
querysutra-0.5.2.dist-info/RECORD,,
|
|
25
|
+
querysutra-0.5.3.dist-info/METADATA,sha256=yFffBSYGfbLrYnXA7OFGHk1mO37fpUV-0iglmHXbAVQ,7258
|
|
26
|
+
querysutra-0.5.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
27
|
+
querysutra-0.5.3.dist-info/top_level.txt,sha256=9v0buw21eo5LaUU_3Cf9b9MqRyEvtM9cHaOuEXUKVqM,18
|
|
28
|
+
querysutra-0.5.3.dist-info/RECORD,,
|
sutra/sutra.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
"""QuerySUTRA v0.5.
|
|
2
|
-
__version__ = "0.5.
|
|
1
|
+
"""QuerySUTRA v0.5.2 - FIXED: Smart table selection"""
|
|
2
|
+
__version__ = "0.5.2"
|
|
3
3
|
__author__ = "Aditya Batta"
|
|
4
4
|
__all__ = ["SUTRA", "QueryResult"]
|
|
5
5
|
|
|
@@ -41,7 +41,7 @@ except:
|
|
|
41
41
|
|
|
42
42
|
|
|
43
43
|
class SUTRA:
|
|
44
|
-
"""SUTRA -
|
|
44
|
+
"""SUTRA - FIXED: Considers ALL tables"""
|
|
45
45
|
|
|
46
46
|
def __init__(self, api_key: Optional[str] = None, db: str = "sutra.db",
|
|
47
47
|
use_embeddings: bool = False, fuzzy_match: bool = True,
|
|
@@ -72,7 +72,7 @@ class SUTRA:
|
|
|
72
72
|
pass
|
|
73
73
|
|
|
74
74
|
self._refresh_schema()
|
|
75
|
-
print(f"QuerySUTRA v0.5.
|
|
75
|
+
print(f"QuerySUTRA v0.5.2 Ready")
|
|
76
76
|
|
|
77
77
|
def upload(self, data: Union[str, pd.DataFrame], name: Optional[str] = None) -> 'SUTRA':
|
|
78
78
|
"""Upload."""
|
|
@@ -105,7 +105,7 @@ class SUTRA:
|
|
|
105
105
|
return self
|
|
106
106
|
|
|
107
107
|
def _pdf(self, path: Path, name: str):
|
|
108
|
-
"""
|
|
108
|
+
"""PDF extraction."""
|
|
109
109
|
if not HAS_PYPDF2:
|
|
110
110
|
raise ImportError("pip install PyPDF2")
|
|
111
111
|
|
|
@@ -115,12 +115,11 @@ class SUTRA:
|
|
|
115
115
|
text = "".join([p.extract_text() + "\n" for p in PyPDF2.PdfReader(f).pages])
|
|
116
116
|
|
|
117
117
|
if not self.client:
|
|
118
|
-
print("ERROR: No API key!
|
|
118
|
+
print("ERROR: No API key!")
|
|
119
119
|
return
|
|
120
120
|
|
|
121
121
|
print("AI: Extracting...")
|
|
122
122
|
|
|
123
|
-
# TRY 3 TIMES
|
|
124
123
|
entities = None
|
|
125
124
|
for attempt in [1, 2, 3]:
|
|
126
125
|
entities = self._extract(text, attempt)
|
|
@@ -129,7 +128,6 @@ class SUTRA:
|
|
|
129
128
|
if attempt < 3:
|
|
130
129
|
print(f" Retry {attempt+1}/3...")
|
|
131
130
|
|
|
132
|
-
# Create tables from entities
|
|
133
131
|
if entities and len(entities) > 0:
|
|
134
132
|
print(f"Extracted {len(entities)} entity types:")
|
|
135
133
|
for etype, recs in entities.items():
|
|
@@ -140,16 +138,12 @@ class SUTRA:
|
|
|
140
138
|
print(f" {etype}: {len(recs)} rows")
|
|
141
139
|
return
|
|
142
140
|
|
|
143
|
-
# REGEX FALLBACK - FIXED
|
|
144
141
|
print("Using regex fallback...")
|
|
145
142
|
people = []
|
|
146
143
|
emails = re.findall(r'[\w\.-]+@[\w\.-]+\.\w+', text)
|
|
147
|
-
|
|
148
|
-
# Extract names from common patterns
|
|
149
144
|
name_patterns = [
|
|
150
145
|
r'(?:Employee|Name|Mr\.|Mrs\.|Ms\.|Dr\.)\s*[:\-]?\s*([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)',
|
|
151
146
|
r'([A-Z][a-z]+\s+[A-Z][a-z]+)\s+(?:lives|resides|works|is based)',
|
|
152
|
-
r'\*\*([A-Z][a-z]+\s+[A-Z][a-z]+)\*\*'
|
|
153
147
|
]
|
|
154
148
|
|
|
155
149
|
names = []
|
|
@@ -158,7 +152,6 @@ class SUTRA:
|
|
|
158
152
|
if len(names) >= len(emails):
|
|
159
153
|
break
|
|
160
154
|
|
|
161
|
-
# Match emails to names
|
|
162
155
|
max_people = min(len(emails), 50)
|
|
163
156
|
for i in range(max_people):
|
|
164
157
|
people.append({
|
|
@@ -169,55 +162,49 @@ class SUTRA:
|
|
|
169
162
|
|
|
170
163
|
if people:
|
|
171
164
|
self._store(pd.DataFrame(people), f"{name}_people")
|
|
172
|
-
print(f" Extracted {len(people)} people
|
|
165
|
+
print(f" Extracted {len(people)} people")
|
|
173
166
|
else:
|
|
174
|
-
# Absolute last resort
|
|
175
167
|
lines = [l.strip() for l in text.split('\n') if l.strip()][:100]
|
|
176
168
|
self._store(pd.DataFrame({'line': range(1, len(lines)+1), 'text': lines}), name)
|
|
177
169
|
|
|
178
170
|
def _extract(self, text: str, attempt: int) -> Dict:
|
|
179
|
-
"""Extract
|
|
171
|
+
"""Extract."""
|
|
180
172
|
if not self.client:
|
|
181
173
|
return {}
|
|
182
174
|
|
|
183
175
|
try:
|
|
184
176
|
if attempt == 1:
|
|
185
177
|
sys_msg = "Extract entities as JSON. Return ONLY valid JSON."
|
|
186
|
-
usr_msg = f"""Extract ALL entities
|
|
178
|
+
usr_msg = f"""Extract ALL entities.
|
|
187
179
|
|
|
188
180
|
Text:
|
|
189
181
|
{text[:15000]}
|
|
190
182
|
|
|
191
|
-
|
|
183
|
+
JSON with: people, skills, technologies, projects, certifications, education, work_experience
|
|
192
184
|
|
|
193
185
|
Example:
|
|
194
|
-
{{"people":[{{"id":1,"name":"Sarah
|
|
186
|
+
{{"people":[{{"id":1,"name":"Sarah","email":"s@co.com","city":"NYC","state":"NY"}}],"skills":[{{"id":1,"person_id":1,"skill_name":"Python"}}]}}
|
|
195
187
|
|
|
196
|
-
|
|
188
|
+
Unique IDs (1,2,3...), person_id links to people.id
|
|
197
189
|
|
|
198
190
|
JSON:"""
|
|
199
|
-
|
|
200
191
|
elif attempt == 2:
|
|
201
192
|
sys_msg = "Return JSON."
|
|
202
193
|
usr_msg = f"""Text: {text[:10000]}
|
|
203
194
|
|
|
204
|
-
Extract people
|
|
205
|
-
{{"people":[{{"id":1,"name":"...","email":"..."
|
|
195
|
+
Extract people:
|
|
196
|
+
{{"people":[{{"id":1,"name":"...","email":"..."}}]}}
|
|
206
197
|
|
|
207
198
|
JSON:"""
|
|
208
|
-
|
|
209
199
|
else:
|
|
210
|
-
sys_msg = "JSON
|
|
211
|
-
usr_msg = f"""
|
|
200
|
+
sys_msg = "JSON."
|
|
201
|
+
usr_msg = f"""Names/emails from: {text[:8000]}
|
|
212
202
|
|
|
213
|
-
{{"people":[{{"id":1,"name":"John","email":"
|
|
203
|
+
{{"people":[{{"id":1,"name":"John","email":"j@co.com"}}]}}"""
|
|
214
204
|
|
|
215
205
|
r = self.client.chat.completions.create(
|
|
216
206
|
model="gpt-4o-mini",
|
|
217
|
-
messages=[
|
|
218
|
-
{"role": "system", "content": sys_msg},
|
|
219
|
-
{"role": "user", "content": usr_msg}
|
|
220
|
-
],
|
|
207
|
+
messages=[{"role": "system", "content": sys_msg}, {"role": "user", "content": usr_msg}],
|
|
221
208
|
temperature=0,
|
|
222
209
|
max_tokens=12000
|
|
223
210
|
)
|
|
@@ -237,7 +224,6 @@ JSON:"""
|
|
|
237
224
|
has_data = any(isinstance(v, list) and len(v) > 0 for v in result.values())
|
|
238
225
|
if has_data:
|
|
239
226
|
return result
|
|
240
|
-
|
|
241
227
|
return {}
|
|
242
228
|
|
|
243
229
|
except Exception as e:
|
|
@@ -279,29 +265,32 @@ JSON:"""
|
|
|
279
265
|
print(f" {name}: {len(df)} rows")
|
|
280
266
|
|
|
281
267
|
def ask(self, q: str, viz: Union[bool, str] = False, table: Optional[str] = None) -> 'QueryResult':
|
|
282
|
-
"""
|
|
268
|
+
"""
|
|
269
|
+
Query - FIXED: Considers ALL tables, picks best one or joins multiple.
|
|
270
|
+
"""
|
|
283
271
|
if not self.client:
|
|
284
272
|
return QueryResult(False, "", pd.DataFrame(), None, "No API")
|
|
285
273
|
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
274
|
+
print(f"\nQuestion: {q}")
|
|
275
|
+
|
|
276
|
+
# FIXED: If no table specified, let AI pick the right one(s)
|
|
277
|
+
if not table:
|
|
278
|
+
# Get ALL table schemas
|
|
279
|
+
all_schemas = {}
|
|
280
|
+
for tbl in self._get_tables():
|
|
281
|
+
all_schemas[tbl] = {
|
|
282
|
+
'columns': list(self.schema_info.get(tbl, {}).keys()),
|
|
283
|
+
'row_count': pd.read_sql_query(f"SELECT COUNT(*) FROM {tbl}", self.conn).iloc[0, 0]
|
|
284
|
+
}
|
|
285
|
+
|
|
286
|
+
# Let AI decide which table(s) to use
|
|
287
|
+
sql = self._gen_sql_smart(q, all_schemas)
|
|
288
|
+
else:
|
|
289
|
+
# Use specified table
|
|
290
|
+
sql = self._gen_sql(q, table)
|
|
294
291
|
|
|
295
292
|
if self.fuzzy_match:
|
|
296
|
-
q = self._fuzzy(q,
|
|
297
|
-
|
|
298
|
-
key = hashlib.md5(f"{q}:{t}".encode()).hexdigest()
|
|
299
|
-
if self.cache_queries and self.cache and key in self.cache:
|
|
300
|
-
sql = self.cache[key]
|
|
301
|
-
else:
|
|
302
|
-
sql = self._gen_sql(q, t)
|
|
303
|
-
if self.cache_queries and self.cache:
|
|
304
|
-
self.cache[key] = sql
|
|
293
|
+
q = self._fuzzy(q, table or self._get_tables()[0])
|
|
305
294
|
|
|
306
295
|
print(f"SQL: {sql}")
|
|
307
296
|
|
|
@@ -312,12 +301,58 @@ JSON:"""
|
|
|
312
301
|
r = QueryResult(True, sql, df, fig)
|
|
313
302
|
|
|
314
303
|
if self.use_embeddings and self.embedding_model:
|
|
315
|
-
self._store_cache(q,
|
|
304
|
+
self._store_cache(q, table or "all", r)
|
|
316
305
|
|
|
317
306
|
return r
|
|
318
307
|
except Exception as e:
|
|
308
|
+
print(f"Error: {e}")
|
|
319
309
|
return QueryResult(False, sql, pd.DataFrame(), None, str(e))
|
|
320
310
|
|
|
311
|
+
def _gen_sql_smart(self, q: str, all_schemas: Dict) -> str:
|
|
312
|
+
"""
|
|
313
|
+
FIXED: Generate SQL considering ALL tables and their relationships.
|
|
314
|
+
"""
|
|
315
|
+
# Build context with ALL tables
|
|
316
|
+
schema_context = "Database has these tables:\n"
|
|
317
|
+
for tbl, info in all_schemas.items():
|
|
318
|
+
schema_context += f"\n{tbl} ({info['row_count']} rows):\n"
|
|
319
|
+
schema_context += f" Columns: {', '.join(info['columns'])}\n"
|
|
320
|
+
|
|
321
|
+
# Add sample data from key tables
|
|
322
|
+
samples = ""
|
|
323
|
+
for tbl in list(all_schemas.keys())[:3]: # First 3 tables
|
|
324
|
+
try:
|
|
325
|
+
sample_df = pd.read_sql_query(f"SELECT * FROM {tbl} LIMIT 2", self.conn)
|
|
326
|
+
samples += f"\nSample from {tbl}:\n{sample_df.to_string(index=False)}\n"
|
|
327
|
+
except:
|
|
328
|
+
pass
|
|
329
|
+
|
|
330
|
+
prompt = f"""You are an SQL expert. Generate a query for this question.
|
|
331
|
+
|
|
332
|
+
{schema_context}
|
|
333
|
+
|
|
334
|
+
{samples}
|
|
335
|
+
|
|
336
|
+
Question: {q}
|
|
337
|
+
|
|
338
|
+
Rules:
|
|
339
|
+
1. Use JOIN if question needs data from multiple tables
|
|
340
|
+
2. If asking about "employee" or "person" info, always include employee_data_people table
|
|
341
|
+
3. Use proper foreign key relationships (person_id references people.id)
|
|
342
|
+
4. Return employee names/info when asked "which employee" or "who"
|
|
343
|
+
|
|
344
|
+
Return ONLY the SQL query, no explanations:"""
|
|
345
|
+
|
|
346
|
+
r = self.client.chat.completions.create(
|
|
347
|
+
model="gpt-4o-mini",
|
|
348
|
+
messages=[
|
|
349
|
+
{"role": "system", "content": "SQL expert. Generate queries using proper JOINs. Return only SQL."},
|
|
350
|
+
{"role": "user", "content": prompt}
|
|
351
|
+
],
|
|
352
|
+
temperature=0
|
|
353
|
+
)
|
|
354
|
+
return r.choices[0].message.content.strip().replace("```sql", "").replace("```", "").strip()
|
|
355
|
+
|
|
321
356
|
def _fuzzy(self, q: str, t: str) -> str:
|
|
322
357
|
"""Fuzzy."""
|
|
323
358
|
try:
|
|
@@ -506,7 +541,7 @@ JSON:"""
|
|
|
506
541
|
return instance
|
|
507
542
|
|
|
508
543
|
def _gen_sql(self, q: str, t: str) -> str:
|
|
509
|
-
"""SQL."""
|
|
544
|
+
"""SQL for single table."""
|
|
510
545
|
schema = self.schema_info.get(t, {})
|
|
511
546
|
sample = pd.read_sql_query(f"SELECT * FROM {t} LIMIT 3", self.conn).to_string(index=False)
|
|
512
547
|
cols = ", ".join([f"{c} ({d})" for c, d in schema.items()])
|
|
File without changes
|
|
File without changes
|
|
File without changes
|