QuerySUTRA 0.5.3__py3-none-any.whl → 0.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {querysutra-0.5.3.dist-info → querysutra-0.6.1.dist-info}/METADATA +18 -2
- querysutra-0.6.1.dist-info/RECORD +22 -0
- {querysutra-0.5.3.dist-info → querysutra-0.6.1.dist-info}/WHEEL +1 -1
- querysutra-0.6.1.dist-info/top_level.txt +1 -0
- sutra/__init__.py +6 -4
- sutra/database_manager.py +235 -195
- sutra/nlp_processor.py +175 -143
- sutra/schema_generator.py +56 -52
- sutra/sutra.py +133 -16
- querysutra-0.5.3.dist-info/RECORD +0 -28
- querysutra-0.5.3.dist-info/top_level.txt +0 -3
- tests/__init__.py +0 -0
- tests/test_modules.py +0 -0
- tests/test_sutra.py +0 -76
- utils/__init__.py +0 -0
- utils/file_utils.py +0 -0
- utils/text_utils.py +0 -0
- {querysutra-0.5.3.dist-info → querysutra-0.6.1.dist-info}/licenses/LICENSE +0 -0
sutra/sutra.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
"""QuerySUTRA v0.
|
|
2
|
-
__version__ = "0.
|
|
1
|
+
"""QuerySUTRA v0.6.1 - AI-powered data analysis for structured and unstructured data"""
|
|
2
|
+
__version__ = "0.6.1"
|
|
3
3
|
__author__ = "Aditya Batta"
|
|
4
4
|
__all__ = ["SUTRA", "QueryResult"]
|
|
5
5
|
|
|
@@ -72,7 +72,7 @@ class SUTRA:
|
|
|
72
72
|
pass
|
|
73
73
|
|
|
74
74
|
self._refresh_schema()
|
|
75
|
-
print(f"QuerySUTRA
|
|
75
|
+
print(f"QuerySUTRA v{__version__} Ready")
|
|
76
76
|
|
|
77
77
|
def upload(self, data: Union[str, pd.DataFrame], name: Optional[str] = None) -> 'SUTRA':
|
|
78
78
|
"""Upload."""
|
|
@@ -136,6 +136,8 @@ class SUTRA:
|
|
|
136
136
|
rec['id'] = idx
|
|
137
137
|
self._store(pd.DataFrame(recs), f"{name}_{etype}")
|
|
138
138
|
print(f" {etype}: {len(recs)} rows")
|
|
139
|
+
# After all tables are created, detect and store foreign key relationships
|
|
140
|
+
self._create_foreign_keys()
|
|
139
141
|
return
|
|
140
142
|
|
|
141
143
|
print("Using regex fallback...")
|
|
@@ -253,8 +255,17 @@ JSON:"""
|
|
|
253
255
|
self._store(pd.DataFrame({'line': range(1, len(lines)+1), 'text': lines}), name)
|
|
254
256
|
|
|
255
257
|
def _store(self, df: pd.DataFrame, name: str):
|
|
256
|
-
"""Store."""
|
|
258
|
+
"""Store. Flattens any list/dict values to strings for SQLite compatibility."""
|
|
257
259
|
df.columns = [str(c).strip().replace(" ", "_").replace("-", "_") for c in df.columns]
|
|
260
|
+
|
|
261
|
+
# Flatten list/dict values to strings — SQLite can't store Python objects
|
|
262
|
+
for col in df.columns:
|
|
263
|
+
df[col] = df[col].apply(
|
|
264
|
+
lambda x: ', '.join(str(i) for i in x) if isinstance(x, list)
|
|
265
|
+
else json.dumps(x) if isinstance(x, dict)
|
|
266
|
+
else x
|
|
267
|
+
)
|
|
268
|
+
|
|
258
269
|
try:
|
|
259
270
|
df.to_sql(name, self.conn, if_exists='replace', index=False, method='multi', chunksize=500)
|
|
260
271
|
except:
|
|
@@ -264,6 +275,55 @@ JSON:"""
|
|
|
264
275
|
self._refresh_schema()
|
|
265
276
|
print(f" {name}: {len(df)} rows")
|
|
266
277
|
|
|
278
|
+
def _create_foreign_keys(self, silent=False):
|
|
279
|
+
"""Detect foreign key relationships between tables by matching column naming patterns.
|
|
280
|
+
e.g., 'person_id' in work_experience -> 'id' in people table."""
|
|
281
|
+
tables = self._get_tables()
|
|
282
|
+
|
|
283
|
+
# Build a map of potential parent tables by looking for 'id' columns
|
|
284
|
+
# e.g., employee_data_people has 'id' -> can be referenced as person_id, people_id
|
|
285
|
+
parent_map = {} # Maps potential FK column names -> (parent_table, parent_pk)
|
|
286
|
+
for t in tables:
|
|
287
|
+
self.cursor.execute(f"PRAGMA table_info({t})")
|
|
288
|
+
cols = {r[1]: r[2] for r in self.cursor.fetchall()}
|
|
289
|
+
if 'id' in cols:
|
|
290
|
+
# Generate possible FK names from table name
|
|
291
|
+
# e.g., 'employee_data_people' -> 'person_id', 'people_id'
|
|
292
|
+
parts = t.split('_')
|
|
293
|
+
for part in parts:
|
|
294
|
+
# singular form guesses
|
|
295
|
+
fk_name = f"{part}_id"
|
|
296
|
+
parent_map[fk_name] = (t, 'id')
|
|
297
|
+
# Handle plural -> singular (people -> person)
|
|
298
|
+
if part.endswith('ies'):
|
|
299
|
+
parent_map[f"{part[:-3]}y_id"] = (t, 'id')
|
|
300
|
+
elif part.endswith('es'):
|
|
301
|
+
parent_map[f"{part[:-2]}_id"] = (t, 'id')
|
|
302
|
+
elif part.endswith('s'):
|
|
303
|
+
parent_map[f"{part[:-1]}_id"] = (t, 'id')
|
|
304
|
+
# Also try full table name as FK
|
|
305
|
+
parent_map[f"{t}_id"] = (t, 'id')
|
|
306
|
+
|
|
307
|
+
# Now scan all tables for columns matching FK patterns
|
|
308
|
+
self.foreign_keys = {} # table -> [(fk_col, parent_table, parent_col)]
|
|
309
|
+
for t in tables:
|
|
310
|
+
self.cursor.execute(f"PRAGMA table_info({t})")
|
|
311
|
+
cols = [r[1] for r in self.cursor.fetchall()]
|
|
312
|
+
fks = []
|
|
313
|
+
for col in cols:
|
|
314
|
+
if col in parent_map:
|
|
315
|
+
parent_table, parent_col = parent_map[col]
|
|
316
|
+
if parent_table != t: # Don't self-reference
|
|
317
|
+
fks.append((col, parent_table, parent_col))
|
|
318
|
+
if fks:
|
|
319
|
+
self.foreign_keys[t] = fks
|
|
320
|
+
|
|
321
|
+
if self.foreign_keys and not silent:
|
|
322
|
+
print(f"\n🔗 Detected relationships:")
|
|
323
|
+
for t, fks in self.foreign_keys.items():
|
|
324
|
+
for fk_col, parent_table, parent_col in fks:
|
|
325
|
+
print(f" {t}.{fk_col} → {parent_table}.{parent_col}")
|
|
326
|
+
|
|
267
327
|
def ask(self, q: str, viz: Union[bool, str] = False, table: Optional[str] = None) -> 'QueryResult':
|
|
268
328
|
"""
|
|
269
329
|
Query - FIXED: Considers ALL tables, picks best one or joins multiple.
|
|
@@ -273,6 +333,10 @@ JSON:"""
|
|
|
273
333
|
|
|
274
334
|
print(f"\nQuestion: {q}")
|
|
275
335
|
|
|
336
|
+
# Ensure foreign key relationships are detected
|
|
337
|
+
if not hasattr(self, 'foreign_keys') or not self.foreign_keys:
|
|
338
|
+
self._create_foreign_keys(silent=True)
|
|
339
|
+
|
|
276
340
|
# FIXED: If no table specified, let AI pick the right one(s)
|
|
277
341
|
if not table:
|
|
278
342
|
# Get ALL table schemas
|
|
@@ -308,6 +372,24 @@ JSON:"""
|
|
|
308
372
|
print(f"Error: {e}")
|
|
309
373
|
return QueryResult(False, sql, pd.DataFrame(), None, str(e))
|
|
310
374
|
|
|
375
|
+
def _get_relationship_context(self) -> str:
|
|
376
|
+
"""Build a clear relationship context string for the AI prompt."""
|
|
377
|
+
if not hasattr(self, 'foreign_keys') or not self.foreign_keys:
|
|
378
|
+
# Try to detect relationships if not already done
|
|
379
|
+
self._create_foreign_keys(silent=True)
|
|
380
|
+
|
|
381
|
+
if not hasattr(self, 'foreign_keys') or not self.foreign_keys:
|
|
382
|
+
return ""
|
|
383
|
+
|
|
384
|
+
lines = ["\n=== TABLE RELATIONSHIPS (FOREIGN KEYS) ==="]
|
|
385
|
+
lines.append("Use these to JOIN tables when a question needs data from multiple tables:")
|
|
386
|
+
for t, fks in self.foreign_keys.items():
|
|
387
|
+
for fk_col, parent_table, parent_col in fks:
|
|
388
|
+
lines.append(f" {t}.{fk_col} → {parent_table}.{parent_col}")
|
|
389
|
+
lines.append(f" JOIN syntax: JOIN {parent_table} ON {t}.{fk_col} = {parent_table}.{parent_col}")
|
|
390
|
+
lines.append("=" * 50)
|
|
391
|
+
return "\n".join(lines)
|
|
392
|
+
|
|
311
393
|
def _gen_sql_smart(self, q: str, all_schemas: Dict) -> str:
|
|
312
394
|
"""
|
|
313
395
|
FIXED: Generate SQL considering ALL tables and their relationships.
|
|
@@ -318,35 +400,70 @@ JSON:"""
|
|
|
318
400
|
schema_context += f"\n{tbl} ({info['row_count']} rows):\n"
|
|
319
401
|
schema_context += f" Columns: {', '.join(info['columns'])}\n"
|
|
320
402
|
|
|
321
|
-
# Add
|
|
403
|
+
# Add relationship context
|
|
404
|
+
relationship_context = self._get_relationship_context()
|
|
405
|
+
|
|
406
|
+
# Add sample data from ALL tables (not just first 3)
|
|
322
407
|
samples = ""
|
|
323
|
-
for tbl in list(all_schemas.keys())[:
|
|
408
|
+
for tbl in list(all_schemas.keys())[:6]: # Show more tables
|
|
324
409
|
try:
|
|
325
410
|
sample_df = pd.read_sql_query(f"SELECT * FROM {tbl} LIMIT 2", self.conn)
|
|
326
411
|
samples += f"\nSample from {tbl}:\n{sample_df.to_string(index=False)}\n"
|
|
327
412
|
except:
|
|
328
413
|
pass
|
|
329
414
|
|
|
330
|
-
prompt = f"""You are an
|
|
415
|
+
prompt = f"""You are an expert SQL query generator.
|
|
331
416
|
|
|
332
417
|
{schema_context}
|
|
333
|
-
|
|
418
|
+
{relationship_context}
|
|
334
419
|
{samples}
|
|
335
420
|
|
|
336
421
|
Question: {q}
|
|
337
422
|
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
423
|
+
CRITICAL INSTRUCTIONS - FOLLOW THESE STEPS:
|
|
424
|
+
|
|
425
|
+
STEP 1: READ THE TABLE RELATIONSHIPS SECTION ABOVE.
|
|
426
|
+
Those show you exactly how tables connect via foreign keys.
|
|
427
|
+
|
|
428
|
+
STEP 2: IDENTIFY WHICH TABLES HAVE THE DATA NEEDED.
|
|
429
|
+
- Person info (name, email, city, state) → look in *_people table
|
|
430
|
+
- Work info (company, position, start_date) → look in *_work_experience table
|
|
431
|
+
- Skills, education, etc. → look in their respective tables
|
|
432
|
+
|
|
433
|
+
STEP 3: IF THE QUESTION NEEDS DATA FROM MULTIPLE TABLES, YOU MUST USE JOIN.
|
|
434
|
+
Use the foreign key relationships shown above.
|
|
435
|
+
Example: If work_experience has person_id and people has id:
|
|
436
|
+
JOIN people ON work_experience.person_id = people.id
|
|
437
|
+
|
|
438
|
+
STEP 4: WRITE THE QUERY.
|
|
439
|
+
- Use table aliases for readability
|
|
440
|
+
- Qualify ALL column names with table alias to avoid ambiguity
|
|
441
|
+
- For "who" / "which person" questions, ALWAYS join to the people table to get names
|
|
442
|
+
- For "from <state>" or "in <city>" questions, the location is in the people table, JOIN to it
|
|
443
|
+
- For "count by state" or "group by state", the state column is in the people table, JOIN to it
|
|
444
|
+
|
|
445
|
+
EXAMPLES:
|
|
446
|
+
❌ WRONG: SELECT COUNT(*) FROM work_experience GROUP BY company
|
|
447
|
+
(when asked "count by state" - state is NOT in work_experience!)
|
|
448
|
+
|
|
449
|
+
✅ CORRECT: SELECT p.state, COUNT(*) as employee_count
|
|
450
|
+
FROM work_experience w
|
|
451
|
+
JOIN people p ON w.person_id = p.id
|
|
452
|
+
GROUP BY p.state
|
|
453
|
+
|
|
454
|
+
❌ WRONG: SELECT * FROM work_experience WHERE company LIKE '%FL%'
|
|
455
|
+
(when asked "how many from FL" - FL is a state, not a company!)
|
|
456
|
+
|
|
457
|
+
✅ CORRECT: SELECT COUNT(*) as count
|
|
458
|
+
FROM people p
|
|
459
|
+
WHERE p.state = 'FL'
|
|
343
460
|
|
|
344
|
-
Return ONLY the SQL query, no
|
|
461
|
+
Return ONLY the executable SQL query. No explanations, no markdown, no code blocks:"""
|
|
345
462
|
|
|
346
463
|
r = self.client.chat.completions.create(
|
|
347
464
|
model="gpt-4o-mini",
|
|
348
465
|
messages=[
|
|
349
|
-
{"role": "system", "content": "SQL
|
|
466
|
+
{"role": "system", "content": "You are an expert SQL query generator. ALWAYS use JOIN when data is spread across multiple tables. ALWAYS check which table a column belongs to before using it. State, city, name are typically in people tables. Position, company are in work_experience tables. Return ONLY executable SQL."},
|
|
350
467
|
{"role": "user", "content": prompt}
|
|
351
468
|
],
|
|
352
469
|
temperature=0
|
|
@@ -562,7 +679,7 @@ Return ONLY the SQL query, no explanations:"""
|
|
|
562
679
|
return [r[0] for r in self.cursor.fetchall()]
|
|
563
680
|
|
|
564
681
|
def _refresh_schema(self):
|
|
565
|
-
"""Refresh."""
|
|
682
|
+
"""Refresh schema info."""
|
|
566
683
|
self.schema_info = {}
|
|
567
684
|
for t in self._get_tables():
|
|
568
685
|
self.cursor.execute(f"PRAGMA table_info({t})")
|
|
@@ -1,28 +0,0 @@
|
|
|
1
|
-
querysutra-0.5.3.dist-info/licenses/LICENSE,sha256=F-4b93u0OVrVwGXgMwBRq6MlGyUT9zmre1oh5Gft5Ts,1066
|
|
2
|
-
sutra/__init__.py,sha256=25HUMETpmA1tlMl5j-ajdo9MRXljSZBrirSTH7w7jIc,118
|
|
3
|
-
sutra/cache_manager.py,sha256=e0AAeUqoR-aiqzZ3fB-IDvpJ4JA6-YBFyRJxusEnIrA,3082
|
|
4
|
-
sutra/clear_cache.py,sha256=rVIz29p7V11Uh6oHXeaWpFtYXXv-2OED91cHMAWWxtQ,187
|
|
5
|
-
sutra/core.py,sha256=R_JbOlZTukegP92Dr-WLsdr632_otFN7o9qSvcxyBtw,10497
|
|
6
|
-
sutra/data_loader.py,sha256=_yPj-DS2qYtlCgaMACQtfXZfSuAdVVd4igNP7yzXolc,5781
|
|
7
|
-
sutra/database_manager.py,sha256=L-QC_WwR3Pnl1BRh0rnEv5MNSTr4C7ZP-hIPfCHRK88,7672
|
|
8
|
-
sutra/direct_query.py,sha256=X69I646zHIZlZjMmgn8O2xLS_7ww7miAkABTnJEPAAc,2724
|
|
9
|
-
sutra/feedback.py,sha256=PHSffU_rfORjLkTW3-j2VSjQdw4ufROsTeBWaX6DZ00,1642
|
|
10
|
-
sutra/feedback_matcher.py,sha256=WXYpGtFJnOyYQOzy-z8uBiUWH5vyJJOMS1NwEYzNfic,2865
|
|
11
|
-
sutra/nlp_processor.py,sha256=wMS1hz1aGWjSwPUD7lSNBbQapFtLgF2l65j0QKXQOd0,5461
|
|
12
|
-
sutra/schema_embeddings.py,sha256=bVPzpJOdYTyUdG2k3ZdgYJLrX2opHBx68RIjJcMlueo,9732
|
|
13
|
-
sutra/schema_generator.py,sha256=BX_vXmnvSGc6nCBx40WLSoNL3WIYPDahd1cEYloyY4M,1925
|
|
14
|
-
sutra/sutra.py,sha256=61juV3zlMau4UZJ-5IxjaN-Bc1XBP8w2vkYfum-aXlY,21979
|
|
15
|
-
sutra/sutra_client.py,sha256=PYYDGqVbA9pB-Zcsm52i9KarwijCIGVZOThgONZP6Vs,14203
|
|
16
|
-
sutra/sutra_core.py,sha256=diaWOXUHn1wrqCQrBhLKL612tMQioaqx-ILc3y9-CqM,11708
|
|
17
|
-
sutra/sutra_simple.py,sha256=rnqzG7OAt4p64XtO0peMqHS1pG5tdA8U3EYTMVsq7BE,23201
|
|
18
|
-
sutra/visualizer.py,sha256=YOKTmjQcY72smmx9KsZrQTdbAiE5GQDKofMFjpLIUfI,6996
|
|
19
|
-
tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
20
|
-
tests/test_modules.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
21
|
-
tests/test_sutra.py,sha256=6Z4SoIuBzza101304I7plkyPVkUBbjIxR8uPs9z5ntg,2383
|
|
22
|
-
utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
23
|
-
utils/file_utils.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
24
|
-
utils/text_utils.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
25
|
-
querysutra-0.5.3.dist-info/METADATA,sha256=yFffBSYGfbLrYnXA7OFGHk1mO37fpUV-0iglmHXbAVQ,7258
|
|
26
|
-
querysutra-0.5.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
27
|
-
querysutra-0.5.3.dist-info/top_level.txt,sha256=9v0buw21eo5LaUU_3Cf9b9MqRyEvtM9cHaOuEXUKVqM,18
|
|
28
|
-
querysutra-0.5.3.dist-info/RECORD,,
|
tests/__init__.py
DELETED
|
File without changes
|
tests/test_modules.py
DELETED
|
File without changes
|
tests/test_sutra.py
DELETED
|
@@ -1,76 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Test suite for SUTRA library
|
|
3
|
-
Run with: pytest test_sutra.py
|
|
4
|
-
"""
|
|
5
|
-
|
|
6
|
-
import pytest
|
|
7
|
-
import pandas as pd
|
|
8
|
-
import os
|
|
9
|
-
from sutra import SutraClient
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
class TestSutraClient:
|
|
13
|
-
"""Test cases for SutraClient"""
|
|
14
|
-
|
|
15
|
-
@pytest.fixture
|
|
16
|
-
def client(self):
|
|
17
|
-
"""Create a test client"""
|
|
18
|
-
# Use a test database
|
|
19
|
-
api_key = os.getenv('OPENAI_API_KEY', 'test-key')
|
|
20
|
-
client = SutraClient(api_key=api_key, db_path="test_db.db")
|
|
21
|
-
yield client
|
|
22
|
-
# Cleanup
|
|
23
|
-
client.close()
|
|
24
|
-
if os.path.exists("test_db.db"):
|
|
25
|
-
os.remove("test_db.db")
|
|
26
|
-
|
|
27
|
-
@pytest.fixture
|
|
28
|
-
def sample_data(self):
|
|
29
|
-
"""Create sample DataFrame"""
|
|
30
|
-
return pd.DataFrame({
|
|
31
|
-
'name': ['Alice', 'Bob', 'Charlie'],
|
|
32
|
-
'age': [25, 30, 35],
|
|
33
|
-
'city': ['New York', 'London', 'Paris']
|
|
34
|
-
})
|
|
35
|
-
|
|
36
|
-
def test_client_initialization(self, client):
|
|
37
|
-
"""Test client can be initialized"""
|
|
38
|
-
assert client is not None
|
|
39
|
-
assert client.db_path == "test_db.db"
|
|
40
|
-
|
|
41
|
-
def test_upload_dataframe(self, client, sample_data):
|
|
42
|
-
"""Test uploading a DataFrame"""
|
|
43
|
-
result = client.upload_dataframe(sample_data, "test_table")
|
|
44
|
-
assert result['status'] == 'success'
|
|
45
|
-
assert result['table_name'] == 'test_table'
|
|
46
|
-
assert result['rows_inserted'] == 3
|
|
47
|
-
|
|
48
|
-
def test_list_tables(self, client, sample_data):
|
|
49
|
-
"""Test listing tables"""
|
|
50
|
-
client.upload_dataframe(sample_data, "test_table")
|
|
51
|
-
tables = client.list_tables()
|
|
52
|
-
assert 'test_table' in tables
|
|
53
|
-
|
|
54
|
-
def test_execute_sql(self, client, sample_data):
|
|
55
|
-
"""Test direct SQL execution"""
|
|
56
|
-
client.upload_dataframe(sample_data, "test_table")
|
|
57
|
-
result = client.execute_sql("SELECT * FROM test_table")
|
|
58
|
-
assert result['status'] == 'success'
|
|
59
|
-
assert len(result['results']) == 3
|
|
60
|
-
|
|
61
|
-
def test_get_table_info(self, client, sample_data):
|
|
62
|
-
"""Test getting table information"""
|
|
63
|
-
client.upload_dataframe(sample_data, "test_table")
|
|
64
|
-
info = client.get_table_info("test_table")
|
|
65
|
-
assert info['table_name'] == 'test_table'
|
|
66
|
-
assert len(info['columns']) > 0
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
def test_import():
|
|
70
|
-
"""Test that the library can be imported"""
|
|
71
|
-
from sutra import SutraClient
|
|
72
|
-
assert SutraClient is not None
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
if __name__ == "__main__":
|
|
76
|
-
pytest.main([__file__, "-v"])
|
utils/__init__.py
DELETED
|
File without changes
|
utils/file_utils.py
DELETED
|
File without changes
|
utils/text_utils.py
DELETED
|
File without changes
|
|
File without changes
|