QuerySUTRA: querysutra-0.5.3-py3-none-any.whl → querysutra-0.6.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sutra/sutra.py CHANGED
@@ -1,5 +1,5 @@
-"""QuerySUTRA v0.5.2 - FIXED: Smart table selection"""
-__version__ = "0.5.2"
+"""QuerySUTRA v0.6.1 - AI-powered data analysis for structured and unstructured data"""
+__version__ = "0.6.1"
 __author__ = "Aditya Batta"
 __all__ = ["SUTRA", "QueryResult"]
 
@@ -72,7 +72,7 @@ class SUTRA:
             pass
 
         self._refresh_schema()
-        print(f"QuerySUTRA v0.5.2 Ready")
+        print(f"QuerySUTRA v{__version__} Ready")
 
     def upload(self, data: Union[str, pd.DataFrame], name: Optional[str] = None) -> 'SUTRA':
         """Upload."""
@@ -136,6 +136,8 @@ class SUTRA:
                     rec['id'] = idx
                 self._store(pd.DataFrame(recs), f"{name}_{etype}")
                 print(f" {etype}: {len(recs)} rows")
+            # After all tables are created, detect and store foreign key relationships
+            self._create_foreign_keys()
             return
 
         print("Using regex fallback...")
@@ -253,8 +255,17 @@ JSON:"""
253
255
  self._store(pd.DataFrame({'line': range(1, len(lines)+1), 'text': lines}), name)
254
256
 
255
257
  def _store(self, df: pd.DataFrame, name: str):
256
- """Store."""
258
+ """Store. Flattens any list/dict values to strings for SQLite compatibility."""
257
259
  df.columns = [str(c).strip().replace(" ", "_").replace("-", "_") for c in df.columns]
260
+
261
+ # Flatten list/dict values to strings — SQLite can't store Python objects
262
+ for col in df.columns:
263
+ df[col] = df[col].apply(
264
+ lambda x: ', '.join(str(i) for i in x) if isinstance(x, list)
265
+ else json.dumps(x) if isinstance(x, dict)
266
+ else x
267
+ )
268
+
258
269
  try:
259
270
  df.to_sql(name, self.conn, if_exists='replace', index=False, method='multi', chunksize=500)
260
271
  except:
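
The flattening step added to _store() can be exercised on its own. Here is a self-contained sketch, using made-up sample data, showing how list and dict cells are converted before to_sql() runs:

import json
import pandas as pd

# Made-up sample: one list-valued and one dict-valued column.
df = pd.DataFrame({
    "name": ["Alice"],
    "skills": [["Python", "SQL"]],
    "meta": [{"level": "senior"}],
})

# Same transform as in _store(): lists become comma-separated strings,
# dicts become JSON strings, everything else passes through unchanged.
for col in df.columns:
    df[col] = df[col].apply(
        lambda x: ', '.join(str(i) for i in x) if isinstance(x, list)
        else json.dumps(x) if isinstance(x, dict)
        else x
    )

print(df.iloc[0]["skills"])  # Python, SQL
print(df.iloc[0]["meta"])    # {"level": "senior"}
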
@@ -264,6 +275,55 @@ JSON:"""
264
275
  self._refresh_schema()
265
276
  print(f" {name}: {len(df)} rows")
266
277
 
278
+ def _create_foreign_keys(self, silent=False):
279
+ """Detect foreign key relationships between tables by matching column naming patterns.
280
+ e.g., 'person_id' in work_experience -> 'id' in people table."""
281
+ tables = self._get_tables()
282
+
283
+ # Build a map of potential parent tables by looking for 'id' columns
284
+ # e.g., employee_data_people has 'id' -> can be referenced as person_id, people_id
285
+ parent_map = {} # Maps potential FK column names -> (parent_table, parent_pk)
286
+ for t in tables:
287
+ self.cursor.execute(f"PRAGMA table_info({t})")
288
+ cols = {r[1]: r[2] for r in self.cursor.fetchall()}
289
+ if 'id' in cols:
290
+ # Generate possible FK names from table name
291
+ # e.g., 'employee_data_people' -> 'person_id', 'people_id'
292
+ parts = t.split('_')
293
+ for part in parts:
294
+ # singular form guesses
295
+ fk_name = f"{part}_id"
296
+ parent_map[fk_name] = (t, 'id')
297
+ # Handle plural -> singular (people -> person)
298
+ if part.endswith('ies'):
299
+ parent_map[f"{part[:-3]}y_id"] = (t, 'id')
300
+ elif part.endswith('es'):
301
+ parent_map[f"{part[:-2]}_id"] = (t, 'id')
302
+ elif part.endswith('s'):
303
+ parent_map[f"{part[:-1]}_id"] = (t, 'id')
304
+ # Also try full table name as FK
305
+ parent_map[f"{t}_id"] = (t, 'id')
306
+
307
+ # Now scan all tables for columns matching FK patterns
308
+ self.foreign_keys = {} # table -> [(fk_col, parent_table, parent_col)]
309
+ for t in tables:
310
+ self.cursor.execute(f"PRAGMA table_info({t})")
311
+ cols = [r[1] for r in self.cursor.fetchall()]
312
+ fks = []
313
+ for col in cols:
314
+ if col in parent_map:
315
+ parent_table, parent_col = parent_map[col]
316
+ if parent_table != t: # Don't self-reference
317
+ fks.append((col, parent_table, parent_col))
318
+ if fks:
319
+ self.foreign_keys[t] = fks
320
+
321
+ if self.foreign_keys and not silent:
322
+ print(f"\n🔗 Detected relationships:")
323
+ for t, fks in self.foreign_keys.items():
324
+ for fk_col, parent_table, parent_col in fks:
325
+ print(f" {t}.{fk_col} → {parent_table}.{parent_col}")
326
+
267
327
  def ask(self, q: str, viz: Union[bool, str] = False, table: Optional[str] = None) -> 'QueryResult':
268
328
  """
269
329
  Query - FIXED: Considers ALL tables, picks best one or joins multiple.
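
The heart of _create_foreign_keys() is its column-name heuristic. The standalone sketch below reproduces just the candidate-name generation for a made-up table name (the helper candidate_fk_names is hypothetical, not part of the package). Note that the suffix rules handle regular plurals, but an irregular plural like "people" falls through all of them, so only people_id, not person_id, is generated, despite the docstring's example:

def candidate_fk_names(table: str) -> list:
    """Reproduce the FK-name guesses from _create_foreign_keys()."""
    names = []
    for part in table.split('_'):
        names.append(f"{part}_id")
        if part.endswith('ies'):      # companies -> company_id
            names.append(f"{part[:-3]}y_id")
        elif part.endswith('es'):     # branches -> branch_id
            names.append(f"{part[:-2]}_id")
        elif part.endswith('s'):      # skills -> skill_id
            names.append(f"{part[:-1]}_id")
    names.append(f"{table}_id")       # full table name as a FK candidate
    return names

print(candidate_fk_names("employee_data_people"))
# ['employee_id', 'data_id', 'people_id', 'employee_data_people_id']
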
@@ -273,6 +333,10 @@ JSON:"""
273
333
 
274
334
  print(f"\nQuestion: {q}")
275
335
 
336
+ # Ensure foreign key relationships are detected
337
+ if not hasattr(self, 'foreign_keys') or not self.foreign_keys:
338
+ self._create_foreign_keys(silent=True)
339
+
276
340
  # FIXED: If no table specified, let AI pick the right one(s)
277
341
  if not table:
278
342
  # Get ALL table schemas
@@ -308,6 +372,24 @@ JSON:"""
308
372
  print(f"Error: {e}")
309
373
  return QueryResult(False, sql, pd.DataFrame(), None, str(e))
310
374
 
375
+ def _get_relationship_context(self) -> str:
376
+ """Build a clear relationship context string for the AI prompt."""
377
+ if not hasattr(self, 'foreign_keys') or not self.foreign_keys:
378
+ # Try to detect relationships if not already done
379
+ self._create_foreign_keys(silent=True)
380
+
381
+ if not hasattr(self, 'foreign_keys') or not self.foreign_keys:
382
+ return ""
383
+
384
+ lines = ["\n=== TABLE RELATIONSHIPS (FOREIGN KEYS) ==="]
385
+ lines.append("Use these to JOIN tables when a question needs data from multiple tables:")
386
+ for t, fks in self.foreign_keys.items():
387
+ for fk_col, parent_table, parent_col in fks:
388
+ lines.append(f" {t}.{fk_col} → {parent_table}.{parent_col}")
389
+ lines.append(f" JOIN syntax: JOIN {parent_table} ON {t}.{fk_col} = {parent_table}.{parent_col}")
390
+ lines.append("=" * 50)
391
+ return "\n".join(lines)
392
+
311
393
  def _gen_sql_smart(self, q: str, all_schemas: Dict) -> str:
312
394
  """
313
395
  FIXED: Generate SQL considering ALL tables and their relationships.
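
For a detected relationship such as work_experience.person_id → people.id, the string returned by _get_relationship_context() and spliced into the prompt would look roughly like this (illustrative output; the table names are assumptions):

=== TABLE RELATIONSHIPS (FOREIGN KEYS) ===
Use these to JOIN tables when a question needs data from multiple tables:
 work_experience.person_id → people.id
 JOIN syntax: JOIN people ON work_experience.person_id = people.id
==================================================
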
@@ -318,35 +400,70 @@ JSON:"""
318
400
  schema_context += f"\n{tbl} ({info['row_count']} rows):\n"
319
401
  schema_context += f" Columns: {', '.join(info['columns'])}\n"
320
402
 
321
- # Add sample data from key tables
403
+ # Add relationship context
404
+ relationship_context = self._get_relationship_context()
405
+
406
+ # Add sample data from ALL tables (not just first 3)
322
407
  samples = ""
323
- for tbl in list(all_schemas.keys())[:3]: # First 3 tables
408
+ for tbl in list(all_schemas.keys())[:6]: # Show more tables
324
409
  try:
325
410
  sample_df = pd.read_sql_query(f"SELECT * FROM {tbl} LIMIT 2", self.conn)
326
411
  samples += f"\nSample from {tbl}:\n{sample_df.to_string(index=False)}\n"
327
412
  except:
328
413
  pass
329
414
 
330
- prompt = f"""You are an SQL expert. Generate a query for this question.
415
+ prompt = f"""You are an expert SQL query generator.
331
416
 
332
417
  {schema_context}
333
-
418
+ {relationship_context}
334
419
  {samples}
335
420
 
336
421
  Question: {q}
337
422
 
338
- Rules:
339
- 1. Use JOIN if question needs data from multiple tables
340
- 2. If asking about "employee" or "person" info, always include employee_data_people table
341
- 3. Use proper foreign key relationships (person_id references people.id)
342
- 4. Return employee names/info when asked "which employee" or "who"
423
+ CRITICAL INSTRUCTIONS - FOLLOW THESE STEPS:
424
+
425
+ STEP 1: READ THE TABLE RELATIONSHIPS SECTION ABOVE.
426
+ Those show you exactly how tables connect via foreign keys.
427
+
428
+ STEP 2: IDENTIFY WHICH TABLES HAVE THE DATA NEEDED.
429
+ - Person info (name, email, city, state) → look in *_people table
430
+ - Work info (company, position, start_date) → look in *_work_experience table
431
+ - Skills, education, etc. → look in their respective tables
432
+
433
+ STEP 3: IF THE QUESTION NEEDS DATA FROM MULTIPLE TABLES, YOU MUST USE JOIN.
434
+ Use the foreign key relationships shown above.
435
+ Example: If work_experience has person_id and people has id:
436
+ JOIN people ON work_experience.person_id = people.id
437
+
438
+ STEP 4: WRITE THE QUERY.
439
+ - Use table aliases for readability
440
+ - Qualify ALL column names with table alias to avoid ambiguity
441
+ - For "who" / "which person" questions, ALWAYS join to the people table to get names
442
+ - For "from <state>" or "in <city>" questions, the location is in the people table, JOIN to it
443
+ - For "count by state" or "group by state", the state column is in the people table, JOIN to it
444
+
445
+ EXAMPLES:
446
+ ❌ WRONG: SELECT COUNT(*) FROM work_experience GROUP BY company
447
+ (when asked "count by state" - state is NOT in work_experience!)
448
+
449
+ ✅ CORRECT: SELECT p.state, COUNT(*) as employee_count
450
+ FROM work_experience w
451
+ JOIN people p ON w.person_id = p.id
452
+ GROUP BY p.state
453
+
454
+ ❌ WRONG: SELECT * FROM work_experience WHERE company LIKE '%FL%'
455
+ (when asked "how many from FL" - FL is a state, not a company!)
456
+
457
+ ✅ CORRECT: SELECT COUNT(*) as count
458
+ FROM people p
459
+ WHERE p.state = 'FL'
343
460
 
344
- Return ONLY the SQL query, no explanations:"""
461
+ Return ONLY the executable SQL query. No explanations, no markdown, no code blocks:"""
345
462
 
346
463
  r = self.client.chat.completions.create(
347
464
  model="gpt-4o-mini",
348
465
  messages=[
349
- {"role": "system", "content": "SQL expert. Generate queries using proper JOINs. Return only SQL."},
466
+ {"role": "system", "content": "You are an expert SQL query generator. ALWAYS use JOIN when data is spread across multiple tables. ALWAYS check which table a column belongs to before using it. State, city, name are typically in people tables. Position, company are in work_experience tables. Return ONLY executable SQL."},
350
467
  {"role": "user", "content": prompt}
351
468
  ],
352
469
  temperature=0
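
Put together, the intended call pattern for the chainable API in this file is roughly the following. The constructor arguments and how results are read back are assumptions; this diff only shows the upload() and ask() signatures:

from sutra import SUTRA

s = SUTRA()                                        # constructor args are not shown in this diff
s.upload("employees.csv", name="employee_data")    # upload() returns self, so calls can chain
result = s.ask("How many employees are from FL?")  # returns a QueryResult
print(result)  # QueryResult is constructed with a success flag, the SQL, and a DataFrame
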
@@ -562,7 +679,7 @@ Return ONLY the SQL query, no explanations:"""
         return [r[0] for r in self.cursor.fetchall()]
 
     def _refresh_schema(self):
-        """Refresh."""
+        """Refresh schema info."""
         self.schema_info = {}
         for t in self._get_tables():
             self.cursor.execute(f"PRAGMA table_info({t})")
querysutra-0.5.3.dist-info/RECORD DELETED
@@ -1,28 +0,0 @@
-querysutra-0.5.3.dist-info/licenses/LICENSE,sha256=F-4b93u0OVrVwGXgMwBRq6MlGyUT9zmre1oh5Gft5Ts,1066
-sutra/__init__.py,sha256=25HUMETpmA1tlMl5j-ajdo9MRXljSZBrirSTH7w7jIc,118
-sutra/cache_manager.py,sha256=e0AAeUqoR-aiqzZ3fB-IDvpJ4JA6-YBFyRJxusEnIrA,3082
-sutra/clear_cache.py,sha256=rVIz29p7V11Uh6oHXeaWpFtYXXv-2OED91cHMAWWxtQ,187
-sutra/core.py,sha256=R_JbOlZTukegP92Dr-WLsdr632_otFN7o9qSvcxyBtw,10497
-sutra/data_loader.py,sha256=_yPj-DS2qYtlCgaMACQtfXZfSuAdVVd4igNP7yzXolc,5781
-sutra/database_manager.py,sha256=L-QC_WwR3Pnl1BRh0rnEv5MNSTr4C7ZP-hIPfCHRK88,7672
-sutra/direct_query.py,sha256=X69I646zHIZlZjMmgn8O2xLS_7ww7miAkABTnJEPAAc,2724
-sutra/feedback.py,sha256=PHSffU_rfORjLkTW3-j2VSjQdw4ufROsTeBWaX6DZ00,1642
-sutra/feedback_matcher.py,sha256=WXYpGtFJnOyYQOzy-z8uBiUWH5vyJJOMS1NwEYzNfic,2865
-sutra/nlp_processor.py,sha256=wMS1hz1aGWjSwPUD7lSNBbQapFtLgF2l65j0QKXQOd0,5461
-sutra/schema_embeddings.py,sha256=bVPzpJOdYTyUdG2k3ZdgYJLrX2opHBx68RIjJcMlueo,9732
-sutra/schema_generator.py,sha256=BX_vXmnvSGc6nCBx40WLSoNL3WIYPDahd1cEYloyY4M,1925
-sutra/sutra.py,sha256=61juV3zlMau4UZJ-5IxjaN-Bc1XBP8w2vkYfum-aXlY,21979
-sutra/sutra_client.py,sha256=PYYDGqVbA9pB-Zcsm52i9KarwijCIGVZOThgONZP6Vs,14203
-sutra/sutra_core.py,sha256=diaWOXUHn1wrqCQrBhLKL612tMQioaqx-ILc3y9-CqM,11708
-sutra/sutra_simple.py,sha256=rnqzG7OAt4p64XtO0peMqHS1pG5tdA8U3EYTMVsq7BE,23201
-sutra/visualizer.py,sha256=YOKTmjQcY72smmx9KsZrQTdbAiE5GQDKofMFjpLIUfI,6996
-tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-tests/test_modules.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-tests/test_sutra.py,sha256=6Z4SoIuBzza101304I7plkyPVkUBbjIxR8uPs9z5ntg,2383
-utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-utils/file_utils.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-utils/text_utils.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-querysutra-0.5.3.dist-info/METADATA,sha256=yFffBSYGfbLrYnXA7OFGHk1mO37fpUV-0iglmHXbAVQ,7258
-querysutra-0.5.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-querysutra-0.5.3.dist-info/top_level.txt,sha256=9v0buw21eo5LaUU_3Cf9b9MqRyEvtM9cHaOuEXUKVqM,18
-querysutra-0.5.3.dist-info/RECORD,,
querysutra-0.5.3.dist-info/top_level.txt DELETED
@@ -1,3 +0,0 @@
-sutra
-tests
-utils
tests/__init__.py DELETED
File without changes
tests/test_modules.py DELETED
File without changes
tests/test_sutra.py DELETED
@@ -1,76 +0,0 @@
-"""
-Test suite for SUTRA library
-Run with: pytest test_sutra.py
-"""
-
-import pytest
-import pandas as pd
-import os
-from sutra import SutraClient
-
-
-class TestSutraClient:
-    """Test cases for SutraClient"""
-
-    @pytest.fixture
-    def client(self):
-        """Create a test client"""
-        # Use a test database
-        api_key = os.getenv('OPENAI_API_KEY', 'test-key')
-        client = SutraClient(api_key=api_key, db_path="test_db.db")
-        yield client
-        # Cleanup
-        client.close()
-        if os.path.exists("test_db.db"):
-            os.remove("test_db.db")
-
-    @pytest.fixture
-    def sample_data(self):
-        """Create sample DataFrame"""
-        return pd.DataFrame({
-            'name': ['Alice', 'Bob', 'Charlie'],
-            'age': [25, 30, 35],
-            'city': ['New York', 'London', 'Paris']
-        })
-
-    def test_client_initialization(self, client):
-        """Test client can be initialized"""
-        assert client is not None
-        assert client.db_path == "test_db.db"
-
-    def test_upload_dataframe(self, client, sample_data):
-        """Test uploading a DataFrame"""
-        result = client.upload_dataframe(sample_data, "test_table")
-        assert result['status'] == 'success'
-        assert result['table_name'] == 'test_table'
-        assert result['rows_inserted'] == 3
-
-    def test_list_tables(self, client, sample_data):
-        """Test listing tables"""
-        client.upload_dataframe(sample_data, "test_table")
-        tables = client.list_tables()
-        assert 'test_table' in tables
-
-    def test_execute_sql(self, client, sample_data):
-        """Test direct SQL execution"""
-        client.upload_dataframe(sample_data, "test_table")
-        result = client.execute_sql("SELECT * FROM test_table")
-        assert result['status'] == 'success'
-        assert len(result['results']) == 3
-
-    def test_get_table_info(self, client, sample_data):
-        """Test getting table information"""
-        client.upload_dataframe(sample_data, "test_table")
-        info = client.get_table_info("test_table")
-        assert info['table_name'] == 'test_table'
-        assert len(info['columns']) > 0
-
-
-def test_import():
-    """Test that the library can be imported"""
-    from sutra import SutraClient
-    assert SutraClient is not None
-
-
-if __name__ == "__main__":
-    pytest.main([__file__, "-v"])
utils/__init__.py DELETED
File without changes
utils/file_utils.py DELETED
File without changes
utils/text_utils.py DELETED
File without changes