QuerySUTRA 0.5.3__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,9 +1,25 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: QuerySUTRA
3
- Version: 0.5.3
4
- Summary: SUTRA
3
+ Version: 0.6.0
4
+ Summary: AI-powered data analysis for structured and unstructured data. Query PDF, Word, CSV, Excel with natural language.
5
5
  Author: Aditya Batta
6
6
  License: MIT
7
+ Project-URL: Homepage, https://github.com/adityabatta/QuerySUTRA
8
+ Project-URL: Repository, https://github.com/adityabatta/QuerySUTRA
9
+ Project-URL: Issues, https://github.com/adityabatta/QuerySUTRA/issues
10
+ Keywords: ai,data-analysis,nlp,sql,pdf,openai,natural-language,query,database
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.8
17
+ Classifier: Programming Language :: Python :: 3.9
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
+ Classifier: Topic :: Database
7
23
  Requires-Python: >=3.8
8
24
  Description-Content-Type: text/markdown
9
25
  License-File: LICENSE
@@ -0,0 +1,22 @@
1
+ querysutra-0.6.0.dist-info/licenses/LICENSE,sha256=F-4b93u0OVrVwGXgMwBRq6MlGyUT9zmre1oh5Gft5Ts,1066
2
+ sutra/__init__.py,sha256=rRSfC1jjMvi8-LpP0P6dpMVx0xC_HnxsgisPr4WPkGM,200
3
+ sutra/cache_manager.py,sha256=e0AAeUqoR-aiqzZ3fB-IDvpJ4JA6-YBFyRJxusEnIrA,3082
4
+ sutra/clear_cache.py,sha256=rVIz29p7V11Uh6oHXeaWpFtYXXv-2OED91cHMAWWxtQ,187
5
+ sutra/core.py,sha256=R_JbOlZTukegP92Dr-WLsdr632_otFN7o9qSvcxyBtw,10497
6
+ sutra/data_loader.py,sha256=_yPj-DS2qYtlCgaMACQtfXZfSuAdVVd4igNP7yzXolc,5781
7
+ sutra/database_manager.py,sha256=usnQTOnfjyFwpcaczG3eF-Pg0snIUeqzHl4rwsd_9rA,9150
8
+ sutra/direct_query.py,sha256=X69I646zHIZlZjMmgn8O2xLS_7ww7miAkABTnJEPAAc,2724
9
+ sutra/feedback.py,sha256=PHSffU_rfORjLkTW3-j2VSjQdw4ufROsTeBWaX6DZ00,1642
10
+ sutra/feedback_matcher.py,sha256=WXYpGtFJnOyYQOzy-z8uBiUWH5vyJJOMS1NwEYzNfic,2865
11
+ sutra/nlp_processor.py,sha256=cvMDvmtf3b2tTbFPItJgF_t541MQqP4SdEXECR1pa0Q,6719
12
+ sutra/schema_embeddings.py,sha256=bVPzpJOdYTyUdG2k3ZdgYJLrX2opHBx68RIjJcMlueo,9732
13
+ sutra/schema_generator.py,sha256=EYEOo7-ljSukTx9Mm2hXhgY-DFCgsaa7RpzDWqVx4K8,2348
14
+ sutra/sutra.py,sha256=73A4HPZVf6jSl5T4ob1vuIbr7CXVnWHP70NfUYbCz-Y,27594
15
+ sutra/sutra_client.py,sha256=PYYDGqVbA9pB-Zcsm52i9KarwijCIGVZOThgONZP6Vs,14203
16
+ sutra/sutra_core.py,sha256=diaWOXUHn1wrqCQrBhLKL612tMQioaqx-ILc3y9-CqM,11708
17
+ sutra/sutra_simple.py,sha256=rnqzG7OAt4p64XtO0peMqHS1pG5tdA8U3EYTMVsq7BE,23201
18
+ sutra/visualizer.py,sha256=YOKTmjQcY72smmx9KsZrQTdbAiE5GQDKofMFjpLIUfI,6996
19
+ querysutra-0.6.0.dist-info/METADATA,sha256=e1PS_Cr8aByv3OYnE2kLxuLTUdsfjSAMNioP5DjYpBk,8252
20
+ querysutra-0.6.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
21
+ querysutra-0.6.0.dist-info/top_level.txt,sha256=tqRK7nxuOJvFTkUn-YahGogCSCkk1ZE90Wf3MgT9BDI,6
22
+ querysutra-0.6.0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (80.9.0)
2
+ Generator: setuptools (80.10.2)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -0,0 +1 @@
1
+ sutra
sutra/__init__.py CHANGED
@@ -1,4 +1,6 @@
1
- """QuerySUTRA v0.5.2"""
2
- __version__="0.5.2"
3
- from sutra.sutra import SUTRA,QueryResult
4
- __all__=["SUTRA","QueryResult"]
1
+ """QuerySUTRA v0.6.0 - AI-powered data analysis for structured and unstructured data"""
2
+ __version__ = "0.6.0"
3
+
4
+ from .sutra import SUTRA, QueryResult
5
+
6
+ __all__ = ["SUTRA", "QueryResult", "__version__"]
sutra/database_manager.py CHANGED
@@ -1,196 +1,236 @@
1
- """Database management for both SQLite and MySQL"""
2
-
3
- import sqlite3
4
- import pandas as pd
5
- from pathlib import Path
6
- from typing import Optional, Tuple, List
7
- from tabulate import tabulate
8
- import config
9
-
10
- # Add MySQL support
11
- try:
12
- import mysql.connector
13
- MYSQL_AVAILABLE = True
14
- except ImportError:
15
- MYSQL_AVAILABLE = False
16
- print("⚠️ MySQL not installed. Run: pip install mysql-connector-python")
17
-
18
- class DatabaseManager:
19
- """Manage database operations (SQLite or MySQL)"""
20
-
21
- """Database management for both SQLite and MySQL"""
22
-
23
- import sqlite3
24
- import pandas as pd
25
- from pathlib import Path
26
- from typing import Optional, Tuple, List
27
- from tabulate import tabulate
28
- import config
29
-
30
- # Add MySQL support
31
- try:
32
- import mysql.connector
33
- MYSQL_AVAILABLE = True
34
- except ImportError:
35
- MYSQL_AVAILABLE = False
36
- print("⚠️ MySQL not installed. Run: pip install mysql-connector-python")
37
-
38
- class DatabaseManager:
39
- """Manage database operations (SQLite or MySQL)"""
40
-
41
- def __init__(self, db_path: str = ':memory:', db_type: str = 'sqlite'): # FIX: Added indentation
42
- self.db_type = db_type.lower()
43
-
44
- if self.db_type == 'mysql':
45
- if not MYSQL_AVAILABLE:
46
- print("❌ MySQL not available, falling back to SQLite")
47
- self.db_type = 'sqlite'
48
- else:
49
- # First connect without database to create it if needed
50
- try:
51
- conn_temp = mysql.connector.connect(
52
- host=config.MYSQL_HOST,
53
- user=config.MYSQL_USER,
54
- password=config.MYSQL_PASSWORD
55
- )
56
- cursor_temp = conn_temp.cursor()
57
- cursor_temp.execute(f"CREATE DATABASE IF NOT EXISTS {config.MYSQL_DATABASE}")
58
- conn_temp.close()
59
- print(f"✅ Database {config.MYSQL_DATABASE} ready")
60
- except Exception as e:
61
- print(f"❌ Could not create database: {e}")
62
-
63
- # Now connect to the database
64
- self.conn = mysql.connector.connect(
65
- host=config.MYSQL_HOST,
66
- user=config.MYSQL_USER,
67
- password=config.MYSQL_PASSWORD,
68
- database=config.MYSQL_DATABASE
69
- )
70
- self.cursor = self.conn.cursor()
71
- print(f"📂 Connected to MySQL: {config.MYSQL_DATABASE}")
72
-
73
- if self.db_type == 'sqlite': # FIX: Added this block for SQLite
74
- self.conn = sqlite3.connect(db_path)
75
- self.cursor = self.conn.cursor()
76
- print(f"📂 SQLite {'created in memory' if db_path == ':memory:' else f'connected: {db_path}'}")
77
-
78
- # Rest of the methods stay the same...
79
-
80
- def execute_schema(self, schema_sql: str) -> bool:
81
- """Execute SQL schema with MySQL compatibility"""
82
- try:
83
- if self.db_type == 'mysql':
84
- # MySQL adjustments
85
- schema_sql = schema_sql.replace('INTEGER PRIMARY KEY AUTOINCREMENT',
86
- 'INT PRIMARY KEY AUTO_INCREMENT')
87
- schema_sql = schema_sql.replace('TEXT', 'VARCHAR(255)')
88
- schema_sql = schema_sql.replace('REAL', 'DECIMAL(10,2)')
89
-
90
- # Execute statements one by one for MySQL
91
- for statement in schema_sql.split(';'):
92
- if statement.strip():
93
- self.cursor.execute(statement)
94
- self.conn.commit()
95
- else:
96
- # SQLite can handle multiple statements
97
- self.cursor.executescript(schema_sql)
98
- self.conn.commit()
99
-
100
- print("✅ Schema executed successfully!")
101
- return True
102
- except Exception as e:
103
- print(f"❌ Error executing schema: {e}")
104
- return False
105
-
106
- def execute_query(self, query: str) -> Optional[pd.DataFrame]:
107
- """Execute query on either database"""
108
- try:
109
- df = pd.read_sql_query(query, self.conn)
110
- return df
111
- except Exception as e:
112
- print(f"❌ Query error: {e}")
113
- return None
114
-
115
- def get_tables(self):
116
- """Get list of all tables in database"""
117
- if self.db_type == 'mysql':
118
- cursor = self.conn.cursor()
119
- cursor.execute("SHOW TABLES")
120
- tables = [table[0] for table in cursor.fetchall()]
121
- cursor.close()
122
- return tables
123
- else: # sqlite
124
- cursor = self.conn.cursor()
125
- cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
126
- tables = [table[0] for table in cursor.fetchall()]
127
- cursor.close()
128
- return tables
129
-
130
- def get_columns(self, table_name):
131
- """Get list of columns for a specific table"""
132
- if self.db_type == 'mysql':
133
- cursor = self.conn.cursor()
134
- cursor.execute(f"SHOW COLUMNS FROM `{table_name}`")
135
- columns = [col[0] for col in cursor.fetchall()]
136
- cursor.close()
137
- return columns
138
- else: # sqlite
139
- cursor = self.conn.cursor()
140
- cursor.execute(f"PRAGMA table_info({table_name})")
141
- columns = [col[1] for col in cursor.fetchall()]
142
- cursor.close()
143
- return columns
144
-
145
- def get_schema_context(self) -> str:
146
- """Get database schema"""
147
- if self.db_type == 'mysql':
148
- tables = self.get_tables()
149
- schema = []
150
- for table in tables:
151
- self.cursor.execute(f"SHOW CREATE TABLE {table}")
152
- schema.append(self.cursor.fetchone()[1])
153
- return '\n'.join(schema)
154
- else:
155
- self.cursor.execute(
156
- "SELECT sql FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%';"
157
- )
158
- return '\n'.join([row[0] for row in self.cursor.fetchall()])
159
-
160
- def display_tables(self): # FIX: Proper indentation - part of class
161
- """Display all tables with their structure and data"""
162
- tables = self.get_tables()
163
- print(f"\n📋 Created {len(tables)} tables:")
164
-
165
- for table in tables:
166
- print(f"\n Table: {table}")
167
-
168
- # Show columns
169
- columns = self.get_table_info(table)
170
- for col in columns:
171
- print(f" - {col[1]} ({col[2]})")
172
-
173
- # Show row count
174
- count = self.get_row_count(table)
175
- print(f" Records: {count}")
176
-
177
- def get_table_info(self, table_name: str) -> List[Tuple]: # FIX: Proper indentation
178
- """Get column information for a table"""
179
- if self.db_type == 'mysql':
180
- self.cursor.execute(f"DESCRIBE {table_name}")
181
- return [(i, row[0], row[1]) for i, row in enumerate(self.cursor.fetchall())]
182
- else:
183
- self.cursor.execute(f"PRAGMA table_info({table_name})")
184
- return self.cursor.fetchall()
185
-
186
- def get_row_count(self, table_name: str) -> int: # FIX: Proper indentation
187
- """Get number of rows in a table"""
188
- self.cursor.execute(f"SELECT COUNT(*) FROM {table_name}")
189
- return self.cursor.fetchone()[0]
190
-
191
- def close(self):
192
- """Close database connection"""
193
- self.conn.close()
194
- print("📂 Database connection closed")
195
-
1
+ """Database management for both SQLite and MySQL"""
2
+
3
+ import sqlite3
4
+ import pandas as pd
5
+ from pathlib import Path
6
+ from typing import Optional, Tuple, List
7
+ from tabulate import tabulate
8
+ import config
9
+
10
+ # Add MySQL support
11
+ try:
12
+ import mysql.connector
13
+ MYSQL_AVAILABLE = True
14
+ except ImportError:
15
+ MYSQL_AVAILABLE = False
16
+ print("⚠️ MySQL not installed. Run: pip install mysql-connector-python")
17
+
18
+ class DatabaseManager:
19
+ """Manage database operations (SQLite or MySQL)"""
20
+
21
+ """Database management for both SQLite and MySQL"""
22
+
23
+ import sqlite3
24
+ import pandas as pd
25
+ from pathlib import Path
26
+ from typing import Optional, Tuple, List
27
+ from tabulate import tabulate
28
+ import config
29
+
30
+ # Add MySQL support
31
+ try:
32
+ import mysql.connector
33
+ MYSQL_AVAILABLE = True
34
+ except ImportError:
35
+ MYSQL_AVAILABLE = False
36
+ print("⚠️ MySQL not installed. Run: pip install mysql-connector-python")
37
+
38
+ class DatabaseManager:
39
+ """Manage database operations (SQLite or MySQL)"""
40
+
41
+ def __init__(self, db_path: str = ':memory:', db_type: str = 'sqlite'): # FIX: Added indentation
42
+ self.db_type = db_type.lower()
43
+
44
+ if self.db_type == 'mysql':
45
+ if not MYSQL_AVAILABLE:
46
+ print("❌ MySQL not available, falling back to SQLite")
47
+ self.db_type = 'sqlite'
48
+ else:
49
+ # First connect without database to create it if needed
50
+ try:
51
+ conn_temp = mysql.connector.connect(
52
+ host=config.MYSQL_HOST,
53
+ user=config.MYSQL_USER,
54
+ password=config.MYSQL_PASSWORD
55
+ )
56
+ cursor_temp = conn_temp.cursor()
57
+ cursor_temp.execute(f"CREATE DATABASE IF NOT EXISTS {config.MYSQL_DATABASE}")
58
+ conn_temp.close()
59
+ print(f"✅ Database {config.MYSQL_DATABASE} ready")
60
+ except Exception as e:
61
+ print(f"❌ Could not create database: {e}")
62
+
63
+ # Now connect to the database
64
+ self.conn = mysql.connector.connect(
65
+ host=config.MYSQL_HOST,
66
+ user=config.MYSQL_USER,
67
+ password=config.MYSQL_PASSWORD,
68
+ database=config.MYSQL_DATABASE
69
+ )
70
+ self.cursor = self.conn.cursor()
71
+ print(f"📂 Connected to MySQL: {config.MYSQL_DATABASE}")
72
+
73
+ if self.db_type == 'sqlite': # FIX: Added this block for SQLite
74
+ self.conn = sqlite3.connect(db_path)
75
+ self.cursor = self.conn.cursor()
76
+ print(f"📂 SQLite {'created in memory' if db_path == ':memory:' else f'connected: {db_path}'}")
77
+
78
+ # Rest of the methods stay the same...
79
+
80
+ def execute_schema(self, schema_sql: str) -> bool:
81
+ """Execute SQL schema with MySQL compatibility"""
82
+ try:
83
+ if self.db_type == 'mysql':
84
+ # MySQL adjustments
85
+ schema_sql = schema_sql.replace('INTEGER PRIMARY KEY AUTOINCREMENT',
86
+ 'INT PRIMARY KEY AUTO_INCREMENT')
87
+ schema_sql = schema_sql.replace('TEXT', 'VARCHAR(255)')
88
+ schema_sql = schema_sql.replace('REAL', 'DECIMAL(10,2)')
89
+
90
+ # Execute statements one by one for MySQL
91
+ for statement in schema_sql.split(';'):
92
+ if statement.strip():
93
+ self.cursor.execute(statement)
94
+ self.conn.commit()
95
+ else:
96
+ # SQLite can handle multiple statements
97
+ self.cursor.executescript(schema_sql)
98
+ self.conn.commit()
99
+
100
+ print("✅ Schema executed successfully!")
101
+ return True
102
+ except Exception as e:
103
+ print(f"❌ Error executing schema: {e}")
104
+ return False
105
+
106
+ def execute_query(self, query: str) -> Optional[pd.DataFrame]:
107
+ """Execute query on either database"""
108
+ try:
109
+ df = pd.read_sql_query(query, self.conn)
110
+ return df
111
+ except Exception as e:
112
+ print(f"❌ Query error: {e}")
113
+ return None
114
+
115
+ def get_tables(self):
116
+ """Get list of all tables in database"""
117
+ if self.db_type == 'mysql':
118
+ cursor = self.conn.cursor()
119
+ cursor.execute("SHOW TABLES")
120
+ tables = [table[0] for table in cursor.fetchall()]
121
+ cursor.close()
122
+ return tables
123
+ else: # sqlite
124
+ cursor = self.conn.cursor()
125
+ cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
126
+ tables = [table[0] for table in cursor.fetchall()]
127
+ cursor.close()
128
+ return tables
129
+
130
+ def get_columns(self, table_name):
131
+ """Get list of columns for a specific table"""
132
+ if self.db_type == 'mysql':
133
+ cursor = self.conn.cursor()
134
+ cursor.execute(f"SHOW COLUMNS FROM `{table_name}`")
135
+ columns = [col[0] for col in cursor.fetchall()]
136
+ cursor.close()
137
+ return columns
138
+ else: # sqlite
139
+ cursor = self.conn.cursor()
140
+ cursor.execute(f"PRAGMA table_info({table_name})")
141
+ columns = [col[1] for col in cursor.fetchall()]
142
+ cursor.close()
143
+ return columns
144
+
145
+ def get_schema_context(self) -> str:
146
+ """Get database schema with relationship information"""
147
+ if self.db_type == 'mysql':
148
+ tables = self.get_tables()
149
+ schema = []
150
+ for table in tables:
151
+ self.cursor.execute(f"SHOW CREATE TABLE {table}")
152
+ schema.append(self.cursor.fetchone()[1])
153
+ schema_text = '\n'.join(schema)
154
+ else:
155
+ self.cursor.execute(
156
+ "SELECT sql FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%';"
157
+ )
158
+ schema_text = '\n'.join([row[0] for row in self.cursor.fetchall()])
159
+
160
+ # Add relationship summary at the beginning
161
+ relationships = self._extract_relationships(schema_text)
162
+ if relationships:
163
+ relationship_summary = "\n=== TABLE RELATIONSHIPS ===\n" + "\n".join(relationships) + "\n\n=== FULL SCHEMA ===\n"
164
+ return relationship_summary + schema_text
165
+ return schema_text
166
+
167
+ def _extract_relationships(self, schema_text: str) -> List[str]:
168
+ """Extract and format foreign key relationships from schema"""
169
+ import re
170
+ relationships = []
171
+
172
+ # Pattern to match FOREIGN KEY statements
173
+ fk_pattern = r'FOREIGN KEY\s*\(([^)]+)\)\s*REFERENCES\s+(\w+)\s*\(([^)]+)\)'
174
+
175
+ # Split schema into individual table definitions
176
+ tables = schema_text.split('CREATE TABLE')
177
+
178
+ for table_def in tables:
179
+ if not table_def.strip():
180
+ continue
181
+
182
+ # Extract table name
183
+ table_match = re.search(r'[`"]?(\w+)[`"]?', table_def)
184
+ if not table_match:
185
+ continue
186
+ table_name = table_match.group(1)
187
+
188
+ # Find all foreign keys in this table
189
+ for match in re.finditer(fk_pattern, table_def, re.IGNORECASE):
190
+ fk_column = match.group(1).strip('`" ')
191
+ ref_table = match.group(2).strip('`" ')
192
+ ref_column = match.group(3).strip('`" ')
193
+
194
+ relationships.append(
195
+ f" {table_name}.{fk_column} → {ref_table}.{ref_column}"
196
+ )
197
+
198
+ return relationships
199
+
200
+ def display_tables(self): # FIX: Proper indentation - part of class
201
+ """Display all tables with their structure and data"""
202
+ tables = self.get_tables()
203
+ print(f"\n📋 Created {len(tables)} tables:")
204
+
205
+ for table in tables:
206
+ print(f"\n Table: {table}")
207
+
208
+ # Show columns
209
+ columns = self.get_table_info(table)
210
+ for col in columns:
211
+ print(f" - {col[1]} ({col[2]})")
212
+
213
+ # Show row count
214
+ count = self.get_row_count(table)
215
+ print(f" Records: {count}")
216
+
217
+ def get_table_info(self, table_name: str) -> List[Tuple]: # FIX: Proper indentation
218
+ """Get column information for a table"""
219
+ if self.db_type == 'mysql':
220
+ self.cursor.execute(f"DESCRIBE {table_name}")
221
+ return [(i, row[0], row[1]) for i, row in enumerate(self.cursor.fetchall())]
222
+ else:
223
+ self.cursor.execute(f"PRAGMA table_info({table_name})")
224
+ return self.cursor.fetchall()
225
+
226
+ def get_row_count(self, table_name: str) -> int: # FIX: Proper indentation
227
+ """Get number of rows in a table"""
228
+ self.cursor.execute(f"SELECT COUNT(*) FROM {table_name}")
229
+ return self.cursor.fetchone()[0]
230
+
231
+ def close(self):
232
+ """Close database connection"""
233
+ self.conn.close()
234
+ print("📂 Database connection closed")
235
+
196
236
 
sutra/nlp_processor.py CHANGED
@@ -1,144 +1,176 @@
1
- """NLP to SQL query processor with relevancy checking"""
2
-
3
- import pandas as pd
4
- from typing import Optional, Tuple
5
- from tabulate import tabulate
6
- from sutra.cache_manager import CacheManager
7
- import openai
8
- import config
9
- from sutra.feedback import SimpleFeedback
10
- from sutra.schema_embeddings import SchemaEmbeddings
11
- from sutra.feedback_matcher import FeedbackMatcher
12
-
13
- class NLPProcessor:
14
- """Process natural language questions to SQL queries"""
15
-
16
- def __init__(self, db_manager, openai_client=None):
17
- self.db = db_manager
18
- self.cache = CacheManager() if config.CACHE_ENABLED else None
19
- self.model_name = config.MODEL_NAME
20
-
21
- # Set the API key directly for openai 0.28.1
22
- openai.api_key = config.OPENAI_API_KEY
23
-
24
- # Added for feedback handling and tracking
25
- self.feedback = SimpleFeedback()
26
- self.last_question = None
27
- self.last_sql = None
28
-
29
- # ✅ NEW: Auto-load schema embeddings
30
- self.relevancy_checker = SchemaEmbeddings(db_manager)
31
-
32
- # ✅ NEW: Smart feedback matcher
33
- self.feedback_matcher = FeedbackMatcher()
34
-
35
- def nlp_to_sql(self, question: str) -> str:
36
- """Convert natural language question to SQL"""
37
-
38
- # ✅ NEW: Check feedback for similar queries first
39
- similar_sql, similarity = self.feedback_matcher.find_similar_query(question)
40
- if similar_sql:
41
- print(f"🎯 Found similar query in feedback (similarity: {similarity:.2f})")
42
- return similar_sql
43
-
44
- # Check cache next
45
- if self.cache:
46
- cached_sql = self.cache.get_cached_query(question)
47
- if cached_sql:
48
- print("⚡ Using cached query")
49
- return cached_sql
50
-
51
- # Only call API if no feedback match and no cache
52
- print("🤖 Calling OpenAI API...")
53
-
54
- # Get schema context
55
- schema = self.db.get_schema_context()
56
-
57
- prompt = f"""
58
- Convert this question to a SQLite query:
59
-
60
- Question: {question}
61
-
62
- Database schema:
63
- {schema}
64
-
65
- Return ONLY the SELECT statement. No explanations, no markdown.
66
- """
67
-
68
- # Use openai.ChatCompletion directly for version 0.28.1
69
- response = openai.ChatCompletion.create(
70
- model=self.model_name,
71
- messages=[{"role": "user", "content": prompt}],
72
- temperature=0
73
- )
74
-
75
- sql_query = response['choices'][0]['message']['content'].strip()
76
- sql_query = sql_query.replace('```sql', '').replace('```', '').strip()
77
-
78
- # Cache the result
79
- if self.cache:
80
- self.cache.add_to_cache(question, sql_query)
81
-
82
- return sql_query
83
-
84
- def process_question(self, question: str) -> Tuple[Optional[pd.DataFrame], str]:
85
- """Process a natural language question and return results"""
86
-
87
- # ✅ NEW: Check relevancy FIRST - BEFORE any API calls
88
- is_relevant, similarity, info = self.relevancy_checker.is_relevant(question)
89
-
90
- if not is_relevant:
91
- print(f"\n❌ Question not relevant to database (similarity: {similarity:.2f})")
92
- for item in info:
93
- print(f" {item}")
94
- return None, ""
95
-
96
- print(f"✅ Relevant question (similarity: {similarity:.2f})")
97
-
98
- try:
99
- # Convert to SQL (only if relevant)
100
- sql_query = self.nlp_to_sql(question)
101
- print(f"\n🔍 Generated SQL Query:")
102
- print(f" {sql_query}")
103
-
104
- # Track for feedback
105
- self.last_question = question
106
- self.last_sql = sql_query
107
-
108
- # Execute query
109
- result_df = self.db.execute_query(sql_query)
110
-
111
- return result_df, sql_query
112
-
113
- except Exception as e:
114
- print(f"❌ Error processing question: {e}")
115
- return None, ""
116
-
117
- def display_results(self, df: pd.DataFrame, max_rows: int = 15):
118
- """Display query results in a formatted table"""
119
- if df is None or df.empty:
120
- print(" No results found")
121
- return # Exit early if no results
122
-
123
- # Show the table
124
- display_df = df.head(max_rows) if len(df) > max_rows else df
125
- print(tabulate(display_df, headers='keys', tablefmt='grid', showindex=False))
126
-
127
- if len(df) > max_rows:
128
- print(f" ... showing first {max_rows} of {len(df)} rows")
129
-
130
- # ✅ UPDATED: Only ask for feedback for relevant questions with results
131
- # (Irrelevant questions never reach here due to early return)
132
- feedback = input("\n👍 or 👎? (y/n): ").lower()
133
- if feedback == 'y':
134
- self.feedback.save(self.last_question, self.last_sql, True)
135
- print("✅ Saved as good")
136
- # Reload feedback matcher with new data
137
- self.feedback_matcher.reload_feedback()
138
- elif feedback == 'n':
139
- correct = input("Correct SQL: ").strip()
140
- self.feedback.save(self.last_question, self.last_sql, False, correct)
141
- if correct:
142
- print("✅ Learned correction")
143
- # Reload feedback matcher with new data
1
+ """NLP to SQL query processor with relevancy checking"""
2
+
3
+ import pandas as pd
4
+ from typing import Optional, Tuple
5
+ from tabulate import tabulate
6
+ from sutra.cache_manager import CacheManager
7
+ import openai
8
+ import config
9
+ from sutra.feedback import SimpleFeedback
10
+ from sutra.schema_embeddings import SchemaEmbeddings
11
+ from sutra.feedback_matcher import FeedbackMatcher
12
+
13
+ class NLPProcessor:
14
+ """Process natural language questions to SQL queries"""
15
+
16
+ def __init__(self, db_manager, openai_client=None):
17
+ self.db = db_manager
18
+ self.cache = CacheManager() if config.CACHE_ENABLED else None
19
+ self.model_name = config.MODEL_NAME
20
+
21
+ # Set the API key directly for openai 0.28.1
22
+ openai.api_key = config.OPENAI_API_KEY
23
+
24
+ # Added for feedback handling and tracking
25
+ self.feedback = SimpleFeedback()
26
+ self.last_question = None
27
+ self.last_sql = None
28
+
29
+ # ✅ NEW: Auto-load schema embeddings
30
+ self.relevancy_checker = SchemaEmbeddings(db_manager)
31
+
32
+ # ✅ NEW: Smart feedback matcher
33
+ self.feedback_matcher = FeedbackMatcher()
34
+
35
+ def nlp_to_sql(self, question: str) -> str:
36
+ """Convert natural language question to SQL"""
37
+
38
+ # ✅ NEW: Check feedback for similar queries first
39
+ similar_sql, similarity = self.feedback_matcher.find_similar_query(question)
40
+ if similar_sql:
41
+ print(f"🎯 Found similar query in feedback (similarity: {similarity:.2f})")
42
+ return similar_sql
43
+
44
+ # Check cache next
45
+ if self.cache:
46
+ cached_sql = self.cache.get_cached_query(question)
47
+ if cached_sql:
48
+ print("⚡ Using cached query")
49
+ return cached_sql
50
+
51
+ # Only call API if no feedback match and no cache
52
+ print("🤖 Calling OpenAI API...")
53
+
54
+ # Get schema context
55
+ schema = self.db.get_schema_context()
56
+
57
+ prompt = f"""
58
+ Convert this question to a SQLite query.
59
+
60
+ Question: {question}
61
+
62
+ Database schema:
63
+ {schema}
64
+
65
+ CRITICAL INSTRUCTIONS FOR MULTI-TABLE QUERIES:
66
+
67
+ **STEP 1: CHECK TABLE RELATIONSHIPS FIRST**
68
+ Look at the "=== TABLE RELATIONSHIPS ===" section at the top of the schema.
69
+ These show you exactly how tables are connected via foreign keys.
70
+ Format: table1.column → table2.column means table1.column references table2.column
71
+
72
+ **STEP 2: IDENTIFY REQUIRED TABLES**
73
+ Analyze which tables contain the data needed to answer the question.
74
+ If information is spread across multiple tables, you MUST join them.
75
+
76
+ **STEP 3: USE THE RELATIONSHIPS TO JOIN**
77
+ When you need data from multiple tables:
78
+ - Use the foreign key relationships shown in the TABLE RELATIONSHIPS section
79
+ - Join table1 to table2 using: JOIN table2 ON table1.fk_column = table2.pk_column
80
+ - Use INNER JOIN when both tables must have matching data
81
+ - Use LEFT JOIN when you need all rows from the first table regardless of matches
82
+
83
+ **STEP 4: WRITE THE QUERY**
84
+ - Use table aliases (t1, t2, etc.) for readability
85
+ - Qualify all column names with table aliases to avoid ambiguity
86
+ - Include all necessary columns from all joined tables in SELECT
87
+
88
+ EXAMPLES:
89
+ ❌ WRONG: SELECT name FROM customers WHERE city = 'NYC'
90
+ (if you need order information too)
91
+
92
+ CORRECT: SELECT c.name, o.order_date, o.total
93
+ FROM customers c
94
+ JOIN orders o ON c.customer_id = o.customer_id
95
+ WHERE c.city = 'NYC'
96
+
97
+ Return ONLY the executable SELECT statement. No explanations, no markdown, no code blocks.
98
+ """
99
+
100
+ # Use openai.ChatCompletion directly for version 0.28.1
101
+ response = openai.ChatCompletion.create(
102
+ model=self.model_name,
103
+ messages=[{"role": "user", "content": prompt}],
104
+ temperature=0
105
+ )
106
+
107
+ sql_query = response['choices'][0]['message']['content'].strip()
108
+ sql_query = sql_query.replace('```sql', '').replace('```', '').strip()
109
+
110
+ # Cache the result
111
+ if self.cache:
112
+ self.cache.add_to_cache(question, sql_query)
113
+
114
+ return sql_query
115
+
116
+ def process_question(self, question: str) -> Tuple[Optional[pd.DataFrame], str]:
117
+ """Process a natural language question and return results"""
118
+
119
+ # NEW: Check relevancy FIRST - BEFORE any API calls
120
+ is_relevant, similarity, info = self.relevancy_checker.is_relevant(question)
121
+
122
+ if not is_relevant:
123
+ print(f"\n❌ Question not relevant to database (similarity: {similarity:.2f})")
124
+ for item in info:
125
+ print(f" {item}")
126
+ return None, ""
127
+
128
+ print(f" Relevant question (similarity: {similarity:.2f})")
129
+
130
+ try:
131
+ # Convert to SQL (only if relevant)
132
+ sql_query = self.nlp_to_sql(question)
133
+ print(f"\n🔍 Generated SQL Query:")
134
+ print(f" {sql_query}")
135
+
136
+ # Track for feedback
137
+ self.last_question = question
138
+ self.last_sql = sql_query
139
+
140
+ # Execute query
141
+ result_df = self.db.execute_query(sql_query)
142
+
143
+ return result_df, sql_query
144
+
145
+ except Exception as e:
146
+ print(f"❌ Error processing question: {e}")
147
+ return None, ""
148
+
149
+ def display_results(self, df: pd.DataFrame, max_rows: int = 15):
150
+ """Display query results in a formatted table"""
151
+ if df is None or df.empty:
152
+ print(" No results found")
153
+ return # Exit early if no results
154
+
155
+ # Show the table
156
+ display_df = df.head(max_rows) if len(df) > max_rows else df
157
+ print(tabulate(display_df, headers='keys', tablefmt='grid', showindex=False))
158
+
159
+ if len(df) > max_rows:
160
+ print(f" ... showing first {max_rows} of {len(df)} rows")
161
+
162
+ # ✅ UPDATED: Only ask for feedback for relevant questions with results
163
+ # (Irrelevant questions never reach here due to early return)
164
+ feedback = input("\n👍 or 👎? (y/n): ").lower()
165
+ if feedback == 'y':
166
+ self.feedback.save(self.last_question, self.last_sql, True)
167
+ print("✅ Saved as good")
168
+ # Reload feedback matcher with new data
169
+ self.feedback_matcher.reload_feedback()
170
+ elif feedback == 'n':
171
+ correct = input("Correct SQL: ").strip()
172
+ self.feedback.save(self.last_question, self.last_sql, False, correct)
173
+ if correct:
174
+ print("✅ Learned correction")
175
+ # Reload feedback matcher with new data
144
176
  self.feedback_matcher.reload_feedback()
sutra/schema_generator.py CHANGED
@@ -1,53 +1,57 @@
1
- """SQL schema generation from unstructured text using AI"""
2
-
3
- import openai
4
- import config
5
-
6
- class SchemaGenerator:
7
- """Generate SQL schema from unstructured data using OpenAI"""
8
-
9
- def __init__(self, api_key: str, model_name: str = "gpt-3.5-turbo"):
10
- openai.api_key = api_key
11
- self.model_name = model_name
12
- self.temperature = config.TEMPERATURE
13
-
14
- def generate_schema(self, unstructured_data: str) -> str:
15
- """Generate SQL schema from unstructured text"""
16
-
17
- # Truncate if too long
18
- if len(unstructured_data) > config.MAX_TEXT_LENGTH:
19
- unstructured_data = unstructured_data[:config.MAX_TEXT_LENGTH]
20
- print(f"⚠️ Data truncated to {config.MAX_TEXT_LENGTH} characters")
21
-
22
- prompt = f"""
23
- Convert this unstructured text into a SQLite database:
24
-
25
- {unstructured_data}
26
-
27
- Requirements:
28
- 1. Create tables based on what entities you find in the text
29
- 2. Add foreign keys to connect related tables
30
- 3. Extract ALL data from the text - don't add anything not in the text
31
- 4. Use INTEGER PRIMARY KEY AUTOINCREMENT for IDs
32
-
33
- Return ONLY executable SQLite statements:
34
- - DROP TABLE IF EXISTS statements
35
- - CREATE TABLE statements with PRIMARY KEY and FOREIGN KEY
36
- - INSERT statements with the actual data from the text
37
-
38
- No markdown, no code blocks, just SQL.
39
- """
40
-
41
- print("🔄 Generating schema via OpenAI API...")
42
-
43
- response = openai.ChatCompletion.create(
44
- model=self.model_name,
45
- messages=[{"role": "user", "content": prompt}],
46
- temperature=self.temperature
47
- )
48
-
49
- generated_schema = response['choices'][0]['message']['content'].strip()
50
- generated_schema = generated_schema.replace('```sql', '').replace('```', '').strip()
51
-
52
- print("✅ Schema generated!")
1
+ """SQL schema generation from unstructured text using AI"""
2
+
3
+ import openai
4
+ import config
5
+
6
+ class SchemaGenerator:
7
+ """Generate SQL schema from unstructured data using OpenAI"""
8
+
9
+ def __init__(self, api_key: str, model_name: str = "gpt-3.5-turbo"):
10
+ openai.api_key = api_key
11
+ self.model_name = model_name
12
+ self.temperature = config.TEMPERATURE
13
+
14
+ def generate_schema(self, unstructured_data: str) -> str:
15
+ """Generate SQL schema from unstructured text"""
16
+
17
+ # Truncate if too long
18
+ if len(unstructured_data) > config.MAX_TEXT_LENGTH:
19
+ unstructured_data = unstructured_data[:config.MAX_TEXT_LENGTH]
20
+ print(f"⚠️ Data truncated to {config.MAX_TEXT_LENGTH} characters")
21
+
22
+ prompt = f"""
23
+ Convert this unstructured text into a SQLite database:
24
+
25
+ {unstructured_data}
26
+
27
+ CRITICAL Requirements:
28
+ 1. Identify all entities in the text and create a table for each
29
+ 2. **MANDATORY:** Add FOREIGN KEY constraints to connect related tables
30
+ - If table A references table B, add: FOREIGN KEY (column_name) REFERENCES table_b(id)
31
+ - Example: If employees belong to departments, employees table must have:
32
+ department_id INTEGER, FOREIGN KEY (department_id) REFERENCES departments(id)
33
+ 3. Extract ALL data from the text - don't add anything not in the text
34
+ 4. Use INTEGER PRIMARY KEY AUTOINCREMENT for all ID columns
35
+ 5. Ensure parent tables (referenced tables) are created BEFORE child tables
36
+
37
+ Return ONLY executable SQLite statements in this order:
38
+ 1. DROP TABLE IF EXISTS statements (child tables first, parent tables last)
39
+ 2. CREATE TABLE statements (parent tables first, child tables last)
40
+ 3. INSERT statements (parent tables first, child tables last)
41
+
42
+ No markdown, no code blocks, no explanations - just SQL statements.
43
+ """
44
+
45
+ print("🔄 Generating schema via OpenAI API...")
46
+
47
+ response = openai.ChatCompletion.create(
48
+ model=self.model_name,
49
+ messages=[{"role": "user", "content": prompt}],
50
+ temperature=self.temperature
51
+ )
52
+
53
+ generated_schema = response['choices'][0]['message']['content'].strip()
54
+ generated_schema = generated_schema.replace('```sql', '').replace('```', '').strip()
55
+
56
+ print("✅ Schema generated!")
53
57
  return generated_schema
sutra/sutra.py CHANGED
@@ -1,5 +1,5 @@
1
- """QuerySUTRA v0.5.2 - FIXED: Smart table selection"""
2
- __version__ = "0.5.2"
1
+ """QuerySUTRA v0.6.0 - AI-powered data analysis for structured and unstructured data"""
2
+ __version__ = "0.6.0"
3
3
  __author__ = "Aditya Batta"
4
4
  __all__ = ["SUTRA", "QueryResult"]
5
5
 
@@ -72,7 +72,7 @@ class SUTRA:
72
72
  pass
73
73
 
74
74
  self._refresh_schema()
75
- print(f"QuerySUTRA v0.5.2 Ready")
75
+ print(f"QuerySUTRA v{__version__} Ready")
76
76
 
77
77
  def upload(self, data: Union[str, pd.DataFrame], name: Optional[str] = None) -> 'SUTRA':
78
78
  """Upload."""
@@ -136,6 +136,8 @@ class SUTRA:
136
136
  rec['id'] = idx
137
137
  self._store(pd.DataFrame(recs), f"{name}_{etype}")
138
138
  print(f" {etype}: {len(recs)} rows")
139
+ # After all tables are created, detect and store foreign key relationships
140
+ self._create_foreign_keys()
139
141
  return
140
142
 
141
143
  print("Using regex fallback...")
@@ -264,6 +266,55 @@ JSON:"""
264
266
  self._refresh_schema()
265
267
  print(f" {name}: {len(df)} rows")
266
268
 
269
+ def _create_foreign_keys(self, silent=False):
270
+ """Detect foreign key relationships between tables by matching column naming patterns.
271
+ e.g., 'person_id' in work_experience -> 'id' in people table."""
272
+ tables = self._get_tables()
273
+
274
+ # Build a map of potential parent tables by looking for 'id' columns
275
+ # e.g., employee_data_people has 'id' -> can be referenced as person_id, people_id
276
+ parent_map = {} # Maps potential FK column names -> (parent_table, parent_pk)
277
+ for t in tables:
278
+ self.cursor.execute(f"PRAGMA table_info({t})")
279
+ cols = {r[1]: r[2] for r in self.cursor.fetchall()}
280
+ if 'id' in cols:
281
+ # Generate possible FK names from table name
282
+ # e.g., 'employee_data_people' -> 'person_id', 'people_id'
283
+ parts = t.split('_')
284
+ for part in parts:
285
+ # singular form guesses
286
+ fk_name = f"{part}_id"
287
+ parent_map[fk_name] = (t, 'id')
288
+ # Handle plural -> singular (people -> person)
289
+ if part.endswith('ies'):
290
+ parent_map[f"{part[:-3]}y_id"] = (t, 'id')
291
+ elif part.endswith('es'):
292
+ parent_map[f"{part[:-2]}_id"] = (t, 'id')
293
+ elif part.endswith('s'):
294
+ parent_map[f"{part[:-1]}_id"] = (t, 'id')
295
+ # Also try full table name as FK
296
+ parent_map[f"{t}_id"] = (t, 'id')
297
+
298
+ # Now scan all tables for columns matching FK patterns
299
+ self.foreign_keys = {} # table -> [(fk_col, parent_table, parent_col)]
300
+ for t in tables:
301
+ self.cursor.execute(f"PRAGMA table_info({t})")
302
+ cols = [r[1] for r in self.cursor.fetchall()]
303
+ fks = []
304
+ for col in cols:
305
+ if col in parent_map:
306
+ parent_table, parent_col = parent_map[col]
307
+ if parent_table != t: # Don't self-reference
308
+ fks.append((col, parent_table, parent_col))
309
+ if fks:
310
+ self.foreign_keys[t] = fks
311
+
312
+ if self.foreign_keys and not silent:
313
+ print(f"\n🔗 Detected relationships:")
314
+ for t, fks in self.foreign_keys.items():
315
+ for fk_col, parent_table, parent_col in fks:
316
+ print(f" {t}.{fk_col} → {parent_table}.{parent_col}")
317
+
267
318
  def ask(self, q: str, viz: Union[bool, str] = False, table: Optional[str] = None) -> 'QueryResult':
268
319
  """
269
320
  Query - FIXED: Considers ALL tables, picks best one or joins multiple.
@@ -273,6 +324,10 @@ JSON:"""
273
324
 
274
325
  print(f"\nQuestion: {q}")
275
326
 
327
+ # Ensure foreign key relationships are detected
328
+ if not hasattr(self, 'foreign_keys') or not self.foreign_keys:
329
+ self._create_foreign_keys(silent=True)
330
+
276
331
  # FIXED: If no table specified, let AI pick the right one(s)
277
332
  if not table:
278
333
  # Get ALL table schemas
@@ -308,6 +363,24 @@ JSON:"""
308
363
  print(f"Error: {e}")
309
364
  return QueryResult(False, sql, pd.DataFrame(), None, str(e))
310
365
 
366
+ def _get_relationship_context(self) -> str:
367
+ """Build a clear relationship context string for the AI prompt."""
368
+ if not hasattr(self, 'foreign_keys') or not self.foreign_keys:
369
+ # Try to detect relationships if not already done
370
+ self._create_foreign_keys(silent=True)
371
+
372
+ if not hasattr(self, 'foreign_keys') or not self.foreign_keys:
373
+ return ""
374
+
375
+ lines = ["\n=== TABLE RELATIONSHIPS (FOREIGN KEYS) ==="]
376
+ lines.append("Use these to JOIN tables when a question needs data from multiple tables:")
377
+ for t, fks in self.foreign_keys.items():
378
+ for fk_col, parent_table, parent_col in fks:
379
+ lines.append(f" {t}.{fk_col} → {parent_table}.{parent_col}")
380
+ lines.append(f" JOIN syntax: JOIN {parent_table} ON {t}.{fk_col} = {parent_table}.{parent_col}")
381
+ lines.append("=" * 50)
382
+ return "\n".join(lines)
383
+
311
384
  def _gen_sql_smart(self, q: str, all_schemas: Dict) -> str:
312
385
  """
313
386
  FIXED: Generate SQL considering ALL tables and their relationships.
@@ -318,35 +391,70 @@ JSON:"""
318
391
  schema_context += f"\n{tbl} ({info['row_count']} rows):\n"
319
392
  schema_context += f" Columns: {', '.join(info['columns'])}\n"
320
393
 
321
- # Add sample data from key tables
394
+ # Add relationship context
395
+ relationship_context = self._get_relationship_context()
396
+
397
+ # Add sample data from ALL tables (not just first 3)
322
398
  samples = ""
323
- for tbl in list(all_schemas.keys())[:3]: # First 3 tables
399
+ for tbl in list(all_schemas.keys())[:6]: # Show more tables
324
400
  try:
325
401
  sample_df = pd.read_sql_query(f"SELECT * FROM {tbl} LIMIT 2", self.conn)
326
402
  samples += f"\nSample from {tbl}:\n{sample_df.to_string(index=False)}\n"
327
403
  except:
328
404
  pass
329
405
 
330
- prompt = f"""You are an SQL expert. Generate a query for this question.
406
+ prompt = f"""You are an expert SQL query generator.
331
407
 
332
408
  {schema_context}
333
-
409
+ {relationship_context}
334
410
  {samples}
335
411
 
336
412
  Question: {q}
337
413
 
338
- Rules:
339
- 1. Use JOIN if question needs data from multiple tables
340
- 2. If asking about "employee" or "person" info, always include employee_data_people table
341
- 3. Use proper foreign key relationships (person_id references people.id)
342
- 4. Return employee names/info when asked "which employee" or "who"
414
+ CRITICAL INSTRUCTIONS - FOLLOW THESE STEPS:
415
+
416
+ STEP 1: READ THE TABLE RELATIONSHIPS SECTION ABOVE.
417
+ Those show you exactly how tables connect via foreign keys.
418
+
419
+ STEP 2: IDENTIFY WHICH TABLES HAVE THE DATA NEEDED.
420
+ - Person info (name, email, city, state) → look in *_people table
421
+ - Work info (company, position, start_date) → look in *_work_experience table
422
+ - Skills, education, etc. → look in their respective tables
423
+
424
+ STEP 3: IF THE QUESTION NEEDS DATA FROM MULTIPLE TABLES, YOU MUST USE JOIN.
425
+ Use the foreign key relationships shown above.
426
+ Example: If work_experience has person_id and people has id:
427
+ JOIN people ON work_experience.person_id = people.id
428
+
429
+ STEP 4: WRITE THE QUERY.
430
+ - Use table aliases for readability
431
+ - Qualify ALL column names with table alias to avoid ambiguity
432
+ - For "who" / "which person" questions, ALWAYS join to the people table to get names
433
+ - For "from <state>" or "in <city>" questions, the location is in the people table, JOIN to it
434
+ - For "count by state" or "group by state", the state column is in the people table, JOIN to it
435
+
436
+ EXAMPLES:
437
+ ❌ WRONG: SELECT COUNT(*) FROM work_experience GROUP BY company
438
+ (when asked "count by state" - state is NOT in work_experience!)
439
+
440
+ ✅ CORRECT: SELECT p.state, COUNT(*) as employee_count
441
+ FROM work_experience w
442
+ JOIN people p ON w.person_id = p.id
443
+ GROUP BY p.state
444
+
445
+ ❌ WRONG: SELECT * FROM work_experience WHERE company LIKE '%FL%'
446
+ (when asked "how many from FL" - FL is a state, not a company!)
447
+
448
+ ✅ CORRECT: SELECT COUNT(*) as count
449
+ FROM people p
450
+ WHERE p.state = 'FL'
343
451
 
344
- Return ONLY the SQL query, no explanations:"""
452
+ Return ONLY the executable SQL query. No explanations, no markdown, no code blocks:"""
345
453
 
346
454
  r = self.client.chat.completions.create(
347
455
  model="gpt-4o-mini",
348
456
  messages=[
349
- {"role": "system", "content": "SQL expert. Generate queries using proper JOINs. Return only SQL."},
457
+ {"role": "system", "content": "You are an expert SQL query generator. ALWAYS use JOIN when data is spread across multiple tables. ALWAYS check which table a column belongs to before using it. State, city, name are typically in people tables. Position, company are in work_experience tables. Return ONLY executable SQL."},
350
458
  {"role": "user", "content": prompt}
351
459
  ],
352
460
  temperature=0
@@ -562,7 +670,7 @@ Return ONLY the SQL query, no explanations:"""
562
670
  return [r[0] for r in self.cursor.fetchall()]
563
671
 
564
672
  def _refresh_schema(self):
565
- """Refresh."""
673
+ """Refresh schema info."""
566
674
  self.schema_info = {}
567
675
  for t in self._get_tables():
568
676
  self.cursor.execute(f"PRAGMA table_info({t})")
@@ -1,28 +0,0 @@
1
- querysutra-0.5.3.dist-info/licenses/LICENSE,sha256=F-4b93u0OVrVwGXgMwBRq6MlGyUT9zmre1oh5Gft5Ts,1066
2
- sutra/__init__.py,sha256=25HUMETpmA1tlMl5j-ajdo9MRXljSZBrirSTH7w7jIc,118
3
- sutra/cache_manager.py,sha256=e0AAeUqoR-aiqzZ3fB-IDvpJ4JA6-YBFyRJxusEnIrA,3082
4
- sutra/clear_cache.py,sha256=rVIz29p7V11Uh6oHXeaWpFtYXXv-2OED91cHMAWWxtQ,187
5
- sutra/core.py,sha256=R_JbOlZTukegP92Dr-WLsdr632_otFN7o9qSvcxyBtw,10497
6
- sutra/data_loader.py,sha256=_yPj-DS2qYtlCgaMACQtfXZfSuAdVVd4igNP7yzXolc,5781
7
- sutra/database_manager.py,sha256=L-QC_WwR3Pnl1BRh0rnEv5MNSTr4C7ZP-hIPfCHRK88,7672
8
- sutra/direct_query.py,sha256=X69I646zHIZlZjMmgn8O2xLS_7ww7miAkABTnJEPAAc,2724
9
- sutra/feedback.py,sha256=PHSffU_rfORjLkTW3-j2VSjQdw4ufROsTeBWaX6DZ00,1642
10
- sutra/feedback_matcher.py,sha256=WXYpGtFJnOyYQOzy-z8uBiUWH5vyJJOMS1NwEYzNfic,2865
11
- sutra/nlp_processor.py,sha256=wMS1hz1aGWjSwPUD7lSNBbQapFtLgF2l65j0QKXQOd0,5461
12
- sutra/schema_embeddings.py,sha256=bVPzpJOdYTyUdG2k3ZdgYJLrX2opHBx68RIjJcMlueo,9732
13
- sutra/schema_generator.py,sha256=BX_vXmnvSGc6nCBx40WLSoNL3WIYPDahd1cEYloyY4M,1925
14
- sutra/sutra.py,sha256=61juV3zlMau4UZJ-5IxjaN-Bc1XBP8w2vkYfum-aXlY,21979
15
- sutra/sutra_client.py,sha256=PYYDGqVbA9pB-Zcsm52i9KarwijCIGVZOThgONZP6Vs,14203
16
- sutra/sutra_core.py,sha256=diaWOXUHn1wrqCQrBhLKL612tMQioaqx-ILc3y9-CqM,11708
17
- sutra/sutra_simple.py,sha256=rnqzG7OAt4p64XtO0peMqHS1pG5tdA8U3EYTMVsq7BE,23201
18
- sutra/visualizer.py,sha256=YOKTmjQcY72smmx9KsZrQTdbAiE5GQDKofMFjpLIUfI,6996
19
- tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
20
- tests/test_modules.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
21
- tests/test_sutra.py,sha256=6Z4SoIuBzza101304I7plkyPVkUBbjIxR8uPs9z5ntg,2383
22
- utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
23
- utils/file_utils.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
24
- utils/text_utils.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
25
- querysutra-0.5.3.dist-info/METADATA,sha256=yFffBSYGfbLrYnXA7OFGHk1mO37fpUV-0iglmHXbAVQ,7258
26
- querysutra-0.5.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
27
- querysutra-0.5.3.dist-info/top_level.txt,sha256=9v0buw21eo5LaUU_3Cf9b9MqRyEvtM9cHaOuEXUKVqM,18
28
- querysutra-0.5.3.dist-info/RECORD,,
@@ -1,3 +0,0 @@
1
- sutra
2
- tests
3
- utils
tests/__init__.py DELETED
File without changes
tests/test_modules.py DELETED
File without changes
tests/test_sutra.py DELETED
@@ -1,76 +0,0 @@
1
- """
2
- Test suite for SUTRA library
3
- Run with: pytest test_sutra.py
4
- """
5
-
6
- import pytest
7
- import pandas as pd
8
- import os
9
- from sutra import SutraClient
10
-
11
-
12
- class TestSutraClient:
13
- """Test cases for SutraClient"""
14
-
15
- @pytest.fixture
16
- def client(self):
17
- """Create a test client"""
18
- # Use a test database
19
- api_key = os.getenv('OPENAI_API_KEY', 'test-key')
20
- client = SutraClient(api_key=api_key, db_path="test_db.db")
21
- yield client
22
- # Cleanup
23
- client.close()
24
- if os.path.exists("test_db.db"):
25
- os.remove("test_db.db")
26
-
27
- @pytest.fixture
28
- def sample_data(self):
29
- """Create sample DataFrame"""
30
- return pd.DataFrame({
31
- 'name': ['Alice', 'Bob', 'Charlie'],
32
- 'age': [25, 30, 35],
33
- 'city': ['New York', 'London', 'Paris']
34
- })
35
-
36
- def test_client_initialization(self, client):
37
- """Test client can be initialized"""
38
- assert client is not None
39
- assert client.db_path == "test_db.db"
40
-
41
- def test_upload_dataframe(self, client, sample_data):
42
- """Test uploading a DataFrame"""
43
- result = client.upload_dataframe(sample_data, "test_table")
44
- assert result['status'] == 'success'
45
- assert result['table_name'] == 'test_table'
46
- assert result['rows_inserted'] == 3
47
-
48
- def test_list_tables(self, client, sample_data):
49
- """Test listing tables"""
50
- client.upload_dataframe(sample_data, "test_table")
51
- tables = client.list_tables()
52
- assert 'test_table' in tables
53
-
54
- def test_execute_sql(self, client, sample_data):
55
- """Test direct SQL execution"""
56
- client.upload_dataframe(sample_data, "test_table")
57
- result = client.execute_sql("SELECT * FROM test_table")
58
- assert result['status'] == 'success'
59
- assert len(result['results']) == 3
60
-
61
- def test_get_table_info(self, client, sample_data):
62
- """Test getting table information"""
63
- client.upload_dataframe(sample_data, "test_table")
64
- info = client.get_table_info("test_table")
65
- assert info['table_name'] == 'test_table'
66
- assert len(info['columns']) > 0
67
-
68
-
69
- def test_import():
70
- """Test that the library can be imported"""
71
- from sutra import SutraClient
72
- assert SutraClient is not None
73
-
74
-
75
- if __name__ == "__main__":
76
- pytest.main([__file__, "-v"])
utils/__init__.py DELETED
File without changes
utils/file_utils.py DELETED
File without changes
utils/text_utils.py DELETED
File without changes