QuerySUTRA 0.5.3__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {querysutra-0.5.3.dist-info → querysutra-0.6.0.dist-info}/METADATA +18 -2
- querysutra-0.6.0.dist-info/RECORD +22 -0
- {querysutra-0.5.3.dist-info → querysutra-0.6.0.dist-info}/WHEEL +1 -1
- querysutra-0.6.0.dist-info/top_level.txt +1 -0
- sutra/__init__.py +6 -4
- sutra/database_manager.py +235 -195
- sutra/nlp_processor.py +175 -143
- sutra/schema_generator.py +56 -52
- sutra/sutra.py +123 -15
- querysutra-0.5.3.dist-info/RECORD +0 -28
- querysutra-0.5.3.dist-info/top_level.txt +0 -3
- tests/__init__.py +0 -0
- tests/test_modules.py +0 -0
- tests/test_sutra.py +0 -76
- utils/__init__.py +0 -0
- utils/file_utils.py +0 -0
- utils/text_utils.py +0 -0
- {querysutra-0.5.3.dist-info → querysutra-0.6.0.dist-info}/licenses/LICENSE +0 -0
{querysutra-0.5.3.dist-info → querysutra-0.6.0.dist-info}/METADATA
CHANGED

@@ -1,9 +1,25 @@
 Metadata-Version: 2.4
 Name: QuerySUTRA
-Version: 0.5.3
-Summary:
+Version: 0.6.0
+Summary: AI-powered data analysis for structured and unstructured data. Query PDF, Word, CSV, Excel with natural language.
 Author: Aditya Batta
 License: MIT
+Project-URL: Homepage, https://github.com/adityabatta/QuerySUTRA
+Project-URL: Repository, https://github.com/adityabatta/QuerySUTRA
+Project-URL: Issues, https://github.com/adityabatta/QuerySUTRA/issues
+Keywords: ai,data-analysis,nlp,sql,pdf,openai,natural-language,query,database
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Classifier: Topic :: Database
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown
 License-File: LICENSE
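
The new metadata can be inspected at runtime once the wheel is installed. A minimal sketch, assuming `querysutra` 0.6.0 is installed in the current environment:

```python
# Inspect the installed QuerySUTRA metadata via the standard library (Python 3.8+).
from importlib.metadata import metadata, version

print(version("QuerySUTRA"))        # -> 0.6.0

meta = metadata("QuerySUTRA")
print(meta["Summary"])              # the new one-line description
print(meta.get_all("Classifier"))   # the 12 classifiers added in this release
```
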
querysutra-0.6.0.dist-info/RECORD
ADDED

@@ -0,0 +1,22 @@
+querysutra-0.6.0.dist-info/licenses/LICENSE,sha256=F-4b93u0OVrVwGXgMwBRq6MlGyUT9zmre1oh5Gft5Ts,1066
+sutra/__init__.py,sha256=rRSfC1jjMvi8-LpP0P6dpMVx0xC_HnxsgisPr4WPkGM,200
+sutra/cache_manager.py,sha256=e0AAeUqoR-aiqzZ3fB-IDvpJ4JA6-YBFyRJxusEnIrA,3082
+sutra/clear_cache.py,sha256=rVIz29p7V11Uh6oHXeaWpFtYXXv-2OED91cHMAWWxtQ,187
+sutra/core.py,sha256=R_JbOlZTukegP92Dr-WLsdr632_otFN7o9qSvcxyBtw,10497
+sutra/data_loader.py,sha256=_yPj-DS2qYtlCgaMACQtfXZfSuAdVVd4igNP7yzXolc,5781
+sutra/database_manager.py,sha256=usnQTOnfjyFwpcaczG3eF-Pg0snIUeqzHl4rwsd_9rA,9150
+sutra/direct_query.py,sha256=X69I646zHIZlZjMmgn8O2xLS_7ww7miAkABTnJEPAAc,2724
+sutra/feedback.py,sha256=PHSffU_rfORjLkTW3-j2VSjQdw4ufROsTeBWaX6DZ00,1642
+sutra/feedback_matcher.py,sha256=WXYpGtFJnOyYQOzy-z8uBiUWH5vyJJOMS1NwEYzNfic,2865
+sutra/nlp_processor.py,sha256=cvMDvmtf3b2tTbFPItJgF_t541MQqP4SdEXECR1pa0Q,6719
+sutra/schema_embeddings.py,sha256=bVPzpJOdYTyUdG2k3ZdgYJLrX2opHBx68RIjJcMlueo,9732
+sutra/schema_generator.py,sha256=EYEOo7-ljSukTx9Mm2hXhgY-DFCgsaa7RpzDWqVx4K8,2348
+sutra/sutra.py,sha256=73A4HPZVf6jSl5T4ob1vuIbr7CXVnWHP70NfUYbCz-Y,27594
+sutra/sutra_client.py,sha256=PYYDGqVbA9pB-Zcsm52i9KarwijCIGVZOThgONZP6Vs,14203
+sutra/sutra_core.py,sha256=diaWOXUHn1wrqCQrBhLKL612tMQioaqx-ILc3y9-CqM,11708
+sutra/sutra_simple.py,sha256=rnqzG7OAt4p64XtO0peMqHS1pG5tdA8U3EYTMVsq7BE,23201
+sutra/visualizer.py,sha256=YOKTmjQcY72smmx9KsZrQTdbAiE5GQDKofMFjpLIUfI,6996
+querysutra-0.6.0.dist-info/METADATA,sha256=e1PS_Cr8aByv3OYnE2kLxuLTUdsfjSAMNioP5DjYpBk,8252
+querysutra-0.6.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+querysutra-0.6.0.dist-info/top_level.txt,sha256=tqRK7nxuOJvFTkUn-YahGogCSCkk1ZE90Wf3MgT9BDI,6
+querysutra-0.6.0.dist-info/RECORD,,

querysutra-0.6.0.dist-info/top_level.txt
ADDED

@@ -0,0 +1 @@
+sutra
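
Each RECORD entry has the form `path,sha256=<urlsafe-base64 digest without padding>,size`, per the wheel spec. A minimal sketch of verifying one file from the installed wheel against its RECORD entry (the path below is just an example):

```python
# Recompute a RECORD-style hash for one file and compare it to the entry above.
import base64
import hashlib
from pathlib import Path

def record_hash(path: str) -> str:
    digest = hashlib.sha256(Path(path).read_bytes()).digest()
    # RECORD uses URL-safe base64 with the trailing '=' padding stripped
    return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")

print(record_hash("sutra/__init__.py"))
# For the 0.6.0 file this should print
# sha256=rRSfC1jjMvi8-LpP0P6dpMVx0xC_HnxsgisPr4WPkGM
```
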
sutra/__init__.py
CHANGED

@@ -1,4 +1,6 @@
-"""QuerySUTRA v0.5.3 …
-__version__="0.5.3"
-
-
+"""QuerySUTRA v0.6.0 - AI-powered data analysis for structured and unstructured data"""
+__version__ = "0.6.0"
+
+from .sutra import SUTRA, QueryResult
+
+__all__ = ["SUTRA", "QueryResult", "__version__"]
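
With these exports, the intended entry point is now a single `SUTRA` object. A hypothetical quick-start under that assumption; `SUTRA`, `QueryResult`, `upload()`, and `ask()` appear in this diff, but the constructor behavior and the result attributes are assumptions, not confirmed here:

```python
# Hypothetical quick-start for the 0.6.0 public API.
from sutra import SUTRA

s = SUTRA()                        # assumed: API key comes from the environment
s.upload("employees.csv")          # upload() accepts a file path or a DataFrame
result = s.ask("how many employees are from FL?", viz=False)
print(result)                      # a QueryResult; its fields are not shown in full in this diff
```
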
sutra/database_manager.py
CHANGED

(Lines 1-142 are identical in both versions and are omitted here; they carry over the 0.5.3 file verbatim, including a duplicated module docstring and import block inside the class body.)

@@ -143,54 +143,94 @@ class DatabaseManager:
             return columns
 
     def get_schema_context(self) -> str:
-        """Get database schema"""
+        """Get database schema with relationship information"""
         if self.db_type == 'mysql':
             tables = self.get_tables()
             schema = []
             for table in tables:
                 self.cursor.execute(f"SHOW CREATE TABLE {table}")
                 schema.append(self.cursor.fetchone()[1])
-
+            schema_text = '\n'.join(schema)
         else:
             self.cursor.execute(
                 "SELECT sql FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%';"
             )
-… (old lines 158-195 were not rendered in the source diff)
+            schema_text = '\n'.join([row[0] for row in self.cursor.fetchall()])
+
+        # Add relationship summary at the beginning
+        relationships = self._extract_relationships(schema_text)
+        if relationships:
+            relationship_summary = "\n=== TABLE RELATIONSHIPS ===\n" + "\n".join(relationships) + "\n\n=== FULL SCHEMA ===\n"
+            return relationship_summary + schema_text
+        return schema_text
+
+    def _extract_relationships(self, schema_text: str) -> List[str]:
+        """Extract and format foreign key relationships from schema"""
+        import re
+        relationships = []
+
+        # Pattern to match FOREIGN KEY statements
+        fk_pattern = r'FOREIGN KEY\s*\(([^)]+)\)\s*REFERENCES\s+(\w+)\s*\(([^)]+)\)'
+
+        # Split schema into individual table definitions
+        tables = schema_text.split('CREATE TABLE')
+
+        for table_def in tables:
+            if not table_def.strip():
+                continue
+
+            # Extract table name
+            table_match = re.search(r'[`"]?(\w+)[`"]?', table_def)
+            if not table_match:
+                continue
+            table_name = table_match.group(1)
+
+            # Find all foreign keys in this table
+            for match in re.finditer(fk_pattern, table_def, re.IGNORECASE):
+                fk_column = match.group(1).strip('`" ')
+                ref_table = match.group(2).strip('`" ')
+                ref_column = match.group(3).strip('`" ')
+
+                relationships.append(
+                    f"  {table_name}.{fk_column} → {ref_table}.{ref_column}"
+                )
+
+        return relationships
+
+    def display_tables(self):  # FIX: Proper indentation - part of class
+        """Display all tables with their structure and data"""
+        tables = self.get_tables()
+        print(f"\n📋 Created {len(tables)} tables:")
+
+        for table in tables:
+            print(f"\n  Table: {table}")
+
+            # Show columns
+            columns = self.get_table_info(table)
+            for col in columns:
+                print(f"    - {col[1]} ({col[2]})")
+
+            # Show row count
+            count = self.get_row_count(table)
+            print(f"    Records: {count}")
+
+    def get_table_info(self, table_name: str) -> List[Tuple]:  # FIX: Proper indentation
+        """Get column information for a table"""
+        if self.db_type == 'mysql':
+            self.cursor.execute(f"DESCRIBE {table_name}")
+            return [(i, row[0], row[1]) for i, row in enumerate(self.cursor.fetchall())]
+        else:
+            self.cursor.execute(f"PRAGMA table_info({table_name})")
+            return self.cursor.fetchall()
+
+    def get_row_count(self, table_name: str) -> int:  # FIX: Proper indentation
+        """Get number of rows in a table"""
+        self.cursor.execute(f"SELECT COUNT(*) FROM {table_name}")
+        return self.cursor.fetchone()[0]
+
+    def close(self):
+        """Close database connection"""
+        self.conn.close()
+        print("📂 Database connection closed")
+
 
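The new `_extract_relationships` helper drives the relationship summary that now leads the schema context. A standalone sketch of the same regex against a toy schema (the table definitions here are invented for illustration):

```python
# Demonstrates the FOREIGN KEY pattern used by _extract_relationships
# on a hypothetical two-table schema.
import re

schema_text = """
CREATE TABLE departments (id INTEGER PRIMARY KEY AUTOINCREMENT, name TEXT);
CREATE TABLE employees (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    name TEXT,
    department_id INTEGER,
    FOREIGN KEY (department_id) REFERENCES departments(id)
);
"""

fk_pattern = r'FOREIGN KEY\s*\(([^)]+)\)\s*REFERENCES\s+(\w+)\s*\(([^)]+)\)'
for m in re.finditer(fk_pattern, schema_text, re.IGNORECASE):
    print(f"{m.group(1).strip()} -> {m.group(2)}.{m.group(3).strip()}")
# Output: department_id -> departments.id
```
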
sutra/nlp_processor.py
CHANGED

(Lines 1-53 are identical in both versions and are omitted here.)

@@ -54,91 +54,123 @@
         # Get schema context
         schema = self.db.get_schema_context()
 
         prompt = f"""
-Convert this question to a SQLite query
+Convert this question to a SQLite query.
 
 Question: {question}
 
 Database schema:
 {schema}
 
-… (old lines 65-143 were not rendered in the source diff)
+CRITICAL INSTRUCTIONS FOR MULTI-TABLE QUERIES:
+
+**STEP 1: CHECK TABLE RELATIONSHIPS FIRST**
+Look at the "=== TABLE RELATIONSHIPS ===" section at the top of the schema.
+These show you exactly how tables are connected via foreign keys.
+Format: table1.column → table2.column means table1.column references table2.column
+
+**STEP 2: IDENTIFY REQUIRED TABLES**
+Analyze which tables contain the data needed to answer the question.
+If information is spread across multiple tables, you MUST join them.
+
+**STEP 3: USE THE RELATIONSHIPS TO JOIN**
+When you need data from multiple tables:
+- Use the foreign key relationships shown in the TABLE RELATIONSHIPS section
+- Join table1 to table2 using: JOIN table2 ON table1.fk_column = table2.pk_column
+- Use INNER JOIN when both tables must have matching data
+- Use LEFT JOIN when you need all rows from the first table regardless of matches
+
+**STEP 4: WRITE THE QUERY**
+- Use table aliases (t1, t2, etc.) for readability
+- Qualify all column names with table aliases to avoid ambiguity
+- Include all necessary columns from all joined tables in SELECT
+
+EXAMPLES:
+❌ WRONG: SELECT name FROM customers WHERE city = 'NYC'
+(if you need order information too)
+
+✅ CORRECT: SELECT c.name, o.order_date, o.total
+           FROM customers c
+           JOIN orders o ON c.customer_id = o.customer_id
+           WHERE c.city = 'NYC'
+
+Return ONLY the executable SELECT statement. No explanations, no markdown, no code blocks.
+"""
+
+        # Use openai.ChatCompletion directly for version 0.28.1
+        response = openai.ChatCompletion.create(
+            model=self.model_name,
+            messages=[{"role": "user", "content": prompt}],
+            temperature=0
+        )
+
+        sql_query = response['choices'][0]['message']['content'].strip()
+        sql_query = sql_query.replace('```sql', '').replace('```', '').strip()
+
+        # Cache the result
+        if self.cache:
+            self.cache.add_to_cache(question, sql_query)
+
+        return sql_query
+
+    def process_question(self, question: str) -> Tuple[Optional[pd.DataFrame], str]:
+        """Process a natural language question and return results"""
+
+        # ✅ NEW: Check relevancy FIRST - BEFORE any API calls
+        is_relevant, similarity, info = self.relevancy_checker.is_relevant(question)
+
+        if not is_relevant:
+            print(f"\n❌ Question not relevant to database (similarity: {similarity:.2f})")
+            for item in info:
+                print(f"   {item}")
+            return None, ""
+
+        print(f"✅ Relevant question (similarity: {similarity:.2f})")
+
+        try:
+            # Convert to SQL (only if relevant)
+            sql_query = self.nlp_to_sql(question)
+            print(f"\n🔍 Generated SQL Query:")
+            print(f"   {sql_query}")
+
+            # Track for feedback
+            self.last_question = question
+            self.last_sql = sql_query
+
+            # Execute query
+            result_df = self.db.execute_query(sql_query)
+
+            return result_df, sql_query
+
+        except Exception as e:
+            print(f"❌ Error processing question: {e}")
+            return None, ""
+
+    def display_results(self, df: pd.DataFrame, max_rows: int = 15):
+        """Display query results in a formatted table"""
+        if df is None or df.empty:
+            print("  No results found")
+            return  # Exit early if no results
+
+        # Show the table
+        display_df = df.head(max_rows) if len(df) > max_rows else df
+        print(tabulate(display_df, headers='keys', tablefmt='grid', showindex=False))
+
+        if len(df) > max_rows:
+            print(f"  ... showing first {max_rows} of {len(df)} rows")
+
+        # ✅ UPDATED: Only ask for feedback for relevant questions with results
+        # (Irrelevant questions never reach here due to early return)
+        feedback = input("\n👍 or 👎? (y/n): ").lower()
+        if feedback == 'y':
+            self.feedback.save(self.last_question, self.last_sql, True)
+            print("✅ Saved as good")
+            # Reload feedback matcher with new data
+            self.feedback_matcher.reload_feedback()
+        elif feedback == 'n':
+            correct = input("Correct SQL: ").strip()
+            self.feedback.save(self.last_question, self.last_sql, False, correct)
+            if correct:
+                print("✅ Learned correction")
+                # Reload feedback matcher with new data
                 self.feedback_matcher.reload_feedback()
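
Both `nlp_to_sql` and the schema generator strip markdown fences from model output with chained `replace` calls before executing the SQL. A quick standalone check of that post-processing (the fence string is built indirectly so this example stays self-contained):

```python
# Mirrors the fence-stripping in nlp_to_sql(): the model sometimes wraps SQL
# in a markdown code block, which must be removed before execution.
FENCE = "`" * 3  # the literal triple-backtick sequence

raw_reply = f"{FENCE}sql\nSELECT COUNT(*) FROM people WHERE state = 'FL';\n{FENCE}"
sql_query = raw_reply.replace(FENCE + "sql", "").replace(FENCE, "").strip()

print(sql_query)  # -> SELECT COUNT(*) FROM people WHERE state = 'FL';
```
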
sutra/schema_generator.py
CHANGED

(Lines 1-22 are identical in both versions and are omitted here.)

@@ -23,31 +23,35 @@
 Convert this unstructured text into a SQLite database:
 
 {unstructured_data}
 
-Requirements:
-1. …
-2. Add …
-… (old lines 30-52 were not rendered in the source diff)
+CRITICAL Requirements:
+1. Identify all entities in the text and create a table for each
+2. **MANDATORY:** Add FOREIGN KEY constraints to connect related tables
+   - If table A references table B, add: FOREIGN KEY (column_name) REFERENCES table_b(id)
+   - Example: If employees belong to departments, employees table must have:
+     department_id INTEGER, FOREIGN KEY (department_id) REFERENCES departments(id)
+3. Extract ALL data from the text - don't add anything not in the text
+4. Use INTEGER PRIMARY KEY AUTOINCREMENT for all ID columns
+5. Ensure parent tables (referenced tables) are created BEFORE child tables
+
+Return ONLY executable SQLite statements in this order:
+1. DROP TABLE IF EXISTS statements (child tables first, parent tables last)
+2. CREATE TABLE statements (parent tables first, child tables last)
+3. INSERT statements (parent tables first, child tables last)
+
+No markdown, no code blocks, no explanations - just SQL statements.
+"""
+
+        print("🔄 Generating schema via OpenAI API...")
+
+        response = openai.ChatCompletion.create(
+            model=self.model_name,
+            messages=[{"role": "user", "content": prompt}],
+            temperature=self.temperature
+        )
+
+        generated_schema = response['choices'][0]['message']['content'].strip()
+        generated_schema = generated_schema.replace('```sql', '').replace('```', '').strip()
+
+        print("✅ Schema generated!")
         return generated_schema
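
The prompt's ordering rules matter because a child table's foreign key can only reference a parent table that already exists, and with enforcement on, child rows need their parent rows first. A minimal sketch with a made-up parent/child pair:

```python
# Shows why the generated script must create and populate parent tables first.
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("PRAGMA foreign_keys = ON")  # SQLite leaves FK enforcement off by default
conn.executescript("""
DROP TABLE IF EXISTS employees;   -- child first
DROP TABLE IF EXISTS departments; -- parent last

CREATE TABLE departments (id INTEGER PRIMARY KEY AUTOINCREMENT, name TEXT);
CREATE TABLE employees (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    name TEXT,
    department_id INTEGER,
    FOREIGN KEY (department_id) REFERENCES departments(id)
);

INSERT INTO departments (name) VALUES ('Engineering');  -- parent row first
INSERT INTO employees (name, department_id) VALUES ('Alice', 1);
""")
print(conn.execute("SELECT COUNT(*) FROM employees").fetchone()[0])  # -> 1
```
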
sutra/sutra.py
CHANGED

@@ -1,5 +1,5 @@
-"""QuerySUTRA v0.5.3 …
-__version__ = "0.5.3"
+"""QuerySUTRA v0.6.0 - AI-powered data analysis for structured and unstructured data"""
+__version__ = "0.6.0"
 __author__ = "Aditya Batta"
 __all__ = ["SUTRA", "QueryResult"]
 
@@ -72,7 +72,7 @@ class SUTRA:
         pass
 
         self._refresh_schema()
-        print(f"QuerySUTRA …
+        print(f"QuerySUTRA v{__version__} Ready")
 
     def upload(self, data: Union[str, pd.DataFrame], name: Optional[str] = None) -> 'SUTRA':
         """Upload."""
@@ -136,6 +136,8 @@ class SUTRA:
                 rec['id'] = idx
             self._store(pd.DataFrame(recs), f"{name}_{etype}")
             print(f"  {etype}: {len(recs)} rows")
+        # After all tables are created, detect and store foreign key relationships
+        self._create_foreign_keys()
         return
 
     print("Using regex fallback...")
@@ -264,6 +266,55 @@ JSON:"""
         self._refresh_schema()
         print(f"  {name}: {len(df)} rows")
 
+    def _create_foreign_keys(self, silent=False):
+        """Detect foreign key relationships between tables by matching column naming patterns.
+        e.g., 'person_id' in work_experience -> 'id' in people table."""
+        tables = self._get_tables()
+
+        # Build a map of potential parent tables by looking for 'id' columns
+        # e.g., employee_data_people has 'id' -> can be referenced as person_id, people_id
+        parent_map = {}  # Maps potential FK column names -> (parent_table, parent_pk)
+        for t in tables:
+            self.cursor.execute(f"PRAGMA table_info({t})")
+            cols = {r[1]: r[2] for r in self.cursor.fetchall()}
+            if 'id' in cols:
+                # Generate possible FK names from table name
+                # e.g., 'employee_data_people' -> 'person_id', 'people_id'
+                parts = t.split('_')
+                for part in parts:
+                    # singular form guesses
+                    fk_name = f"{part}_id"
+                    parent_map[fk_name] = (t, 'id')
+                    # Handle plural -> singular (people -> person)
+                    if part.endswith('ies'):
+                        parent_map[f"{part[:-3]}y_id"] = (t, 'id')
+                    elif part.endswith('es'):
+                        parent_map[f"{part[:-2]}_id"] = (t, 'id')
+                    elif part.endswith('s'):
+                        parent_map[f"{part[:-1]}_id"] = (t, 'id')
+                # Also try full table name as FK
+                parent_map[f"{t}_id"] = (t, 'id')
+
+        # Now scan all tables for columns matching FK patterns
+        self.foreign_keys = {}  # table -> [(fk_col, parent_table, parent_col)]
+        for t in tables:
+            self.cursor.execute(f"PRAGMA table_info({t})")
+            cols = [r[1] for r in self.cursor.fetchall()]
+            fks = []
+            for col in cols:
+                if col in parent_map:
+                    parent_table, parent_col = parent_map[col]
+                    if parent_table != t:  # Don't self-reference
+                        fks.append((col, parent_table, parent_col))
+            if fks:
+                self.foreign_keys[t] = fks
+
+        if self.foreign_keys and not silent:
+            print(f"\n🔗 Detected relationships:")
+            for t, fks in self.foreign_keys.items():
+                for fk_col, parent_table, parent_col in fks:
+                    print(f"  {t}.{fk_col} → {parent_table}.{parent_col}")
+
     def ask(self, q: str, viz: Union[bool, str] = False, table: Optional[str] = None) -> 'QueryResult':
         """
         Query - FIXED: Considers ALL tables, picks best one or joins multiple.
@@ -273,6 +324,10 @@ JSON:"""
 
         print(f"\nQuestion: {q}")
 
+        # Ensure foreign key relationships are detected
+        if not hasattr(self, 'foreign_keys') or not self.foreign_keys:
+            self._create_foreign_keys(silent=True)
+
         # FIXED: If no table specified, let AI pick the right one(s)
         if not table:
             # Get ALL table schemas
@@ -308,6 +363,24 @@ JSON:"""
             print(f"Error: {e}")
             return QueryResult(False, sql, pd.DataFrame(), None, str(e))
 
+    def _get_relationship_context(self) -> str:
+        """Build a clear relationship context string for the AI prompt."""
+        if not hasattr(self, 'foreign_keys') or not self.foreign_keys:
+            # Try to detect relationships if not already done
+            self._create_foreign_keys(silent=True)
+
+        if not hasattr(self, 'foreign_keys') or not self.foreign_keys:
+            return ""
+
+        lines = ["\n=== TABLE RELATIONSHIPS (FOREIGN KEYS) ==="]
+        lines.append("Use these to JOIN tables when a question needs data from multiple tables:")
+        for t, fks in self.foreign_keys.items():
+            for fk_col, parent_table, parent_col in fks:
+                lines.append(f"  {t}.{fk_col} → {parent_table}.{parent_col}")
+                lines.append(f"  JOIN syntax: JOIN {parent_table} ON {t}.{fk_col} = {parent_table}.{parent_col}")
+        lines.append("=" * 50)
+        return "\n".join(lines)
+
@@ -318,35 +391,70 @@
             schema_context += f"\n{tbl} ({info['row_count']} rows):\n"
             schema_context += f"  Columns: {', '.join(info['columns'])}\n"
 
-        # Add …
+        # Add relationship context
+        relationship_context = self._get_relationship_context()
+
+        # Add sample data from ALL tables (not just first 3)
         samples = ""
-        for tbl in list(all_schemas.keys())[:3]:
+        for tbl in list(all_schemas.keys())[:6]:  # Show more tables
             try:
                 sample_df = pd.read_sql_query(f"SELECT * FROM {tbl} LIMIT 2", self.conn)
                 samples += f"\nSample from {tbl}:\n{sample_df.to_string(index=False)}\n"
             except:
                 pass
 
-        prompt = f"""You are an …
+        prompt = f"""You are an expert SQL query generator.
 
 {schema_context}
-
+{relationship_context}
 {samples}
 
 Question: {q}
 
-… (old lines 338-342 were not rendered in the source diff)
+CRITICAL INSTRUCTIONS - FOLLOW THESE STEPS:
+
+STEP 1: READ THE TABLE RELATIONSHIPS SECTION ABOVE.
+Those show you exactly how tables connect via foreign keys.
+
+STEP 2: IDENTIFY WHICH TABLES HAVE THE DATA NEEDED.
+- Person info (name, email, city, state) → look in *_people table
+- Work info (company, position, start_date) → look in *_work_experience table
+- Skills, education, etc. → look in their respective tables
+
+STEP 3: IF THE QUESTION NEEDS DATA FROM MULTIPLE TABLES, YOU MUST USE JOIN.
+Use the foreign key relationships shown above.
+Example: If work_experience has person_id and people has id:
+JOIN people ON work_experience.person_id = people.id
+
+STEP 4: WRITE THE QUERY.
+- Use table aliases for readability
+- Qualify ALL column names with table alias to avoid ambiguity
+- For "who" / "which person" questions, ALWAYS join to the people table to get names
+- For "from <state>" or "in <city>" questions, the location is in the people table, JOIN to it
+- For "count by state" or "group by state", the state column is in the people table, JOIN to it
+
+EXAMPLES:
+❌ WRONG: SELECT COUNT(*) FROM work_experience GROUP BY company
+(when asked "count by state" - state is NOT in work_experience!)
+
+✅ CORRECT: SELECT p.state, COUNT(*) as employee_count
+           FROM work_experience w
+           JOIN people p ON w.person_id = p.id
+           GROUP BY p.state
+
+❌ WRONG: SELECT * FROM work_experience WHERE company LIKE '%FL%'
+(when asked "how many from FL" - FL is a state, not a company!)
+
+✅ CORRECT: SELECT COUNT(*) as count
+           FROM people p
+           WHERE p.state = 'FL'
 
-Return ONLY the SQL query, no explanations:"""
+Return ONLY the executable SQL query. No explanations, no markdown, no code blocks:"""
 
         r = self.client.chat.completions.create(
             model="gpt-4o-mini",
             messages=[
-                {"role": "system", "content": "SQL …
+                {"role": "system", "content": "You are an expert SQL query generator. ALWAYS use JOIN when data is spread across multiple tables. ALWAYS check which table a column belongs to before using it. State, city, name are typically in people tables. Position, company are in work_experience tables. Return ONLY executable SQL."},
                 {"role": "user", "content": prompt}
             ],
             temperature=0
@@ -562,7 +670,7 @@ Return ONLY the SQL query, no explanations:"""
         return [r[0] for r in self.cursor.fetchall()]
 
     def _refresh_schema(self):
-        """Refresh."""
+        """Refresh schema info."""
         self.schema_info = {}
         for t in self._get_tables():
             self.cursor.execute(f"PRAGMA table_info({t})")
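
The FK detection in `_create_foreign_keys` is purely name-based, including a small plural-to-singular step. A standalone sketch of just that naming heuristic (the table name below is invented); note that the code comment's `people -> person` case is not actually covered, since "people" ends in none of `ies`/`es`/`s`:

```python
# Reproduces the naming heuristic _create_foreign_keys uses to guess which
# columns are foreign keys pointing at a table with an 'id' column.
def candidate_fk_names(table: str) -> set:
    names = {f"{table}_id"}            # the full table name as an FK suffix
    for part in table.split('_'):
        names.add(f"{part}_id")
        if part.endswith('ies'):       # 'companies' -> 'company_id'
            names.add(f"{part[:-3]}y_id")
        elif part.endswith('es'):      # 'addresses' -> 'address_id'
            names.add(f"{part[:-2]}_id")
        elif part.endswith('s'):       # 'departments' -> 'department_id'
            names.add(f"{part[:-1]}_id")
    return names

print(sorted(candidate_fk_names("employee_data_companies")))
# -> ['companies_id', 'company_id', 'data_id',
#     'employee_data_companies_id', 'employee_id']
```
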
querysutra-0.5.3.dist-info/RECORD
DELETED

@@ -1,28 +0,0 @@
-querysutra-0.5.3.dist-info/licenses/LICENSE,sha256=F-4b93u0OVrVwGXgMwBRq6MlGyUT9zmre1oh5Gft5Ts,1066
-sutra/__init__.py,sha256=25HUMETpmA1tlMl5j-ajdo9MRXljSZBrirSTH7w7jIc,118
-sutra/cache_manager.py,sha256=e0AAeUqoR-aiqzZ3fB-IDvpJ4JA6-YBFyRJxusEnIrA,3082
-sutra/clear_cache.py,sha256=rVIz29p7V11Uh6oHXeaWpFtYXXv-2OED91cHMAWWxtQ,187
-sutra/core.py,sha256=R_JbOlZTukegP92Dr-WLsdr632_otFN7o9qSvcxyBtw,10497
-sutra/data_loader.py,sha256=_yPj-DS2qYtlCgaMACQtfXZfSuAdVVd4igNP7yzXolc,5781
-sutra/database_manager.py,sha256=L-QC_WwR3Pnl1BRh0rnEv5MNSTr4C7ZP-hIPfCHRK88,7672
-sutra/direct_query.py,sha256=X69I646zHIZlZjMmgn8O2xLS_7ww7miAkABTnJEPAAc,2724
-sutra/feedback.py,sha256=PHSffU_rfORjLkTW3-j2VSjQdw4ufROsTeBWaX6DZ00,1642
-sutra/feedback_matcher.py,sha256=WXYpGtFJnOyYQOzy-z8uBiUWH5vyJJOMS1NwEYzNfic,2865
-sutra/nlp_processor.py,sha256=wMS1hz1aGWjSwPUD7lSNBbQapFtLgF2l65j0QKXQOd0,5461
-sutra/schema_embeddings.py,sha256=bVPzpJOdYTyUdG2k3ZdgYJLrX2opHBx68RIjJcMlueo,9732
-sutra/schema_generator.py,sha256=BX_vXmnvSGc6nCBx40WLSoNL3WIYPDahd1cEYloyY4M,1925
-sutra/sutra.py,sha256=61juV3zlMau4UZJ-5IxjaN-Bc1XBP8w2vkYfum-aXlY,21979
-sutra/sutra_client.py,sha256=PYYDGqVbA9pB-Zcsm52i9KarwijCIGVZOThgONZP6Vs,14203
-sutra/sutra_core.py,sha256=diaWOXUHn1wrqCQrBhLKL612tMQioaqx-ILc3y9-CqM,11708
-sutra/sutra_simple.py,sha256=rnqzG7OAt4p64XtO0peMqHS1pG5tdA8U3EYTMVsq7BE,23201
-sutra/visualizer.py,sha256=YOKTmjQcY72smmx9KsZrQTdbAiE5GQDKofMFjpLIUfI,6996
-tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-tests/test_modules.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-tests/test_sutra.py,sha256=6Z4SoIuBzza101304I7plkyPVkUBbjIxR8uPs9z5ntg,2383
-utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-utils/file_utils.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-utils/text_utils.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-querysutra-0.5.3.dist-info/METADATA,sha256=yFffBSYGfbLrYnXA7OFGHk1mO37fpUV-0iglmHXbAVQ,7258
-querysutra-0.5.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-querysutra-0.5.3.dist-info/top_level.txt,sha256=9v0buw21eo5LaUU_3Cf9b9MqRyEvtM9cHaOuEXUKVqM,18
-querysutra-0.5.3.dist-info/RECORD,,
tests/__init__.py
DELETED
File without changes

tests/test_modules.py
DELETED
File without changes

tests/test_sutra.py
DELETED

@@ -1,76 +0,0 @@
-"""
-Test suite for SUTRA library
-Run with: pytest test_sutra.py
-"""
-
-import pytest
-import pandas as pd
-import os
-from sutra import SutraClient
-
-
-class TestSutraClient:
-    """Test cases for SutraClient"""
-
-    @pytest.fixture
-    def client(self):
-        """Create a test client"""
-        # Use a test database
-        api_key = os.getenv('OPENAI_API_KEY', 'test-key')
-        client = SutraClient(api_key=api_key, db_path="test_db.db")
-        yield client
-        # Cleanup
-        client.close()
-        if os.path.exists("test_db.db"):
-            os.remove("test_db.db")
-
-    @pytest.fixture
-    def sample_data(self):
-        """Create sample DataFrame"""
-        return pd.DataFrame({
-            'name': ['Alice', 'Bob', 'Charlie'],
-            'age': [25, 30, 35],
-            'city': ['New York', 'London', 'Paris']
-        })
-
-    def test_client_initialization(self, client):
-        """Test client can be initialized"""
-        assert client is not None
-        assert client.db_path == "test_db.db"
-
-    def test_upload_dataframe(self, client, sample_data):
-        """Test uploading a DataFrame"""
-        result = client.upload_dataframe(sample_data, "test_table")
-        assert result['status'] == 'success'
-        assert result['table_name'] == 'test_table'
-        assert result['rows_inserted'] == 3
-
-    def test_list_tables(self, client, sample_data):
-        """Test listing tables"""
-        client.upload_dataframe(sample_data, "test_table")
-        tables = client.list_tables()
-        assert 'test_table' in tables
-
-    def test_execute_sql(self, client, sample_data):
-        """Test direct SQL execution"""
-        client.upload_dataframe(sample_data, "test_table")
-        result = client.execute_sql("SELECT * FROM test_table")
-        assert result['status'] == 'success'
-        assert len(result['results']) == 3
-
-    def test_get_table_info(self, client, sample_data):
-        """Test getting table information"""
-        client.upload_dataframe(sample_data, "test_table")
-        info = client.get_table_info("test_table")
-        assert info['table_name'] == 'test_table'
-        assert len(info['columns']) > 0
-
-
-def test_import():
-    """Test that the library can be imported"""
-    from sutra import SutraClient
-    assert SutraClient is not None
-
-
-if __name__ == "__main__":
-    pytest.main([__file__, "-v"])
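
The removed suite targeted the old `SutraClient` API, which this release drops from the public surface. A hypothetical minimal replacement against the new `SUTRA` surface; the constructor behavior and the `upload`/`ask` signatures are assumptions based on this diff, not a spec:

```python
# Hypothetical smoke tests for the 0.6.0 API. SUTRA and QueryResult are the
# names exported by the new sutra/__init__.py; everything else is assumed.
import os

import pandas as pd
import pytest


def test_import():
    from sutra import SUTRA, QueryResult
    assert SUTRA is not None and QueryResult is not None


@pytest.mark.skipif(not os.environ.get("OPENAI_API_KEY"), reason="needs an API key")
def test_upload_and_ask():
    from sutra import SUTRA
    s = SUTRA()  # assumed: reads OPENAI_API_KEY from the environment
    s.upload(pd.DataFrame({"name": ["Alice"], "state": ["FL"]}), name="people")
    result = s.ask("how many people are from FL?")
    assert result is not None
```
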
utils/__init__.py
DELETED
File without changes

utils/file_utils.py
DELETED
File without changes

utils/text_utils.py
DELETED
File without changes

{querysutra-0.5.3.dist-info → querysutra-0.6.0.dist-info}/licenses/LICENSE
File without changes