querysutra-0.3.1-py3-none-any.whl → querysutra-0.3.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- querysutra-0.3.3.dist-info/METADATA +285 -0
- {querysutra-0.3.1.dist-info → querysutra-0.3.3.dist-info}/RECORD +7 -7
- sutra/__init__.py +2 -2
- sutra/sutra.py +281 -463
- querysutra-0.3.1.dist-info/METADATA +0 -429
- {querysutra-0.3.1.dist-info → querysutra-0.3.3.dist-info}/WHEEL +0 -0
- {querysutra-0.3.1.dist-info → querysutra-0.3.3.dist-info}/licenses/LICENSE +0 -0
- {querysutra-0.3.1.dist-info → querysutra-0.3.3.dist-info}/top_level.txt +0 -0
sutra/sutra.py
CHANGED
@@ -1,21 +1,19 @@
 """
-QuerySUTRA v0.3.
+QuerySUTRA v0.3.3 - PROPER RELATIONAL DATABASE EXTRACTION
 SUTRA: Structured-Unstructured-Text-Retrieval-Architecture
 
-
-
-
-
-
-✅ Embeddings for caching - OPTIONAL (user decides)
-✅ All features are OPTIONAL - zero hard coding
+FIXED: Proper primary keys, foreign keys, and relational integrity
+- Unique IDs for each entity
+- Proper foreign key relationships
+- No duplicate keys
+- Comprehensive entity extraction (skills, technologies, projects, etc.)
 
 Author: Aditya Batta
 License: MIT
-Version: 0.3.
+Version: 0.3.3
 """
 
-__version__ = "0.3.
+__version__ = "0.3.3"
 __author__ = "Aditya Batta"
 __title__ = "QuerySUTRA: Structured-Unstructured-Text-Retrieval-Architecture"
 __all__ = ["SUTRA", "QueryResult", "quick_start"]
@@ -24,7 +22,7 @@ import os
 import sqlite3
 import pandas as pd
 import numpy as np
-from typing import Optional, Union, Dict, Any, List
+from typing import Optional, Union, Dict, Any, List
 from pathlib import Path
 import json
 import hashlib
@@ -78,12 +76,7 @@ class SUTRA:
     """
     SUTRA: Structured-Unstructured-Text-Retrieval-Architecture
 
-
-    - Custom visualizations (pie, bar, line, scatter, etc.)
-    - Load existing databases (SQLite, MySQL, PostgreSQL)
-    - Smart NLP with fuzzy matching (OPTIONAL)
-    - Query relevance detection (OPTIONAL)
-    - Embeddings for caching (OPTIONAL)
+    Professional data analysis with proper relational database structure
     """
 
     def __init__(self,
@@ -93,19 +86,9 @@ class SUTRA:
                  check_relevance: bool = False,
                  fuzzy_match: bool = True,
                  cache_queries: bool = True):
-        """
-
-
-        Args:
-            api_key: OpenAI API key (optional)
-            db: Database path (SQLite file)
-            use_embeddings: Use embeddings for smart query caching (saves API calls)
-            check_relevance: Check if query is relevant to database before processing
-            fuzzy_match: Enable fuzzy matching for city names, etc. (e.g., "New York City" → "New York")
-            cache_queries: Cache SQL queries to avoid repeated API calls
-        """
-        print("🚀 Initializing QuerySUTRA v0.3.0 - ENHANCED MODE")
-        print("   SUTRA: Structured-Unstructured-Text-Retrieval-Architecture")
+        """Initialize SUTRA with optional features."""
+        print("Initializing QuerySUTRA v0.3.3")
+        print("SUTRA: Structured-Unstructured-Text-Retrieval-Architecture")
 
         if api_key:
             os.environ["OPENAI_API_KEY"] = api_key
@@ -120,7 +103,6 @@ class SUTRA:
         self.current_table = None
         self.schema_info = {}
 
-        # OPTIONAL FEATURES (user decides)
         self.cache_queries = cache_queries
         self.cache = {} if cache_queries else None
 
@@ -131,126 +113,77 @@ class SUTRA:
         self.check_relevance = check_relevance
         self.fuzzy_match = fuzzy_match
 
-        # Initialize embeddings if requested
         if use_embeddings and HAS_EMBEDDINGS:
             try:
-                print("
+                print("Loading embeddings model...")
                 self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
-                print("
+                print("Embeddings ready")
             except:
-                print("
+                print("Embeddings unavailable")
                 self.use_embeddings = False
 
-        # Refresh schema
         self._refresh_schema()
 
-        print(f"
-        print(f"   Cache: {'ON' if cache_queries else 'OFF'}")
-        print(f"   Embeddings: {'ON' if use_embeddings else 'OFF'}")
-        print(f"   Relevance Check: {'ON' if check_relevance else 'OFF'}")
-        print(f"   Fuzzy Match: {'ON' if fuzzy_match else 'OFF'}")
-
+        print(f"Ready! Database: {db}")
         if not self.api_key:
-            print("
-
-    # ========================================================================
-    # NEW: LOAD EXISTING DATABASE
-    # ========================================================================
+            print("No API key - use .sql() for direct queries")
 
     @classmethod
     def load_from_db(cls, db_path: str, api_key: Optional[str] = None, **kwargs):
-        """
-        Load existing database WITHOUT re-uploading data.
-
-        Args:
-            db_path: Path to existing SQLite database
-            api_key: OpenAI API key
-            **kwargs: Other options (use_embeddings, check_relevance, etc.)
-
-        Returns:
-            SUTRA instance connected to existing database
-
-        Example:
-            sutra = SUTRA.load_from_db("sutra.db", api_key="sk-...")
-            sutra.tables()  # See existing tables
-            result = sutra.ask("Show me data")  # Query immediately!
-        """
+        """Load existing SQLite database."""
         if not Path(db_path).exists():
             raise FileNotFoundError(f"Database not found: {db_path}")
 
-        print(f"
+        print(f"Loading database: {db_path}")
         instance = cls(api_key=api_key, db=db_path, **kwargs)
 
         tables = instance.tables()
-        print(f"
+        print(f"Loaded {len(tables)} tables")
 
         return instance
 
     @classmethod
     def connect_mysql(cls, host: str, user: str, password: str, database: str,
                       port: int = 3306, api_key: Optional[str] = None, **kwargs):
-        """
-        Connect to existing MySQL database WITHOUT importing data.
-        Query directly from MySQL!
-
-        Args:
-            host: MySQL host
-            user: MySQL user
-            password: MySQL password
-            database: Database name
-            port: MySQL port
-            api_key: OpenAI API key
-
-        Example:
-            sutra = SUTRA.connect_mysql("localhost", "root", "pass", "mydb", api_key="sk-...")
-            result = sutra.ask("Show me users")
-        """
+        """Connect to MySQL database."""
        try:
             from sqlalchemy import create_engine
         except ImportError:
-            raise ImportError("Run: pip install
+            raise ImportError("Run: pip install QuerySUTRA[mysql]")
 
-        print(f"
+        print(f"Connecting to MySQL: {host}:{port}/{database}")
 
         connection_string = f"mysql+mysqlconnector://{user}:{password}@{host}:{port}/{database}"
 
-        # Create temporary SQLite and sync tables
         temp_db = f"sutra_mysql_{database}.db"
         instance = cls(api_key=api_key, db=temp_db, **kwargs)
 
         engine = create_engine(connection_string)
 
-        # Get all tables from MySQL
         tables = pd.read_sql_query("SHOW TABLES", engine).iloc[:, 0].tolist()
 
-        print(f"
-        print(f"   Syncing to local cache...")
+        print(f"Found {len(tables)} tables, syncing...")
 
         for table in tables:
             df = pd.read_sql_query(f"SELECT * FROM {table}", engine)
             df.to_sql(table, instance.conn, if_exists='replace', index=False)
-            print(f"
+            print(f"   {table}: {len(df)} rows")
 
         instance._refresh_schema()
-        print(f"
+        print(f"Connected! {len(tables)} tables available")
 
         return instance
 
     @classmethod
     def connect_postgres(cls, host: str, user: str, password: str, database: str,
                          port: int = 5432, api_key: Optional[str] = None, **kwargs):
-        """
-        Connect to existing PostgreSQL database WITHOUT importing data.
-
-        Example:
-            sutra = SUTRA.connect_postgres("localhost", "postgres", "pass", "mydb", api_key="sk-...")
-        """
+        """Connect to PostgreSQL database."""
         try:
             from sqlalchemy import create_engine
         except ImportError:
-            raise ImportError("Run: pip install
+            raise ImportError("Run: pip install QuerySUTRA[postgres]")
 
-        print(f"
+        print(f"Connecting to PostgreSQL: {host}:{port}/{database}")
 
         connection_string = f"postgresql://{user}:{password}@{host}:{port}/{database}"
 
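The long docstrings trimmed above carried the only usage examples for these constructors; the API itself is unchanged. A minimal sketch reconstructed from the removed 0.3.1 docstrings (the key and database names are placeholders). Note that despite the old "WITHOUT importing data" wording, connect_mysql and connect_postgres copy every table into a temporary local SQLite file and query that copy:

    from sutra import SUTRA

    # Reopen an existing SQLite database without re-uploading data
    sutra = SUTRA.load_from_db("sutra.db", api_key="sk-...")
    sutra.tables()                       # list existing tables
    result = sutra.ask("Show me data")   # query immediately

    # Mirror a MySQL database into a local SQLite cache, then query it
    sutra = SUTRA.connect_mysql("localhost", "root", "pass", "mydb", api_key="sk-...")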
@@ -259,32 +192,34 @@ class SUTRA:
 
         engine = create_engine(connection_string)
 
-        # Get all tables
         tables = pd.read_sql_query(
             "SELECT tablename FROM pg_tables WHERE schemaname='public'",
             engine
         )['tablename'].tolist()
 
-        print(f"
-        print(f"   Syncing to local cache...")
+        print(f"Found {len(tables)} tables, syncing...")
 
         for table in tables:
             df = pd.read_sql_query(f"SELECT * FROM {table}", engine)
             df.to_sql(table, instance.conn, if_exists='replace', index=False)
-            print(f"
+            print(f"   {table}: {len(df)} rows")
 
         instance._refresh_schema()
-        print(f"
+        print(f"Connected! {len(tables)} tables available")
 
         return instance
 
-
-
-
-
-
-
-
+    def upload(self, data: Union[str, pd.DataFrame], name: Optional[str] = None,
+               extract_entities: Optional[List[str]] = None) -> 'SUTRA':
+        """
+        Upload data with optional custom entity extraction.
+
+        Args:
+            data: File path or DataFrame
+            name: Table name
+            extract_entities: Custom entities to extract (e.g., ['skills', 'technologies'])
+        """
+        print(f"\nUploading data...")
 
         if isinstance(data, pd.DataFrame):
             name = name or "data"
@@ -298,9 +233,8 @@
             name = name or path.stem.replace(" ", "_").replace("-", "_")
             ext = path.suffix.lower()
 
-            print(f"
+            print(f"File: {path.name}")
 
-            # Load based on format
             if ext == ".csv":
                 df = pd.read_csv(path)
                 self._store_dataframe(df, name)
@@ -318,69 +252,63 @@
                 self.cursor.executescript(f.read())
             self.conn.commit()
             self._refresh_schema()
-            print(
+            print("SQL executed")
 
         elif ext == ".pdf":
-            self._smart_upload_pdf(path, name)
+            self._smart_upload_pdf(path, name, extract_entities)
 
         elif ext == ".docx":
-            self._smart_upload_docx(path, name)
+            self._smart_upload_docx(path, name, extract_entities)
 
         elif ext == ".txt":
-            self._smart_upload_txt(path, name)
+            self._smart_upload_txt(path, name, extract_entities)
 
         else:
             raise ValueError(f"Unsupported format: {ext}")
 
         return self
 
-
-
-    # ========================================================================
-
-    def _smart_upload_pdf(self, path: Path, base_name: str):
-        """Parse PDF and create multiple tables."""
+    def _smart_upload_pdf(self, path: Path, base_name: str, extract_entities: Optional[List[str]] = None):
+        """Parse PDF with proper relational structure."""
         if not HAS_PYPDF2:
-            raise ImportError("
+            raise ImportError("Run: pip install PyPDF2")
 
-        print("
+        print("Extracting text from PDF...")
 
         with open(path, 'rb') as file:
             pdf_reader = PyPDF2.PdfReader(file)
             text = ""
             for page_num, page in enumerate(pdf_reader.pages, 1):
                 text += page.extract_text() + "\n"
-                print(f"
+                print(f"   Page {page_num}/{len(pdf_reader.pages)}")
 
         if self.client:
-            print("
-            tables = self._create_tables_with_ai(text, base_name)
+            print("AI: Comprehensive entity extraction with proper relationships...")
+            tables = self._create_tables_with_ai(text, base_name, extract_entities)
 
             if tables and len(tables) > 0:
-                print(f"\
+                print(f"\nCreated {len(tables)} relational tables:")
                 for tbl_name in tables:
                     count = pd.read_sql_query(f"SELECT COUNT(*) FROM {tbl_name}", self.conn).iloc[0, 0]
                     cols = len(self.schema_info.get(tbl_name, {}))
-                    print(f"
+                    print(f"   {tbl_name}: {count} rows, {cols} columns")
                 return
 
-
-        print("   ⚠️ AI not available, creating simple text table")
+        print("AI unavailable, creating simple table")
         df = self._parse_text_simple(text)
         self._store_dataframe(df, base_name)
 
-    def _smart_upload_docx(self, path: Path, base_name: str):
-        """Parse DOCX
+    def _smart_upload_docx(self, path: Path, base_name: str, extract_entities: Optional[List[str]] = None):
+        """Parse DOCX with proper structure."""
         if not HAS_DOCX:
-            raise ImportError("
+            raise ImportError("Run: pip install python-docx")
 
-        print("
+        print("Extracting from DOCX...")
 
         doc = docx.Document(path)
 
-        # Check for tables first
         if doc.tables:
-            print(f"
+            print(f"Found {len(doc.tables)} table(s)")
             for i, table in enumerate(doc.tables):
                 data = [[cell.text.strip() for cell in row.cells] for row in table.rows]
                 if data and len(data) > 1:
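upload() gains the extract_entities parameter in 0.3.3 and, as the hunks above show, threads it into every text-format handler. A short usage sketch based on the new docstring (the file name is hypothetical):

    sutra = SUTRA(api_key="sk-...")
    sutra.upload("resume.pdf", extract_entities=["skills", "technologies"])
    sutra.tables()  # one table per extracted entity type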
@@ -389,93 +317,144 @@ class SUTRA:
                     self._store_dataframe(df, table_name)
             return
 
-        # Extract text
         text = "\n".join([para.text for para in doc.paragraphs])
 
         if self.client:
-            print("
-            tables = self._create_tables_with_ai(text, base_name)
+            print("AI: Analyzing...")
+            tables = self._create_tables_with_ai(text, base_name, extract_entities)
 
             if tables and len(tables) > 0:
-                print(f"\
+                print(f"\nCreated {len(tables)} tables:")
                 for tbl_name in tables:
                     count = pd.read_sql_query(f"SELECT COUNT(*) FROM {tbl_name}", self.conn).iloc[0, 0]
                     cols = len(self.schema_info.get(tbl_name, {}))
-                    print(f"
+                    print(f"   {tbl_name}: {count} rows, {cols} columns")
                 return
 
         df = self._parse_text_simple(text)
         self._store_dataframe(df, base_name)
 
-    def _smart_upload_txt(self, path: Path, base_name: str):
-        """Parse TXT
-        print("
+    def _smart_upload_txt(self, path: Path, base_name: str, extract_entities: Optional[List[str]] = None):
+        """Parse TXT with proper structure."""
+        print("Reading TXT...")
 
         with open(path, 'r', encoding='utf-8') as file:
             text = file.read()
 
         if self.client:
-            print("
-            tables = self._create_tables_with_ai(text, base_name)
+            print("AI: Analyzing...")
+            tables = self._create_tables_with_ai(text, base_name, extract_entities)
 
             if tables and len(tables) > 0:
-                print(f"\
+                print(f"\nCreated {len(tables)} tables:")
                 for tbl_name in tables:
                     count = pd.read_sql_query(f"SELECT COUNT(*) FROM {tbl_name}", self.conn).iloc[0, 0]
                     cols = len(self.schema_info.get(tbl_name, {}))
-                    print(f"
+                    print(f"   {tbl_name}: {count} rows, {cols} columns")
                 return
 
         df = self._parse_text_simple(text)
         self._store_dataframe(df, base_name)
 
-    def _create_tables_with_ai(self, text: str, base_name: str) -> List[str]:
-        """
+    def _create_tables_with_ai(self, text: str, base_name: str, custom_entities: Optional[List[str]] = None) -> List[str]:
+        """
+        AI extracts ALL entities with PROPER primary and foreign keys.
+
+        CRITICAL: Each entity gets UNIQUE IDs, foreign keys properly link tables.
+        """
         if not self.client:
             return []
 
         try:
-
+            if custom_entities:
+                entity_instruction = f"""Extract these specific entities: {', '.join(custom_entities)}
+For each entity type, create a proper table with unique IDs."""
+            else:
+                entity_instruction = """Automatically identify and extract ALL structured entities.
+
+Common entities (extract ALL you find):
+- people: Personal information (id, name, email, phone, address, city, state, zip)
+- skills: Individual skills (id, person_id, skill_name, proficiency_level, years_experience)
+- technologies: Technologies/tools (id, person_id, technology_name, category, proficiency)
+- projects: Projects (id, person_id, project_name, description, start_date, end_date)
+- certifications: Certifications (id, person_id, cert_name, issuer, date_obtained)
+- education: Education records (id, person_id, degree, institution, graduation_year)
+- work_experience: Work history (id, person_id, company, title, start_date, end_date)
+- events: Events/meetings (id, host_id, description, location, date, attendee_ids)
+- organizations: Companies/departments (id, name, address, city, industry)
+- products: Products/services (id, name, description, price, category)
+- ANY other structured entities you identify
+
+Extract EVERYTHING you find in the text."""
+
+            extraction_prompt = f"""Analyze this text and extract ALL structured data into proper relational database tables.
 
 Text:
-{text[:
+{text[:6000]}
+
+{entity_instruction}
+
+CRITICAL REQUIREMENTS FOR PROPER DATABASE DESIGN:
+
+1. PRIMARY KEYS:
+   - Each table MUST have unique sequential IDs starting from 1
+   - Person 1 gets id=1, Person 2 gets id=2, etc.
+   - NO DUPLICATE IDs within same table
+   - IDs must be integers
+
+2. FOREIGN KEYS:
+   - Use foreign keys to link related tables
+   - Example: skills table has person_id that references people.id
+   - Example: projects table has person_id that references people.id
+   - Foreign keys MUST match existing primary keys
 
-
-
-
-
-4. organizations - (id, name, address, city)
-5. Any other entities you find
+3. TABLE STRUCTURE:
+   - Each entity type gets its own table
+   - Use clear table names (people, skills, technologies, not table1, table2)
+   - Include ALL relevant attributes for each entity
 
-Return
+Return JSON with this EXACT structure:
 {{
   "people": [
-    {{"id": 1, "name": "John Doe", "
+    {{"id": 1, "name": "John Doe", "email": "john@email.com", "phone": "+1-555-0100", "city": "Dallas", "state": "TX"}},
+    {{"id": 2, "name": "Jane Smith", "email": "jane@email.com", "phone": "+1-555-0101", "city": "New York", "state": "NY"}},
     ...
   ],
-  "
-    {{"id": 1, "person_id": 1, "
+  "skills": [
+    {{"id": 1, "person_id": 1, "skill_name": "Python", "proficiency": "Expert", "years": 5}},
+    {{"id": 2, "person_id": 1, "skill_name": "SQL", "proficiency": "Advanced", "years": 3}},
+    {{"id": 3, "person_id": 2, "skill_name": "Java", "proficiency": "Expert", "years": 7}},
     ...
   ],
-  "
-    {{"id": 1, "
+  "technologies": [
+    {{"id": 1, "person_id": 1, "technology": "React", "category": "Frontend"}},
+    {{"id": 2, "person_id": 1, "technology": "PostgreSQL", "category": "Database"}},
+    {{"id": 3, "person_id": 2, "technology": "Spring Boot", "category": "Backend"}},
+    ...
+  ],
+  "projects": [
+    {{"id": 1, "person_id": 1, "project_name": "E-commerce Platform", "role": "Lead Developer"}},
+    {{"id": 2, "person_id": 2, "project_name": "Analytics Dashboard", "role": "Backend Engineer"}},
     ...
   ]
 }}
 
 IMPORTANT:
-- Extract
--
+- Extract EVERY structured piece of data you find
+- Assign UNIQUE sequential IDs (1, 2, 3, ...) for each table
+- Foreign keys MUST reference valid primary keys
+- Create as many tables as needed (don't limit yourself)
 - Return ONLY valid JSON, no explanations
--
+- Be COMPREHENSIVE - extract skills, technologies, projects, certifications, education, work history, etc."""
 
             response = self.client.chat.completions.create(
                 model="gpt-4o-mini",
                 messages=[
-                    {"role": "system", "content": "You are a
+                    {"role": "system", "content": "You are a database design expert. Extract ALL entities with proper primary keys (unique sequential IDs) and foreign keys (referencing valid IDs). Be comprehensive and extract EVERYTHING. Return only valid JSON."},
                     {"role": "user", "content": extraction_prompt}
                 ],
-                temperature=0
+                temperature=0,
+                max_tokens=4096
             )
 
             json_text = response.choices[0].message.content.strip()
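If the model honors the requested JSON shape, each top-level entity list later becomes its own {base_name}_{entity_type} table, and the person_id columns join back to the people table's id. A hedged sketch for a hypothetical upload named "resume" — the keys are conventions encoded in the prompt, not SQLite constraints, since the tables are written with plain DataFrame.to_sql:

    result = sutra.sql(
        "SELECT p.name, s.skill_name, s.proficiency "
        "FROM resume_people p JOIN resume_skills s ON s.person_id = p.id"
    )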
@@ -485,7 +464,6 @@ IMPORTANT:
 
             created_tables = []
 
-            # Create tables from extracted data
             for entity_type, records in extracted_data.items():
                 if records and isinstance(records, list) and len(records) > 0:
                     table_name = f"{base_name}_{entity_type}"
@@ -495,24 +473,23 @@ IMPORTANT:
                     if not df.empty:
                         self._store_dataframe(df, table_name, silent=True)
                         created_tables.append(table_name)
-                        print(f"
+                        print(f"   {entity_type}: {len(df)} records")
                 except Exception as e:
-                    print(f"
+                    print(f"   Failed {entity_type}: {e}")
 
             return created_tables
 
         except Exception as e:
-            print(f"
+            print(f"AI extraction error: {e}")
             return []
 
     def _parse_text_simple(self, text: str) -> pd.DataFrame:
-        """
+        """Fallback text parsing."""
         lines = [line.strip() for line in text.split('\n') if line.strip()]
 
         if not lines:
             return pd.DataFrame({'content': ['No content']})
 
-        # Try to detect if it's tabular
         sample = lines[:min(10, len(lines))]
         for delimiter in ['\t', ',', '|', ';']:
             if all(delimiter in line for line in sample):
@@ -529,98 +506,63 @@ IMPORTANT:
             })
 
     def _store_dataframe(self, df: pd.DataFrame, name: str, silent: bool = False):
-        """Store DataFrame
+        """Store DataFrame."""
         df.columns = [str(c).strip().replace(" ", "_").replace("-", "_") for c in df.columns]
         df.to_sql(name, self.conn, if_exists='replace', index=False)
         self.current_table = name
         self._refresh_schema()
 
         if not silent:
-            print(f"
-            print(f"
-            print(f"   🔤 Columns: {', '.join(df.columns[:10].tolist())}{' ...' if len(df.columns) > 10 else ''}")
+            print(f"Uploaded: {name}")
+            print(f"   {len(df)} rows, {len(df.columns)} columns")
 
-
-
-    # ========================================================================
-
-    def ask(self, question: str,
-            viz: Union[bool, str] = False,
-            table: Optional[str] = None) -> 'QueryResult':
-        """
-        Ask question with FLEXIBLE visualization options.
-
-        Args:
-            question: Natural language question
-            viz: Visualization type:
-                - False: No visualization
-                - True: Auto-detect best chart
-                - "pie": Pie chart
-                - "bar": Bar chart
-                - "line": Line chart
-                - "scatter": Scatter plot
-                - "table": Table view
-                - "heatmap": Heatmap
-            table: Specific table to query (optional)
-
-        Examples:
-            result = sutra.ask("How many people in each city?")
-            result = sutra.ask("Show sales by month", viz="line")
-            result = sutra.ask("Revenue breakdown", viz="pie")
-            result = sutra.ask("Compare metrics", viz="bar")
-        """
+    def ask(self, question: str, viz: Union[bool, str] = False, table: Optional[str] = None) -> 'QueryResult':
+        """Query with natural language."""
         if not self.client:
-            print("
+            print("No API key")
             return QueryResult(False, "", pd.DataFrame(), None, "No API key")
 
-        print(f"\
+        print(f"\nQuestion: {question}")
 
-        # NEW: Check relevance if enabled
         if self.check_relevance:
             if not self._is_relevant_query(question):
-                print("
-
-
-
-                if choice not in ['yes', 'y', 'yeah', 'yep', 'sure']:
-                    return QueryResult(False, "", pd.DataFrame(), None, "Query not relevant to database")
+                print("Warning: Query may be irrelevant")
+                choice = input("Continue? (yes/no): ").strip().lower()
+                if choice not in ['yes', 'y']:
+                    return QueryResult(False, "", pd.DataFrame(), None, "Irrelevant")
 
-        # Select table
         tbl = table or self.current_table
         if not tbl:
             all_tables = self._get_table_names()
             if all_tables:
                 tbl = all_tables[0]
             else:
-                print("
+                print("No tables found")
                 return QueryResult(False, "", pd.DataFrame(), None, "No table")
 
-        # NEW: Check embeddings cache if enabled
         if self.use_embeddings and self.embedding_model:
             cached_result = self._check_embedding_cache(question, tbl)
             if cached_result:
-                print("
+                print("   Using cached result")
                 return cached_result
 
-        # NEW: Apply fuzzy matching to question if enabled
         if self.fuzzy_match:
             question = self._apply_fuzzy_matching(question, tbl)
 
-        # Check simple cache
         cache_key = hashlib.md5(f"{question}:{tbl}".encode()).hexdigest()
-        if self.cache_queries and cache_key in self.cache:
+        if self.cache_queries and self.cache and cache_key in self.cache:
             sql_query = self.cache[cache_key]
-            print("
+            print("   From cache")
         else:
             sql_query = self._generate_sql(question, tbl)
-            if self.cache_queries:
+            if self.cache_queries and self.cache is not None:
                 self.cache[cache_key] = sql_query
 
-        print(f"
+        print(f"SQL: {sql_query}")
 
         try:
             df = pd.read_sql_query(sql_query, self.conn)
-            print(f"
+            print(f"Success! {len(df)} rows")
 
             fig = None
             if viz:
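The worked examples deleted from the old ask() docstring still describe the call signature; a minimal sketch:

    result = sutra.ask("How many people in each city?")
    result = sutra.ask("Show sales by month", viz="line")
    result = sutra.ask("Revenue breakdown", viz="pie")
    result.show()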
@@ -629,30 +571,24 @@ IMPORTANT:
 
             result = QueryResult(True, sql_query, df, fig)
 
-            # Store in embedding cache if enabled
             if self.use_embeddings and self.embedding_model:
                 self._store_in_embedding_cache(question, tbl, result)
 
             return result
         except Exception as e:
-            print(f"
+            print(f"Error: {e}")
             return QueryResult(False, sql_query, pd.DataFrame(), None, str(e))
 
-    # ========================================================================
-    # NEW: RELEVANCE CHECK
-    # ========================================================================
-
     def _is_relevant_query(self, question: str) -> bool:
-        """Check
+        """Check relevance."""
         if not self.client:
             return True
 
-        # Get database context
         tables = self._get_table_names()
         columns = []
-        for tbl in tables[:3]:
+        for tbl in tables[:3]:
             cols = list(self.schema_info.get(tbl, {}).keys())
-            columns.extend(cols[:5])
+            columns.extend(cols[:5])
 
         db_context = f"Tables: {', '.join(tables[:5])}. Columns: {', '.join(columns[:15])}"
 
@@ -660,31 +596,22 @@ IMPORTANT:
             response = self.client.chat.completions.create(
                 model="gpt-4o-mini",
                 messages=[
-                    {"role": "system", "content": "
-                    {"role": "user", "content": f"Is this
+                    {"role": "system", "content": "Relevance checker. Return only 'yes' or 'no'."},
+                    {"role": "user", "content": f"Is this relevant to database with {db_context}?\n\nQuestion: {question}\n\nyes or no:"}
                 ],
                 temperature=0,
                 max_tokens=5
             )
 
-
-            return 'yes' in answer
+            return 'yes' in response.choices[0].message.content.strip().lower()
         except:
-            return True
-
-    # ========================================================================
-    # NEW: FUZZY MATCHING FOR BETTER NLP
-    # ========================================================================
+            return True
 
     def _apply_fuzzy_matching(self, question: str, table: str) -> str:
-        """
-        Apply fuzzy matching to improve NLP understanding.
-        Example: "New York City" → finds "New York" in database
-        """
+        """Fuzzy match query terms."""
         if not self.schema_info.get(table):
             return question
 
-        # Get all unique values from string columns
         try:
             string_cols = [col for col, dtype in self.schema_info[table].items()
                            if 'TEXT' in dtype or 'VARCHAR' in dtype]
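The relevance check rewritten across the two hunks above now reads the yes/no verdict directly off the completion rather than an intermediate variable, and still fails open (returns True) on any error. The feature remains opt-in; a sketch:

    sutra = SUTRA(api_key="sk-...", check_relevance=True)
    sutra.ask("What's the weather today?")  # warns and asks before running an off-topic query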
@@ -692,41 +619,32 @@ IMPORTANT:
             if not string_cols:
                 return question
 
-
-
-
-
-
-
-
-
-
-
-
-
-                        # Replace with closest match
-                        words_in_question[i] = matches[0]
-                        print(f"   🔍 Fuzzy match: '{word}' → '{matches[0]}'")
+            for col in string_cols[:2]:
+                df = pd.read_sql_query(f"SELECT DISTINCT {col} FROM {table} LIMIT 100", self.conn)
+                unique_values = [str(v) for v in df[col].dropna().tolist()]
+
+                words = question.split()
+                for i, word in enumerate(words):
+                    matches = get_close_matches(word, unique_values, n=1, cutoff=0.6)
+                    if matches and word != matches[0]:
+                        words[i] = matches[0]
+                        print(f"   Fuzzy: '{word}' -> '{matches[0]}'")
+
+                question = " ".join(words)
 
-            return
+            return question
         except:
             return question
 
-    # ========================================================================
-    # NEW: EMBEDDING-BASED CACHE
-    # ========================================================================
-
     def _check_embedding_cache(self, question: str, table: str) -> Optional['QueryResult']:
-        """Check
+        """Check embedding cache."""
         if not self.query_embeddings:
             return None
 
-        # Get embedding for current question
         q_embedding = self.embedding_model.encode([question])[0]
 
-        # Find most similar cached query
         best_match = None
-        best_similarity = 0.85
+        best_similarity = 0.85
 
         for cached_q, cached_data in self.query_embeddings.items():
             if cached_data['table'] != table:
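The rewritten matcher leans on the standard library's difflib.get_close_matches, comparing each word of the question against up to 100 distinct values from the first two text columns. Its behavior in isolation (the values are hypothetical):

    from difflib import get_close_matches

    values = ["Dallas", "New York", "Chicago"]
    print(get_close_matches("Dalas", values, n=1, cutoff=0.6))  # ['Dallas']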
@@ -741,39 +659,27 @@ IMPORTANT:
                 best_match = cached_q
 
         if best_match:
-            print(f"
+            print(f"   Similar query ({best_similarity:.0%}): '{best_match}'")
             return self.query_embeddings[best_match]['result']
 
         return None
 
     def _store_in_embedding_cache(self, question: str, table: str, result: 'QueryResult'):
-        """Store
+        """Store in cache."""
         q_embedding = self.embedding_model.encode([question])[0]
-
         self.query_embeddings[question] = {
             'table': table,
             'embedding': q_embedding,
             'result': result
         }
 
-    # ========================================================================
-    # NEW: FLEXIBLE VISUALIZATION
-    # ========================================================================
-
     def _visualize(self, df: pd.DataFrame, title: str, viz_type: str = "auto"):
-        """
-        Create flexible visualization based on user choice.
-
-        Args:
-            df: Data to visualize
-            title: Chart title
-            viz_type: Type of visualization (auto, pie, bar, line, scatter, table, heatmap)
-        """
+        """Create visualization."""
         if not HAS_PLOTLY and not HAS_MATPLOTLIB:
-            print("
+            print("Install plotly or matplotlib")
             return None
 
-        print(f"
+        print(f"Creating {viz_type} chart...")
 
         if HAS_PLOTLY:
             return self._plotly_viz(df, title, viz_type)
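The loop that computes similarity against the 0.85 floor falls outside this hunk, but with SentenceTransformer vectors the conventional measure is cosine similarity. A minimal sketch of that computation, assuming plain numpy arrays:

    import numpy as np

    def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
        # 1.0 means identical direction; the cache fires above the 0.85 threshold
        return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))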
@@ -781,7 +687,7 @@ IMPORTANT:
             return self._matplotlib_viz(df, title, viz_type)
 
     def _plotly_viz(self, df: pd.DataFrame, title: str, viz_type: str):
-        """
+        """Plotly visualization."""
         try:
             numeric = df.select_dtypes(include=[np.number]).columns.tolist()
             categorical = df.select_dtypes(include=['object']).columns.tolist()
@@ -791,57 +697,39 @@ IMPORTANT:
                     header=dict(values=list(df.columns)),
                     cells=dict(values=[df[c] for c in df.columns])
                 )])
-
             elif viz_type == "pie" and categorical and numeric:
                 fig = px.pie(df, names=categorical[0], values=numeric[0], title=title)
-
             elif viz_type == "bar" and categorical and numeric:
                 fig = px.bar(df, x=categorical[0], y=numeric[0], title=title)
-
             elif viz_type == "line" and numeric:
                 fig = px.line(df, y=numeric[0], title=title)
-
             elif viz_type == "scatter" and len(numeric) >= 2:
                 fig = px.scatter(df, x=numeric[0], y=numeric[1], title=title)
-
             elif viz_type == "heatmap" and len(numeric) >= 2:
-                # Create correlation heatmap
                 corr = df[numeric].corr()
                 fig = go.Figure(data=go.Heatmap(
-                    z=corr.values,
-                    x=corr.columns,
-                    y=corr.columns,
-                    colorscale='Viridis'
+                    z=corr.values, x=corr.columns, y=corr.columns, colorscale='Viridis'
                 ))
                 fig.update_layout(title=title)
-
             elif viz_type == "auto":
-                # Auto-detect best chart
                 if categorical and numeric:
-                    if len(df) <= 10
-                        fig = px.pie(df, names=categorical[0], values=numeric[0], title=title)
-                    else:
-                        fig = px.bar(df, x=categorical[0], y=numeric[0], title=title)
+                    fig = px.pie(df, names=categorical[0], values=numeric[0], title=title) if len(df) <= 10 else px.bar(df, x=categorical[0], y=numeric[0], title=title)
                 elif len(numeric) >= 2:
                     fig = px.line(df, y=numeric[0], title=title)
                 else:
                     fig = px.bar(df, y=df.columns[0], title=title)
             else:
-
-                if categorical and numeric:
-                    fig = px.bar(df, x=categorical[0], y=numeric[0], title=title)
-                else:
-                    fig = px.bar(df, y=df.columns[0], title=title)
+                fig = px.bar(df, x=categorical[0] if categorical else df.index, y=numeric[0] if numeric else df.columns[0], title=title)
 
             fig.show()
-            print("
+            print("Chart displayed")
             return fig
         except Exception as e:
-            print(f"
+            print(f"Viz error: {e}")
             return None
 
     def _matplotlib_viz(self, df: pd.DataFrame, title: str, viz_type: str):
-        """
+        """Matplotlib visualization."""
         try:
             plt.figure(figsize=(10, 6))
             numeric = df.select_dtypes(include=[np.number]).columns
@@ -859,26 +747,22 @@ IMPORTANT:
             plt.title(title)
             plt.tight_layout()
             plt.show()
-            print("
+            print("Chart displayed")
             return plt.gcf()
         except Exception as e:
-            print(f"
+            print(f"Viz error: {e}")
             return None
 
-    # ========================================================================
-    # VIEW DATABASE
-    # ========================================================================
-
     def tables(self) -> Dict[str, dict]:
-        """
+        """List all tables."""
         print("\n" + "="*70)
-        print("
+        print("TABLES IN DATABASE")
         print("="*70)
 
         all_tables = self._get_table_names()
 
         if not all_tables:
-            print("
+            print("No tables found")
             return {}
 
         result = {}
@@ -887,26 +771,23 @@ IMPORTANT:
             cols = self.schema_info.get(tbl, {})
             col_list = list(cols.keys())
 
-            marker = "
-            print(f"{marker} {i}.
-            print(f"
-            print(f"
+            marker = ">" if tbl == self.current_table else " "
+            print(f"{marker} {i}. {tbl}")
+            print(f"   {count} rows, {len(col_list)} columns")
+            print(f"   Columns: {', '.join(col_list[:8])}")
 
-            result[tbl] = {
-                'rows': count,
-                'columns': col_list
-            }
+            result[tbl] = {'rows': count, 'columns': col_list}
 
         print("="*70)
         return result
 
     def schema(self, table: Optional[str] = None) -> dict:
-        """Show
+        """Show schema."""
         if not self.schema_info:
             self._refresh_schema()
 
         print("\n" + "="*70)
-        print("
+        print("DATABASE SCHEMA")
         print("="*70)
 
         tables_to_show = [table] if table else self.schema_info.keys()
@@ -915,12 +796,12 @@ IMPORTANT:
         for tbl in tables_to_show:
             if tbl in self.schema_info:
                 count = pd.read_sql_query(f"SELECT COUNT(*) FROM {tbl}", self.conn).iloc[0, 0]
-                print(f"\
-                print(f"
-                print(
+                print(f"\nTable: {tbl}")
+                print(f"Records: {count}")
+                print("Columns:")
 
                 for col, dtype in self.schema_info[tbl].items():
-                    print(f"
+                    print(f"   - {col:<30} ({dtype})")
 
                 result[tbl] = {
                     'records': count,
@@ -931,151 +812,126 @@ IMPORTANT:
         return result
 
     def peek(self, table: Optional[str] = None, n: int = 5) -> pd.DataFrame:
-        """
+        """Preview data."""
         tbl = table or self.current_table
         if not tbl:
-            print("
+            print("No table specified")
             return pd.DataFrame()
 
         df = pd.read_sql_query(f"SELECT * FROM {tbl} LIMIT {n}", self.conn)
-        print(f"\
+        print(f"\nSample from '{tbl}' ({n} rows):")
         print(df.to_string(index=False))
         return df
 
     def info(self):
-        """
+        """Database overview."""
         return self.tables()
 
-    # ========================================================================
-    # QUERY METHODS
-    # ========================================================================
-
     def sql(self, query: str, viz: Union[bool, str] = False) -> 'QueryResult':
-        """Execute SQL
-        print(
+        """Execute SQL."""
+        print("\nExecuting SQL...")
 
         try:
             df = pd.read_sql_query(query, self.conn)
-            print(f"
+            print(f"Success! {len(df)} rows")
 
             fig = None
             if viz:
                 viz_type = viz if isinstance(viz, str) else "auto"
-                fig = self._visualize(df, "SQL
+                fig = self._visualize(df, "SQL Result", viz_type=viz_type)
 
             return QueryResult(True, query, df, fig)
         except Exception as e:
-            print(f"
+            print(f"Error: {e}")
             return QueryResult(False, query, pd.DataFrame(), None, str(e))
 
     def interactive(self, question: str) -> 'QueryResult':
-        """
-        print(f"\
-        choice = input("
-
-        if choice in ['
-            viz = True
-        elif choice in ['pie', 'bar', 'line', 'scatter', 'table', 'heatmap']:
-            viz = choice
-        else:
-            viz = False
+        """Interactive query."""
+        print(f"\nQuestion: {question}")
+        choice = input("Visualize? (yes/no/pie/bar/line/scatter): ").strip().lower()
+
+        viz = choice if choice in ['pie', 'bar', 'line', 'scatter', 'table', 'heatmap'] else (True if choice in ['yes', 'y'] else False)
 
         return self.ask(question, viz=viz)
 
-    # ========================================================================
-    # DATABASE EXPORT
-    # ========================================================================
-
     def export_db(self, path: str, format: str = "sqlite"):
-        """Export
-        print(f"\
-
-        format = format.lower()
+        """Export database."""
+        print(f"\nExporting to {format}...")
 
         if format == "sqlite":
             shutil.copy2(self.db_path, path)
-            print(f"✅ Saved to {path}")
-
         elif format == "sql":
             with open(path, 'w', encoding='utf-8') as f:
                 for line in self.conn.iterdump():
                     f.write(f'{line}\n')
-            print(f"✅ Saved to {path}")
-
         elif format == "json":
             data = {}
             for table in self._get_table_names():
                 df = pd.read_sql_query(f"SELECT * FROM {table}", self.conn)
                 data[table] = df.to_dict(orient='records')
-
             with open(path, 'w', encoding='utf-8') as f:
                 json.dump(data, f, indent=2, default=str)
-            print(f"✅ Saved to {path}")
-
         elif format == "excel":
             with pd.ExcelWriter(path, engine='openpyxl') as writer:
                 for table in self._get_table_names():
                     df = pd.read_sql_query(f"SELECT * FROM {table}", self.conn)
                     df.to_excel(writer, sheet_name=table[:31], index=False)
-            print(f"✅ Saved to {path}")
-
         else:
-            raise ValueError(f"Unsupported
+            raise ValueError(f"Unsupported: {format}")
 
+        print(f"Saved to {path}")
         return self
 
     def save_to_mysql(self, host: str, user: str, password: str, database: str,
                       port: int = 3306, tables: Optional[List[str]] = None):
-        """
+        """Export to MySQL."""
         try:
             from sqlalchemy import create_engine
         except ImportError:
-            raise ImportError("Run: pip install
+            raise ImportError("Run: pip install QuerySUTRA[mysql]")
 
-        print(f"\
+        print(f"\nConnecting to MySQL: {host}:{port}...")
 
-
-        engine = create_engine(connection_string)
+        engine = create_engine(f"mysql+mysqlconnector://{user}:{password}@{host}:{port}/{database}")
 
         tables_to_export = tables or self._get_table_names()
 
-        print(f"
+        print(f"Exporting {len(tables_to_export)} tables...")
 
         for table in tables_to_export:
             df = pd.read_sql_query(f"SELECT * FROM {table}", self.conn)
             df.to_sql(table, engine, if_exists='replace', index=False)
-            print(f"
+            print(f"   {table}: {len(df)} rows")
 
-        print(
+        print("Complete!")
         return self
 
     def save_to_postgres(self, host: str, user: str, password: str, database: str,
                          port: int = 5432, tables: Optional[List[str]] = None):
-        """
+        """Export to PostgreSQL."""
         try:
             from sqlalchemy import create_engine
         except ImportError:
-            raise ImportError("Run: pip install
+            raise ImportError("Run: pip install QuerySUTRA[postgres]")
 
-        print(f"\
+        print(f"\nConnecting to PostgreSQL: {host}:{port}...")
 
-
-        engine = create_engine(connection_string)
+        engine = create_engine(f"postgresql://{user}:{password}@{host}:{port}/{database}")
 
         tables_to_export = tables or self._get_table_names()
 
-        print(f"
+        print(f"Exporting {len(tables_to_export)} tables...")
 
         for table in tables_to_export:
             df = pd.read_sql_query(f"SELECT * FROM {table}", self.conn)
             df.to_sql(table, engine, if_exists='replace', index=False)
-            print(f"
+            print(f"   {table}: {len(df)} rows")
 
-        print(
+        print("Complete!")
         return self
 
     def backup(self, backup_path: str = None):
-        """Create
+        """Create backup."""
         if backup_path:
             backup_dir = Path(backup_path)
             backup_dir.mkdir(parents=True, exist_ok=True)
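All the export paths above share one pattern: read each table back through pandas, write it to the target, and confirm. A usage sketch (paths and credentials hypothetical):

    sutra.export_db("backup.db", format="sqlite")
    sutra.export_db("backup.json", format="json")
    sutra.save_to_postgres("localhost", "postgres", "pass", "mydb")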
@@ -1084,52 +940,47 @@ IMPORTANT:
 
         timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
 
-        print(
+        print("\nCreating backup...")
 
-        db_backup = backup_dir / f"
+        db_backup = backup_dir / f"sutra_{timestamp}.db"
         self.export_db(str(db_backup), format="sqlite")
 
-        json_backup = backup_dir / f"
+        json_backup = backup_dir / f"sutra_{timestamp}.json"
         self.export_db(str(json_backup), format="json")
 
-        print(f"\
-        print(f"
-        print(f"
+        print(f"\nBackup complete!")
+        print(f"   Database: {db_backup}")
+        print(f"   Data: {json_backup}")
 
         return self
 
-    # ========================================================================
-    # UTILITIES
-    # ========================================================================
-
     def export(self, data: pd.DataFrame, path: str, format: str = "csv"):
         """Export results."""
-
-        if fmt == "csv":
+        if format == "csv":
             data.to_csv(path, index=False)
-        elif
+        elif format in ["excel", "xlsx"]:
             data.to_excel(path, index=False)
-        elif
+        elif format == "json":
             data.to_json(path, orient="records", indent=2)
         else:
-            raise ValueError(f"Unknown
+            raise ValueError(f"Unknown: {format}")
 
-        print(f"
+        print(f"Exported to {path}")
         return self
 
     def close(self):
-        """Close
+        """Close connection."""
         if self.conn:
             self.conn.close()
-        print("
+        print("Closed")
 
     def _get_table_names(self) -> List[str]:
-        """Get
+        """Get tables."""
         self.cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
         return [r[0] for r in self.cursor.fetchall()]
 
     def _refresh_schema(self):
-        """Refresh schema
+        """Refresh schema."""
         tables = self._get_table_names()
 
         self.schema_info = {}
@@ -1138,7 +989,7 @@ IMPORTANT:
             self.schema_info[tbl] = {r[1]: r[2] for r in self.cursor.fetchall()}
 
     def _generate_sql(self, question: str, table: str) -> str:
-        """Generate SQL
+        """Generate SQL."""
         schema = self.schema_info.get(table, {})
         sample_df = pd.read_sql_query(f"SELECT * FROM {table} LIMIT 3", self.conn)
         sample = sample_df.to_string(index=False)
@@ -1156,7 +1007,7 @@ Sample:
 
 Question: {question}
 
-Return ONLY SQL.
+Return ONLY SQL."""
 
         response = self.client.chat.completions.create(
             model="gpt-4o-mini",
@@ -1177,7 +1028,6 @@ Return ONLY SQL. No explanations."""
         self.close()
 
     def __repr__(self):
-        tables = len(self.schema_info)
         features = []
         if self.cache_queries:
             features.append("cache")
@@ -1188,8 +1038,8 @@ Return ONLY SQL. No explanations."""
         if self.fuzzy_match:
             features.append("fuzzy")
 
-
-        return f"SUTRA(tables={
+        feat_str = f", {', '.join(features)}" if features else ""
+        return f"SUTRA(tables={len(self.schema_info)}{feat_str})"
 
 
 class QueryResult:
@@ -1203,15 +1053,10 @@ class QueryResult:
         self.error = error
 
     def __repr__(self):
-        if self.success
-            return f"QueryResult(rows={len(self.data)}, cols={len(self.data.columns)})"
-        return f"QueryResult(error='{self.error}')"
+        return f"QueryResult(rows={len(self.data)}, cols={len(self.data.columns)})" if self.success else f"QueryResult(error='{self.error}')"
 
     def show(self):
-        if self.success:
-            print(self.data)
-        else:
-            print(f"Error: {self.error}")
+        print(self.data) if self.success else print(f"Error: {self.error}")
         return self
 
 
@@ -1224,39 +1069,12 @@ def quick_start(api_key: str, data_path: str, question: str, viz: Union[bool, str
 
 if __name__ == "__main__":
     print("""
-
-
-    ║  Structured-Unstructured-Text-Retrieval-Architecture  ║
-    ╚══════════════════════════════════════════════════════════════╝
+    QuerySUTRA v0.3.3 - Professional Data Analysis
+    SUTRA: Structured-Unstructured-Text-Retrieval-Architecture
 
-
-
-    ✅ Custom visualizations (pie, bar, line, scatter, table)
-    ✅ Smart NLP with fuzzy matching (optional)
-    ✅ Irrelevant query detection (optional)
-    ✅ Embeddings for caching (optional)
-    ✅ All features are OPTIONAL - you control everything!
+    Fixed: Proper primary and foreign keys with unique IDs
+    Features: Load existing DB, custom viz, fuzzy matching, embeddings
 
     Installation: pip install QuerySUTRA
-
-    Quick Start:
-        from sutra import SUTRA
-
-        # NEW: Load existing database
-        sutra = SUTRA.load_from_db("sutra.db", api_key="sk-...")
-
-        # Or create new with options
-        sutra = SUTRA(api_key="sk-...",
-                      use_embeddings=True,      # Smart caching
-                      check_relevance=True,     # Detect irrelevant queries
-                      fuzzy_match=True)         # Better NLP
-
-        # Upload and query
-        sutra.upload("data.pdf")
-        result = sutra.ask("Show sales by region", viz="pie")
-
-        # Connect to MySQL/PostgreSQL
-        sutra = SUTRA.connect_mysql("localhost", "root", "pass", "db")
-
-    Supported: CSV, Excel, JSON, SQL, PDF, DOCX, TXT, DataFrame
+    Usage: from sutra import SUTRA
     """)