QuerySUTRA 0.3.3-py3-none-any.whl → 0.4.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- querysutra-0.4.1.dist-info/METADATA +264 -0
- {querysutra-0.3.3.dist-info → querysutra-0.4.1.dist-info}/RECORD +7 -7
- sutra/__init__.py +2 -5
- sutra/sutra.py +352 -580
- querysutra-0.3.3.dist-info/METADATA +0 -285
- {querysutra-0.3.3.dist-info → querysutra-0.4.1.dist-info}/WHEEL +0 -0
- {querysutra-0.3.3.dist-info → querysutra-0.4.1.dist-info}/licenses/LICENSE +0 -0
- {querysutra-0.3.3.dist-info → querysutra-0.4.1.dist-info}/top_level.txt +0 -0
sutra/sutra.py
CHANGED
@@ -1,35 +1,32 @@
 """
-QuerySUTRA v0.
+QuerySUTRA v0.4.0 - SIMPLE & AUTOMATIC
 SUTRA: Structured-Unstructured-Text-Retrieval-Architecture

-FIXED:
--
--
--
--
+FIXED:
+- Auto-creates MySQL database if not exists
+- One-line export to MySQL
+- Complete data extraction from large PDFs
+- No manual file transfers needed

 Author: Aditya Batta
-
-Version: 0.3.3
+Version: 0.4.0
 """

-__version__ = "0.3.3"
+__version__ = "0.4.0"
 __author__ = "Aditya Batta"
-__title__ = "QuerySUTRA: Structured-Unstructured-Text-Retrieval-Architecture"
 __all__ = ["SUTRA", "QueryResult", "quick_start"]

 import os
 import sqlite3
 import pandas as pd
 import numpy as np
-from typing import Optional, Union, Dict,
+from typing import Optional, Union, Dict, List
 from pathlib import Path
 import json
 import hashlib
 import warnings
 import shutil
 import datetime
-import re
 from io import StringIO
 from difflib import get_close_matches
 warnings.filterwarnings('ignore')
@@ -73,22 +70,13 @@ except ImportError:


 class SUTRA:
-"""
-SUTRA: Structured-Unstructured-Text-Retrieval-Architecture
+"""SUTRA: Structured-Unstructured-Text-Retrieval-Architecture"""

-
-
-
-
-
-db: str = "sutra.db",
-use_embeddings: bool = False,
-check_relevance: bool = False,
-fuzzy_match: bool = True,
-cache_queries: bool = True):
-"""Initialize SUTRA with optional features."""
-print("Initializing QuerySUTRA v0.3.3")
-print("SUTRA: Structured-Unstructured-Text-Retrieval-Architecture")
+def __init__(self, api_key: Optional[str] = None, db: str = "sutra.db",
+use_embeddings: bool = False, check_relevance: bool = False,
+fuzzy_match: bool = True, cache_queries: bool = True):
+"""Initialize."""
+print("Initializing QuerySUTRA v0.4.0")

 if api_key:
 os.environ["OPENAI_API_KEY"] = api_key
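
The new constructor collapses the old multi-line signature into a compact one. A minimal construction sketch (the API key value is a placeholder, not a real credential):

    from sutra import SUTRA

    # Feature flags keep the defaults shown in the signature above.
    sutra = SUTRA(api_key="sk-...", db="sutra.db",
                  fuzzy_match=True, cache_queries=True)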
@@ -97,454 +85,354 @@ class SUTRA:
 self.client = OpenAI(api_key=self.api_key) if self.api_key and HAS_OPENAI else None

 self.db_path = db
-self.conn = sqlite3.connect(db, check_same_thread=False)
-self.cursor = self.conn.cursor()

+try:
+self.conn = sqlite3.connect(db, timeout=30, check_same_thread=False)
+self.conn.execute("PRAGMA journal_mode=WAL")
+self.conn.execute("PRAGMA synchronous=NORMAL")
+except:
+self.conn = sqlite3.connect(db, check_same_thread=False)
+
+self.cursor = self.conn.cursor()
 self.current_table = None
 self.schema_info = {}

 self.cache_queries = cache_queries
 self.cache = {} if cache_queries else None
-
 self.use_embeddings = use_embeddings
 self.embedding_model = None
 self.query_embeddings = {}
-
 self.check_relevance = check_relevance
 self.fuzzy_match = fuzzy_match

 if use_embeddings and HAS_EMBEDDINGS:
 try:
-print("Loading embeddings model...")
 self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
-print("Embeddings ready")
 except:
-print("Embeddings unavailable")
 self.use_embeddings = False

 self._refresh_schema()
-
 print(f"Ready! Database: {db}")
-if not self.api_key:
-print("No API key - use .sql() for direct queries")

 @classmethod
 def load_from_db(cls, db_path: str, api_key: Optional[str] = None, **kwargs):
-"""Load existing
+"""Load existing database."""
 if not Path(db_path).exists():
-raise FileNotFoundError(f"
-
-print(f"Loading database: {db_path}")
-instance = cls(api_key=api_key, db=db_path, **kwargs)
-
-tables = instance.tables()
-print(f"Loaded {len(tables)} tables")
-
-return instance
+raise FileNotFoundError(f"Not found: {db_path}")
+return cls(api_key=api_key, db=db_path, **kwargs)

 @classmethod
 def connect_mysql(cls, host: str, user: str, password: str, database: str,
 port: int = 3306, api_key: Optional[str] = None, **kwargs):
-"""Connect to MySQL
+"""Connect to MySQL."""
 try:
 from sqlalchemy import create_engine
+import mysql.connector
 except ImportError:
 raise ImportError("Run: pip install QuerySUTRA[mysql]")

-print(f"Connecting to MySQL
+print(f"Connecting to MySQL...")

-
+# Auto-create database if not exists
+try:
+temp_conn = mysql.connector.connect(host=host, user=user, password=password, port=port)
+temp_cursor = temp_conn.cursor()
+temp_cursor.execute(f"CREATE DATABASE IF NOT EXISTS {database}")
+temp_cursor.close()
+temp_conn.close()
+except:
+pass

+engine = create_engine(f"mysql+mysqlconnector://{user}:{password}@{host}:{port}/{database}")
 temp_db = f"sutra_mysql_{database}.db"
 instance = cls(api_key=api_key, db=temp_db, **kwargs)

-engine = create_engine(connection_string)
-
 tables = pd.read_sql_query("SHOW TABLES", engine).iloc[:, 0].tolist()

-print(f"Found {len(tables)} tables, syncing...")
-
 for table in tables:
 df = pd.read_sql_query(f"SELECT * FROM {table}", engine)
 df.to_sql(table, instance.conn, if_exists='replace', index=False)
-print(f" {table}: {len(df)} rows")

 instance._refresh_schema()
-print(f"Connected! {len(tables)} tables
-
+print(f"Connected! {len(tables)} tables")
 return instance

 @classmethod
 def connect_postgres(cls, host: str, user: str, password: str, database: str,
 port: int = 5432, api_key: Optional[str] = None, **kwargs):
-"""Connect to PostgreSQL
+"""Connect to PostgreSQL."""
 try:
 from sqlalchemy import create_engine
 except ImportError:
 raise ImportError("Run: pip install QuerySUTRA[postgres]")

-print(f"Connecting to PostgreSQL
-
-connection_string = f"postgresql://{user}:{password}@{host}:{port}/{database}"
+print(f"Connecting to PostgreSQL...")

+engine = create_engine(f"postgresql://{user}:{password}@{host}:{port}/{database}")
 temp_db = f"sutra_postgres_{database}.db"
 instance = cls(api_key=api_key, db=temp_db, **kwargs)

-
-
-tables = pd.read_sql_query(
-"SELECT tablename FROM pg_tables WHERE schemaname='public'",
-engine
-)['tablename'].tolist()
-
-print(f"Found {len(tables)} tables, syncing...")
+tables = pd.read_sql_query("SELECT tablename FROM pg_tables WHERE schemaname='public'", engine)['tablename'].tolist()

 for table in tables:
 df = pd.read_sql_query(f"SELECT * FROM {table}", engine)
 df.to_sql(table, instance.conn, if_exists='replace', index=False)
-print(f" {table}: {len(df)} rows")

 instance._refresh_schema()
-print(f"Connected! {len(tables)} tables
-
+print(f"Connected! {len(tables)} tables")
 return instance

 def upload(self, data: Union[str, pd.DataFrame], name: Optional[str] = None,
-extract_entities: Optional[List[str]] = None
+extract_entities: Optional[List[str]] = None,
+auto_export_mysql: Optional[Dict[str, str]] = None) -> 'SUTRA':
 """
-Upload data with
+Upload data with OPTIONAL automatic MySQL export.

 Args:
 data: File path or DataFrame
 name: Table name
-extract_entities: Custom entities to extract
+extract_entities: Custom entities to extract
+auto_export_mysql: Auto-export to MySQL after upload
+{'host': 'localhost', 'user': 'root', 'password': 'pass', 'database': 'mydb'}
+
+Example:
+sutra.upload("data.pdf", auto_export_mysql={
+'host': 'localhost',
+'user': 'root',
+'password': '123456',
+'database': 'my_database'
+})
 """
-print(
+print("\nUploading...")

 if isinstance(data, pd.DataFrame):
 name = name or "data"
 self._store_dataframe(data, name)
-return self
-
-path = Path(data)
-if not path.exists():
-raise FileNotFoundError(f"File not found: {data}")
-
-name = name or path.stem.replace(" ", "_").replace("-", "_")
-ext = path.suffix.lower()
-
-print(f"File: {path.name}")
-
-if ext == ".csv":
-df = pd.read_csv(path)
-self._store_dataframe(df, name)
-
-elif ext in [".xlsx", ".xls"]:
-df = pd.read_excel(path)
-self._store_dataframe(df, name)
-
-elif ext == ".json":
-df = pd.read_json(path)
-self._store_dataframe(df, name)
-
-elif ext == ".sql":
-with open(path) as f:
-self.cursor.executescript(f.read())
-self.conn.commit()
-self._refresh_schema()
-print("SQL executed")
-
-elif ext == ".pdf":
-self._smart_upload_pdf(path, name, extract_entities)
-
-elif ext == ".docx":
-self._smart_upload_docx(path, name, extract_entities)
-
-elif ext == ".txt":
-self._smart_upload_txt(path, name, extract_entities)
-
 else:
-
+path = Path(data)
+if not path.exists():
+raise FileNotFoundError(f"Not found: {data}")
+
+name = name or path.stem.replace(" ", "_").replace("-", "_")
+ext = path.suffix.lower()
+
+print(f"File: {path.name}")
+
+if ext == ".csv":
+self._store_dataframe(pd.read_csv(path), name)
+elif ext in [".xlsx", ".xls"]:
+self._store_dataframe(pd.read_excel(path), name)
+elif ext == ".json":
+self._store_dataframe(pd.read_json(path), name)
+elif ext == ".sql":
+with open(path) as f:
+self.cursor.executescript(f.read())
+self.conn.commit()
+self._refresh_schema()
+elif ext == ".pdf":
+self._smart_upload_pdf(path, name, extract_entities)
+elif ext == ".docx":
+self._smart_upload_docx(path, name, extract_entities)
+elif ext == ".txt":
+self._smart_upload_txt(path, name, extract_entities)
+else:
+raise ValueError(f"Unsupported: {ext}")
+
+# AUTO-EXPORT to MySQL if requested
+if auto_export_mysql:
+print("\nAuto-exporting to MySQL...")
+self.save_to_mysql(
+host=auto_export_mysql.get('host', 'localhost'),
+user=auto_export_mysql.get('user', 'root'),
+password=auto_export_mysql['password'],
+database=auto_export_mysql['database'],
+port=auto_export_mysql.get('port', 3306)
+)

 return self

 def _smart_upload_pdf(self, path: Path, base_name: str, extract_entities: Optional[List[str]] = None):
-"""Parse PDF
+"""Parse PDF - extracts ALL pages."""
 if not HAS_PYPDF2:
 raise ImportError("Run: pip install PyPDF2")

-print("Extracting
+print("Extracting PDF...")

 with open(path, 'rb') as file:
 pdf_reader = PyPDF2.PdfReader(file)
-
+full_text = ""
 for page_num, page in enumerate(pdf_reader.pages, 1):
-
+full_text += page.extract_text() + "\n"
 print(f" Page {page_num}/{len(pdf_reader.pages)}")

 if self.client:
-print("AI:
-tables = self._create_tables_with_ai(text, base_name, extract_entities)
+print("AI: Extracting entities...")

-
-
-
-
-
-
+# Process in chunks for large documents
+chunk_size = 10000
+all_entities = {}
+
+for i in range(0, len(full_text), chunk_size):
+chunk = full_text[i:i+chunk_size]
+chunk_num = (i // chunk_size) + 1
+total_chunks = (len(full_text) // chunk_size) + 1
+
+if total_chunks > 1:
+print(f" Chunk {chunk_num}/{total_chunks}...")
+
+entities = self._extract_chunk(chunk, extract_entities)
+
+for entity_type, records in entities.items():
+if entity_type not in all_entities:
+all_entities[entity_type] = []
+all_entities[entity_type].extend(records)
+
+# Renumber IDs
+for entity_type, records in all_entities.items():
+for idx, record in enumerate(records, 1):
+record['id'] = idx
+
+# Create tables
+if all_entities:
+print(f"\nCreated {len(all_entities)} tables:")
+for entity_type, records in all_entities.items():
+if records:
+table_name = f"{base_name}_{entity_type}"
+df = pd.DataFrame(records)
+self._store_dataframe_safe(df, table_name)
+print(f" {entity_type}: {len(df)} records")
 return

-print("
-
-self._store_dataframe(df, base_name)
+print("Creating simple table")
+self._store_dataframe(self._parse_text_simple(full_text), base_name)

 def _smart_upload_docx(self, path: Path, base_name: str, extract_entities: Optional[List[str]] = None):
-"""Parse DOCX
+"""Parse DOCX."""
 if not HAS_DOCX:
 raise ImportError("Run: pip install python-docx")

-print("Extracting from DOCX...")
-
 doc = docx.Document(path)

 if doc.tables:
-print(f"Found {len(doc.tables)} table(s)")
 for i, table in enumerate(doc.tables):
 data = [[cell.text.strip() for cell in row.cells] for row in table.rows]
 if data and len(data) > 1:
 df = pd.DataFrame(data[1:], columns=data[0])
-
-self._store_dataframe(df, table_name)
+self._store_dataframe(df, f"{base_name}_table_{i+1}" if len(doc.tables) > 1 else base_name)
 return

 text = "\n".join([para.text for para in doc.paragraphs])

-if self.client:
-
-
-
-
-
-
-
-cols = len(self.schema_info.get(tbl_name, {}))
-print(f" {tbl_name}: {count} rows, {cols} columns")
-return
-
-df = self._parse_text_simple(text)
-self._store_dataframe(df, base_name)
+if self.client and len(text) > 0:
+entities = self._extract_chunk(text, extract_entities)
+for entity_type, records in entities.items():
+if records:
+df = pd.DataFrame(records)
+self._store_dataframe_safe(df, f"{base_name}_{entity_type}")
+else:
+self._store_dataframe(self._parse_text_simple(text), base_name)

 def _smart_upload_txt(self, path: Path, base_name: str, extract_entities: Optional[List[str]] = None):
-"""Parse TXT
-print("Reading TXT...")
-
+"""Parse TXT."""
 with open(path, 'r', encoding='utf-8') as file:
 text = file.read()

-if self.client:
-
-
-
-
-
-
-
-cols = len(self.schema_info.get(tbl_name, {}))
-print(f" {tbl_name}: {count} rows, {cols} columns")
-return
-
-df = self._parse_text_simple(text)
-self._store_dataframe(df, base_name)
+if self.client and len(text) > 0:
+entities = self._extract_chunk(text, extract_entities)
+for entity_type, records in entities.items():
+if records:
+df = pd.DataFrame(records)
+self._store_dataframe_safe(df, f"{base_name}_{entity_type}")
+else:
+self._store_dataframe(self._parse_text_simple(text), base_name)

-def
-"""
-AI extracts ALL entities with PROPER primary and foreign keys.
-
-CRITICAL: Each entity gets UNIQUE IDs, foreign keys properly link tables.
-"""
+def _extract_chunk(self, text: str, custom_entities: Optional[List[str]] = None) -> Dict:
+"""Extract entities from text chunk."""
 if not self.client:
-return
+return {}

 try:
-
-entity_instruction = f"""Extract these specific entities: {', '.join(custom_entities)}
-For each entity type, create a proper table with unique IDs."""
-else:
-entity_instruction = """Automatically identify and extract ALL structured entities.
-
-Common entities (extract ALL you find):
-- people: Personal information (id, name, email, phone, address, city, state, zip)
-- skills: Individual skills (id, person_id, skill_name, proficiency_level, years_experience)
-- technologies: Technologies/tools (id, person_id, technology_name, category, proficiency)
-- projects: Projects (id, person_id, project_name, description, start_date, end_date)
-- certifications: Certifications (id, person_id, cert_name, issuer, date_obtained)
-- education: Education records (id, person_id, degree, institution, graduation_year)
-- work_experience: Work history (id, person_id, company, title, start_date, end_date)
-- events: Events/meetings (id, host_id, description, location, date, attendee_ids)
-- organizations: Companies/departments (id, name, address, city, industry)
-- products: Products/services (id, name, description, price, category)
-- ANY other structured entities you identify
-
-Extract EVERYTHING you find in the text."""
-
-extraction_prompt = f"""Analyze this text and extract ALL structured data into proper relational database tables.
+prompt = f"""Extract ALL structured entities from this text.

 Text:
-{text[:
+{text[:8000]}

-
+Extract entities like: people, skills, technologies, projects, certifications, education, work_experience, events, organizations, or ANY other structured data.

-
+Return JSON with arrays. Use sequential IDs (1,2,3...). Foreign keys reference primary keys.

-
-- Each table MUST have unique sequential IDs starting from 1
-- Person 1 gets id=1, Person 2 gets id=2, etc.
-- NO DUPLICATE IDs within same table
-- IDs must be integers
-
-2. FOREIGN KEYS:
-- Use foreign keys to link related tables
-- Example: skills table has person_id that references people.id
-- Example: projects table has person_id that references people.id
-- Foreign keys MUST match existing primary keys
-
-3. TABLE STRUCTURE:
-- Each entity type gets its own table
-- Use clear table names (people, skills, technologies, not table1, table2)
-- Include ALL relevant attributes for each entity
-
-Return JSON with this EXACT structure:
+Example:
 {{
-"people": [
-
-{{"id": 2, "name": "Jane Smith", "email": "jane@email.com", "phone": "+1-555-0101", "city": "New York", "state": "NY"}},
-...
-],
-"skills": [
-{{"id": 1, "person_id": 1, "skill_name": "Python", "proficiency": "Expert", "years": 5}},
-{{"id": 2, "person_id": 1, "skill_name": "SQL", "proficiency": "Advanced", "years": 3}},
-{{"id": 3, "person_id": 2, "skill_name": "Java", "proficiency": "Expert", "years": 7}},
-...
-],
-"technologies": [
-{{"id": 1, "person_id": 1, "technology": "React", "category": "Frontend"}},
-{{"id": 2, "person_id": 1, "technology": "PostgreSQL", "category": "Database"}},
-{{"id": 3, "person_id": 2, "technology": "Spring Boot", "category": "Backend"}},
-...
-],
-"projects": [
-{{"id": 1, "person_id": 1, "project_name": "E-commerce Platform", "role": "Lead Developer"}},
-{{"id": 2, "person_id": 2, "project_name": "Analytics Dashboard", "role": "Backend Engineer"}},
-...
-]
+"people": [{{"id": 1, "name": "John", "email": "john@co.com", "city": "Dallas"}}, ...],
+"skills": [{{"id": 1, "person_id": 1, "skill_name": "Python"}}, ...]
 }}

-
-- Extract EVERY structured piece of data you find
-- Assign UNIQUE sequential IDs (1, 2, 3, ...) for each table
-- Foreign keys MUST reference valid primary keys
-- Create as many tables as needed (don't limit yourself)
-- Return ONLY valid JSON, no explanations
-- Be COMPREHENSIVE - extract skills, technologies, projects, certifications, education, work history, etc."""
+Return ONLY valid JSON."""

-
+resp = self.client.chat.completions.create(
 model="gpt-4o-mini",
 messages=[
-{"role": "system", "content": "
-{"role": "user", "content":
+{"role": "system", "content": "Extract ALL entities with unique IDs. Return only JSON."},
+{"role": "user", "content": prompt}
 ],
 temperature=0,
-max_tokens=
+max_tokens=8000
 )

-json_text =
-
-
-extracted_data = json.loads(json_text)
-
-created_tables = []
-
-for entity_type, records in extracted_data.items():
-if records and isinstance(records, list) and len(records) > 0:
-table_name = f"{base_name}_{entity_type}"
-
-try:
-df = pd.DataFrame(records)
-if not df.empty:
-self._store_dataframe(df, table_name, silent=True)
-created_tables.append(table_name)
-print(f" {entity_type}: {len(df)} records")
-except Exception as e:
-print(f" Failed {entity_type}: {e}")
-
-return created_tables
-
+json_text = resp.choices[0].message.content.strip().replace("```json", "").replace("```", "").strip()
+return json.loads(json_text)
 except Exception as e:
-
-
+return {}
+
+def _store_dataframe_safe(self, df: pd.DataFrame, name: str):
+"""Store with error handling."""
+try:
+df.columns = [str(c).strip().replace(" ", "_").replace("-", "_") for c in df.columns]
+df.to_sql(name, self.conn, if_exists='replace', index=False, method='multi', chunksize=500)
+self.conn.commit()
+self.current_table = name
+self._refresh_schema()
+except:
+df.to_sql(name, self.conn, if_exists='replace', index=False)
+self.conn.commit()
+self.current_table = name
+self._refresh_schema()

 def _parse_text_simple(self, text: str) -> pd.DataFrame:
-"""
+"""Simple parsing."""
 lines = [line.strip() for line in text.split('\n') if line.strip()]
-
 if not lines:
 return pd.DataFrame({'content': ['No content']})

-
-for delimiter in ['\t', ',', '|', ';']:
-if all(delimiter in line for line in sample):
-try:
-df = pd.read_csv(StringIO('\n'.join(lines)), sep=delimiter)
-if len(df.columns) > 1:
-return df
-except:
-continue
-
-return pd.DataFrame({
-'line_number': range(1, len(lines) + 1),
-'content': lines
-})
+return pd.DataFrame({'line_number': range(1, len(lines) + 1), 'content': lines})

-def _store_dataframe(self, df: pd.DataFrame, name: str
-"""Store
-
-
-self.current_table = name
-self._refresh_schema()
-
-if not silent:
-print(f"Uploaded: {name}")
-print(f" {len(df)} rows, {len(df.columns)} columns")
+def _store_dataframe(self, df: pd.DataFrame, name: str):
+"""Store."""
+self._store_dataframe_safe(df, name)
+print(f"Uploaded: {name} ({len(df)} rows)")

 def ask(self, question: str, viz: Union[bool, str] = False, table: Optional[str] = None) -> 'QueryResult':
-"""
+"""Natural language query."""
 if not self.client:
-print("No API key")
 return QueryResult(False, "", pd.DataFrame(), None, "No API key")

 print(f"\nQuestion: {question}")

-if self.check_relevance:
-
-
-
-
-return QueryResult(False, "", pd.DataFrame(), None, "Irrelevant")
+if self.check_relevance and not self._is_relevant_query(question):
+print("Warning: Irrelevant query")
+choice = input("Continue? (yes/no): ").strip().lower()
+if choice not in ['yes', 'y']:
+return QueryResult(False, "", pd.DataFrame(), None, "Irrelevant")

-tbl = table or self.current_table
+tbl = table or self.current_table or (self._get_table_names()[0] if self._get_table_names() else None)
 if not tbl:
-
-if all_tables:
-tbl = all_tables[0]
-else:
-print("No tables found")
-return QueryResult(False, "", pd.DataFrame(), None, "No table")
+return QueryResult(False, "", pd.DataFrame(), None, "No table")

 if self.use_embeddings and self.embedding_model:
-
-if
-print("
-return
+cached = self._check_embedding_cache(question, tbl)
+if cached:
+print(" Cached")
+return cached

 if self.fuzzy_match:
 question = self._apply_fuzzy_matching(question, tbl)
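
The reworked upload() above accepts an auto_export_mysql mapping and, for PDFs, extracts text in 10,000-character chunks before building entity tables. A usage sketch based on the docstring example (credentials and file name are placeholders):

    from sutra import SUTRA

    sutra = SUTRA(api_key="sk-...")
    sutra.upload(
        "data.pdf",
        auto_export_mysql={
            "host": "localhost",
            "user": "root",
            "password": "change-me",
            "database": "my_database",
        },
    )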
@@ -567,7 +455,7 @@ IMPORTANT:
 fig = None
 if viz:
 viz_type = viz if isinstance(viz, str) else "auto"
-fig = self._visualize(df, question, viz_type
+fig = self._visualize(df, question, viz_type)

 result = QueryResult(True, sql_query, df, fig)

@@ -584,199 +472,155 @@ IMPORTANT:
 if not self.client:
 return True

-tables = self._get_table_names()
-columns = []
-for tbl in tables[:3]:
-cols = list(self.schema_info.get(tbl, {}).keys())
-columns.extend(cols[:5])
-
-db_context = f"Tables: {', '.join(tables[:5])}. Columns: {', '.join(columns[:15])}"
-
 try:
-
+tables = self._get_table_names()[:3]
+cols = []
+for tbl in tables:
+cols.extend(list(self.schema_info.get(tbl, {}).keys())[:5])
+
+resp = self.client.chat.completions.create(
 model="gpt-4o-mini",
 messages=[
-{"role": "system", "content": "
-{"role": "user", "content": f"
+{"role": "system", "content": "Return 'yes' or 'no'."},
+{"role": "user", "content": f"Relevant to DB with tables {', '.join(tables)}?\n\nQ: {question}\n\nyes/no:"}
 ],
 temperature=0,
 max_tokens=5
 )
-
-return 'yes' in response.choices[0].message.content.strip().lower()
+return 'yes' in resp.choices[0].message.content.lower()
 except:
 return True

 def _apply_fuzzy_matching(self, question: str, table: str) -> str:
-"""Fuzzy
+"""Fuzzy matching."""
 if not self.schema_info.get(table):
 return question

 try:
-string_cols = [col for col, dtype in self.schema_info[table].items()
-if 'TEXT' in dtype or 'VARCHAR' in dtype]
-
+string_cols = [col for col, dtype in self.schema_info[table].items() if 'TEXT' in dtype]
 if not string_cols:
 return question

 for col in string_cols[:2]:
 df = pd.read_sql_query(f"SELECT DISTINCT {col} FROM {table} LIMIT 100", self.conn)
-
+values = [str(v) for v in df[col].dropna().tolist()]

 words = question.split()
 for i, word in enumerate(words):
-matches = get_close_matches(word,
+matches = get_close_matches(word, values, n=1, cutoff=0.6)
 if matches and word != matches[0]:
 words[i] = matches[0]
 print(f" Fuzzy: '{word}' -> '{matches[0]}'")
-
 question = " ".join(words)
-
 return question
 except:
 return question

 def _check_embedding_cache(self, question: str, table: str) -> Optional['QueryResult']:
-"""Check
+"""Check cache."""
 if not self.query_embeddings:
 return None

-
-
+q_emb = self.embedding_model.encode([question])[0]
 best_match = None
-
+best_sim = 0.85

-for cached_q,
-if
+for cached_q, data in self.query_embeddings.items():
+if data['table'] != table:
 continue

-
-
-
-
-if similarity > best_similarity:
-best_similarity = similarity
+sim = np.dot(q_emb, data['embedding']) / (np.linalg.norm(q_emb) * np.linalg.norm(data['embedding']))
+if sim > best_sim:
+best_sim = sim
 best_match = cached_q

 if best_match:
-print(f" Similar
+print(f" Similar ({best_sim:.0%})")
 return self.query_embeddings[best_match]['result']

 return None

 def _store_in_embedding_cache(self, question: str, table: str, result: 'QueryResult'):
-"""Store
-
-self.query_embeddings[question] = {
-'table': table,
-'embedding': q_embedding,
-'result': result
-}
+"""Store cache."""
+q_emb = self.embedding_model.encode([question])[0]
+self.query_embeddings[question] = {'table': table, 'embedding': q_emb, 'result': result}

 def _visualize(self, df: pd.DataFrame, title: str, viz_type: str = "auto"):
-"""
+"""Visualize."""
 if not HAS_PLOTLY and not HAS_MATPLOTLIB:
-print("Install plotly or matplotlib")
 return None

 print(f"Creating {viz_type} chart...")
-
-if HAS_PLOTLY:
-return self._plotly_viz(df, title, viz_type)
-else:
-return self._matplotlib_viz(df, title, viz_type)
+return self._plotly_viz(df, title, viz_type) if HAS_PLOTLY else self._matplotlib_viz(df, title, viz_type)

 def _plotly_viz(self, df: pd.DataFrame, title: str, viz_type: str):
-"""Plotly
+"""Plotly."""
 try:
-
-
+num = df.select_dtypes(include=[np.number]).columns.tolist()
+cat = df.select_dtypes(include=['object']).columns.tolist()

-if viz_type == "table"
-fig = go.Figure(data=[go.Table(
-
-
-
-
-
-
-
-
-
-
-fig =
-elif viz_type == "heatmap" and len(numeric) >= 2:
-corr = df[numeric].corr()
-fig = go.Figure(data=go.Heatmap(
-z=corr.values, x=corr.columns, y=corr.columns, colorscale='Viridis'
-))
+if viz_type == "table":
+fig = go.Figure(data=[go.Table(header=dict(values=list(df.columns)), cells=dict(values=[df[c] for c in df.columns]))])
+elif viz_type == "pie" and cat and num:
+fig = px.pie(df, names=cat[0], values=num[0], title=title)
+elif viz_type == "bar" and cat and num:
+fig = px.bar(df, x=cat[0], y=num[0], title=title)
+elif viz_type == "line" and num:
+fig = px.line(df, y=num[0], title=title)
+elif viz_type == "scatter" and len(num) >= 2:
+fig = px.scatter(df, x=num[0], y=num[1], title=title)
+elif viz_type == "heatmap" and len(num) >= 2:
+corr = df[num].corr()
+fig = go.Figure(data=go.Heatmap(z=corr.values, x=corr.columns, y=corr.columns))
 fig.update_layout(title=title)
-
-if
-fig = px.pie(df, names=
-elif len(numeric) >= 2:
-fig = px.line(df, y=numeric[0], title=title)
+else:
+if cat and num:
+fig = px.pie(df, names=cat[0], values=num[0], title=title) if len(df) <= 10 else px.bar(df, x=cat[0], y=num[0], title=title)
 else:
 fig = px.bar(df, y=df.columns[0], title=title)
-else:
-fig = px.bar(df, x=categorical[0] if categorical else df.index, y=numeric[0] if numeric else df.columns[0], title=title)

 fig.show()
-print("Chart displayed")
 return fig
-except
-print(f"Viz error: {e}")
+except:
 return None

 def _matplotlib_viz(self, df: pd.DataFrame, title: str, viz_type: str):
-"""Matplotlib
+"""Matplotlib."""
 try:
 plt.figure(figsize=(10, 6))
-
+num = df.select_dtypes(include=[np.number]).columns

-if viz_type == "pie"
+if viz_type == "pie":
 df[df.columns[0]].value_counts().plot(kind='pie')
-elif viz_type == "line" and len(
-df[
+elif viz_type == "line" and len(num) > 0:
+df[num[0]].plot(kind='line')
 else:
-if len(
-df[numeric[0]].plot(kind='bar')
-else:
-df.iloc[:, 0].value_counts().plot(kind='bar')
+(df[num[0]] if len(num) > 0 else df.iloc[:, 0].value_counts()).plot(kind='bar')

 plt.title(title)
 plt.tight_layout()
 plt.show()
-print("Chart displayed")
 return plt.gcf()
-except
-print(f"Viz error: {e}")
+except:
 return None

 def tables(self) -> Dict[str, dict]:
-"""List
+"""List tables."""
 print("\n" + "="*70)
-print("TABLES
+print("TABLES")
 print("="*70)

 all_tables = self._get_table_names()
-
 if not all_tables:
-print("No tables
+print("No tables")
 return {}

 result = {}
 for i, tbl in enumerate(all_tables, 1):
-
-cols = self.schema_info.get(tbl, {})
-
-
-marker = ">" if tbl == self.current_table else " "
-print(f"{marker} {i}. {tbl}")
-print(f" {count} rows, {len(col_list)} columns")
-print(f" Columns: {', '.join(col_list[:8])}")
-
-result[tbl] = {'rows': count, 'columns': col_list}
+cnt = pd.read_sql_query(f"SELECT COUNT(*) FROM {tbl}", self.conn).iloc[0, 0]
+cols = list(self.schema_info.get(tbl, {}).keys())
+print(f" {i}. {tbl}: {cnt} rows, {len(cols)} columns")
+result[tbl] = {'rows': cnt, 'columns': cols}

 print("="*70)
 return result
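
The embedding cache above reuses a stored result when the cosine similarity between the new question and a cached one exceeds 0.85. A standalone sketch of that check (the vectors are made-up examples, not real model output):

    import numpy as np

    def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
        # Same formula as the cache check: dot product over the product of norms.
        return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

    cached_vec = np.array([0.10, 0.30, 0.50])
    new_vec = np.array([0.12, 0.29, 0.52])
    if cosine_similarity(cached_vec, new_vec) > 0.85:
        print("reuse cached QueryResult")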
@@ -787,77 +631,55 @@ IMPORTANT:
 self._refresh_schema()

 print("\n" + "="*70)
-print("
+print("SCHEMA")
 print("="*70)

-tables_to_show = [table] if table else self.schema_info.keys()
-
 result = {}
-for tbl in
+for tbl in ([table] if table else self.schema_info.keys()):
 if tbl in self.schema_info:
-
-print(f"\
-print(f"Records: {count}")
-print("Columns:")
-
+cnt = pd.read_sql_query(f"SELECT COUNT(*) FROM {tbl}", self.conn).iloc[0, 0]
+print(f"\n{tbl}: {cnt} records")
 for col, dtype in self.schema_info[tbl].items():
-print(f" - {col:<30}
-
-result[tbl] = {
-'records': count,
-'columns': self.schema_info[tbl]
-}
+print(f" - {col:<30} {dtype}")
+result[tbl] = {'records': cnt, 'columns': self.schema_info[tbl]}

 print("="*70)
 return result

 def peek(self, table: Optional[str] = None, n: int = 5) -> pd.DataFrame:
-"""Preview
+"""Preview."""
 tbl = table or self.current_table
 if not tbl:
-print("No table specified")
 return pd.DataFrame()

 df = pd.read_sql_query(f"SELECT * FROM {tbl} LIMIT {n}", self.conn)
-print(f"\nSample from '{tbl}'
+print(f"\nSample from '{tbl}':")
 print(df.to_string(index=False))
 return df

 def info(self):
-"""
+"""Overview."""
 return self.tables()

 def sql(self, query: str, viz: Union[bool, str] = False) -> 'QueryResult':
 """Execute SQL."""
-print("\nExecuting SQL...")
-
 try:
 df = pd.read_sql_query(query, self.conn)
 print(f"Success! {len(df)} rows")
-
-fig = None
-if viz:
-viz_type = viz if isinstance(viz, str) else "auto"
-fig = self._visualize(df, "SQL Result", viz_type=viz_type)
-
+fig = self._visualize(df, "Result", viz if isinstance(viz, str) else "auto") if viz else None
 return QueryResult(True, query, df, fig)
 except Exception as e:
 print(f"Error: {e}")
 return QueryResult(False, query, pd.DataFrame(), None, str(e))

 def interactive(self, question: str) -> 'QueryResult':
-"""Interactive
-print(f"\nQuestion: {question}")
+"""Interactive."""
 choice = input("Visualize? (yes/no/pie/bar/line/scatter): ").strip().lower()
-
 viz = choice if choice in ['pie', 'bar', 'line', 'scatter', 'table', 'heatmap'] else (True if choice in ['yes', 'y'] else False)
-
 return self.ask(question, viz=viz)

 def export_db(self, path: str, format: str = "sqlite"):
 """Export database."""
-print(f"\nExporting to {format}...")
-
 if format == "sqlite":
 shutil.copy2(self.db_path, path)
 elif format == "sql":
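
The trimmed sql() and export_db() paths above keep the same call shapes. A usage sketch continuing from an existing SUTRA instance (the table and file names are hypothetical):

    result = sutra.sql("SELECT city, COUNT(*) AS n FROM people GROUP BY city", viz="bar")
    result.show()
    sutra.export_db("backup.json", format="json")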
@@ -865,93 +687,90 @@ IMPORTANT:
 for line in self.conn.iterdump():
 f.write(f'{line}\n')
 elif format == "json":
-data = {}
-for table in self._get_table_names():
-df = pd.read_sql_query(f"SELECT * FROM {table}", self.conn)
-data[table] = df.to_dict(orient='records')
+data = {t: pd.read_sql_query(f"SELECT * FROM {t}", self.conn).to_dict('records') for t in self._get_table_names()}
 with open(path, 'w', encoding='utf-8') as f:
 json.dump(data, f, indent=2, default=str)
 elif format == "excel":
 with pd.ExcelWriter(path, engine='openpyxl') as writer:
-for
-
-df.to_excel(writer, sheet_name=table[:31], index=False)
+for t in self._get_table_names():
+pd.read_sql_query(f"SELECT * FROM {t}", self.conn).to_excel(writer, sheet_name=t[:31], index=False)
 else:
 raise ValueError(f"Unsupported: {format}")

-print(f"Saved
+print(f"Saved: {path}")
 return self

-def save_to_mysql(self, host: str, user: str, password: str, database: str,
-port: int = 3306, tables: Optional[List[str]] = None
-
+def save_to_mysql(self, host: str, user: str, password: str, database: str,
+port: int = 3306, tables: Optional[List[str]] = None,
+auto_create: bool = True):
+"""
+Export to MySQL - AUTO-CREATES database if not exists.
+
+Args:
+host: MySQL host
+user: MySQL user
+password: MySQL password
+database: Database name (auto-created if not exists)
+port: MySQL port
+tables: Specific tables to export (None = all)
+auto_create: Auto-create database if not exists
+"""
 try:
 from sqlalchemy import create_engine
+import mysql.connector
 except ImportError:
 raise ImportError("Run: pip install QuerySUTRA[mysql]")

-print(f"
+print(f"Exporting to MySQL: {host}/{database}")

-
-
-
+# Auto-create database if requested
+if auto_create:
+try:
+temp_conn = mysql.connector.connect(host=host, user=user, password=password, port=port)
+temp_cursor = temp_conn.cursor()
+temp_cursor.execute(f"CREATE DATABASE IF NOT EXISTS `{database}`")
+temp_cursor.close()
+temp_conn.close()
+print(f" Database '{database}' ready")
+except Exception as e:
+print(f" Warning: Could not auto-create database: {e}")

-
+engine = create_engine(f"mysql+mysqlconnector://{user}:{password}@{host}:{port}/{database}")

-for
-df = pd.read_sql_query(f"SELECT * FROM {
-df.to_sql(
-print(f" {
+for t in (tables or self._get_table_names()):
+df = pd.read_sql_query(f"SELECT * FROM {t}", self.conn)
+df.to_sql(t, engine, if_exists='replace', index=False)
+print(f" {t}: {len(df)} rows")

 print("Complete!")
 return self

-def save_to_postgres(self, host: str, user: str, password: str, database: str,
+def save_to_postgres(self, host: str, user: str, password: str, database: str,
 port: int = 5432, tables: Optional[List[str]] = None):
 """Export to PostgreSQL."""
 try:
 from sqlalchemy import create_engine
+engine = create_engine(f"postgresql://{user}:{password}@{host}:{port}/{database}")
+
+print(f"Exporting to PostgreSQL...")
+for t in (tables or self._get_table_names()):
+df = pd.read_sql_query(f"SELECT * FROM {t}", self.conn)
+df.to_sql(t, engine, if_exists='replace', index=False)
+print(f" {t}: {len(df)} rows")
+print("Complete!")
+return self
 except ImportError:
 raise ImportError("Run: pip install QuerySUTRA[postgres]")
-
-print(f"\nConnecting to PostgreSQL: {host}:{port}...")
-
-engine = create_engine(f"postgresql://{user}:{password}@{host}:{port}/{database}")
-
-tables_to_export = tables or self._get_table_names()
-
-print(f"Exporting {len(tables_to_export)} tables...")
-
-for table in tables_to_export:
-df = pd.read_sql_query(f"SELECT * FROM {table}", self.conn)
-df.to_sql(table, engine, if_exists='replace', index=False)
-print(f" {table}: {len(df)} rows")
-
-print("Complete!")
-return self

-def backup(self,
-"""
-if
-
-
-else:
-backup_dir = Path(".")
-
-timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
-
-print("\nCreating backup...")
-
-db_backup = backup_dir / f"sutra_{timestamp}.db"
-self.export_db(str(db_backup), format="sqlite")
-
-json_backup = backup_dir / f"sutra_{timestamp}.json"
-self.export_db(str(json_backup), format="json")
-
-print(f"\nBackup complete!")
-print(f" Database: {db_backup}")
-print(f" Data: {json_backup}")
+def backup(self, path: str = None):
+"""Backup."""
+dir = Path(path) if path else Path(".")
+dir.mkdir(parents=True, exist_ok=True)
+ts = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

+self.export_db(str(dir / f"sutra_{ts}.db"), "sqlite")
+self.export_db(str(dir / f"sutra_{ts}.json"), "json")
+print("Backup complete!")
 return self

 def export(self, data: pd.DataFrame, path: str, format: str = "csv"):
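
save_to_mysql() now creates the target database first when auto_create is left at its default. A minimal sketch (credentials are placeholders):

    sutra.save_to_mysql(
        host="localhost",
        user="root",
        password="change-me",
        database="my_database",  # issued as CREATE DATABASE IF NOT EXISTS when auto_create=True
    )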
@@ -962,17 +781,13 @@ IMPORTANT:
 data.to_excel(path, index=False)
 elif format == "json":
 data.to_json(path, orient="records", indent=2)
-
-raise ValueError(f"Unknown: {format}")
-
-print(f"Exported to {path}")
+print(f"Exported: {path}")
 return self

 def close(self):
-"""Close
+"""Close."""
 if self.conn:
 self.conn.close()
-print("Closed")

 def _get_table_names(self) -> List[str]:
 """Get tables."""
@@ -980,46 +795,28 @@ IMPORTANT:
 return [r[0] for r in self.cursor.fetchall()]

 def _refresh_schema(self):
-"""Refresh
-tables = self._get_table_names()
-
+"""Refresh."""
 self.schema_info = {}
-for tbl in
+for tbl in self._get_table_names():
 self.cursor.execute(f"PRAGMA table_info({tbl})")
 self.schema_info[tbl] = {r[1]: r[2] for r in self.cursor.fetchall()}

 def _generate_sql(self, question: str, table: str) -> str:
 """Generate SQL."""
 schema = self.schema_info.get(table, {})
-
-sample = sample_df.to_string(index=False)
-
+sample = pd.read_sql_query(f"SELECT * FROM {table} LIMIT 3", self.conn).to_string(index=False)
 schema_str = ", ".join([f"{col} ({dtype})" for col, dtype in schema.items()])

-
-
-Database: SQLite
-Table: {table}
-Columns: {schema_str}
-
-Sample:
-{sample}
-
-Question: {question}
-
-Return ONLY SQL."""
-
-response = self.client.chat.completions.create(
+resp = self.client.chat.completions.create(
 model="gpt-4o-mini",
 messages=[
-{"role": "system", "content": "SQL expert. Return only SQL
-{"role": "user", "content":
+{"role": "system", "content": "SQL expert. Return only SQL."},
+{"role": "user", "content": f"Table: {table}\nColumns: {schema_str}\nSample:\n{sample}\n\nQ: {question}\n\nSQL:"}
 ],
 temperature=0
 )

-
-return sql.replace("```sql", "").replace("```", "").strip()
+return resp.choices[0].message.content.strip().replace("```sql", "").replace("```", "").strip()

 def __enter__(self):
 return self
@@ -1028,53 +825,28 @@ Return ONLY SQL."""
 self.close()

 def __repr__(self):
-
-if self.cache_queries:
-features.append("cache")
-if self.use_embeddings:
-features.append("embeddings")
-if self.check_relevance:
-features.append("relevance")
-if self.fuzzy_match:
-features.append("fuzzy")
-
-feat_str = f", {', '.join(features)}" if features else ""
-return f"SUTRA(tables={len(self.schema_info)}{feat_str})"
+return f"SUTRA(tables={len(self.schema_info)})"


 class QueryResult:
-"""
-
+"""Result."""
 def __init__(self, success: bool, sql: str, data: pd.DataFrame, viz, error: str = None):
-self.success = success
-self.sql = sql
-self.data = data
-self.viz = viz
-self.error = error
+self.success, self.sql, self.data, self.viz, self.error = success, sql, data, viz, error

 def __repr__(self):
-return f"QueryResult(rows={len(self.data)}
+return f"QueryResult(rows={len(self.data)})" if self.success else f"QueryResult(error='{self.error}')"

 def show(self):
-print(self.data
+print(self.data if self.success else f"Error: {self.error}")
 return self


 def quick_start(api_key: str, data_path: str, question: str, viz: Union[bool, str] = False):
-"""
+"""Quick start."""
 with SUTRA(api_key=api_key) as sutra:
 sutra.upload(data_path)
 return sutra.ask(question, viz=viz)


 if __name__ == "__main__":
-print(""
-QuerySUTRA v0.3.3 - Professional Data Analysis
-SUTRA: Structured-Unstructured-Text-Retrieval-Architecture
-
-Fixed: Proper primary and foreign keys with unique IDs
-Features: Load existing DB, custom viz, fuzzy matching, embeddings
-
-Installation: pip install QuerySUTRA
-Usage: from sutra import SUTRA
-""")
+print("QuerySUTRA v0.4.0 - Simple & Automatic")