querysutra-0.5.0-py3-none-any.whl → querysutra-0.5.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {querysutra-0.5.0.dist-info → querysutra-0.5.2.dist-info}/METADATA +1 -1
- {querysutra-0.5.0.dist-info → querysutra-0.5.2.dist-info}/RECORD +7 -7
- sutra/__init__.py +4 -4
- sutra/sutra.py +333 -592
- {querysutra-0.5.0.dist-info → querysutra-0.5.2.dist-info}/WHEEL +0 -0
- {querysutra-0.5.0.dist-info → querysutra-0.5.2.dist-info}/licenses/LICENSE +0 -0
- {querysutra-0.5.0.dist-info → querysutra-0.5.2.dist-info}/top_level.txt +0 -0
{querysutra-0.5.0.dist-info → querysutra-0.5.2.dist-info}/RECORD
CHANGED

@@ -1,5 +1,5 @@
- querysutra-0.5.…
- sutra/__init__.py,sha256=…
+ querysutra-0.5.2.dist-info/licenses/LICENSE,sha256=F-4b93u0OVrVwGXgMwBRq6MlGyUT9zmre1oh5Gft5Ts,1066
+ sutra/__init__.py,sha256=25HUMETpmA1tlMl5j-ajdo9MRXljSZBrirSTH7w7jIc,118
  sutra/cache_manager.py,sha256=e0AAeUqoR-aiqzZ3fB-IDvpJ4JA6-YBFyRJxusEnIrA,3082
  sutra/clear_cache.py,sha256=rVIz29p7V11Uh6oHXeaWpFtYXXv-2OED91cHMAWWxtQ,187
  sutra/core.py,sha256=R_JbOlZTukegP92Dr-WLsdr632_otFN7o9qSvcxyBtw,10497
@@ -11,7 +11,7 @@ sutra/feedback_matcher.py,sha256=WXYpGtFJnOyYQOzy-z8uBiUWH5vyJJOMS1NwEYzNfic,286
  sutra/nlp_processor.py,sha256=wMS1hz1aGWjSwPUD7lSNBbQapFtLgF2l65j0QKXQOd0,5461
  sutra/schema_embeddings.py,sha256=bVPzpJOdYTyUdG2k3ZdgYJLrX2opHBx68RIjJcMlueo,9732
  sutra/schema_generator.py,sha256=BX_vXmnvSGc6nCBx40WLSoNL3WIYPDahd1cEYloyY4M,1925
- sutra/sutra.py,sha256=…
+ sutra/sutra.py,sha256=XgNCY8QPOod0-ymt6R50JMaHJetyfTsElzyvNHpYStw,20664
  sutra/sutra_client.py,sha256=PYYDGqVbA9pB-Zcsm52i9KarwijCIGVZOThgONZP6Vs,14203
  sutra/sutra_core.py,sha256=diaWOXUHn1wrqCQrBhLKL612tMQioaqx-ILc3y9-CqM,11708
  sutra/sutra_simple.py,sha256=rnqzG7OAt4p64XtO0peMqHS1pG5tdA8U3EYTMVsq7BE,23201
@@ -22,7 +22,7 @@ tests/test_sutra.py,sha256=6Z4SoIuBzza101304I7plkyPVkUBbjIxR8uPs9z5ntg,2383
  utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  utils/file_utils.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  utils/text_utils.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- querysutra-0.5.…
- querysutra-0.5.…
- querysutra-0.5.…
- querysutra-0.5.…
+ querysutra-0.5.2.dist-info/METADATA,sha256=8brpcR8UxQwuz28hi8oUL8F5Dfug5AcFk_SdReJlWd0,7258
+ querysutra-0.5.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ querysutra-0.5.2.dist-info/top_level.txt,sha256=9v0buw21eo5LaUU_3Cf9b9MqRyEvtM9cHaOuEXUKVqM,18
+ querysutra-0.5.2.dist-info/RECORD,,
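Each RECORD row has the form `path,sha256=<urlsafe-base64 digest without padding>,<size in bytes>`; the RECORD file lists itself with empty digest and size (the `,,` row above). A minimal sketch for checking a downloaded wheel against its own RECORD; the wheel filename is illustrative:

```python
# Sketch: verify wheel contents against RECORD (PEP 376/427 row format:
# "path,sha256=<urlsafe-b64 digest, no padding>,<size>").
import base64, csv, hashlib, zipfile

def verify_record(wheel_path: str) -> bool:
    ok = True
    with zipfile.ZipFile(wheel_path) as whl:
        record_name = next(n for n in whl.namelist() if n.endswith(".dist-info/RECORD"))
        for path, digest, size in csv.reader(whl.read(record_name).decode().splitlines()):
            if not digest:  # RECORD lists itself as "path,,"
                continue
            algo, _, expected = digest.partition("=")
            data = whl.read(path)
            actual = base64.urlsafe_b64encode(hashlib.new(algo, data).digest()).rstrip(b"=").decode()
            if actual != expected or int(size) != len(data):
                print(f"MISMATCH: {path}")
                ok = False
    return ok

verify_record("querysutra-0.5.2-py3-none-any.whl")  # assumed local filename
```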
sutra/__init__.py
CHANGED

@@ -1,4 +1,4 @@
- """QuerySUTRA v0.5.…
- __version__="0.5.…
- from sutra.sutra import SUTRA,QueryResult
- __all__=["SUTRA","QueryResult"…
+ """QuerySUTRA v0.5.2"""
+ __version__="0.5.2"
+ from sutra.sutra import SUTRA,QueryResult
+ __all__=["SUTRA","QueryResult"]
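The package exports only SUTRA and QueryResult. Note that `__init__.py` now reports 0.5.2 while `sutra/sutra.py` below still embeds 0.5.1, so the two version strings disagree inside the same wheel. A quick post-upgrade smoke test, assuming `pip install querysutra==0.5.2`:

```python
# Sketch: confirm the public surface after upgrading.
import sutra

print(sutra.__version__)  # "0.5.2"
print(sutra.__all__)      # ["SUTRA", "QueryResult"]
from sutra import SUTRA, QueryResult
```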
sutra/sutra.py
CHANGED

@@ -1,94 +1,62 @@
- """
- 
- Debug mode to see why extraction fails
- """
- 
- __version__ = "0.4.5"
+ """QuerySUTRA v0.5.1 - BULLETPROOF & FIXED"""
+ __version__ = "0.5.1"
  __author__ = "Aditya Batta"
- __all__ = ["SUTRA", "QueryResult"…
+ __all__ = ["SUTRA", "QueryResult"]

- import os
- import sqlite3
- import pandas as pd
- import numpy as np
+ import os, sqlite3, pandas as pd, numpy as np, json, hashlib, shutil, datetime, re
  from typing import Optional, Union, Dict, List
  from pathlib import Path
- import json
- import hashlib
- import warnings
- import shutil
- import datetime
- from io import StringIO
  from difflib import get_close_matches
- warnings.filterwarnings('ignore')

  try:
      from openai import OpenAI
      HAS_OPENAI = True
- except…
+ except:
      HAS_OPENAI = False

  try:
      import plotly.express as px
      import plotly.graph_objects as go
      HAS_PLOTLY = True
- except…
+ except:
      HAS_PLOTLY = False

- try:
-     import matplotlib.pyplot as plt
-     HAS_MATPLOTLIB = True
- except ImportError:
-     HAS_MATPLOTLIB = False
- 
  try:
      import PyPDF2
      HAS_PYPDF2 = True
- except…
+ except:
      HAS_PYPDF2 = False

  try:
      import docx
      HAS_DOCX = True
- except…
+ except:
      HAS_DOCX = False

  try:
      from sentence_transformers import SentenceTransformer
      HAS_EMBEDDINGS = True
- except…
+ except:
      HAS_EMBEDDINGS = False


  class SUTRA:
-     """SUTRA…
+     """SUTRA - BULLETPROOF"""

      def __init__(self, api_key: Optional[str] = None, db: str = "sutra.db",
-                  use_embeddings: bool = False,…
-                  …
-         """Initialize."""
-         print("Initializing QuerySUTRA v0.4.5")
+                  use_embeddings: bool = False, fuzzy_match: bool = True,
+                  cache_queries: bool = True, check_relevance: bool = False):

          if api_key:
              os.environ["OPENAI_API_KEY"] = api_key

          self.api_key = os.getenv("OPENAI_API_KEY")
          self.client = OpenAI(api_key=self.api_key) if self.api_key and HAS_OPENAI else None
- 
          self.db_path = db
-         self.…
- 
-         try:
-             self.conn = sqlite3.connect(db, timeout=30, check_same_thread=False)
-             self.conn.execute("PRAGMA journal_mode=WAL")
-             self.conn.execute("PRAGMA synchronous=NORMAL")
-         except:
-             self.conn = sqlite3.connect(db, check_same_thread=False)
- 
+         self.conn = sqlite3.connect(db, timeout=30, check_same_thread=False)
          self.cursor = self.conn.cursor()
          self.current_table = None
          self.schema_info = {}
- 
          self.cache_queries = cache_queries
          self.cache = {} if cache_queries else None
          self.use_embeddings = use_embeddings
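The 0.5.x constructor grows three keyword flags (fuzzy_match, cache_queries, check_relevance), drops the WAL-pragma fallback, and no longer prints an init banner until the end. A minimal sketch of the new call, with a placeholder key:

```python
# Sketch of the constructor surface shown in the hunk above; the key and
# database filename are placeholders.
from sutra.sutra import SUTRA

s = SUTRA(
    api_key="sk-...",      # exported to OPENAI_API_KEY when given
    db="demo.db",          # SQLite file, opened with timeout=30
    use_embeddings=False,  # semantic query cache (needs sentence-transformers)
    fuzzy_match=True,      # snap question words to close column values
    cache_queries=True,    # memoize generated SQL per (question, table)
)
```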
@@ -101,667 +69,459 @@ class SUTRA:
          try:
              self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
          except:
-             …
+             pass

          self._refresh_schema()
-         print(f"…
+         print(f"QuerySUTRA v0.5.1 Ready")

- …
- …
-         """Load existing database."""
-         if not Path(db_path).exists():
-             raise FileNotFoundError(f"Not found: {db_path}")
-         return cls(api_key=api_key, db=db_path, **kwargs)
- 
-     @classmethod
-     def connect_mysql(cls, host: str, user: str, password: str, database: str,
-                       port: int = 3306, api_key: Optional[str] = None, **kwargs):
-         """Connect to MySQL."""
-         try:
-             from sqlalchemy import create_engine
-             import mysql.connector
-         except ImportError:
-             raise ImportError("Run: pip install QuerySUTRA[mysql]")
- 
-         print(f"Connecting to MySQL...")
- 
-         try:
-             temp_conn = mysql.connector.connect(host=host, user=user, password=password, port=port)
-             temp_cursor = temp_conn.cursor()
-             temp_cursor.execute(f"CREATE DATABASE IF NOT EXISTS {database}")
-             temp_cursor.close()
-             temp_conn.close()
-         except:
-             pass
- 
-         engine = create_engine(f"mysql+mysqlconnector://{user}:{password}@{host}:{port}/{database}")
-         temp_db = f"sutra_mysql_{database}.db"
-         instance = cls(api_key=api_key, db=temp_db, **kwargs)
- 
-         tables = pd.read_sql_query("SHOW TABLES", engine).iloc[:, 0].tolist()
- 
-         for table in tables:
-             df = pd.read_sql_query(f"SELECT * FROM {table}", engine)
-             df.to_sql(table, instance.conn, if_exists='replace', index=False)
- 
-         instance._refresh_schema()
-         print(f"Connected! {len(tables)} tables")
-         return instance
- 
-     @classmethod
-     def connect_postgres(cls, host: str, user: str, password: str, database: str,
-                          port: int = 5432, api_key: Optional[str] = None, **kwargs):
-         """Connect to PostgreSQL."""
-         try:
-             from sqlalchemy import create_engine
-         except ImportError:
-             raise ImportError("Run: pip install QuerySUTRA[postgres]")
- 
-         print(f"Connecting to PostgreSQL...")
- 
-         engine = create_engine(f"postgresql://{user}:{password}@{host}:{port}/{database}")
-         temp_db = f"sutra_postgres_{database}.db"
-         instance = cls(api_key=api_key, db=temp_db, **kwargs)
- 
-         tables = pd.read_sql_query("SELECT tablename FROM pg_tables WHERE schemaname='public'", engine)['tablename'].tolist()
- 
-         for table in tables:
-             df = pd.read_sql_query(f"SELECT * FROM {table}", engine)
-             df.to_sql(table, instance.conn, if_exists='replace', index=False)
- 
-         instance._refresh_schema()
-         print(f"Connected! {len(tables)} tables")
-         return instance
- 
-     def upload(self, data: Union[str, pd.DataFrame], name: Optional[str] = None,
-                extract_entities: Optional[List[str]] = None,
-                auto_export_mysql: Optional[Dict[str, str]] = None) -> 'SUTRA':
-         """Upload data."""
-         print("\nUploading...")
- 
+     def upload(self, data: Union[str, pd.DataFrame], name: Optional[str] = None) -> 'SUTRA':
+         """Upload."""
          if isinstance(data, pd.DataFrame):
- 
-             self…
+             self._store(data, name or "data")
+             return self
+ 
+         path = Path(data)
+         if not path.exists():
+             raise FileNotFoundError(f"Not found: {data}")
+ 
+         name = name or path.stem.replace(" ", "_").replace("-", "_")
+         ext = path.suffix.lower()
+ 
+         if ext == ".csv":
+             self._store(pd.read_csv(path), name)
+         elif ext in [".xlsx", ".xls"]:
+             self._store(pd.read_excel(path), name)
+         elif ext == ".json":
+             self._store(pd.read_json(path), name)
+         elif ext == ".pdf":
+             self._pdf(path, name)
+         elif ext == ".docx":
+             self._docx(path, name)
+         elif ext == ".txt":
+             self._txt(path, name)
          else:
-             …
-             if not path.exists():
-                 raise FileNotFoundError(f"Not found: {data}")
- 
-             name = name or path.stem.replace(" ", "_").replace("-", "_")
-             ext = path.suffix.lower()
- 
-             print(f"File: {path.name}")
- 
-             if ext == ".csv":
-                 self._store_dataframe(pd.read_csv(path), name)
-             elif ext in [".xlsx", ".xls"]:
-                 self._store_dataframe(pd.read_excel(path), name)
-             elif ext == ".json":
-                 self._store_dataframe(pd.read_json(path), name)
-             elif ext == ".sql":
-                 with open(path) as f:
-                     self.cursor.executescript(f.read())
-                 self.conn.commit()
-                 self._refresh_schema()
-             elif ext == ".pdf":
-                 self._smart_upload_pdf(path, name, extract_entities)
-             elif ext == ".docx":
-                 self._smart_upload_docx(path, name, extract_entities)
-             elif ext == ".txt":
-                 self._smart_upload_txt(path, name, extract_entities)
-             else:
-                 raise ValueError(f"Unsupported: {ext}")
- 
-         if auto_export_mysql:
-             print("\nAuto-exporting to MySQL...")
-             self.save_to_mysql(
-                 host=auto_export_mysql.get('host', 'localhost'),
-                 user=auto_export_mysql.get('user', 'root'),
-                 password=auto_export_mysql['password'],
-                 database=auto_export_mysql['database'],
-                 port=auto_export_mysql.get('port', 3306)
-             )
+             raise ValueError(f"Unsupported: {ext}")

          return self
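upload() now dispatches purely on file extension and drops the extract_entities and auto_export_mysql parameters as well as .sql script support. A sketch of the new call patterns, with placeholder inputs:

```python
# Sketch of the slimmed-down upload() surface in 0.5.x; "people.csv" and
# "report.pdf" are illustrative files.
import pandas as pd
from sutra.sutra import SUTRA

s = SUTRA(api_key="sk-...")
s.upload("people.csv")                         # .csv/.xlsx/.xls/.json -> one table
s.upload(pd.DataFrame({"a": [1, 2]}), "nums")  # DataFrames are stored directly
s.upload("report.pdf")                         # .pdf/.docx/.txt -> extraction paths
# .sql files now raise ValueError("Unsupported: .sql").
```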
-     def …
-         """…
+     def _pdf(self, path: Path, name: str):
+         """BULLETPROOF PDF - ALWAYS creates multiple tables."""
          if not HAS_PYPDF2:
-             raise ImportError("…
+             raise ImportError("pip install PyPDF2")

-         print("Extracting PDF…
+         print(f"Extracting PDF: {path.name}")

-         with open(path, 'rb') as …
-             …
-             full_text = ""
-             for page_num, page in enumerate(pdf_reader.pages, 1):
-                 full_text += page.extract_text() + "\n"
-                 print(f"  Page {page_num}/{len(pdf_reader.pages)}")
+         with open(path, 'rb') as f:
+             text = "".join([p.extract_text() + "\n" for p in PyPDF2.PdfReader(f).pages])

-         if self.client:
-             print("…
- … (old lines 243-287 not rendered by the viewer)
+         if not self.client:
+             print("ERROR: No API key! Set api_key parameter")
+             return
+ 
+         print("AI: Extracting...")
+ 
+         # TRY 3 TIMES
+         entities = None
+         for attempt in [1, 2, 3]:
+             entities = self._extract(text, attempt)
+             if entities and len(entities) > 0:
+                 break
+             if attempt < 3:
+                 print(f"  Retry {attempt+1}/3...")
+ 
+         # Create tables from entities
+         if entities and len(entities) > 0:
+             print(f"Extracted {len(entities)} entity types:")
+             for etype, recs in entities.items():
+                 if recs and len(recs) > 0:
+                     for idx, rec in enumerate(recs, 1):
+                         rec['id'] = idx
+                     self._store(pd.DataFrame(recs), f"{name}_{etype}")
+                     print(f"  {etype}: {len(recs)} rows")
+             return
+ 
+         # REGEX FALLBACK - FIXED
+         print("Using regex fallback...")
+         people = []
+         emails = re.findall(r'[\w\.-]+@[\w\.-]+\.\w+', text)
+ 
+         # Extract names from common patterns
+         name_patterns = [
+             r'(?:Employee|Name|Mr\.|Mrs\.|Ms\.|Dr\.)\s*[:\-]?\s*([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)',
+             r'([A-Z][a-z]+\s+[A-Z][a-z]+)\s+(?:lives|resides|works|is based)',
+             r'\*\*([A-Z][a-z]+\s+[A-Z][a-z]+)\*\*'
+         ]
+ 
+         names = []
+         for pattern in name_patterns:
+             names.extend(re.findall(pattern, text))
+             if len(names) >= len(emails):
+                 break
+ 
+         # Match emails to names
+         max_people = min(len(emails), 50)
+         for i in range(max_people):
+             people.append({
+                 'id': i + 1,
+                 'name': names[i] if i < len(names) else f"Person {i+1}",
+                 'email': emails[i] if i < len(emails) else f"person{i+1}@unknown.com"
+             })
+ 
+         if people:
+             self._store(pd.DataFrame(people), f"{name}_people")
+             print(f"  Extracted {len(people)} people via regex")
+         else:
+             # Absolute last resort
+             lines = [l.strip() for l in text.split('\n') if l.strip()][:100]
+             self._store(pd.DataFrame({'line': range(1, len(lines)+1), 'text': lines}), name)
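The new ingestion strategy is a degradation ladder: up to three LLM extraction attempts, then an email/name regex pass, then raw text lines. A standalone sketch of that ladder, with extract_fn standing in for the LLM call above:

```python
# Sketch of the _pdf() degradation ladder: LLM extraction (3 tries) ->
# email regex -> raw text lines. extract_fn is a stand-in for the model call.
import re

def ingest(text, extract_fn):
    for attempt in (1, 2, 3):  # retries use progressively simpler prompts
        entities = extract_fn(text, attempt)
        if entities:
            return entities    # dict of entity type -> list of records
    emails = re.findall(r'[\w\.-]+@[\w\.-]+\.\w+', text)
    if emails:                 # regex fallback, capped at 50 people
        return {"people": [{"id": i + 1, "email": e} for i, e in enumerate(emails[:50])]}
    lines = [l.strip() for l in text.split("\n") if l.strip()][:100]
    return {"lines": [{"line": i + 1, "text": l} for i, l in enumerate(lines)]}
```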
-     def …
-         """Extract…
+     def _extract(self, text: str, attempt: int) -> Dict:
+         """Extract with 3 different strategies."""
          if not self.client:
              return {}

          try:
-             …
+             if attempt == 1:
+                 sys_msg = "Extract entities as JSON. Return ONLY valid JSON."
+                 usr_msg = f"""Extract ALL entities from text.

  Text:
- {text[:…
+ {text[:15000]}

- …
+ Return JSON with: people, skills, technologies, projects, certifications, education, work_experience

- …
+ Example:
+ {{"people":[{{"id":1,"name":"Sarah Johnson","email":"sarah@co.com","city":"New York","state":"NY"}},{{"id":2,"name":"Michael Chen","email":"michael@co.com","city":"SF","state":"CA"}}],"skills":[{{"id":1,"person_id":1,"skill_name":"Python","proficiency":"Expert"}}]}}

- …
- "people": [{{"id": 1, "name": "John", "email": "john@co.com", "city": "Dallas"}}, ...],
- "skills": [{{"id": 1, "person_id": 1, "skill_name": "Python"}}, ...]
- }}
+ Rules: Unique IDs (1,2,3...), person_id references people.id

- …
+ JSON:"""
+ 
+             elif attempt == 2:
+                 sys_msg = "Return JSON."
+                 usr_msg = f"""Text: {text[:10000]}
+ 
+ Extract people as JSON:
+ {{"people":[{{"id":1,"name":"...","email":"...","city":"..."}}]}}
+ 
+ JSON:"""
+ 
+             else:
+                 sys_msg = "JSON only."
+                 usr_msg = f"""Find names and emails in: {text[:8000]}

- …
+ {{"people":[{{"id":1,"name":"John","email":"john@co.com"}}]}}"""
+ 
+             r = self.client.chat.completions.create(
                  model="gpt-4o-mini",
                  messages=[
-                     {"role": "system", "content": …
-                     {"role": "user", "content": …
+                     {"role": "system", "content": sys_msg},
+                     {"role": "user", "content": usr_msg}
                  ],
                  temperature=0,
-                 max_tokens=…
+                 max_tokens=12000
              )

-             …
+             raw = r.choices[0].message.content.strip()
+             raw = raw.replace("```json", "").replace("```", "").replace("JSON:", "").strip()

-             …
-             …
-             print(f"  DEBUG: First 200 chars: {json_text[:200]}")
+             start = raw.find('{')
+             end = raw.rfind('}') + 1

-             …
+             if start < 0 or end <= start:
+                 return {}

-             result = json.loads(…
+             result = json.loads(raw[start:end])

-             if …
-             …
+             if isinstance(result, dict) and len(result) > 0:
+                 has_data = any(isinstance(v, list) and len(v) > 0 for v in result.values())
+                 if has_data:
+                     return result

-             return result
- 
-         except json.JSONDecodeError as e:
-             if self.debug:
-                 print(f"  DEBUG: JSON parse error: {e}")
-                 print(f"  DEBUG: Response was: {json_text[:500]}")
              return {}
+ 
          except Exception as e:
-             …
-             print(f"  DEBUG: Extraction error: {e}")
+             print(f"  Attempt {attempt} failed: {str(e)[:100]}")
              return {}
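Slicing from the first `{` to the last `}` is what lets _extract survive models that wrap JSON in prose or code fences. A minimal standalone version of that salvage step:

```python
# Sketch of the JSON salvage used in _extract: strip fences, then slice
# from the first "{" to the last "}" before parsing.
import json

def salvage_json(raw: str) -> dict:
    raw = raw.replace("```json", "").replace("```", "").strip()
    start, end = raw.find("{"), raw.rfind("}") + 1
    if start < 0 or end <= start:
        return {}
    try:
        return json.loads(raw[start:end])
    except json.JSONDecodeError:
        return {}

print(salvage_json('Here you go:\n```json\n{"people": []}\n```'))  # {'people': []}
```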
-     def …
-         """…
+     def _docx(self, path: Path, name: str):
+         """DOCX."""
          if not HAS_DOCX:
-             raise ImportError("…
- 
+             raise ImportError("pip install python-docx")
          doc = docx.Document(path)
- 
          if doc.tables:
-             for i, …
-                 data = [[cell.text.strip() for cell in row.cells] for row in …
+             for i, t in enumerate(doc.tables):
+                 data = [[cell.text.strip() for cell in row.cells] for row in t.rows]
                  if data and len(data) > 1:
- … (old lines 357-371 not rendered by the viewer)
-             self._store_dataframe(self._parse_text_simple(text), base_name)
- 
-     def _smart_upload_txt(self, path: Path, base_name: str, extract_entities: Optional[List[str]] = None):
-         """Parse TXT."""
-         with open(path, 'r', encoding='utf-8') as file:
-             text = file.read()
- 
-         if self.client and len(text) > 0:
-             entities = self._extract_chunk(text, extract_entities)
-             if entities:
-                 for entity_type, records in entities.items():
-                     if records:
-                         df = pd.DataFrame(records)
-                         self._store_dataframe_safe(df, f"{base_name}_{entity_type}")
-                 return
- 
-         self._store_dataframe(self._parse_text_simple(text), base_name)
- 
-     def _store_dataframe_safe(self, df: pd.DataFrame, name: str):
+                     self._store(pd.DataFrame(data[1:], columns=data[0]), f"{name}_t{i+1}")
+         else:
+             text = "\n".join([p.text for p in doc.paragraphs])
+             lines = [l.strip() for l in text.split('\n') if l.strip()]
+             self._store(pd.DataFrame({'line': range(1, len(lines)+1), 'text': lines}), name)
+ 
+     def _txt(self, path: Path, name: str):
+         """TXT."""
+         with open(path, 'r', encoding='utf-8') as f:
+             text = f.read()
+         lines = [l.strip() for l in text.split('\n') if l.strip()]
+         self._store(pd.DataFrame({'line': range(1, len(lines)+1), 'text': lines}), name)
+     def _store(self, df: pd.DataFrame, name: str):
          """Store."""
+         df.columns = [str(c).strip().replace(" ", "_").replace("-", "_") for c in df.columns]
          try:
-             df.columns = [str(c).strip().replace(" ", "_").replace("-", "_") for c in df.columns]
              df.to_sql(name, self.conn, if_exists='replace', index=False, method='multi', chunksize=500)
-             self.conn.commit()
-             self.current_table = name
-             self._refresh_schema()
          except:
              df.to_sql(name, self.conn, if_exists='replace', index=False)
- 
- 
- 
-     def _parse_text_simple(self, text: str) -> pd.DataFrame:
-         """Simple parsing."""
-         lines = [line.strip() for line in text.split('\n') if line.strip()]
-         if not lines:
-             return pd.DataFrame({'content': ['No content']})
-         return pd.DataFrame({'line_number': range(1, len(lines) + 1), 'content': lines})
- 
-     def _store_dataframe(self, df: pd.DataFrame, name: str):
-         """Store."""
-         self._store_dataframe_safe(df, name)
-         print(f"Uploaded: {name} ({len(df)} rows)")
+         self.conn.commit()
+         self.current_table = name
+         self._refresh_schema()
+         print(f"  {name}: {len(df)} rows")
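A real fix hides in this hunk: in 0.4.5 the commit and schema refresh sat inside the try, so a failed chunked write that fell back to the plain to_sql path was never committed. The pattern, as a standalone sketch:

```python
# Sketch of the _store() pattern: sanitize column names, try a fast chunked
# write, fall back to a plain write, and commit either way.
import sqlite3
import pandas as pd

conn = sqlite3.connect(":memory:")
df = pd.DataFrame({"first name": ["Ada"], "e-mail": ["ada@co.com"]})
df.columns = [str(c).strip().replace(" ", "_").replace("-", "_") for c in df.columns]
try:
    df.to_sql("people", conn, if_exists="replace", index=False, method="multi", chunksize=500)
except Exception:
    df.to_sql("people", conn, if_exists="replace", index=False)
conn.commit()  # now runs on both paths, unlike the 0.4.5 version above
print(pd.read_sql_query("SELECT * FROM people", conn))
```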
-     def ask(self, …
+     def ask(self, q: str, viz: Union[bool, str] = False, table: Optional[str] = None) -> 'QueryResult':
          """Query."""
          if not self.client:
-             return QueryResult(False, "", pd.DataFrame(), None, "No API…
- 
-         print(f"\nQuestion: {question}")
+             return QueryResult(False, "", pd.DataFrame(), None, "No API")

- 
- 
-             choice = input("Continue? (yes/no): ").strip().lower()
-             if choice not in ['yes', 'y']:
-                 return QueryResult(False, "", pd.DataFrame(), None, "Irrelevant")
- 
-         tbl = table or self.current_table or (self._get_table_names()[0] if self._get_table_names() else None)
-         if not tbl:
+         t = table or self.current_table or (self._get_tables()[0] if self._get_tables() else None)
+         if not t:
              return QueryResult(False, "", pd.DataFrame(), None, "No table")

          if self.use_embeddings and self.embedding_model:
-             cached = self.…
+             cached = self._check_cache(q, t)
              if cached:
-                 print("  Cached")
                  return cached

          if self.fuzzy_match:
-             …
+             q = self._fuzzy(q, t)

- 
-         if self.cache_queries and self.cache and …
- 
-             print("  From cache")
+         key = hashlib.md5(f"{q}:{t}".encode()).hexdigest()
+         if self.cache_queries and self.cache and key in self.cache:
+             sql = self.cache[key]
          else:
- 
-             if self.cache_queries and self.cache…
-                 self.cache[…
+             sql = self._gen_sql(q, t)
+             if self.cache_queries and self.cache:
+                 self.cache[key] = sql

-         print(f"SQL: {…
+         print(f"SQL: {sql}")

          try:
-             df = pd.read_sql_query(…
+             df = pd.read_sql_query(sql, self.conn)
              print(f"Success! {len(df)} rows")
- 
- 
-             if viz:
-                 viz_type = viz if isinstance(viz, str) else "auto"
-                 fig = self._visualize(df, question, viz_type)
- 
-             result = QueryResult(True, sql_query, df, fig)
+             fig = self._viz(df, q, viz if isinstance(viz, str) else "auto") if viz else None
+             r = QueryResult(True, sql, df, fig)

              if self.use_embeddings and self.embedding_model:
-                 self.…
+                 self._store_cache(q, t, r)

-             return …
+             return r
          except Exception as e:
- 
-             return QueryResult(False, sql_query, pd.DataFrame(), None, str(e))
- 
-     def _is_relevant_query(self, question: str) -> bool:
-         """Check relevance."""
-         if not self.client:
-             return True
-         try:
-             tables = self._get_table_names()[:3]
-             resp = self.client.chat.completions.create(
-                 model="gpt-4o-mini",
-                 messages=[
-                     {"role": "system", "content": "Return 'yes' or 'no'."},
-                     {"role": "user", "content": f"Relevant to DB with tables {', '.join(tables)}?\n\nQ: {question}\n\nyes/no:"}
-                 ],
-                 temperature=0,
-                 max_tokens=5
-             )
-             return 'yes' in resp.choices[0].message.content.lower()
-         except:
-             return True
+             return QueryResult(False, sql, pd.DataFrame(), None, str(e))
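ask() no longer prompts the user interactively on relevance checks; it memoizes generated SQL under an MD5 digest of the (question, table) pair. That cache, isolated:

```python
# Sketch of the per-(question, table) SQL cache keyed by an MD5 digest,
# as in ask() above. gen_sql stands in for the LLM round trip.
import hashlib

cache = {}

def cached_sql(q: str, t: str, gen_sql) -> str:
    key = hashlib.md5(f"{q}:{t}".encode()).hexdigest()
    if key not in cache:
        cache[key] = gen_sql(q, t)
    return cache[key]

print(cached_sql("count rows", "people", lambda q, t: f"SELECT COUNT(*) FROM {t}"))
print(cached_sql("count rows", "people", lambda q, t: "never called"))  # cache hit
```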
-     def …
+     def _fuzzy(self, q: str, t: str) -> str:
          """Fuzzy."""
-         if not self.schema_info.get(table):
-             return question
- 
          try:
- 
-             if not …
-                 return …
- … (old lines 501-509 not rendered by the viewer)
-                     print(f"  Fuzzy: '{word}' -> '{matches[0]}'")
-             question = " ".join(words)
-             return question
+             cols = [c for c, d in self.schema_info.get(t, {}).items() if 'TEXT' in d]
+             if not cols:
+                 return q
+             for col in cols[:2]:
+                 df = pd.read_sql_query(f"SELECT DISTINCT {col} FROM {t} LIMIT 100", self.conn)
+                 vals = [str(v) for v in df[col].dropna()]
+                 words = q.split()
+                 for i, w in enumerate(words):
+                     m = get_close_matches(w, vals, n=1, cutoff=0.6)
+                     if m and w != m[0]:
+                         words[i] = m[0]
+                 q = " ".join(words)
+             return q
          except:
-             return …
+             return q
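The fuzzy rewrite snaps each question word to the closest DISTINCT value of the table's first two TEXT columns, using difflib with a 0.6 cutoff. The core move, runnable on its own:

```python
# Sketch of the fuzzy word-to-value snapping in _fuzzy(); values are
# illustrative city names.
from difflib import get_close_matches

vals = ["Dallas", "Austin", "Houston"]
q = "people in Dalas"
words = q.split()
for i, w in enumerate(words):
    m = get_close_matches(w, vals, n=1, cutoff=0.6)
    if m and w != m[0]:
        words[i] = m[0]
print(" ".join(words))  # "people in Dallas"
```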
-     def …
-         """…
+     def _check_cache(self, q: str, t: str) -> Optional['QueryResult']:
+         """Cache."""
          if not self.query_embeddings:
              return None
- 
- … (old lines 520-523 not rendered by the viewer)
-         for cached_q, data in self.query_embeddings.items():
-             if data['table'] != table:
+         emb = self.embedding_model.encode([q])[0]
+         best, sim = None, 0.85
+         for cq, d in self.query_embeddings.items():
+             if d['table'] != t:
                  continue
- 
-             if …
- … (old lines 529-531 not rendered by the viewer)
-         if best_match:
-             print(f"  Similar ({best_sim:.0%})")
-             return self.query_embeddings[best_match]['result']
-         return None
+             s = np.dot(emb, d['embedding']) / (np.linalg.norm(emb) * np.linalg.norm(d['embedding']))
+             if s > sim:
+                 sim, best = s, cq
+         return self.query_embeddings[best]['result'] if best else None
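The semantic cache returns a prior result only when cosine similarity against a stored query embedding exceeds 0.85. The lookup, as a standalone sketch with toy vectors:

```python
# Sketch of the _check_cache() cosine-similarity lookup; embeddings here
# are toy 2-d vectors rather than sentence-transformer output.
import numpy as np

def best_hit(emb, store, threshold=0.85):
    best, sim = None, threshold
    for q, d in store.items():
        s = np.dot(emb, d["embedding"]) / (np.linalg.norm(emb) * np.linalg.norm(d["embedding"]))
        if s > sim:
            sim, best = s, q
    return best

store = {"total rows?": {"embedding": np.array([1.0, 0.0]), "result": "hit"}}
print(best_hit(np.array([0.99, 0.05]), store))  # "total rows?"
```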
-     def …
+     def _store_cache(self, q: str, t: str, r: 'QueryResult'):
          """Store."""
- 
-         self.query_embeddings[…
+         emb = self.embedding_model.encode([q])[0]
+         self.query_embeddings[q] = {'table': t, 'embedding': emb, 'result': r}
-     def …
+     def _viz(self, df: pd.DataFrame, title: str, vt: str):
          """Viz."""
-         if not HAS_PLOTLY…
+         if not HAS_PLOTLY:
              return None
-         print(f"Creating {viz_type} chart...")
-         return self._plotly_viz(df, title, viz_type) if HAS_PLOTLY else self._matplotlib_viz(df, title, viz_type)
- 
-     def _plotly_viz(self, df: pd.DataFrame, title: str, viz_type: str):
-         """Plotly."""
          try:
- … (old lines 552-561 not rendered by the viewer)
-                 fig = px.line(df, y=num[0], title=title)
-             elif viz_type == "scatter" and len(num) >= 2:
-                 fig = px.scatter(df, x=num[0], y=num[1], title=title)
-             elif viz_type == "heatmap" and len(num) >= 2:
-                 corr = df[num].corr()
-                 fig = go.Figure(data=go.Heatmap(z=corr.values, x=corr.columns, y=corr.columns))
-                 fig.update_layout(title=title)
+             n = df.select_dtypes(include=[np.number]).columns.tolist()
+             c = df.select_dtypes(include=['object']).columns.tolist()
+             if vt == "pie" and c and n:
+                 fig = px.pie(df, names=c[0], values=n[0], title=title)
+             elif vt == "bar" and c and n:
+                 fig = px.bar(df, x=c[0], y=n[0], title=title)
+             elif vt == "line" and n:
+                 fig = px.line(df, y=n[0], title=title)
+             elif vt == "scatter" and len(n) >= 2:
+                 fig = px.scatter(df, x=n[0], y=n[1], title=title)
              else:
- 
-                 fig = px.pie(df, names=cat[0], values=num[0], title=title) if len(df) <= 10 else px.bar(df, x=cat[0], y=num[0], title=title)
-             else:
-                 fig = px.bar(df, y=df.columns[0], title=title)
+                 fig = px.bar(df, y=df.columns[0], title=title)
              fig.show()
              return fig
          except:
              return None

-     def …
-         """…
- 
-             plt.figure(figsize=(10, 6))
-             num = df.select_dtypes(include=[np.number]).columns
-             if viz_type == "pie":
-                 df[df.columns[0]].value_counts().plot(kind='pie')
-             elif viz_type == "line" and len(num) > 0:
-                 df[num[0]].plot(kind='line')
-             else:
-                 (df[num[0]] if len(num) > 0 else df.iloc[:, 0].value_counts()).plot(kind='bar')
-             plt.title(title)
-             plt.tight_layout()
-             plt.show()
-             return plt.gcf()
-         except:
-             return None
- 
-     def tables(self) -> Dict[str, dict]:
-         """List."""
+     def tables(self) -> Dict:
+         """Tables."""
+         t = self._get_tables()
          print("\n" + "="*70)
          print("TABLES")
          print("="*70)
- 
-         all_tables = self._get_table_names()
-         if not all_tables:
+         if not t:
              print("No tables")
              return {}
- … (old lines 607-612 not rendered by the viewer)
-             result[tbl] = {'rows': cnt, 'columns': cols}
- 
+         r = {}
+         for i, tb in enumerate(t, 1):
+             cnt = pd.read_sql_query(f"SELECT COUNT(*) FROM {tb}", self.conn).iloc[0, 0]
+             cols = list(self.schema_info.get(tb, {}).keys())
+             print(f"  {i}. {tb}: {cnt} rows, {len(cols)} cols")
+             r[tb] = {'rows': cnt, 'columns': cols}
          print("="*70)
-         return …
+         return r
-     def schema(self, table: Optional[str] = None) -> …
+     def schema(self, table: Optional[str] = None) -> Dict:
          """Schema."""
          if not self.schema_info:
              self._refresh_schema()
- 
          print("\n" + "="*70)
          print("SCHEMA")
          print("="*70)
- … (old lines 626-633 not rendered by the viewer)
-             result[tbl] = {'records': cnt, 'columns': self.schema_info[tbl]}
- 
+         r = {}
+         for t in ([table] if table else self.schema_info.keys()):
+             if t in self.schema_info:
+                 cnt = pd.read_sql_query(f"SELECT COUNT(*) FROM {t}", self.conn).iloc[0, 0]
+                 print(f"\n{t}: {cnt} records")
+                 for c, d in self.schema_info[t].items():
+                     print(f"  - {c:<30} {d}")
+                 r[t] = {'records': cnt, 'columns': self.schema_info[t]}
          print("="*70)
-         return …
+         return r

      def peek(self, table: Optional[str] = None, n: int = 5) -> pd.DataFrame:
          """Preview."""
- 
-         if not …
+         t = table or self.current_table
+         if not t:
              return pd.DataFrame()
-         df = pd.read_sql_query(f"SELECT * FROM {…
-         print(f"\nSample from '{…
+         df = pd.read_sql_query(f"SELECT * FROM {t} LIMIT {n}", self.conn)
+         print(f"\nSample from '{t}':")
          print(df.to_string(index=False))
          return df

-     def info(self):
-         """Info."""
-         return self.tables()
- 
      def sql(self, query: str, viz: Union[bool, str] = False) -> 'QueryResult':
          """SQL."""
          try:
              df = pd.read_sql_query(query, self.conn)
              print(f"Success! {len(df)} rows")
-             fig = self.…
+             fig = self._viz(df, "Result", viz if isinstance(viz, str) else "auto") if viz else None
              return QueryResult(True, query, df, fig)
          except Exception as e:
-             print(f"Error: {e}")
              return QueryResult(False, query, pd.DataFrame(), None, str(e))
+     def save_to_mysql(self, host: str, user: str, password: str, database: str, port: int = 3306):
+         """MySQL."""
+         try:
+             from sqlalchemy import create_engine
+             import mysql.connector
+         except:
+             raise ImportError("pip install QuerySUTRA[mysql]")
+ 
+         print(f"Exporting to MySQL: {database}")
+ 
+         try:
+             tc = mysql.connector.connect(host=host, user=user, password=password, port=port)
+             tc.cursor().execute(f"CREATE DATABASE IF NOT EXISTS `{database}`")
+             tc.close()
+         except:
+             pass
+ 
+         engine = create_engine(f"mysql+mysqlconnector://{user}:{password}@{host}:{port}/{database}")
+         for t in self._get_tables():
+             df = pd.read_sql_query(f"SELECT * FROM {t}", self.conn)
+             df.to_sql(t, engine, if_exists='replace', index=False)
+             print(f"  {t}: {len(df)} rows")
+         print("Done!")
+         return self

      def export_db(self, path: str, format: str = "sqlite"):
          """Export."""
          if format == "sqlite":
              shutil.copy2(self.db_path, path)
-         elif format == "sql":
-             with open(path, 'w', encoding='utf-8') as f:
-                 for line in self.conn.iterdump():
-                     f.write(f'{line}\n')
          elif format == "json":
-             data = {t: pd.read_sql_query(f"SELECT * FROM {t}", self.conn).to_dict('records') for t in self.…
-             with open(path, 'w'…
+             data = {t: pd.read_sql_query(f"SELECT * FROM {t}", self.conn).to_dict('records') for t in self._get_tables()}
+             with open(path, 'w') as f:
                  json.dump(data, f, indent=2, default=str)
-         elif format == "excel":
-             with pd.ExcelWriter(path, engine='openpyxl') as writer:
-                 for t in self._get_table_names():
-                     pd.read_sql_query(f"SELECT * FROM {t}", self.conn).to_excel(writer, sheet_name=t[:31], index=False)
-         else:
-             raise ValueError(f"Unsupported: {format}")
          print(f"Saved: {path}")
          return self

- 
- 
-         """…
+     @classmethod
+     def load_from_db(cls, db_path: str, api_key: Optional[str] = None, **kwargs):
+         """Load."""
+         if not Path(db_path).exists():
+             raise FileNotFoundError(f"Not found: {db_path}")
+         return cls(api_key=api_key, db=db_path, **kwargs)
+ 
+     @classmethod
+     def connect_mysql(cls, host: str, user: str, password: str, database: str, port: int = 3306, api_key: Optional[str] = None, **kwargs):
+         """MySQL."""
          try:
              from sqlalchemy import create_engine
              import mysql.connector
-         except…
-             raise ImportError("…
- 
-         print(f"Exporting to MySQL: {host}/{database}")
+         except:
+             raise ImportError("pip install QuerySUTRA[mysql]")

- … (old lines 702-707 not rendered by the viewer)
-             temp_conn.close()
-             print(f"  Database '{database}' ready")
-         except Exception as e:
-             print(f"  Warning: {e}")
+         try:
+             tc = mysql.connector.connect(host=host, user=user, password=password, port=port)
+             tc.cursor().execute(f"CREATE DATABASE IF NOT EXISTS {database}")
+             tc.close()
+         except:
+             pass

          engine = create_engine(f"mysql+mysqlconnector://{user}:{password}@{host}:{port}/{database}")
+         temp_db = f"mysql_{database}.db"
+         instance = cls(api_key=api_key, db=temp_db, **kwargs)

- … (old lines 715-717 not rendered by the viewer)
-             print(f"  {t}: {len(df)} rows")
+         tables = pd.read_sql_query("SHOW TABLES", engine).iloc[:, 0].tolist()
+         for t in tables:
+             pd.read_sql_query(f"SELECT * FROM {t}", engine).to_sql(t, instance.conn, if_exists='replace', index=False)

- 
- 
- 
-     def save_to_postgres(self, host: str, user: str, password: str, database: str, port: int = 5432, tables: Optional[List[str]] = None):
-         """PostgreSQL."""
-         try:
-             from sqlalchemy import create_engine
-             engine = create_engine(f"postgresql://{user}:{password}@{host}:{port}/{database}")
-             print(f"Exporting to PostgreSQL...")
-             for t in (tables or self._get_table_names()):
-                 df = pd.read_sql_query(f"SELECT * FROM {t}", self.conn)
-                 df.to_sql(t, engine, if_exists='replace', index=False)
-                 print(f"  {t}: {len(df)} rows")
-             print("Complete!")
-             return self
-         except ImportError:
-             raise ImportError("Run: pip install QuerySUTRA[postgres]")
- 
-     def backup(self, path: str = None):
-         """Backup."""
-         dir = Path(path) if path else Path(".")
-         dir.mkdir(parents=True, exist_ok=True)
-         ts = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
-         self.export_db(str(dir / f"sutra_{ts}.db"), "sqlite")
-         self.export_db(str(dir / f"sutra_{ts}.json"), "json")
-         print("Backup complete!")
-         return self
- 
-     def export(self, data: pd.DataFrame, path: str, format: str = "csv"):
-         """Export."""
-         if format == "csv":
-             data.to_csv(path, index=False)
-         elif format in ["excel", "xlsx"]:
-             data.to_excel(path, index=False)
-         elif format == "json":
-             data.to_json(path, orient="records", indent=2)
-         print(f"Exported: {path}")
-         return self
+         instance._refresh_schema()
+         print(f"Connected! {len(tables)} tables")
+         return instance
-     def …
-         """…
- 
- 
+     def _gen_sql(self, q: str, t: str) -> str:
+         """SQL."""
+         schema = self.schema_info.get(t, {})
+         sample = pd.read_sql_query(f"SELECT * FROM {t} LIMIT 3", self.conn).to_string(index=False)
+         cols = ", ".join([f"{c} ({d})" for c, d in schema.items()])
+ 
+         r = self.client.chat.completions.create(
+             model="gpt-4o-mini",
+             messages=[
+                 {"role": "system", "content": "SQL expert. Return only SQL."},
+                 {"role": "user", "content": f"Table: {t}\nColumns: {cols}\nSample:\n{sample}\n\nQ: {q}\n\nSQL:"}
+             ],
+             temperature=0
+         )
+         return r.choices[0].message.content.strip().replace("```sql", "").replace("```", "").strip()
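The text-to-SQL prompt is compact: table name, typed column list, and a three-row sample, with the model pinned to temperature 0. Building that prompt in isolation (schema and rows are illustrative):

```python
# Sketch of the prompt _gen_sql() assembles before the model call.
schema = {"name": "TEXT", "city": "TEXT", "salary": "INTEGER"}  # illustrative
sample = "name city   salary\nAda  Dallas 91000"                # illustrative rows
cols = ", ".join(f"{c} ({d})" for c, d in schema.items())
prompt = f"Table: people\nColumns: {cols}\nSample:\n{sample}\n\nQ: average salary by city\n\nSQL:"
print(prompt)
```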
-     def …
+     def _get_tables(self) -> List[str]:
          """Tables."""
          self.cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
          return [r[0] for r in self.cursor.fetchall()]
@@ -769,25 +529,13 @@ ONLY valid JSON. No explanations."""
      def _refresh_schema(self):
          """Refresh."""
          self.schema_info = {}
-         for …
-             self.cursor.execute(f"PRAGMA table_info({…
-             self.schema_info[…
+         for t in self._get_tables():
+             self.cursor.execute(f"PRAGMA table_info({t})")
+             self.schema_info[t] = {r[1]: r[2] for r in self.cursor.fetchall()}

-     def …
- 
- 
-             sample = pd.read_sql_query(f"SELECT * FROM {table} LIMIT 3", self.conn).to_string(index=False)
-             schema_str = ", ".join([f"{col} ({dtype})" for col, dtype in schema.items()])
- 
-             resp = self.client.chat.completions.create(
-                 model="gpt-4o-mini",
-                 messages=[
-                     {"role": "system", "content": "SQL expert. Return only SQL."},
-                     {"role": "user", "content": f"Table: {table}\nColumns: {schema_str}\nSample:\n{sample}\n\nQ: {question}\n\nSQL:"}
-                 ],
-                 temperature=0
-             )
-             return resp.choices[0].message.content.strip().replace("```sql", "").replace("```", "").strip()
+     def close(self):
+         if self.conn:
+             self.conn.close()

      def __enter__(self):
          return self
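schema_info is rebuilt from SQLite's PRAGMA table_info, whose rows are (cid, name, type, notnull, dflt_value, pk); indices 1 and 2 yield the {column: declared_type} map. Verifiable in isolation:

```python
# Sketch of the _refresh_schema() PRAGMA usage.
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE people (name TEXT, salary INTEGER)")
cur = conn.execute("PRAGMA table_info(people)")
print({r[1]: r[2] for r in cur.fetchall()})  # {'name': 'TEXT', 'salary': 'INTEGER'}
```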
@@ -810,10 +558,3 @@ class QueryResult:
      def show(self):
          print(self.data if self.success else f"Error: {self.error}")
          return self
- 
- 
- def quick_start(api_key: str, data_path: str, question: str, viz: Union[bool, str] = False):
-     """Quick."""
-     with SUTRA(api_key=api_key) as sutra:
-         sutra.upload(data_path)
-         return sutra.ask(question, viz=viz)
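The module-level quick_start() helper is gone in 0.5.x, but the same one-shot flow still works through the context manager the class retains (__enter__ returns self). A sketch with placeholder key and file:

```python
# Sketch replacing the removed quick_start() helper.
from sutra.sutra import SUTRA

with SUTRA(api_key="sk-...") as s:
    s.upload("people.csv")
    result = s.ask("how many people per city?", viz="bar")
    result.show()
```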
{querysutra-0.5.0.dist-info → querysutra-0.5.2.dist-info}/WHEEL
File without changes

{querysutra-0.5.0.dist-info → querysutra-0.5.2.dist-info}/licenses/LICENSE
File without changes

{querysutra-0.5.0.dist-info → querysutra-0.5.2.dist-info}/top_level.txt
File without changes