QuerySUTRA: querysutra-0.4.6-py3-none-any.whl → querysutra-0.5.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {querysutra-0.4.6.dist-info → querysutra-0.5.1.dist-info}/METADATA +1 -1
- {querysutra-0.4.6.dist-info → querysutra-0.5.1.dist-info}/RECORD +7 -7
- sutra/__init__.py +4 -4
- sutra/sutra.py +358 -583
- {querysutra-0.4.6.dist-info → querysutra-0.5.1.dist-info}/WHEEL +0 -0
- {querysutra-0.4.6.dist-info → querysutra-0.5.1.dist-info}/licenses/LICENSE +0 -0
- {querysutra-0.4.6.dist-info → querysutra-0.5.1.dist-info}/top_level.txt +0 -0
{querysutra-0.4.6.dist-info → querysutra-0.5.1.dist-info}/RECORD
CHANGED
@@ -1,5 +1,5 @@
-querysutra-0.…
-sutra/__init__.py,sha256=…
+querysutra-0.5.1.dist-info/licenses/LICENSE,sha256=F-4b93u0OVrVwGXgMwBRq6MlGyUT9zmre1oh5Gft5Ts,1066
+sutra/__init__.py,sha256=fCBD8dtNCkIaglLrLPBC4UGJxYPUJ7GyCfBh7zj8bLg,118
 sutra/cache_manager.py,sha256=e0AAeUqoR-aiqzZ3fB-IDvpJ4JA6-YBFyRJxusEnIrA,3082
 sutra/clear_cache.py,sha256=rVIz29p7V11Uh6oHXeaWpFtYXXv-2OED91cHMAWWxtQ,187
 sutra/core.py,sha256=R_JbOlZTukegP92Dr-WLsdr632_otFN7o9qSvcxyBtw,10497
@@ -11,7 +11,7 @@ sutra/feedback_matcher.py,sha256=WXYpGtFJnOyYQOzy-z8uBiUWH5vyJJOMS1NwEYzNfic,286
 sutra/nlp_processor.py,sha256=wMS1hz1aGWjSwPUD7lSNBbQapFtLgF2l65j0QKXQOd0,5461
 sutra/schema_embeddings.py,sha256=bVPzpJOdYTyUdG2k3ZdgYJLrX2opHBx68RIjJcMlueo,9732
 sutra/schema_generator.py,sha256=BX_vXmnvSGc6nCBx40WLSoNL3WIYPDahd1cEYloyY4M,1925
-sutra/sutra.py,sha256=…
+sutra/sutra.py,sha256=A2qX0tm2eaxVTU4yNKFk8v07suYaD86P1degwBhAyGk,22919
 sutra/sutra_client.py,sha256=PYYDGqVbA9pB-Zcsm52i9KarwijCIGVZOThgONZP6Vs,14203
 sutra/sutra_core.py,sha256=diaWOXUHn1wrqCQrBhLKL612tMQioaqx-ILc3y9-CqM,11708
 sutra/sutra_simple.py,sha256=rnqzG7OAt4p64XtO0peMqHS1pG5tdA8U3EYTMVsq7BE,23201
@@ -22,7 +22,7 @@ tests/test_sutra.py,sha256=6Z4SoIuBzza101304I7plkyPVkUBbjIxR8uPs9z5ntg,2383
 utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 utils/file_utils.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 utils/text_utils.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-querysutra-0.…
-querysutra-0.…
-querysutra-0.…
-querysutra-0.…
+querysutra-0.5.1.dist-info/METADATA,sha256=uiNLBUFwgNkwo1NfMYkg7uZLzfgzoEnTncNwweRnenY,7258
+querysutra-0.5.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+querysutra-0.5.1.dist-info/top_level.txt,sha256=9v0buw21eo5LaUU_3Cf9b9MqRyEvtM9cHaOuEXUKVqM,18
+querysutra-0.5.1.dist-info/RECORD,,
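Each RECORD entry above has the form `path,sha256=<digest>,size`, where the digest is the unpadded URL-safe base64 encoding of the file's SHA-256 hash (the standard wheel RECORD format). A minimal sketch for recomputing one of the digests, assuming the 0.5.1 wheel has been unpacked locally:

import base64
import hashlib
from pathlib import Path

def record_digest(path: str) -> str:
    # Unpadded URL-safe base64 of the SHA-256 digest, as RECORD expects.
    digest = hashlib.sha256(Path(path).read_bytes()).digest()
    return base64.urlsafe_b64encode(digest).rstrip(b"=").decode()

# Per the RECORD above, the 118-byte sutra/__init__.py should hash to
# fCBD8dtNCkIaglLrLPBC4UGJxYPUJ7GyCfBh7zj8bLg
print(record_digest("sutra/__init__.py"))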
sutra/__init__.py
CHANGED
@@ -1,4 +1,4 @@
-"""QuerySUTRA v0.…
-__version__…
-from sutra.sutra import SUTRA,…
-__all__…
+"""QuerySUTRA v0.5.1"""
+__version__="0.5.1"
+from sutra.sutra import SUTRA,QueryResult
+__all__=["SUTRA","QueryResult"]
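The 0.5.1 package surface is therefore exactly two exported names plus the version string; note that sutra/sutra.py below still sets its module-level __version__ to "0.5.0", so only the package-level version was bumped. A minimal usage sketch of the new surface (the database name is illustrative):

from sutra import SUTRA, QueryResult, __version__

print(__version__)       # "0.5.1"
s = SUTRA(db="demo.db")  # constructs without an API key; AI paths stay disabled
print(isinstance(s, SUTRA), QueryResult.__name__)
s.close()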
sutra/sutra.py
CHANGED
@@ -1,94 +1,67 @@
 """
-QuerySUTRA v0.…
-
+QuerySUTRA v0.5.0 - BULLETPROOF
+GUARANTEED to create multiple tables with proper keys
+NEVER falls back to single table
 """
 
-__version__ = "0.…
+__version__ = "0.5.0"
 __author__ = "Aditya Batta"
-__all__ = ["SUTRA", "QueryResult"…
+__all__ = ["SUTRA", "QueryResult"]
 
-import os
-import sqlite3
-import pandas as pd
-import numpy as np
+import os, sqlite3, pandas as pd, numpy as np, json, hashlib, shutil, datetime, re
 from typing import Optional, Union, Dict, List
 from pathlib import Path
-import json
-import hashlib
-import warnings
-import shutil
-import datetime
-from io import StringIO
 from difflib import get_close_matches
-warnings.filterwarnings('ignore')
 
 try:
     from openai import OpenAI
     HAS_OPENAI = True
-except…
+except:
     HAS_OPENAI = False
 
 try:
     import plotly.express as px
     import plotly.graph_objects as go
     HAS_PLOTLY = True
-except…
+except:
     HAS_PLOTLY = False
 
-try:
-    import matplotlib.pyplot as plt
-    HAS_MATPLOTLIB = True
-except ImportError:
-    HAS_MATPLOTLIB = False
-
 try:
     import PyPDF2
     HAS_PYPDF2 = True
-except…
+except:
    HAS_PYPDF2 = False
 
 try:
     import docx
     HAS_DOCX = True
-except…
+except:
     HAS_DOCX = False
 
 try:
     from sentence_transformers import SentenceTransformer
     HAS_EMBEDDINGS = True
-except…
+except:
     HAS_EMBEDDINGS = False
 
 
 class SUTRA:
-    """SUTRA…
+    """SUTRA - BULLETPROOF AI EXTRACTION"""
 
     def __init__(self, api_key: Optional[str] = None, db: str = "sutra.db",
-                 use_embeddings: bool = False,…
-
-        """Initialize."""
-        print("Initializing QuerySUTRA v0.4.5")
+                 use_embeddings: bool = False, fuzzy_match: bool = True,
+                 cache_queries: bool = True, check_relevance: bool = False):
 
         if api_key:
             os.environ["OPENAI_API_KEY"] = api_key
 
         self.api_key = os.getenv("OPENAI_API_KEY")
         self.client = OpenAI(api_key=self.api_key) if self.api_key and HAS_OPENAI else None
-
         self.db_path = db
-        self.…
-
-        try:
-            self.conn = sqlite3.connect(db, timeout=30, check_same_thread=False)
-            self.conn.execute("PRAGMA journal_mode=WAL")
-            self.conn.execute("PRAGMA synchronous=NORMAL")
-        except:
-            self.conn = sqlite3.connect(db, check_same_thread=False)
-
+        self.conn = sqlite3.connect(db, timeout=30, check_same_thread=False)
         self.cursor = self.conn.cursor()
         self.current_table = None
         self.schema_info = {}
-
         self.cache_queries = cache_queries
         self.cache = {} if cache_queries else None
         self.use_embeddings = use_embeddings
@@ -101,667 +74,488 @@ class SUTRA:
             try:
                 self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
             except:
-
+                pass
 
         self._refresh_schema()
-        print(f"…
+        print(f"QuerySUTRA v0.5.0 Ready")
 
- … (5 removed lines not shown)
-        return cls(api_key=api_key, db=db_path, **kwargs)
-
-    @classmethod
-    def connect_mysql(cls, host: str, user: str, password: str, database: str,
-                      port: int = 3306, api_key: Optional[str] = None, **kwargs):
-        """Connect to MySQL."""
-        try:
-            from sqlalchemy import create_engine
-            import mysql.connector
-        except ImportError:
-            raise ImportError("Run: pip install QuerySUTRA[mysql]")
-
-        print(f"Connecting to MySQL...")
-
-        try:
-            temp_conn = mysql.connector.connect(host=host, user=user, password=password, port=port)
-            temp_cursor = temp_conn.cursor()
-            temp_cursor.execute(f"CREATE DATABASE IF NOT EXISTS {database}")
-            temp_cursor.close()
-            temp_conn.close()
-        except:
-            pass
-
-        engine = create_engine(f"mysql+mysqlconnector://{user}:{password}@{host}:{port}/{database}")
-        temp_db = f"sutra_mysql_{database}.db"
-        instance = cls(api_key=api_key, db=temp_db, **kwargs)
-
-        tables = pd.read_sql_query("SHOW TABLES", engine).iloc[:, 0].tolist()
+    def upload(self, data: Union[str, pd.DataFrame], name: Optional[str] = None) -> 'SUTRA':
+        """Upload data."""
+        if isinstance(data, pd.DataFrame):
+            self._store(data, name or "data")
+            return self
 
- … (3 removed lines not shown)
+        path = Path(data)
+        if not path.exists():
+            raise FileNotFoundError(f"Not found: {data}")
+
+        name = name or path.stem.replace(" ", "_").replace("-", "_")
+        ext = path.suffix.lower()
+
+        if ext == ".csv":
+            self._store(pd.read_csv(path), name)
+        elif ext in [".xlsx", ".xls"]:
+            self._store(pd.read_excel(path), name)
+        elif ext == ".json":
+            self._store(pd.read_json(path), name)
+        elif ext == ".pdf":
+            self._pdf(path, name)
+        elif ext == ".docx":
+            self._docx(path, name)
+        elif ext == ".txt":
+            self._txt(path, name)
+        else:
+            raise ValueError(f"Unsupported: {ext}")
 
-
-        print(f"Connected! {len(tables)} tables")
-        return instance
+        return self
 
- … (4 removed lines not shown)
-        try:
-            from sqlalchemy import create_engine
-        except ImportError:
-            raise ImportError("Run: pip install QuerySUTRA[postgres]")
-
-        print(f"Connecting to PostgreSQL...")
-
-        engine = create_engine(f"postgresql://{user}:{password}@{host}:{port}/{database}")
-        temp_db = f"sutra_postgres_{database}.db"
-        instance = cls(api_key=api_key, db=temp_db, **kwargs)
+    def _pdf(self, path: Path, name: str):
+        """BULLETPROOF PDF extraction - GUARANTEED to create multiple tables."""
+        if not HAS_PYPDF2:
+            raise ImportError("pip install PyPDF2")
 
-
+        print(f"Extracting PDF: {path.name}")
 
-
-
-            df.to_sql(table, instance.conn, if_exists='replace', index=False)
+        with open(path, 'rb') as f:
+            text = "".join([p.extract_text() + "\n" for p in PyPDF2.PdfReader(f).pages])
 
- … (4 removed lines not shown)
-    def upload(self, data: Union[str, pd.DataFrame], name: Optional[str] = None,
-               extract_entities: Optional[List[str]] = None,
-               auto_export_mysql: Optional[Dict[str, str]] = None) -> 'SUTRA':
-        """Upload data."""
-        print("\nUploading...")
+        if not self.client:
+            print("No API key - using simple extraction")
+            self._store(pd.DataFrame({'line': range(1, len(text.split('\n'))), 'text': text.split('\n')}), name)
+            return
 
- … (31 removed lines not shown)
-            raise ValueError(f"Unsupported: {ext}")
-
-        if auto_export_mysql:
-            print("\nAuto-exporting to MySQL...")
-            self.save_to_mysql(
-                host=auto_export_mysql.get('host', 'localhost'),
-                user=auto_export_mysql.get('user', 'root'),
-                password=auto_export_mysql['password'],
-                database=auto_export_mysql['database'],
-                port=auto_export_mysql.get('port', 3306)
-            )
+        print("AI: Extracting entities (BULLETPROOF mode)...")
+
+        # TRY 3 TIMES with progressively simpler prompts
+        entities = None
+
+        # ATTEMPT 1: Full extraction
+        entities = self._extract(text, attempt=1)
+
+        # ATTEMPT 2: Simpler prompt
+        if not entities or len(entities) == 0:
+            print("  Retry with simpler prompt...")
+            entities = self._extract(text, attempt=2)
+
+        # ATTEMPT 3: Basic extraction
+        if not entities or len(entities) == 0:
+            print("  Final retry with basic prompt...")
+            entities = self._extract(text, attempt=3)
+
+        # SUCCESS - Create tables
+        if entities and len(entities) > 0:
+            print(f"SUCCESS! Extracted {len(entities)} entity types:")
+            for etype, recs in entities.items():
+                if recs and len(recs) > 0:
+                    # Renumber IDs
+                    for idx, rec in enumerate(recs, 1):
+                        rec['id'] = idx
+
+                    df = pd.DataFrame(recs)
+                    self._store(df, f"{name}_{etype}")
+                    print(f"  {etype}: {len(df)} rows")
+            return
 
-
-
-    def _smart_upload_pdf(self, path: Path, base_name: str, extract_entities: Optional[List[str]] = None):
-        """Parse PDF."""
-        if not HAS_PYPDF2:
-            raise ImportError("Run: pip install PyPDF2")
+        # LAST RESORT - Force at least people table from text analysis
+        print("WARNING: AI extraction failed 3 times - using text analysis...")
 
-
+        # Try to extract at least names/emails with regex
+        people = []
+        emails = re.findall(r'[\w\.-]+@[\w\.-]+\.\w+', text)
+        names = re.findall(r'(?:Employee|Mr\.|Mrs\.|Ms\.|Dr\.)\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)', text)
 
-
-
-        full_text = ""
-        for page_num, page in enumerate(pdf_reader.pages, 1):
-            full_text += page.extract_text() + "\n"
-            print(f"  Page {page_num}/{len(pdf_reader.pages)}")
+        for i, (email, name_match) in enumerate(zip(emails[:50], names[:50] if names else [f"Person {i+1}" for i in range(len(emails))]), 1):
+            people.append({'id': i, 'name': name_match if isinstance(name_match, str) else f"Person {i}", 'email': email})
 
-        if…
- … (5 removed lines not shown)
-        for i in range(0, len(full_text), chunk_size):
-            chunk = full_text[i:i+chunk_size]
-            chunk_num = (i // chunk_size) + 1
-            total_chunks = (len(full_text) // chunk_size) + 1
-
-            if total_chunks > 1:
-                print(f"  Chunk {chunk_num}/{total_chunks}...")
-
-            entities = self._extract_chunk(chunk, extract_entities)
-
-            if self.debug:
-                print(f"  DEBUG: Chunk {chunk_num} returned {len(entities)} entity types")
-
-            for entity_type, records in entities.items():
-                if entity_type not in all_entities:
-                    all_entities[entity_type] = []
-                all_entities[entity_type].extend(records)
-
-        if self.debug:
-            print(f"  DEBUG: Total entities collected: {len(all_entities)}")
-            for k, v in all_entities.items():
-                print(f"    - {k}: {len(v)} records")
-
-        # Renumber IDs
-        for entity_type, records in all_entities.items():
-            for idx, record in enumerate(records, 1):
-                record['id'] = idx
-
-        # Create tables
-        if all_entities:
-            print(f"\nCreated {len(all_entities)} tables:")
-            for entity_type, records in all_entities.items():
-                if records:
-                    table_name = f"{base_name}_{entity_type}"
-                    df = pd.DataFrame(records)
-                    self._store_dataframe_safe(df, table_name)
-                    print(f"  {entity_type}: {len(df)} records")
-            return
-
-        print("Creating simple table")
-        self._store_dataframe(self._parse_text_simple(full_text), base_name)
+        if people:
+            self._store(pd.DataFrame(people), f"{name}_people")
+            print(f"  Extracted {len(people)} people via regex")
+        else:
+            # Absolute fallback
+            self._store(pd.DataFrame({'line': range(1, min(100, len(text.split('\n')))), 'text': text.split('\n')[:100]}), name)
 
-    def…
-        """Extract…
+    def _extract(self, text: str, attempt: int) -> Dict:
+        """Extract with different strategies."""
         if not self.client:
             return {}
 
         try:
-
+            if attempt == 1:
+                # Detailed extraction
+                sys_msg = "You are a JSON extraction expert. Extract ALL entities with unique sequential IDs and proper foreign keys. Return ONLY valid JSON, absolutely no other text."
+                usr_msg = f"""Extract ALL structured entities from this text into a JSON object.
 
-Text:
-{text[:…
+Text (first 15000 chars):
+{text[:15000]}
 
-
+Create separate arrays for these entity types (only if data exists):
+- people: id (int), name (str), email (str), phone (str), address (str), city (str), state (str), zip (str)
+- skills: id (int), person_id (int), skill_name (str), proficiency (str), years (int)
+- technologies: id (int), person_id (int), technology (str), category (str), proficiency (str)
+- projects: id (int), person_id (int), project_name (str), description (str), start_date (str), end_date (str)
+- certifications: id (int), person_id (int), cert_name (str), issuer (str), date_obtained (str)
+- education: id (int), person_id (int), degree (str), institution (str), graduation_year (str)
+- work_experience: id (int), person_id (int), company (str), title (str), start_date (str), end_date (str)
 
-
+CRITICAL RULES:
+1. IDs must be unique sequential integers: 1, 2, 3, 4...
+2. person_id in related tables MUST reference valid people.id values
+3. Extract EVERY person, skill, technology, project you find
+4. Return ONLY the JSON object, no markdown, no explanations
 
+Example output format:
 {{
-  "people": [
-
+"people": [
+{{"id": 1, "name": "Sarah Johnson", "email": "sarah@company.com", "phone": "(212) 555-0147", "city": "New York", "state": "NY"}},
+{{"id": 2, "name": "Michael Chen", "email": "michael@company.com", "phone": "(415) 555-0283", "city": "San Francisco", "state": "CA"}}
+],
+"skills": [
+{{"id": 1, "person_id": 1, "skill_name": "Python", "proficiency": "Expert", "years": 5}},
+{{"id": 2, "person_id": 1, "skill_name": "SQL", "proficiency": "Advanced", "years": 3}},
+{{"id": 3, "person_id": 2, "skill_name": "Product Management", "proficiency": "Expert", "years": 7}}
+]
 }}
 
-ONLY valid JSON…
+Now extract from the text above. Return ONLY valid JSON:"""
+
+            elif attempt == 2:
+                # Simplified extraction
+                sys_msg = "Extract entities as JSON. Return only JSON."
+                usr_msg = f"""Text: {text[:10000]}
+
+Extract people, skills, technologies as JSON:
+{{"people":[{{"id":1,"name":"...","email":"...","city":"..."}}],"skills":[{{"id":1,"person_id":1,"skill_name":"..."}}]}}
+
+Rules: Unique IDs (1,2,3...), person_id links to people.id
+
+JSON only:"""
+
+            else:
+                # Basic extraction
+                sys_msg = "Return JSON only."
+                usr_msg = f"""Text: {text[:8000]}
+
+Find people with names, emails, cities. Return as JSON:
+{{"people":[{{"id":1,"name":"John","email":"john@co.com","city":"NYC"}}]}}
 
+JSON:"""
+
             resp = self.client.chat.completions.create(
                 model="gpt-4o-mini",
                 messages=[
-                    {"role": "system", "content":…
-                    {"role": "user", "content":…
+                    {"role": "system", "content": sys_msg},
+                    {"role": "user", "content": usr_msg}
                 ],
                 temperature=0,
-                max_tokens=…
+                max_tokens=12000
            )
 
-
+            raw = resp.choices[0].message.content.strip()
+
+            # AGGRESSIVE JSON extraction
+            raw = raw.replace("```json", "").replace("```", "").replace("JSON:", "").replace("json", "").strip()
 
- … (3 removed lines not shown)
+            # Find JSON object
+            start = raw.find('{')
+            end = raw.rfind('}') + 1
 
-
+            if start < 0 or end <= start:
+                return {}
 
-
+            json_str = raw[start:end]
 
-
-
+            # Parse
+            result = json.loads(json_str)
 
-
+            # Validate
+            if isinstance(result, dict) and len(result) > 0:
+                # Check if at least one entity type has data
+                has_data = any(isinstance(v, list) and len(v) > 0 for v in result.values())
+                if has_data:
+                    return result
 
-        except json.JSONDecodeError as e:
-            if self.debug:
-                print(f"  DEBUG: JSON parse error: {e}")
-                print(f"  DEBUG: Response was: {json_text[:500]}")
             return {}
+
         except Exception as e:
-
-            print(f"  DEBUG: Extraction error: {e}")
+            print(f"  Attempt {attempt} failed: {e}")
             return {}
 
-    def…
-        """…
+    def _docx(self, path: Path, name: str):
+        """DOCX."""
         if not HAS_DOCX:
-            raise ImportError("…
-
+            raise ImportError("pip install python-docx")
         doc = docx.Document(path)
-
         if doc.tables:
-            for i,…
-                data = [[cell.text.strip() for cell in row.cells] for row in…
+            for i, t in enumerate(doc.tables):
+                data = [[cell.text.strip() for cell in row.cells] for row in t.rows]
                 if data and len(data) > 1:
- … (4 removed lines not shown)
-            text = "\n".join([para.text for para in doc.paragraphs])
-
-        if self.client and len(text) > 0:
-            entities = self._extract_chunk(text, extract_entities)
-            if entities:
-                for entity_type, records in entities.items():
-                    if records:
-                        df = pd.DataFrame(records)
-                        self._store_dataframe_safe(df, f"{base_name}_{entity_type}")
-                return
-
-        self._store_dataframe(self._parse_text_simple(text), base_name)
+                    self._store(pd.DataFrame(data[1:], columns=data[0]), f"{name}_t{i+1}")
+        else:
+            text = "\n".join([p.text for p in doc.paragraphs])
+            self._store(pd.DataFrame({'line': range(len(text.split('\n'))), 'text': text.split('\n')}), name)
 
-    def…
-        """…
-        with open(path, 'r', encoding='utf-8') as…
-            text =…
-
-        if self.client and len(text) > 0:
-            entities = self._extract_chunk(text, extract_entities)
-            if entities:
-                for entity_type, records in entities.items():
-                    if records:
-                        df = pd.DataFrame(records)
-                        self._store_dataframe_safe(df, f"{base_name}_{entity_type}")
-                return
-
-        self._store_dataframe(self._parse_text_simple(text), base_name)
+    def _txt(self, path: Path, name: str):
+        """TXT."""
+        with open(path, 'r', encoding='utf-8') as f:
+            text = f.read()
+        self._store(pd.DataFrame({'line': range(len(text.split('\n'))), 'text': text.split('\n')}), name)
 
-    def…
+    def _store(self, df: pd.DataFrame, name: str):
         """Store."""
+        df.columns = [str(c).strip().replace(" ", "_").replace("-", "_") for c in df.columns]
         try:
-            df.columns = [str(c).strip().replace(" ", "_").replace("-", "_") for c in df.columns]
             df.to_sql(name, self.conn, if_exists='replace', index=False, method='multi', chunksize=500)
-            self.conn.commit()
-            self.current_table = name
-            self._refresh_schema()
         except:
            df.to_sql(name, self.conn, if_exists='replace', index=False)
- … (4 removed lines not shown)
-    def _parse_text_simple(self, text: str) -> pd.DataFrame:
-        """Simple parsing."""
-        lines = [line.strip() for line in text.split('\n') if line.strip()]
-        if not lines:
-            return pd.DataFrame({'content': ['No content']})
-        return pd.DataFrame({'line_number': range(1, len(lines) + 1), 'content': lines})
-
-    def _store_dataframe(self, df: pd.DataFrame, name: str):
-        """Store."""
-        self._store_dataframe_safe(df, name)
-        print(f"Uploaded: {name} ({len(df)} rows)")
+        self.conn.commit()
+        self.current_table = name
+        self._refresh_schema()
+        print(f"  {name}: {len(df)} rows")
 
-    def ask(self,…
+    def ask(self, q: str, viz: Union[bool, str] = False, table: Optional[str] = None) -> 'QueryResult':
         """Query."""
         if not self.client:
-            return QueryResult(False, "", pd.DataFrame(), None, "No API…
-
-        print(f"\nQuestion: {question}")
+            return QueryResult(False, "", pd.DataFrame(), None, "No API")
 
-
-
-            choice = input("Continue? (yes/no): ").strip().lower()
-            if choice not in ['yes', 'y']:
-                return QueryResult(False, "", pd.DataFrame(), None, "Irrelevant")
-
-        tbl = table or self.current_table or (self._get_table_names()[0] if self._get_table_names() else None)
-        if not tbl:
+        t = table or self.current_table or (self._get_tables()[0] if self._get_tables() else None)
+        if not t:
             return QueryResult(False, "", pd.DataFrame(), None, "No table")
 
         if self.use_embeddings and self.embedding_model:
-            cached = self.…
+            cached = self._check_cache(q, t)
             if cached:
-                print("  Cached")
                 return cached
 
         if self.fuzzy_match:
-
+            q = self._fuzzy(q, t)
 
-
-        if self.cache_queries and self.cache and…
-
-            print("  From cache")
+        key = hashlib.md5(f"{q}:{t}".encode()).hexdigest()
+        if self.cache_queries and self.cache and key in self.cache:
+            sql = self.cache[key]
         else:
-
-            if self.cache_queries and self.cache…
-                self.cache[…
+            sql = self._gen_sql(q, t)
+            if self.cache_queries and self.cache:
+                self.cache[key] = sql
 
-        print(f"SQL: {…
+        print(f"SQL: {sql}")
 
         try:
-            df = pd.read_sql_query(…
+            df = pd.read_sql_query(sql, self.conn)
             print(f"Success! {len(df)} rows")
-
-
-            if viz:
-                viz_type = viz if isinstance(viz, str) else "auto"
-                fig = self._visualize(df, question, viz_type)
-
-            result = QueryResult(True, sql_query, df, fig)
+            fig = self._viz(df, q, viz if isinstance(viz, str) else "auto") if viz else None
+            r = QueryResult(True, sql, df, fig)
 
             if self.use_embeddings and self.embedding_model:
-                self.…
+                self._store_cache(q, t, r)
 
-            return…
+            return r
         except Exception as e:
-
-            return QueryResult(False, sql_query, pd.DataFrame(), None, str(e))
-
-    def _is_relevant_query(self, question: str) -> bool:
-        """Check relevance."""
-        if not self.client:
-            return True
-        try:
-            tables = self._get_table_names()[:3]
-            resp = self.client.chat.completions.create(
-                model="gpt-4o-mini",
-                messages=[
-                    {"role": "system", "content": "Return 'yes' or 'no'."},
-                    {"role": "user", "content": f"Relevant to DB with tables {', '.join(tables)}?\n\nQ: {question}\n\nyes/no:"}
-                ],
-                temperature=0,
-                max_tokens=5
-            )
-            return 'yes' in resp.choices[0].message.content.lower()
-        except:
-            return True
+            return QueryResult(False, sql, pd.DataFrame(), None, str(e))
 
-    def…
-        """Fuzzy."""
-        if not self.schema_info.get(table):
-            return question
-
+    def _fuzzy(self, q: str, t: str) -> str:
+        """Fuzzy match."""
         try:
-
-            if not…
-                return…
- … (10 removed lines not shown)
-                        print(f"  Fuzzy: '{word}' -> '{matches[0]}'")
-            question = " ".join(words)
-            return question
+            cols = [c for c, d in self.schema_info.get(t, {}).items() if 'TEXT' in d]
+            if not cols:
+                return q
+            for col in cols[:2]:
+                df = pd.read_sql_query(f"SELECT DISTINCT {col} FROM {t} LIMIT 100", self.conn)
+                vals = [str(v) for v in df[col].dropna()]
+                words = q.split()
+                for i, w in enumerate(words):
+                    m = get_close_matches(w, vals, n=1, cutoff=0.6)
+                    if m and w != m[0]:
+                        words[i] = m[0]
+                q = " ".join(words)
+            return q
         except:
-            return…
+            return q
 
-    def…
+    def _check_cache(self, q: str, t: str) -> Optional['QueryResult']:
         """Check cache."""
         if not self.query_embeddings:
             return None
- … (4 removed lines not shown)
-        for cached_q, data in self.query_embeddings.items():
-            if data['table'] != table:
+        emb = self.embedding_model.encode([q])[0]
+        best, sim = None, 0.85
+        for cq, d in self.query_embeddings.items():
+            if d['table'] != t:
                 continue
-
-            if…
- … (3 removed lines not shown)
-        if best_match:
-            print(f"  Similar ({best_sim:.0%})")
-            return self.query_embeddings[best_match]['result']
-        return None
+            s = np.dot(emb, d['embedding']) / (np.linalg.norm(emb) * np.linalg.norm(d['embedding']))
+            if s > sim:
+                sim, best = s, cq
+        return self.query_embeddings[best]['result'] if best else None
 
-    def…
-        """Store."""
-
-        self.query_embeddings[…
+    def _store_cache(self, q: str, t: str, r: 'QueryResult'):
+        """Store cache."""
+        emb = self.embedding_model.encode([q])[0]
+        self.query_embeddings[q] = {'table': t, 'embedding': emb, 'result': r}
 
-    def…
+    def _viz(self, df: pd.DataFrame, title: str, vt: str):
         """Viz."""
-        if not HAS_PLOTLY…
+        if not HAS_PLOTLY:
             return None
-        print(f"Creating {viz_type} chart...")
-        return self._plotly_viz(df, title, viz_type) if HAS_PLOTLY else self._matplotlib_viz(df, title, viz_type)
-
-    def _plotly_viz(self, df: pd.DataFrame, title: str, viz_type: str):
-        """Plotly."""
         try:
- … (10 removed lines not shown)
-                fig = px.line(df, y=num[0], title=title)
-            elif viz_type == "scatter" and len(num) >= 2:
-                fig = px.scatter(df, x=num[0], y=num[1], title=title)
-            elif viz_type == "heatmap" and len(num) >= 2:
-                corr = df[num].corr()
-                fig = go.Figure(data=go.Heatmap(z=corr.values, x=corr.columns, y=corr.columns))
-                fig.update_layout(title=title)
+            n = df.select_dtypes(include=[np.number]).columns.tolist()
+            c = df.select_dtypes(include=['object']).columns.tolist()
+            if vt == "pie" and c and n:
+                fig = px.pie(df, names=c[0], values=n[0], title=title)
+            elif vt == "bar" and c and n:
+                fig = px.bar(df, x=c[0], y=n[0], title=title)
+            elif vt == "line" and n:
+                fig = px.line(df, y=n[0], title=title)
+            elif vt == "scatter" and len(n) >= 2:
+                fig = px.scatter(df, x=n[0], y=n[1], title=title)
             else:
-
-                fig = px.pie(df, names=cat[0], values=num[0], title=title) if len(df) <= 10 else px.bar(df, x=cat[0], y=num[0], title=title)
-            else:
-                fig = px.bar(df, y=df.columns[0], title=title)
+                fig = px.bar(df, y=df.columns[0], title=title)
             fig.show()
             return fig
         except:
             return None
 
-    def…
-        """…
-
-            plt.figure(figsize=(10, 6))
-            num = df.select_dtypes(include=[np.number]).columns
-            if viz_type == "pie":
-                df[df.columns[0]].value_counts().plot(kind='pie')
-            elif viz_type == "line" and len(num) > 0:
-                df[num[0]].plot(kind='line')
-            else:
-                (df[num[0]] if len(num) > 0 else df.iloc[:, 0].value_counts()).plot(kind='bar')
-            plt.title(title)
-            plt.tight_layout()
-            plt.show()
-            return plt.gcf()
-        except:
-            return None
-
-    def tables(self) -> Dict[str, dict]:
-        """List."""
+    def tables(self) -> Dict:
+        """List tables."""
+        t = self._get_tables()
         print("\n" + "="*70)
         print("TABLES")
         print("="*70)
-
-        all_tables = self._get_table_names()
-        if not all_tables:
+        if not t:
             print("No tables")
             return {}
- … (6 removed lines not shown)
-            result[tbl] = {'rows': cnt, 'columns': cols}
-
+        r = {}
+        for i, tb in enumerate(t, 1):
+            cnt = pd.read_sql_query(f"SELECT COUNT(*) FROM {tb}", self.conn).iloc[0, 0]
+            cols = list(self.schema_info.get(tb, {}).keys())
+            print(f"  {i}. {tb}: {cnt} rows, {len(cols)} cols")
+            r[tb] = {'rows': cnt, 'columns': cols}
         print("="*70)
-        return…
+        return r
 
-    def schema(self, table: Optional[str] = None) ->…
+    def schema(self, table: Optional[str] = None) -> Dict:
         """Schema."""
         if not self.schema_info:
             self._refresh_schema()
-
         print("\n" + "="*70)
         print("SCHEMA")
         print("="*70)
- … (8 removed lines not shown)
-            result[tbl] = {'records': cnt, 'columns': self.schema_info[tbl]}
-
+        r = {}
+        for t in ([table] if table else self.schema_info.keys()):
+            if t in self.schema_info:
+                cnt = pd.read_sql_query(f"SELECT COUNT(*) FROM {t}", self.conn).iloc[0, 0]
+                print(f"\n{t}: {cnt} records")
+                for c, d in self.schema_info[t].items():
+                    print(f"  - {c:<30} {d}")
+                r[t] = {'records': cnt, 'columns': self.schema_info[t]}
         print("="*70)
-        return…
+        return r
 
     def peek(self, table: Optional[str] = None, n: int = 5) -> pd.DataFrame:
         """Preview."""
-
-        if not…
+        t = table or self.current_table
+        if not t:
             return pd.DataFrame()
-        df = pd.read_sql_query(f"SELECT * FROM {…
-        print(f"\nSample from '{…
+        df = pd.read_sql_query(f"SELECT * FROM {t} LIMIT {n}", self.conn)
+        print(f"\nSample from '{t}':")
         print(df.to_string(index=False))
         return df
 
-    def info(self):
-        """Info."""
-        return self.tables()
-
     def sql(self, query: str, viz: Union[bool, str] = False) -> 'QueryResult':
         """SQL."""
         try:
             df = pd.read_sql_query(query, self.conn)
             print(f"Success! {len(df)} rows")
-            fig = self.…
+            fig = self._viz(df, "Result", viz if isinstance(viz, str) else "auto") if viz else None
             return QueryResult(True, query, df, fig)
         except Exception as e:
-            print(f"Error: {e}")
             return QueryResult(False, query, pd.DataFrame(), None, str(e))
 
-    def…
-        """…
- … (3 removed lines not shown)
+    def save_to_mysql(self, host: str, user: str, password: str, database: str, port: int = 3306):
+        """MySQL export."""
+        try:
+            from sqlalchemy import create_engine
+            import mysql.connector
+        except:
+            raise ImportError("pip install QuerySUTRA[mysql]")
+
+        print(f"Exporting to MySQL: {database}")
+
+        try:
+            tc = mysql.connector.connect(host=host, user=user, password=password, port=port)
+            tc.cursor().execute(f"CREATE DATABASE IF NOT EXISTS `{database}`")
+            tc.close()
+        except:
+            pass
+
+        engine = create_engine(f"mysql+mysqlconnector://{user}:{password}@{host}:{port}/{database}")
+        for t in self._get_tables():
+            df = pd.read_sql_query(f"SELECT * FROM {t}", self.conn)
+            df.to_sql(t, engine, if_exists='replace', index=False)
+            print(f"  {t}: {len(df)} rows")
+        print("Done!")
+        return self
 
     def export_db(self, path: str, format: str = "sqlite"):
         """Export."""
         if format == "sqlite":
             shutil.copy2(self.db_path, path)
-        elif format == "sql":
-            with open(path, 'w', encoding='utf-8') as f:
-                for line in self.conn.iterdump():
-                    f.write(f'{line}\n')
         elif format == "json":
-            data = {t: pd.read_sql_query(f"SELECT * FROM {t}", self.conn).to_dict('records') for t in self.…
-            with open(path, 'w'…
+            data = {t: pd.read_sql_query(f"SELECT * FROM {t}", self.conn).to_dict('records') for t in self._get_tables()}
+            with open(path, 'w') as f:
                 json.dump(data, f, indent=2, default=str)
-        elif format == "excel":
-            with pd.ExcelWriter(path, engine='openpyxl') as writer:
-                for t in self._get_table_names():
-                    pd.read_sql_query(f"SELECT * FROM {t}", self.conn).to_excel(writer, sheet_name=t[:31], index=False)
-        else:
-            raise ValueError(f"Unsupported: {format}")
         print(f"Saved: {path}")
         return self
 
-
-
-        """…
+    @classmethod
+    def load_from_db(cls, db_path: str, api_key: Optional[str] = None, **kwargs):
+        """Load database."""
+        if not Path(db_path).exists():
+            raise FileNotFoundError(f"Not found: {db_path}")
+        return cls(api_key=api_key, db=db_path, **kwargs)
+
+    @classmethod
+    def connect_mysql(cls, host: str, user: str, password: str, database: str, port: int = 3306, api_key: Optional[str] = None, **kwargs):
+        """Connect MySQL."""
         try:
             from sqlalchemy import create_engine
             import mysql.connector
-        except…
-            raise ImportError("…
-
-        print(f"Exporting to MySQL: {host}/{database}")
+        except:
+            raise ImportError("pip install QuerySUTRA[mysql]")
 
- … (6 removed lines not shown)
-            temp_conn.close()
-            print(f"  Database '{database}' ready")
-        except Exception as e:
-            print(f"  Warning: {e}")
+        try:
+            tc = mysql.connector.connect(host=host, user=user, password=password, port=port)
+            tc.cursor().execute(f"CREATE DATABASE IF NOT EXISTS {database}")
+            tc.close()
+        except:
+            pass
 
         engine = create_engine(f"mysql+mysqlconnector://{user}:{password}@{host}:{port}/{database}")
+        temp_db = f"mysql_{database}.db"
+        instance = cls(api_key=api_key, db=temp_db, **kwargs)
 
- … (3 removed lines not shown)
-            print(f"  {t}: {len(df)} rows")
+        tables = pd.read_sql_query("SHOW TABLES", engine).iloc[:, 0].tolist()
+        for t in tables:
+            pd.read_sql_query(f"SELECT * FROM {t}", engine).to_sql(t, instance.conn, if_exists='replace', index=False)
 
- … (3 removed lines not shown)
-    def save_to_postgres(self, host: str, user: str, password: str, database: str, port: int = 5432, tables: Optional[List[str]] = None):
-        """PostgreSQL."""
-        try:
-            from sqlalchemy import create_engine
-            engine = create_engine(f"postgresql://{user}:{password}@{host}:{port}/{database}")
-            print(f"Exporting to PostgreSQL...")
-            for t in (tables or self._get_table_names()):
-                df = pd.read_sql_query(f"SELECT * FROM {t}", self.conn)
-                df.to_sql(t, engine, if_exists='replace', index=False)
-                print(f"  {t}: {len(df)} rows")
-            print("Complete!")
-            return self
-        except ImportError:
-            raise ImportError("Run: pip install QuerySUTRA[postgres]")
-
-    def backup(self, path: str = None):
-        """Backup."""
-        dir = Path(path) if path else Path(".")
-        dir.mkdir(parents=True, exist_ok=True)
-        ts = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
-        self.export_db(str(dir / f"sutra_{ts}.db"), "sqlite")
-        self.export_db(str(dir / f"sutra_{ts}.json"), "json")
-        print("Backup complete!")
-        return self
-
-    def export(self, data: pd.DataFrame, path: str, format: str = "csv"):
-        """Export."""
-        if format == "csv":
-            data.to_csv(path, index=False)
-        elif format in ["excel", "xlsx"]:
-            data.to_excel(path, index=False)
-        elif format == "json":
-            data.to_json(path, orient="records", indent=2)
-        print(f"Exported: {path}")
-        return self
+        instance._refresh_schema()
+        print(f"Connected! {len(tables)} tables")
+        return instance
 
-    def…
-        """…
-
-
+    def _gen_sql(self, q: str, t: str) -> str:
+        """Generate SQL."""
+        schema = self.schema_info.get(t, {})
+        sample = pd.read_sql_query(f"SELECT * FROM {t} LIMIT 3", self.conn).to_string(index=False)
+        cols = ", ".join([f"{c} ({d})" for c, d in schema.items()])
+
+        r = self.client.chat.completions.create(
+            model="gpt-4o-mini",
+            messages=[
+                {"role": "system", "content": "SQL expert. Return only SQL."},
+                {"role": "user", "content": f"Table: {t}\nColumns: {cols}\nSample:\n{sample}\n\nQ: {q}\n\nSQL:"}
+            ],
+            temperature=0
+        )
+        return r.choices[0].message.content.strip().replace("```sql", "").replace("```", "").strip()
 
-    def…
+    def _get_tables(self) -> List[str]:
         """Tables."""
         self.cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
         return [r[0] for r in self.cursor.fetchall()]
@@ -769,25 +563,13 @@ ONLY valid JSON. No explanations."""
     def _refresh_schema(self):
         """Refresh."""
         self.schema_info = {}
-        for…
-            self.cursor.execute(f"PRAGMA table_info({…
-            self.schema_info[…
+        for t in self._get_tables():
+            self.cursor.execute(f"PRAGMA table_info({t})")
+            self.schema_info[t] = {r[1]: r[2] for r in self.cursor.fetchall()}
 
-    def…
-
-
-        sample = pd.read_sql_query(f"SELECT * FROM {table} LIMIT 3", self.conn).to_string(index=False)
-        schema_str = ", ".join([f"{col} ({dtype})" for col, dtype in schema.items()])
-
-        resp = self.client.chat.completions.create(
-            model="gpt-4o-mini",
-            messages=[
-                {"role": "system", "content": "SQL expert. Return only SQL."},
-                {"role": "user", "content": f"Table: {table}\nColumns: {schema_str}\nSample:\n{sample}\n\nQ: {question}\n\nSQL:"}
-            ],
-            temperature=0
-        )
-        return resp.choices[0].message.content.strip().replace("```sql", "").replace("```", "").strip()
+    def close(self):
+        if self.conn:
+            self.conn.close()
 
     def __enter__(self):
         return self
@@ -810,10 +592,3 @@ class QueryResult:
     def show(self):
         print(self.data if self.success else f"Error: {self.error}")
         return self
-
-
-def quick_start(api_key: str, data_path: str, question: str, viz: Union[bool, str] = False):
-    """Quick."""
-    with SUTRA(api_key=api_key) as sutra:
-        sutra.upload(data_path)
-        return sutra.ask(question, viz=viz)
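Net effect of the sutra.py rewrite: one flat import line, bare `except:` clauses throughout, the matplotlib, PostgreSQL, backup, and relevance-check paths dropped, and a three-attempt `_extract()` with a regex fallback behind `_pdf()`. An illustrative end-to-end sketch against the 0.5.1 API as it appears in this diff (the file name, question, and key are placeholders; `ask()` needs a real OpenAI key):

from sutra import SUTRA

s = SUTRA(api_key="sk-...", db="demo.db")    # placeholder key
s.upload("employees.csv")                    # CSV dispatches straight to _store()
s.tables()                                   # prints per-table row/column counts
r = s.ask("how many employees per city", viz="bar")
r.show()                                     # QueryResult prints the data or the error
s.export_db("demo.json", format="json")      # 0.5.1 keeps only sqlite/json here
s.close()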
{querysutra-0.4.6.dist-info → querysutra-0.5.1.dist-info}/WHEEL
File without changes
{querysutra-0.4.6.dist-info → querysutra-0.5.1.dist-info}/licenses/LICENSE
File without changes
{querysutra-0.4.6.dist-info → querysutra-0.5.1.dist-info}/top_level.txt
File without changes