QuerySUTRA 0.5.0-py3-none-any.whl → 0.5.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
querysutra-0.5.0.dist-info/METADATA → querysutra-0.5.2.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: QuerySUTRA
- Version: 0.5.0
+ Version: 0.5.2
  Summary: SUTRA
  Author: Aditya Batta
  License: MIT
querysutra-0.5.0.dist-info/RECORD → querysutra-0.5.2.dist-info/RECORD CHANGED
@@ -1,5 +1,5 @@
- querysutra-0.5.0.dist-info/licenses/LICENSE,sha256=F-4b93u0OVrVwGXgMwBRq6MlGyUT9zmre1oh5Gft5Ts,1066
- sutra/__init__.py,sha256=hcwaHWzFyS3XX-B9qn2mwdGL9kPh7aJE91e4OCflAiM,144
+ querysutra-0.5.2.dist-info/licenses/LICENSE,sha256=F-4b93u0OVrVwGXgMwBRq6MlGyUT9zmre1oh5Gft5Ts,1066
+ sutra/__init__.py,sha256=25HUMETpmA1tlMl5j-ajdo9MRXljSZBrirSTH7w7jIc,118
  sutra/cache_manager.py,sha256=e0AAeUqoR-aiqzZ3fB-IDvpJ4JA6-YBFyRJxusEnIrA,3082
  sutra/clear_cache.py,sha256=rVIz29p7V11Uh6oHXeaWpFtYXXv-2OED91cHMAWWxtQ,187
  sutra/core.py,sha256=R_JbOlZTukegP92Dr-WLsdr632_otFN7o9qSvcxyBtw,10497
@@ -11,7 +11,7 @@ sutra/feedback_matcher.py,sha256=WXYpGtFJnOyYQOzy-z8uBiUWH5vyJJOMS1NwEYzNfic,286
  sutra/nlp_processor.py,sha256=wMS1hz1aGWjSwPUD7lSNBbQapFtLgF2l65j0QKXQOd0,5461
  sutra/schema_embeddings.py,sha256=bVPzpJOdYTyUdG2k3ZdgYJLrX2opHBx68RIjJcMlueo,9732
  sutra/schema_generator.py,sha256=BX_vXmnvSGc6nCBx40WLSoNL3WIYPDahd1cEYloyY4M,1925
- sutra/sutra.py,sha256=etDxiGYwCj8t6sdppYk2MsFmZlX9d2JiJv1na1GYF4Y,32320
+ sutra/sutra.py,sha256=XgNCY8QPOod0-ymt6R50JMaHJetyfTsElzyvNHpYStw,20664
  sutra/sutra_client.py,sha256=PYYDGqVbA9pB-Zcsm52i9KarwijCIGVZOThgONZP6Vs,14203
  sutra/sutra_core.py,sha256=diaWOXUHn1wrqCQrBhLKL612tMQioaqx-ILc3y9-CqM,11708
  sutra/sutra_simple.py,sha256=rnqzG7OAt4p64XtO0peMqHS1pG5tdA8U3EYTMVsq7BE,23201
@@ -22,7 +22,7 @@ tests/test_sutra.py,sha256=6Z4SoIuBzza101304I7plkyPVkUBbjIxR8uPs9z5ntg,2383
  utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  utils/file_utils.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  utils/text_utils.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- querysutra-0.5.0.dist-info/METADATA,sha256=r6qSHEzh-U5U8lEp4WPllsK_sSH-sLLAtRXX2EmPG9s,7258
- querysutra-0.5.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- querysutra-0.5.0.dist-info/top_level.txt,sha256=9v0buw21eo5LaUU_3Cf9b9MqRyEvtM9cHaOuEXUKVqM,18
- querysutra-0.5.0.dist-info/RECORD,,
+ querysutra-0.5.2.dist-info/METADATA,sha256=8brpcR8UxQwuz28hi8oUL8F5Dfug5AcFk_SdReJlWd0,7258
+ querysutra-0.5.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ querysutra-0.5.2.dist-info/top_level.txt,sha256=9v0buw21eo5LaUU_3Cf9b9MqRyEvtM9cHaOuEXUKVqM,18
+ querysutra-0.5.2.dist-info/RECORD,,
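
Each RECORD line has the form path,sha256=<digest>,<size>, where the digest is the unpadded urlsafe-base64 SHA-256 of the file (the standard wheel RECORD format). A minimal sketch for recomputing one entry against an unpacked wheel; the local directory name is illustrative:

    import base64
    import hashlib
    from pathlib import Path

    def record_hash(path: str) -> str:
        # urlsafe base64 of the SHA-256 digest, trailing '=' padding stripped
        digest = hashlib.sha256(Path(path).read_bytes()).digest()
        return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")

    # e.g. check sutra/sutra.py from the extracted 0.5.2 wheel; expected per
    # the RECORD above: sha256=XgNCY8QPOod0-ymt6R50JMaHJetyfTsElzyvNHpYStw
    print(record_hash("unpacked_wheel/sutra/sutra.py"))

The size column also shows sutra/sutra.py shrinking from 32320 to 20664 bytes, consistent with the large deletions in the diff below.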
sutra/__init__.py CHANGED
@@ -1,4 +1,4 @@
- """QuerySUTRA v0.5.0"""
- __version__="0.5.0"
- from sutra.sutra import SUTRA,QueryResult,quick_start
- __all__=["SUTRA","QueryResult","quick_start"]
+ """QuerySUTRA v0.5.2"""
+ __version__="0.5.2"
+ from sutra.sutra import SUTRA,QueryResult
+ __all__=["SUTRA","QueryResult"]
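
With this release the package's public surface narrows to SUTRA and QueryResult; the quick_start helper is no longer exported, a breaking change for any caller importing it. A minimal sketch of the remaining API, assuming an OpenAI key and a local CSV; the key, file name, and question are illustrative:

    from sutra import SUTRA, QueryResult  # quick_start is gone as of 0.5.2

    s = SUTRA(api_key="sk-...")        # placeholder key
    s.upload("employees.csv")          # stored as table "employees"
    result: QueryResult = s.ask("How many employees per city?")
    result.show()
    s.close()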
sutra/sutra.py CHANGED
@@ -1,94 +1,62 @@
- """
- QuerySUTRA v0.4.5 - FIXED AI EXTRACTION
- Debug mode to see why extraction fails
- """
-
- __version__ = "0.4.5"
+ """QuerySUTRA v0.5.1 - BULLETPROOF & FIXED"""
+ __version__ = "0.5.1"
  __author__ = "Aditya Batta"
- __all__ = ["SUTRA", "QueryResult", "quick_start"]
+ __all__ = ["SUTRA", "QueryResult"]

- import os
- import sqlite3
- import pandas as pd
- import numpy as np
+ import os, sqlite3, pandas as pd, numpy as np, json, hashlib, shutil, datetime, re
  from typing import Optional, Union, Dict, List
  from pathlib import Path
- import json
- import hashlib
- import warnings
- import shutil
- import datetime
- from io import StringIO
  from difflib import get_close_matches
- warnings.filterwarnings('ignore')

  try:
      from openai import OpenAI
      HAS_OPENAI = True
- except ImportError:
+ except:
      HAS_OPENAI = False

  try:
      import plotly.express as px
      import plotly.graph_objects as go
      HAS_PLOTLY = True
- except ImportError:
+ except:
      HAS_PLOTLY = False

- try:
-     import matplotlib.pyplot as plt
-     HAS_MATPLOTLIB = True
- except ImportError:
-     HAS_MATPLOTLIB = False
-
  try:
      import PyPDF2
      HAS_PYPDF2 = True
- except ImportError:
+ except:
      HAS_PYPDF2 = False

  try:
      import docx
      HAS_DOCX = True
- except ImportError:
+ except:
      HAS_DOCX = False

  try:
      from sentence_transformers import SentenceTransformer
      HAS_EMBEDDINGS = True
- except ImportError:
+ except:
      HAS_EMBEDDINGS = False


  class SUTRA:
-     """SUTRA: Structured-Unstructured-Text-Retrieval-Architecture"""
+     """SUTRA - BULLETPROOF"""

      def __init__(self, api_key: Optional[str] = None, db: str = "sutra.db",
-                  use_embeddings: bool = False, check_relevance: bool = False,
-                  fuzzy_match: bool = True, cache_queries: bool = True, debug: bool = False):
-         """Initialize."""
-         print("Initializing QuerySUTRA v0.4.5")
+                  use_embeddings: bool = False, fuzzy_match: bool = True,
+                  cache_queries: bool = True, check_relevance: bool = False):

          if api_key:
              os.environ["OPENAI_API_KEY"] = api_key

          self.api_key = os.getenv("OPENAI_API_KEY")
          self.client = OpenAI(api_key=self.api_key) if self.api_key and HAS_OPENAI else None
-
          self.db_path = db
-         self.debug = debug
-
-         try:
-             self.conn = sqlite3.connect(db, timeout=30, check_same_thread=False)
-             self.conn.execute("PRAGMA journal_mode=WAL")
-             self.conn.execute("PRAGMA synchronous=NORMAL")
-         except:
-             self.conn = sqlite3.connect(db, check_same_thread=False)
-
+         self.conn = sqlite3.connect(db, timeout=30, check_same_thread=False)
          self.cursor = self.conn.cursor()
          self.current_table = None
          self.schema_info = {}
-
          self.cache_queries = cache_queries
          self.cache = {} if cache_queries else None
          self.use_embeddings = use_embeddings
@@ -101,667 +69,459 @@ class SUTRA:
          try:
              self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
          except:
-             self.use_embeddings = False
+             pass

          self._refresh_schema()
-         print(f"Ready! Database: {db}")
+         print(f"QuerySUTRA v0.5.1 Ready")

-     @classmethod
-     def load_from_db(cls, db_path: str, api_key: Optional[str] = None, **kwargs):
-         """Load existing database."""
-         if not Path(db_path).exists():
-             raise FileNotFoundError(f"Not found: {db_path}")
-         return cls(api_key=api_key, db=db_path, **kwargs)
-
-     @classmethod
-     def connect_mysql(cls, host: str, user: str, password: str, database: str,
-                       port: int = 3306, api_key: Optional[str] = None, **kwargs):
-         """Connect to MySQL."""
-         try:
-             from sqlalchemy import create_engine
-             import mysql.connector
-         except ImportError:
-             raise ImportError("Run: pip install QuerySUTRA[mysql]")
-
-         print(f"Connecting to MySQL...")
-
-         try:
-             temp_conn = mysql.connector.connect(host=host, user=user, password=password, port=port)
-             temp_cursor = temp_conn.cursor()
-             temp_cursor.execute(f"CREATE DATABASE IF NOT EXISTS {database}")
-             temp_cursor.close()
-             temp_conn.close()
-         except:
-             pass
-
-         engine = create_engine(f"mysql+mysqlconnector://{user}:{password}@{host}:{port}/{database}")
-         temp_db = f"sutra_mysql_{database}.db"
-         instance = cls(api_key=api_key, db=temp_db, **kwargs)
-
-         tables = pd.read_sql_query("SHOW TABLES", engine).iloc[:, 0].tolist()
-
-         for table in tables:
-             df = pd.read_sql_query(f"SELECT * FROM {table}", engine)
-             df.to_sql(table, instance.conn, if_exists='replace', index=False)
-
-         instance._refresh_schema()
-         print(f"Connected! {len(tables)} tables")
-         return instance
-
-     @classmethod
-     def connect_postgres(cls, host: str, user: str, password: str, database: str,
-                          port: int = 5432, api_key: Optional[str] = None, **kwargs):
-         """Connect to PostgreSQL."""
-         try:
-             from sqlalchemy import create_engine
-         except ImportError:
-             raise ImportError("Run: pip install QuerySUTRA[postgres]")
-
-         print(f"Connecting to PostgreSQL...")
-
-         engine = create_engine(f"postgresql://{user}:{password}@{host}:{port}/{database}")
-         temp_db = f"sutra_postgres_{database}.db"
-         instance = cls(api_key=api_key, db=temp_db, **kwargs)
-
-         tables = pd.read_sql_query("SELECT tablename FROM pg_tables WHERE schemaname='public'", engine)['tablename'].tolist()
-
-         for table in tables:
-             df = pd.read_sql_query(f"SELECT * FROM {table}", engine)
-             df.to_sql(table, instance.conn, if_exists='replace', index=False)
-
-         instance._refresh_schema()
-         print(f"Connected! {len(tables)} tables")
-         return instance
-
-     def upload(self, data: Union[str, pd.DataFrame], name: Optional[str] = None,
-                extract_entities: Optional[List[str]] = None,
-                auto_export_mysql: Optional[Dict[str, str]] = None) -> 'SUTRA':
-         """Upload data."""
-         print("\nUploading...")
-
+     def upload(self, data: Union[str, pd.DataFrame], name: Optional[str] = None) -> 'SUTRA':
+         """Upload."""
          if isinstance(data, pd.DataFrame):
-             name = name or "data"
-             self._store_dataframe(data, name)
+             self._store(data, name or "data")
+             return self
+
+         path = Path(data)
+         if not path.exists():
+             raise FileNotFoundError(f"Not found: {data}")
+
+         name = name or path.stem.replace(" ", "_").replace("-", "_")
+         ext = path.suffix.lower()
+
+         if ext == ".csv":
+             self._store(pd.read_csv(path), name)
+         elif ext in [".xlsx", ".xls"]:
+             self._store(pd.read_excel(path), name)
+         elif ext == ".json":
+             self._store(pd.read_json(path), name)
+         elif ext == ".pdf":
+             self._pdf(path, name)
+         elif ext == ".docx":
+             self._docx(path, name)
+         elif ext == ".txt":
+             self._txt(path, name)
          else:
-             path = Path(data)
-             if not path.exists():
-                 raise FileNotFoundError(f"Not found: {data}")
-
-             name = name or path.stem.replace(" ", "_").replace("-", "_")
-             ext = path.suffix.lower()
-
-             print(f"File: {path.name}")
-
-             if ext == ".csv":
-                 self._store_dataframe(pd.read_csv(path), name)
-             elif ext in [".xlsx", ".xls"]:
-                 self._store_dataframe(pd.read_excel(path), name)
-             elif ext == ".json":
-                 self._store_dataframe(pd.read_json(path), name)
-             elif ext == ".sql":
-                 with open(path) as f:
-                     self.cursor.executescript(f.read())
-                 self.conn.commit()
-                 self._refresh_schema()
-             elif ext == ".pdf":
-                 self._smart_upload_pdf(path, name, extract_entities)
-             elif ext == ".docx":
-                 self._smart_upload_docx(path, name, extract_entities)
-             elif ext == ".txt":
-                 self._smart_upload_txt(path, name, extract_entities)
-             else:
-                 raise ValueError(f"Unsupported: {ext}")
-
-         if auto_export_mysql:
-             print("\nAuto-exporting to MySQL...")
-             self.save_to_mysql(
-                 host=auto_export_mysql.get('host', 'localhost'),
-                 user=auto_export_mysql.get('user', 'root'),
-                 password=auto_export_mysql['password'],
-                 database=auto_export_mysql['database'],
-                 port=auto_export_mysql.get('port', 3306)
-             )
+             raise ValueError(f"Unsupported: {ext}")

          return self

-     def _smart_upload_pdf(self, path: Path, base_name: str, extract_entities: Optional[List[str]] = None):
-         """Parse PDF."""
+     def _pdf(self, path: Path, name: str):
+         """BULLETPROOF PDF - ALWAYS creates multiple tables."""
          if not HAS_PYPDF2:
-             raise ImportError("Run: pip install PyPDF2")
+             raise ImportError("pip install PyPDF2")

-         print("Extracting PDF...")
+         print(f"Extracting PDF: {path.name}")

-         with open(path, 'rb') as file:
-             pdf_reader = PyPDF2.PdfReader(file)
-             full_text = ""
-             for page_num, page in enumerate(pdf_reader.pages, 1):
-                 full_text += page.extract_text() + "\n"
-                 print(f" Page {page_num}/{len(pdf_reader.pages)}")
+         with open(path, 'rb') as f:
+             text = "".join([p.extract_text() + "\n" for p in PyPDF2.PdfReader(f).pages])

-         if self.client:
-             print("AI: Extracting entities...")
-
-             chunk_size = 10000
-             all_entities = {}
-
-             for i in range(0, len(full_text), chunk_size):
-                 chunk = full_text[i:i+chunk_size]
-                 chunk_num = (i // chunk_size) + 1
-                 total_chunks = (len(full_text) // chunk_size) + 1
-
-                 if total_chunks > 1:
-                     print(f" Chunk {chunk_num}/{total_chunks}...")
-
-                 entities = self._extract_chunk(chunk, extract_entities)
-
-                 if self.debug:
-                     print(f" DEBUG: Chunk {chunk_num} returned {len(entities)} entity types")
-
-                 for entity_type, records in entities.items():
-                     if entity_type not in all_entities:
-                         all_entities[entity_type] = []
-                     all_entities[entity_type].extend(records)
-
-             if self.debug:
-                 print(f" DEBUG: Total entities collected: {len(all_entities)}")
-                 for k, v in all_entities.items():
-                     print(f" - {k}: {len(v)} records")
-
-             # Renumber IDs
-             for entity_type, records in all_entities.items():
-                 for idx, record in enumerate(records, 1):
-                     record['id'] = idx
-
-             # Create tables
-             if all_entities:
-                 print(f"\nCreated {len(all_entities)} tables:")
-                 for entity_type, records in all_entities.items():
-                     if records:
-                         table_name = f"{base_name}_{entity_type}"
-                         df = pd.DataFrame(records)
-                         self._store_dataframe_safe(df, table_name)
-                         print(f" {entity_type}: {len(df)} records")
-                 return
-
-         print("Creating simple table")
-         self._store_dataframe(self._parse_text_simple(full_text), base_name)
+         if not self.client:
+             print("ERROR: No API key! Set api_key parameter")
+             return
+
+         print("AI: Extracting...")
+
+         # TRY 3 TIMES
+         entities = None
+         for attempt in [1, 2, 3]:
+             entities = self._extract(text, attempt)
+             if entities and len(entities) > 0:
+                 break
+             if attempt < 3:
+                 print(f" Retry {attempt+1}/3...")
+
+         # Create tables from entities
+         if entities and len(entities) > 0:
+             print(f"Extracted {len(entities)} entity types:")
+             for etype, recs in entities.items():
+                 if recs and len(recs) > 0:
+                     for idx, rec in enumerate(recs, 1):
+                         rec['id'] = idx
+                     self._store(pd.DataFrame(recs), f"{name}_{etype}")
+                     print(f" {etype}: {len(recs)} rows")
+             return
+
+         # REGEX FALLBACK - FIXED
+         print("Using regex fallback...")
+         people = []
+         emails = re.findall(r'[\w\.-]+@[\w\.-]+\.\w+', text)
+
+         # Extract names from common patterns
+         name_patterns = [
+             r'(?:Employee|Name|Mr\.|Mrs\.|Ms\.|Dr\.)\s*[:\-]?\s*([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)',
+             r'([A-Z][a-z]+\s+[A-Z][a-z]+)\s+(?:lives|resides|works|is based)',
+             r'\*\*([A-Z][a-z]+\s+[A-Z][a-z]+)\*\*'
+         ]
+
+         names = []
+         for pattern in name_patterns:
+             names.extend(re.findall(pattern, text))
+             if len(names) >= len(emails):
+                 break
+
+         # Match emails to names
+         max_people = min(len(emails), 50)
+         for i in range(max_people):
+             people.append({
+                 'id': i + 1,
+                 'name': names[i] if i < len(names) else f"Person {i+1}",
+                 'email': emails[i] if i < len(emails) else f"person{i+1}@unknown.com"
+             })
+
+         if people:
+             self._store(pd.DataFrame(people), f"{name}_people")
+             print(f" Extracted {len(people)} people via regex")
+         else:
+             # Absolute last resort
+             lines = [l.strip() for l in text.split('\n') if l.strip()][:100]
+             self._store(pd.DataFrame({'line': range(1, len(lines)+1), 'text': lines}), name)

-     def _extract_chunk(self, text: str, custom_entities: Optional[List[str]] = None) -> Dict:
-         """Extract entities - WITH BETTER ERROR HANDLING."""
+     def _extract(self, text: str, attempt: int) -> Dict:
+         """Extract with 3 different strategies."""
          if not self.client:
              return {}

          try:
-             prompt = f"""Extract ALL structured entities from this text.
+             if attempt == 1:
+                 sys_msg = "Extract entities as JSON. Return ONLY valid JSON."
+                 usr_msg = f"""Extract ALL entities from text.

  Text:
- {text[:8000]}
+ {text[:15000]}

- Extract: people, skills, technologies, projects, certifications, education, work_experience, events, organizations, or ANY structured data.
+ Return JSON with: people, skills, technologies, projects, certifications, education, work_experience

- Return JSON with arrays. Sequential IDs. Foreign keys reference primary keys.
+ Example:
+ {{"people":[{{"id":1,"name":"Sarah Johnson","email":"sarah@co.com","city":"New York","state":"NY"}},{{"id":2,"name":"Michael Chen","email":"michael@co.com","city":"SF","state":"CA"}}],"skills":[{{"id":1,"person_id":1,"skill_name":"Python","proficiency":"Expert"}}]}}

- {{
- "people": [{{"id": 1, "name": "John", "email": "john@co.com", "city": "Dallas"}}, ...],
- "skills": [{{"id": 1, "person_id": 1, "skill_name": "Python"}}, ...]
- }}
+ Rules: Unique IDs (1,2,3...), person_id references people.id

- ONLY valid JSON. No explanations."""
+ JSON:"""
+
+             elif attempt == 2:
+                 sys_msg = "Return JSON."
+                 usr_msg = f"""Text: {text[:10000]}
+
+ Extract people as JSON:
+ {{"people":[{{"id":1,"name":"...","email":"...","city":"..."}}]}}
+
+ JSON:"""
+
+             else:
+                 sys_msg = "JSON only."
+                 usr_msg = f"""Find names and emails in: {text[:8000]}

-             resp = self.client.chat.completions.create(
+ {{"people":[{{"id":1,"name":"John","email":"john@co.com"}}]}}"""
+
+             r = self.client.chat.completions.create(
                  model="gpt-4o-mini",
                  messages=[
-                     {"role": "system", "content": "Extract ALL entities with unique IDs. Return ONLY valid JSON, nothing else."},
-                     {"role": "user", "content": prompt}
+                     {"role": "system", "content": sys_msg},
+                     {"role": "user", "content": usr_msg}
                  ],
                  temperature=0,
-                 max_tokens=8000
+                 max_tokens=12000
              )

-             json_text = resp.choices[0].message.content.strip()
+             raw = r.choices[0].message.content.strip()
+             raw = raw.replace("```json", "").replace("```", "").replace("JSON:", "").strip()

-             if self.debug:
-                 print(f" DEBUG: AI response length: {len(json_text)} chars")
-                 print(f" DEBUG: First 200 chars: {json_text[:200]}")
+             start = raw.find('{')
+             end = raw.rfind('}') + 1

-             json_text = json_text.replace("```json", "").replace("```", "").strip()
+             if start < 0 or end <= start:
+                 return {}

-             result = json.loads(json_text)
+             result = json.loads(raw[start:end])

-             if self.debug:
-                 print(f" DEBUG: Parsed {len(result)} entity types")
+             if isinstance(result, dict) and len(result) > 0:
+                 has_data = any(isinstance(v, list) and len(v) > 0 for v in result.values())
+                 if has_data:
+                     return result

-             return result
-
-         except json.JSONDecodeError as e:
-             if self.debug:
-                 print(f" DEBUG: JSON parse error: {e}")
-                 print(f" DEBUG: Response was: {json_text[:500]}")
              return {}
+
          except Exception as e:
-             if self.debug:
-                 print(f" DEBUG: Extraction error: {e}")
+             print(f" Attempt {attempt} failed: {str(e)[:100]}")
              return {}

-     def _smart_upload_docx(self, path: Path, base_name: str, extract_entities: Optional[List[str]] = None):
-         """Parse DOCX."""
+     def _docx(self, path: Path, name: str):
+         """DOCX."""
          if not HAS_DOCX:
-             raise ImportError("Run: pip install python-docx")
-
+             raise ImportError("pip install python-docx")
          doc = docx.Document(path)
-
          if doc.tables:
-             for i, table in enumerate(doc.tables):
-                 data = [[cell.text.strip() for cell in row.cells] for row in table.rows]
+             for i, t in enumerate(doc.tables):
+                 data = [[cell.text.strip() for cell in row.cells] for row in t.rows]
                  if data and len(data) > 1:
-                     df = pd.DataFrame(data[1:], columns=data[0])
-                     self._store_dataframe(df, f"{base_name}_table_{i+1}" if len(doc.tables) > 1 else base_name)
-             return
-
-         text = "\n".join([para.text for para in doc.paragraphs])
-
-         if self.client and len(text) > 0:
-             entities = self._extract_chunk(text, extract_entities)
-             if entities:
-                 for entity_type, records in entities.items():
-                     if records:
-                         df = pd.DataFrame(records)
-                         self._store_dataframe_safe(df, f"{base_name}_{entity_type}")
-                 return
-
-         self._store_dataframe(self._parse_text_simple(text), base_name)
-
-     def _smart_upload_txt(self, path: Path, base_name: str, extract_entities: Optional[List[str]] = None):
-         """Parse TXT."""
-         with open(path, 'r', encoding='utf-8') as file:
-             text = file.read()
-
-         if self.client and len(text) > 0:
-             entities = self._extract_chunk(text, extract_entities)
-             if entities:
-                 for entity_type, records in entities.items():
-                     if records:
-                         df = pd.DataFrame(records)
-                         self._store_dataframe_safe(df, f"{base_name}_{entity_type}")
-                 return
-
-         self._store_dataframe(self._parse_text_simple(text), base_name)
-
-     def _store_dataframe_safe(self, df: pd.DataFrame, name: str):
+                     self._store(pd.DataFrame(data[1:], columns=data[0]), f"{name}_t{i+1}")
+         else:
+             text = "\n".join([p.text for p in doc.paragraphs])
+             lines = [l.strip() for l in text.split('\n') if l.strip()]
+             self._store(pd.DataFrame({'line': range(1, len(lines)+1), 'text': lines}), name)
+
+     def _txt(self, path: Path, name: str):
+         """TXT."""
+         with open(path, 'r', encoding='utf-8') as f:
+             text = f.read()
+         lines = [l.strip() for l in text.split('\n') if l.strip()]
+         self._store(pd.DataFrame({'line': range(1, len(lines)+1), 'text': lines}), name)
+
+     def _store(self, df: pd.DataFrame, name: str):
          """Store."""
+         df.columns = [str(c).strip().replace(" ", "_").replace("-", "_") for c in df.columns]
          try:
-             df.columns = [str(c).strip().replace(" ", "_").replace("-", "_") for c in df.columns]
              df.to_sql(name, self.conn, if_exists='replace', index=False, method='multi', chunksize=500)
-             self.conn.commit()
-             self.current_table = name
-             self._refresh_schema()
          except:
              df.to_sql(name, self.conn, if_exists='replace', index=False)
-             self.conn.commit()
-             self.current_table = name
-             self._refresh_schema()
-
-     def _parse_text_simple(self, text: str) -> pd.DataFrame:
-         """Simple parsing."""
-         lines = [line.strip() for line in text.split('\n') if line.strip()]
-         if not lines:
-             return pd.DataFrame({'content': ['No content']})
-         return pd.DataFrame({'line_number': range(1, len(lines) + 1), 'content': lines})
-
-     def _store_dataframe(self, df: pd.DataFrame, name: str):
-         """Store."""
-         self._store_dataframe_safe(df, name)
-         print(f"Uploaded: {name} ({len(df)} rows)")
+         self.conn.commit()
+         self.current_table = name
+         self._refresh_schema()
+         print(f" {name}: {len(df)} rows")

-     def ask(self, question: str, viz: Union[bool, str] = False, table: Optional[str] = None) -> 'QueryResult':
+     def ask(self, q: str, viz: Union[bool, str] = False, table: Optional[str] = None) -> 'QueryResult':
          """Query."""
          if not self.client:
-             return QueryResult(False, "", pd.DataFrame(), None, "No API key")
-
-         print(f"\nQuestion: {question}")
+             return QueryResult(False, "", pd.DataFrame(), None, "No API")

-         if self.check_relevance and not self._is_relevant_query(question):
-             print("Warning: Irrelevant")
-             choice = input("Continue? (yes/no): ").strip().lower()
-             if choice not in ['yes', 'y']:
-                 return QueryResult(False, "", pd.DataFrame(), None, "Irrelevant")
-
-         tbl = table or self.current_table or (self._get_table_names()[0] if self._get_table_names() else None)
-         if not tbl:
+         t = table or self.current_table or (self._get_tables()[0] if self._get_tables() else None)
+         if not t:
              return QueryResult(False, "", pd.DataFrame(), None, "No table")

          if self.use_embeddings and self.embedding_model:
-             cached = self._check_embedding_cache(question, tbl)
+             cached = self._check_cache(q, t)
              if cached:
-                 print(" Cached")
                  return cached

          if self.fuzzy_match:
-             question = self._apply_fuzzy_matching(question, tbl)
+             q = self._fuzzy(q, t)

-         cache_key = hashlib.md5(f"{question}:{tbl}".encode()).hexdigest()
-         if self.cache_queries and self.cache and cache_key in self.cache:
-             sql_query = self.cache[cache_key]
-             print(" From cache")
+         key = hashlib.md5(f"{q}:{t}".encode()).hexdigest()
+         if self.cache_queries and self.cache and key in self.cache:
+             sql = self.cache[key]
          else:
-             sql_query = self._generate_sql(question, tbl)
-             if self.cache_queries and self.cache is not None:
-                 self.cache[cache_key] = sql_query
+             sql = self._gen_sql(q, t)
+             if self.cache_queries and self.cache:
+                 self.cache[key] = sql

-         print(f"SQL: {sql_query}")
+         print(f"SQL: {sql}")

          try:
-             df = pd.read_sql_query(sql_query, self.conn)
+             df = pd.read_sql_query(sql, self.conn)
              print(f"Success! {len(df)} rows")
-
-             fig = None
-             if viz:
-                 viz_type = viz if isinstance(viz, str) else "auto"
-                 fig = self._visualize(df, question, viz_type)
-
-             result = QueryResult(True, sql_query, df, fig)
+             fig = self._viz(df, q, viz if isinstance(viz, str) else "auto") if viz else None
+             r = QueryResult(True, sql, df, fig)

              if self.use_embeddings and self.embedding_model:
-                 self._store_in_embedding_cache(question, tbl, result)
+                 self._store_cache(q, t, r)

-             return result
+             return r
          except Exception as e:
-             print(f"Error: {e}")
-             return QueryResult(False, sql_query, pd.DataFrame(), None, str(e))
-
-     def _is_relevant_query(self, question: str) -> bool:
-         """Check relevance."""
-         if not self.client:
-             return True
-         try:
-             tables = self._get_table_names()[:3]
-             resp = self.client.chat.completions.create(
-                 model="gpt-4o-mini",
-                 messages=[
-                     {"role": "system", "content": "Return 'yes' or 'no'."},
-                     {"role": "user", "content": f"Relevant to DB with tables {', '.join(tables)}?\n\nQ: {question}\n\nyes/no:"}
-                 ],
-                 temperature=0,
-                 max_tokens=5
-             )
-             return 'yes' in resp.choices[0].message.content.lower()
-         except:
-             return True
+             return QueryResult(False, sql, pd.DataFrame(), None, str(e))

-     def _apply_fuzzy_matching(self, question: str, table: str) -> str:
+     def _fuzzy(self, q: str, t: str) -> str:
          """Fuzzy."""
-         if not self.schema_info.get(table):
-             return question
-
          try:
-             string_cols = [col for col, dtype in self.schema_info[table].items() if 'TEXT' in dtype]
-             if not string_cols:
-                 return question
-
-             for col in string_cols[:2]:
-                 df = pd.read_sql_query(f"SELECT DISTINCT {col} FROM {table} LIMIT 100", self.conn)
-                 values = [str(v) for v in df[col].dropna().tolist()]
-
-                 words = question.split()
-                 for i, word in enumerate(words):
-                     matches = get_close_matches(word, values, n=1, cutoff=0.6)
-                     if matches and word != matches[0]:
-                         words[i] = matches[0]
-                         print(f" Fuzzy: '{word}' -> '{matches[0]}'")
-                 question = " ".join(words)
-             return question
+             cols = [c for c, d in self.schema_info.get(t, {}).items() if 'TEXT' in d]
+             if not cols:
+                 return q
+             for col in cols[:2]:
+                 df = pd.read_sql_query(f"SELECT DISTINCT {col} FROM {t} LIMIT 100", self.conn)
+                 vals = [str(v) for v in df[col].dropna()]
+                 words = q.split()
+                 for i, w in enumerate(words):
+                     m = get_close_matches(w, vals, n=1, cutoff=0.6)
+                     if m and w != m[0]:
+                         words[i] = m[0]
+                 q = " ".join(words)
+             return q
          except:
-             return question
+             return q

-     def _check_embedding_cache(self, question: str, table: str) -> Optional['QueryResult']:
-         """Check cache."""
+     def _check_cache(self, q: str, t: str) -> Optional['QueryResult']:
+         """Cache."""
          if not self.query_embeddings:
              return None
-
-         q_emb = self.embedding_model.encode([question])[0]
-         best_match, best_sim = None, 0.85
-
-         for cached_q, data in self.query_embeddings.items():
-             if data['table'] != table:
+         emb = self.embedding_model.encode([q])[0]
+         best, sim = None, 0.85
+         for cq, d in self.query_embeddings.items():
+             if d['table'] != t:
                  continue
-             sim = np.dot(q_emb, data['embedding']) / (np.linalg.norm(q_emb) * np.linalg.norm(data['embedding']))
-             if sim > best_sim:
-                 best_sim = sim
-                 best_match = cached_q
-
-         if best_match:
-             print(f" Similar ({best_sim:.0%})")
-             return self.query_embeddings[best_match]['result']
-         return None
+             s = np.dot(emb, d['embedding']) / (np.linalg.norm(emb) * np.linalg.norm(d['embedding']))
+             if s > sim:
+                 sim, best = s, cq
+         return self.query_embeddings[best]['result'] if best else None

-     def _store_in_embedding_cache(self, question: str, table: str, result: 'QueryResult'):
+     def _store_cache(self, q: str, t: str, r: 'QueryResult'):
          """Store."""
-         q_emb = self.embedding_model.encode([question])[0]
-         self.query_embeddings[question] = {'table': table, 'embedding': q_emb, 'result': result}
+         emb = self.embedding_model.encode([q])[0]
+         self.query_embeddings[q] = {'table': t, 'embedding': emb, 'result': r}

-     def _visualize(self, df: pd.DataFrame, title: str, viz_type: str = "auto"):
+     def _viz(self, df: pd.DataFrame, title: str, vt: str):
          """Viz."""
-         if not HAS_PLOTLY and not HAS_MATPLOTLIB:
+         if not HAS_PLOTLY:
              return None
-         print(f"Creating {viz_type} chart...")
-         return self._plotly_viz(df, title, viz_type) if HAS_PLOTLY else self._matplotlib_viz(df, title, viz_type)
-
-     def _plotly_viz(self, df: pd.DataFrame, title: str, viz_type: str):
-         """Plotly."""
          try:
-             num = df.select_dtypes(include=[np.number]).columns.tolist()
-             cat = df.select_dtypes(include=['object']).columns.tolist()
-
-             if viz_type == "table":
-                 fig = go.Figure(data=[go.Table(header=dict(values=list(df.columns)), cells=dict(values=[df[c] for c in df.columns]))])
-             elif viz_type == "pie" and cat and num:
-                 fig = px.pie(df, names=cat[0], values=num[0], title=title)
-             elif viz_type == "bar" and cat and num:
-                 fig = px.bar(df, x=cat[0], y=num[0], title=title)
-             elif viz_type == "line" and num:
-                 fig = px.line(df, y=num[0], title=title)
-             elif viz_type == "scatter" and len(num) >= 2:
-                 fig = px.scatter(df, x=num[0], y=num[1], title=title)
-             elif viz_type == "heatmap" and len(num) >= 2:
-                 corr = df[num].corr()
-                 fig = go.Figure(data=go.Heatmap(z=corr.values, x=corr.columns, y=corr.columns))
-                 fig.update_layout(title=title)
+             n = df.select_dtypes(include=[np.number]).columns.tolist()
+             c = df.select_dtypes(include=['object']).columns.tolist()
+             if vt == "pie" and c and n:
+                 fig = px.pie(df, names=c[0], values=n[0], title=title)
+             elif vt == "bar" and c and n:
+                 fig = px.bar(df, x=c[0], y=n[0], title=title)
+             elif vt == "line" and n:
+                 fig = px.line(df, y=n[0], title=title)
+             elif vt == "scatter" and len(n) >= 2:
+                 fig = px.scatter(df, x=n[0], y=n[1], title=title)
              else:
-                 if cat and num:
-                     fig = px.pie(df, names=cat[0], values=num[0], title=title) if len(df) <= 10 else px.bar(df, x=cat[0], y=num[0], title=title)
-                 else:
-                     fig = px.bar(df, y=df.columns[0], title=title)
+                 fig = px.bar(df, y=df.columns[0], title=title)
              fig.show()
              return fig
          except:
              return None

-     def _matplotlib_viz(self, df: pd.DataFrame, title: str, viz_type: str):
-         """Matplotlib."""
-         try:
-             plt.figure(figsize=(10, 6))
-             num = df.select_dtypes(include=[np.number]).columns
-             if viz_type == "pie":
-                 df[df.columns[0]].value_counts().plot(kind='pie')
-             elif viz_type == "line" and len(num) > 0:
-                 df[num[0]].plot(kind='line')
-             else:
-                 (df[num[0]] if len(num) > 0 else df.iloc[:, 0].value_counts()).plot(kind='bar')
-             plt.title(title)
-             plt.tight_layout()
-             plt.show()
-             return plt.gcf()
-         except:
-             return None
-
-     def tables(self) -> Dict[str, dict]:
-         """List."""
+     def tables(self) -> Dict:
+         """Tables."""
+         t = self._get_tables()
          print("\n" + "="*70)
          print("TABLES")
          print("="*70)
-
-         all_tables = self._get_table_names()
-         if not all_tables:
+         if not t:
              print("No tables")
              return {}
-
-         result = {}
-         for i, tbl in enumerate(all_tables, 1):
-             cnt = pd.read_sql_query(f"SELECT COUNT(*) FROM {tbl}", self.conn).iloc[0, 0]
-             cols = list(self.schema_info.get(tbl, {}).keys())
-             print(f" {i}. {tbl}: {cnt} rows, {len(cols)} columns")
-             result[tbl] = {'rows': cnt, 'columns': cols}
-
+         r = {}
+         for i, tb in enumerate(t, 1):
+             cnt = pd.read_sql_query(f"SELECT COUNT(*) FROM {tb}", self.conn).iloc[0, 0]
+             cols = list(self.schema_info.get(tb, {}).keys())
+             print(f" {i}. {tb}: {cnt} rows, {len(cols)} cols")
+             r[tb] = {'rows': cnt, 'columns': cols}
          print("="*70)
-         return result
+         return r

-     def schema(self, table: Optional[str] = None) -> dict:
+     def schema(self, table: Optional[str] = None) -> Dict:
          """Schema."""
          if not self.schema_info:
              self._refresh_schema()
-
          print("\n" + "="*70)
          print("SCHEMA")
          print("="*70)
-
-         result = {}
-         for tbl in ([table] if table else self.schema_info.keys()):
-             if tbl in self.schema_info:
-                 cnt = pd.read_sql_query(f"SELECT COUNT(*) FROM {tbl}", self.conn).iloc[0, 0]
-                 print(f"\n{tbl}: {cnt} records")
-                 for col, dtype in self.schema_info[tbl].items():
-                     print(f" - {col:<30} {dtype}")
-                 result[tbl] = {'records': cnt, 'columns': self.schema_info[tbl]}
-
+         r = {}
+         for t in ([table] if table else self.schema_info.keys()):
+             if t in self.schema_info:
+                 cnt = pd.read_sql_query(f"SELECT COUNT(*) FROM {t}", self.conn).iloc[0, 0]
+                 print(f"\n{t}: {cnt} records")
+                 for c, d in self.schema_info[t].items():
+                     print(f" - {c:<30} {d}")
+                 r[t] = {'records': cnt, 'columns': self.schema_info[t]}
          print("="*70)
-         return result
+         return r

      def peek(self, table: Optional[str] = None, n: int = 5) -> pd.DataFrame:
          """Preview."""
-         tbl = table or self.current_table
-         if not tbl:
+         t = table or self.current_table
+         if not t:
              return pd.DataFrame()
-         df = pd.read_sql_query(f"SELECT * FROM {tbl} LIMIT {n}", self.conn)
-         print(f"\nSample from '{tbl}':")
+         df = pd.read_sql_query(f"SELECT * FROM {t} LIMIT {n}", self.conn)
+         print(f"\nSample from '{t}':")
          print(df.to_string(index=False))
          return df

-     def info(self):
-         """Info."""
-         return self.tables()
-
      def sql(self, query: str, viz: Union[bool, str] = False) -> 'QueryResult':
          """SQL."""
          try:
              df = pd.read_sql_query(query, self.conn)
              print(f"Success! {len(df)} rows")
-             fig = self._visualize(df, "Result", viz if isinstance(viz, str) else "auto") if viz else None
+             fig = self._viz(df, "Result", viz if isinstance(viz, str) else "auto") if viz else None
              return QueryResult(True, query, df, fig)
          except Exception as e:
-             print(f"Error: {e}")
              return QueryResult(False, query, pd.DataFrame(), None, str(e))

-     def interactive(self, question: str) -> 'QueryResult':
-         """Interactive."""
-         choice = input("Visualize? (yes/no/pie/bar/line/scatter): ").strip().lower()
-         viz = choice if choice in ['pie', 'bar', 'line', 'scatter', 'table', 'heatmap'] else (True if choice in ['yes', 'y'] else False)
-         return self.ask(question, viz=viz)
+     def save_to_mysql(self, host: str, user: str, password: str, database: str, port: int = 3306):
+         """MySQL."""
+         try:
+             from sqlalchemy import create_engine
+             import mysql.connector
+         except:
+             raise ImportError("pip install QuerySUTRA[mysql]")
+
+         print(f"Exporting to MySQL: {database}")
+
+         try:
+             tc = mysql.connector.connect(host=host, user=user, password=password, port=port)
+             tc.cursor().execute(f"CREATE DATABASE IF NOT EXISTS `{database}`")
+             tc.close()
+         except:
+             pass
+
+         engine = create_engine(f"mysql+mysqlconnector://{user}:{password}@{host}:{port}/{database}")
+         for t in self._get_tables():
+             df = pd.read_sql_query(f"SELECT * FROM {t}", self.conn)
+             df.to_sql(t, engine, if_exists='replace', index=False)
+             print(f" {t}: {len(df)} rows")
+         print("Done!")
+         return self

      def export_db(self, path: str, format: str = "sqlite"):
          """Export."""
          if format == "sqlite":
              shutil.copy2(self.db_path, path)
-         elif format == "sql":
-             with open(path, 'w', encoding='utf-8') as f:
-                 for line in self.conn.iterdump():
-                     f.write(f'{line}\n')
          elif format == "json":
-             data = {t: pd.read_sql_query(f"SELECT * FROM {t}", self.conn).to_dict('records') for t in self._get_table_names()}
-             with open(path, 'w', encoding='utf-8') as f:
+             data = {t: pd.read_sql_query(f"SELECT * FROM {t}", self.conn).to_dict('records') for t in self._get_tables()}
+             with open(path, 'w') as f:
                  json.dump(data, f, indent=2, default=str)
-         elif format == "excel":
-             with pd.ExcelWriter(path, engine='openpyxl') as writer:
-                 for t in self._get_table_names():
-                     pd.read_sql_query(f"SELECT * FROM {t}", self.conn).to_excel(writer, sheet_name=t[:31], index=False)
-         else:
-             raise ValueError(f"Unsupported: {format}")
          print(f"Saved: {path}")
          return self

-     def save_to_mysql(self, host: str, user: str, password: str, database: str,
-                       port: int = 3306, tables: Optional[List[str]] = None, auto_create: bool = True):
-         """Export to MySQL."""
+     @classmethod
+     def load_from_db(cls, db_path: str, api_key: Optional[str] = None, **kwargs):
+         """Load."""
+         if not Path(db_path).exists():
+             raise FileNotFoundError(f"Not found: {db_path}")
+         return cls(api_key=api_key, db=db_path, **kwargs)
+
+     @classmethod
+     def connect_mysql(cls, host: str, user: str, password: str, database: str, port: int = 3306, api_key: Optional[str] = None, **kwargs):
+         """MySQL."""
          try:
              from sqlalchemy import create_engine
              import mysql.connector
-         except ImportError:
-             raise ImportError("Run: pip install QuerySUTRA[mysql]")
-
-         print(f"Exporting to MySQL: {host}/{database}")
+         except:
+             raise ImportError("pip install QuerySUTRA[mysql]")

-         if auto_create:
-             try:
-                 temp_conn = mysql.connector.connect(host=host, user=user, password=password, port=port)
-                 temp_cursor = temp_conn.cursor()
-                 temp_cursor.execute(f"CREATE DATABASE IF NOT EXISTS `{database}`")
-                 temp_cursor.close()
-                 temp_conn.close()
-                 print(f" Database '{database}' ready")
-             except Exception as e:
-                 print(f" Warning: {e}")
+         try:
+             tc = mysql.connector.connect(host=host, user=user, password=password, port=port)
+             tc.cursor().execute(f"CREATE DATABASE IF NOT EXISTS {database}")
+             tc.close()
+         except:
+             pass

          engine = create_engine(f"mysql+mysqlconnector://{user}:{password}@{host}:{port}/{database}")
+         temp_db = f"mysql_{database}.db"
+         instance = cls(api_key=api_key, db=temp_db, **kwargs)

-         for t in (tables or self._get_table_names()):
-             df = pd.read_sql_query(f"SELECT * FROM {t}", self.conn)
-             df.to_sql(t, engine, if_exists='replace', index=False)
-             print(f" {t}: {len(df)} rows")
+         tables = pd.read_sql_query("SHOW TABLES", engine).iloc[:, 0].tolist()
+         for t in tables:
+             pd.read_sql_query(f"SELECT * FROM {t}", engine).to_sql(t, instance.conn, if_exists='replace', index=False)

-         print("Complete!")
-         return self
-
-     def save_to_postgres(self, host: str, user: str, password: str, database: str, port: int = 5432, tables: Optional[List[str]] = None):
-         """PostgreSQL."""
-         try:
-             from sqlalchemy import create_engine
-             engine = create_engine(f"postgresql://{user}:{password}@{host}:{port}/{database}")
-             print(f"Exporting to PostgreSQL...")
-             for t in (tables or self._get_table_names()):
-                 df = pd.read_sql_query(f"SELECT * FROM {t}", self.conn)
-                 df.to_sql(t, engine, if_exists='replace', index=False)
-                 print(f" {t}: {len(df)} rows")
-             print("Complete!")
-             return self
-         except ImportError:
-             raise ImportError("Run: pip install QuerySUTRA[postgres]")
-
-     def backup(self, path: str = None):
-         """Backup."""
-         dir = Path(path) if path else Path(".")
-         dir.mkdir(parents=True, exist_ok=True)
-         ts = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
-         self.export_db(str(dir / f"sutra_{ts}.db"), "sqlite")
-         self.export_db(str(dir / f"sutra_{ts}.json"), "json")
-         print("Backup complete!")
-         return self
-
-     def export(self, data: pd.DataFrame, path: str, format: str = "csv"):
-         """Export."""
-         if format == "csv":
-             data.to_csv(path, index=False)
-         elif format in ["excel", "xlsx"]:
-             data.to_excel(path, index=False)
-         elif format == "json":
-             data.to_json(path, orient="records", indent=2)
-         print(f"Exported: {path}")
-         return self
+         instance._refresh_schema()
+         print(f"Connected! {len(tables)} tables")
+         return instance

-     def close(self):
-         """Close."""
-         if self.conn:
-             self.conn.close()
+     def _gen_sql(self, q: str, t: str) -> str:
+         """SQL."""
+         schema = self.schema_info.get(t, {})
+         sample = pd.read_sql_query(f"SELECT * FROM {t} LIMIT 3", self.conn).to_string(index=False)
+         cols = ", ".join([f"{c} ({d})" for c, d in schema.items()])
+
+         r = self.client.chat.completions.create(
+             model="gpt-4o-mini",
+             messages=[
+                 {"role": "system", "content": "SQL expert. Return only SQL."},
+                 {"role": "user", "content": f"Table: {t}\nColumns: {cols}\nSample:\n{sample}\n\nQ: {q}\n\nSQL:"}
+             ],
+             temperature=0
+         )
+         return r.choices[0].message.content.strip().replace("```sql", "").replace("```", "").strip()

-     def _get_table_names(self) -> List[str]:
+     def _get_tables(self) -> List[str]:
          """Tables."""
          self.cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
          return [r[0] for r in self.cursor.fetchall()]
@@ -769,25 +529,13 @@ ONLY valid JSON. No explanations."""
      def _refresh_schema(self):
          """Refresh."""
          self.schema_info = {}
-         for tbl in self._get_table_names():
-             self.cursor.execute(f"PRAGMA table_info({tbl})")
-             self.schema_info[tbl] = {r[1]: r[2] for r in self.cursor.fetchall()}
+         for t in self._get_tables():
+             self.cursor.execute(f"PRAGMA table_info({t})")
+             self.schema_info[t] = {r[1]: r[2] for r in self.cursor.fetchall()}

-     def _generate_sql(self, question: str, table: str) -> str:
-         """SQL."""
-         schema = self.schema_info.get(table, {})
-         sample = pd.read_sql_query(f"SELECT * FROM {table} LIMIT 3", self.conn).to_string(index=False)
-         schema_str = ", ".join([f"{col} ({dtype})" for col, dtype in schema.items()])
-
-         resp = self.client.chat.completions.create(
-             model="gpt-4o-mini",
-             messages=[
-                 {"role": "system", "content": "SQL expert. Return only SQL."},
-                 {"role": "user", "content": f"Table: {table}\nColumns: {schema_str}\nSample:\n{sample}\n\nQ: {question}\n\nSQL:"}
-             ],
-             temperature=0
-         )
-         return resp.choices[0].message.content.strip().replace("```sql", "").replace("```", "").strip()
+     def close(self):
+         if self.conn:
+             self.conn.close()

      def __enter__(self):
          return self
@@ -810,10 +558,3 @@ class QueryResult:
      def show(self):
          print(self.data if self.success else f"Error: {self.error}")
          return self
-
-
- def quick_start(api_key: str, data_path: str, question: str, viz: Union[bool, str] = False):
-     """Quick."""
-     with SUTRA(api_key=api_key) as sutra:
-         sutra.upload(data_path)
-         return sutra.ask(question, viz=viz)
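
The quick_start helper removed above has no direct replacement in 0.5.2. Callers can recover the same behaviour with the context-manager protocol the class still implements (__enter__ survives in the retained code); a sketch mirroring the deleted function:

    from sutra import SUTRA

    def quick_start(api_key: str, data_path: str, question: str, viz=False):
        # Same flow as the helper deleted in this release: upload, then ask.
        with SUTRA(api_key=api_key) as s:
            s.upload(data_path)
            return s.ask(question, viz=viz)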