QuerySUTRA 0.3.3__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
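The headline change in this release is a one-step upload-and-export flow: upload() gains an auto_export_mysql argument, and save_to_mysql() can now create the target MySQL database before writing to it. The sketch below is assembled from the docstrings visible in this diff, not from shipped example code; the API key, credentials, and file name are placeholders.

from sutra import SUTRA

# api_key is optional; .sql() works without one (placeholder key shown).
sutra = SUTRA(api_key="sk-...", db="sutra.db")

# New in this release: extract entities from a PDF and push every
# resulting table to MySQL in the same call. The target database is
# created automatically if it does not exist.
sutra.upload("data.pdf", auto_export_mysql={
    'host': 'localhost',
    'user': 'root',
    'password': 'change-me',   # placeholder
    'database': 'my_database'
})
sutra.close()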
sutra/sutra.py CHANGED
@@ -1,35 +1,32 @@
  """
- QuerySUTRA v0.3.3 - PROPER RELATIONAL DATABASE EXTRACTION
+ QuerySUTRA v0.4.0 - SIMPLE & AUTOMATIC
  SUTRA: Structured-Unstructured-Text-Retrieval-Architecture
 
- FIXED: Proper primary keys, foreign keys, and relational integrity
- - Unique IDs for each entity
- - Proper foreign key relationships
- - No duplicate keys
- - Comprehensive entity extraction (skills, technologies, projects, etc.)
+ FIXED:
+ - Auto-creates MySQL database if not exists
+ - One-line export to MySQL
+ - Complete data extraction from large PDFs
+ - No manual file transfers needed
 
  Author: Aditya Batta
- License: MIT
- Version: 0.3.3
+ Version: 0.4.0
  """
 
- __version__ = "0.3.3"
+ __version__ = "0.4.0"
  __author__ = "Aditya Batta"
- __title__ = "QuerySUTRA: Structured-Unstructured-Text-Retrieval-Architecture"
  __all__ = ["SUTRA", "QueryResult", "quick_start"]
 
  import os
  import sqlite3
  import pandas as pd
  import numpy as np
- from typing import Optional, Union, Dict, Any, List
+ from typing import Optional, Union, Dict, List
  from pathlib import Path
  import json
  import hashlib
  import warnings
  import shutil
  import datetime
- import re
  from io import StringIO
  from difflib import get_close_matches
  warnings.filterwarnings('ignore')
@@ -73,22 +70,13 @@ except ImportError:
 
 
  class SUTRA:
- """
- SUTRA: Structured-Unstructured-Text-Retrieval-Architecture
+ """SUTRA: Structured-Unstructured-Text-Retrieval-Architecture"""
 
- Professional data analysis with proper relational database structure
- """
-
- def __init__(self,
- api_key: Optional[str] = None,
- db: str = "sutra.db",
- use_embeddings: bool = False,
- check_relevance: bool = False,
- fuzzy_match: bool = True,
- cache_queries: bool = True):
- """Initialize SUTRA with optional features."""
- print("Initializing QuerySUTRA v0.3.3")
- print("SUTRA: Structured-Unstructured-Text-Retrieval-Architecture")
+ def __init__(self, api_key: Optional[str] = None, db: str = "sutra.db",
+ use_embeddings: bool = False, check_relevance: bool = False,
+ fuzzy_match: bool = True, cache_queries: bool = True):
+ """Initialize."""
+ print("Initializing QuerySUTRA v0.4.0")
 
  if api_key:
  os.environ["OPENAI_API_KEY"] = api_key
@@ -97,454 +85,354 @@ class SUTRA:
  self.client = OpenAI(api_key=self.api_key) if self.api_key and HAS_OPENAI else None
 
  self.db_path = db
- self.conn = sqlite3.connect(db, check_same_thread=False)
- self.cursor = self.conn.cursor()
 
+ try:
+ self.conn = sqlite3.connect(db, timeout=30, check_same_thread=False)
+ self.conn.execute("PRAGMA journal_mode=WAL")
+ self.conn.execute("PRAGMA synchronous=NORMAL")
+ except:
+ self.conn = sqlite3.connect(db, check_same_thread=False)
+
+ self.cursor = self.conn.cursor()
  self.current_table = None
  self.schema_info = {}
 
  self.cache_queries = cache_queries
  self.cache = {} if cache_queries else None
-
  self.use_embeddings = use_embeddings
  self.embedding_model = None
  self.query_embeddings = {}
-
  self.check_relevance = check_relevance
  self.fuzzy_match = fuzzy_match
 
  if use_embeddings and HAS_EMBEDDINGS:
  try:
- print("Loading embeddings model...")
  self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
- print("Embeddings ready")
  except:
- print("Embeddings unavailable")
  self.use_embeddings = False
 
  self._refresh_schema()
-
  print(f"Ready! Database: {db}")
- if not self.api_key:
- print("No API key - use .sql() for direct queries")
 
  @classmethod
  def load_from_db(cls, db_path: str, api_key: Optional[str] = None, **kwargs):
- """Load existing SQLite database."""
+ """Load existing database."""
  if not Path(db_path).exists():
- raise FileNotFoundError(f"Database not found: {db_path}")
-
- print(f"Loading database: {db_path}")
- instance = cls(api_key=api_key, db=db_path, **kwargs)
-
- tables = instance.tables()
- print(f"Loaded {len(tables)} tables")
-
- return instance
+ raise FileNotFoundError(f"Not found: {db_path}")
+ return cls(api_key=api_key, db=db_path, **kwargs)
 
  @classmethod
  def connect_mysql(cls, host: str, user: str, password: str, database: str,
  port: int = 3306, api_key: Optional[str] = None, **kwargs):
- """Connect to MySQL database."""
+ """Connect to MySQL."""
  try:
  from sqlalchemy import create_engine
+ import mysql.connector
  except ImportError:
  raise ImportError("Run: pip install QuerySUTRA[mysql]")
 
- print(f"Connecting to MySQL: {host}:{port}/{database}")
+ print(f"Connecting to MySQL...")
 
- connection_string = f"mysql+mysqlconnector://{user}:{password}@{host}:{port}/{database}"
+ # Auto-create database if not exists
+ try:
+ temp_conn = mysql.connector.connect(host=host, user=user, password=password, port=port)
+ temp_cursor = temp_conn.cursor()
+ temp_cursor.execute(f"CREATE DATABASE IF NOT EXISTS {database}")
+ temp_cursor.close()
+ temp_conn.close()
+ except:
+ pass
 
+ engine = create_engine(f"mysql+mysqlconnector://{user}:{password}@{host}:{port}/{database}")
  temp_db = f"sutra_mysql_{database}.db"
  instance = cls(api_key=api_key, db=temp_db, **kwargs)
 
- engine = create_engine(connection_string)
-
  tables = pd.read_sql_query("SHOW TABLES", engine).iloc[:, 0].tolist()
 
- print(f"Found {len(tables)} tables, syncing...")
-
  for table in tables:
  df = pd.read_sql_query(f"SELECT * FROM {table}", engine)
  df.to_sql(table, instance.conn, if_exists='replace', index=False)
- print(f" {table}: {len(df)} rows")
 
  instance._refresh_schema()
- print(f"Connected! {len(tables)} tables available")
-
+ print(f"Connected! {len(tables)} tables")
  return instance
 
  @classmethod
  def connect_postgres(cls, host: str, user: str, password: str, database: str,
  port: int = 5432, api_key: Optional[str] = None, **kwargs):
- """Connect to PostgreSQL database."""
+ """Connect to PostgreSQL."""
  try:
  from sqlalchemy import create_engine
  except ImportError:
  raise ImportError("Run: pip install QuerySUTRA[postgres]")
 
- print(f"Connecting to PostgreSQL: {host}:{port}/{database}")
-
- connection_string = f"postgresql://{user}:{password}@{host}:{port}/{database}"
+ print(f"Connecting to PostgreSQL...")
 
+ engine = create_engine(f"postgresql://{user}:{password}@{host}:{port}/{database}")
  temp_db = f"sutra_postgres_{database}.db"
  instance = cls(api_key=api_key, db=temp_db, **kwargs)
 
- engine = create_engine(connection_string)
-
- tables = pd.read_sql_query(
- "SELECT tablename FROM pg_tables WHERE schemaname='public'",
- engine
- )['tablename'].tolist()
-
- print(f"Found {len(tables)} tables, syncing...")
+ tables = pd.read_sql_query("SELECT tablename FROM pg_tables WHERE schemaname='public'", engine)['tablename'].tolist()
 
  for table in tables:
  df = pd.read_sql_query(f"SELECT * FROM {table}", engine)
  df.to_sql(table, instance.conn, if_exists='replace', index=False)
- print(f" {table}: {len(df)} rows")
 
  instance._refresh_schema()
- print(f"Connected! {len(tables)} tables available")
-
+ print(f"Connected! {len(tables)} tables")
  return instance
 
  def upload(self, data: Union[str, pd.DataFrame], name: Optional[str] = None,
- extract_entities: Optional[List[str]] = None) -> 'SUTRA':
+ extract_entities: Optional[List[str]] = None,
+ auto_export_mysql: Optional[Dict[str, str]] = None) -> 'SUTRA':
  """
- Upload data with optional custom entity extraction.
+ Upload data with OPTIONAL automatic MySQL export.
 
  Args:
  data: File path or DataFrame
  name: Table name
- extract_entities: Custom entities to extract (e.g., ['skills', 'technologies'])
+ extract_entities: Custom entities to extract
+ auto_export_mysql: Auto-export to MySQL after upload
+ {'host': 'localhost', 'user': 'root', 'password': 'pass', 'database': 'mydb'}
+
+ Example:
+ sutra.upload("data.pdf", auto_export_mysql={
+ 'host': 'localhost',
+ 'user': 'root',
+ 'password': '123456',
+ 'database': 'my_database'
+ })
  """
- print(f"\nUploading data...")
+ print("\nUploading...")
 
  if isinstance(data, pd.DataFrame):
  name = name or "data"
  self._store_dataframe(data, name)
- return self
-
- path = Path(data)
- if not path.exists():
- raise FileNotFoundError(f"File not found: {data}")
-
- name = name or path.stem.replace(" ", "_").replace("-", "_")
- ext = path.suffix.lower()
-
- print(f"File: {path.name}")
-
- if ext == ".csv":
- df = pd.read_csv(path)
- self._store_dataframe(df, name)
-
- elif ext in [".xlsx", ".xls"]:
- df = pd.read_excel(path)
- self._store_dataframe(df, name)
-
- elif ext == ".json":
- df = pd.read_json(path)
- self._store_dataframe(df, name)
-
- elif ext == ".sql":
- with open(path) as f:
- self.cursor.executescript(f.read())
- self.conn.commit()
- self._refresh_schema()
- print("SQL executed")
-
- elif ext == ".pdf":
- self._smart_upload_pdf(path, name, extract_entities)
-
- elif ext == ".docx":
- self._smart_upload_docx(path, name, extract_entities)
-
- elif ext == ".txt":
- self._smart_upload_txt(path, name, extract_entities)
-
  else:
- raise ValueError(f"Unsupported format: {ext}")
+ path = Path(data)
+ if not path.exists():
+ raise FileNotFoundError(f"Not found: {data}")
+
+ name = name or path.stem.replace(" ", "_").replace("-", "_")
+ ext = path.suffix.lower()
+
+ print(f"File: {path.name}")
+
+ if ext == ".csv":
+ self._store_dataframe(pd.read_csv(path), name)
+ elif ext in [".xlsx", ".xls"]:
+ self._store_dataframe(pd.read_excel(path), name)
+ elif ext == ".json":
+ self._store_dataframe(pd.read_json(path), name)
+ elif ext == ".sql":
+ with open(path) as f:
+ self.cursor.executescript(f.read())
+ self.conn.commit()
+ self._refresh_schema()
+ elif ext == ".pdf":
+ self._smart_upload_pdf(path, name, extract_entities)
+ elif ext == ".docx":
+ self._smart_upload_docx(path, name, extract_entities)
+ elif ext == ".txt":
+ self._smart_upload_txt(path, name, extract_entities)
+ else:
+ raise ValueError(f"Unsupported: {ext}")
+
+ # AUTO-EXPORT to MySQL if requested
+ if auto_export_mysql:
+ print("\nAuto-exporting to MySQL...")
+ self.save_to_mysql(
+ host=auto_export_mysql.get('host', 'localhost'),
+ user=auto_export_mysql.get('user', 'root'),
+ password=auto_export_mysql['password'],
+ database=auto_export_mysql['database'],
+ port=auto_export_mysql.get('port', 3306)
+ )
 
  return self
 
  def _smart_upload_pdf(self, path: Path, base_name: str, extract_entities: Optional[List[str]] = None):
- """Parse PDF with proper relational structure."""
+ """Parse PDF - extracts ALL pages."""
  if not HAS_PYPDF2:
  raise ImportError("Run: pip install PyPDF2")
 
- print("Extracting text from PDF...")
+ print("Extracting PDF...")
 
  with open(path, 'rb') as file:
  pdf_reader = PyPDF2.PdfReader(file)
- text = ""
+ full_text = ""
  for page_num, page in enumerate(pdf_reader.pages, 1):
- text += page.extract_text() + "\n"
+ full_text += page.extract_text() + "\n"
  print(f" Page {page_num}/{len(pdf_reader.pages)}")
 
  if self.client:
- print("AI: Comprehensive entity extraction with proper relationships...")
- tables = self._create_tables_with_ai(text, base_name, extract_entities)
+ print("AI: Extracting entities...")
 
- if tables and len(tables) > 0:
- print(f"\nCreated {len(tables)} relational tables:")
- for tbl_name in tables:
- count = pd.read_sql_query(f"SELECT COUNT(*) FROM {tbl_name}", self.conn).iloc[0, 0]
- cols = len(self.schema_info.get(tbl_name, {}))
- print(f" {tbl_name}: {count} rows, {cols} columns")
+ # Process in chunks for large documents
+ chunk_size = 10000
+ all_entities = {}
+
+ for i in range(0, len(full_text), chunk_size):
+ chunk = full_text[i:i+chunk_size]
+ chunk_num = (i // chunk_size) + 1
+ total_chunks = (len(full_text) // chunk_size) + 1
+
+ if total_chunks > 1:
+ print(f" Chunk {chunk_num}/{total_chunks}...")
+
+ entities = self._extract_chunk(chunk, extract_entities)
+
+ for entity_type, records in entities.items():
+ if entity_type not in all_entities:
+ all_entities[entity_type] = []
+ all_entities[entity_type].extend(records)
+
+ # Renumber IDs
+ for entity_type, records in all_entities.items():
+ for idx, record in enumerate(records, 1):
+ record['id'] = idx
+
+ # Create tables
+ if all_entities:
+ print(f"\nCreated {len(all_entities)} tables:")
+ for entity_type, records in all_entities.items():
+ if records:
+ table_name = f"{base_name}_{entity_type}"
+ df = pd.DataFrame(records)
+ self._store_dataframe_safe(df, table_name)
+ print(f" {entity_type}: {len(df)} records")
  return
 
- print("AI unavailable, creating simple table")
- df = self._parse_text_simple(text)
- self._store_dataframe(df, base_name)
+ print("Creating simple table")
+ self._store_dataframe(self._parse_text_simple(full_text), base_name)
 
  def _smart_upload_docx(self, path: Path, base_name: str, extract_entities: Optional[List[str]] = None):
- """Parse DOCX with proper structure."""
+ """Parse DOCX."""
  if not HAS_DOCX:
  raise ImportError("Run: pip install python-docx")
 
- print("Extracting from DOCX...")
-
  doc = docx.Document(path)
 
  if doc.tables:
- print(f"Found {len(doc.tables)} table(s)")
  for i, table in enumerate(doc.tables):
  data = [[cell.text.strip() for cell in row.cells] for row in table.rows]
  if data and len(data) > 1:
  df = pd.DataFrame(data[1:], columns=data[0])
- table_name = f"{base_name}_table_{i+1}" if len(doc.tables) > 1 else base_name
- self._store_dataframe(df, table_name)
+ self._store_dataframe(df, f"{base_name}_table_{i+1}" if len(doc.tables) > 1 else base_name)
  return
 
  text = "\n".join([para.text for para in doc.paragraphs])
 
- if self.client:
- print("AI: Analyzing...")
- tables = self._create_tables_with_ai(text, base_name, extract_entities)
-
- if tables and len(tables) > 0:
- print(f"\nCreated {len(tables)} tables:")
- for tbl_name in tables:
- count = pd.read_sql_query(f"SELECT COUNT(*) FROM {tbl_name}", self.conn).iloc[0, 0]
- cols = len(self.schema_info.get(tbl_name, {}))
- print(f" {tbl_name}: {count} rows, {cols} columns")
- return
-
- df = self._parse_text_simple(text)
- self._store_dataframe(df, base_name)
+ if self.client and len(text) > 0:
+ entities = self._extract_chunk(text, extract_entities)
+ for entity_type, records in entities.items():
+ if records:
+ df = pd.DataFrame(records)
+ self._store_dataframe_safe(df, f"{base_name}_{entity_type}")
+ else:
+ self._store_dataframe(self._parse_text_simple(text), base_name)
 
  def _smart_upload_txt(self, path: Path, base_name: str, extract_entities: Optional[List[str]] = None):
- """Parse TXT with proper structure."""
- print("Reading TXT...")
-
+ """Parse TXT."""
  with open(path, 'r', encoding='utf-8') as file:
  text = file.read()
 
- if self.client:
- print("AI: Analyzing...")
- tables = self._create_tables_with_ai(text, base_name, extract_entities)
-
- if tables and len(tables) > 0:
- print(f"\nCreated {len(tables)} tables:")
- for tbl_name in tables:
- count = pd.read_sql_query(f"SELECT COUNT(*) FROM {tbl_name}", self.conn).iloc[0, 0]
- cols = len(self.schema_info.get(tbl_name, {}))
- print(f" {tbl_name}: {count} rows, {cols} columns")
- return
-
- df = self._parse_text_simple(text)
- self._store_dataframe(df, base_name)
+ if self.client and len(text) > 0:
+ entities = self._extract_chunk(text, extract_entities)
+ for entity_type, records in entities.items():
+ if records:
+ df = pd.DataFrame(records)
+ self._store_dataframe_safe(df, f"{base_name}_{entity_type}")
+ else:
+ self._store_dataframe(self._parse_text_simple(text), base_name)
 
- def _create_tables_with_ai(self, text: str, base_name: str, custom_entities: Optional[List[str]] = None) -> List[str]:
- """
- AI extracts ALL entities with PROPER primary and foreign keys.
-
- CRITICAL: Each entity gets UNIQUE IDs, foreign keys properly link tables.
- """
+ def _extract_chunk(self, text: str, custom_entities: Optional[List[str]] = None) -> Dict:
+ """Extract entities from text chunk."""
  if not self.client:
- return []
+ return {}
 
  try:
- if custom_entities:
- entity_instruction = f"""Extract these specific entities: {', '.join(custom_entities)}
- For each entity type, create a proper table with unique IDs."""
- else:
- entity_instruction = """Automatically identify and extract ALL structured entities.
-
- Common entities (extract ALL you find):
- - people: Personal information (id, name, email, phone, address, city, state, zip)
- - skills: Individual skills (id, person_id, skill_name, proficiency_level, years_experience)
- - technologies: Technologies/tools (id, person_id, technology_name, category, proficiency)
- - projects: Projects (id, person_id, project_name, description, start_date, end_date)
- - certifications: Certifications (id, person_id, cert_name, issuer, date_obtained)
- - education: Education records (id, person_id, degree, institution, graduation_year)
- - work_experience: Work history (id, person_id, company, title, start_date, end_date)
- - events: Events/meetings (id, host_id, description, location, date, attendee_ids)
- - organizations: Companies/departments (id, name, address, city, industry)
- - products: Products/services (id, name, description, price, category)
- - ANY other structured entities you identify
-
- Extract EVERYTHING you find in the text."""
-
- extraction_prompt = f"""Analyze this text and extract ALL structured data into proper relational database tables.
+ prompt = f"""Extract ALL structured entities from this text.
 
  Text:
- {text[:6000]}
+ {text[:8000]}
 
- {entity_instruction}
+ Extract entities like: people, skills, technologies, projects, certifications, education, work_experience, events, organizations, or ANY other structured data.
 
- CRITICAL REQUIREMENTS FOR PROPER DATABASE DESIGN:
+ Return JSON with arrays. Use sequential IDs (1,2,3...). Foreign keys reference primary keys.
 
- 1. PRIMARY KEYS:
- - Each table MUST have unique sequential IDs starting from 1
- - Person 1 gets id=1, Person 2 gets id=2, etc.
- - NO DUPLICATE IDs within same table
- - IDs must be integers
-
- 2. FOREIGN KEYS:
- - Use foreign keys to link related tables
- - Example: skills table has person_id that references people.id
- - Example: projects table has person_id that references people.id
- - Foreign keys MUST match existing primary keys
-
- 3. TABLE STRUCTURE:
- - Each entity type gets its own table
- - Use clear table names (people, skills, technologies, not table1, table2)
- - Include ALL relevant attributes for each entity
-
- Return JSON with this EXACT structure:
+ Example:
  {{
- "people": [
- {{"id": 1, "name": "John Doe", "email": "john@email.com", "phone": "+1-555-0100", "city": "Dallas", "state": "TX"}},
- {{"id": 2, "name": "Jane Smith", "email": "jane@email.com", "phone": "+1-555-0101", "city": "New York", "state": "NY"}},
- ...
- ],
- "skills": [
- {{"id": 1, "person_id": 1, "skill_name": "Python", "proficiency": "Expert", "years": 5}},
- {{"id": 2, "person_id": 1, "skill_name": "SQL", "proficiency": "Advanced", "years": 3}},
- {{"id": 3, "person_id": 2, "skill_name": "Java", "proficiency": "Expert", "years": 7}},
- ...
- ],
- "technologies": [
- {{"id": 1, "person_id": 1, "technology": "React", "category": "Frontend"}},
- {{"id": 2, "person_id": 1, "technology": "PostgreSQL", "category": "Database"}},
- {{"id": 3, "person_id": 2, "technology": "Spring Boot", "category": "Backend"}},
- ...
- ],
- "projects": [
- {{"id": 1, "person_id": 1, "project_name": "E-commerce Platform", "role": "Lead Developer"}},
- {{"id": 2, "person_id": 2, "project_name": "Analytics Dashboard", "role": "Backend Engineer"}},
- ...
- ]
+ "people": [{{"id": 1, "name": "John", "email": "john@co.com", "city": "Dallas"}}, ...],
+ "skills": [{{"id": 1, "person_id": 1, "skill_name": "Python"}}, ...]
  }}
 
- IMPORTANT:
- - Extract EVERY structured piece of data you find
- - Assign UNIQUE sequential IDs (1, 2, 3, ...) for each table
- - Foreign keys MUST reference valid primary keys
- - Create as many tables as needed (don't limit yourself)
- - Return ONLY valid JSON, no explanations
- - Be COMPREHENSIVE - extract skills, technologies, projects, certifications, education, work history, etc."""
+ Return ONLY valid JSON."""
 
- response = self.client.chat.completions.create(
+ resp = self.client.chat.completions.create(
  model="gpt-4o-mini",
  messages=[
- {"role": "system", "content": "You are a database design expert. Extract ALL entities with proper primary keys (unique sequential IDs) and foreign keys (referencing valid IDs). Be comprehensive and extract EVERYTHING. Return only valid JSON."},
- {"role": "user", "content": extraction_prompt}
+ {"role": "system", "content": "Extract ALL entities with unique IDs. Return only JSON."},
+ {"role": "user", "content": prompt}
  ],
  temperature=0,
- max_tokens=4096
+ max_tokens=8000
  )
 
- json_text = response.choices[0].message.content.strip()
- json_text = json_text.replace("```json", "").replace("```", "").strip()
-
- extracted_data = json.loads(json_text)
-
- created_tables = []
-
- for entity_type, records in extracted_data.items():
- if records and isinstance(records, list) and len(records) > 0:
- table_name = f"{base_name}_{entity_type}"
-
- try:
- df = pd.DataFrame(records)
- if not df.empty:
- self._store_dataframe(df, table_name, silent=True)
- created_tables.append(table_name)
- print(f" {entity_type}: {len(df)} records")
- except Exception as e:
- print(f" Failed {entity_type}: {e}")
-
- return created_tables
-
+ json_text = resp.choices[0].message.content.strip().replace("```json", "").replace("```", "").strip()
+ return json.loads(json_text)
  except Exception as e:
- print(f"AI extraction error: {e}")
- return []
+ return {}
+
+ def _store_dataframe_safe(self, df: pd.DataFrame, name: str):
+ """Store with error handling."""
+ try:
+ df.columns = [str(c).strip().replace(" ", "_").replace("-", "_") for c in df.columns]
+ df.to_sql(name, self.conn, if_exists='replace', index=False, method='multi', chunksize=500)
+ self.conn.commit()
+ self.current_table = name
+ self._refresh_schema()
+ except:
+ df.to_sql(name, self.conn, if_exists='replace', index=False)
+ self.conn.commit()
+ self.current_table = name
+ self._refresh_schema()
 
  def _parse_text_simple(self, text: str) -> pd.DataFrame:
- """Fallback text parsing."""
+ """Simple parsing."""
  lines = [line.strip() for line in text.split('\n') if line.strip()]
-
  if not lines:
  return pd.DataFrame({'content': ['No content']})
 
- sample = lines[:min(10, len(lines))]
- for delimiter in ['\t', ',', '|', ';']:
- if all(delimiter in line for line in sample):
- try:
- df = pd.read_csv(StringIO('\n'.join(lines)), sep=delimiter)
- if len(df.columns) > 1:
- return df
- except:
- continue
-
- return pd.DataFrame({
- 'line_number': range(1, len(lines) + 1),
- 'content': lines
- })
+ return pd.DataFrame({'line_number': range(1, len(lines) + 1), 'content': lines})
 
- def _store_dataframe(self, df: pd.DataFrame, name: str, silent: bool = False):
- """Store DataFrame."""
- df.columns = [str(c).strip().replace(" ", "_").replace("-", "_") for c in df.columns]
- df.to_sql(name, self.conn, if_exists='replace', index=False)
- self.current_table = name
- self._refresh_schema()
-
- if not silent:
- print(f"Uploaded: {name}")
- print(f" {len(df)} rows, {len(df.columns)} columns")
+ def _store_dataframe(self, df: pd.DataFrame, name: str):
+ """Store."""
+ self._store_dataframe_safe(df, name)
+ print(f"Uploaded: {name} ({len(df)} rows)")
 
  def ask(self, question: str, viz: Union[bool, str] = False, table: Optional[str] = None) -> 'QueryResult':
- """Query with natural language."""
+ """Natural language query."""
  if not self.client:
- print("No API key")
  return QueryResult(False, "", pd.DataFrame(), None, "No API key")
 
  print(f"\nQuestion: {question}")
 
- if self.check_relevance:
- if not self._is_relevant_query(question):
- print("Warning: Query may be irrelevant")
- choice = input("Continue? (yes/no): ").strip().lower()
- if choice not in ['yes', 'y']:
- return QueryResult(False, "", pd.DataFrame(), None, "Irrelevant")
+ if self.check_relevance and not self._is_relevant_query(question):
+ print("Warning: Irrelevant query")
+ choice = input("Continue? (yes/no): ").strip().lower()
+ if choice not in ['yes', 'y']:
+ return QueryResult(False, "", pd.DataFrame(), None, "Irrelevant")
 
- tbl = table or self.current_table
+ tbl = table or self.current_table or (self._get_table_names()[0] if self._get_table_names() else None)
  if not tbl:
- all_tables = self._get_table_names()
- if all_tables:
- tbl = all_tables[0]
- else:
- print("No tables found")
- return QueryResult(False, "", pd.DataFrame(), None, "No table")
+ return QueryResult(False, "", pd.DataFrame(), None, "No table")
 
  if self.use_embeddings and self.embedding_model:
- cached_result = self._check_embedding_cache(question, tbl)
- if cached_result:
- print(" Using cached result")
- return cached_result
+ cached = self._check_embedding_cache(question, tbl)
+ if cached:
+ print(" Cached")
+ return cached
 
  if self.fuzzy_match:
  question = self._apply_fuzzy_matching(question, tbl)
@@ -567,7 +455,7 @@ IMPORTANT:
  fig = None
  if viz:
  viz_type = viz if isinstance(viz, str) else "auto"
- fig = self._visualize(df, question, viz_type=viz_type)
+ fig = self._visualize(df, question, viz_type)
 
  result = QueryResult(True, sql_query, df, fig)
 
@@ -584,199 +472,155 @@ IMPORTANT:
  if not self.client:
  return True
 
- tables = self._get_table_names()
- columns = []
- for tbl in tables[:3]:
- cols = list(self.schema_info.get(tbl, {}).keys())
- columns.extend(cols[:5])
-
- db_context = f"Tables: {', '.join(tables[:5])}. Columns: {', '.join(columns[:15])}"
-
  try:
- response = self.client.chat.completions.create(
+ tables = self._get_table_names()[:3]
+ cols = []
+ for tbl in tables:
+ cols.extend(list(self.schema_info.get(tbl, {}).keys())[:5])
+
+ resp = self.client.chat.completions.create(
  model="gpt-4o-mini",
  messages=[
- {"role": "system", "content": "Relevance checker. Return only 'yes' or 'no'."},
- {"role": "user", "content": f"Is this relevant to database with {db_context}?\n\nQuestion: {question}\n\nyes or no:"}
+ {"role": "system", "content": "Return 'yes' or 'no'."},
+ {"role": "user", "content": f"Relevant to DB with tables {', '.join(tables)}?\n\nQ: {question}\n\nyes/no:"}
  ],
  temperature=0,
  max_tokens=5
  )
-
- return 'yes' in response.choices[0].message.content.strip().lower()
+ return 'yes' in resp.choices[0].message.content.lower()
  except:
  return True
 
  def _apply_fuzzy_matching(self, question: str, table: str) -> str:
- """Fuzzy match query terms."""
+ """Fuzzy matching."""
  if not self.schema_info.get(table):
  return question
 
  try:
- string_cols = [col for col, dtype in self.schema_info[table].items()
- if 'TEXT' in dtype or 'VARCHAR' in dtype]
-
+ string_cols = [col for col, dtype in self.schema_info[table].items() if 'TEXT' in dtype]
  if not string_cols:
  return question
 
  for col in string_cols[:2]:
  df = pd.read_sql_query(f"SELECT DISTINCT {col} FROM {table} LIMIT 100", self.conn)
- unique_values = [str(v) for v in df[col].dropna().tolist()]
+ values = [str(v) for v in df[col].dropna().tolist()]
 
  words = question.split()
  for i, word in enumerate(words):
- matches = get_close_matches(word, unique_values, n=1, cutoff=0.6)
+ matches = get_close_matches(word, values, n=1, cutoff=0.6)
  if matches and word != matches[0]:
  words[i] = matches[0]
  print(f" Fuzzy: '{word}' -> '{matches[0]}'")
-
  question = " ".join(words)
-
  return question
  except:
  return question
 
  def _check_embedding_cache(self, question: str, table: str) -> Optional['QueryResult']:
- """Check embedding cache."""
+ """Check cache."""
  if not self.query_embeddings:
  return None
 
- q_embedding = self.embedding_model.encode([question])[0]
-
+ q_emb = self.embedding_model.encode([question])[0]
  best_match = None
- best_similarity = 0.85
+ best_sim = 0.85
 
- for cached_q, cached_data in self.query_embeddings.items():
- if cached_data['table'] != table:
+ for cached_q, data in self.query_embeddings.items():
+ if data['table'] != table:
  continue
 
- similarity = np.dot(q_embedding, cached_data['embedding']) / (
- np.linalg.norm(q_embedding) * np.linalg.norm(cached_data['embedding'])
- )
-
- if similarity > best_similarity:
- best_similarity = similarity
+ sim = np.dot(q_emb, data['embedding']) / (np.linalg.norm(q_emb) * np.linalg.norm(data['embedding']))
+ if sim > best_sim:
+ best_sim = sim
  best_match = cached_q
 
  if best_match:
- print(f" Similar query ({best_similarity:.0%}): '{best_match}'")
+ print(f" Similar ({best_sim:.0%})")
  return self.query_embeddings[best_match]['result']
 
  return None
 
  def _store_in_embedding_cache(self, question: str, table: str, result: 'QueryResult'):
- """Store in cache."""
- q_embedding = self.embedding_model.encode([question])[0]
- self.query_embeddings[question] = {
- 'table': table,
- 'embedding': q_embedding,
- 'result': result
- }
+ """Store cache."""
+ q_emb = self.embedding_model.encode([question])[0]
+ self.query_embeddings[question] = {'table': table, 'embedding': q_emb, 'result': result}
 
  def _visualize(self, df: pd.DataFrame, title: str, viz_type: str = "auto"):
- """Create visualization."""
+ """Visualize."""
  if not HAS_PLOTLY and not HAS_MATPLOTLIB:
- print("Install plotly or matplotlib")
  return None
 
  print(f"Creating {viz_type} chart...")
-
- if HAS_PLOTLY:
- return self._plotly_viz(df, title, viz_type)
- else:
- return self._matplotlib_viz(df, title, viz_type)
+ return self._plotly_viz(df, title, viz_type) if HAS_PLOTLY else self._matplotlib_viz(df, title, viz_type)
 
  def _plotly_viz(self, df: pd.DataFrame, title: str, viz_type: str):
- """Plotly visualization."""
+ """Plotly."""
  try:
- numeric = df.select_dtypes(include=[np.number]).columns.tolist()
- categorical = df.select_dtypes(include=['object']).columns.tolist()
+ num = df.select_dtypes(include=[np.number]).columns.tolist()
+ cat = df.select_dtypes(include=['object']).columns.tolist()
 
- if viz_type == "table" or len(df) == 1:
- fig = go.Figure(data=[go.Table(
- header=dict(values=list(df.columns)),
- cells=dict(values=[df[c] for c in df.columns])
- )])
- elif viz_type == "pie" and categorical and numeric:
- fig = px.pie(df, names=categorical[0], values=numeric[0], title=title)
- elif viz_type == "bar" and categorical and numeric:
- fig = px.bar(df, x=categorical[0], y=numeric[0], title=title)
- elif viz_type == "line" and numeric:
- fig = px.line(df, y=numeric[0], title=title)
- elif viz_type == "scatter" and len(numeric) >= 2:
- fig = px.scatter(df, x=numeric[0], y=numeric[1], title=title)
- elif viz_type == "heatmap" and len(numeric) >= 2:
- corr = df[numeric].corr()
- fig = go.Figure(data=go.Heatmap(
- z=corr.values, x=corr.columns, y=corr.columns, colorscale='Viridis'
- ))
+ if viz_type == "table":
+ fig = go.Figure(data=[go.Table(header=dict(values=list(df.columns)), cells=dict(values=[df[c] for c in df.columns]))])
+ elif viz_type == "pie" and cat and num:
+ fig = px.pie(df, names=cat[0], values=num[0], title=title)
+ elif viz_type == "bar" and cat and num:
+ fig = px.bar(df, x=cat[0], y=num[0], title=title)
+ elif viz_type == "line" and num:
+ fig = px.line(df, y=num[0], title=title)
+ elif viz_type == "scatter" and len(num) >= 2:
+ fig = px.scatter(df, x=num[0], y=num[1], title=title)
+ elif viz_type == "heatmap" and len(num) >= 2:
+ corr = df[num].corr()
+ fig = go.Figure(data=go.Heatmap(z=corr.values, x=corr.columns, y=corr.columns))
  fig.update_layout(title=title)
- elif viz_type == "auto":
- if categorical and numeric:
- fig = px.pie(df, names=categorical[0], values=numeric[0], title=title) if len(df) <= 10 else px.bar(df, x=categorical[0], y=numeric[0], title=title)
- elif len(numeric) >= 2:
- fig = px.line(df, y=numeric[0], title=title)
+ else:
+ if cat and num:
+ fig = px.pie(df, names=cat[0], values=num[0], title=title) if len(df) <= 10 else px.bar(df, x=cat[0], y=num[0], title=title)
  else:
  fig = px.bar(df, y=df.columns[0], title=title)
- else:
- fig = px.bar(df, x=categorical[0] if categorical else df.index, y=numeric[0] if numeric else df.columns[0], title=title)
 
  fig.show()
- print("Chart displayed")
  return fig
- except Exception as e:
- print(f"Viz error: {e}")
+ except:
  return None
 
  def _matplotlib_viz(self, df: pd.DataFrame, title: str, viz_type: str):
- """Matplotlib visualization."""
+ """Matplotlib."""
  try:
  plt.figure(figsize=(10, 6))
- numeric = df.select_dtypes(include=[np.number]).columns
+ num = df.select_dtypes(include=[np.number]).columns
 
- if viz_type == "pie" and len(numeric) > 0:
+ if viz_type == "pie":
  df[df.columns[0]].value_counts().plot(kind='pie')
- elif viz_type == "line" and len(numeric) > 0:
- df[numeric[0]].plot(kind='line')
+ elif viz_type == "line" and len(num) > 0:
+ df[num[0]].plot(kind='line')
  else:
- if len(numeric) > 0:
- df[numeric[0]].plot(kind='bar')
- else:
- df.iloc[:, 0].value_counts().plot(kind='bar')
+ (df[num[0]] if len(num) > 0 else df.iloc[:, 0].value_counts()).plot(kind='bar')
 
  plt.title(title)
  plt.tight_layout()
  plt.show()
- print("Chart displayed")
  return plt.gcf()
- except Exception as e:
- print(f"Viz error: {e}")
+ except:
  return None
 
  def tables(self) -> Dict[str, dict]:
- """List all tables."""
+ """List tables."""
  print("\n" + "="*70)
- print("TABLES IN DATABASE")
+ print("TABLES")
  print("="*70)
 
  all_tables = self._get_table_names()
-
  if not all_tables:
- print("No tables found")
+ print("No tables")
  return {}
 
  result = {}
  for i, tbl in enumerate(all_tables, 1):
- count = pd.read_sql_query(f"SELECT COUNT(*) FROM {tbl}", self.conn).iloc[0, 0]
- cols = self.schema_info.get(tbl, {})
- col_list = list(cols.keys())
-
- marker = ">" if tbl == self.current_table else " "
- print(f"{marker} {i}. {tbl}")
- print(f" {count} rows, {len(col_list)} columns")
- print(f" Columns: {', '.join(col_list[:8])}")
-
- result[tbl] = {'rows': count, 'columns': col_list}
+ cnt = pd.read_sql_query(f"SELECT COUNT(*) FROM {tbl}", self.conn).iloc[0, 0]
+ cols = list(self.schema_info.get(tbl, {}).keys())
+ print(f" {i}. {tbl}: {cnt} rows, {len(cols)} columns")
+ result[tbl] = {'rows': cnt, 'columns': cols}
 
  print("="*70)
  return result
@@ -787,77 +631,55 @@ IMPORTANT:
  self._refresh_schema()
 
  print("\n" + "="*70)
- print("DATABASE SCHEMA")
+ print("SCHEMA")
  print("="*70)
 
- tables_to_show = [table] if table else self.schema_info.keys()
-
  result = {}
- for tbl in tables_to_show:
+ for tbl in ([table] if table else self.schema_info.keys()):
  if tbl in self.schema_info:
- count = pd.read_sql_query(f"SELECT COUNT(*) FROM {tbl}", self.conn).iloc[0, 0]
- print(f"\nTable: {tbl}")
- print(f"Records: {count}")
- print("Columns:")
-
+ cnt = pd.read_sql_query(f"SELECT COUNT(*) FROM {tbl}", self.conn).iloc[0, 0]
+ print(f"\n{tbl}: {cnt} records")
  for col, dtype in self.schema_info[tbl].items():
- print(f" - {col:<30} ({dtype})")
-
- result[tbl] = {
- 'records': count,
- 'columns': self.schema_info[tbl]
- }
+ print(f" - {col:<30} {dtype}")
+ result[tbl] = {'records': cnt, 'columns': self.schema_info[tbl]}
 
  print("="*70)
  return result
 
  def peek(self, table: Optional[str] = None, n: int = 5) -> pd.DataFrame:
- """Preview data."""
+ """Preview."""
  tbl = table or self.current_table
  if not tbl:
- print("No table specified")
  return pd.DataFrame()
 
  df = pd.read_sql_query(f"SELECT * FROM {tbl} LIMIT {n}", self.conn)
- print(f"\nSample from '{tbl}' ({n} rows):")
+ print(f"\nSample from '{tbl}':")
  print(df.to_string(index=False))
  return df
 
  def info(self):
- """Database overview."""
+ """Overview."""
  return self.tables()
 
  def sql(self, query: str, viz: Union[bool, str] = False) -> 'QueryResult':
  """Execute SQL."""
- print("\nExecuting SQL...")
-
  try:
  df = pd.read_sql_query(query, self.conn)
  print(f"Success! {len(df)} rows")
-
- fig = None
- if viz:
- viz_type = viz if isinstance(viz, str) else "auto"
- fig = self._visualize(df, "SQL Result", viz_type=viz_type)
-
+ fig = self._visualize(df, "Result", viz if isinstance(viz, str) else "auto") if viz else None
  return QueryResult(True, query, df, fig)
  except Exception as e:
  print(f"Error: {e}")
  return QueryResult(False, query, pd.DataFrame(), None, str(e))
 
  def interactive(self, question: str) -> 'QueryResult':
- """Interactive query."""
- print(f"\nQuestion: {question}")
+ """Interactive."""
  choice = input("Visualize? (yes/no/pie/bar/line/scatter): ").strip().lower()
-
  viz = choice if choice in ['pie', 'bar', 'line', 'scatter', 'table', 'heatmap'] else (True if choice in ['yes', 'y'] else False)
-
  return self.ask(question, viz=viz)
 
  def export_db(self, path: str, format: str = "sqlite"):
  """Export database."""
- print(f"\nExporting to {format}...")
-
  if format == "sqlite":
  shutil.copy2(self.db_path, path)
  elif format == "sql":
@@ -865,93 +687,90 @@ IMPORTANT:
  for line in self.conn.iterdump():
  f.write(f'{line}\n')
  elif format == "json":
- data = {}
- for table in self._get_table_names():
- df = pd.read_sql_query(f"SELECT * FROM {table}", self.conn)
- data[table] = df.to_dict(orient='records')
+ data = {t: pd.read_sql_query(f"SELECT * FROM {t}", self.conn).to_dict('records') for t in self._get_table_names()}
  with open(path, 'w', encoding='utf-8') as f:
  json.dump(data, f, indent=2, default=str)
  elif format == "excel":
  with pd.ExcelWriter(path, engine='openpyxl') as writer:
- for table in self._get_table_names():
- df = pd.read_sql_query(f"SELECT * FROM {table}", self.conn)
- df.to_excel(writer, sheet_name=table[:31], index=False)
+ for t in self._get_table_names():
+ pd.read_sql_query(f"SELECT * FROM {t}", self.conn).to_excel(writer, sheet_name=t[:31], index=False)
  else:
  raise ValueError(f"Unsupported: {format}")
 
- print(f"Saved to {path}")
+ print(f"Saved: {path}")
  return self
 
- def save_to_mysql(self, host: str, user: str, password: str, database: str,
- port: int = 3306, tables: Optional[List[str]] = None):
- """Export to MySQL."""
+ def save_to_mysql(self, host: str, user: str, password: str, database: str,
+ port: int = 3306, tables: Optional[List[str]] = None,
+ auto_create: bool = True):
+ """
+ Export to MySQL - AUTO-CREATES database if not exists.
+
+ Args:
+ host: MySQL host
+ user: MySQL user
+ password: MySQL password
+ database: Database name (auto-created if not exists)
+ port: MySQL port
+ tables: Specific tables to export (None = all)
+ auto_create: Auto-create database if not exists
+ """
  try:
  from sqlalchemy import create_engine
+ import mysql.connector
  except ImportError:
  raise ImportError("Run: pip install QuerySUTRA[mysql]")
 
- print(f"\nConnecting to MySQL: {host}:{port}...")
+ print(f"Exporting to MySQL: {host}/{database}")
 
- engine = create_engine(f"mysql+mysqlconnector://{user}:{password}@{host}:{port}/{database}")
-
- tables_to_export = tables or self._get_table_names()
+ # Auto-create database if requested
+ if auto_create:
+ try:
+ temp_conn = mysql.connector.connect(host=host, user=user, password=password, port=port)
+ temp_cursor = temp_conn.cursor()
+ temp_cursor.execute(f"CREATE DATABASE IF NOT EXISTS `{database}`")
+ temp_cursor.close()
+ temp_conn.close()
+ print(f" Database '{database}' ready")
+ except Exception as e:
+ print(f" Warning: Could not auto-create database: {e}")
 
- print(f"Exporting {len(tables_to_export)} tables...")
+ engine = create_engine(f"mysql+mysqlconnector://{user}:{password}@{host}:{port}/{database}")
 
- for table in tables_to_export:
- df = pd.read_sql_query(f"SELECT * FROM {table}", self.conn)
- df.to_sql(table, engine, if_exists='replace', index=False)
- print(f" {table}: {len(df)} rows")
+ for t in (tables or self._get_table_names()):
+ df = pd.read_sql_query(f"SELECT * FROM {t}", self.conn)
+ df.to_sql(t, engine, if_exists='replace', index=False)
+ print(f" {t}: {len(df)} rows")
 
  print("Complete!")
  return self
 
- def save_to_postgres(self, host: str, user: str, password: str, database: str,
+ def save_to_postgres(self, host: str, user: str, password: str, database: str,
  port: int = 5432, tables: Optional[List[str]] = None):
  """Export to PostgreSQL."""
  try:
  from sqlalchemy import create_engine
+ engine = create_engine(f"postgresql://{user}:{password}@{host}:{port}/{database}")
+
+ print(f"Exporting to PostgreSQL...")
+ for t in (tables or self._get_table_names()):
+ df = pd.read_sql_query(f"SELECT * FROM {t}", self.conn)
+ df.to_sql(t, engine, if_exists='replace', index=False)
+ print(f" {t}: {len(df)} rows")
+ print("Complete!")
+ return self
  except ImportError:
  raise ImportError("Run: pip install QuerySUTRA[postgres]")
-
- print(f"\nConnecting to PostgreSQL: {host}:{port}...")
-
- engine = create_engine(f"postgresql://{user}:{password}@{host}:{port}/{database}")
-
- tables_to_export = tables or self._get_table_names()
-
- print(f"Exporting {len(tables_to_export)} tables...")
-
- for table in tables_to_export:
- df = pd.read_sql_query(f"SELECT * FROM {table}", self.conn)
- df.to_sql(table, engine, if_exists='replace', index=False)
- print(f" {table}: {len(df)} rows")
-
- print("Complete!")
- return self
 
- def backup(self, backup_path: str = None):
- """Create backup."""
- if backup_path:
- backup_dir = Path(backup_path)
- backup_dir.mkdir(parents=True, exist_ok=True)
- else:
- backup_dir = Path(".")
-
- timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
-
- print("\nCreating backup...")
-
- db_backup = backup_dir / f"sutra_{timestamp}.db"
- self.export_db(str(db_backup), format="sqlite")
-
- json_backup = backup_dir / f"sutra_{timestamp}.json"
- self.export_db(str(json_backup), format="json")
-
- print(f"\nBackup complete!")
- print(f" Database: {db_backup}")
- print(f" Data: {json_backup}")
+ def backup(self, path: str = None):
+ """Backup."""
+ dir = Path(path) if path else Path(".")
+ dir.mkdir(parents=True, exist_ok=True)
+ ts = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
 
+ self.export_db(str(dir / f"sutra_{ts}.db"), "sqlite")
+ self.export_db(str(dir / f"sutra_{ts}.json"), "json")
+ print("Backup complete!")
  return self
 
  def export(self, data: pd.DataFrame, path: str, format: str = "csv"):
@@ -962,17 +781,13 @@ IMPORTANT:
  data.to_excel(path, index=False)
  elif format == "json":
  data.to_json(path, orient="records", indent=2)
- else:
- raise ValueError(f"Unknown: {format}")
-
- print(f"Exported to {path}")
+ print(f"Exported: {path}")
  return self
 
  def close(self):
- """Close connection."""
+ """Close."""
  if self.conn:
  self.conn.close()
- print("Closed")
 
  def _get_table_names(self) -> List[str]:
  """Get tables."""
@@ -980,46 +795,28 @@ IMPORTANT:
  return [r[0] for r in self.cursor.fetchall()]
 
  def _refresh_schema(self):
- """Refresh schema."""
- tables = self._get_table_names()
-
+ """Refresh."""
  self.schema_info = {}
- for tbl in tables:
+ for tbl in self._get_table_names():
  self.cursor.execute(f"PRAGMA table_info({tbl})")
  self.schema_info[tbl] = {r[1]: r[2] for r in self.cursor.fetchall()}
 
  def _generate_sql(self, question: str, table: str) -> str:
  """Generate SQL."""
  schema = self.schema_info.get(table, {})
- sample_df = pd.read_sql_query(f"SELECT * FROM {table} LIMIT 3", self.conn)
- sample = sample_df.to_string(index=False)
-
+ sample = pd.read_sql_query(f"SELECT * FROM {table} LIMIT 3", self.conn).to_string(index=False)
  schema_str = ", ".join([f"{col} ({dtype})" for col, dtype in schema.items()])
 
- prompt = f"""Convert to SQL.
-
- Database: SQLite
- Table: {table}
- Columns: {schema_str}
-
- Sample:
- {sample}
-
- Question: {question}
-
- Return ONLY SQL."""
-
- response = self.client.chat.completions.create(
+ resp = self.client.chat.completions.create(
  model="gpt-4o-mini",
  messages=[
- {"role": "system", "content": "SQL expert. Return only SQL code."},
- {"role": "user", "content": prompt}
+ {"role": "system", "content": "SQL expert. Return only SQL."},
+ {"role": "user", "content": f"Table: {table}\nColumns: {schema_str}\nSample:\n{sample}\n\nQ: {question}\n\nSQL:"}
  ],
  temperature=0
  )
 
- sql = response.choices[0].message.content.strip()
- return sql.replace("```sql", "").replace("```", "").strip()
+ return resp.choices[0].message.content.strip().replace("```sql", "").replace("```", "").strip()
 
  def __enter__(self):
  return self
@@ -1028,53 +825,28 @@ Return ONLY SQL."""
  self.close()
 
  def __repr__(self):
- features = []
- if self.cache_queries:
- features.append("cache")
- if self.use_embeddings:
- features.append("embeddings")
- if self.check_relevance:
- features.append("relevance")
- if self.fuzzy_match:
- features.append("fuzzy")
-
- feat_str = f", {', '.join(features)}" if features else ""
- return f"SUTRA(tables={len(self.schema_info)}{feat_str})"
+ return f"SUTRA(tables={len(self.schema_info)})"
 
 
  class QueryResult:
- """Query result."""
-
+ """Result."""
  def __init__(self, success: bool, sql: str, data: pd.DataFrame, viz, error: str = None):
- self.success = success
- self.sql = sql
- self.data = data
- self.viz = viz
- self.error = error
+ self.success, self.sql, self.data, self.viz, self.error = success, sql, data, viz, error
 
  def __repr__(self):
- return f"QueryResult(rows={len(self.data)}, cols={len(self.data.columns)})" if self.success else f"QueryResult(error='{self.error}')"
+ return f"QueryResult(rows={len(self.data)})" if self.success else f"QueryResult(error='{self.error}')"
 
  def show(self):
- print(self.data) if self.success else print(f"Error: {self.error}")
+ print(self.data if self.success else f"Error: {self.error}")
  return self
 
 
  def quick_start(api_key: str, data_path: str, question: str, viz: Union[bool, str] = False):
- """One-liner."""
+ """Quick start."""
  with SUTRA(api_key=api_key) as sutra:
  sutra.upload(data_path)
  return sutra.ask(question, viz=viz)
 
 
  if __name__ == "__main__":
- print("""
- QuerySUTRA v0.3.3 - Professional Data Analysis
- SUTRA: Structured-Unstructured-Text-Retrieval-Architecture
-
- Fixed: Proper primary and foreign keys with unique IDs
- Features: Load existing DB, custom viz, fuzzy matching, embeddings
-
- Installation: pip install QuerySUTRA
- Usage: from sutra import SUTRA
- """)
+ print("QuerySUTRA v0.4.0 - Simple & Automatic")