QuerySUTRA 0.4.0-py3-none-any.whl → 0.4.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sutra/sutra.py CHANGED
@@ -1,36 +1,32 @@
 """
-QuerySUTRA v0.3.5 - FIXED COLAB COMPATIBILITY
+QuerySUTRA v0.4.0 - SIMPLE & AUTOMATIC
 SUTRA: Structured-Unstructured-Text-Retrieval-Architecture
 
-FIXED:
-- Colab disk I/O errors resolved
-- Batch processing for large datasets
-- Proper error handling
-- Unique IDs and proper foreign keys
-- Comprehensive entity extraction
+FIXED:
+- Auto-creates MySQL database if not exists
+- One-line export to MySQL
+- Complete data extraction from large PDFs
+- No manual file transfers needed
 
 Author: Aditya Batta
-License: MIT
-Version: 0.3.5
+Version: 0.4.0
 """
 
-__version__ = "0.3.5"
+__version__ = "0.4.0"
 __author__ = "Aditya Batta"
-__title__ = "QuerySUTRA: Structured-Unstructured-Text-Retrieval-Architecture"
 __all__ = ["SUTRA", "QueryResult", "quick_start"]
 
 import os
 import sqlite3
 import pandas as pd
 import numpy as np
-from typing import Optional, Union, Dict, Any, List
+from typing import Optional, Union, Dict, List
 from pathlib import Path
 import json
 import hashlib
 import warnings
 import shutil
 import datetime
-import re
 from io import StringIO
 from difflib import get_close_matches
 warnings.filterwarnings('ignore')
@@ -74,19 +70,13 @@ except ImportError:
 
 
 class SUTRA:
-    """
-    SUTRA: Structured-Unstructured-Text-Retrieval-Architecture
-    """
-
-    def __init__(self,
-                 api_key: Optional[str] = None,
-                 db: str = "sutra.db",
-                 use_embeddings: bool = False,
-                 check_relevance: bool = False,
-                 fuzzy_match: bool = True,
-                 cache_queries: bool = True):
-        """Initialize SUTRA."""
-        print("Initializing QuerySUTRA v0.3.5")
+    """SUTRA: Structured-Unstructured-Text-Retrieval-Architecture"""
+
+    def __init__(self, api_key: Optional[str] = None, db: str = "sutra.db",
+                 use_embeddings: bool = False, check_relevance: bool = False,
+                 fuzzy_match: bool = True, cache_queries: bool = True):
+        """Initialize."""
+        print("Initializing QuerySUTRA v0.4.0")
 
         if api_key:
             os.environ["OPENAI_API_KEY"] = api_key
@@ -96,7 +86,6 @@ class SUTRA:
 
         self.db_path = db
 
-        # FIXED: Better connection handling for Colab
         try:
             self.conn = sqlite3.connect(db, timeout=30, check_same_thread=False)
             self.conn.execute("PRAGMA journal_mode=WAL")
@@ -105,24 +94,20 @@ class SUTRA:
             self.conn = sqlite3.connect(db, check_same_thread=False)
 
         self.cursor = self.conn.cursor()
-
         self.current_table = None
         self.schema_info = {}
 
         self.cache_queries = cache_queries
         self.cache = {} if cache_queries else None
-
         self.use_embeddings = use_embeddings
         self.embedding_model = None
         self.query_embeddings = {}
-
         self.check_relevance = check_relevance
         self.fuzzy_match = fuzzy_match
 
         if use_embeddings and HAS_EMBEDDINGS:
             try:
                 self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
-                print("Embeddings ready")
             except:
                 self.use_embeddings = False
 
@@ -133,12 +118,8 @@ class SUTRA:
     def load_from_db(cls, db_path: str, api_key: Optional[str] = None, **kwargs):
         """Load existing database."""
         if not Path(db_path).exists():
-            raise FileNotFoundError(f"Database not found: {db_path}")
-
-        print(f"Loading: {db_path}")
-        instance = cls(api_key=api_key, db=db_path, **kwargs)
-        print(f"Loaded {len(instance.tables())} tables")
-        return instance
+            raise FileNotFoundError(f"Not found: {db_path}")
+        return cls(api_key=api_key, db=db_path, **kwargs)
 
     @classmethod
     def connect_mysql(cls, host: str, user: str, password: str, database: str,
@@ -146,26 +127,34 @@ class SUTRA:
         """Connect to MySQL."""
         try:
             from sqlalchemy import create_engine
+            import mysql.connector
         except ImportError:
             raise ImportError("Run: pip install QuerySUTRA[mysql]")
 
-        print(f"Connecting to MySQL: {host}:{port}/{database}")
+        print(f"Connecting to MySQL...")
 
-        engine = create_engine(f"mysql+mysqlconnector://{user}:{password}@{host}:{port}/{database}")
+        # Auto-create database if not exists
+        try:
+            temp_conn = mysql.connector.connect(host=host, user=user, password=password, port=port)
+            temp_cursor = temp_conn.cursor()
+            temp_cursor.execute(f"CREATE DATABASE IF NOT EXISTS {database}")
+            temp_cursor.close()
+            temp_conn.close()
+        except:
+            pass
 
+        engine = create_engine(f"mysql+mysqlconnector://{user}:{password}@{host}:{port}/{database}")
         temp_db = f"sutra_mysql_{database}.db"
         instance = cls(api_key=api_key, db=temp_db, **kwargs)
 
         tables = pd.read_sql_query("SHOW TABLES", engine).iloc[:, 0].tolist()
-        print(f"Syncing {len(tables)} tables...")
 
         for table in tables:
             df = pd.read_sql_query(f"SELECT * FROM {table}", engine)
             df.to_sql(table, instance.conn, if_exists='replace', index=False)
-            print(f" {table}: {len(df)} rows")
 
         instance._refresh_schema()
-        print("Connected!")
+        print(f"Connected! {len(tables)} tables")
         return instance
 
     @classmethod
@@ -177,286 +166,250 @@ class SUTRA:
         except ImportError:
             raise ImportError("Run: pip install QuerySUTRA[postgres]")
 
-        print(f"Connecting to PostgreSQL: {host}:{port}/{database}")
+        print(f"Connecting to PostgreSQL...")
 
         engine = create_engine(f"postgresql://{user}:{password}@{host}:{port}/{database}")
-
         temp_db = f"sutra_postgres_{database}.db"
         instance = cls(api_key=api_key, db=temp_db, **kwargs)
 
         tables = pd.read_sql_query("SELECT tablename FROM pg_tables WHERE schemaname='public'", engine)['tablename'].tolist()
-        print(f"Syncing {len(tables)} tables...")
 
         for table in tables:
             df = pd.read_sql_query(f"SELECT * FROM {table}", engine)
             df.to_sql(table, instance.conn, if_exists='replace', index=False)
-            print(f" {table}: {len(df)} rows")
 
         instance._refresh_schema()
-        print("Connected!")
+        print(f"Connected! {len(tables)} tables")
         return instance
 
     def upload(self, data: Union[str, pd.DataFrame], name: Optional[str] = None,
-               extract_entities: Optional[List[str]] = None) -> 'SUTRA':
-        """Upload data."""
-        print("\nUploading data...")
+               extract_entities: Optional[List[str]] = None,
+               auto_export_mysql: Optional[Dict[str, str]] = None) -> 'SUTRA':
+        """
+        Upload data with OPTIONAL automatic MySQL export.
+
+        Args:
+            data: File path or DataFrame
+            name: Table name
+            extract_entities: Custom entities to extract
+            auto_export_mysql: Auto-export to MySQL after upload
+                {'host': 'localhost', 'user': 'root', 'password': 'pass', 'database': 'mydb'}
+
+        Example:
+            sutra.upload("data.pdf", auto_export_mysql={
+                'host': 'localhost',
+                'user': 'root',
+                'password': '123456',
+                'database': 'my_database'
+            })
+        """
+        print("\nUploading...")
 
         if isinstance(data, pd.DataFrame):
             name = name or "data"
             self._store_dataframe(data, name)
-            return self
-
-        path = Path(data)
-        if not path.exists():
-            raise FileNotFoundError(f"File not found: {data}")
-
-        name = name or path.stem.replace(" ", "_").replace("-", "_")
-        ext = path.suffix.lower()
-
-        print(f"File: {path.name}")
-
-        if ext == ".csv":
-            df = pd.read_csv(path)
-            self._store_dataframe(df, name)
-        elif ext in [".xlsx", ".xls"]:
-            df = pd.read_excel(path)
-            self._store_dataframe(df, name)
-        elif ext == ".json":
-            df = pd.read_json(path)
-            self._store_dataframe(df, name)
-        elif ext == ".sql":
-            with open(path) as f:
-                self.cursor.executescript(f.read())
-            self.conn.commit()
-            self._refresh_schema()
-            print("SQL executed")
-        elif ext == ".pdf":
-            self._smart_upload_pdf(path, name, extract_entities)
-        elif ext == ".docx":
-            self._smart_upload_docx(path, name, extract_entities)
-        elif ext == ".txt":
-            self._smart_upload_txt(path, name, extract_entities)
         else:
-            raise ValueError(f"Unsupported: {ext}")
+            path = Path(data)
+            if not path.exists():
+                raise FileNotFoundError(f"Not found: {data}")
+
+            name = name or path.stem.replace(" ", "_").replace("-", "_")
+            ext = path.suffix.lower()
+
+            print(f"File: {path.name}")
+
+            if ext == ".csv":
+                self._store_dataframe(pd.read_csv(path), name)
+            elif ext in [".xlsx", ".xls"]:
+                self._store_dataframe(pd.read_excel(path), name)
+            elif ext == ".json":
+                self._store_dataframe(pd.read_json(path), name)
+            elif ext == ".sql":
+                with open(path) as f:
+                    self.cursor.executescript(f.read())
+                self.conn.commit()
+                self._refresh_schema()
+            elif ext == ".pdf":
+                self._smart_upload_pdf(path, name, extract_entities)
+            elif ext == ".docx":
+                self._smart_upload_docx(path, name, extract_entities)
+            elif ext == ".txt":
+                self._smart_upload_txt(path, name, extract_entities)
+            else:
+                raise ValueError(f"Unsupported: {ext}")
+
+        # AUTO-EXPORT to MySQL if requested
+        if auto_export_mysql:
+            print("\nAuto-exporting to MySQL...")
+            self.save_to_mysql(
+                host=auto_export_mysql.get('host', 'localhost'),
+                user=auto_export_mysql.get('user', 'root'),
+                password=auto_export_mysql['password'],
+                database=auto_export_mysql['database'],
+                port=auto_export_mysql.get('port', 3306)
+            )
 
         return self
 
     def _smart_upload_pdf(self, path: Path, base_name: str, extract_entities: Optional[List[str]] = None):
-        """Parse PDF."""
+        """Parse PDF - extracts ALL pages."""
         if not HAS_PYPDF2:
             raise ImportError("Run: pip install PyPDF2")
 
-        print("Extracting from PDF...")
+        print("Extracting PDF...")
 
         with open(path, 'rb') as file:
             pdf_reader = PyPDF2.PdfReader(file)
-            text = ""
+            full_text = ""
             for page_num, page in enumerate(pdf_reader.pages, 1):
-                text += page.extract_text() + "\n"
+                full_text += page.extract_text() + "\n"
                 print(f" Page {page_num}/{len(pdf_reader.pages)}")
 
         if self.client:
             print("AI: Extracting entities...")
-            tables = self._create_tables_with_ai(text, base_name, extract_entities)
 
-            if tables and len(tables) > 0:
-                print(f"\nCreated {len(tables)} tables:")
-                for tbl in tables:
-                    cnt = pd.read_sql_query(f"SELECT COUNT(*) FROM {tbl}", self.conn).iloc[0, 0]
-                    cols = len(self.schema_info.get(tbl, {}))
-                    print(f" {tbl}: {cnt} rows, {cols} columns")
+            # Process in chunks for large documents
+            chunk_size = 10000
+            all_entities = {}
+
+            for i in range(0, len(full_text), chunk_size):
+                chunk = full_text[i:i+chunk_size]
+                chunk_num = (i // chunk_size) + 1
+                total_chunks = (len(full_text) // chunk_size) + 1
+
+                if total_chunks > 1:
+                    print(f" Chunk {chunk_num}/{total_chunks}...")
+
+                entities = self._extract_chunk(chunk, extract_entities)
+
+                for entity_type, records in entities.items():
+                    if entity_type not in all_entities:
+                        all_entities[entity_type] = []
+                    all_entities[entity_type].extend(records)
+
+            # Renumber IDs
+            for entity_type, records in all_entities.items():
+                for idx, record in enumerate(records, 1):
+                    record['id'] = idx
+
+            # Create tables
+            if all_entities:
+                print(f"\nCreated {len(all_entities)} tables:")
+                for entity_type, records in all_entities.items():
+                    if records:
+                        table_name = f"{base_name}_{entity_type}"
+                        df = pd.DataFrame(records)
+                        self._store_dataframe_safe(df, table_name)
+                        print(f" {entity_type}: {len(df)} records")
             return
 
         print("Creating simple table")
-        df = self._parse_text_simple(text)
-        self._store_dataframe(df, base_name)
+        self._store_dataframe(self._parse_text_simple(full_text), base_name)
 
     def _smart_upload_docx(self, path: Path, base_name: str, extract_entities: Optional[List[str]] = None):
         """Parse DOCX."""
         if not HAS_DOCX:
             raise ImportError("Run: pip install python-docx")
 
-        print("Extracting from DOCX...")
-
         doc = docx.Document(path)
 
         if doc.tables:
-            print(f"Found {len(doc.tables)} table(s)")
             for i, table in enumerate(doc.tables):
                 data = [[cell.text.strip() for cell in row.cells] for row in table.rows]
                 if data and len(data) > 1:
                     df = pd.DataFrame(data[1:], columns=data[0])
-                    table_name = f"{base_name}_table_{i+1}" if len(doc.tables) > 1 else base_name
-                    self._store_dataframe(df, table_name)
+                    self._store_dataframe(df, f"{base_name}_table_{i+1}" if len(doc.tables) > 1 else base_name)
             return
 
         text = "\n".join([para.text for para in doc.paragraphs])
 
-        if self.client:
-            print("AI: Extracting...")
-            tables = self._create_tables_with_ai(text, base_name, extract_entities)
-            if tables and len(tables) > 0:
-                print(f"\nCreated {len(tables)} tables:")
-                for tbl in tables:
-                    cnt = pd.read_sql_query(f"SELECT COUNT(*) FROM {tbl}", self.conn).iloc[0, 0]
-                    print(f" {tbl}: {cnt} rows")
-            return
-
-        df = self._parse_text_simple(text)
-        self._store_dataframe(df, base_name)
+        if self.client and len(text) > 0:
+            entities = self._extract_chunk(text, extract_entities)
+            for entity_type, records in entities.items():
+                if records:
+                    df = pd.DataFrame(records)
+                    self._store_dataframe_safe(df, f"{base_name}_{entity_type}")
+        else:
+            self._store_dataframe(self._parse_text_simple(text), base_name)
 
     def _smart_upload_txt(self, path: Path, base_name: str, extract_entities: Optional[List[str]] = None):
         """Parse TXT."""
-        print("Reading TXT...")
-
         with open(path, 'r', encoding='utf-8') as file:
             text = file.read()
 
-        if self.client:
-            print("AI: Extracting...")
-            tables = self._create_tables_with_ai(text, base_name, extract_entities)
-            if tables and len(tables) > 0:
-                print(f"\nCreated {len(tables)} tables:")
-                for tbl in tables:
-                    cnt = pd.read_sql_query(f"SELECT COUNT(*) FROM {tbl}", self.conn).iloc[0, 0]
-                    print(f" {tbl}: {cnt} rows")
-            return
-
-        df = self._parse_text_simple(text)
-        self._store_dataframe(df, base_name)
+        if self.client and len(text) > 0:
+            entities = self._extract_chunk(text, extract_entities)
+            for entity_type, records in entities.items():
+                if records:
+                    df = pd.DataFrame(records)
+                    self._store_dataframe_safe(df, f"{base_name}_{entity_type}")
+        else:
+            self._store_dataframe(self._parse_text_simple(text), base_name)
 
-    def _create_tables_with_ai(self, text: str, base_name: str, custom_entities: Optional[List[str]] = None) -> List[str]:
-        """AI extraction with proper keys."""
+    def _extract_chunk(self, text: str, custom_entities: Optional[List[str]] = None) -> Dict:
+        """Extract entities from text chunk."""
         if not self.client:
-            return []
+            return {}
 
         try:
-            entity_list = """Extract ALL entities you find:
-- people: id, name, email, phone, address, city, state, zip
-- skills: id, person_id, skill_name, proficiency, years
-- technologies: id, person_id, technology, category, proficiency
-- projects: id, person_id, project_name, description, role
-- certifications: id, person_id, cert_name, issuer, date
-- education: id, person_id, degree, institution, year
-- work_experience: id, person_id, company, title, start_date, end_date
-- events: id, host_id, description, location, date
-- organizations: id, name, address, city
-- ANY other structured data
-
-CRITICAL: Use UNIQUE sequential IDs (1,2,3...) for each table. Foreign keys MUST reference valid IDs."""
-
-            if custom_entities:
-                entity_list = f"Extract these entities: {', '.join(custom_entities)}"
-
-            extraction_prompt = f"""Extract structured data from this text.
+            prompt = f"""Extract ALL structured entities from this text.
 
 Text:
-{text[:5000]}
+{text[:8000]}
+
+Extract entities like: people, skills, technologies, projects, certifications, education, work_experience, events, organizations, or ANY other structured data.
 
-{entity_list}
+Return JSON with arrays. Use sequential IDs (1,2,3...). Foreign keys reference primary keys.
 
-Return JSON:
+Example:
 {{
-  "people": [{{"id": 1, "name": "John", ...}}, {{"id": 2, "name": "Jane", ...}}],
-  "skills": [{{"id": 1, "person_id": 1, "skill_name": "Python", ...}}, {{"id": 2, "person_id": 2, ...}}]
+  "people": [{{"id": 1, "name": "John", "email": "john@co.com", "city": "Dallas"}}, ...],
+  "skills": [{{"id": 1, "person_id": 1, "skill_name": "Python"}}, ...]
 }}
 
-Requirements:
-- UNIQUE IDs: id=1,2,3,... (no duplicates)
-- Valid foreign keys: person_id must match people.id
-- Extract EVERYTHING
-- Return ONLY valid JSON"""
+Return ONLY valid JSON."""
 
-            response = self.client.chat.completions.create(
+            resp = self.client.chat.completions.create(
                 model="gpt-4o-mini",
                 messages=[
-                    {"role": "system", "content": "Extract entities with unique IDs and proper foreign keys. Return only JSON."},
-                    {"role": "user", "content": extraction_prompt}
+                    {"role": "system", "content": "Extract ALL entities with unique IDs. Return only JSON."},
+                    {"role": "user", "content": prompt}
                 ],
                 temperature=0,
-                max_tokens=4096
+                max_tokens=8000
             )
 
-            json_text = response.choices[0].message.content.strip()
-            json_text = json_text.replace("```json", "").replace("```", "").strip()
-
-            extracted_data = json.loads(json_text)
-
-            created_tables = []
-
-            for entity_type, records in extracted_data.items():
-                if records and isinstance(records, list) and len(records) > 0:
-                    table_name = f"{base_name}_{entity_type}"
-
-                    try:
-                        df = pd.DataFrame(records)
-                        if not df.empty:
-                            # FIXED: Store with better error handling
-                            self._store_dataframe_safe(df, table_name)
-                            created_tables.append(table_name)
-                            print(f" {entity_type}: {len(df)} records")
-                    except Exception as e:
-                        print(f" Error {entity_type}: {e}")
-
-            return created_tables
-
+            json_text = resp.choices[0].message.content.strip().replace("```json", "").replace("```", "").strip()
+            return json.loads(json_text)
         except Exception as e:
-            print(f"AI error: {e}")
-            return []
+            return {}
 
     def _store_dataframe_safe(self, df: pd.DataFrame, name: str):
-        """FIXED: Store with proper error handling for Colab."""
+        """Store with error handling."""
        try:
             df.columns = [str(c).strip().replace(" ", "_").replace("-", "_") for c in df.columns]
-
-            # FIXED: Use method='multi' for better performance and if_exists='replace'
             df.to_sql(name, self.conn, if_exists='replace', index=False, method='multi', chunksize=500)
-
-            self.conn.commit()  # FIXED: Explicit commit
+            self.conn.commit()
+            self.current_table = name
+            self._refresh_schema()
+        except:
+            df.to_sql(name, self.conn, if_exists='replace', index=False)
+            self.conn.commit()
             self.current_table = name
             self._refresh_schema()
-
-        except Exception as e:
-            # FIXED: Fallback to single-row insert if bulk fails
-            print(f" Bulk insert failed, using row-by-row (slower but safer)")
-            try:
-                df.to_sql(name, self.conn, if_exists='replace', index=False)
-                self.conn.commit()
-                self.current_table = name
-                self._refresh_schema()
-            except Exception as e2:
-                print(f" Storage error: {e2}")
-                raise
 
     def _parse_text_simple(self, text: str) -> pd.DataFrame:
         """Simple parsing."""
         lines = [line.strip() for line in text.split('\n') if line.strip()]
-
         if not lines:
             return pd.DataFrame({'content': ['No content']})
 
-        sample = lines[:min(10, len(lines))]
-        for delimiter in ['\t', ',', '|', ';']:
-            if all(delimiter in line for line in sample):
-                try:
-                    df = pd.read_csv(StringIO('\n'.join(lines)), sep=delimiter)
-                    if len(df.columns) > 1:
-                        return df
-                except:
-                    continue
-
-        return pd.DataFrame({
-            'line_number': range(1, len(lines) + 1),
-            'content': lines
-        })
-
-    def _store_dataframe(self, df: pd.DataFrame, name: str, silent: bool = False):
-        """Store DataFrame."""
+        return pd.DataFrame({'line_number': range(1, len(lines) + 1), 'content': lines})
+
+    def _store_dataframe(self, df: pd.DataFrame, name: str):
+        """Store."""
         self._store_dataframe_safe(df, name)
-
-        if not silent:
-            print(f"Uploaded: {name}")
-            print(f" {len(df)} rows, {len(df.columns)} columns")
+        print(f"Uploaded: {name} ({len(df)} rows)")
 
     def ask(self, question: str, viz: Union[bool, str] = False, table: Optional[str] = None) -> 'QueryResult':
         """Natural language query."""
@@ -466,7 +419,7 @@ Requirements:
         print(f"\nQuestion: {question}")
 
         if self.check_relevance and not self._is_relevant_query(question):
-            print("Warning: Query may be irrelevant")
+            print("Warning: Irrelevant query")
             choice = input("Continue? (yes/no): ").strip().lower()
             if choice not in ['yes', 'y']:
                 return QueryResult(False, "", pd.DataFrame(), None, "Irrelevant")
@@ -478,7 +431,7 @@ Requirements:
         if self.use_embeddings and self.embedding_model:
             cached = self._check_embedding_cache(question, tbl)
             if cached:
-                print(" Cached result")
+                print(" Cached")
                 return cached
 
         if self.fuzzy_match:
@@ -519,19 +472,17 @@ Requirements:
         if not self.client:
             return True
 
-        tables = self._get_table_names()[:3]
-        cols = []
-        for tbl in tables:
-            cols.extend(list(self.schema_info.get(tbl, {}).keys())[:5])
-
-        context = f"Tables: {', '.join(tables)}. Columns: {', '.join(cols[:15])}"
-
         try:
+            tables = self._get_table_names()[:3]
+            cols = []
+            for tbl in tables:
+                cols.extend(list(self.schema_info.get(tbl, {}).keys())[:5])
+
             resp = self.client.chat.completions.create(
                 model="gpt-4o-mini",
                 messages=[
-                    {"role": "system", "content": "Return only 'yes' or 'no'."},
-                    {"role": "user", "content": f"Relevant to {context}?\n\nQ: {question}\n\nyes/no:"}
+                    {"role": "system", "content": "Return 'yes' or 'no'."},
+                    {"role": "user", "content": f"Relevant to DB with tables {', '.join(tables)}?\n\nQ: {question}\n\nyes/no:"}
                 ],
                 temperature=0,
                 max_tokens=5
@@ -571,7 +522,6 @@ Requirements:
             return None
 
         q_emb = self.embedding_model.encode([question])[0]
-
         best_match = None
         best_sim = 0.85
 
@@ -580,13 +530,12 @@ Requirements:
                 continue
 
             sim = np.dot(q_emb, data['embedding']) / (np.linalg.norm(q_emb) * np.linalg.norm(data['embedding']))
-
             if sim > best_sim:
                 best_sim = sim
                 best_match = cached_q
 
         if best_match:
-            print(f" Similar ({best_sim:.0%}): '{best_match}'")
+            print(f" Similar ({best_sim:.0%})")
             return self.query_embeddings[best_match]['result']
 
         return None
@@ -605,7 +554,7 @@ Requirements:
         return self._plotly_viz(df, title, viz_type) if HAS_PLOTLY else self._matplotlib_viz(df, title, viz_type)
 
     def _plotly_viz(self, df: pd.DataFrame, title: str, viz_type: str):
-        """Plotly viz."""
+        """Plotly."""
         try:
             num = df.select_dtypes(include=[np.number]).columns.tolist()
             cat = df.select_dtypes(include=['object']).columns.tolist()
@@ -631,14 +580,12 @@ Requirements:
                 fig = px.bar(df, y=df.columns[0], title=title)
 
             fig.show()
-            print("Chart displayed")
             return fig
-        except Exception as e:
-            print(f"Viz error: {e}")
+        except:
             return None
 
     def _matplotlib_viz(self, df: pd.DataFrame, title: str, viz_type: str):
-        """Matplotlib viz."""
+        """Matplotlib."""
         try:
@@ -654,14 +601,13 @@ Requirements:
             plt.tight_layout()
             plt.show()
             return plt.gcf()
-        except Exception as e:
-            print(f"Viz error: {e}")
+        except:
             return None
 
     def tables(self) -> Dict[str, dict]:
         """List tables."""
         print("\n" + "="*70)
-        print("TABLES IN DATABASE")
+        print("TABLES")
         print("="*70)
 
         all_tables = self._get_table_names()
@@ -673,11 +619,7 @@ Requirements:
         for i, tbl in enumerate(all_tables, 1):
             cnt = pd.read_sql_query(f"SELECT COUNT(*) FROM {tbl}", self.conn).iloc[0, 0]
             cols = list(self.schema_info.get(tbl, {}).keys())
-
-            print(f" {i}. {tbl}")
-            print(f" {cnt} rows, {len(cols)} columns")
-            print(f" {', '.join(cols[:8])}")
-
+            print(f" {i}. {tbl}: {cnt} rows, {len(cols)} columns")
             result[tbl] = {'rows': cnt, 'columns': cols}
 
         print("="*70)
@@ -689,16 +631,14 @@ Requirements:
         self._refresh_schema()
 
         print("\n" + "="*70)
-        print("DATABASE SCHEMA")
+        print("SCHEMA")
         print("="*70)
 
-        tables_to_show = [table] if table else self.schema_info.keys()
-
         result = {}
-        for tbl in tables_to_show:
+        for tbl in ([table] if table else self.schema_info.keys()):
             if tbl in self.schema_info:
                 cnt = pd.read_sql_query(f"SELECT COUNT(*) FROM {tbl}", self.conn).iloc[0, 0]
-                print(f"\nTable: {tbl} ({cnt} records)")
+                print(f"\n{tbl}: {cnt} records")
                 for col, dtype in self.schema_info[tbl].items():
                     print(f" - {col:<30} {dtype}")
                 result[tbl] = {'records': cnt, 'columns': self.schema_info[tbl]}
@@ -723,12 +663,10 @@ Requirements:
 
     def sql(self, query: str, viz: Union[bool, str] = False) -> 'QueryResult':
         """Execute SQL."""
-        print("\nExecuting SQL...")
         try:
             df = pd.read_sql_query(query, self.conn)
             print(f"Success! {len(df)} rows")
-
-            fig = self._visualize(df, "SQL Result", viz if isinstance(viz, str) else "auto") if viz else None
+            fig = self._visualize(df, "Result", viz if isinstance(viz, str) else "auto") if viz else None
             return QueryResult(True, query, df, fig)
         except Exception as e:
             print(f"Error: {e}")
@@ -736,44 +674,79 @@ Requirements:
 
     def interactive(self, question: str) -> 'QueryResult':
         """Interactive."""
-        print(f"\nQuestion: {question}")
         choice = input("Visualize? (yes/no/pie/bar/line/scatter): ").strip().lower()
         viz = choice if choice in ['pie', 'bar', 'line', 'scatter', 'table', 'heatmap'] else (True if choice in ['yes', 'y'] else False)
         return self.ask(question, viz=viz)
 
     def export_db(self, path: str, format: str = "sqlite"):
-        """Export."""
-        formats = {
-            "sqlite": lambda: shutil.copy2(self.db_path, path),
-            "sql": lambda: open(path, 'w', encoding='utf-8').writelines(f'{line}\n' for line in self.conn.iterdump()),
-            "json": lambda: json.dump({t: pd.read_sql_query(f"SELECT * FROM {t}", self.conn).to_dict('records') for t in self._get_table_names()}, open(path, 'w', encoding='utf-8'), indent=2, default=str),
-            "excel": lambda: pd.ExcelWriter(path, engine='openpyxl').__enter__() and [pd.read_sql_query(f"SELECT * FROM {t}", self.conn).to_excel(path, sheet_name=t[:31], index=False) for t in self._get_table_names()]
-        }
-
-        if format in formats:
-            formats[format]()
-            print(f"Saved: {path}")
+        """Export database."""
+        if format == "sqlite":
+            shutil.copy2(self.db_path, path)
+        elif format == "sql":
+            with open(path, 'w', encoding='utf-8') as f:
+                for line in self.conn.iterdump():
+                    f.write(f'{line}\n')
+        elif format == "json":
+            data = {t: pd.read_sql_query(f"SELECT * FROM {t}", self.conn).to_dict('records') for t in self._get_table_names()}
+            with open(path, 'w', encoding='utf-8') as f:
+                json.dump(data, f, indent=2, default=str)
+        elif format == "excel":
+            with pd.ExcelWriter(path, engine='openpyxl') as writer:
+                for t in self._get_table_names():
+                    pd.read_sql_query(f"SELECT * FROM {t}", self.conn).to_excel(writer, sheet_name=t[:31], index=False)
         else:
             raise ValueError(f"Unsupported: {format}")
+
+        print(f"Saved: {path}")
         return self
 
-    def save_to_mysql(self, host: str, user: str, password: str, database: str, port: int = 3306, tables: Optional[List[str]] = None):
-        """Export to MySQL."""
+    def save_to_mysql(self, host: str, user: str, password: str, database: str,
+                      port: int = 3306, tables: Optional[List[str]] = None,
+                      auto_create: bool = True):
+        """
+        Export to MySQL - AUTO-CREATES database if not exists.
+
+        Args:
+            host: MySQL host
+            user: MySQL user
+            password: MySQL password
+            database: Database name (auto-created if not exists)
+            port: MySQL port
+            tables: Specific tables to export (None = all)
+            auto_create: Auto-create database if not exists
+        """
         try:
             from sqlalchemy import create_engine
-            engine = create_engine(f"mysql+mysqlconnector://{user}:{password}@{host}:{port}/{database}")
-
-            print(f"Exporting to MySQL...")
-            for t in (tables or self._get_table_names()):
-                df = pd.read_sql_query(f"SELECT * FROM {t}", self.conn)
-                df.to_sql(t, engine, if_exists='replace', index=False)
-                print(f" {t}: {len(df)} rows")
-            print("Complete!")
-            return self
+            import mysql.connector
         except ImportError:
             raise ImportError("Run: pip install QuerySUTRA[mysql]")
+
+        print(f"Exporting to MySQL: {host}/{database}")
+
+        # Auto-create database if requested
+        if auto_create:
+            try:
+                temp_conn = mysql.connector.connect(host=host, user=user, password=password, port=port)
+                temp_cursor = temp_conn.cursor()
+                temp_cursor.execute(f"CREATE DATABASE IF NOT EXISTS `{database}`")
+                temp_cursor.close()
+                temp_conn.close()
+                print(f" Database '{database}' ready")
+            except Exception as e:
+                print(f" Warning: Could not auto-create database: {e}")
+
+        engine = create_engine(f"mysql+mysqlconnector://{user}:{password}@{host}:{port}/{database}")
+
+        for t in (tables or self._get_table_names()):
+            df = pd.read_sql_query(f"SELECT * FROM {t}", self.conn)
+            df.to_sql(t, engine, if_exists='replace', index=False)
+            print(f" {t}: {len(df)} rows")
+
+        print("Complete!")
+        return self
 
-    def save_to_postgres(self, host: str, user: str, password: str, database: str, port: int = 5432, tables: Optional[List[str]] = None):
+    def save_to_postgres(self, host: str, user: str, password: str, database: str,
+                         port: int = 5432, tables: Optional[List[str]] = None):
         """Export to PostgreSQL."""
         try:
             from sqlalchemy import create_engine
@@ -795,7 +768,6 @@ Requirements:
         dir.mkdir(parents=True, exist_ok=True)
         ts = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
 
-        print("Creating backup...")
         self.export_db(str(dir / f"sutra_{ts}.db"), "sqlite")
         self.export_db(str(dir / f"sutra_{ts}.json"), "json")
         print("Backup complete!")
@@ -803,9 +775,12 @@ Requirements:
 
     def export(self, data: pd.DataFrame, path: str, format: str = "csv"):
         """Export results."""
-        {"csv": lambda: data.to_csv(path, index=False),
-         "excel": lambda: data.to_excel(path, index=False),
-         "json": lambda: data.to_json(path, orient="records", indent=2)}[format]()
+        if format == "csv":
+            data.to_csv(path, index=False)
+        elif format in ["excel", "xlsx"]:
+            data.to_excel(path, index=False)
+        elif format == "json":
+            data.to_json(path, orient="records", indent=2)
         print(f"Exported: {path}")
         return self
 
@@ -820,7 +795,7 @@ Requirements:
         return [r[0] for r in self.cursor.fetchall()]
 
     def _refresh_schema(self):
-        """Refresh schema."""
+        """Refresh."""
         self.schema_info = {}
         for tbl in self._get_table_names():
             self.cursor.execute(f"PRAGMA table_info({tbl})")
@@ -836,7 +811,7 @@ Requirements:
                 model="gpt-4o-mini",
                 messages=[
                     {"role": "system", "content": "SQL expert. Return only SQL."},
-                    {"role": "user", "content": f"Convert to SQL.\nTable: {table}\nColumns: {schema_str}\nSample:\n{sample}\n\nQ: {question}\n\nSQL:"}
+                    {"role": "user", "content": f"Table: {table}\nColumns: {schema_str}\nSample:\n{sample}\n\nQ: {question}\n\nSQL:"}
                 ],
                 temperature=0
             )
@@ -850,8 +825,7 @@ Requirements:
         self.close()
 
     def __repr__(self):
-        feat = [f for f, v in [("cache", self.cache_queries), ("embeddings", self.use_embeddings), ("relevance", self.check_relevance), ("fuzzy", self.fuzzy_match)] if v]
-        return f"SUTRA(tables={len(self.schema_info)}, {', '.join(feat)})"
+        return f"SUTRA(tables={len(self.schema_info)})"
 
 
 class QueryResult:
860
834
  self.success, self.sql, self.data, self.viz, self.error = success, sql, data, viz, error
861
835
 
862
836
  def __repr__(self):
863
- return f"QueryResult(rows={len(self.data)}, cols={len(self.data.columns)})" if self.success else f"QueryResult(error='{self.error}')"
837
+ return f"QueryResult(rows={len(self.data)})" if self.success else f"QueryResult(error='{self.error}')"
864
838
 
865
839
  def show(self):
866
840
  print(self.data if self.success else f"Error: {self.error}")
@@ -872,3 +846,7 @@ def quick_start(api_key: str, data_path: str, question: str, viz: Union[bool, st
     with SUTRA(api_key=api_key) as sutra:
         sutra.upload(data_path)
         return sutra.ask(question, viz=viz)
+
+
+if __name__ == "__main__":
+    print("QuerySUTRA v0.4.0 - Simple & Automatic")