QuerySUTRA 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
sutra/sutra.py CHANGED
@@ -1,21 +1,19 @@
1
1
  """
2
- QuerySUTRA v0.3.0 - ENHANCED FLEXIBLE VERSION
2
+ QuerySUTRA v0.3.3 - PROPER RELATIONAL DATABASE EXTRACTION
3
3
  SUTRA: Structured-Unstructured-Text-Retrieval-Architecture
4
4
 
5
- NEW FEATURES:
6
- Custom visualization types - USER CHOICE
7
- Load existing databases - no re-upload
8
- Smart NLP with fuzzy matching - OPTIONAL
9
- Irrelevant query detection - OPTIONAL
10
- ✅ Embeddings for caching - OPTIONAL (user decides)
11
- ✅ All features are OPTIONAL - zero hard coding
5
+ FIXED: Proper primary keys, foreign keys, and relational integrity
6
+ - Unique IDs for each entity
7
+ - Proper foreign key relationships
8
+ - No duplicate keys
9
+ - Comprehensive entity extraction (skills, technologies, projects, etc.)
12
10
 
13
11
  Author: Aditya Batta
14
12
  License: MIT
15
- Version: 0.3.0
13
+ Version: 0.3.3
16
14
  """
17
15
 
18
- __version__ = "0.3.0"
16
+ __version__ = "0.3.3"
19
17
  __author__ = "Aditya Batta"
20
18
  __title__ = "QuerySUTRA: Structured-Unstructured-Text-Retrieval-Architecture"
21
19
  __all__ = ["SUTRA", "QueryResult", "quick_start"]
@@ -24,7 +22,7 @@ import os
24
22
  import sqlite3
25
23
  import pandas as pd
26
24
  import numpy as np
27
- from typing import Optional, Union, Dict, Any, List, Literal
25
+ from typing import Optional, Union, Dict, Any, List
28
26
  from pathlib import Path
29
27
  import json
30
28
  import hashlib
@@ -78,12 +76,7 @@ class SUTRA:
78
76
  """
79
77
  SUTRA: Structured-Unstructured-Text-Retrieval-Architecture
80
78
 
81
- Enhanced with OPTIONAL features (user controls everything):
82
- - Custom visualizations (pie, bar, line, scatter, etc.)
83
- - Load existing databases (SQLite, MySQL, PostgreSQL)
84
- - Smart NLP with fuzzy matching (OPTIONAL)
85
- - Query relevance detection (OPTIONAL)
86
- - Embeddings for caching (OPTIONAL)
79
+ Professional data analysis with proper relational database structure
87
80
  """
88
81
 
89
82
  def __init__(self,
@@ -93,19 +86,9 @@ class SUTRA:
93
86
  check_relevance: bool = False,
94
87
  fuzzy_match: bool = True,
95
88
  cache_queries: bool = True):
96
- """
97
- Initialize SUTRA with OPTIONAL features.
98
-
99
- Args:
100
- api_key: OpenAI API key (optional)
101
- db: Database path (SQLite file)
102
- use_embeddings: Use embeddings for smart query caching (saves API calls)
103
- check_relevance: Check if query is relevant to database before processing
104
- fuzzy_match: Enable fuzzy matching for city names, etc. (e.g., "New York City" → "New York")
105
- cache_queries: Cache SQL queries to avoid repeated API calls
106
- """
107
- print("🚀 Initializing QuerySUTRA v0.3.0 - ENHANCED MODE")
108
- print(" SUTRA: Structured-Unstructured-Text-Retrieval-Architecture")
89
+ """Initialize SUTRA with optional features."""
90
+ print("Initializing QuerySUTRA v0.3.3")
91
+ print("SUTRA: Structured-Unstructured-Text-Retrieval-Architecture")
109
92
 
110
93
  if api_key:
111
94
  os.environ["OPENAI_API_KEY"] = api_key
@@ -120,7 +103,6 @@ class SUTRA:
120
103
  self.current_table = None
121
104
  self.schema_info = {}
122
105
 
123
- # OPTIONAL FEATURES (user decides)
124
106
  self.cache_queries = cache_queries
125
107
  self.cache = {} if cache_queries else None
126
108
 
@@ -131,126 +113,77 @@ class SUTRA:
131
113
  self.check_relevance = check_relevance
132
114
  self.fuzzy_match = fuzzy_match
133
115
 
134
- # Initialize embeddings if requested
135
116
  if use_embeddings and HAS_EMBEDDINGS:
136
117
  try:
137
- print(" 🧠 Loading embeddings model for smart caching...")
118
+ print("Loading embeddings model...")
138
119
  self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
139
- print("Embeddings ready (similar queries will use cache)")
120
+ print("Embeddings ready")
140
121
  except:
141
- print(" ⚠️ Embeddings failed, using simple cache instead")
122
+ print("Embeddings unavailable")
142
123
  self.use_embeddings = False
143
124
 
144
- # Refresh schema
145
125
  self._refresh_schema()
146
126
 
147
- print(f"Ready! Database: {db}")
148
- print(f" Cache: {'ON' if cache_queries else 'OFF'}")
149
- print(f" Embeddings: {'ON' if use_embeddings else 'OFF'}")
150
- print(f" Relevance Check: {'ON' if check_relevance else 'OFF'}")
151
- print(f" Fuzzy Match: {'ON' if fuzzy_match else 'OFF'}")
152
-
127
+ print(f"Ready! Database: {db}")
153
128
  if not self.api_key:
154
- print("⚠️ No API key - use .sql() for direct queries")
155
-
156
- # ========================================================================
157
- # NEW: LOAD EXISTING DATABASE
158
- # ========================================================================
129
+ print("No API key - use .sql() for direct queries")
159
130
 
160
131
  @classmethod
161
132
  def load_from_db(cls, db_path: str, api_key: Optional[str] = None, **kwargs):
162
- """
163
- Load existing database WITHOUT re-uploading data.
164
-
165
- Args:
166
- db_path: Path to existing SQLite database
167
- api_key: OpenAI API key
168
- **kwargs: Other options (use_embeddings, check_relevance, etc.)
169
-
170
- Returns:
171
- SUTRA instance connected to existing database
172
-
173
- Example:
174
- sutra = SUTRA.load_from_db("sutra.db", api_key="sk-...")
175
- sutra.tables() # See existing tables
176
- result = sutra.ask("Show me data") # Query immediately!
177
- """
133
+ """Load existing SQLite database."""
178
134
  if not Path(db_path).exists():
179
135
  raise FileNotFoundError(f"Database not found: {db_path}")
180
136
 
181
- print(f"📂 Loading existing database: {db_path}")
137
+ print(f"Loading database: {db_path}")
182
138
  instance = cls(api_key=api_key, db=db_path, **kwargs)
183
139
 
184
140
  tables = instance.tables()
185
- print(f"\n✅ Loaded {len(tables)} existing tables - ready to query!")
141
+ print(f"Loaded {len(tables)} tables")
186
142
 
187
143
  return instance
188
144
 
189
145
  @classmethod
190
146
  def connect_mysql(cls, host: str, user: str, password: str, database: str,
191
147
  port: int = 3306, api_key: Optional[str] = None, **kwargs):
192
- """
193
- Connect to existing MySQL database WITHOUT importing data.
194
- Query directly from MySQL!
195
-
196
- Args:
197
- host: MySQL host
198
- user: MySQL user
199
- password: MySQL password
200
- database: Database name
201
- port: MySQL port
202
- api_key: OpenAI API key
203
-
204
- Example:
205
- sutra = SUTRA.connect_mysql("localhost", "root", "pass", "mydb", api_key="sk-...")
206
- result = sutra.ask("Show me users")
207
- """
148
+ """Connect to MySQL database."""
208
149
  try:
209
150
  from sqlalchemy import create_engine
210
151
  except ImportError:
211
- raise ImportError("Run: pip install sqlalchemy mysql-connector-python")
152
+ raise ImportError("Run: pip install QuerySUTRA[mysql]")
212
153
 
213
- print(f"🔄 Connecting to MySQL: {host}:{port}/{database}")
154
+ print(f"Connecting to MySQL: {host}:{port}/{database}")
214
155
 
215
156
  connection_string = f"mysql+mysqlconnector://{user}:{password}@{host}:{port}/{database}"
216
157
 
217
- # Create temporary SQLite and sync tables
218
158
  temp_db = f"sutra_mysql_{database}.db"
219
159
  instance = cls(api_key=api_key, db=temp_db, **kwargs)
220
160
 
221
161
  engine = create_engine(connection_string)
222
162
 
223
- # Get all tables from MySQL
224
163
  tables = pd.read_sql_query("SHOW TABLES", engine).iloc[:, 0].tolist()
225
164
 
226
- print(f" Found {len(tables)} tables in MySQL")
227
- print(f" Syncing to local cache...")
165
+ print(f"Found {len(tables)} tables, syncing...")
228
166
 
229
167
  for table in tables:
230
168
  df = pd.read_sql_query(f"SELECT * FROM {table}", engine)
231
169
  df.to_sql(table, instance.conn, if_exists='replace', index=False)
232
- print(f"{table}: {len(df)} rows")
170
+ print(f" {table}: {len(df)} rows")
233
171
 
234
172
  instance._refresh_schema()
235
- print(f"\n✅ Connected! You can now query {len(tables)} MySQL tables")
173
+ print(f"Connected! {len(tables)} tables available")
236
174
 
237
175
  return instance
238
176
 
239
177
  @classmethod
240
178
  def connect_postgres(cls, host: str, user: str, password: str, database: str,
241
179
  port: int = 5432, api_key: Optional[str] = None, **kwargs):
242
- """
243
- Connect to existing PostgreSQL database WITHOUT importing data.
244
-
245
- Example:
246
- sutra = SUTRA.connect_postgres("localhost", "postgres", "pass", "mydb", api_key="sk-...")
247
- """
180
+ """Connect to PostgreSQL database."""
248
181
  try:
249
182
  from sqlalchemy import create_engine
250
183
  except ImportError:
251
- raise ImportError("Run: pip install sqlalchemy psycopg2-binary")
184
+ raise ImportError("Run: pip install QuerySUTRA[postgres]")
252
185
 
253
- print(f"🔄 Connecting to PostgreSQL: {host}:{port}/{database}")
186
+ print(f"Connecting to PostgreSQL: {host}:{port}/{database}")
254
187
 
255
188
  connection_string = f"postgresql://{user}:{password}@{host}:{port}/{database}"
256
189
 
@@ -259,32 +192,34 @@ class SUTRA:
259
192
 
260
193
  engine = create_engine(connection_string)
261
194
 
262
- # Get all tables
263
195
  tables = pd.read_sql_query(
264
196
  "SELECT tablename FROM pg_tables WHERE schemaname='public'",
265
197
  engine
266
198
  )['tablename'].tolist()
267
199
 
268
- print(f" Found {len(tables)} tables in PostgreSQL")
269
- print(f" Syncing to local cache...")
200
+ print(f"Found {len(tables)} tables, syncing...")
270
201
 
271
202
  for table in tables:
272
203
  df = pd.read_sql_query(f"SELECT * FROM {table}", engine)
273
204
  df.to_sql(table, instance.conn, if_exists='replace', index=False)
274
- print(f"{table}: {len(df)} rows")
205
+ print(f" {table}: {len(df)} rows")
275
206
 
276
207
  instance._refresh_schema()
277
- print(f"\n✅ Connected! You can now query {len(tables)} PostgreSQL tables")
208
+ print(f"Connected! {len(tables)} tables available")
278
209
 
279
210
  return instance
280
211
 
281
- # ========================================================================
282
- # UPLOAD - CREATES MULTIPLE TABLES
283
- # ========================================================================
284
-
285
- def upload(self, data: Union[str, pd.DataFrame], name: Optional[str] = None) -> 'SUTRA':
286
- """Upload data and create structured tables with AI."""
287
- print(f"\n📤 Uploading data...")
212
+ def upload(self, data: Union[str, pd.DataFrame], name: Optional[str] = None,
213
+ extract_entities: Optional[List[str]] = None) -> 'SUTRA':
214
+ """
215
+ Upload data with optional custom entity extraction.
216
+
217
+ Args:
218
+ data: File path or DataFrame
219
+ name: Table name
220
+ extract_entities: Custom entities to extract (e.g., ['skills', 'technologies'])
221
+ """
222
+ print(f"\nUploading data...")
288
223
 
289
224
  if isinstance(data, pd.DataFrame):
290
225
  name = name or "data"
@@ -298,9 +233,8 @@ class SUTRA:
298
233
  name = name or path.stem.replace(" ", "_").replace("-", "_")
299
234
  ext = path.suffix.lower()
300
235
 
301
- print(f" 📄 File: {path.name}")
236
+ print(f"File: {path.name}")
302
237
 
303
- # Load based on format
304
238
  if ext == ".csv":
305
239
  df = pd.read_csv(path)
306
240
  self._store_dataframe(df, name)
@@ -318,69 +252,63 @@ class SUTRA:
318
252
  self.cursor.executescript(f.read())
319
253
  self.conn.commit()
320
254
  self._refresh_schema()
321
- print(f"SQL executed!")
255
+ print("SQL executed")
322
256
 
323
257
  elif ext == ".pdf":
324
- self._smart_upload_pdf(path, name)
258
+ self._smart_upload_pdf(path, name, extract_entities)
325
259
 
326
260
  elif ext == ".docx":
327
- self._smart_upload_docx(path, name)
261
+ self._smart_upload_docx(path, name, extract_entities)
328
262
 
329
263
  elif ext == ".txt":
330
- self._smart_upload_txt(path, name)
264
+ self._smart_upload_txt(path, name, extract_entities)
331
265
 
332
266
  else:
333
267
  raise ValueError(f"Unsupported format: {ext}")
334
268
 
335
269
  return self
336
270
 
337
- # ========================================================================
338
- # SMART PARSING - CREATES MULTIPLE TABLES
339
- # ========================================================================
340
-
341
- def _smart_upload_pdf(self, path: Path, base_name: str):
342
- """Parse PDF and create multiple tables."""
271
+ def _smart_upload_pdf(self, path: Path, base_name: str, extract_entities: Optional[List[str]] = None):
272
+ """Parse PDF with proper relational structure."""
343
273
  if not HAS_PYPDF2:
344
- raise ImportError("PyPDF2 not installed. Run: pip install PyPDF2")
274
+ raise ImportError("Run: pip install PyPDF2")
345
275
 
346
- print(" 📑 Extracting text from PDF...")
276
+ print("Extracting text from PDF...")
347
277
 
348
278
  with open(path, 'rb') as file:
349
279
  pdf_reader = PyPDF2.PdfReader(file)
350
280
  text = ""
351
281
  for page_num, page in enumerate(pdf_reader.pages, 1):
352
282
  text += page.extract_text() + "\n"
353
- print(f" Extracted page {page_num}/{len(pdf_reader.pages)}")
283
+ print(f" Page {page_num}/{len(pdf_reader.pages)}")
354
284
 
355
285
  if self.client:
356
- print(" 🧠 AI: Analyzing and extracting structured entities...")
357
- tables = self._create_tables_with_ai(text, base_name)
286
+ print("AI: Comprehensive entity extraction with proper relationships...")
287
+ tables = self._create_tables_with_ai(text, base_name, extract_entities)
358
288
 
359
289
  if tables and len(tables) > 0:
360
- print(f"\n✅ Created {len(tables)} structured tables:")
290
+ print(f"\nCreated {len(tables)} relational tables:")
361
291
  for tbl_name in tables:
362
292
  count = pd.read_sql_query(f"SELECT COUNT(*) FROM {tbl_name}", self.conn).iloc[0, 0]
363
293
  cols = len(self.schema_info.get(tbl_name, {}))
364
- print(f" 📊 {tbl_name}: {count} rows, {cols} columns")
294
+ print(f" {tbl_name}: {count} rows, {cols} columns")
365
295
  return
366
296
 
367
- # Fallback: simple text table
368
- print(" ⚠️ AI not available, creating simple text table")
297
+ print("AI unavailable, creating simple table")
369
298
  df = self._parse_text_simple(text)
370
299
  self._store_dataframe(df, base_name)
371
300
 
372
- def _smart_upload_docx(self, path: Path, base_name: str):
373
- """Parse DOCX and create multiple tables."""
301
+ def _smart_upload_docx(self, path: Path, base_name: str, extract_entities: Optional[List[str]] = None):
302
+ """Parse DOCX with proper structure."""
374
303
  if not HAS_DOCX:
375
- raise ImportError("python-docx not installed. Run: pip install python-docx")
304
+ raise ImportError("Run: pip install python-docx")
376
305
 
377
- print(" 📄 Extracting content from DOCX...")
306
+ print("Extracting from DOCX...")
378
307
 
379
308
  doc = docx.Document(path)
380
309
 
381
- # Check for tables first
382
310
  if doc.tables:
383
- print(f" 📊 Found {len(doc.tables)} table(s)")
311
+ print(f"Found {len(doc.tables)} table(s)")
384
312
  for i, table in enumerate(doc.tables):
385
313
  data = [[cell.text.strip() for cell in row.cells] for row in table.rows]
386
314
  if data and len(data) > 1:
@@ -389,93 +317,144 @@ class SUTRA:
389
317
  self._store_dataframe(df, table_name)
390
318
  return
391
319
 
392
- # Extract text
393
320
  text = "\n".join([para.text for para in doc.paragraphs])
394
321
 
395
322
  if self.client:
396
- print(" 🧠 AI: Analyzing and extracting structured entities...")
397
- tables = self._create_tables_with_ai(text, base_name)
323
+ print("AI: Analyzing...")
324
+ tables = self._create_tables_with_ai(text, base_name, extract_entities)
398
325
 
399
326
  if tables and len(tables) > 0:
400
- print(f"\n✅ Created {len(tables)} structured tables:")
327
+ print(f"\nCreated {len(tables)} tables:")
401
328
  for tbl_name in tables:
402
329
  count = pd.read_sql_query(f"SELECT COUNT(*) FROM {tbl_name}", self.conn).iloc[0, 0]
403
330
  cols = len(self.schema_info.get(tbl_name, {}))
404
- print(f" 📊 {tbl_name}: {count} rows, {cols} columns")
331
+ print(f" {tbl_name}: {count} rows, {cols} columns")
405
332
  return
406
333
 
407
334
  df = self._parse_text_simple(text)
408
335
  self._store_dataframe(df, base_name)
409
336
 
410
- def _smart_upload_txt(self, path: Path, base_name: str):
411
- """Parse TXT and create multiple tables."""
412
- print(" 📝 Reading TXT file...")
337
+ def _smart_upload_txt(self, path: Path, base_name: str, extract_entities: Optional[List[str]] = None):
338
+ """Parse TXT with proper structure."""
339
+ print("Reading TXT...")
413
340
 
414
341
  with open(path, 'r', encoding='utf-8') as file:
415
342
  text = file.read()
416
343
 
417
344
  if self.client:
418
- print(" 🧠 AI: Analyzing and extracting structured entities...")
419
- tables = self._create_tables_with_ai(text, base_name)
345
+ print("AI: Analyzing...")
346
+ tables = self._create_tables_with_ai(text, base_name, extract_entities)
420
347
 
421
348
  if tables and len(tables) > 0:
422
- print(f"\n✅ Created {len(tables)} structured tables:")
349
+ print(f"\nCreated {len(tables)} tables:")
423
350
  for tbl_name in tables:
424
351
  count = pd.read_sql_query(f"SELECT COUNT(*) FROM {tbl_name}", self.conn).iloc[0, 0]
425
352
  cols = len(self.schema_info.get(tbl_name, {}))
426
- print(f" 📊 {tbl_name}: {count} rows, {cols} columns")
353
+ print(f" {tbl_name}: {count} rows, {cols} columns")
427
354
  return
428
355
 
429
356
  df = self._parse_text_simple(text)
430
357
  self._store_dataframe(df, base_name)
431
358
 
432
- def _create_tables_with_ai(self, text: str, base_name: str) -> List[str]:
433
- """Use AI to extract entities and create multiple tables."""
359
+ def _create_tables_with_ai(self, text: str, base_name: str, custom_entities: Optional[List[str]] = None) -> List[str]:
360
+ """
361
+ AI extracts ALL entities with PROPER primary and foreign keys.
362
+
363
+ CRITICAL: Each entity gets UNIQUE IDs, foreign keys properly link tables.
364
+ """
434
365
  if not self.client:
435
366
  return []
436
367
 
437
368
  try:
438
- extraction_prompt = f"""Extract ALL structured data from this text into separate entity tables.
369
+ if custom_entities:
370
+ entity_instruction = f"""Extract these specific entities: {', '.join(custom_entities)}
371
+ For each entity type, create a proper table with unique IDs."""
372
+ else:
373
+ entity_instruction = """Automatically identify and extract ALL structured entities.
374
+
375
+ Common entities (extract ALL you find):
376
+ - people: Personal information (id, name, email, phone, address, city, state, zip)
377
+ - skills: Individual skills (id, person_id, skill_name, proficiency_level, years_experience)
378
+ - technologies: Technologies/tools (id, person_id, technology_name, category, proficiency)
379
+ - projects: Projects (id, person_id, project_name, description, start_date, end_date)
380
+ - certifications: Certifications (id, person_id, cert_name, issuer, date_obtained)
381
+ - education: Education records (id, person_id, degree, institution, graduation_year)
382
+ - work_experience: Work history (id, person_id, company, title, start_date, end_date)
383
+ - events: Events/meetings (id, host_id, description, location, date, attendee_ids)
384
+ - organizations: Companies/departments (id, name, address, city, industry)
385
+ - products: Products/services (id, name, description, price, category)
386
+ - ANY other structured entities you identify
387
+
388
+ Extract EVERYTHING you find in the text."""
389
+
390
+ extraction_prompt = f"""Analyze this text and extract ALL structured data into proper relational database tables.
439
391
 
440
392
  Text:
441
- {text[:4000]}
393
+ {text[:6000]}
394
+
395
+ {entity_instruction}
396
+
397
+ CRITICAL REQUIREMENTS FOR PROPER DATABASE DESIGN:
398
+
399
+ 1. PRIMARY KEYS:
400
+ - Each table MUST have unique sequential IDs starting from 1
401
+ - Person 1 gets id=1, Person 2 gets id=2, etc.
402
+ - NO DUPLICATE IDs within same table
403
+ - IDs must be integers
404
+
405
+ 2. FOREIGN KEYS:
406
+ - Use foreign keys to link related tables
407
+ - Example: skills table has person_id that references people.id
408
+ - Example: projects table has person_id that references people.id
409
+ - Foreign keys MUST match existing primary keys
442
410
 
443
- Create these types of tables (if data exists):
444
- 1. people - (id, name, address, city, state, zip, email, phone)
445
- 2. contacts - (id, person_id, contact_type, value)
446
- 3. events - (id, host_id, description, location, city)
447
- 4. organizations - (id, name, address, city)
448
- 5. Any other entities you find
411
+ 3. TABLE STRUCTURE:
412
+ - Each entity type gets its own table
413
+ - Use clear table names (people, skills, technologies, not table1, table2)
414
+ - Include ALL relevant attributes for each entity
449
415
 
450
- Return a JSON object with this EXACT structure:
416
+ Return JSON with this EXACT structure:
451
417
  {{
452
418
  "people": [
453
- {{"id": 1, "name": "John Doe", "address": "123 Main St", "city": "Dallas", "email": "john@email.com", "phone": "555-1234"}},
419
+ {{"id": 1, "name": "John Doe", "email": "john@email.com", "phone": "+1-555-0100", "city": "Dallas", "state": "TX"}},
420
+ {{"id": 2, "name": "Jane Smith", "email": "jane@email.com", "phone": "+1-555-0101", "city": "New York", "state": "NY"}},
454
421
  ...
455
422
  ],
456
- "contacts": [
457
- {{"id": 1, "person_id": 1, "email": "john@email.com", "phone": "555-1234"}},
423
+ "skills": [
424
+ {{"id": 1, "person_id": 1, "skill_name": "Python", "proficiency": "Expert", "years": 5}},
425
+ {{"id": 2, "person_id": 1, "skill_name": "SQL", "proficiency": "Advanced", "years": 3}},
426
+ {{"id": 3, "person_id": 2, "skill_name": "Java", "proficiency": "Expert", "years": 7}},
458
427
  ...
459
428
  ],
460
- "events": [
461
- {{"id": 1, "host_id": 1, "description": "Team meeting", "city": "Dallas"}},
429
+ "technologies": [
430
+ {{"id": 1, "person_id": 1, "technology": "React", "category": "Frontend"}},
431
+ {{"id": 2, "person_id": 1, "technology": "PostgreSQL", "category": "Database"}},
432
+ {{"id": 3, "person_id": 2, "technology": "Spring Boot", "category": "Backend"}},
433
+ ...
434
+ ],
435
+ "projects": [
436
+ {{"id": 1, "person_id": 1, "project_name": "E-commerce Platform", "role": "Lead Developer"}},
437
+ {{"id": 2, "person_id": 2, "project_name": "Analytics Dashboard", "role": "Backend Engineer"}},
462
438
  ...
463
439
  ]
464
440
  }}
465
441
 
466
442
  IMPORTANT:
467
- - Extract ALL people, contacts, events, organizations you find
468
- - Use consistent column names
443
+ - Extract EVERY structured piece of data you find
444
+ - Assign UNIQUE sequential IDs (1, 2, 3, ...) for each table
445
+ - Foreign keys MUST reference valid primary keys
446
+ - Create as many tables as needed (don't limit yourself)
469
447
  - Return ONLY valid JSON, no explanations
470
- - If a table type has no data, omit it from JSON"""
448
+ - Be COMPREHENSIVE - extract skills, technologies, projects, certifications, education, work history, etc."""
471
449
 
472
450
  response = self.client.chat.completions.create(
473
451
  model="gpt-4o-mini",
474
452
  messages=[
475
- {"role": "system", "content": "You are a data extraction expert. Extract ALL entities from text into structured JSON tables. Return only valid JSON."},
453
+ {"role": "system", "content": "You are a database design expert. Extract ALL entities with proper primary keys (unique sequential IDs) and foreign keys (referencing valid IDs). Be comprehensive and extract EVERYTHING. Return only valid JSON."},
476
454
  {"role": "user", "content": extraction_prompt}
477
455
  ],
478
- temperature=0
456
+ temperature=0,
457
+ max_tokens=4096
479
458
  )
480
459
 
481
460
  json_text = response.choices[0].message.content.strip()
@@ -485,7 +464,6 @@ IMPORTANT:
485
464
 
486
465
  created_tables = []
487
466
 
488
- # Create tables from extracted data
489
467
  for entity_type, records in extracted_data.items():
490
468
  if records and isinstance(records, list) and len(records) > 0:
491
469
  table_name = f"{base_name}_{entity_type}"
@@ -495,24 +473,23 @@ IMPORTANT:
495
473
  if not df.empty:
496
474
  self._store_dataframe(df, table_name, silent=True)
497
475
  created_tables.append(table_name)
498
- print(f"{entity_type}: {len(df)} records")
476
+ print(f" {entity_type}: {len(df)} records")
499
477
  except Exception as e:
500
- print(f" ⚠️ Failed to create {entity_type}: {e}")
478
+ print(f" Failed {entity_type}: {e}")
501
479
 
502
480
  return created_tables
503
481
 
504
482
  except Exception as e:
505
- print(f" ⚠️ AI extraction error: {e}")
483
+ print(f"AI extraction error: {e}")
506
484
  return []
507
485
 
508
486
  def _parse_text_simple(self, text: str) -> pd.DataFrame:
509
- """Simple text to DataFrame (fallback)."""
487
+ """Fallback text parsing."""
510
488
  lines = [line.strip() for line in text.split('\n') if line.strip()]
511
489
 
512
490
  if not lines:
513
491
  return pd.DataFrame({'content': ['No content']})
514
492
 
515
- # Try to detect if it's tabular
516
493
  sample = lines[:min(10, len(lines))]
517
494
  for delimiter in ['\t', ',', '|', ';']:
518
495
  if all(delimiter in line for line in sample):
@@ -529,98 +506,63 @@ IMPORTANT:
529
506
  })
530
507
 
531
508
  def _store_dataframe(self, df: pd.DataFrame, name: str, silent: bool = False):
532
- """Store DataFrame in database."""
509
+ """Store DataFrame."""
533
510
  df.columns = [str(c).strip().replace(" ", "_").replace("-", "_") for c in df.columns]
534
511
  df.to_sql(name, self.conn, if_exists='replace', index=False)
535
512
  self.current_table = name
536
513
  self._refresh_schema()
537
514
 
538
515
  if not silent:
539
- print(f"Uploaded to table: {name}")
540
- print(f" 📊 {len(df)} rows × {len(df.columns)} columns")
541
- print(f" 🔤 Columns: {', '.join(df.columns[:10].tolist())}{' ...' if len(df.columns) > 10 else ''}")
516
+ print(f"Uploaded: {name}")
517
+ print(f" {len(df)} rows, {len(df.columns)} columns")
542
518
 
543
- # ========================================================================
544
- # NEW: FLEXIBLE QUERY WITH CUSTOM VIZ AND RELEVANCE CHECK
545
- # ========================================================================
546
-
547
- def ask(self, question: str,
548
- viz: Union[bool, str] = False,
549
- table: Optional[str] = None) -> 'QueryResult':
550
- """
551
- Ask question with FLEXIBLE visualization options.
552
-
553
- Args:
554
- question: Natural language question
555
- viz: Visualization type:
556
- - False: No visualization
557
- - True: Auto-detect best chart
558
- - "pie": Pie chart
559
- - "bar": Bar chart
560
- - "line": Line chart
561
- - "scatter": Scatter plot
562
- - "table": Table view
563
- - "heatmap": Heatmap
564
- table: Specific table to query (optional)
565
-
566
- Examples:
567
- result = sutra.ask("How many people in each city?")
568
- result = sutra.ask("Show sales by month", viz="line")
569
- result = sutra.ask("Revenue breakdown", viz="pie")
570
- result = sutra.ask("Compare metrics", viz="bar")
571
- """
519
+ def ask(self, question: str, viz: Union[bool, str] = False, table: Optional[str] = None) -> 'QueryResult':
520
+ """Query with natural language."""
572
521
  if not self.client:
573
- print("No API key configured")
522
+ print("No API key")
574
523
  return QueryResult(False, "", pd.DataFrame(), None, "No API key")
575
524
 
576
- print(f"\n🔍 Question: {question}")
525
+ print(f"\nQuestion: {question}")
577
526
 
578
- # NEW: Check relevance if enabled
579
527
  if self.check_relevance:
580
528
  if not self._is_relevant_query(question):
581
- print("⚠️ This question seems irrelevant to your database")
582
- print(" Database contains tables about:", ", ".join(self._get_table_names()[:5]))
583
-
584
- choice = input(" Continue anyway? (yes/no): ").strip().lower()
585
- if choice not in ['yes', 'y', 'yeah', 'yep', 'sure']:
586
- return QueryResult(False, "", pd.DataFrame(), None, "Query not relevant to database")
529
+ print("Warning: Query may be irrelevant")
530
+ choice = input("Continue? (yes/no): ").strip().lower()
531
+ if choice not in ['yes', 'y']:
532
+ return QueryResult(False, "", pd.DataFrame(), None, "Irrelevant")
587
533
 
588
- # Select table
589
534
  tbl = table or self.current_table
590
535
  if not tbl:
591
536
  all_tables = self._get_table_names()
592
537
  if all_tables:
593
538
  tbl = all_tables[0]
594
539
  else:
595
- print("No tables found")
540
+ print("No tables found")
596
541
  return QueryResult(False, "", pd.DataFrame(), None, "No table")
597
542
 
598
- # NEW: Check embeddings cache if enabled
599
543
  if self.use_embeddings and self.embedding_model:
600
544
  cached_result = self._check_embedding_cache(question, tbl)
601
545
  if cached_result:
602
- print("Using cached result from similar query")
546
+ print(" Using cached result")
603
547
  return cached_result
604
548
 
605
- # NEW: Apply fuzzy matching to question if enabled
606
549
  if self.fuzzy_match:
607
550
  question = self._apply_fuzzy_matching(question, tbl)
608
551
 
609
- # Check simple cache
610
552
  cache_key = hashlib.md5(f"{question}:{tbl}".encode()).hexdigest()
611
- if self.cache_queries and cache_key in self.cache:
553
+ if self.cache_queries and self.cache and cache_key in self.cache:
612
554
  sql_query = self.cache[cache_key]
613
- print(" 💾 From cache")
555
+ print(" From cache")
614
556
  else:
615
557
  sql_query = self._generate_sql(question, tbl)
616
- if self.cache_queries:
558
+ if self.cache_queries and self.cache is not None:
617
559
  self.cache[cache_key] = sql_query
618
560
 
619
- print(f" 📝 SQL: {sql_query}")
561
+ print(f"SQL: {sql_query}")
620
562
 
621
563
  try:
622
564
  df = pd.read_sql_query(sql_query, self.conn)
623
- print(f"Success! {len(df)} rows")
565
+ print(f"Success! {len(df)} rows")
624
566
 
625
567
  fig = None
626
568
  if viz:
@@ -629,30 +571,24 @@ IMPORTANT:
629
571
 
630
572
  result = QueryResult(True, sql_query, df, fig)
631
573
 
632
- # Store in embedding cache if enabled
633
574
  if self.use_embeddings and self.embedding_model:
634
575
  self._store_in_embedding_cache(question, tbl, result)
635
576
 
636
577
  return result
637
578
  except Exception as e:
638
- print(f"Error: {e}")
579
+ print(f"Error: {e}")
639
580
  return QueryResult(False, sql_query, pd.DataFrame(), None, str(e))
640
581
 
641
- # ========================================================================
642
- # NEW: RELEVANCE CHECK
643
- # ========================================================================
644
-
645
582
  def _is_relevant_query(self, question: str) -> bool:
646
- """Check if query is relevant to database (OPTIONAL feature)."""
583
+ """Check relevance."""
647
584
  if not self.client:
648
585
  return True
649
586
 
650
- # Get database context
651
587
  tables = self._get_table_names()
652
588
  columns = []
653
- for tbl in tables[:3]: # First 3 tables
589
+ for tbl in tables[:3]:
654
590
  cols = list(self.schema_info.get(tbl, {}).keys())
655
- columns.extend(cols[:5]) # First 5 columns
591
+ columns.extend(cols[:5])
656
592
 
657
593
  db_context = f"Tables: {', '.join(tables[:5])}. Columns: {', '.join(columns[:15])}"
658
594
 
@@ -660,31 +596,22 @@ IMPORTANT:
660
596
  response = self.client.chat.completions.create(
661
597
  model="gpt-4o-mini",
662
598
  messages=[
663
- {"role": "system", "content": "You are a query relevance checker. Return only 'yes' or 'no'."},
664
- {"role": "user", "content": f"Is this question relevant to a database with {db_context}?\n\nQuestion: {question}\n\nAnswer only 'yes' or 'no':"}
599
+ {"role": "system", "content": "Relevance checker. Return only 'yes' or 'no'."},
600
+ {"role": "user", "content": f"Is this relevant to database with {db_context}?\n\nQuestion: {question}\n\nyes or no:"}
665
601
  ],
666
602
  temperature=0,
667
603
  max_tokens=5
668
604
  )
669
605
 
670
- answer = response.choices[0].message.content.strip().lower()
671
- return 'yes' in answer
606
+ return 'yes' in response.choices[0].message.content.strip().lower()
672
607
  except:
673
- return True # If check fails, allow query
674
-
675
- # ========================================================================
676
- # NEW: FUZZY MATCHING FOR BETTER NLP
677
- # ========================================================================
608
+ return True
678
609
 
679
610
  def _apply_fuzzy_matching(self, question: str, table: str) -> str:
680
- """
681
- Apply fuzzy matching to improve NLP understanding.
682
- Example: "New York City" → finds "New York" in database
683
- """
611
+ """Fuzzy match query terms."""
684
612
  if not self.schema_info.get(table):
685
613
  return question
686
614
 
687
- # Get all unique values from string columns
688
615
  try:
689
616
  string_cols = [col for col, dtype in self.schema_info[table].items()
690
617
  if 'TEXT' in dtype or 'VARCHAR' in dtype]
@@ -692,41 +619,32 @@ IMPORTANT:
692
619
  if not string_cols:
693
620
  return question
694
621
 
695
- # Get unique values from first string column (usually city, name, etc.)
696
- col = string_cols[0]
697
- df = pd.read_sql_query(f"SELECT DISTINCT {col} FROM {table} LIMIT 100", self.conn)
698
- unique_values = df[col].dropna().tolist()
699
-
700
- # Find words in question that might match database values
701
- words_in_question = question.split()
702
-
703
- for i, word in enumerate(words_in_question):
704
- # Try to find close matches
705
- matches = get_close_matches(word, unique_values, n=1, cutoff=0.6)
706
- if matches:
707
- # Replace with closest match
708
- words_in_question[i] = matches[0]
709
- print(f" 🔍 Fuzzy match: '{word}' → '{matches[0]}'")
622
+ for col in string_cols[:2]:
623
+ df = pd.read_sql_query(f"SELECT DISTINCT {col} FROM {table} LIMIT 100", self.conn)
624
+ unique_values = [str(v) for v in df[col].dropna().tolist()]
625
+
626
+ words = question.split()
627
+ for i, word in enumerate(words):
628
+ matches = get_close_matches(word, unique_values, n=1, cutoff=0.6)
629
+ if matches and word != matches[0]:
630
+ words[i] = matches[0]
631
+ print(f" Fuzzy: '{word}' -> '{matches[0]}'")
632
+
633
+ question = " ".join(words)
710
634
 
711
- return " ".join(words_in_question)
635
+ return question
712
636
  except:
713
637
  return question
714
638
 
715
- # ========================================================================
716
- # NEW: EMBEDDING-BASED CACHE
717
- # ========================================================================
718
-
719
639
  def _check_embedding_cache(self, question: str, table: str) -> Optional['QueryResult']:
720
- """Check if similar query exists in cache using embeddings."""
640
+ """Check embedding cache."""
721
641
  if not self.query_embeddings:
722
642
  return None
723
643
 
724
- # Get embedding for current question
725
644
  q_embedding = self.embedding_model.encode([question])[0]
726
645
 
727
- # Find most similar cached query
728
646
  best_match = None
729
- best_similarity = 0.85 # Threshold
647
+ best_similarity = 0.85
730
648
 
731
649
  for cached_q, cached_data in self.query_embeddings.items():
732
650
  if cached_data['table'] != table:
@@ -741,39 +659,27 @@ IMPORTANT:
741
659
  best_match = cached_q
742
660
 
743
661
  if best_match:
744
- print(f" 🎯 Found similar query ({best_similarity:.1%} match): '{best_match}'")
662
+ print(f" Similar query ({best_similarity:.0%}): '{best_match}'")
745
663
  return self.query_embeddings[best_match]['result']
746
664
 
747
665
  return None
748
666
 
749
667
  def _store_in_embedding_cache(self, question: str, table: str, result: 'QueryResult'):
750
- """Store query result in embedding cache."""
668
+ """Store in cache."""
751
669
  q_embedding = self.embedding_model.encode([question])[0]
752
-
753
670
  self.query_embeddings[question] = {
754
671
  'table': table,
755
672
  'embedding': q_embedding,
756
673
  'result': result
757
674
  }
758
675
 
759
- # ========================================================================
760
- # NEW: FLEXIBLE VISUALIZATION
761
- # ========================================================================
762
-
763
676
  def _visualize(self, df: pd.DataFrame, title: str, viz_type: str = "auto"):
764
- """
765
- Create flexible visualization based on user choice.
766
-
767
- Args:
768
- df: Data to visualize
769
- title: Chart title
770
- viz_type: Type of visualization (auto, pie, bar, line, scatter, table, heatmap)
771
- """
677
+ """Create visualization."""
772
678
  if not HAS_PLOTLY and not HAS_MATPLOTLIB:
773
- print("⚠️ Install plotly or matplotlib for visualizations")
679
+ print("Install plotly or matplotlib")
774
680
  return None
775
681
 
776
- print(f"📊 Creating {viz_type} chart...")
682
+ print(f"Creating {viz_type} chart...")
777
683
 
778
684
  if HAS_PLOTLY:
779
685
  return self._plotly_viz(df, title, viz_type)
@@ -781,7 +687,7 @@ IMPORTANT:
781
687
  return self._matplotlib_viz(df, title, viz_type)
782
688
 
783
689
  def _plotly_viz(self, df: pd.DataFrame, title: str, viz_type: str):
784
- """Create Plotly chart with user-specified type."""
690
+ """Plotly visualization."""
785
691
  try:
786
692
  numeric = df.select_dtypes(include=[np.number]).columns.tolist()
787
693
  categorical = df.select_dtypes(include=['object']).columns.tolist()
@@ -791,57 +697,39 @@ IMPORTANT:
791
697
  header=dict(values=list(df.columns)),
792
698
  cells=dict(values=[df[c] for c in df.columns])
793
699
  )])
794
-
795
700
  elif viz_type == "pie" and categorical and numeric:
796
701
  fig = px.pie(df, names=categorical[0], values=numeric[0], title=title)
797
-
798
702
  elif viz_type == "bar" and categorical and numeric:
799
703
  fig = px.bar(df, x=categorical[0], y=numeric[0], title=title)
800
-
801
704
  elif viz_type == "line" and numeric:
802
705
  fig = px.line(df, y=numeric[0], title=title)
803
-
804
706
  elif viz_type == "scatter" and len(numeric) >= 2:
805
707
  fig = px.scatter(df, x=numeric[0], y=numeric[1], title=title)
806
-
807
708
  elif viz_type == "heatmap" and len(numeric) >= 2:
808
- # Create correlation heatmap
809
709
  corr = df[numeric].corr()
810
710
  fig = go.Figure(data=go.Heatmap(
811
- z=corr.values,
812
- x=corr.columns,
813
- y=corr.columns,
814
- colorscale='Viridis'
711
+ z=corr.values, x=corr.columns, y=corr.columns, colorscale='Viridis'
815
712
  ))
816
713
  fig.update_layout(title=title)
817
-
818
714
  elif viz_type == "auto":
819
- # Auto-detect best chart
820
715
  if categorical and numeric:
821
- if len(df) <= 10:
822
- fig = px.pie(df, names=categorical[0], values=numeric[0], title=title)
823
- else:
824
- fig = px.bar(df, x=categorical[0], y=numeric[0], title=title)
716
+ fig = px.pie(df, names=categorical[0], values=numeric[0], title=title) if len(df) <= 10 else px.bar(df, x=categorical[0], y=numeric[0], title=title)
825
717
  elif len(numeric) >= 2:
826
718
  fig = px.line(df, y=numeric[0], title=title)
827
719
  else:
828
720
  fig = px.bar(df, y=df.columns[0], title=title)
829
721
  else:
830
- # Default to bar
831
- if categorical and numeric:
832
- fig = px.bar(df, x=categorical[0], y=numeric[0], title=title)
833
- else:
834
- fig = px.bar(df, y=df.columns[0], title=title)
722
+ fig = px.bar(df, x=categorical[0] if categorical else df.index, y=numeric[0] if numeric else df.columns[0], title=title)
835
723
 
836
724
  fig.show()
837
- print("Chart displayed")
725
+ print("Chart displayed")
838
726
  return fig
839
727
  except Exception as e:
840
- print(f"⚠️ Viz error: {e}")
728
+ print(f"Viz error: {e}")
841
729
  return None
842
730
 
843
731
  def _matplotlib_viz(self, df: pd.DataFrame, title: str, viz_type: str):
844
- """Create Matplotlib chart."""
732
+ """Matplotlib visualization."""
845
733
  try:
846
734
  plt.figure(figsize=(10, 6))
847
735
  numeric = df.select_dtypes(include=[np.number]).columns
@@ -859,26 +747,22 @@ IMPORTANT:
859
747
  plt.title(title)
860
748
  plt.tight_layout()
861
749
  plt.show()
862
- print("Chart displayed")
750
+ print("Chart displayed")
863
751
  return plt.gcf()
864
752
  except Exception as e:
865
- print(f"⚠️ Viz error: {e}")
753
+ print(f"Viz error: {e}")
866
754
  return None
867
755
 
868
- # ========================================================================
869
- # VIEW DATABASE
870
- # ========================================================================
871
-
872
756
  def tables(self) -> Dict[str, dict]:
873
- """Show all tables with details."""
757
+ """List all tables."""
874
758
  print("\n" + "="*70)
875
- print("📋 TABLES IN DATABASE")
759
+ print("TABLES IN DATABASE")
876
760
  print("="*70)
877
761
 
878
762
  all_tables = self._get_table_names()
879
763
 
880
764
  if not all_tables:
881
- print("No tables found")
765
+ print("No tables found")
882
766
  return {}
883
767
 
884
768
  result = {}
@@ -887,26 +771,23 @@ IMPORTANT:
887
771
  cols = self.schema_info.get(tbl, {})
888
772
  col_list = list(cols.keys())
889
773
 
890
- marker = "👉" if tbl == self.current_table else " "
891
- print(f"{marker} {i}. Table: {tbl}")
892
- print(f" 📊 {count} rows, {len(col_list)} columns")
893
- print(f" 🔤 Columns: {', '.join(col_list[:8])}{' ...' if len(col_list) > 8 else ''}")
774
+ marker = ">" if tbl == self.current_table else " "
775
+ print(f"{marker} {i}. {tbl}")
776
+ print(f" {count} rows, {len(col_list)} columns")
777
+ print(f" Columns: {', '.join(col_list[:8])}")
894
778
 
895
- result[tbl] = {
896
- 'rows': count,
897
- 'columns': col_list
898
- }
779
+ result[tbl] = {'rows': count, 'columns': col_list}
899
780
 
900
781
  print("="*70)
901
782
  return result
902
783
 
903
784
  def schema(self, table: Optional[str] = None) -> dict:
904
- """Show detailed schema with data types."""
785
+ """Show schema."""
905
786
  if not self.schema_info:
906
787
  self._refresh_schema()
907
788
 
908
789
  print("\n" + "="*70)
909
- print("📋 DATABASE SCHEMA")
790
+ print("DATABASE SCHEMA")
910
791
  print("="*70)
911
792
 
912
793
  tables_to_show = [table] if table else self.schema_info.keys()
@@ -915,12 +796,12 @@ IMPORTANT:
915
796
  for tbl in tables_to_show:
916
797
  if tbl in self.schema_info:
917
798
  count = pd.read_sql_query(f"SELECT COUNT(*) FROM {tbl}", self.conn).iloc[0, 0]
918
- print(f"\n📊 Table: {tbl}")
919
- print(f" Records: {count}")
920
- print(f" Columns:")
799
+ print(f"\nTable: {tbl}")
800
+ print(f"Records: {count}")
801
+ print("Columns:")
921
802
 
922
803
  for col, dtype in self.schema_info[tbl].items():
923
- print(f" - {col:<30} ({dtype})")
804
+ print(f" - {col:<30} ({dtype})")
924
805
 
925
806
  result[tbl] = {
926
807
  'records': count,
@@ -931,151 +812,126 @@ IMPORTANT:
931
812
  return result
932
813
 
933
814
  def peek(self, table: Optional[str] = None, n: int = 5) -> pd.DataFrame:
934
- """View sample data."""
815
+ """Preview data."""
935
816
  tbl = table or self.current_table
936
817
  if not tbl:
937
- print("No table specified")
818
+ print("No table specified")
938
819
  return pd.DataFrame()
939
820
 
940
821
  df = pd.read_sql_query(f"SELECT * FROM {tbl} LIMIT {n}", self.conn)
941
- print(f"\n📊 Sample from '{tbl}' ({n} rows):")
822
+ print(f"\nSample from '{tbl}' ({n} rows):")
942
823
  print(df.to_string(index=False))
943
824
  return df
944
825
 
945
826
  def info(self):
946
- """Show complete database overview."""
827
+ """Database overview."""
947
828
  return self.tables()
948
829
 
949
- # ========================================================================
950
- # QUERY METHODS
951
- # ========================================================================
952
-
953
830
  def sql(self, query: str, viz: Union[bool, str] = False) -> 'QueryResult':
954
- """Execute SQL directly (no API cost)."""
955
- print(f"\n⚡ Executing SQL...")
831
+ """Execute SQL."""
832
+ print("\nExecuting SQL...")
956
833
 
957
834
  try:
958
835
  df = pd.read_sql_query(query, self.conn)
959
- print(f"Success! {len(df)} rows returned")
836
+ print(f"Success! {len(df)} rows")
960
837
 
961
838
  fig = None
962
839
  if viz:
963
840
  viz_type = viz if isinstance(viz, str) else "auto"
964
- fig = self._visualize(df, "SQL Query Result", viz_type=viz_type)
841
+ fig = self._visualize(df, "SQL Result", viz_type=viz_type)
965
842
 
966
843
  return QueryResult(True, query, df, fig)
967
844
  except Exception as e:
968
- print(f"Error: {e}")
845
+ print(f"Error: {e}")
969
846
  return QueryResult(False, query, pd.DataFrame(), None, str(e))
970
847
 
971
848
  def interactive(self, question: str) -> 'QueryResult':
972
- """Ask with interactive visualization prompt."""
973
- print(f"\n🔍 Question: {question}")
974
- choice = input("💡 Visualize? (yes/no/pie/bar/line/scatter): ").strip().lower()
975
-
976
- if choice in ['yes', 'y', 'yeah', 'yep', 'sure']:
977
- viz = True
978
- elif choice in ['pie', 'bar', 'line', 'scatter', 'table', 'heatmap']:
979
- viz = choice
980
- else:
981
- viz = False
849
+ """Interactive query."""
850
+ print(f"\nQuestion: {question}")
851
+ choice = input("Visualize? (yes/no/pie/bar/line/scatter): ").strip().lower()
852
+
853
+ viz = choice if choice in ['pie', 'bar', 'line', 'scatter', 'table', 'heatmap'] else (True if choice in ['yes', 'y'] else False)
982
854
 
983
855
  return self.ask(question, viz=viz)
984
856
 
985
- # ========================================================================
986
- # DATABASE EXPORT
987
- # ========================================================================
988
-
989
857
  def export_db(self, path: str, format: str = "sqlite"):
990
- """Export entire database."""
991
- print(f"\n💾 Exporting to {format}...")
992
-
993
- format = format.lower()
858
+ """Export database."""
859
+ print(f"\nExporting to {format}...")
994
860
 
995
861
  if format == "sqlite":
996
862
  shutil.copy2(self.db_path, path)
997
- print(f"✅ Saved to {path}")
998
-
999
863
  elif format == "sql":
1000
864
  with open(path, 'w', encoding='utf-8') as f:
1001
865
  for line in self.conn.iterdump():
1002
866
  f.write(f'{line}\n')
1003
- print(f"✅ Saved to {path}")
1004
-
1005
867
  elif format == "json":
1006
868
  data = {}
1007
869
  for table in self._get_table_names():
1008
870
  df = pd.read_sql_query(f"SELECT * FROM {table}", self.conn)
1009
871
  data[table] = df.to_dict(orient='records')
1010
-
1011
872
  with open(path, 'w', encoding='utf-8') as f:
1012
873
  json.dump(data, f, indent=2, default=str)
1013
- print(f"✅ Saved to {path}")
1014
-
1015
874
  elif format == "excel":
1016
875
  with pd.ExcelWriter(path, engine='openpyxl') as writer:
1017
876
  for table in self._get_table_names():
1018
877
  df = pd.read_sql_query(f"SELECT * FROM {table}", self.conn)
1019
878
  df.to_excel(writer, sheet_name=table[:31], index=False)
1020
- print(f"✅ Saved to {path}")
1021
-
1022
879
  else:
1023
- raise ValueError(f"Unsupported format: {format}")
880
+ raise ValueError(f"Unsupported: {format}")
1024
881
 
882
+ print(f"Saved to {path}")
1025
883
  return self
1026
884
 
1027
885
  def save_to_mysql(self, host: str, user: str, password: str, database: str,
1028
886
  port: int = 3306, tables: Optional[List[str]] = None):
1029
- """Save to MySQL (local or cloud)."""
887
+ """Export to MySQL."""
1030
888
  try:
1031
889
  from sqlalchemy import create_engine
1032
890
  except ImportError:
1033
- raise ImportError("Run: pip install sqlalchemy mysql-connector-python")
891
+ raise ImportError("Run: pip install QuerySUTRA[mysql]")
1034
892
 
1035
- print(f"\n🔄 Connecting to MySQL at {host}:{port}...")
893
+ print(f"\nConnecting to MySQL: {host}:{port}...")
1036
894
 
1037
- connection_string = f"mysql+mysqlconnector://{user}:{password}@{host}:{port}/{database}"
1038
- engine = create_engine(connection_string)
895
+ engine = create_engine(f"mysql+mysqlconnector://{user}:{password}@{host}:{port}/{database}")
1039
896
 
1040
897
  tables_to_export = tables or self._get_table_names()
1041
898
 
1042
- print(f"📤 Exporting {len(tables_to_export)} tables...")
899
+ print(f"Exporting {len(tables_to_export)} tables...")
1043
900
 
1044
901
  for table in tables_to_export:
1045
902
  df = pd.read_sql_query(f"SELECT * FROM {table}", self.conn)
1046
903
  df.to_sql(table, engine, if_exists='replace', index=False)
1047
- print(f"{table}: {len(df)} rows")
904
+ print(f" {table}: {len(df)} rows")
1048
905
 
1049
- print(f"Complete!")
906
+ print("Complete!")
1050
907
  return self
1051
908
 
1052
909
  def save_to_postgres(self, host: str, user: str, password: str, database: str,
1053
910
  port: int = 5432, tables: Optional[List[str]] = None):
1054
- """Save to PostgreSQL (local or cloud)."""
911
+ """Export to PostgreSQL."""
1055
912
  try:
1056
913
  from sqlalchemy import create_engine
1057
914
  except ImportError:
1058
- raise ImportError("Run: pip install sqlalchemy psycopg2-binary")
915
+ raise ImportError("Run: pip install QuerySUTRA[postgres]")
1059
916
 
1060
- print(f"\n🔄 Connecting to PostgreSQL at {host}:{port}...")
917
+ print(f"\nConnecting to PostgreSQL: {host}:{port}...")
1061
918
 
1062
- connection_string = f"postgresql://{user}:{password}@{host}:{port}/{database}"
1063
- engine = create_engine(connection_string)
919
+ engine = create_engine(f"postgresql://{user}:{password}@{host}:{port}/{database}")
1064
920
 
1065
921
  tables_to_export = tables or self._get_table_names()
1066
922
 
1067
- print(f"📤 Exporting {len(tables_to_export)} tables...")
923
+ print(f"Exporting {len(tables_to_export)} tables...")
1068
924
 
1069
925
  for table in tables_to_export:
1070
926
  df = pd.read_sql_query(f"SELECT * FROM {table}", self.conn)
1071
927
  df.to_sql(table, engine, if_exists='replace', index=False)
1072
- print(f"{table}: {len(df)} rows")
928
+ print(f" {table}: {len(df)} rows")
1073
929
 
1074
- print(f"Complete!")
930
+ print("Complete!")
1075
931
  return self
1076
932
 
1077
933
  def backup(self, backup_path: str = None):
1078
- """Create complete backup."""
934
+ """Create backup."""
1079
935
  if backup_path:
1080
936
  backup_dir = Path(backup_path)
1081
937
  backup_dir.mkdir(parents=True, exist_ok=True)
@@ -1084,52 +940,47 @@ IMPORTANT:
1084
940
 
1085
941
  timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
1086
942
 
1087
- print(f"\n💾 Creating backup...")
943
+ print("\nCreating backup...")
1088
944
 
1089
- db_backup = backup_dir / f"sutra_backup_{timestamp}.db"
945
+ db_backup = backup_dir / f"sutra_{timestamp}.db"
1090
946
  self.export_db(str(db_backup), format="sqlite")
1091
947
 
1092
- json_backup = backup_dir / f"sutra_data_{timestamp}.json"
948
+ json_backup = backup_dir / f"sutra_{timestamp}.json"
1093
949
  self.export_db(str(json_backup), format="json")
1094
950
 
1095
- print(f"\n✅ Backup complete!")
1096
- print(f" 📁 Database: {db_backup}")
1097
- print(f" 📊 Data: {json_backup}")
951
+ print(f"\nBackup complete!")
952
+ print(f" Database: {db_backup}")
953
+ print(f" Data: {json_backup}")
1098
954
 
1099
955
  return self
1100
956
 
1101
- # ========================================================================
1102
- # UTILITIES
1103
- # ========================================================================
1104
-
1105
957
  def export(self, data: pd.DataFrame, path: str, format: str = "csv"):
1106
958
  """Export results."""
1107
- fmt = format.lower()
1108
- if fmt == "csv":
959
+ if format == "csv":
1109
960
  data.to_csv(path, index=False)
1110
- elif fmt in ["excel", "xlsx"]:
961
+ elif format in ["excel", "xlsx"]:
1111
962
  data.to_excel(path, index=False)
1112
- elif fmt == "json":
963
+ elif format == "json":
1113
964
  data.to_json(path, orient="records", indent=2)
1114
965
  else:
1115
- raise ValueError(f"Unknown format: {format}")
966
+ raise ValueError(f"Unknown: {format}")
1116
967
 
1117
- print(f"Exported to {path}")
968
+ print(f"Exported to {path}")
1118
969
  return self
1119
970
 
1120
971
  def close(self):
1121
- """Close database."""
972
+ """Close connection."""
1122
973
  if self.conn:
1123
974
  self.conn.close()
1124
- print("Closed")
975
+ print("Closed")
1125
976
 
1126
977
  def _get_table_names(self) -> List[str]:
1127
- """Get list of tables."""
978
+ """Get tables."""
1128
979
  self.cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
1129
980
  return [r[0] for r in self.cursor.fetchall()]
1130
981
 
1131
982
  def _refresh_schema(self):
1132
- """Refresh schema info."""
983
+ """Refresh schema."""
1133
984
  tables = self._get_table_names()
1134
985
 
1135
986
  self.schema_info = {}
@@ -1138,7 +989,7 @@ IMPORTANT:
1138
989
  self.schema_info[tbl] = {r[1]: r[2] for r in self.cursor.fetchall()}
1139
990
 
1140
991
  def _generate_sql(self, question: str, table: str) -> str:
1141
- """Generate SQL with OpenAI."""
992
+ """Generate SQL."""
1142
993
  schema = self.schema_info.get(table, {})
1143
994
  sample_df = pd.read_sql_query(f"SELECT * FROM {table} LIMIT 3", self.conn)
1144
995
  sample = sample_df.to_string(index=False)
@@ -1156,7 +1007,7 @@ Sample:
1156
1007
 
1157
1008
  Question: {question}
1158
1009
 
1159
- Return ONLY SQL. No explanations."""
1010
+ Return ONLY SQL."""
1160
1011
 
1161
1012
  response = self.client.chat.completions.create(
1162
1013
  model="gpt-4o-mini",
@@ -1177,7 +1028,6 @@ Return ONLY SQL. No explanations."""
1177
1028
  self.close()
1178
1029
 
1179
1030
  def __repr__(self):
1180
- tables = len(self.schema_info)
1181
1031
  features = []
1182
1032
  if self.cache_queries:
1183
1033
  features.append("cache")
@@ -1188,8 +1038,8 @@ Return ONLY SQL. No explanations."""
1188
1038
  if self.fuzzy_match:
1189
1039
  features.append("fuzzy")
1190
1040
 
1191
- features_str = f", features=[{', '.join(features)}]" if features else ""
1192
- return f"SUTRA(tables={tables}, current='{self.current_table}'{features_str})"
1041
+ feat_str = f", {', '.join(features)}" if features else ""
1042
+ return f"SUTRA(tables={len(self.schema_info)}{feat_str})"
1193
1043
 
1194
1044
 
1195
1045
  class QueryResult:
@@ -1203,15 +1053,10 @@ class QueryResult:
1203
1053
  self.error = error
1204
1054
 
1205
1055
  def __repr__(self):
1206
- if self.success:
1207
- return f"QueryResult(rows={len(self.data)}, cols={len(self.data.columns)})"
1208
- return f"QueryResult(error='{self.error}')"
1056
+ return f"QueryResult(rows={len(self.data)}, cols={len(self.data.columns)})" if self.success else f"QueryResult(error='{self.error}')"
1209
1057
 
1210
1058
  def show(self):
1211
- if self.success:
1212
- print(self.data)
1213
- else:
1214
- print(f"Error: {self.error}")
1059
+ print(self.data) if self.success else print(f"Error: {self.error}")
1215
1060
  return self
1216
1061
 
1217
1062
 
@@ -1224,39 +1069,12 @@ def quick_start(api_key: str, data_path: str, question: str, viz: Union[bool, st
1224
1069
 
1225
1070
  if __name__ == "__main__":
1226
1071
  print("""
1227
- ╔══════════════════════════════════════════════════════════════╗
1228
- ║ QuerySUTRA v0.3.0 - ENHANCED ║
1229
- ║ Structured-Unstructured-Text-Retrieval-Architecture ║
1230
- ╚══════════════════════════════════════════════════════════════╝
1072
+ QuerySUTRA v0.3.3 - Professional Data Analysis
1073
+ SUTRA: Structured-Unstructured-Text-Retrieval-Architecture
1231
1074
 
1232
- NEW FEATURES:
1233
- Load existing databases (no re-upload needed!)
1234
- ✅ Custom visualizations (pie, bar, line, scatter, table)
1235
- ✅ Smart NLP with fuzzy matching (optional)
1236
- ✅ Irrelevant query detection (optional)
1237
- ✅ Embeddings for caching (optional)
1238
- ✅ All features are OPTIONAL - you control everything!
1075
+ Fixed: Proper primary and foreign keys with unique IDs
1076
+ Features: Load existing DB, custom viz, fuzzy matching, embeddings
1239
1077
 
1240
1078
  Installation: pip install QuerySUTRA
1241
-
1242
- Quick Start:
1243
- from sutra import SUTRA
1244
-
1245
- # NEW: Load existing database
1246
- sutra = SUTRA.load_from_db("sutra.db", api_key="sk-...")
1247
-
1248
- # Or create new with options
1249
- sutra = SUTRA(api_key="sk-...",
1250
- use_embeddings=True, # Smart caching
1251
- check_relevance=True, # Detect irrelevant queries
1252
- fuzzy_match=True) # Better NLP
1253
-
1254
- # Upload and query
1255
- sutra.upload("data.pdf")
1256
- result = sutra.ask("Show sales by region", viz="pie")
1257
-
1258
- # Connect to MySQL/PostgreSQL
1259
- sutra = SUTRA.connect_mysql("localhost", "root", "pass", "db")
1260
-
1261
- Supported: CSV, Excel, JSON, SQL, PDF, DOCX, TXT, DataFrame
1079
+ Usage: from sutra import SUTRA
1262
1080
  """)