QuerySUTRA 0.4.5.tar.gz → 0.5.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {querysutra-0.4.5 → querysutra-0.5.0}/PKG-INFO +1 -1
- {querysutra-0.4.5 → querysutra-0.5.0}/QuerySUTRA.egg-info/PKG-INFO +1 -1
- querysutra-0.5.0/pyproject.toml +17 -0
- {querysutra-0.4.5 → querysutra-0.5.0}/setup.py +3 -4
- querysutra-0.5.0/sutra/__init__.py +4 -0
- {querysutra-0.4.5 → querysutra-0.5.0}/sutra/sutra.py +108 -141
- querysutra-0.4.5/pyproject.toml +0 -17
- querysutra-0.4.5/sutra/__init__.py +0 -4
- {querysutra-0.4.5 → querysutra-0.5.0}/LICENSE +0 -0
- {querysutra-0.4.5 → querysutra-0.5.0}/MANIFEST.in +0 -0
- {querysutra-0.4.5 → querysutra-0.5.0}/QuerySUTRA.egg-info/SOURCES.txt +0 -0
- {querysutra-0.4.5 → querysutra-0.5.0}/QuerySUTRA.egg-info/dependency_links.txt +0 -0
- {querysutra-0.4.5 → querysutra-0.5.0}/QuerySUTRA.egg-info/requires.txt +0 -0
- {querysutra-0.4.5 → querysutra-0.5.0}/QuerySUTRA.egg-info/top_level.txt +0 -0
- {querysutra-0.4.5 → querysutra-0.5.0}/README.md +0 -0
- {querysutra-0.4.5 → querysutra-0.5.0}/examples/quickstart.py +0 -0
- {querysutra-0.4.5 → querysutra-0.5.0}/examples/sutra_usage_guide.ipynb +0 -0
- {querysutra-0.4.5 → querysutra-0.5.0}/examples/usage_guide.ipynb +0 -0
- {querysutra-0.4.5 → querysutra-0.5.0}/requirements.txt +0 -0
- {querysutra-0.4.5 → querysutra-0.5.0}/setup.cfg +0 -0
- {querysutra-0.4.5 → querysutra-0.5.0}/sutra/cache_manager.py +0 -0
- {querysutra-0.4.5 → querysutra-0.5.0}/sutra/clear_cache.py +0 -0
- {querysutra-0.4.5 → querysutra-0.5.0}/sutra/core.py +0 -0
- {querysutra-0.4.5 → querysutra-0.5.0}/sutra/data_loader.py +0 -0
- {querysutra-0.4.5 → querysutra-0.5.0}/sutra/database_manager.py +0 -0
- {querysutra-0.4.5 → querysutra-0.5.0}/sutra/direct_query.py +0 -0
- {querysutra-0.4.5 → querysutra-0.5.0}/sutra/feedback.py +0 -0
- {querysutra-0.4.5 → querysutra-0.5.0}/sutra/feedback_matcher.py +0 -0
- {querysutra-0.4.5 → querysutra-0.5.0}/sutra/nlp_processor.py +0 -0
- {querysutra-0.4.5 → querysutra-0.5.0}/sutra/schema_embeddings.py +0 -0
- {querysutra-0.4.5 → querysutra-0.5.0}/sutra/schema_generator.py +0 -0
- {querysutra-0.4.5 → querysutra-0.5.0}/sutra/sutra_client.py +0 -0
- {querysutra-0.4.5 → querysutra-0.5.0}/sutra/sutra_core.py +0 -0
- {querysutra-0.4.5 → querysutra-0.5.0}/sutra/sutra_simple.py +0 -0
- {querysutra-0.4.5 → querysutra-0.5.0}/sutra/visualizer.py +0 -0
- {querysutra-0.4.5 → querysutra-0.5.0}/tests/__init__.py +0 -0
- {querysutra-0.4.5 → querysutra-0.5.0}/tests/test_modules.py +0 -0
- {querysutra-0.4.5 → querysutra-0.5.0}/tests/test_sutra.py +0 -0
- {querysutra-0.4.5 → querysutra-0.5.0}/utils/__init__.py +0 -0
- {querysutra-0.4.5 → querysutra-0.5.0}/utils/file_utils.py +0 -0
- {querysutra-0.4.5 → querysutra-0.5.0}/utils/text_utils.py +0 -0
querysutra-0.5.0/pyproject.toml
ADDED
@@ -0,0 +1,17 @@
+[build-system]
+requires=["setuptools>=45"]
+build-backend="setuptools.build_meta"
+[project]
+name="QuerySUTRA"
+version="0.5.0"
+description="SUTRA"
+readme="README.md"
+requires-python=">=3.8"
+license={text="MIT"}
+authors=[{name="Aditya Batta"}]
+dependencies=["pandas>=1.3.0","numpy>=1.21.0","openai>=1.0.0","plotly>=5.0.0","matplotlib>=3.3.0","PyPDF2>=3.0.0","python-docx>=0.8.11","openpyxl>=3.0.0"]
+[project.optional-dependencies]
+mysql=["sqlalchemy>=1.4.0","mysql-connector-python>=8.0.0"]
+postgres=["sqlalchemy>=1.4.0","psycopg2-binary>=2.9.0"]
+embeddings=["sentence-transformers>=2.0.0"]
+all=["sqlalchemy>=1.4.0","mysql-connector-python>=8.0.0","psycopg2-binary>=2.9.0","sentence-transformers>=2.0.0"]
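The optional-dependency groups above mirror the `extras_require` mapping in setup.py. For reference, a consumer would select them with standard pip extras syntax:

```python
# Standard pip extras syntax for the optional-dependency groups above:
#   pip install QuerySUTRA                # core dependencies only
#   pip install "QuerySUTRA[mysql]"       # + sqlalchemy, mysql-connector-python
#   pip install "QuerySUTRA[postgres]"    # + sqlalchemy, psycopg2-binary
#   pip install "QuerySUTRA[embeddings]"  # + sentence-transformers
#   pip install "QuerySUTRA[all]"         # all optional backends at once
```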
{querysutra-0.4.5 → querysutra-0.5.0}/setup.py
@@ -1,4 +1,3 @@
-from setuptools import setup,
-with open("README.md",
-
-setup(name="QuerySUTRA",version="0.4.5",author="Aditya Batta",description="SUTRA",long_description=d,long_description_content_type="text/markdown",packages=find_packages(),python_requires=">=3.8",install_requires=["pandas>=1.3.0","numpy>=1.21.0","openai>=1.0.0","plotly>=5.0.0","matplotlib>=3.3.0","PyPDF2>=3.0.0","python-docx>=0.8.11","openpyxl>=3.0.0"],extras_require={"mysql":["sqlalchemy>=1.4.0","mysql-connector-python>=8.0.0"],"postgres":["sqlalchemy>=1.4.0","psycopg2-binary>=2.9.0"],"embeddings":["sentence-transformers>=2.0.0"],"all":["sqlalchemy>=1.4.0","mysql-connector-python>=8.0.0","psycopg2-binary>=2.9.0","sentence-transformers>=2.0.0"]})
+from setuptools import setup,find_packages
+with open("README.md","r",encoding="utf-8") as f:d=f.read()
+setup(name="QuerySUTRA",version="0.5.0",author="Aditya Batta",description="SUTRA",long_description=d,long_description_content_type="text/markdown",packages=find_packages(),python_requires=">=3.8",install_requires=["pandas>=1.3.0","numpy>=1.21.0","openai>=1.0.0","plotly>=5.0.0","matplotlib>=3.3.0","PyPDF2>=3.0.0","python-docx>=0.8.11","openpyxl>=3.0.0"],extras_require={"mysql":["sqlalchemy>=1.4.0","mysql-connector-python>=8.0.0"],"postgres":["sqlalchemy>=1.4.0","psycopg2-binary>=2.9.0"],"embeddings":["sentence-transformers>=2.0.0"],"all":["sqlalchemy>=1.4.0","mysql-connector-python>=8.0.0","psycopg2-binary>=2.9.0","sentence-transformers>=2.0.0"]})
{querysutra-0.4.5 → querysutra-0.5.0}/sutra/sutra.py
@@ -1,18 +1,9 @@
 """
-QuerySUTRA v0.4.
-
-
-FIXED:
-- Auto-creates MySQL database if not exists
-- One-line export to MySQL
-- Complete data extraction from large PDFs
-- No manual file transfers needed
-
-Author: Aditya Batta
-Version: 0.4.0
+QuerySUTRA v0.4.5 - FIXED AI EXTRACTION
+Debug mode to see why extraction fails
 """
 
-__version__ = "0.4.
+__version__ = "0.4.5"
 __author__ = "Aditya Batta"
 __all__ = ["SUTRA", "QueryResult", "quick_start"]
 
@@ -74,9 +65,9 @@ class SUTRA:
 
     def __init__(self, api_key: Optional[str] = None, db: str = "sutra.db",
                  use_embeddings: bool = False, check_relevance: bool = False,
-                 fuzzy_match: bool = True, cache_queries: bool = True):
+                 fuzzy_match: bool = True, cache_queries: bool = True, debug: bool = False):
         """Initialize."""
-        print("Initializing QuerySUTRA v0.4.
+        print("Initializing QuerySUTRA v0.4.5")
 
         if api_key:
             os.environ["OPENAI_API_KEY"] = api_key
@@ -85,6 +76,7 @@ class SUTRA:
         self.client = OpenAI(api_key=self.api_key) if self.api_key and HAS_OPENAI else None
 
         self.db_path = db
+        self.debug = debug
 
         try:
             self.conn = sqlite3.connect(db, timeout=30, check_same_thread=False)
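The only behavioral change to the constructor is the new `debug` keyword, stored as `self.debug` and consulted before each of the `DEBUG:` prints added further down. A minimal sketch of opting in (the file name is hypothetical, and the import assumes the package re-exports `SUTRA` at top level per `__all__`):

```python
from sutra import SUTRA  # assumed top-level export; otherwise: from sutra.sutra import SUTRA

# debug=True turns on the new extraction diagnostics; all other defaults are unchanged
s = SUTRA(api_key="sk-...", db="sutra.db", debug=True)
s.upload("resume.pdf")  # hypothetical input; DEBUG lines will report chunk and entity counts
```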
@@ -133,7 +125,6 @@ class SUTRA:
 
         print(f"Connecting to MySQL...")
 
-        # Auto-create database if not exists
         try:
             temp_conn = mysql.connector.connect(host=host, user=user, password=password, port=port)
             temp_cursor = temp_conn.cursor()
@@ -185,24 +176,7 @@ class SUTRA:
     def upload(self, data: Union[str, pd.DataFrame], name: Optional[str] = None,
                extract_entities: Optional[List[str]] = None,
                auto_export_mysql: Optional[Dict[str, str]] = None) -> 'SUTRA':
-        """
-        Upload data with OPTIONAL automatic MySQL export.
-
-        Args:
-            data: File path or DataFrame
-            name: Table name
-            extract_entities: Custom entities to extract
-            auto_export_mysql: Auto-export to MySQL after upload
-                {'host': 'localhost', 'user': 'root', 'password': 'pass', 'database': 'mydb'}
-
-        Example:
-            sutra.upload("data.pdf", auto_export_mysql={
-                'host': 'localhost',
-                'user': 'root',
-                'password': '123456',
-                'database': 'my_database'
-            })
-        """
+        """Upload data."""
        print("\nUploading...")
 
         if isinstance(data, pd.DataFrame):
@@ -238,7 +212,6 @@ class SUTRA:
         else:
             raise ValueError(f"Unsupported: {ext}")
 
-        # AUTO-EXPORT to MySQL if requested
         if auto_export_mysql:
             print("\nAuto-exporting to MySQL...")
             self.save_to_mysql(
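The collapsed docstring above carried the only usage example for `auto_export_mysql`, and it still describes the surviving code path, so it is worth keeping at hand (credentials are the illustrative ones from that removed docstring):

```python
# Upload a file and push the extracted tables straight to MySQL in one call
sutra.upload("data.pdf", auto_export_mysql={
    'host': 'localhost',
    'user': 'root',
    'password': '123456',
    'database': 'my_database'
})
```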
@@ -252,7 +225,7 @@ class SUTRA:
         return self
 
     def _smart_upload_pdf(self, path: Path, base_name: str, extract_entities: Optional[List[str]] = None):
-        """Parse PDF
+        """Parse PDF."""
         if not HAS_PYPDF2:
             raise ImportError("Run: pip install PyPDF2")
 
@@ -268,7 +241,6 @@ class SUTRA:
         if self.client:
             print("AI: Extracting entities...")
 
-            # Process in chunks for large documents
             chunk_size = 10000
             all_entities = {}
 
@@ -282,11 +254,19 @@ class SUTRA:
 
                 entities = self._extract_chunk(chunk, extract_entities)
 
+                if self.debug:
+                    print(f" DEBUG: Chunk {chunk_num} returned {len(entities)} entity types")
+
                 for entity_type, records in entities.items():
                     if entity_type not in all_entities:
                         all_entities[entity_type] = []
                     all_entities[entity_type].extend(records)
 
+            if self.debug:
+                print(f" DEBUG: Total entities collected: {len(all_entities)}")
+                for k, v in all_entities.items():
+                    print(f" - {k}: {len(v)} records")
+
             # Renumber IDs
             for entity_type, records in all_entities.items():
                 for idx, record in enumerate(records, 1):
@@ -306,48 +286,8 @@ class SUTRA:
             print("Creating simple table")
             self._store_dataframe(self._parse_text_simple(full_text), base_name)
 
-    def _smart_upload_docx(self, path: Path, base_name: str, extract_entities: Optional[List[str]] = None):
-        """Parse DOCX."""
-        if not HAS_DOCX:
-            raise ImportError("Run: pip install python-docx")
-
-        doc = docx.Document(path)
-
-        if doc.tables:
-            for i, table in enumerate(doc.tables):
-                data = [[cell.text.strip() for cell in row.cells] for row in table.rows]
-                if data and len(data) > 1:
-                    df = pd.DataFrame(data[1:], columns=data[0])
-                    self._store_dataframe(df, f"{base_name}_table_{i+1}" if len(doc.tables) > 1 else base_name)
-            return
-
-        text = "\n".join([para.text for para in doc.paragraphs])
-
-        if self.client and len(text) > 0:
-            entities = self._extract_chunk(text, extract_entities)
-            for entity_type, records in entities.items():
-                if records:
-                    df = pd.DataFrame(records)
-                    self._store_dataframe_safe(df, f"{base_name}_{entity_type}")
-        else:
-            self._store_dataframe(self._parse_text_simple(text), base_name)
-
-    def _smart_upload_txt(self, path: Path, base_name: str, extract_entities: Optional[List[str]] = None):
-        """Parse TXT."""
-        with open(path, 'r', encoding='utf-8') as file:
-            text = file.read()
-
-        if self.client and len(text) > 0:
-            entities = self._extract_chunk(text, extract_entities)
-            for entity_type, records in entities.items():
-                if records:
-                    df = pd.DataFrame(records)
-                    self._store_dataframe_safe(df, f"{base_name}_{entity_type}")
-        else:
-            self._store_dataframe(self._parse_text_simple(text), base_name)
-
     def _extract_chunk(self, text: str, custom_entities: Optional[List[str]] = None) -> Dict:
-        """Extract entities
+        """Extract entities - WITH BETTER ERROR HANDLING."""
         if not self.client:
             return {}
 
@@ -357,35 +297,98 @@ class SUTRA:
 Text:
 {text[:8000]}
 
-Extract
+Extract: people, skills, technologies, projects, certifications, education, work_experience, events, organizations, or ANY structured data.
 
-Return JSON with arrays.
+Return JSON with arrays. Sequential IDs. Foreign keys reference primary keys.
 
-Example:
 {{
   "people": [{{"id": 1, "name": "John", "email": "john@co.com", "city": "Dallas"}}, ...],
   "skills": [{{"id": 1, "person_id": 1, "skill_name": "Python"}}, ...]
 }}
 
-
+ONLY valid JSON. No explanations."""
 
         resp = self.client.chat.completions.create(
             model="gpt-4o-mini",
             messages=[
-                {"role": "system", "content": "Extract ALL entities with unique IDs. Return
+                {"role": "system", "content": "Extract ALL entities with unique IDs. Return ONLY valid JSON, nothing else."},
                 {"role": "user", "content": prompt}
             ],
             temperature=0,
             max_tokens=8000
         )
 
-        json_text = resp.choices[0].message.content.strip()
-
+        json_text = resp.choices[0].message.content.strip()
+
+        if self.debug:
+            print(f" DEBUG: AI response length: {len(json_text)} chars")
+            print(f" DEBUG: First 200 chars: {json_text[:200]}")
+
+        json_text = json_text.replace("```json", "").replace("```", "").strip()
+
+        result = json.loads(json_text)
+
+        if self.debug:
+            print(f" DEBUG: Parsed {len(result)} entity types")
+
+        return result
+
+        except json.JSONDecodeError as e:
+            if self.debug:
+                print(f" DEBUG: JSON parse error: {e}")
+                print(f" DEBUG: Response was: {json_text[:500]}")
+            return {}
         except Exception as e:
+            if self.debug:
+                print(f" DEBUG: Extraction error: {e}")
             return {}
 
+    def _smart_upload_docx(self, path: Path, base_name: str, extract_entities: Optional[List[str]] = None):
+        """Parse DOCX."""
+        if not HAS_DOCX:
+            raise ImportError("Run: pip install python-docx")
+
+        doc = docx.Document(path)
+
+        if doc.tables:
+            for i, table in enumerate(doc.tables):
+                data = [[cell.text.strip() for cell in row.cells] for row in table.rows]
+                if data and len(data) > 1:
+                    df = pd.DataFrame(data[1:], columns=data[0])
+                    self._store_dataframe(df, f"{base_name}_table_{i+1}" if len(doc.tables) > 1 else base_name)
+            return
+
+        text = "\n".join([para.text for para in doc.paragraphs])
+
+        if self.client and len(text) > 0:
+            entities = self._extract_chunk(text, extract_entities)
+            if entities:
+                for entity_type, records in entities.items():
+                    if records:
+                        df = pd.DataFrame(records)
+                        self._store_dataframe_safe(df, f"{base_name}_{entity_type}")
+                return
+
+        self._store_dataframe(self._parse_text_simple(text), base_name)
+
+    def _smart_upload_txt(self, path: Path, base_name: str, extract_entities: Optional[List[str]] = None):
+        """Parse TXT."""
+        with open(path, 'r', encoding='utf-8') as file:
+            text = file.read()
+
+        if self.client and len(text) > 0:
+            entities = self._extract_chunk(text, extract_entities)
+            if entities:
+                for entity_type, records in entities.items():
+                    if records:
+                        df = pd.DataFrame(records)
+                        self._store_dataframe_safe(df, f"{base_name}_{entity_type}")
+                return
+
+        self._store_dataframe(self._parse_text_simple(text), base_name)
+
     def _store_dataframe_safe(self, df: pd.DataFrame, name: str):
-        """Store
+        """Store."""
         try:
             df.columns = [str(c).strip().replace(" ", "_").replace("-", "_") for c in df.columns]
             df.to_sql(name, self.conn, if_exists='replace', index=False, method='multi', chunksize=500)
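The rewritten `_extract_chunk` boils down to a simple defensive pattern: strip any Markdown code fences the model wraps around its JSON, parse, and degrade to an empty dict on failure. A minimal standalone sketch of that pattern (the function name and signature are illustrative, not part of the package API):

```python
import json

def parse_model_json(raw: str, debug: bool = False) -> dict:
    """Strip Markdown code fences from a model reply and parse it as JSON."""
    # Mirrors the cleanup in _extract_chunk: drop ```json / ``` wrappers first.
    text = raw.strip().replace("```json", "").replace("```", "").strip()
    try:
        return json.loads(text)
    except json.JSONDecodeError as e:
        if debug:
            print(f"JSON parse error: {e}; response was: {text[:500]}")
        return {}  # same graceful fallback the diff adds

# parse_model_json('```json\n{"people": []}\n```') -> {'people': []}
```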
@@ -403,7 +406,6 @@ Return ONLY valid JSON."""
         lines = [line.strip() for line in text.split('\n') if line.strip()]
         if not lines:
             return pd.DataFrame({'content': ['No content']})
-
         return pd.DataFrame({'line_number': range(1, len(lines) + 1), 'content': lines})
 
     def _store_dataframe(self, df: pd.DataFrame, name: str):
@@ -412,14 +414,14 @@ Return ONLY valid JSON."""
         print(f"Uploaded: {name} ({len(df)} rows)")
 
     def ask(self, question: str, viz: Union[bool, str] = False, table: Optional[str] = None) -> 'QueryResult':
-        """
+        """Query."""
         if not self.client:
             return QueryResult(False, "", pd.DataFrame(), None, "No API key")
 
         print(f"\nQuestion: {question}")
 
         if self.check_relevance and not self._is_relevant_query(question):
-            print("Warning: Irrelevant
+            print("Warning: Irrelevant")
             choice = input("Continue? (yes/no): ").strip().lower()
             if choice not in ['yes', 'y']:
                 return QueryResult(False, "", pd.DataFrame(), None, "Irrelevant")
@@ -471,13 +473,8 @@ Return ONLY valid JSON."""
         """Check relevance."""
         if not self.client:
             return True
-
         try:
             tables = self._get_table_names()[:3]
-            cols = []
-            for tbl in tables:
-                cols.extend(list(self.schema_info.get(tbl, {}).keys())[:5])
-
             resp = self.client.chat.completions.create(
                 model="gpt-4o-mini",
                 messages=[
@@ -492,7 +489,7 @@ Return ONLY valid JSON."""
             return True
 
     def _apply_fuzzy_matching(self, question: str, table: str) -> str:
-        """Fuzzy
+        """Fuzzy."""
         if not self.schema_info.get(table):
             return question
 
@@ -522,13 +519,11 @@ Return ONLY valid JSON."""
             return None
 
         q_emb = self.embedding_model.encode([question])[0]
-        best_match = None
-        best_sim = 0.85
+        best_match, best_sim = None, 0.85
 
         for cached_q, data in self.query_embeddings.items():
             if data['table'] != table:
                 continue
-
             sim = np.dot(q_emb, data['embedding']) / (np.linalg.norm(q_emb) * np.linalg.norm(data['embedding']))
             if sim > best_sim:
                 best_sim = sim
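The semantic-cache lookup above is plain cosine similarity with a 0.85 acceptance floor. A self-contained sketch of the same computation (the vectors are made up for illustration):

```python
import numpy as np

def cosine(a: np.ndarray, b: np.ndarray) -> float:
    # identical formula to the cache lookup above
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

q = np.array([0.9, 0.1, 0.0])       # embedding of the incoming question
cached = np.array([0.8, 0.2, 0.1])  # embedding of a previously answered question
print(cosine(q, cached) > 0.85)     # True -> would count as a cache hit
```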
@@ -537,19 +532,17 @@ Return ONLY valid JSON."""
         if best_match:
             print(f" Similar ({best_sim:.0%})")
             return self.query_embeddings[best_match]['result']
-
         return None
 
     def _store_in_embedding_cache(self, question: str, table: str, result: 'QueryResult'):
-        """Store
+        """Store."""
         q_emb = self.embedding_model.encode([question])[0]
         self.query_embeddings[question] = {'table': table, 'embedding': q_emb, 'result': result}
 
     def _visualize(self, df: pd.DataFrame, title: str, viz_type: str = "auto"):
-        """
+        """Viz."""
         if not HAS_PLOTLY and not HAS_MATPLOTLIB:
             return None
-
         print(f"Creating {viz_type} chart...")
         return self._plotly_viz(df, title, viz_type) if HAS_PLOTLY else self._matplotlib_viz(df, title, viz_type)
 
@@ -578,7 +571,6 @@ Return ONLY valid JSON."""
                 fig = px.pie(df, names=cat[0], values=num[0], title=title) if len(df) <= 10 else px.bar(df, x=cat[0], y=num[0], title=title)
             else:
                 fig = px.bar(df, y=df.columns[0], title=title)
-
             fig.show()
             return fig
         except:
@@ -589,14 +581,12 @@ Return ONLY valid JSON."""
         try:
             plt.figure(figsize=(10, 6))
             num = df.select_dtypes(include=[np.number]).columns
-
             if viz_type == "pie":
                 df[df.columns[0]].value_counts().plot(kind='pie')
             elif viz_type == "line" and len(num) > 0:
                 df[num[0]].plot(kind='line')
             else:
                 (df[num[0]] if len(num) > 0 else df.iloc[:, 0].value_counts()).plot(kind='bar')
-
             plt.title(title)
             plt.tight_layout()
             plt.show()
@@ -605,7 +595,7 @@ Return ONLY valid JSON."""
         return None
 
     def tables(self) -> Dict[str, dict]:
-        """List
+        """List."""
         print("\n" + "="*70)
         print("TABLES")
         print("="*70)
@@ -626,7 +616,7 @@ Return ONLY valid JSON."""
         return result
 
     def schema(self, table: Optional[str] = None) -> dict:
-        """
+        """Schema."""
         if not self.schema_info:
             self._refresh_schema()
 
@@ -651,18 +641,17 @@ Return ONLY valid JSON."""
         tbl = table or self.current_table
         if not tbl:
             return pd.DataFrame()
-
         df = pd.read_sql_query(f"SELECT * FROM {tbl} LIMIT {n}", self.conn)
         print(f"\nSample from '{tbl}':")
         print(df.to_string(index=False))
         return df
 
     def info(self):
-        """
+        """Info."""
         return self.tables()
 
     def sql(self, query: str, viz: Union[bool, str] = False) -> 'QueryResult':
-        """
+        """SQL."""
         try:
             df = pd.read_sql_query(query, self.conn)
             print(f"Success! {len(df)} rows")
@@ -679,7 +668,7 @@ Return ONLY valid JSON."""
         return self.ask(question, viz=viz)
 
     def export_db(self, path: str, format: str = "sqlite"):
-        """Export
+        """Export."""
         if format == "sqlite":
             shutil.copy2(self.db_path, path)
         elif format == "sql":
@@ -696,25 +685,12 @@ Return ONLY valid JSON."""
                 pd.read_sql_query(f"SELECT * FROM {t}", self.conn).to_excel(writer, sheet_name=t[:31], index=False)
         else:
             raise ValueError(f"Unsupported: {format}")
-
         print(f"Saved: {path}")
         return self
 
     def save_to_mysql(self, host: str, user: str, password: str, database: str,
-                      port: int = 3306, tables: Optional[List[str]] = None,
-
-        """
-        Export to MySQL - AUTO-CREATES database if not exists.
-
-        Args:
-            host: MySQL host
-            user: MySQL user
-            password: MySQL password
-            database: Database name (auto-created if not exists)
-            port: MySQL port
-            tables: Specific tables to export (None = all)
-            auto_create: Auto-create database if not exists
-        """
+                      port: int = 3306, tables: Optional[List[str]] = None, auto_create: bool = True):
+        """Export to MySQL."""
         try:
             from sqlalchemy import create_engine
             import mysql.connector
@@ -723,7 +699,6 @@ Return ONLY valid JSON."""
 
         print(f"Exporting to MySQL: {host}/{database}")
 
-        # Auto-create database if requested
         if auto_create:
             try:
                 temp_conn = mysql.connector.connect(host=host, user=user, password=password, port=port)
@@ -733,7 +708,7 @@ Return ONLY valid JSON."""
             temp_conn.close()
             print(f" Database '{database}' ready")
         except Exception as e:
-            print(f" Warning:
+            print(f" Warning: {e}")
 
         engine = create_engine(f"mysql+mysqlconnector://{user}:{password}@{host}:{port}/{database}")
 
@@ -745,13 +720,11 @@ Return ONLY valid JSON."""
         print("Complete!")
         return self
 
-    def save_to_postgres(self, host: str, user: str, password: str, database: str,
-
-        """Export to PostgreSQL."""
+    def save_to_postgres(self, host: str, user: str, password: str, database: str, port: int = 5432, tables: Optional[List[str]] = None):
+        """PostgreSQL."""
         try:
             from sqlalchemy import create_engine
             engine = create_engine(f"postgresql://{user}:{password}@{host}:{port}/{database}")
-
             print(f"Exporting to PostgreSQL...")
             for t in (tables or self._get_table_names()):
                 df = pd.read_sql_query(f"SELECT * FROM {t}", self.conn)
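With the long docstrings collapsed to one-liners, the signatures above are now the only in-code reference for the two exporters. A hedged usage sketch (hosts, credentials, and database names are illustrative):

```python
# MySQL: auto_create=True (the default) creates the database first if it is missing,
# via the temporary connection shown in the diff above
sutra.save_to_mysql(host="localhost", user="root", password="123456",
                    database="my_database", port=3306, tables=None)

# PostgreSQL: no auto_create parameter, so the target database must already exist
sutra.save_to_postgres(host="localhost", user="postgres", password="123456",
                       database="my_database", port=5432)
```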
@@ -767,14 +740,13 @@ Return ONLY valid JSON."""
         dir = Path(path) if path else Path(".")
         dir.mkdir(parents=True, exist_ok=True)
         ts = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
-
         self.export_db(str(dir / f"sutra_{ts}.db"), "sqlite")
         self.export_db(str(dir / f"sutra_{ts}.json"), "json")
         print("Backup complete!")
         return self
 
     def export(self, data: pd.DataFrame, path: str, format: str = "csv"):
-        """Export
+        """Export."""
         if format == "csv":
             data.to_csv(path, index=False)
         elif format in ["excel", "xlsx"]:
@@ -790,7 +762,7 @@ Return ONLY valid JSON."""
         self.conn.close()
 
     def _get_table_names(self) -> List[str]:
-        """
+        """Tables."""
         self.cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
         return [r[0] for r in self.cursor.fetchall()]
 
@@ -802,7 +774,7 @@ Return ONLY valid JSON."""
         self.schema_info[tbl] = {r[1]: r[2] for r in self.cursor.fetchall()}
 
     def _generate_sql(self, question: str, table: str) -> str:
-        """
+        """SQL."""
         schema = self.schema_info.get(table, {})
         sample = pd.read_sql_query(f"SELECT * FROM {table} LIMIT 3", self.conn).to_string(index=False)
         schema_str = ", ".join([f"{col} ({dtype})" for col, dtype in schema.items()])
@@ -815,7 +787,6 @@ Return ONLY valid JSON."""
             ],
             temperature=0
         )
-
         return resp.choices[0].message.content.strip().replace("```sql", "").replace("```", "").strip()
 
     def __enter__(self):
@@ -842,11 +813,7 @@ class QueryResult:
 
 
 def quick_start(api_key: str, data_path: str, question: str, viz: Union[bool, str] = False):
-    """Quick
+    """Quick."""
     with SUTRA(api_key=api_key) as sutra:
         sutra.upload(data_path)
         return sutra.ask(question, viz=viz)
-
-
-if __name__ == "__main__":
-    print("QuerySUTRA v0.4.0 - Simple & Automatic")
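`quick_start` remains the one-call entry point: it opens a `SUTRA` session as a context manager, uploads the file, and returns the `QueryResult` from `ask`. A usage sketch (the file path and question are hypothetical; the import assumes the names listed in `__all__` are re-exported at package level):

```python
from sutra import quick_start  # assumed top-level export; otherwise: from sutra.sutra import quick_start

result = quick_start(api_key="sk-...",                    # your OpenAI key
                     data_path="data.pdf",                # hypothetical input file
                     question="How many people are in the data?",
                     viz="bar")                           # viz accepts a bool or a chart type
```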
querysutra-0.4.5/pyproject.toml
DELETED
@@ -1,17 +0,0 @@
-[build-system]
-requires = ["setuptools>=45"]
-build-backend = "setuptools.build_meta"
-[project]
-name = "QuerySUTRA"
-version = "0.4.5"
-description = "SUTRA"
-readme = "README.md"
-requires-python = ">=3.8"
-license = {text = "MIT"}
-authors = [{name = "Aditya Batta"}]
-dependencies = ["pandas>=1.3.0","numpy>=1.21.0","openai>=1.0.0","plotly>=5.0.0","matplotlib>=3.3.0","PyPDF2>=3.0.0","python-docx>=0.8.11","openpyxl>=3.0.0"]
-[project.optional-dependencies]
-mysql = ["sqlalchemy>=1.4.0","mysql-connector-python>=8.0.0"]
-postgres = ["sqlalchemy>=1.4.0","psycopg2-binary>=2.9.0"]
-embeddings = ["sentence-transformers>=2.0.0"]
-all = ["sqlalchemy>=1.4.0","mysql-connector-python>=8.0.0","psycopg2-binary>=2.9.0","sentence-transformers>=2.0.0"]