QuerySUTRA 0.4.2__tar.gz → 0.4.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. querysutra-0.4.4/PKG-INFO +441 -0
  2. querysutra-0.4.4/QuerySUTRA.egg-info/PKG-INFO +441 -0
  3. querysutra-0.4.4/README.md +408 -0
  4. {querysutra-0.4.2 → querysutra-0.4.4}/pyproject.toml +3 -5
  5. querysutra-0.4.4/setup.py +4 -0
  6. {querysutra-0.4.2 → querysutra-0.4.4}/sutra/__init__.py +2 -7
  7. querysutra-0.4.2/PKG-INFO +0 -264
  8. querysutra-0.4.2/QuerySUTRA.egg-info/PKG-INFO +0 -264
  9. querysutra-0.4.2/README.md +0 -231
  10. querysutra-0.4.2/setup.py +0 -17
  11. {querysutra-0.4.2 → querysutra-0.4.4}/LICENSE +0 -0
  12. {querysutra-0.4.2 → querysutra-0.4.4}/MANIFEST.in +0 -0
  13. {querysutra-0.4.2 → querysutra-0.4.4}/QuerySUTRA.egg-info/SOURCES.txt +0 -0
  14. {querysutra-0.4.2 → querysutra-0.4.4}/QuerySUTRA.egg-info/dependency_links.txt +0 -0
  15. {querysutra-0.4.2 → querysutra-0.4.4}/QuerySUTRA.egg-info/requires.txt +0 -0
  16. {querysutra-0.4.2 → querysutra-0.4.4}/QuerySUTRA.egg-info/top_level.txt +0 -0
  17. {querysutra-0.4.2 → querysutra-0.4.4}/examples/quickstart.py +0 -0
  18. {querysutra-0.4.2 → querysutra-0.4.4}/examples/sutra_usage_guide.ipynb +0 -0
  19. {querysutra-0.4.2 → querysutra-0.4.4}/examples/usage_guide.ipynb +0 -0
  20. {querysutra-0.4.2 → querysutra-0.4.4}/requirements.txt +0 -0
  21. {querysutra-0.4.2 → querysutra-0.4.4}/setup.cfg +0 -0
  22. {querysutra-0.4.2 → querysutra-0.4.4}/sutra/cache_manager.py +0 -0
  23. {querysutra-0.4.2 → querysutra-0.4.4}/sutra/clear_cache.py +0 -0
  24. {querysutra-0.4.2 → querysutra-0.4.4}/sutra/core.py +0 -0
  25. {querysutra-0.4.2 → querysutra-0.4.4}/sutra/data_loader.py +0 -0
  26. {querysutra-0.4.2 → querysutra-0.4.4}/sutra/database_manager.py +0 -0
  27. {querysutra-0.4.2 → querysutra-0.4.4}/sutra/direct_query.py +0 -0
  28. {querysutra-0.4.2 → querysutra-0.4.4}/sutra/feedback.py +0 -0
  29. {querysutra-0.4.2 → querysutra-0.4.4}/sutra/feedback_matcher.py +0 -0
  30. {querysutra-0.4.2 → querysutra-0.4.4}/sutra/nlp_processor.py +0 -0
  31. {querysutra-0.4.2 → querysutra-0.4.4}/sutra/schema_embeddings.py +0 -0
  32. {querysutra-0.4.2 → querysutra-0.4.4}/sutra/schema_generator.py +0 -0
  33. {querysutra-0.4.2 → querysutra-0.4.4}/sutra/sutra.py +0 -0
  34. {querysutra-0.4.2 → querysutra-0.4.4}/sutra/sutra_client.py +0 -0
  35. {querysutra-0.4.2 → querysutra-0.4.4}/sutra/sutra_core.py +0 -0
  36. {querysutra-0.4.2 → querysutra-0.4.4}/sutra/sutra_simple.py +0 -0
  37. {querysutra-0.4.2 → querysutra-0.4.4}/sutra/visualizer.py +0 -0
  38. {querysutra-0.4.2 → querysutra-0.4.4}/tests/__init__.py +0 -0
  39. {querysutra-0.4.2 → querysutra-0.4.4}/tests/test_modules.py +0 -0
  40. {querysutra-0.4.2 → querysutra-0.4.4}/tests/test_sutra.py +0 -0
  41. {querysutra-0.4.2 → querysutra-0.4.4}/utils/__init__.py +0 -0
  42. {querysutra-0.4.2 → querysutra-0.4.4}/utils/file_utils.py +0 -0
  43. {querysutra-0.4.2 → querysutra-0.4.4}/utils/text_utils.py +0 -0
@@ -0,0 +1,441 @@
1
+ Metadata-Version: 2.4
2
+ Name: QuerySUTRA
3
+ Version: 0.4.4
4
+ Summary: SUTRA
5
+ Author: Aditya Batta
6
+ License: MIT
7
+ Requires-Python: >=3.8
8
+ Description-Content-Type: text/markdown
9
+ License-File: LICENSE
10
+ Requires-Dist: pandas>=1.3.0
11
+ Requires-Dist: numpy>=1.21.0
12
+ Requires-Dist: openai>=1.0.0
13
+ Requires-Dist: plotly>=5.0.0
14
+ Requires-Dist: matplotlib>=3.3.0
15
+ Requires-Dist: PyPDF2>=3.0.0
16
+ Requires-Dist: python-docx>=0.8.11
17
+ Requires-Dist: openpyxl>=3.0.0
18
+ Provides-Extra: mysql
19
+ Requires-Dist: sqlalchemy>=1.4.0; extra == "mysql"
20
+ Requires-Dist: mysql-connector-python>=8.0.0; extra == "mysql"
21
+ Provides-Extra: postgres
22
+ Requires-Dist: sqlalchemy>=1.4.0; extra == "postgres"
23
+ Requires-Dist: psycopg2-binary>=2.9.0; extra == "postgres"
24
+ Provides-Extra: embeddings
25
+ Requires-Dist: sentence-transformers>=2.0.0; extra == "embeddings"
26
+ Provides-Extra: all
27
+ Requires-Dist: sqlalchemy>=1.4.0; extra == "all"
28
+ Requires-Dist: mysql-connector-python>=8.0.0; extra == "all"
29
+ Requires-Dist: psycopg2-binary>=2.9.0; extra == "all"
30
+ Requires-Dist: sentence-transformers>=2.0.0; extra == "all"
31
+ Dynamic: license-file
32
+ Dynamic: requires-python
33
+
34
+ # QuerySUTRA
35
+
36
+ **SUTRA: Structured-Unstructured-Text-Retrieval-Architecture**
37
+
38
+ AI-powered data analysis library. Upload PDFs, query with natural language, export to MySQL automatically.
39
+
40
+ ## Installation
41
+
42
+ ```bash
43
+ pip install QuerySUTRA
44
+ pip install QuerySUTRA[mysql] # For MySQL export
45
+ pip install QuerySUTRA[embeddings] # For smart caching
46
+ pip install QuerySUTRA[all] # All features
47
+ ```
48
+
49
+ ## Quick Start
50
+
51
+ ```python
52
+ from sutra import SUTRA
53
+
54
+ sutra = SUTRA(api_key="your-openai-key")
55
+ sutra.upload("data.pdf")
56
+ result = sutra.ask("Show me all people")
57
+ print(result.data)
58
+ ```
59
+
60
+ ## Core Features
61
+
62
+ ### 1. Automatic MySQL Export
63
+
64
+ The database is created automatically if it does not already exist.
65
+
66
+ ```python
67
+ # Upload and export to MySQL automatically
68
+ sutra.upload("data.pdf", auto_export_mysql={
69
+ 'host': 'localhost',
70
+ 'user': 'root',
71
+ 'password': 'your_password',
72
+ 'database': 'my_database' # Auto-creates
73
+ })
74
+ ```
75
+
76
+ ### 2. Complete Data Extraction
77
+
78
+ Processes the entire PDF in chunks and extracts all of the data.
79
+
80
+ ```python
81
+ sutra.upload("large_document.pdf") # Extracts all 50+ employees
82
+ sutra.tables()
83
+ ```
84
+
85
+ ### 3. Natural Language Queries
86
+
87
+ ```python
88
+ result = sutra.ask("Show all people from California")
89
+ result = sutra.ask("Who has Python skills?")
90
+ result = sutra.ask("Count employees by state", viz="pie")
91
+ ```
92
+
93
+ ### 4. Custom Visualizations
94
+
95
+ ```python
96
+ result = sutra.ask("Sales by region", viz="pie")
97
+ result = sutra.ask("Trends", viz="line")
98
+ result = sutra.ask("Compare", viz="bar")
99
+ result = sutra.ask("Correlation", viz="scatter")
100
+ result = sutra.ask("Data", viz="table")
101
+ result = sutra.ask("Analysis", viz="heatmap")
102
+ ```
103
+
104
+ ### 5. Load Existing Databases
105
+
106
+ ```python
107
+ # Load SQLite
108
+ sutra = SUTRA.load_from_db("data.db", api_key="key")
109
+
110
+ # Connect to MySQL
111
+ sutra = SUTRA.connect_mysql("localhost", "root", "pass", "database")
112
+
113
+ # Connect to PostgreSQL
114
+ sutra = SUTRA.connect_postgres("localhost", "postgres", "pass", "database")
115
+ ```
116
+
117
+ ### 6. Fuzzy Matching for Better NLP
118
+
119
+ Automatically matches similar terms.
120
+
121
+ ```python
122
+ sutra = SUTRA(api_key="your-key", fuzzy_match=True)
123
+
124
+ # "New York City" automatically matches "New York" in database
125
+ result = sutra.ask("Who are from New York City?")
126
+ # Output: Fuzzy: 'City' -> 'New York'
127
+ ```
128
+
129
+ **How it works:**
130
+ - Uses Python's `difflib.get_close_matches`
131
+ - 60% similarity threshold
132
+ - Matches query terms to actual database values
133
+ - Example: "NYC" → "New York", "Cali" → "California"
134
+
135
+ ### 7. Embeddings for Smart Caching (Saves API Calls)
136
+
137
+ Cache similar queries to save OpenAI API costs.
138
+
139
+ ```python
140
+ sutra = SUTRA(api_key="your-key", use_embeddings=True)
141
+
142
+ # First query - calls OpenAI API
143
+ result = sutra.ask("Show sales data")
144
+
145
+ # Similar query - uses cache (NO API call, FREE!)
146
+ result = sutra.ask("Display sales information")
147
+ # Output: Similar (92%): 'Show sales data'
148
+ ```
149
+
150
+ **How it works:**
151
+ - Uses `sentence-transformers` library
152
+ - Model: `all-MiniLM-L6-v2` (80MB, runs locally)
153
+ - Converts queries to 384-dimensional vectors
154
+ - Similarity threshold: 85%
155
+ - Runs completely offline after the one-time model download (no external API calls)
156
+
157
+ **Technical details:**
158
+ ```
159
+ Query 1: "Show sales" → Vector: [0.23, -0.45, 0.67, ...]
160
+ Query 2: "Display sales" → Vector: [0.25, -0.43, 0.69, ...]
161
+ Similarity: 92% → Uses cached result (saves API call)
162
+
163
+ Query 3: "What's the weather?" → Vector: [-0.89, 0.12, -0.34, ...]
164
+ Similarity: 15% → New API call (different topic)
165
+ ```
166
+
167
+ **Cost savings:**
168
+ ```python
169
+ # Without embeddings: 10 similar queries = 10 API calls = $0.10
170
+ # With embeddings: 10 similar queries = 1 API call = $0.01 (90% savings)
171
+ ```
172
+
173
+ ### 8. Irrelevant Query Detection
174
+
175
+ Detects when queries don't relate to your database.
176
+
177
+ ```python
178
+ sutra = SUTRA(api_key="your-key", check_relevance=True)
179
+
180
+ result = sutra.ask("What is the weather today?")
181
+ # Output: Warning: Query may be irrelevant to your database
182
+ # Database contains tables about: employee_data_people, employee_data_skills
183
+ # Continue anyway? (yes/no):
184
+ ```
185
+
186
+ **How it works:**
187
+ - Sends database context (table names, column names) to AI
188
+ - AI determines if query is relevant
189
+ - Prompts user before wasting API call
190
+ - Can proceed anyway if desired
191
+
192
+ ### 9. Query Caching
193
+
194
+ Simple caching for exact query matches.
195
+
196
+ ```python
197
+ sutra = SUTRA(api_key="your-key", cache_queries=True)
198
+
199
+ result = sutra.ask("Show total sales") # API call
200
+ result = sutra.ask("Show total sales") # From cache (FREE)
201
+ # Output: From cache
202
+ ```
203
+
204
+ ### 10. Direct SQL (Free, No API Cost)
205
+
206
+ ```python
207
+ result = sutra.sql("SELECT * FROM people WHERE state='CA'")
208
+ print(result.data)
209
+ ```
210
+
211
+ ## Advanced Configuration
212
+
213
+ Enable all optional features:
214
+
215
+ ```python
216
+ sutra = SUTRA(
217
+ api_key="your-openai-key",
218
+ db="database.db", # SQLite database path
219
+ use_embeddings=True, # Smart caching with embeddings (saves 90% API costs)
220
+ check_relevance=True, # Detect irrelevant queries before API call
221
+ fuzzy_match=True, # Better NLP matching
222
+ cache_queries=True # Cache exact query matches
223
+ )
224
+ ```
225
+
226
+ **Feature comparison:**
227
+
228
+ | Feature | Benefit | When to Use |
229
+ |---------|---------|-------------|
230
+ | `use_embeddings=True` | Saves up to 90% on API costs for similar queries | Always recommended |
231
+ | `fuzzy_match=True` | Better query matching | When data has city/location names |
232
+ | `check_relevance=True` | Prevents wasted API calls | When users ask random questions |
233
+ | `cache_queries=True` | Saves on exact query repeats | Always recommended |
234
+
235
+ ## Import SQLite to MySQL
236
+
237
+ **Step 1: In Colab - Export Database**
238
+ ```python
239
+ sutra.upload("data.pdf")
240
+ sutra.export_db("my_data.db", format="sqlite")
241
+
242
+ from google.colab import files
243
+ files.download("my_data.db")
244
+ ```
245
+
246
+ **Step 2: On Windows - Import to MySQL**
247
+
248
+ Method A: Using QuerySUTRA
249
+ ```python
250
+ sutra = SUTRA.load_from_db("my_data.db", api_key="key")
251
+ sutra.save_to_mysql("localhost", "root", "password", "my_database")
252
+ ```
253
+
254
+ Method B: Using a `simple_import.py` script
255
+
256
+ Create `simple_import.py`:
257
+ ```python
258
+ import sqlite3, mysql.connector, pandas as pd
259
+
260
+ SQLITE_DB = "my_data.db"
261
+ MYSQL_HOST, MYSQL_USER, MYSQL_PASSWORD = "localhost", "root", "password"
262
+ MYSQL_DATABASE = "my_database"
263
+
264
+ sqlite_conn = sqlite3.connect(SQLITE_DB)
265
+ cursor = sqlite_conn.cursor()
266
+ cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
267
+ tables = [row[0] for row in cursor.fetchall()]
268
+
269
+ temp_conn = mysql.connector.connect(host=MYSQL_HOST, user=MYSQL_USER, password=MYSQL_PASSWORD)
270
+ temp_cursor = temp_conn.cursor()
271
+ temp_cursor.execute(f"CREATE DATABASE IF NOT EXISTS `{MYSQL_DATABASE}`")
272
+ temp_cursor.close()
273
+ temp_conn.close()
274
+
275
+ mysql_conn = mysql.connector.connect(host=MYSQL_HOST, user=MYSQL_USER, password=MYSQL_PASSWORD, database=MYSQL_DATABASE)
276
+ mysql_cursor = mysql_conn.cursor()
277
+
278
+ for table in tables:
279
+ df = pd.read_sql_query(f"SELECT * FROM {table}", sqlite_conn)
280
+ mysql_cursor.execute(f"DROP TABLE IF EXISTS {table}")
281
+
282
+ cols = [f"`{col}` {'INT' if df[col].dtype == 'int64' else 'FLOAT' if df[col].dtype == 'float64' else 'TEXT'}" for col in df.columns]
283
+ mysql_cursor.execute(f"CREATE TABLE {table} ({', '.join(cols)})")
284
+
285
+ if len(df) > 0:
286
+ placeholders = ', '.join(['%s'] * len(df.columns))
287
+ for _, row in df.iterrows():
288
+ vals = [None if pd.isna(v) else v for v in row.values]
289
+ mysql_cursor.execute(f"INSERT INTO {table} VALUES ({placeholders})", vals)
290
+ mysql_conn.commit()
291
+
292
+ sqlite_conn.close()
293
+ mysql_cursor.close()
294
+ mysql_conn.close()
295
+ print(f"Complete! Data in MySQL database '{MYSQL_DATABASE}'")
296
+ ```
297
+
298
+ Run: `python simple_import.py`
299
+
300
+ ## Supported Formats
301
+
302
+ CSV, Excel, JSON, SQL, PDF, Word, Text, Pandas DataFrame
303
+
304
+ ## How Embeddings Work
305
+
306
+ QuerySUTRA uses **sentence-transformers** to create semantic embeddings of your queries:
307
+
308
+ **Model:** `all-MiniLM-L6-v2`
309
+ - Size: 80MB (downloads once, cached locally)
310
+ - Embedding dimension: 384
311
+ - Speed: Very fast, runs locally
312
+ - No external API calls
313
+
314
+ **Process:**
315
+
316
+ 1. Query is converted to a 384-dimensional vector
317
+ 2. Compared to cached query vectors using cosine similarity
318
+ 3. If similarity > 85%, uses cached result
319
+ 4. Otherwise, makes new API call
320
+
321
+ **Example:**
322
+
323
+ ```python
324
+ sutra = SUTRA(api_key="key", use_embeddings=True)
325
+
326
+ # Query 1: "Show me sales data"
327
+ # → Embedding: [0.234, -0.456, 0.678, -0.123, ...]
328
+ # → API call made
329
+ # → Result cached
330
+
331
+ # Query 2: "Display sales information"
332
+ # → Embedding: [0.238, -0.451, 0.682, -0.119, ...]
333
+ # → Similarity: 92% with Query 1
334
+ # → Uses cached result (NO API CALL)
335
+
336
+ # Query 3: "What's the weather?"
337
+ # → Embedding: [-0.891, 0.123, -0.345, 0.567, ...]
338
+ # → Similarity: 15% with Query 1
339
+ # → Makes new API call (different topic)
340
+ ```
341
+
342
+ **Cost Comparison:**
343
+
344
+ Without embeddings:
345
+ ```
346
+ 10 queries about sales = 10 API calls = $0.10
347
+ ```
348
+
349
+ With embeddings:
350
+ ```
351
+ 10 similar queries about sales = 1 API call + 9 cached = $0.01 (90% savings)
352
+ ```
353
+
354
+ **Installation:**
355
+ ```bash
356
+ pip install QuerySUTRA[embeddings]
357
+ ```
358
+
359
+ **Usage:**
360
+ ```python
361
+ sutra = SUTRA(api_key="key", use_embeddings=True)
362
+
363
+ # All similar queries are cached automatically
364
+ result1 = sutra.ask("Show sales")
365
+ result2 = sutra.ask("Display sales data") # Cached
366
+ result3 = sutra.ask("Give me sales information") # Cached
367
+ result4 = sutra.ask("Sales data please") # Cached
368
+ # Only 1 API call for all 4 queries!
369
+ ```
370
+
371
+ ## API Reference
372
+
373
+ **Initialize**
374
+ ```python
375
+ SUTRA(
376
+ api_key: str, # OpenAI API key
377
+ db: str = "sutra.db", # SQLite database path
378
+ use_embeddings: bool = False, # Enable smart caching
379
+ check_relevance: bool = False, # Check query relevance
380
+ fuzzy_match: bool = True, # Enable fuzzy matching
381
+ cache_queries: bool = True # Cache exact matches
382
+ )
383
+ ```
384
+
385
+ **Class Methods**
386
+ - `load_from_db(path, api_key, **kwargs)` - Load existing SQLite database
387
+ - `connect_mysql(host, user, password, database, port, api_key, **kwargs)` - Connect to MySQL
388
+ - `connect_postgres(host, user, password, database, port, api_key, **kwargs)` - Connect to PostgreSQL
389
+
390
+ **Instance Methods**
391
+ - `upload(data, name, extract_entities, auto_export_mysql)` - Upload data
392
+ - `ask(question, viz, table)` - Natural language query
393
+ - `sql(query, viz)` - Direct SQL query
394
+ - `tables()` - List all tables
395
+ - `schema(table)` - Show schema
396
+ - `peek(table, n)` - Preview data
397
+ - `export_db(path, format)` - Export (sqlite/sql/json/excel)
398
+ - `save_to_mysql(...)` - Export to MySQL (auto-creates database)
399
+ - `save_to_postgres(...)` - Export to PostgreSQL
400
+ - `backup(path)` - Create backup
401
+ - `close()` - Close connection
402
+
403
+ ## Troubleshooting
404
+
405
+ **MySQL database doesn't exist**
406
+ - Fixed - the database is now created automatically
407
+
408
+ **Only 10 records from large PDF**
409
+ - Fixed - the entire document is now processed in chunks
410
+
411
+ **connect_mysql() not found**
412
+ - Update: `pip install --upgrade QuerySUTRA`
413
+
414
+ **Embeddings not working**
415
+ - Install: `pip install QuerySUTRA[embeddings]`
416
+
417
+ ## Requirements
418
+
419
+ - Python 3.8+
420
+ - OpenAI API key
421
+ - MySQL/PostgreSQL (optional)
422
+
423
+ ## License
424
+
425
+ MIT License
426
+
427
+ ## Changelog
428
+
429
+ **v0.4.2**
430
+ - Complete embeddings documentation
431
+ - Simplified workflows
432
+
433
+ **v0.4.0**
434
+ - Auto-creates MySQL database
435
+ - Complete PDF extraction
436
+ - Chunk processing
437
+ - Auto-export feature
438
+
439
+ ---
440
+
441
+ **Made by Aditya Batta**