QuerySUTRA 0.3.3__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,438 @@
+ Metadata-Version: 2.4
+ Name: QuerySUTRA
+ Version: 0.4.0
+ Summary: SUTRA: Structured-Unstructured-Text-Retrieval-Architecture - Complete data extraction
+ Author: Aditya Batta
+ License: MIT
+ Requires-Python: >=3.8
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: pandas>=1.3.0
+ Requires-Dist: numpy>=1.21.0
+ Requires-Dist: openai>=1.0.0
+ Requires-Dist: plotly>=5.0.0
+ Requires-Dist: matplotlib>=3.3.0
+ Requires-Dist: PyPDF2>=3.0.0
+ Requires-Dist: python-docx>=0.8.11
+ Requires-Dist: openpyxl>=3.0.0
+ Provides-Extra: mysql
+ Requires-Dist: sqlalchemy>=1.4.0; extra == "mysql"
+ Requires-Dist: mysql-connector-python>=8.0.0; extra == "mysql"
+ Provides-Extra: postgres
+ Requires-Dist: sqlalchemy>=1.4.0; extra == "postgres"
+ Requires-Dist: psycopg2-binary>=2.9.0; extra == "postgres"
+ Provides-Extra: embeddings
+ Requires-Dist: sentence-transformers>=2.0.0; extra == "embeddings"
+ Provides-Extra: all
+ Requires-Dist: sqlalchemy>=1.4.0; extra == "all"
+ Requires-Dist: mysql-connector-python>=8.0.0; extra == "all"
+ Requires-Dist: psycopg2-binary>=2.9.0; extra == "all"
+ Requires-Dist: sentence-transformers>=2.0.0; extra == "all"
+ Dynamic: license-file
+ Dynamic: requires-python
+
+ # QuerySUTRA
+
+ **SUTRA: Structured-Unstructured-Text-Retrieval-Architecture**
+
+ A professional Python library for AI-powered data analysis, with automatic entity extraction, natural language querying, and intelligent caching.
+
+ ## Installation
+
+ ```bash
+ pip install QuerySUTRA
+
+ # With optional features
+ pip install QuerySUTRA[embeddings]  # Smart caching
+ pip install QuerySUTRA[mysql]       # MySQL support
+ pip install QuerySUTRA[postgres]    # PostgreSQL support
+ pip install QuerySUTRA[all]         # All features
+ ```
+
+ ## Key Features
+
+ ### 1. Complete Data Extraction from Large Documents
+ Processes entire documents in chunks, so no data is lost on large PDFs.
+
+ ```python
+ from sutra import SUTRA
+
+ sutra = SUTRA(api_key="your-openai-key")
+ sutra.upload("large_document.pdf")  # Extracts ALL data, not just the first page
+
+ # Automatically creates multiple tables:
+ # - document_people (40 rows, 8 columns)
+ # - document_skills (50 rows, 5 columns)
+ # - document_technologies (30 rows, 4 columns)
+ # - document_projects (25 rows, 6 columns)
+ # etc.
+ ```
+
+ ### 2. Natural Language Querying
+
+ ```python
+ result = sutra.ask("Show me all people from New York")
+ print(result.data)
+
+ # With visualization
+ result = sutra.ask("Show sales by region", viz="pie")
+ ```
+
+ ### 3. Load Existing Databases
+
+ ```python
+ # Load SQLite database
+ sutra = SUTRA.load_from_db("sutra.db", api_key="your-key")
+
+ # Connect to MySQL
+ sutra = SUTRA.connect_mysql("localhost", "root", "password", "database")
+
+ # Connect to PostgreSQL
+ sutra = SUTRA.connect_postgres("localhost", "postgres", "password", "database")
+ ```
+
+ ### 4. Import SQLite to MySQL
+
+ **Step 1: In Colab or Python - Export Database**
+ ```python
+ # After uploading your data
+ sutra.upload("data.pdf")
+ sutra.tables()
+
+ # Export to SQLite
+ sutra.export_db("my_data.db", format="sqlite")
+
+ # In Colab, download the file
+ from google.colab import files
+ files.download("my_data.db")
+ ```
+
+ **Step 2: On Windows - Import to MySQL**
+
+ Method A: Using QuerySUTRA
+ ```python
+ from sutra import SUTRA
+
+ # Load the SQLite database
+ sutra = SUTRA(api_key="your-key", db="my_data.db")
+
+ # Verify tables
+ sutra.tables()
+
+ # Export to MySQL
+ sutra.save_to_mysql("localhost", "root", "password", "my_database")
+ ```
+
+ Method B: Using the simple_import.py Script
+
+ Download the conversion script from the repository, or create `simple_import.py` yourself:
+
+ ```python
+ import sqlite3
+ import mysql.connector
+ import pandas as pd
+
+ # Configuration
+ SQLITE_DB = "my_data.db"
+ MYSQL_HOST = "localhost"
+ MYSQL_USER = "root"
+ MYSQL_PASSWORD = "your_password"
+ MYSQL_DATABASE = "my_database"
+
+ # Connect to SQLite
+ sqlite_conn = sqlite3.connect(SQLITE_DB)
+ cursor = sqlite_conn.cursor()
+ cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
+ tables = [row[0] for row in cursor.fetchall()]
+
+ # Connect to MySQL and create database
+ mysql_conn_temp = mysql.connector.connect(
+     host=MYSQL_HOST, user=MYSQL_USER, password=MYSQL_PASSWORD
+ )
+ temp_cursor = mysql_conn_temp.cursor()
+ temp_cursor.execute(f"CREATE DATABASE IF NOT EXISTS {MYSQL_DATABASE}")
+ temp_cursor.close()
+ mysql_conn_temp.close()
+
+ # Connect to database and import
+ mysql_conn = mysql.connector.connect(
+     host=MYSQL_HOST, user=MYSQL_USER,
+     password=MYSQL_PASSWORD, database=MYSQL_DATABASE
+ )
+ mysql_cursor = mysql_conn.cursor()
+
+ for table in tables:
+     df = pd.read_sql_query(f"SELECT * FROM {table}", sqlite_conn)
+
+     # Create table
+     mysql_cursor.execute(f"DROP TABLE IF EXISTS {table}")
+     cols = []
+     for col in df.columns:
+         dtype = 'INT' if df[col].dtype == 'int64' else 'FLOAT' if df[col].dtype == 'float64' else 'TEXT'
+         cols.append(f"`{col}` {dtype}")
+     mysql_cursor.execute(f"CREATE TABLE {table} ({', '.join(cols)})")
+
+     # Insert data
+     if len(df) > 0:
+         placeholders = ', '.join(['%s'] * len(df.columns))
+         for _, row in df.iterrows():
+             # Convert numpy scalars to native Python types so mysql-connector accepts them
+             vals = [None if pd.isna(v) else (v.item() if hasattr(v, "item") else v) for v in row.values]
+             mysql_cursor.execute(f"INSERT INTO {table} VALUES ({placeholders})", vals)
+
+     mysql_conn.commit()
+     print(f"Imported {table}: {len(df)} rows")
+
+ sqlite_conn.close()
+ mysql_cursor.close()
+ mysql_conn.close()
+ print("Complete!")
+ ```
+
+ Run: `python simple_import.py`
+
+ **Step 3: Verify in MySQL**
+ ```sql
+ USE my_database;
+ SHOW TABLES;
+ SELECT * FROM employee_data_people;
+ ```
+
+ ### 5. Custom Visualizations
+
+ ```python
+ result = sutra.ask("Sales by region", viz="pie")    # Pie chart
+ result = sutra.ask("Trends", viz="line")            # Line chart
+ result = sutra.ask("Compare", viz="bar")            # Bar chart
+ result = sutra.ask("Correlation", viz="scatter")    # Scatter plot
+ result = sutra.ask("Data", viz="table")             # Table view
+ result = sutra.ask("Analysis", viz="heatmap")       # Heatmap
+ result = sutra.ask("Auto", viz=True)                # Auto-detect
+ ```
+
+ ### 6. Smart Fuzzy Matching
+
+ ```python
+ sutra = SUTRA(api_key="your-key", fuzzy_match=True)
+
+ # "New York City" automatically matches "New York"
+ result = sutra.ask("Who are from New York City?")
+ ```
+
+ ### 7. Intelligent Caching with Embeddings
+
+ ```python
+ sutra = SUTRA(api_key="your-key", use_embeddings=True)
+
+ result = sutra.ask("Show sales")          # Calls API
+ result = sutra.ask("Display sales data")  # Uses cache (no API call)
+ ```
+
+ ### 8. Irrelevant Query Detection
+
+ ```python
+ sutra = SUTRA(api_key="your-key", check_relevance=True)
+
+ result = sutra.ask("What is the weather?")
+ # Warns: "Query may be irrelevant to your database"
+ ```
+
+ ### 9. Direct SQL Access
+
+ ```python
+ result = sutra.sql("SELECT * FROM people WHERE city='New York'")
+ print(result.data)
+ ```
+
+ ## Complete Configuration
+
+ ```python
+ sutra = SUTRA(
+     api_key="your-openai-key",
+     db="database.db",        # SQLite path
+     use_embeddings=True,     # Smart caching (saves API calls)
+     check_relevance=True,    # Detect irrelevant queries
+     fuzzy_match=True,        # Better NLP
+     cache_queries=True       # Simple caching
+ )
+ ```
+
+ ## Supported Formats
+
+ CSV, Excel, JSON, SQL, PDF, Word, Text, Pandas DataFrame
+
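+ As a quick illustration of these formats (an editorial sketch, not taken from the package README), the example below assumes `upload()` accepts both file paths and in-memory pandas DataFrames, with the `name` parameter from the API reference used to label the DataFrame; all file names are placeholders:
+
+ ```python
+ import pandas as pd
+ from sutra import SUTRA
+
+ sutra = SUTRA(api_key="your-openai-key")
+
+ # File-based formats are passed by path
+ sutra.upload("sales.csv")
+ sutra.upload("quarterly_report.docx")
+
+ # An in-memory DataFrame is passed directly, with an explicit table name
+ df = pd.DataFrame({"region": ["East", "West"], "revenue": [1200, 950]})
+ sutra.upload(df, name="sales_summary")
+ ```
+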
+ ## How It Works
+
+ ### Multi-Table Entity Extraction
+
+ From a single PDF, QuerySUTRA automatically creates multiple related tables (a short inspection sketch follows the table list):
+
+ **Input:** Employee PDF with 40 employees
+
+ **Output Tables:**
+ - employee_data_people (40 rows)
+ - employee_data_skills (50 rows)
+ - employee_data_technologies (30 rows)
+ - employee_data_projects (25 rows)
+ - employee_data_certifications (20 rows)
+ - employee_data_education (40 rows)
+ - employee_data_work_experience (35 rows)
+
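+ For orientation (an editorial sketch, not part of the original README), the generated tables can be inspected with the documented `tables()`, `schema()`, and `peek()` helpers; the README does not specify the exact return format of each call, so the print statements are illustrative:
+
+ ```python
+ from sutra import SUTRA
+
+ sutra = SUTRA(api_key="your-openai-key")
+ sutra.upload("employee_data.pdf")
+
+ # List every table that was extracted
+ print(sutra.tables())
+
+ # Look at the structure and a few rows of one of them
+ print(sutra.schema("employee_data_people"))
+ print(sutra.peek("employee_data_people", 5))
+ ```
+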
+ ### Proper Relational Structure
+
+ Tables have unique primary keys and proper foreign key relationships, so they can be joined directly (see the SQL sketch after this example):
+
+ ```
+ people table:
+   id=1, name="John Doe", city="Dallas"
+   id=2, name="Jane Smith", city="New York"
+
+ skills table:
+   id=1, person_id=1, skill_name="Python"
+   id=2, person_id=1, skill_name="SQL"
+   id=3, person_id=2, skill_name="Java"
+ ```
+
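+ Building on the example above (an editorial sketch, not from the original README), a join across the two tables can be run through the documented `sql()` method; the table and column names come from the example and may differ in a real extraction:
+
+ ```python
+ # Which skills does each person have? Join people to skills on the foreign key.
+ result = sutra.sql("""
+     SELECT p.name, s.skill_name
+     FROM people p
+     JOIN skills s ON s.person_id = p.id
+     ORDER BY p.name
+ """)
+ print(result.data)
+ ```
+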
+ ### Chunk Processing for Large Documents
+
+ v0.4.0 processes documents in 10,000-character chunks, ensuring all data is extracted (a sketch of the approach follows this list):
+
+ - PDF with 50 employees: extracts all 50 (not just the first 10)
+ - Large documents: processes the entire content
+ - Merges results with unique IDs across chunks
+
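+ The README does not show the internal implementation, but the idea it describes can be sketched as follows (editorial illustration; `extract_records` stands in for whatever per-chunk extraction the library actually performs):
+
+ ```python
+ CHUNK_SIZE = 10_000  # characters per chunk, as described above
+
+ def chunk_text(text: str, size: int = CHUNK_SIZE):
+     """Split the full document text into fixed-size character chunks."""
+     return [text[i:i + size] for i in range(0, len(text), size)]
+
+ def merge_chunk_results(chunks, extract_records):
+     """Run extraction on every chunk and re-number IDs so they stay unique across chunks."""
+     merged, next_id = [], 1
+     for chunk in chunks:
+         for record in extract_records(chunk):  # hypothetical per-chunk extractor
+             record["id"] = next_id
+             next_id += 1
+             merged.append(record)
+     return merged
+ ```
+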
+ ## API Reference
+
+ ### Class Methods
+
+ - `SUTRA.load_from_db(db_path, api_key, **kwargs)` - Load existing SQLite
+ - `SUTRA.connect_mysql(host, user, password, database, ...)` - Connect to MySQL
+ - `SUTRA.connect_postgres(host, user, password, database, ...)` - Connect to PostgreSQL
+
+ ### Instance Methods
+
+ - `upload(data, name, extract_entities)` - Upload data
+ - `ask(question, viz, table)` - Natural language query
+ - `sql(query, viz)` - Raw SQL query
+ - `tables()` - List all tables
+ - `schema(table)` - Show schema
+ - `peek(table, n)` - Preview data
+ - `export_db(path, format)` - Export database (sqlite/sql/json/excel)
+ - `save_to_mysql(...)` - Export to MySQL
+ - `save_to_postgres(...)` - Export to PostgreSQL
+ - `backup(path)` - Create backup
+ - `close()` - Close connection
+
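+ To tie the reference together (an editorial sketch, not from the original README), here is one way the instance methods listed above could be combined in a single session; the file names are placeholders:
+
+ ```python
+ from sutra import SUTRA
+
+ sutra = SUTRA(api_key="your-openai-key")
+ sutra.upload("document.pdf")
+
+ print(sutra.tables())                    # list extracted tables
+ print(sutra.schema("document_people"))   # inspect one table's schema
+ print(sutra.peek("document_people", 3))  # preview a few rows
+
+ result = sutra.ask("How many people are there per city?", viz="bar")
+ print(result.data)
+
+ sutra.export_db("document.db", format="sqlite")
+ sutra.backup("document_backup.db")
+ sutra.close()
+ ```
+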
+ ## Common Workflows
+
+ ### Workflow 1: Analyze a PDF in Colab, Export to Local MySQL
+
+ ```python
+ # In Colab
+ from sutra import SUTRA
+
+ sutra = SUTRA(api_key="your-key")
+ sutra.upload("document.pdf")
+ sutra.tables()
+
+ # Export and download
+ sutra.export_db("data.db", format="sqlite")
+ from google.colab import files
+ files.download("data.db")
+
+ # On Windows
+ sutra = SUTRA(api_key="your-key", db="data.db")
+ sutra.save_to_mysql("localhost", "root", "password", "my_database")
+ ```
+
+ ### Workflow 2: Load Existing Database and Query
+
+ ```python
+ # No need to re-upload data
+ sutra = SUTRA.load_from_db("data.db", api_key="your-key")
+ result = sutra.ask("Your question", viz="pie")
+ ```
+
+ ### Workflow 3: Query MySQL Directly
+
+ ```python
+ # Connect to and query a MySQL database
+ sutra = SUTRA.connect_mysql("localhost", "root", "password", "production_db")
+ result = sutra.ask("Show me the latest transactions")
+ ```
+
+ ## Performance Tips
+
+ 1. Use `load_from_db()` to avoid re-uploading data (tips 1-4 are combined in the sketch after this list)
+ 2. Use `sql()` for complex queries (no API cost)
+ 3. Enable `use_embeddings=True` for caching similar queries
+ 4. Enable `cache_queries=True` for exact query matches
+ 5. For large PDFs (50+ pages), allow extra processing time
+
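+ The first four tips can be combined as follows (an editorial sketch, not from the original README; it assumes the configuration flags shown under Complete Configuration can be passed through the documented `**kwargs` of `load_from_db()`, and the query and file names are placeholders):
+
+ ```python
+ from sutra import SUTRA
+
+ # Tip 1: reuse a previously exported database instead of re-uploading
+ sutra = SUTRA.load_from_db(
+     "data.db",
+     api_key="your-key",
+     use_embeddings=True,   # Tip 3: cache semantically similar questions
+     cache_queries=True,    # Tip 4: cache exact repeats
+ )
+
+ # Tip 2: complex, well-defined queries can skip the API entirely
+ result = sutra.sql("SELECT city, COUNT(*) AS n FROM people GROUP BY city")
+ print(result.data)
+ ```
+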
+ ## Troubleshooting
+
+ **Only extracting 10 records instead of 50:**
+ - Fixed in v0.4.0 with chunk processing
+ - Upgrade: `pip install --upgrade QuerySUTRA`
+
+ **MySQL import fails:**
+ - Ensure the MySQL database exists: `CREATE DATABASE my_database;`
+ - Install dependencies: `pip install QuerySUTRA[mysql]`
+ - Check that MySQL is running: `mysql -u root -p`
+
+ **Colab disk I/O error:**
+ - Fixed in v0.4.0 with better connection handling
+ - Restart the Colab runtime and try again
+
+ **`connect_mysql()` not found:**
+ - Update QuerySUTRA: `pip install --upgrade QuerySUTRA`
+ - Requires v0.3.0 or higher
+
+ ## System Requirements
+
+ - Python 3.8+
+ - OpenAI API key
+ - 100 MB disk space (for embeddings)
+ - MySQL/PostgreSQL (optional, for database export)
+
+ ## License
+
+ MIT License
+
+ ## Changelog
+
+ ### v0.4.0 (Latest)
+ - FIXED: Complete data extraction from large documents
+ - Chunk processing for PDFs with 50+ pages
+ - All records extracted (not just the first 10)
+ - Improved MySQL/PostgreSQL import
+ - Better error handling for Colab
+
+ ### v0.3.x
+ - Added MySQL/PostgreSQL connectivity
+ - Smart caching with embeddings
+ - Fuzzy matching for better NLP
+ - Custom visualizations
+ - Irrelevant query detection
+
+ ### v0.2.x
+ - Multi-table entity extraction
+ - Proper primary and foreign keys
+
+ ### v0.1.x
+ - Initial release
+
+ ---
+
+ **Made by Aditya Batta**
@@ -1,5 +1,5 @@
- querysutra-0.3.3.dist-info/licenses/LICENSE,sha256=F-4b93u0OVrVwGXgMwBRq6MlGyUT9zmre1oh5Gft5Ts,1066
- sutra/__init__.py,sha256=ujgmobOPa4UHLpXv7Nz84qQ2B2_BOHfyNE0lW1TlDuw,335
+ querysutra-0.4.0.dist-info/licenses/LICENSE,sha256=F-4b93u0OVrVwGXgMwBRq6MlGyUT9zmre1oh5Gft5Ts,1066
+ sutra/__init__.py,sha256=ESRehV9T3j3vU1UTA7JWM-BU-nVqBzlPSiGoBrHMHFs,256
  sutra/cache_manager.py,sha256=e0AAeUqoR-aiqzZ3fB-IDvpJ4JA6-YBFyRJxusEnIrA,3082
  sutra/clear_cache.py,sha256=rVIz29p7V11Uh6oHXeaWpFtYXXv-2OED91cHMAWWxtQ,187
  sutra/core.py,sha256=R_JbOlZTukegP92Dr-WLsdr632_otFN7o9qSvcxyBtw,10497
@@ -11,7 +11,7 @@ sutra/feedback_matcher.py,sha256=WXYpGtFJnOyYQOzy-z8uBiUWH5vyJJOMS1NwEYzNfic,286
  sutra/nlp_processor.py,sha256=wMS1hz1aGWjSwPUD7lSNBbQapFtLgF2l65j0QKXQOd0,5461
  sutra/schema_embeddings.py,sha256=bVPzpJOdYTyUdG2k3ZdgYJLrX2opHBx68RIjJcMlueo,9732
  sutra/schema_generator.py,sha256=BX_vXmnvSGc6nCBx40WLSoNL3WIYPDahd1cEYloyY4M,1925
- sutra/sutra.py,sha256=EGMBebWAYLzwADozUNbMQfEPv4xGCtL5zsbymPJ43HU,40554
+ sutra/sutra.py,sha256=CIDr5kqDFpzbohFV13xNy9MYw0I9TEvTl1A-89BzdNE,33856
  sutra/sutra_client.py,sha256=PYYDGqVbA9pB-Zcsm52i9KarwijCIGVZOThgONZP6Vs,14203
  sutra/sutra_core.py,sha256=diaWOXUHn1wrqCQrBhLKL612tMQioaqx-ILc3y9-CqM,11708
  sutra/sutra_simple.py,sha256=rnqzG7OAt4p64XtO0peMqHS1pG5tdA8U3EYTMVsq7BE,23201
@@ -22,7 +22,7 @@ tests/test_sutra.py,sha256=6Z4SoIuBzza101304I7plkyPVkUBbjIxR8uPs9z5ntg,2383
  utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  utils/file_utils.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  utils/text_utils.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- querysutra-0.3.3.dist-info/METADATA,sha256=DRUhb00cXQrf9JIjfbp9CiSJkoPrlgsY1wmWWjQ1pdI,7637
- querysutra-0.3.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- querysutra-0.3.3.dist-info/top_level.txt,sha256=9v0buw21eo5LaUU_3Cf9b9MqRyEvtM9cHaOuEXUKVqM,18
- querysutra-0.3.3.dist-info/RECORD,,
+ querysutra-0.4.0.dist-info/METADATA,sha256=jvIiXUtqi1ZR47HBq-fN56Hr24EIbe5wPX2FWqlOFTA,11890
+ querysutra-0.4.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ querysutra-0.4.0.dist-info/top_level.txt,sha256=9v0buw21eo5LaUU_3Cf9b9MqRyEvtM9cHaOuEXUKVqM,18
+ querysutra-0.4.0.dist-info/RECORD,,
sutra/__init__.py CHANGED
@@ -1,11 +1,9 @@
  """
  QuerySUTRA - Structured-Unstructured-Text-Retrieval-Architecture
- Creates multiple structured tables from ANY data with AI
-
- v0.3.3 - FIXED: Proper primary/foreign keys and comprehensive extraction
+ v0.3.4 - Proper relational database with unique IDs
  """

- __version__ = "0.3.3"
+ __version__ = "0.3.4"

  from sutra.sutra import SUTRA, QueryResult, quick_start