querysutra-0.5.1-py3-none-any.whl → querysutra-0.5.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
querysutra-0.5.2.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: QuerySUTRA
-Version: 0.5.1
+Version: 0.5.2
 Summary: SUTRA
 Author: Aditya Batta
 License: MIT
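
The only metadata change is the version field. For readers who want to confirm what pip actually resolved, a quick check via the standard library (assumes QuerySUTRA is installed locally):

    from importlib.metadata import metadata, version

    print(version("QuerySUTRA"))        # "0.5.2" for the new wheel
    md = metadata("QuerySUTRA")
    print(md["Author"], md["License"])  # Aditya Batta MIT (unchanged)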
querysutra-0.5.2.dist-info/RECORD CHANGED
@@ -1,5 +1,5 @@
-querysutra-0.5.1.dist-info/licenses/LICENSE,sha256=F-4b93u0OVrVwGXgMwBRq6MlGyUT9zmre1oh5Gft5Ts,1066
-sutra/__init__.py,sha256=fCBD8dtNCkIaglLrLPBC4UGJxYPUJ7GyCfBh7zj8bLg,118
+querysutra-0.5.2.dist-info/licenses/LICENSE,sha256=F-4b93u0OVrVwGXgMwBRq6MlGyUT9zmre1oh5Gft5Ts,1066
+sutra/__init__.py,sha256=25HUMETpmA1tlMl5j-ajdo9MRXljSZBrirSTH7w7jIc,118
 sutra/cache_manager.py,sha256=e0AAeUqoR-aiqzZ3fB-IDvpJ4JA6-YBFyRJxusEnIrA,3082
 sutra/clear_cache.py,sha256=rVIz29p7V11Uh6oHXeaWpFtYXXv-2OED91cHMAWWxtQ,187
 sutra/core.py,sha256=R_JbOlZTukegP92Dr-WLsdr632_otFN7o9qSvcxyBtw,10497
@@ -11,7 +11,7 @@ sutra/feedback_matcher.py,sha256=WXYpGtFJnOyYQOzy-z8uBiUWH5vyJJOMS1NwEYzNfic,286
 sutra/nlp_processor.py,sha256=wMS1hz1aGWjSwPUD7lSNBbQapFtLgF2l65j0QKXQOd0,5461
 sutra/schema_embeddings.py,sha256=bVPzpJOdYTyUdG2k3ZdgYJLrX2opHBx68RIjJcMlueo,9732
 sutra/schema_generator.py,sha256=BX_vXmnvSGc6nCBx40WLSoNL3WIYPDahd1cEYloyY4M,1925
-sutra/sutra.py,sha256=A2qX0tm2eaxVTU4yNKFk8v07suYaD86P1degwBhAyGk,22919
+sutra/sutra.py,sha256=XgNCY8QPOod0-ymt6R50JMaHJetyfTsElzyvNHpYStw,20664
 sutra/sutra_client.py,sha256=PYYDGqVbA9pB-Zcsm52i9KarwijCIGVZOThgONZP6Vs,14203
 sutra/sutra_core.py,sha256=diaWOXUHn1wrqCQrBhLKL612tMQioaqx-ILc3y9-CqM,11708
 sutra/sutra_simple.py,sha256=rnqzG7OAt4p64XtO0peMqHS1pG5tdA8U3EYTMVsq7BE,23201
@@ -22,7 +22,7 @@ tests/test_sutra.py,sha256=6Z4SoIuBzza101304I7plkyPVkUBbjIxR8uPs9z5ntg,2383
 utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 utils/file_utils.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 utils/text_utils.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-querysutra-0.5.1.dist-info/METADATA,sha256=uiNLBUFwgNkwo1NfMYkg7uZLzfgzoEnTncNwweRnenY,7258
-querysutra-0.5.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-querysutra-0.5.1.dist-info/top_level.txt,sha256=9v0buw21eo5LaUU_3Cf9b9MqRyEvtM9cHaOuEXUKVqM,18
-querysutra-0.5.1.dist-info/RECORD,,
+querysutra-0.5.2.dist-info/METADATA,sha256=8brpcR8UxQwuz28hi8oUL8F5Dfug5AcFk_SdReJlWd0,7258
+querysutra-0.5.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+querysutra-0.5.2.dist-info/top_level.txt,sha256=9v0buw21eo5LaUU_3Cf9b9MqRyEvtM9cHaOuEXUKVqM,18
+querysutra-0.5.2.dist-info/RECORD,,
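
Only three content hashes change in this release: sutra/__init__.py, sutra/sutra.py, and the regenerated METADATA; every other entry is byte-identical, with only the dist-info paths renamed. To spot-check a file against its RECORD line, a minimal sketch (wheel RECORDs use urlsafe-base64 SHA-256 with the '=' padding stripped, per PEP 376/PEP 427; the path below is illustrative):

    import base64
    import hashlib

    def record_hash(path: str) -> str:
        # RECORD hash field: "sha256=" + urlsafe-base64 digest, '=' padding removed
        with open(path, "rb") as f:
            digest = hashlib.sha256(f.read()).digest()
        return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")

    print(record_hash("sutra/sutra.py"))  # compare against the +sutra/sutra.py line above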
sutra/__init__.py CHANGED
@@ -1,4 +1,4 @@
-"""QuerySUTRA v0.5.1"""
-__version__="0.5.1"
+"""QuerySUTRA v0.5.2"""
+__version__="0.5.2"
 from sutra.sutra import SUTRA,QueryResult
 __all__=["SUTRA","QueryResult"]
sutra/sutra.py CHANGED
@@ -1,10 +1,5 @@
-"""
-QuerySUTRA v0.5.0 - BULLETPROOF
-GUARANTEED to create multiple tables with proper keys
-NEVER falls back to single table
-"""
-
-__version__ = "0.5.0"
+"""QuerySUTRA v0.5.1 - BULLETPROOF & FIXED"""
+__version__ = "0.5.1"
 __author__ = "Aditya Batta"
 __all__ = ["SUTRA", "QueryResult"]
 
@@ -46,7 +41,7 @@ except:
 
 
 class SUTRA:
-    """SUTRA - BULLETPROOF AI EXTRACTION"""
+    """SUTRA - BULLETPROOF"""
 
     def __init__(self, api_key: Optional[str] = None, db: str = "sutra.db",
                  use_embeddings: bool = False, fuzzy_match: bool = True,
@@ -77,10 +72,10 @@ class SUTRA:
            pass
 
        self._refresh_schema()
-        print(f"QuerySUTRA v0.5.0 Ready")
+        print(f"QuerySUTRA v0.5.1 Ready")
 
    def upload(self, data: Union[str, pd.DataFrame], name: Optional[str] = None) -> 'SUTRA':
-        """Upload data."""
+        """Upload."""
        if isinstance(data, pd.DataFrame):
            self._store(data, name or "data")
            return self
@@ -110,7 +105,7 @@ class SUTRA:
        return self
 
    def _pdf(self, path: Path, name: str):
-        """BULLETPROOF PDF extraction - GUARANTEED to create multiple tables."""
+        """BULLETPROOF PDF - ALWAYS creates multiple tables."""
        if not HAS_PYPDF2:
            raise ImportError("pip install PyPDF2")
 
@@ -120,127 +115,104 @@
            text = "".join([p.extract_text() + "\n" for p in PyPDF2.PdfReader(f).pages])
 
        if not self.client:
-            print("No API key - using simple extraction")
-            self._store(pd.DataFrame({'line': range(1, len(text.split('\n'))), 'text': text.split('\n')}), name)
+            print("ERROR: No API key! Set api_key parameter")
            return
 
-        print("AI: Extracting entities (BULLETPROOF mode)...")
+        print("AI: Extracting...")
 
-        # TRY 3 TIMES with progressively simpler prompts
+        # TRY 3 TIMES
        entities = None
-
-        # ATTEMPT 1: Full extraction
-        entities = self._extract(text, attempt=1)
-
-        # ATTEMPT 2: Simpler prompt
-        if not entities or len(entities) == 0:
-            print(" Retry with simpler prompt...")
-            entities = self._extract(text, attempt=2)
-
-        # ATTEMPT 3: Basic extraction
-        if not entities or len(entities) == 0:
-            print(" Final retry with basic prompt...")
-            entities = self._extract(text, attempt=3)
-
-        # SUCCESS - Create tables
+        for attempt in [1, 2, 3]:
+            entities = self._extract(text, attempt)
+            if entities and len(entities) > 0:
+                break
+            if attempt < 3:
+                print(f" Retry {attempt+1}/3...")
+
+        # Create tables from entities
        if entities and len(entities) > 0:
-            print(f"SUCCESS! Extracted {len(entities)} entity types:")
+            print(f"Extracted {len(entities)} entity types:")
            for etype, recs in entities.items():
                if recs and len(recs) > 0:
-                    # Renumber IDs
                    for idx, rec in enumerate(recs, 1):
                        rec['id'] = idx
-
-                    df = pd.DataFrame(recs)
-                    self._store(df, f"{name}_{etype}")
-                    print(f" {etype}: {len(df)} rows")
+                    self._store(pd.DataFrame(recs), f"{name}_{etype}")
+                    print(f" {etype}: {len(recs)} rows")
            return
 
-        # LAST RESORT - Force at least people table from text analysis
-        print("WARNING: AI extraction failed 3 times - using text analysis...")
-
-        # Try to extract at least names/emails with regex
+        # REGEX FALLBACK - FIXED
+        print("Using regex fallback...")
        people = []
        emails = re.findall(r'[\w\.-]+@[\w\.-]+\.\w+', text)
-        names = re.findall(r'(?:Employee|Mr\.|Mrs\.|Ms\.|Dr\.)\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)', text)
 
-        for i, (email, name_match) in enumerate(zip(emails[:50], names[:50] if names else [f"Person {i+1}" for i in range(len(emails))]), 1):
-            people.append({'id': i, 'name': name_match if isinstance(name_match, str) else f"Person {i}", 'email': email})
+        # Extract names from common patterns
+        name_patterns = [
+            r'(?:Employee|Name|Mr\.|Mrs\.|Ms\.|Dr\.)\s*[:\-]?\s*([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)',
+            r'([A-Z][a-z]+\s+[A-Z][a-z]+)\s+(?:lives|resides|works|is based)',
+            r'\*\*([A-Z][a-z]+\s+[A-Z][a-z]+)\*\*'
+        ]
+
+        names = []
+        for pattern in name_patterns:
+            names.extend(re.findall(pattern, text))
+            if len(names) >= len(emails):
+                break
+
+        # Match emails to names
+        max_people = min(len(emails), 50)
+        for i in range(max_people):
+            people.append({
+                'id': i + 1,
+                'name': names[i] if i < len(names) else f"Person {i+1}",
+                'email': emails[i] if i < len(emails) else f"person{i+1}@unknown.com"
+            })
 
        if people:
            self._store(pd.DataFrame(people), f"{name}_people")
            print(f" Extracted {len(people)} people via regex")
        else:
-            # Absolute fallback
-            self._store(pd.DataFrame({'line': range(1, min(100, len(text.split('\n')))), 'text': text.split('\n')[:100]}), name)
+            # Absolute last resort
+            lines = [l.strip() for l in text.split('\n') if l.strip()][:100]
+            self._store(pd.DataFrame({'line': range(1, len(lines)+1), 'text': lines}), name)
 
    def _extract(self, text: str, attempt: int) -> Dict:
-        """Extract with different strategies."""
+        """Extract with 3 different strategies."""
        if not self.client:
            return {}
 
        try:
            if attempt == 1:
-                # Detailed extraction
-                sys_msg = "You are a JSON extraction expert. Extract ALL entities with unique sequential IDs and proper foreign keys. Return ONLY valid JSON, absolutely no other text."
-                usr_msg = f"""Extract ALL structured entities from this text into a JSON object.
+                sys_msg = "Extract entities as JSON. Return ONLY valid JSON."
+                usr_msg = f"""Extract ALL entities from text.
 
-Text (first 15000 chars):
+Text:
 {text[:15000]}
 
-Create separate arrays for these entity types (only if data exists):
-- people: id (int), name (str), email (str), phone (str), address (str), city (str), state (str), zip (str)
-- skills: id (int), person_id (int), skill_name (str), proficiency (str), years (int)
-- technologies: id (int), person_id (int), technology (str), category (str), proficiency (str)
-- projects: id (int), person_id (int), project_name (str), description (str), start_date (str), end_date (str)
-- certifications: id (int), person_id (int), cert_name (str), issuer (str), date_obtained (str)
-- education: id (int), person_id (int), degree (str), institution (str), graduation_year (str)
-- work_experience: id (int), person_id (int), company (str), title (str), start_date (str), end_date (str)
+Return JSON with: people, skills, technologies, projects, certifications, education, work_experience
 
-CRITICAL RULES:
-1. IDs must be unique sequential integers: 1, 2, 3, 4...
-2. person_id in related tables MUST reference valid people.id values
-3. Extract EVERY person, skill, technology, project you find
-4. Return ONLY the JSON object, no markdown, no explanations
+Example:
+{{"people":[{{"id":1,"name":"Sarah Johnson","email":"sarah@co.com","city":"New York","state":"NY"}},{{"id":2,"name":"Michael Chen","email":"michael@co.com","city":"SF","state":"CA"}}],"skills":[{{"id":1,"person_id":1,"skill_name":"Python","proficiency":"Expert"}}]}}
 
-Example output format:
-{{
-  "people": [
-    {{"id": 1, "name": "Sarah Johnson", "email": "sarah@company.com", "phone": "(212) 555-0147", "city": "New York", "state": "NY"}},
-    {{"id": 2, "name": "Michael Chen", "email": "michael@company.com", "phone": "(415) 555-0283", "city": "San Francisco", "state": "CA"}}
-  ],
-  "skills": [
-    {{"id": 1, "person_id": 1, "skill_name": "Python", "proficiency": "Expert", "years": 5}},
-    {{"id": 2, "person_id": 1, "skill_name": "SQL", "proficiency": "Advanced", "years": 3}},
-    {{"id": 3, "person_id": 2, "skill_name": "Product Management", "proficiency": "Expert", "years": 7}}
-  ]
-}}
+Rules: Unique IDs (1,2,3...), person_id references people.id
 
-Now extract from the text above. Return ONLY valid JSON:"""
+JSON:"""
 
            elif attempt == 2:
-                # Simplified extraction
-                sys_msg = "Extract entities as JSON. Return only JSON."
+                sys_msg = "Return JSON."
                usr_msg = f"""Text: {text[:10000]}
 
-Extract people, skills, technologies as JSON:
-{{"people":[{{"id":1,"name":"...","email":"...","city":"..."}}],"skills":[{{"id":1,"person_id":1,"skill_name":"..."}}]}}
-
-Rules: Unique IDs (1,2,3...), person_id links to people.id
+Extract people as JSON:
+{{"people":[{{"id":1,"name":"...","email":"...","city":"..."}}]}}
 
-JSON only:"""
+JSON:"""
 
            else:
-                # Basic extraction
-                sys_msg = "Return JSON only."
-                usr_msg = f"""Text: {text[:8000]}
-
-Find people with names, emails, cities. Return as JSON:
-{{"people":[{{"id":1,"name":"John","email":"john@co.com","city":"NYC"}}]}}
+                sys_msg = "JSON only."
+                usr_msg = f"""Find names and emails in: {text[:8000]}
 
-JSON:"""
+{{"people":[{{"id":1,"name":"John","email":"john@co.com"}}]}}"""
 
-            resp = self.client.chat.completions.create(
+            r = self.client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {"role": "system", "content": sys_msg},
@@ -250,26 +222,18 @@ JSON:"""
                max_tokens=12000
            )
 
-            raw = resp.choices[0].message.content.strip()
-
-            # AGGRESSIVE JSON extraction
-            raw = raw.replace("```json", "").replace("```", "").replace("JSON:", "").replace("json", "").strip()
+            raw = r.choices[0].message.content.strip()
+            raw = raw.replace("```json", "").replace("```", "").replace("JSON:", "").strip()
 
-            # Find JSON object
            start = raw.find('{')
            end = raw.rfind('}') + 1
 
            if start < 0 or end <= start:
                return {}
 
-            json_str = raw[start:end]
+            result = json.loads(raw[start:end])
 
-            # Parse
-            result = json.loads(json_str)
-
-            # Validate
            if isinstance(result, dict) and len(result) > 0:
-                # Check if at least one entity type has data
                has_data = any(isinstance(v, list) and len(v) > 0 for v in result.values())
                if has_data:
                    return result
@@ -277,7 +241,7 @@ JSON:"""
            return {}
 
        except Exception as e:
-            print(f" Attempt {attempt} failed: {e}")
+            print(f" Attempt {attempt} failed: {str(e)[:100]}")
            return {}
 
    def _docx(self, path: Path, name: str):
@@ -292,13 +256,15 @@ JSON:"""
                self._store(pd.DataFrame(data[1:], columns=data[0]), f"{name}_t{i+1}")
        else:
            text = "\n".join([p.text for p in doc.paragraphs])
-            self._store(pd.DataFrame({'line': range(len(text.split('\n'))), 'text': text.split('\n')}), name)
+            lines = [l.strip() for l in text.split('\n') if l.strip()]
+            self._store(pd.DataFrame({'line': range(1, len(lines)+1), 'text': lines}), name)
 
    def _txt(self, path: Path, name: str):
        """TXT."""
        with open(path, 'r', encoding='utf-8') as f:
            text = f.read()
-        self._store(pd.DataFrame({'line': range(len(text.split('\n'))), 'text': text.split('\n')}), name)
+        lines = [l.strip() for l in text.split('\n') if l.strip()]
+        self._store(pd.DataFrame({'line': range(1, len(lines)+1), 'text': lines}), name)
 
    def _store(self, df: pd.DataFrame, name: str):
        """Store."""
@@ -353,7 +319,7 @@ JSON:"""
            return QueryResult(False, sql, pd.DataFrame(), None, str(e))
 
    def _fuzzy(self, q: str, t: str) -> str:
-        """Fuzzy match."""
+        """Fuzzy."""
        try:
            cols = [c for c, d in self.schema_info.get(t, {}).items() if 'TEXT' in d]
            if not cols:
@@ -372,7 +338,7 @@ JSON:"""
        return q
 
    def _check_cache(self, q: str, t: str) -> Optional['QueryResult']:
-        """Check cache."""
+        """Cache."""
        if not self.query_embeddings:
            return None
        emb = self.embedding_model.encode([q])[0]
@@ -386,7 +352,7 @@ JSON:"""
        return self.query_embeddings[best]['result'] if best else None
 
    def _store_cache(self, q: str, t: str, r: 'QueryResult'):
-        """Store cache."""
+        """Store."""
        emb = self.embedding_model.encode([q])[0]
        self.query_embeddings[q] = {'table': t, 'embedding': emb, 'result': r}
 
@@ -413,7 +379,7 @@ JSON:"""
        return None
 
    def tables(self) -> Dict:
-        """List tables."""
+        """Tables."""
        t = self._get_tables()
        print("\n" + "="*70)
        print("TABLES")
@@ -469,7 +435,7 @@ JSON:"""
            return QueryResult(False, query, pd.DataFrame(), None, str(e))
 
    def save_to_mysql(self, host: str, user: str, password: str, database: str, port: int = 3306):
-        """MySQL export."""
+        """MySQL."""
        try:
            from sqlalchemy import create_engine
            import mysql.connector
@@ -506,14 +472,14 @@ JSON:"""
 
    @classmethod
    def load_from_db(cls, db_path: str, api_key: Optional[str] = None, **kwargs):
-        """Load database."""
+        """Load."""
        if not Path(db_path).exists():
            raise FileNotFoundError(f"Not found: {db_path}")
        return cls(api_key=api_key, db=db_path, **kwargs)
 
    @classmethod
    def connect_mysql(cls, host: str, user: str, password: str, database: str, port: int = 3306, api_key: Optional[str] = None, **kwargs):
-        """Connect MySQL."""
+        """MySQL."""
        try:
            from sqlalchemy import create_engine
            import mysql.connector
@@ -540,7 +506,7 @@ JSON:"""
        return instance
 
    def _gen_sql(self, q: str, t: str) -> str:
-        """Generate SQL."""
+        """SQL."""
        schema = self.schema_info.get(t, {})
        sample = pd.read_sql_query(f"SELECT * FROM {t} LIMIT 3", self.conn).to_string(index=False)
        cols = ", ".join([f"{c} ({d})" for c, d in schema.items()])