QuerySUTRA 0.5.2__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {querysutra-0.5.2.dist-info → querysutra-0.6.0.dist-info}/METADATA +18 -2
- querysutra-0.6.0.dist-info/RECORD +22 -0
- {querysutra-0.5.2.dist-info → querysutra-0.6.0.dist-info}/WHEEL +1 -1
- querysutra-0.6.0.dist-info/top_level.txt +1 -0
- sutra/__init__.py +6 -4
- sutra/database_manager.py +235 -195
- sutra/nlp_processor.py +175 -143
- sutra/schema_generator.py +56 -52
- sutra/sutra.py +196 -53
- querysutra-0.5.2.dist-info/RECORD +0 -28
- querysutra-0.5.2.dist-info/top_level.txt +0 -3
- tests/__init__.py +0 -0
- tests/test_modules.py +0 -0
- tests/test_sutra.py +0 -76
- utils/__init__.py +0 -0
- utils/file_utils.py +0 -0
- utils/text_utils.py +0 -0
- {querysutra-0.5.2.dist-info → querysutra-0.6.0.dist-info}/licenses/LICENSE +0 -0
sutra/nlp_processor.py
CHANGED
|
@@ -1,144 +1,176 @@
|
|
|
1
|
-
"""NLP to SQL query processor with relevancy checking"""
|
|
2
|
-
|
|
3
|
-
import pandas as pd
|
|
4
|
-
from typing import Optional, Tuple
|
|
5
|
-
from tabulate import tabulate
|
|
6
|
-
from sutra.cache_manager import CacheManager
|
|
7
|
-
import openai
|
|
8
|
-
import config
|
|
9
|
-
from sutra.feedback import SimpleFeedback
|
|
10
|
-
from sutra.schema_embeddings import SchemaEmbeddings
|
|
11
|
-
from sutra.feedback_matcher import FeedbackMatcher
|
|
12
|
-
|
|
13
|
-
class NLPProcessor:
|
|
14
|
-
"""Process natural language questions to SQL queries"""
|
|
15
|
-
|
|
16
|
-
def __init__(self, db_manager, openai_client=None):
|
|
17
|
-
self.db = db_manager
|
|
18
|
-
self.cache = CacheManager() if config.CACHE_ENABLED else None
|
|
19
|
-
self.model_name = config.MODEL_NAME
|
|
20
|
-
|
|
21
|
-
# Set the API key directly for openai 0.28.1
|
|
22
|
-
openai.api_key = config.OPENAI_API_KEY
|
|
23
|
-
|
|
24
|
-
# Added for feedback handling and tracking
|
|
25
|
-
self.feedback = SimpleFeedback()
|
|
26
|
-
self.last_question = None
|
|
27
|
-
self.last_sql = None
|
|
28
|
-
|
|
29
|
-
# ✅ NEW: Auto-load schema embeddings
|
|
30
|
-
self.relevancy_checker = SchemaEmbeddings(db_manager)
|
|
31
|
-
|
|
32
|
-
# ✅ NEW: Smart feedback matcher
|
|
33
|
-
self.feedback_matcher = FeedbackMatcher()
|
|
34
|
-
|
|
35
|
-
def nlp_to_sql(self, question: str) -> str:
|
|
36
|
-
"""Convert natural language question to SQL"""
|
|
37
|
-
|
|
38
|
-
# ✅ NEW: Check feedback for similar queries first
|
|
39
|
-
similar_sql, similarity = self.feedback_matcher.find_similar_query(question)
|
|
40
|
-
if similar_sql:
|
|
41
|
-
print(f"🎯 Found similar query in feedback (similarity: {similarity:.2f})")
|
|
42
|
-
return similar_sql
|
|
43
|
-
|
|
44
|
-
# Check cache next
|
|
45
|
-
if self.cache:
|
|
46
|
-
cached_sql = self.cache.get_cached_query(question)
|
|
47
|
-
if cached_sql:
|
|
48
|
-
print("⚡ Using cached query")
|
|
49
|
-
return cached_sql
|
|
50
|
-
|
|
51
|
-
# Only call API if no feedback match and no cache
|
|
52
|
-
print("🤖 Calling OpenAI API...")
|
|
53
|
-
|
|
54
|
-
# Get schema context
|
|
55
|
-
schema = self.db.get_schema_context()
|
|
56
|
-
|
|
57
|
-
prompt = f"""
|
|
58
|
-
Convert this question to a SQLite query
|
|
59
|
-
|
|
60
|
-
Question: {question}
|
|
61
|
-
|
|
62
|
-
Database schema:
|
|
63
|
-
{schema}
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
#
|
|
137
|
-
self.
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
1
|
+
"""NLP to SQL query processor with relevancy checking"""
|
|
2
|
+
|
|
3
|
+
import pandas as pd
|
|
4
|
+
from typing import Optional, Tuple
|
|
5
|
+
from tabulate import tabulate
|
|
6
|
+
from sutra.cache_manager import CacheManager
|
|
7
|
+
import openai
|
|
8
|
+
import config
|
|
9
|
+
from sutra.feedback import SimpleFeedback
|
|
10
|
+
from sutra.schema_embeddings import SchemaEmbeddings
|
|
11
|
+
from sutra.feedback_matcher import FeedbackMatcher
|
|
12
|
+
|
|
13
|
+
class NLPProcessor:
    """Process natural language questions to SQL queries.

    Resolution order for a question: stored-feedback match -> query cache ->
    OpenAI completion. A schema-embedding relevancy gate rejects questions
    unrelated to the database before any API call is made.
    """

    def __init__(self, db_manager, openai_client=None):
        """Initialize the processor.

        Args:
            db_manager: Database manager providing ``get_schema_context()``
                and ``execute_query()``.
            openai_client: Unused; kept for backward compatibility with
                callers that pass a client (openai 0.28.x uses module-level
                configuration instead).
        """
        self.db = db_manager
        self.cache = CacheManager() if config.CACHE_ENABLED else None
        self.model_name = config.MODEL_NAME

        # Set the API key directly for openai 0.28.1 (pre-1.0 SDK style:
        # module-level auth rather than a client object).
        openai.api_key = config.OPENAI_API_KEY

        # Track the most recent question/SQL pair so display_results()
        # can attach user feedback to it.
        self.feedback = SimpleFeedback()
        self.last_question = None
        self.last_sql = None

        # Schema embeddings used to reject questions unrelated to this DB.
        self.relevancy_checker = SchemaEmbeddings(db_manager)

        # Matches new questions against previously confirmed/corrected SQL.
        self.feedback_matcher = FeedbackMatcher()

    def nlp_to_sql(self, question: str) -> str:
        """Convert a natural language question to a SQLite query string.

        Checks feedback history first, then the cache; only calls the
        OpenAI API when neither produces a hit. API results are cached.

        Args:
            question: The user's natural-language question.

        Returns:
            A SQL SELECT statement as a plain string (markdown fences
            stripped).
        """
        # 1) Reuse SQL from past feedback when a similar question exists.
        similar_sql, similarity = self.feedback_matcher.find_similar_query(question)
        if similar_sql:
            print(f"🎯 Found similar query in feedback (similarity: {similarity:.2f})")
            return similar_sql

        # 2) Exact-question cache.
        if self.cache:
            cached_sql = self.cache.get_cached_query(question)
            if cached_sql:
                print("⚡ Using cached query")
                return cached_sql

        # 3) Only call the API if no feedback match and no cache hit.
        print("🤖 Calling OpenAI API...")

        # Schema context (includes a TABLE RELATIONSHIPS section the
        # prompt below instructs the model to consult).
        schema = self.db.get_schema_context()

        prompt = f"""
Convert this question to a SQLite query.

Question: {question}

Database schema:
{schema}

CRITICAL INSTRUCTIONS FOR MULTI-TABLE QUERIES:

**STEP 1: CHECK TABLE RELATIONSHIPS FIRST**
Look at the "=== TABLE RELATIONSHIPS ===" section at the top of the schema.
These show you exactly how tables are connected via foreign keys.
Format: table1.column → table2.column means table1.column references table2.column

**STEP 2: IDENTIFY REQUIRED TABLES**
Analyze which tables contain the data needed to answer the question.
If information is spread across multiple tables, you MUST join them.

**STEP 3: USE THE RELATIONSHIPS TO JOIN**
When you need data from multiple tables:
- Use the foreign key relationships shown in the TABLE RELATIONSHIPS section
- Join table1 to table2 using: JOIN table2 ON table1.fk_column = table2.pk_column
- Use INNER JOIN when both tables must have matching data
- Use LEFT JOIN when you need all rows from the first table regardless of matches

**STEP 4: WRITE THE QUERY**
- Use table aliases (t1, t2, etc.) for readability
- Qualify all column names with table aliases to avoid ambiguity
- Include all necessary columns from all joined tables in SELECT

EXAMPLES:
❌ WRONG: SELECT name FROM customers WHERE city = 'NYC'
   (if you need order information too)

✅ CORRECT: SELECT c.name, o.order_date, o.total
   FROM customers c
   JOIN orders o ON c.customer_id = o.customer_id
   WHERE c.city = 'NYC'

Return ONLY the executable SELECT statement. No explanations, no markdown, no code blocks.
"""

        # Use openai.ChatCompletion directly for version 0.28.1.
        # temperature=0 keeps SQL generation deterministic.
        response = openai.ChatCompletion.create(
            model=self.model_name,
            messages=[{"role": "user", "content": prompt}],
            temperature=0
        )

        sql_query = response['choices'][0]['message']['content'].strip()
        # Strip markdown code fences the model sometimes adds anyway.
        sql_query = sql_query.replace('```sql', '').replace('```', '').strip()

        # Cache the result for future identical questions.
        if self.cache:
            self.cache.add_to_cache(question, sql_query)

        return sql_query

    def process_question(self, question: str) -> Tuple[Optional[pd.DataFrame], str]:
        """Process a natural language question end to end.

        Args:
            question: The user's natural-language question.

        Returns:
            Tuple of (result DataFrame or None, generated SQL or "").
            Returns (None, "") when the question is irrelevant to the
            database or when generation/execution fails.
        """
        # Check relevancy FIRST - BEFORE any API calls, so irrelevant
        # questions never cost an OpenAI request.
        is_relevant, similarity, info = self.relevancy_checker.is_relevant(question)

        if not is_relevant:
            print(f"\n❌ Question not relevant to database (similarity: {similarity:.2f})")
            for item in info:
                print(f"   {item}")
            return None, ""

        print(f"✅ Relevant question (similarity: {similarity:.2f})")

        try:
            # Convert to SQL (only if relevant).
            sql_query = self.nlp_to_sql(question)
            print(f"\n🔍 Generated SQL Query:")
            print(f"   {sql_query}")

            # Track for feedback collection in display_results().
            self.last_question = question
            self.last_sql = sql_query

            # Execute query against the managed database.
            result_df = self.db.execute_query(sql_query)

            return result_df, sql_query

        except Exception as e:
            # Broad catch is deliberate: generation and execution failures
            # are both reported the same way to the interactive caller.
            print(f"❌ Error processing question: {e}")
            return None, ""

    def display_results(self, df: pd.DataFrame, max_rows: int = 15):
        """Display query results in a formatted table and collect feedback.

        Args:
            df: Result DataFrame (may be None or empty).
            max_rows: Maximum number of rows to print.
        """
        if df is None or df.empty:
            print(" No results found")
            return  # Exit early if no results

        # Show the table (truncated to max_rows).
        display_df = df.head(max_rows) if len(df) > max_rows else df
        print(tabulate(display_df, headers='keys', tablefmt='grid', showindex=False))

        if len(df) > max_rows:
            print(f" ... showing first {max_rows} of {len(df)} rows")

        # FIX: don't record feedback when no question has been processed
        # yet — saving (None, None) would pollute the feedback store.
        if self.last_question is None or self.last_sql is None:
            return

        # Only ask for feedback for relevant questions with results
        # (irrelevant questions never reach here due to early return).
        # FIX: strip() so trailing whitespace (e.g. "y ") is recognized.
        feedback = input("\n👍 or 👎? (y/n): ").strip().lower()
        if feedback == 'y':
            self.feedback.save(self.last_question, self.last_sql, True)
            print("✅ Saved as good")
            # Reload feedback matcher with new data.
            self.feedback_matcher.reload_feedback()
        elif feedback == 'n':
            correct = input("Correct SQL: ").strip()
            self.feedback.save(self.last_question, self.last_sql, False, correct)
            if correct:
                print("✅ Learned correction")
                # Reload feedback matcher with new data.
                self.feedback_matcher.reload_feedback()
|
sutra/schema_generator.py
CHANGED
|
@@ -1,53 +1,57 @@
|
|
|
1
|
-
"""SQL schema generation from unstructured text using AI"""
|
|
2
|
-
|
|
3
|
-
import openai
|
|
4
|
-
import config
|
|
5
|
-
|
|
6
|
-
class SchemaGenerator:
|
|
7
|
-
"""Generate SQL schema from unstructured data using OpenAI"""
|
|
8
|
-
|
|
9
|
-
def __init__(self, api_key: str, model_name: str = "gpt-3.5-turbo"):
|
|
10
|
-
openai.api_key = api_key
|
|
11
|
-
self.model_name = model_name
|
|
12
|
-
self.temperature = config.TEMPERATURE
|
|
13
|
-
|
|
14
|
-
def generate_schema(self, unstructured_data: str) -> str:
|
|
15
|
-
"""Generate SQL schema from unstructured text"""
|
|
16
|
-
|
|
17
|
-
# Truncate if too long
|
|
18
|
-
if len(unstructured_data) > config.MAX_TEXT_LENGTH:
|
|
19
|
-
unstructured_data = unstructured_data[:config.MAX_TEXT_LENGTH]
|
|
20
|
-
print(f"⚠️ Data truncated to {config.MAX_TEXT_LENGTH} characters")
|
|
21
|
-
|
|
22
|
-
prompt = f"""
|
|
23
|
-
Convert this unstructured text into a SQLite database:
|
|
24
|
-
|
|
25
|
-
{unstructured_data}
|
|
26
|
-
|
|
27
|
-
Requirements:
|
|
28
|
-
1.
|
|
29
|
-
2. Add
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
1
|
+
"""SQL schema generation from unstructured text using AI"""
|
|
2
|
+
|
|
3
|
+
import openai
|
|
4
|
+
import config
|
|
5
|
+
|
|
6
|
+
class SchemaGenerator:
    """Generate SQL schema from unstructured data using OpenAI.

    Wraps a single ChatCompletion call (openai 0.28.x module-level API)
    that turns free-form text into executable SQLite DDL + INSERTs.
    """

    def __init__(self, api_key: str, model_name: str = "gpt-3.5-turbo"):
        """Configure the OpenAI module-level key and model settings.

        Args:
            api_key: OpenAI API key (set globally on the openai module).
            model_name: Chat model to use for schema generation.
        """
        openai.api_key = api_key
        self.model_name = model_name
        self.temperature = config.TEMPERATURE

    def generate_schema(self, unstructured_data: str) -> str:
        """Generate SQL schema statements from unstructured text.

        Args:
            unstructured_data: Raw text to convert; truncated to
                ``config.MAX_TEXT_LENGTH`` characters if longer.

        Returns:
            SQLite statements as a string, with markdown fences removed.
        """
        # Clamp overly long input before building the prompt.
        limit = config.MAX_TEXT_LENGTH
        if len(unstructured_data) > limit:
            unstructured_data = unstructured_data[:limit]
            print(f"⚠️ Data truncated to {limit} characters")

        schema_prompt = f"""
Convert this unstructured text into a SQLite database:

{unstructured_data}

CRITICAL Requirements:
1. Identify all entities in the text and create a table for each
2. **MANDATORY:** Add FOREIGN KEY constraints to connect related tables
   - If table A references table B, add: FOREIGN KEY (column_name) REFERENCES table_b(id)
   - Example: If employees belong to departments, employees table must have:
     department_id INTEGER, FOREIGN KEY (department_id) REFERENCES departments(id)
3. Extract ALL data from the text - don't add anything not in the text
4. Use INTEGER PRIMARY KEY AUTOINCREMENT for all ID columns
5. Ensure parent tables (referenced tables) are created BEFORE child tables

Return ONLY executable SQLite statements in this order:
1. DROP TABLE IF EXISTS statements (child tables first, parent tables last)
2. CREATE TABLE statements (parent tables first, child tables last)
3. INSERT statements (parent tables first, child tables last)

No markdown, no code blocks, no explanations - just SQL statements.
"""

        print("🔄 Generating schema via OpenAI API...")

        # openai 0.28.x style: ChatCompletion on the module, dict-indexed
        # response payload.
        api_response = openai.ChatCompletion.create(
            model=self.model_name,
            messages=[{"role": "user", "content": schema_prompt}],
            temperature=self.temperature
        )

        raw = api_response['choices'][0]['message']['content'].strip()
        # Remove markdown fences the model may emit despite instructions.
        generated_schema = raw.replace('```sql', '').replace('```', '').strip()

        print("✅ Schema generated!")
        return generated_schema
|