QuerySUTRA 0.5.3__py3-none-any.whl → 0.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sutra/nlp_processor.py CHANGED
@@ -1,144 +1,176 @@
1
- """NLP to SQL query processor with relevancy checking"""
2
-
3
- import pandas as pd
4
- from typing import Optional, Tuple
5
- from tabulate import tabulate
6
- from sutra.cache_manager import CacheManager
7
- import openai
8
- import config
9
- from sutra.feedback import SimpleFeedback
10
- from sutra.schema_embeddings import SchemaEmbeddings
11
- from sutra.feedback_matcher import FeedbackMatcher
12
-
13
- class NLPProcessor:
14
- """Process natural language questions to SQL queries"""
15
-
16
- def __init__(self, db_manager, openai_client=None):
17
- self.db = db_manager
18
- self.cache = CacheManager() if config.CACHE_ENABLED else None
19
- self.model_name = config.MODEL_NAME
20
-
21
- # Set the API key directly for openai 0.28.1
22
- openai.api_key = config.OPENAI_API_KEY
23
-
24
- # Added for feedback handling and tracking
25
- self.feedback = SimpleFeedback()
26
- self.last_question = None
27
- self.last_sql = None
28
-
29
- # ✅ NEW: Auto-load schema embeddings
30
- self.relevancy_checker = SchemaEmbeddings(db_manager)
31
-
32
- # ✅ NEW: Smart feedback matcher
33
- self.feedback_matcher = FeedbackMatcher()
34
-
35
- def nlp_to_sql(self, question: str) -> str:
36
- """Convert natural language question to SQL"""
37
-
38
- # ✅ NEW: Check feedback for similar queries first
39
- similar_sql, similarity = self.feedback_matcher.find_similar_query(question)
40
- if similar_sql:
41
- print(f"🎯 Found similar query in feedback (similarity: {similarity:.2f})")
42
- return similar_sql
43
-
44
- # Check cache next
45
- if self.cache:
46
- cached_sql = self.cache.get_cached_query(question)
47
- if cached_sql:
48
- print("⚡ Using cached query")
49
- return cached_sql
50
-
51
- # Only call API if no feedback match and no cache
52
- print("🤖 Calling OpenAI API...")
53
-
54
- # Get schema context
55
- schema = self.db.get_schema_context()
56
-
57
- prompt = f"""
58
- Convert this question to a SQLite query:
59
-
60
- Question: {question}
61
-
62
- Database schema:
63
- {schema}
64
-
65
- Return ONLY the SELECT statement. No explanations, no markdown.
66
- """
67
-
68
- # Use openai.ChatCompletion directly for version 0.28.1
69
- response = openai.ChatCompletion.create(
70
- model=self.model_name,
71
- messages=[{"role": "user", "content": prompt}],
72
- temperature=0
73
- )
74
-
75
- sql_query = response['choices'][0]['message']['content'].strip()
76
- sql_query = sql_query.replace('```sql', '').replace('```', '').strip()
77
-
78
- # Cache the result
79
- if self.cache:
80
- self.cache.add_to_cache(question, sql_query)
81
-
82
- return sql_query
83
-
84
- def process_question(self, question: str) -> Tuple[Optional[pd.DataFrame], str]:
85
- """Process a natural language question and return results"""
86
-
87
- # ✅ NEW: Check relevancy FIRST - BEFORE any API calls
88
- is_relevant, similarity, info = self.relevancy_checker.is_relevant(question)
89
-
90
- if not is_relevant:
91
- print(f"\n❌ Question not relevant to database (similarity: {similarity:.2f})")
92
- for item in info:
93
- print(f" {item}")
94
- return None, ""
95
-
96
- print(f"✅ Relevant question (similarity: {similarity:.2f})")
97
-
98
- try:
99
- # Convert to SQL (only if relevant)
100
- sql_query = self.nlp_to_sql(question)
101
- print(f"\n🔍 Generated SQL Query:")
102
- print(f" {sql_query}")
103
-
104
- # Track for feedback
105
- self.last_question = question
106
- self.last_sql = sql_query
107
-
108
- # Execute query
109
- result_df = self.db.execute_query(sql_query)
110
-
111
- return result_df, sql_query
112
-
113
- except Exception as e:
114
- print(f"❌ Error processing question: {e}")
115
- return None, ""
116
-
117
- def display_results(self, df: pd.DataFrame, max_rows: int = 15):
118
- """Display query results in a formatted table"""
119
- if df is None or df.empty:
120
- print(" No results found")
121
- return # Exit early if no results
122
-
123
- # Show the table
124
- display_df = df.head(max_rows) if len(df) > max_rows else df
125
- print(tabulate(display_df, headers='keys', tablefmt='grid', showindex=False))
126
-
127
- if len(df) > max_rows:
128
- print(f" ... showing first {max_rows} of {len(df)} rows")
129
-
130
- # ✅ UPDATED: Only ask for feedback for relevant questions with results
131
- # (Irrelevant questions never reach here due to early return)
132
- feedback = input("\n👍 or 👎? (y/n): ").lower()
133
- if feedback == 'y':
134
- self.feedback.save(self.last_question, self.last_sql, True)
135
- print("✅ Saved as good")
136
- # Reload feedback matcher with new data
137
- self.feedback_matcher.reload_feedback()
138
- elif feedback == 'n':
139
- correct = input("Correct SQL: ").strip()
140
- self.feedback.save(self.last_question, self.last_sql, False, correct)
141
- if correct:
142
- print("✅ Learned correction")
143
- # Reload feedback matcher with new data
1
+ """NLP to SQL query processor with relevancy checking"""
2
+
3
+ import pandas as pd
4
+ from typing import Optional, Tuple
5
+ from tabulate import tabulate
6
+ from sutra.cache_manager import CacheManager
7
+ import openai
8
+ import config
9
+ from sutra.feedback import SimpleFeedback
10
+ from sutra.schema_embeddings import SchemaEmbeddings
11
+ from sutra.feedback_matcher import FeedbackMatcher
12
+
13
+ class NLPProcessor:
14
+ """Process natural language questions to SQL queries"""
15
+
16
+ def __init__(self, db_manager, openai_client=None):
17
+ self.db = db_manager
18
+ self.cache = CacheManager() if config.CACHE_ENABLED else None
19
+ self.model_name = config.MODEL_NAME
20
+
21
+ # Set the API key directly for openai 0.28.1
22
+ openai.api_key = config.OPENAI_API_KEY
23
+
24
+ # Added for feedback handling and tracking
25
+ self.feedback = SimpleFeedback()
26
+ self.last_question = None
27
+ self.last_sql = None
28
+
29
+ # ✅ NEW: Auto-load schema embeddings
30
+ self.relevancy_checker = SchemaEmbeddings(db_manager)
31
+
32
+ # ✅ NEW: Smart feedback matcher
33
+ self.feedback_matcher = FeedbackMatcher()
34
+
35
+ def nlp_to_sql(self, question: str) -> str:
36
+ """Convert natural language question to SQL"""
37
+
38
+ # ✅ NEW: Check feedback for similar queries first
39
+ similar_sql, similarity = self.feedback_matcher.find_similar_query(question)
40
+ if similar_sql:
41
+ print(f"🎯 Found similar query in feedback (similarity: {similarity:.2f})")
42
+ return similar_sql
43
+
44
+ # Check cache next
45
+ if self.cache:
46
+ cached_sql = self.cache.get_cached_query(question)
47
+ if cached_sql:
48
+ print("⚡ Using cached query")
49
+ return cached_sql
50
+
51
+ # Only call API if no feedback match and no cache
52
+ print("🤖 Calling OpenAI API...")
53
+
54
+ # Get schema context
55
+ schema = self.db.get_schema_context()
56
+
57
+ prompt = f"""
58
+ Convert this question to a SQLite query.
59
+
60
+ Question: {question}
61
+
62
+ Database schema:
63
+ {schema}
64
+
65
+ CRITICAL INSTRUCTIONS FOR MULTI-TABLE QUERIES:
66
+
67
+ **STEP 1: CHECK TABLE RELATIONSHIPS FIRST**
68
+ Look at the "=== TABLE RELATIONSHIPS ===" section at the top of the schema.
69
+ These show you exactly how tables are connected via foreign keys.
70
+ Format: table1.column → table2.column means table1.column references table2.column
71
+
72
+ **STEP 2: IDENTIFY REQUIRED TABLES**
73
+ Analyze which tables contain the data needed to answer the question.
74
+ If information is spread across multiple tables, you MUST join them.
75
+
76
+ **STEP 3: USE THE RELATIONSHIPS TO JOIN**
77
+ When you need data from multiple tables:
78
+ - Use the foreign key relationships shown in the TABLE RELATIONSHIPS section
79
+ - Join table1 to table2 using: JOIN table2 ON table1.fk_column = table2.pk_column
80
+ - Use INNER JOIN when both tables must have matching data
81
+ - Use LEFT JOIN when you need all rows from the first table regardless of matches
82
+
83
+ **STEP 4: WRITE THE QUERY**
84
+ - Use table aliases (t1, t2, etc.) for readability
85
+ - Qualify all column names with table aliases to avoid ambiguity
86
+ - Include all necessary columns from all joined tables in SELECT
87
+
88
+ EXAMPLES:
89
+ ❌ WRONG: SELECT name FROM customers WHERE city = 'NYC'
90
+ (if you need order information too)
91
+
92
+ ✅ CORRECT: SELECT c.name, o.order_date, o.total
93
+ FROM customers c
94
+ JOIN orders o ON c.customer_id = o.customer_id
95
+ WHERE c.city = 'NYC'
96
+
97
+ Return ONLY the executable SELECT statement. No explanations, no markdown, no code blocks.
98
+ """
99
+
100
+ # Use openai.ChatCompletion directly for version 0.28.1
101
+ response = openai.ChatCompletion.create(
102
+ model=self.model_name,
103
+ messages=[{"role": "user", "content": prompt}],
104
+ temperature=0
105
+ )
106
+
107
+ sql_query = response['choices'][0]['message']['content'].strip()
108
+ sql_query = sql_query.replace('```sql', '').replace('```', '').strip()
109
+
110
+ # Cache the result
111
+ if self.cache:
112
+ self.cache.add_to_cache(question, sql_query)
113
+
114
+ return sql_query
115
+
116
+ def process_question(self, question: str) -> Tuple[Optional[pd.DataFrame], str]:
117
+ """Process a natural language question and return results"""
118
+
119
+ # ✅ NEW: Check relevancy FIRST - BEFORE any API calls
120
+ is_relevant, similarity, info = self.relevancy_checker.is_relevant(question)
121
+
122
+ if not is_relevant:
123
+ print(f"\n❌ Question not relevant to database (similarity: {similarity:.2f})")
124
+ for item in info:
125
+ print(f" {item}")
126
+ return None, ""
127
+
128
+ print(f"✅ Relevant question (similarity: {similarity:.2f})")
129
+
130
+ try:
131
+ # Convert to SQL (only if relevant)
132
+ sql_query = self.nlp_to_sql(question)
133
+ print(f"\n🔍 Generated SQL Query:")
134
+ print(f" {sql_query}")
135
+
136
+ # Track for feedback
137
+ self.last_question = question
138
+ self.last_sql = sql_query
139
+
140
+ # Execute query
141
+ result_df = self.db.execute_query(sql_query)
142
+
143
+ return result_df, sql_query
144
+
145
+ except Exception as e:
146
+ print(f"❌ Error processing question: {e}")
147
+ return None, ""
148
+
149
+ def display_results(self, df: pd.DataFrame, max_rows: int = 15):
150
+ """Display query results in a formatted table"""
151
+ if df is None or df.empty:
152
+ print(" No results found")
153
+ return # Exit early if no results
154
+
155
+ # Show the table
156
+ display_df = df.head(max_rows) if len(df) > max_rows else df
157
+ print(tabulate(display_df, headers='keys', tablefmt='grid', showindex=False))
158
+
159
+ if len(df) > max_rows:
160
+ print(f" ... showing first {max_rows} of {len(df)} rows")
161
+
162
+ # ✅ UPDATED: Only ask for feedback for relevant questions with results
163
+ # (Irrelevant questions never reach here due to early return)
164
+ feedback = input("\n👍 or 👎? (y/n): ").lower()
165
+ if feedback == 'y':
166
+ self.feedback.save(self.last_question, self.last_sql, True)
167
+ print("✅ Saved as good")
168
+ # Reload feedback matcher with new data
169
+ self.feedback_matcher.reload_feedback()
170
+ elif feedback == 'n':
171
+ correct = input("Correct SQL: ").strip()
172
+ self.feedback.save(self.last_question, self.last_sql, False, correct)
173
+ if correct:
174
+ print("✅ Learned correction")
175
+ # Reload feedback matcher with new data
144
176
  self.feedback_matcher.reload_feedback()
sutra/schema_generator.py CHANGED
@@ -1,53 +1,57 @@
1
- """SQL schema generation from unstructured text using AI"""
2
-
3
- import openai
4
- import config
5
-
6
- class SchemaGenerator:
7
- """Generate SQL schema from unstructured data using OpenAI"""
8
-
9
- def __init__(self, api_key: str, model_name: str = "gpt-3.5-turbo"):
10
- openai.api_key = api_key
11
- self.model_name = model_name
12
- self.temperature = config.TEMPERATURE
13
-
14
- def generate_schema(self, unstructured_data: str) -> str:
15
- """Generate SQL schema from unstructured text"""
16
-
17
- # Truncate if too long
18
- if len(unstructured_data) > config.MAX_TEXT_LENGTH:
19
- unstructured_data = unstructured_data[:config.MAX_TEXT_LENGTH]
20
- print(f"⚠️ Data truncated to {config.MAX_TEXT_LENGTH} characters")
21
-
22
- prompt = f"""
23
- Convert this unstructured text into a SQLite database:
24
-
25
- {unstructured_data}
26
-
27
- Requirements:
28
- 1. Create tables based on what entities you find in the text
29
- 2. Add foreign keys to connect related tables
30
- 3. Extract ALL data from the text - don't add anything not in the text
31
- 4. Use INTEGER PRIMARY KEY AUTOINCREMENT for IDs
32
-
33
- Return ONLY executable SQLite statements:
34
- - DROP TABLE IF EXISTS statements
35
- - CREATE TABLE statements with PRIMARY KEY and FOREIGN KEY
36
- - INSERT statements with the actual data from the text
37
-
38
- No markdown, no code blocks, just SQL.
39
- """
40
-
41
- print("🔄 Generating schema via OpenAI API...")
42
-
43
- response = openai.ChatCompletion.create(
44
- model=self.model_name,
45
- messages=[{"role": "user", "content": prompt}],
46
- temperature=self.temperature
47
- )
48
-
49
- generated_schema = response['choices'][0]['message']['content'].strip()
50
- generated_schema = generated_schema.replace('```sql', '').replace('```', '').strip()
51
-
52
- print("✅ Schema generated!")
1
+ """SQL schema generation from unstructured text using AI"""
2
+
3
+ import openai
4
+ import config
5
+
6
+ class SchemaGenerator:
7
+ """Generate SQL schema from unstructured data using OpenAI"""
8
+
9
+ def __init__(self, api_key: str, model_name: str = "gpt-3.5-turbo"):
10
+ openai.api_key = api_key
11
+ self.model_name = model_name
12
+ self.temperature = config.TEMPERATURE
13
+
14
+ def generate_schema(self, unstructured_data: str) -> str:
15
+ """Generate SQL schema from unstructured text"""
16
+
17
+ # Truncate if too long
18
+ if len(unstructured_data) > config.MAX_TEXT_LENGTH:
19
+ unstructured_data = unstructured_data[:config.MAX_TEXT_LENGTH]
20
+ print(f"⚠️ Data truncated to {config.MAX_TEXT_LENGTH} characters")
21
+
22
+ prompt = f"""
23
+ Convert this unstructured text into a SQLite database:
24
+
25
+ {unstructured_data}
26
+
27
+ CRITICAL Requirements:
28
+ 1. Identify all entities in the text and create a table for each
29
+ 2. **MANDATORY:** Add FOREIGN KEY constraints to connect related tables
30
+ - If table A references table B, add: FOREIGN KEY (column_name) REFERENCES table_b(id)
31
+ - Example: If employees belong to departments, employees table must have:
32
+ department_id INTEGER, FOREIGN KEY (department_id) REFERENCES departments(id)
33
+ 3. Extract ALL data from the text - don't add anything not in the text
34
+ 4. Use INTEGER PRIMARY KEY AUTOINCREMENT for all ID columns
35
+ 5. Ensure parent tables (referenced tables) are created BEFORE child tables
36
+
37
+ Return ONLY executable SQLite statements in this order:
38
+ 1. DROP TABLE IF EXISTS statements (child tables first, parent tables last)
39
+ 2. CREATE TABLE statements (parent tables first, child tables last)
40
+ 3. INSERT statements (parent tables first, child tables last)
41
+
42
+ No markdown, no code blocks, no explanations - just SQL statements.
43
+ """
44
+
45
+ print("🔄 Generating schema via OpenAI API...")
46
+
47
+ response = openai.ChatCompletion.create(
48
+ model=self.model_name,
49
+ messages=[{"role": "user", "content": prompt}],
50
+ temperature=self.temperature
51
+ )
52
+
53
+ generated_schema = response['choices'][0]['message']['content'].strip()
54
+ generated_schema = generated_schema.replace('```sql', '').replace('```', '').strip()
55
+
56
+ print("✅ Schema generated!")
53
57
  return generated_schema