npcpy 1.2.26__py3-none-any.whl → 1.2.28__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
npcpy/sql/npcsql.py CHANGED
@@ -2,27 +2,200 @@ import pandas as pd
2
2
  import re
3
3
  import os
4
4
  from pathlib import Path
5
- from typing import Dict, List, Set, Union, Any
5
+ from typing import Dict, List, Set, Union, Any, Optional, Callable
6
6
  from collections import defaultdict, deque
7
- from sqlalchemy import create_engine, text, Engine
8
- import inspect
9
- from npcpy.llm_funcs import *
7
+ from sqlalchemy import create_engine, text, Engine, inspect
8
+ import inspect as py_inspect
9
+
10
+ # --- Explicitly import llm_funcs as a module object ---
11
try:
    import npcpy.llm_funcs as llm_funcs
except ImportError:
    print("Warning: `npcpy.llm_funcs` not found. Providing mock AI functions for execution.")

    class MockLlmFuncs:
        """Stand-in used when ``npcpy.llm_funcs`` cannot be imported.

        Every method echoes its call to stdout and returns a dict with a
        single ``"response"`` key holding a canned string; ``npc``, ``team``
        and ``context`` are accepted but unused.
        """

        def generate_text(self, prompt: str, npc=None, team=None, context="") -> Dict[str, str]:
            print(f"MOCK AI: generate_text('{prompt}')")
            reply = f"MOCK: Generated text for '{prompt}'"
            return {"response": reply}

        def analyze_sentiment(self, text: str, npc=None, team=None, context="") -> Dict[str, str]:
            print(f"MOCK AI: analyze_sentiment('{text}')")
            reply = f"MOCK: Positive sentiment for '{text}'"
            return {"response": reply}

        def summarize(self, text: str, npc=None, team=None, context="") -> Dict[str, str]:
            print(f"MOCK AI: summarize('{text}')")
            reply = f"MOCK: Summary of '{text}'"
            return {"response": reply}

        def translate(self, text: str, source_lang='auto', target_lang='en', npc=None, team=None, context="") -> Dict[str, str]:
            print(f"MOCK AI: translate('{text}', '{source_lang}', '{target_lang}')")
            reply = f"MOCK: Translated '{text}' from {source_lang} to {target_lang}"
            return {"response": reply}

        def extract_entities(self, text: str, npc=None, team=None, context="") -> Dict[str, str]:
            print(f"MOCK AI: extract_entities('{text}')")
            reply = f"MOCK: Entities from '{text}'"
            return {"response": reply}

        def generate_embedding(self, text: str, model='default', npc=None, team=None, context="") -> Dict[str, str]:
            print(f"MOCK AI: generate_embedding('{text}', '{model}')")
            reply = f"MOCK: Embedding for '{text}'"
            return {"response": reply}

    # The rest of the module looks functions up on `llm_funcs` via dir() /
    # getattr(), so an instance with bound methods serves as the module.
    llm_funcs = MockLlmFuncs()
35
+
36
+ # Assuming these are available in the npcpy environment
10
37
  from npcpy.memory.command_history import create_engine_from_path
38
try:
    from npcpy.npc_compiler import Team
except ImportError:
    print("Warning: `npcpy.npc_compiler.Team` not found. Providing mock Team class.")

    class Team:
        """Minimal replacement for ``npcpy.npc_compiler.Team``.

        Holds the list of NPCs it was given and answers ``get_npc`` with a
        placeholder dict; both methods announce themselves on stdout.
        """

        def __init__(self, team_path: str = "./npc_team/", npcs: Optional[List[Any]] = None):
            print(f"MOCK NPC: Team initialized for path: {team_path}")
            if npcs is None:
                npcs = []
            self.npcs = npcs

        def get_npc(self, npc_ref: str):
            print(f"MOCK NPC: get_npc called for: {npc_ref}")
            placeholder = {"name": npc_ref, "type": "mock_npc"}
            return placeholder
49
+
11
50
 
51
+ # --- PANDAS BACKEND CONFIGURATION ---
12
52
  try:
13
- import modin.pandas as pd
53
+ import modin.pandas as pd_modin
14
54
  import snowflake.snowpark.modin.plugin
15
- PANDAS_BACKEND = 'snowflake'
55
+ pd = pd_modin
56
+ PANDAS_BACKEND = 'snowflake_modin'
16
57
  except ImportError:
17
58
  try:
18
- import modin.pandas as pd
59
+ import modin.pandas as pd_modin
60
+ pd = pd_modin
19
61
  PANDAS_BACKEND = 'modin'
20
62
  except ImportError:
21
63
  import pandas as pd
22
64
  PANDAS_BACKEND = 'pandas'
65
+ # print(f"Using pandas backend: {PANDAS_BACKEND}") # Removed for cleaner output
66
+
67
+
68
+ # --- AI Function Mappings ---
69
class DatabaseAIFunctionMapper:
    """Per-database tables mapping AI function names to native SQL builders.

    Each ``get_*_mapping`` method returns ``{function_name: entry}`` where
    ``entry['transformer']`` is a callable that takes keyword arguments and
    returns a SQL expression string. Transformers accept and ignore extra
    keyword arguments so one kwargs dict can be passed to any of them.
    """

    # Model passed to SNOWFLAKE.CORTEX.COMPLETE unless a transformer is
    # called with an explicit ``model=...``.
    DEFAULT_CORTEX_MODEL = 'llama3.1-8b'

    # Snowflake functions that all share the shape
    # COMPLETE(model, '<instruction>' || <text expression>).
    _CORTEX_INSTRUCTIONS = {
        'get_facts': 'Extract facts from this text. Return JSON with facts array containing statement, source_text, and type fields. Text: ',
        'identify_groups': 'Identify main groups these facts could be organized into. Return JSON with groups array. Facts: ',
        'assign_groups_to_fact': 'Assign this fact to relevant groups. Return JSON with groups array. Fact: ',
        'generate_group_candidates': 'Generate specific conceptual groups for these items. Return JSON with groups array. Items: ',
        'remove_idempotent_groups': 'Remove conceptually identical groups, favor specificity. Return JSON with distinct_groups array. Groups: ',
        'zoom_in': 'Infer new implied facts from existing facts. Return JSON with implied_facts array. Facts: ',
        'generate_groups': 'Generate conceptual groups for facts. Return JSON with groups array. Facts: ',
        'remove_redundant_groups': 'Remove redundant groups, merge similar concepts. Return JSON with groups array. Groups: ',
        'criticize': 'Provide critical analysis and constructive criticism. Input: ',
        'synthesize': 'Synthesize information from multiple perspectives. Input: ',
        'breathe': 'Condense conversation context into key extractions. Return JSON with high_level_objective, most_recent_task, accomplishments, failures. Conversation: ',
        'abstract': 'Create more abstract categories from groups. Return JSON with groups array. Groups: ',
    }

    @staticmethod
    def _sql_string_literal(value: Any) -> str:
        """Return *value* as a single-quoted SQL string literal.

        Backslashes and single quotes are backslash-escaped so the value
        cannot terminate the literal early.
        """
        escaped = str(value).replace("\\", "\\\\").replace("'", "\\'")
        return f"'{escaped}'"

    @staticmethod
    def _cortex_complete(instruction: str) -> Callable[..., str]:
        """Build a transformer that prefixes *instruction* to a SQL text expression.

        A factory (rather than a lambda in a loop) binds each instruction
        at creation time.
        """
        quote = DatabaseAIFunctionMapper._sql_string_literal
        instruction_literal = quote(instruction)

        def transformer(text, model=None, **kwargs) -> str:
            chosen = model or DatabaseAIFunctionMapper.DEFAULT_CORTEX_MODEL
            # `text` is interpolated verbatim: it is a SQL expression (for
            # example a column reference), not a string to be quoted.
            return f"SNOWFLAKE.CORTEX.COMPLETE({quote(chosen)}, {instruction_literal} || {text})"

        return transformer

    @staticmethod
    def get_snowflake_cortex_mapping() -> Dict[str, Dict[str, Any]]:
        """Return the Snowflake Cortex mapping (every entry uses COMPLETE).

        Transformers take ``text`` (``prompt`` for ``get_llm_response``) and
        an optional ``model`` overriding ``DEFAULT_CORTEX_MODEL``. The
        generated SQL is emitted on a single line.
        """
        quote = DatabaseAIFunctionMapper._sql_string_literal
        default_model = DatabaseAIFunctionMapper.DEFAULT_CORTEX_MODEL

        mapping: Dict[str, Dict[str, Any]] = {
            'get_llm_response': {
                'cortex_function': 'COMPLETE',
                'transformer': lambda prompt, model=None, **kwargs:
                    f"SNOWFLAKE.CORTEX.COMPLETE({quote(model or default_model)}, {prompt})"
            },
            'extract_facts': {
                'cortex_function': 'COMPLETE',
                'transformer': lambda text, model=None, **kwargs:
                    f"SNOWFLAKE.CORTEX.COMPLETE({quote(model or default_model)}, CONCAT('Extract concise facts from this text. Return JSON with fact_list array. Text: ', {text}))"
            },
        }
        for name, instruction in DatabaseAIFunctionMapper._CORTEX_INSTRUCTIONS.items():
            mapping[name] = {
                'cortex_function': 'COMPLETE',
                'transformer': DatabaseAIFunctionMapper._cortex_complete(instruction),
            }
        return mapping

    @staticmethod
    def get_databricks_ai_mapping() -> Dict[str, Dict[str, Any]]:
        """Return the Databricks mapping; the prompt is emitted as a string literal."""
        quote = DatabaseAIFunctionMapper._sql_string_literal
        return {
            'generate_text': {
                'databricks_function': 'serving.predict',
                'transformer': lambda prompt, model='databricks-dolly', **kwargs:
                    f"serving.predict({quote(model)}, {quote(prompt)})"
            },
        }

    @staticmethod
    def get_bigquery_ai_mapping() -> Dict[str, Dict[str, Any]]:
        """Return the BigQuery mapping; the prompt is emitted as a string literal."""
        quote = DatabaseAIFunctionMapper._sql_string_literal
        return {
            'generate_text': {
                'bigquery_function': 'ML.GENERATE_TEXT',
                # The model is a backtick-quoted identifier, so it is left as given.
                'transformer': lambda prompt, model='text-bison', **kwargs:
                    f"ML.GENERATE_TEXT(MODEL `{model}`, {quote(prompt)})"
            },
        }
163
+
164
+ # --- Native Database AI Transformer (INCLUDED in the module) ---
165
class NativeDatabaseAITransformer:
    """Translate AI function calls into native SQL for one database type."""

    def __init__(self, database_type: str):
        """Store the lower-cased database type and load its function mappings."""
        self.database_type = database_type.lower()
        self.function_mappings = self._get_database_mappings()

    def _get_database_mappings(self) -> Dict[str, Dict[str, Any]]:
        """Return the mapping table for ``self.database_type`` (empty if unknown)."""
        mappings = {
            'snowflake': DatabaseAIFunctionMapper.get_snowflake_cortex_mapping(),
            'databricks': DatabaseAIFunctionMapper.get_databricks_ai_mapping(),
            'bigquery': DatabaseAIFunctionMapper.get_bigquery_ai_mapping()
        }
        return mappings.get(self.database_type, {})

    def transform_ai_function(self, function_name: str, **kwargs) -> str:
        """Return the native SQL expression for *function_name*.

        Args:
            function_name: Key into the database's mapping table.
            **kwargs: Forwarded to the mapping's transformer. ``text`` and
                ``prompt`` are treated as aliases: when only one is given,
                the other is filled in with the same value, so a
                transformer whose first parameter has either name can be
                called with whichever the caller supplied.

        Raises:
            ValueError: If the function has no mapping, or the mapping has
                no transformer, for this database type.
        """
        mapping = self.function_mappings.get(function_name)
        if not mapping:
            raise ValueError(f"No native mapping found for function: {function_name} for database type {self.database_type}")

        transformer: Callable[..., str] = mapping.get('transformer')
        if not transformer:
            raise ValueError(f"No transformer found for function: {function_name} for database type {self.database_type}")

        if function_name == 'generate_text':
            # For generate_text a supplied `text` overrides `prompt` and is
            # not forwarded under its own name.
            if 'text' in kwargs:
                kwargs['prompt'] = kwargs.pop('text')
        elif 'text' in kwargs and 'prompt' not in kwargs:
            kwargs['prompt'] = kwargs['text']
        elif 'prompt' in kwargs and 'text' not in kwargs:
            kwargs['text'] = kwargs['prompt']

        return transformer(**kwargs)
23
191
 
192
+ # --- NPCSQL Operations ---
24
193
  class NPCSQLOperations:
25
- def __init__(self, npc_directory: str, db_engine: Union[str, Engine] = "~/npcsh_history.db"):
194
+ def __init__(
195
+ self,
196
+ npc_directory: str,
197
+ db_engine: Union[str, Engine] = "~/npcsh_history.db"
198
+ ):
26
199
  self.npc_directory = npc_directory
27
200
 
28
201
  if isinstance(db_engine, str):
@@ -34,10 +207,11 @@ class NPCSQLOperations:
34
207
  self.function_map = self._build_function_map()
35
208
 
36
209
  def _get_team(self):
37
- return self.npc_loader if hasattr(self.npc_loader, 'npcs') else None
210
+ return (self.npc_loader
211
+ if hasattr(self.npc_loader, 'npcs')
212
+ else None)
38
213
 
39
214
  def _build_function_map(self):
40
- import npcpy.llm_funcs as llm_funcs
41
215
  import types
42
216
 
43
217
  function_map = {}
@@ -45,7 +219,8 @@ class NPCSQLOperations:
45
219
  if name.startswith('_'):
46
220
  continue
47
221
  obj = getattr(llm_funcs, name)
48
- if isinstance(obj, types.FunctionType) and obj.__module__ == 'npcpy.llm_funcs':
222
+ if (isinstance(obj, types.FunctionType) or
223
+ (isinstance(obj, types.MethodType) and obj.__self__ is not None)):
49
224
  function_map[name] = obj
50
225
 
51
226
  return function_map
@@ -62,18 +237,27 @@ class NPCSQLOperations:
62
237
  return npc
63
238
 
64
239
  if ',' in npc_ref:
65
- npc_names = [name.strip() for name in npc_ref.split(',')]
66
- npcs = [self.npc_loader.get_npc(name) for name in npc_names]
240
+ npc_names = [
241
+ name.strip() for name in npc_ref.split(',')
242
+ ]
243
+ npcs = [
244
+ self.npc_loader.get_npc(name)
245
+ for name in npc_names
246
+ ]
67
247
  npcs = [npc for npc in npcs if npc is not None]
68
248
 
69
249
  if npcs:
70
- from npcpy.npc_compiler import Team
71
250
  temp_team = Team(npcs=npcs)
72
251
  return temp_team
73
252
 
74
253
  return None
75
254
 
76
- def execute_ai_function(self, func_name: str, df: pd.DataFrame, **params):
255
+ def execute_ai_function(
256
+ self,
257
+ func_name: str,
258
+ df: pd.DataFrame,
259
+ **params
260
+ ) -> pd.Series:
77
261
  if func_name not in self.function_map:
78
262
  raise ValueError(f"Unknown AI function: {func_name}")
79
263
 
@@ -86,43 +270,85 @@ class NPCSQLOperations:
86
270
  if not resolved_team and hasattr(resolved_npc, 'team'):
87
271
  resolved_team = resolved_npc.team
88
272
 
89
- def apply_function(row):
90
- try:
91
- query_template = params.get('query', '')
92
- if query_template:
93
- row_data = {col: str(row[col]) for col in df.columns}
94
- query = query_template.format(**row_data)
95
- else:
96
- query = ''
97
-
98
- sig = inspect.signature(func)
99
- func_params = {k: v for k, v in {
100
- 'prompt': query,
273
+ def apply_function_to_row(row):
274
+ query_template = params.get('query', '')
275
+ column_name = params.get('column', '')
276
+
277
+ column_value = str(row[column_name]) if column_name and column_name in row.index else column_name
278
+
279
+ if query_template:
280
+ row_data = {
281
+ col: str(row[col])
282
+ for col in df.columns
283
+ }
284
+ row_data['column_value'] = column_value
285
+ query = query_template.format(**row_data)
286
+ else:
287
+ query = column_value
288
+
289
+ sig = py_inspect.signature(func)
290
+ func_params = {
291
+ k: v for k, v in {
292
+ 'prompt': query,
293
+ 'text': query,
101
294
  'npc': resolved_npc,
102
295
  'team': resolved_team,
103
296
  'context': params.get('context', '')
104
- }.items() if k in sig.parameters}
105
-
106
- result = func(**func_params)
107
- return result.get("response", "") if isinstance(result, dict) else str(result)
108
-
109
- except Exception as e:
110
- print(f"Error applying function {func_name}: {e}")
111
- return f"Error: {str(e)}"
297
+ }.items() if k in sig.parameters
298
+ }
299
+
300
+ result = func(**func_params)
301
+ return (result.get("response", "")
302
+ if isinstance(result, dict)
303
+ else str(result))
112
304
 
113
- return df.apply(apply_function, axis=1)
305
+ return df.apply(apply_function_to_row, axis=1)
114
306
 
115
307
 
308
+ # --- SQL Model Definition ---
116
309
  class SQLModel:
117
- def __init__(self, name: str, content: str, path: str, npc_directory: str):
310
+ def __init__(
311
+ self,
312
+ name: str,
313
+ content: str,
314
+ path: str,
315
+ npc_directory: str
316
+ ):
118
317
  self.name = name
119
318
  self.content = content
120
319
  self.path = path
121
320
  self.npc_directory = npc_directory
321
+
322
+ config_match = re.search(
323
+ r'\{\{[\s]*config\((.*?)\)[\s]*\}\}',
324
+ content,
325
+ re.DOTALL
326
+ )
327
+ if config_match:
328
+ self.config = self._parse_config(config_match.group(1))
329
+ else:
330
+ self.config = {'materialized': 'table'}
122
331
 
123
332
  self.dependencies = self._extract_dependencies()
124
333
  self.has_ai_function = self._check_ai_functions()
334
+
335
+ # DEBUG print to confirm if AI functions are found
125
336
  self.ai_functions = self._extract_ai_functions()
337
+ if self.ai_functions:
338
+ print(f"DEBUG SQLModel: Model '{self.name}' extracted AI functions: {list(self.ai_functions.keys())}")
339
+ else:
340
+ print(f"DEBUG SQLModel: Model '{self.name}' has no AI functions found by _extract_ai_functions.")
341
+
342
+
343
+ def _parse_config(self, config_str: str) -> Dict:
344
+ config = {}
345
+ for item in re.split(r',\s*(?=[a-zA-Z0-9_]+\s*=)', config_str):
346
+ if '=' in item:
347
+ key, value = item.split('=', 1)
348
+ key = key.strip()
349
+ value = value.strip().strip('"').strip("'")
350
+ config[key] = value
351
+ return config
126
352
 
127
353
  def _extract_dependencies(self) -> Set[str]:
128
354
  pattern = r"\{\{\s*ref\(['\"]([^'\"]+)['\"]\)\s*\}\}"
@@ -132,232 +358,429 @@ class SQLModel:
132
358
  return "nql." in self.content
133
359
 
134
360
  def _extract_ai_functions(self) -> Dict[str, Dict]:
135
- import npcpy.llm_funcs as llm_funcs
361
+ """Extract AI function calls from SQL content with improved robustness."""
136
362
  import types
137
363
 
138
364
  ai_functions = {}
139
- pattern = r"nql\.(\w+)\s*\(((?:[^()]*|\([^()]*\))*)\)"
140
- matches = re.finditer(pattern, self.content)
365
+ # More robust pattern that handles nested parentheses better
366
+ # This captures: nql.function_name(args...)
367
+ pattern = r"nql\.(\w+)\s*\(((?:[^()]|\([^()]*\))*)\)"
368
+
369
+ matches = re.finditer(pattern, self.content, flags=re.DOTALL | re.IGNORECASE)
141
370
 
142
- # Get available function names dynamically
143
371
  available_functions = []
144
372
  for name in dir(llm_funcs):
145
373
  if name.startswith('_'):
146
374
  continue
147
375
  obj = getattr(llm_funcs, name)
148
- if isinstance(obj, types.FunctionType) and obj.__module__ == 'npcpy.llm_funcs':
149
- available_functions.append(name)
150
-
376
+ if (isinstance(obj, types.FunctionType) or
377
+ (isinstance(obj, types.MethodType) and obj.__self__ is not None)):
378
+ available_functions.append(name.lower()) # Store as lowercase for comparison
379
+
151
380
  for match in matches:
152
- func_name = match.group(1)
381
+ full_call_string = match.group(0).strip()
382
+ func_name = match.group(1).lower() # Convert to lowercase for lookup
383
+
153
384
  if func_name in available_functions:
154
- params = [
155
- param.strip().strip("\"'") for param in match.group(2).split(",")
156
- ]
157
- npc = params[1] if len(params) > 1 else ""
158
- if not npc.endswith(".npc"):
159
- npc = npc.replace(".npc", "")
160
- if self.npc_directory in npc:
161
- npc = npc.replace(self.npc_directory, "")
385
+ params_str = match.group(2)
386
+
387
+ # Simplified parameter extraction
388
+ params_list = []
389
+ balance = 0
390
+ in_quote = None
391
+ current_param_chars = []
392
+
393
+ for char in params_str:
394
+ if char in ("'", '"'):
395
+ if in_quote == char:
396
+ in_quote = None
397
+ elif in_quote is None:
398
+ in_quote = char
399
+ current_param_chars.append(char)
400
+ elif char == '(' and in_quote is None:
401
+ balance += 1
402
+ current_param_chars.append(char)
403
+ elif char == ')' and in_quote is None:
404
+ balance -= 1
405
+ current_param_chars.append(char)
406
+ elif char == ',' and balance == 0 and in_quote is None:
407
+ params_list.append("".join(current_param_chars).strip())
408
+ current_param_chars = []
409
+ else:
410
+ current_param_chars.append(char)
411
+
412
+ if current_param_chars:
413
+ params_list.append("".join(current_param_chars).strip())
414
+
415
+ params = [p.strip().strip("'\"") for p in params_list]
416
+
417
+ column_param = params[0] if len(params) > 0 else ""
418
+ npc_param = params[1] if len(params) > 1 else ""
419
+ query_param = params[2] if len(params) > 2 else ""
420
+ context_param = params[3] if len(params) > 3 else None
421
+
422
+ if npc_param.endswith(".npc"):
423
+ npc_param = npc_param[:-4]
424
+ if self.npc_directory and npc_param.startswith(self.npc_directory):
425
+ npc_param = npc_param[len(self.npc_directory):].strip('/')
162
426
 
163
427
  ai_functions[func_name] = {
164
- "column": params[0] if params else "",
165
- "npc": npc,
166
- "query": params[2] if len(params) > 2 else "",
167
- "context": params[3] if len(params) > 3 else None,
428
+ "column": column_param,
429
+ "npc": npc_param,
430
+ "query": query_param,
431
+ "context": context_param,
432
+ "full_call_string": full_call_string,
433
+ "original_func_name": match.group(1) # Store original case
168
434
  }
169
- return ai_functions
170
-
435
+ else:
436
+ print(f"DEBUG SQLModel: Function '{func_name}' not found in available LLM funcs ({available_functions}). Skipping this NQL call.")
171
437
 
438
+ return ai_functions
172
439
 
440
+ # --- Model Compiler ---
173
441
  class ModelCompiler:
174
- def __init__(self, models_dir: str, db_engine: Union[str, Engine] = "~/npcsh_history.db",
175
- npc_directory: str = "./npc_team/", external_engines: Dict[str, Engine] = None):
442
+ def __init__(
443
+ self,
444
+ models_dir: str,
445
+ target_engine: Union[str, Engine],
446
+ npc_directory: str = "./npc_team/",
447
+ external_engines: Optional[Dict[str, Engine]] = None,
448
+ target_schema: Optional[str] = None
449
+ ):
176
450
  self.models_dir = Path(os.path.expanduser(models_dir))
177
451
 
178
- if isinstance(db_engine, str):
179
- self.engine = create_engine_from_path(db_engine)
452
+ if isinstance(target_engine, str):
453
+ self.target_engine = create_engine_from_path(
454
+ target_engine
455
+ )
180
456
  else:
181
- self.engine = db_engine
457
+ self.target_engine = target_engine
182
458
 
183
459
  self.external_engines = external_engines or {}
460
+ self.target_schema = target_schema
184
461
  self.models: Dict[str, SQLModel] = {}
185
- self.npc_operations = NPCSQLOperations(npc_directory, self.engine)
462
+ self.npc_operations = NPCSQLOperations(
463
+ npc_directory,
464
+ self.target_engine
465
+ )
186
466
  self.npc_directory = npc_directory
187
467
 
188
- from npcpy.npc_compiler import Team
189
468
  try:
190
469
  self.npc_team = Team(team_path=npc_directory)
191
470
  self.npc_operations.npc_loader = self.npc_team
192
- except:
471
+ except Exception as e:
193
472
  self.npc_team = None
473
+ print(f"Warning: Could not load NPC team from {npc_directory}. AI functions relying on NPC context might fail: {e}")
194
474
 
195
475
  def _get_engine(self, source_name: str) -> Engine:
196
- """Get database engine by source name"""
197
- if source_name == 'local' or source_name not in self.external_engines:
198
- return self.engine
199
- return self.external_engines[source_name]
476
+ if source_name.lower() == 'local' or not self.external_engines:
477
+ return self.target_engine
200
478
 
479
+ for key, engine in self.external_engines.items():
480
+ if key.lower() == source_name.lower():
481
+ return engine
482
+ return self.target_engine
483
+
201
484
  def _has_native_ai_functions(self, source_name: str) -> bool:
202
- """Check if database has native AI function support"""
203
- ai_enabled = {'snowflake', 'databricks', 'bigquery'}
204
- return source_name in ai_enabled
485
+ ai_enabled_dbs = {'snowflake', 'databricks', 'bigquery'}
486
+ return source_name.lower() in ai_enabled_dbs
205
487
 
206
488
  def discover_models(self):
207
- """Discover SQL models in directory structure"""
208
489
  self.models = {}
209
- print(self.models_dir)
210
- print(list(self.models_dir.glob("**/*.sql")))
490
+ sql_files = list(self.models_dir.glob("**/*.sql"))
211
491
 
212
- for sql_file in self.models_dir.glob("**/*.sql"):
492
+ for sql_file in sql_files:
213
493
  model_name = sql_file.stem
214
494
  with open(sql_file, "r") as f:
215
495
  content = f.read()
216
-
217
- model_npc_dir = sql_file.parent
218
496
 
219
497
  self.models[model_name] = SQLModel(
220
- model_name, content, str(sql_file), str(model_npc_dir)
498
+ model_name,
499
+ content,
500
+ str(sql_file),
501
+ str(sql_file.parent)
221
502
  )
222
- print(f"Discovered model: {model_name}")
223
- print(sql_file, )
224
503
 
225
504
  return self.models
226
505
 
227
506
  def build_dag(self) -> Dict[str, Set[str]]:
228
- """Build dependency graph"""
229
507
  dag = {}
230
508
  for model_name, model in self.models.items():
231
509
  dag[model_name] = model.dependencies
232
510
  return dag
233
511
 
234
512
  def topological_sort(self) -> List[str]:
235
- """Generate execution order using topological sort"""
236
513
  dag = self.build_dag()
237
- in_degree = defaultdict(int)
514
+
515
+ true_in_degree = {model_name: 0 for model_name in self.models.keys()}
516
+ adj_list = defaultdict(list)
238
517
 
239
- for node, deps in dag.items():
240
- for dep in deps:
241
- in_degree[dep] += 1
242
- if dep not in dag:
243
- dag[dep] = set()
518
+ for model_name, model in self.models.items():
519
+ for dependency in model.dependencies:
520
+ if dependency not in self.models:
521
+ raise ValueError(f"Dependency '{dependency}' of model '{model_name}' not found in discovered models.")
522
+ true_in_degree[model_name] += 1
523
+ adj_list[dependency].append(model_name)
244
524
 
245
- queue = deque([node for node in dag.keys() if len(dag[node]) == 0])
525
+ queue = deque([model_name for model_name in self.models.keys() if true_in_degree[model_name] == 0])
246
526
  result = []
247
-
527
+
248
528
  while queue:
249
- node = queue.popleft()
250
- result.append(node)
529
+ current_model = queue.popleft()
530
+ result.append(current_model)
251
531
 
252
- for dependent, deps in dag.items():
253
- if node in deps:
254
- deps.remove(node)
255
- if len(deps) == 0:
256
- queue.append(dependent)
532
+ for dependent_model in adj_list[current_model]:
533
+ true_in_degree[dependent_model] -= 1
534
+ if true_in_degree[dependent_model] == 0:
535
+ queue.append(dependent_model)
257
536
 
258
- if len(result) != len(dag):
259
- raise ValueError("Circular dependency detected")
537
+ if len(result) != len(self.models):
538
+ raise ValueError("Circular dependency detected or some models not processed.")
260
539
 
261
540
  return result
262
541
 
263
- def _replace_model_references(self, sql: str) -> str:
264
- ref_pattern = r"\{\{\s*ref\s*\(\s*['\"]([^'\"]+)['\"]\s*\)\s*\}\}"
542
+ def _replace_model_references(self, sql_content: str) -> str:
543
+ ref_pattern = (
544
+ r"\{\{\s*ref\s*\(\s*['\"]([^'\"]+)['\"]\s*\)\s*\}\}"
545
+ )
265
546
 
266
547
  def replace_ref(match):
267
548
  model_name = match.group(1)
268
549
  if model_name not in self.models:
269
- raise ValueError(f"Model '{model_name}' not found during ref replacement.")
550
+ raise ValueError(
551
+ f"Model '{model_name}' referenced by '{{{{ ref('{model_name}') }}}}' not found during compilation."
552
+ )
553
+
554
+ if self.target_schema:
555
+ return f"{self.target_schema}.{model_name}"
270
556
  return model_name
271
557
 
272
- replaced_sql = re.sub(ref_pattern, replace_ref, sql)
558
+ replaced_sql = re.sub(ref_pattern, replace_ref, sql_content)
273
559
  return replaced_sql
274
560
 
561
+ def _clean_sql_for_execution(self, sql_content: str) -> str:
562
+ config_pattern = r'\{\{[\s]*config\((.*?)\)[\s]*\}\}'
563
+ cleaned_sql = re.sub(config_pattern, '', sql_content, flags=re.DOTALL).strip()
564
+ cleaned_sql = re.sub(r"--.*?\n", "\n", cleaned_sql)
565
+ cleaned_sql = re.sub(r"/\*.*?\*/", "", cleaned_sql, flags=re.DOTALL)
566
+ cleaned_sql = re.sub(r"\s+", " ", cleaned_sql).strip()
567
+ return cleaned_sql
568
+
569
+ def _execute_standard_sql(
570
+ self,
571
+ sql_to_execute: str,
572
+ engine: Engine
573
+ ) -> pd.DataFrame:
574
+ return pd.read_sql(sql_to_execute, engine)
575
+
576
+ def _execute_ai_model(self, cleaned_sql_content: str, model: SQLModel) -> pd.DataFrame:
577
+ processed_sql = self._replace_model_references(cleaned_sql_content)
578
+
579
+ db_type = self.target_engine.dialect.name.lower()
580
+ print(f"DEBUG: Determined DB dialect: '{db_type}'")
581
+
582
+ if self._has_native_ai_functions(db_type):
583
+ print(f"DEBUG: Native AI functions ARE supported for '{db_type}'. Attempting native translation.")
584
+ transformer = NativeDatabaseAITransformer(db_type)
585
+ sql_to_execute_with_native_ai = processed_sql
586
+
587
+ print("DEBUG: AI functions and NQL calls to replace (from model.ai_functions):")
588
+ if model.ai_functions:
589
+ for fn, params in model.ai_functions.items():
590
+ print(f" Function: {fn}, Full Call String: '{params.get('full_call_string')}'")
591
+ else:
592
+ print(" (None found in model.ai_functions to replace natively)")
593
+
594
+ # Replace NQL calls with native functions
595
+ for func_name, params in model.ai_functions.items():
596
+ original_nql_call = params.get('full_call_string')
597
+ if not original_nql_call:
598
+ print(f"WARNING: 'full_call_string' not found for NQL function '{func_name}'. Skipping native replacement attempt.")
599
+ continue
600
+
601
+ try:
602
+ column_ref = params.get('column', '')
603
+
604
+ transform_kwargs = {
605
+ 'text': column_ref,
606
+ 'prompt': column_ref,
607
+ 'query': params.get('query', ''),
608
+ 'context': params.get('context', ''),
609
+ 'npc': params.get('npc', '')
610
+ }
611
+
612
+ native_func_call = transformer.transform_ai_function(
613
+ func_name,
614
+ **transform_kwargs
615
+ )
616
+
617
+ print(f"DEBUG: Replacing '{original_nql_call}' with '{native_func_call}'")
618
+
619
+ # NORMALIZE WHITESPACE in both the original call and the SQL
620
+ # This handles multiline NQL calls with varying indentation
621
+ normalized_original = re.sub(r'\s+', ' ', original_nql_call).strip()
622
+ normalized_sql = re.sub(r'\s+', ' ', sql_to_execute_with_native_ai).strip()
623
+
624
+ # Find the normalized pattern in the normalized SQL
625
+ if normalized_original in normalized_sql:
626
+ # Now do the replacement on the ORIGINAL (non-normalized) SQL
627
+ # by creating a flexible regex pattern
628
+ # Escape special regex chars but allow flexible whitespace
629
+ pattern_parts = [re.escape(part) for part in original_nql_call.split()]
630
+ flexible_pattern = r'\s*'.join(pattern_parts)
631
+ pattern = re.compile(flexible_pattern, re.IGNORECASE | re.DOTALL)
632
+
633
+ old_sql = sql_to_execute_with_native_ai
634
+ sql_to_execute_with_native_ai = pattern.sub(native_func_call, sql_to_execute_with_native_ai, count=1)
635
+
636
+ if old_sql != sql_to_execute_with_native_ai:
637
+ print(f"DEBUG: Successfully replaced with flexible whitespace pattern.")
638
+ else:
639
+ print(f"ERROR: Flexible pattern replacement failed for '{func_name}'.")
640
+ else:
641
+ print(f"ERROR: Could not find normalized NQL call in SQL for '{func_name}'.")
642
+
643
+ except ValueError as e:
644
+ print(f"WARNING: Native translation failed for '{func_name}': {e}. This AI function will NOT be natively translated.")
645
+ except Exception as e:
646
+ print(f"ERROR: An unexpected error occurred during native AI transformation for '{func_name}': {e}. This AI function will NOT be natively translated.") # Check for remaining NQL calls
647
+ if "nql." in sql_to_execute_with_native_ai.lower():
648
+ print(f"WARNING: Some NQL calls remain after native translation attempts. Replacing remaining NQL calls with NULLs.")
649
+ sql_to_execute_with_native_ai = self._replace_nql_calls_with_null(sql_to_execute_with_native_ai, model)
650
+
651
+ print(f"DEBUG: Final SQL for native/mixed AI execution:\n{sql_to_execute_with_native_ai}\n")
652
+ target_engine_for_native_ai = self.target_engine
653
+ return pd.read_sql(sql_to_execute_with_native_ai, target_engine_for_native_ai)
654
+
655
+ else: # Fallback path when native AI is not supported for the determined DB type
656
+ print(f"DEBUG: Native AI functions are NOT supported for '{db_type}'. Entering Python fallback path.")
657
+ sql_with_nql_as_null = self._replace_nql_calls_with_null(processed_sql, model)
658
+
659
+ print(f"DEBUG: SQL to execute in pure fallback (NQL as NULLs for DB):\n{sql_with_nql_as_null}\n")
660
+
661
+ target_engine_for_fallback = self.target_engine # Use target_engine directly
662
+ df = pd.read_sql(sql_with_nql_as_null, target_engine_for_fallback)
663
+
664
+ # Apply Python-driven AI functions on the DataFrame
665
+ for func_name, params in model.ai_functions.items():
666
+ try:
667
+ result_series = self.npc_operations.execute_ai_function(func_name, df, **params)
668
+ result_column_name = f"{func_name}_{params.get('column', 'result')}" # Use a more specific alias if possible
669
+ df[result_column_name] = result_series
670
+ print(f"DEBUG: Python-driven AI function '{func_name}' executed. Result in column '{result_column_name}'.")
671
+ except Exception as e:
672
+ print(f"ERROR: Executing Python-driven AI function '{func_name}': {e}. Assigning NULL.")
673
+ df[f"{func_name}_{params.get('column', 'result')}"] = None
674
+
675
+ return df
275
676
 
677
+ def _replace_nql_calls_with_null(self, sql_content: str, model: SQLModel) -> str:
678
+ """
679
+ Replaces specific nql.func(...) as alias calls with NULL as alias.
680
+ This is used for the fallback path or to clean up any NQL calls missed by native translation.
681
+ """
682
+ modified_sql = sql_content
683
+ for func_name, params in model.ai_functions.items():
684
+ original_nql_call = params.get('full_call_string')
685
+ if not original_nql_call:
686
+ print(f"WARNING: 'full_call_string' not found for NQL function '{func_name}'. Cannot replace with NULL.")
687
+ continue
276
688
 
277
- def _extract_base_query(self, sql: str) -> str:
278
- for dep in self.models[self.current_model].dependencies:
279
- sql = sql.replace(f"{{{{ ref('{dep}') }}}}", dep)
689
+ # Extract alias from the original_nql_call string for NULL replacement
690
+ alias_match = re.search(r'\s+as\s+(\w+)(?:\W|$)', original_nql_call, re.IGNORECASE)
691
+ alias_name = alias_match.group(1) if alias_match else f"{func_name}_{params.get('column', 'result')}"
280
692
 
281
- nql_pattern = r"nql\.\w+\s*\([^)]*(?:\([^)]*\)[^)]*)*\)\s+as\s+(\w+)"
282
-
283
- def replace_nql_func(match):
284
- alias_name = match.group(1)
285
- return f"NULL as {alias_name}"
286
-
287
- cleaned_sql = re.sub(nql_pattern, replace_nql_func, sql, flags=re.DOTALL)
288
- return cleaned_sql
693
+ # Create a robust pattern for the original NQL call to handle whitespace variability
694
+ escaped_original_call = re.escape(original_nql_call.strip())
695
+ pattern_to_sub = re.compile(r"\s*".join(escaped_original_call.split()), flags=re.IGNORECASE)
289
696
 
290
- def _execute_standard_sql(self, sql: str) -> pd.DataFrame:
291
- try:
292
- sql = re.sub(r"--.*?\n", "\n", sql)
293
- sql = re.sub(r"\s+", " ", sql).strip()
294
- return pd.read_sql(sql, self.engine)
295
- except Exception as e:
296
- print(f"Failed to execute SQL: {sql}")
297
- print(f"Error: {str(e)}")
298
- raise
299
-
300
- def _execute_ai_model(self, sql: str, model: SQLModel) -> pd.DataFrame:
301
- """Execute SQL with AI functions"""
302
- source_pattern = r'FROM\s+(\w+)\.(\w+)'
303
- matches = re.findall(source_pattern, sql)
304
-
305
- if matches:
306
- source_name, table_name = matches[0]
307
- engine = self._get_engine(source_name)
308
-
309
- if self._has_native_ai_functions(source_name):
310
- return pd.read_sql(sql.replace(f"{source_name}.", ""), engine)
697
+ # Perform the replacement with NULL as alias
698
+ old_sql = modified_sql
699
+ modified_sql, count = pattern_to_sub.subn(f"NULL as {alias_name}", modified_sql)
700
+ if count == 0:
701
+ print(f"WARNING: NULL replacement failed for NQL call '{original_nql_call}' (no change to SQL). SQL still contains NQL call.")
311
702
  else:
312
- base_sql = self._extract_base_query(sql)
313
- df = pd.read_sql(base_sql.replace(f"{source_name}.", ""), engine)
314
- else:
315
- base_sql = self._extract_base_query(sql)
316
- df = pd.read_sql(base_sql, self.engine)
317
-
318
- for func_name, params in model.ai_functions.items():
319
- result_series = self.npc_operations.execute_ai_function(
320
- func_name, df, **params
321
- )
322
- df[f"{func_name}_result"] = result_series
323
-
324
- return df
703
+ print(f"DEBUG: Replaced NQL call '{original_nql_call}' with 'NULL as {alias_name}'.")
704
+
705
+ return modified_sql
325
706
 
326
707
def execute_model(self, model_name: str) -> pd.DataFrame:
    """
    Run one model and materialize its result.

    The model's SQL is first passed through ``_clean_sql_for_execution``.
    Models flagged with ``has_ai_function`` are handed to
    ``_execute_ai_model``; all others have their model references replaced
    and are executed as standard SQL against ``self.target_engine``. The
    resulting frame is then passed to ``_materialize_to_db`` together with
    the model's config.

    Args:
        model_name: Key of the model in ``self.models``.

    Returns:
        The DataFrame produced by executing the model.
    """
    self.current_model = model_name
    model = self.models[model_name]

    sql_ready = self._clean_sql_for_execution(model.content)
    print(f"DEBUG: Cleaned SQL content for model '{model_name}':\n{sql_ready}\n")

    if not model.has_ai_function:
        # Plain SQL path: resolve references, then run on the target engine.
        standard_sql = self._replace_model_references(sql_ready)
        print(f"DEBUG: Compiled standard SQL for model '{model_name}':\n{standard_sql}\n")
        result_df = self._execute_standard_sql(standard_sql, self.target_engine)
    else:
        result_df = self._execute_ai_model(sql_ready, model)

    self._materialize_to_db(model_name, result_df, model.config)
    return result_df
344
729
 
345
- def _materialize_to_db(self, model_name: str, df: pd.DataFrame):
346
- with self.engine.begin() as conn:
347
- conn.execute(text(f"DROP TABLE IF EXISTS {model_name}"))
348
- df.to_sql(model_name, self.engine, index=False)
349
- print(f"Materialized model {model_name} to database")
730
+ def _materialize_to_db(
731
+ self,
732
+ model_name: str,
733
+ df: pd.DataFrame,
734
+ config: Dict
735
+ ):
736
+ materialization = config.get('materialized', 'table')
737
+
738
+ table_name = model_name
739
+ table_name_with_schema = (
740
+ f"{self.target_schema}.{table_name}"
741
+ if self.target_schema
742
+ else table_name
743
+ )
744
+
745
+ with self.target_engine.begin() as conn:
746
+ if self.target_schema:
747
+ inspector = inspect(conn)
748
+ if not inspector.has_schema(self.target_schema):
749
+ print(f"Creating schema '{self.target_schema}'...")
750
+ conn.execute(text(f"CREATE SCHEMA IF NOT EXISTS {self.target_schema}"))
751
+ print(f"Schema '{self.target_schema}' created (if it didn't exist).")
752
+
753
+ if materialization == 'view':
754
+ print(
755
+ f"Warning: Materialization '{materialization}' requested for model '{model_name}'. "
756
+ f"Pandas `to_sql` does not directly create SQL VIEWS from DataFrames. "
757
+ f"Materializing as TABLE instead. You may need to manually create the view."
758
+ )
759
+ df.to_sql(
760
+ table_name,
761
+ self.target_engine,
762
+ schema=self.target_schema,
763
+ index=False,
764
+ if_exists='replace'
765
+ )
766
+ print(f"Materialized model {model_name} as TABLE to {table_name_with_schema}")
767
+ else:
768
+ df.to_sql(
769
+ table_name,
770
+ self.target_engine,
771
+ schema=self.target_schema,
772
+ index=False,
773
+ if_exists='replace'
774
+ )
775
+ print(f"Materialized model {model_name} as TABLE to {table_name_with_schema}")
350
776
 
351
777
def _table_exists(self, table_name: str) -> bool:
    """
    Report whether a table or view with this name exists in the target database.

    The lookup is scoped to ``self.target_schema``.

    Args:
        table_name: Unqualified table or view name to look for.

    Returns:
        True if the inspector reports a table of that name, or the name is
        among the reflected view names; False otherwise.
    """
    schema = self.target_schema
    with self.target_engine.connect() as conn:
        inspector = inspect(conn)
        if inspector.has_table(table_name, schema=schema):
            return True
        # Inspector has no has_view() method, so calling it raised
        # AttributeError whenever has_table() was False. Check the reflected
        # view names instead.
        try:
            return table_name in inspector.get_view_names(schema=schema)
        except NotImplementedError:
            # Dialect cannot list views; has_table() was the only signal.
            return False
358
782
 
359
783
  def run_all_models(self):
360
- """Execute all models in dependency order"""
361
784
  self.discover_models()
362
785
  execution_order = self.topological_sort()
363
786
 
@@ -370,8 +793,12 @@ class ModelCompiler:
370
793
  model = self.models[model_name]
371
794
  for dep in model.dependencies:
372
795
  if not self._table_exists(dep):
373
- raise ValueError(f"Dependency {dep} not found in database for model {model_name}")
796
+ if dep not in results:
797
+ raise ValueError(
798
+ f"Dependency '{dep}' for model '{model_name}' not found in database or already processed models. "
799
+ f"Please ensure all dependencies are resolved and run first."
800
+ )
374
801
 
375
802
  results[model_name] = self.execute_model(model_name)
376
803
 
377
- return results
804
+ return results