tdfs4ds 0.2.4.47__py3-none-any.whl → 0.2.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1877 @@
1
+
2
+ from typing import Sequence, Optional, Dict, Any, List
3
+ import textwrap
4
+
5
+ from langchain_openai import ChatOpenAI
6
+ from langchain_core.prompts import ChatPromptTemplate
7
+ from langchain_core.output_parsers import JsonOutputParser
8
+ from langchain_core.runnables import Runnable, RunnableLambda
9
+ from langchain_core.messages import AIMessage
10
+ from IPython.display import HTML, display
11
+
12
+ import tdfs4ds
13
+ from tdfs4ds import logger_safe
14
+
15
+ import teradataml as tdml
16
+ import json
17
+ import ast
18
+ import re
19
+ import sqlparse
20
+
21
+ from teradataml.context.context import _get_database_username
22
+ import pandas as pd
23
+
24
+ from tdfs4ds.process_store.process_store_catalog_management import get_process_info
25
+
26
+
27
+ def _robust_json_parser(response: str) -> Dict[str, Any]:
28
+ """
29
+ Robustly extract and parse JSON from LLM responses.
30
+ Handles markdown code fences, escaped characters, and formatting variations.
31
+
32
+ Parameters
33
+ ----------
34
+ response : str
35
+ The raw response string from the LLM.
36
+
37
+ Returns
38
+ -------
39
+ dict
40
+ The parsed JSON as a dictionary.
41
+
42
+ Raises
43
+ ------
44
+ ValueError
45
+ If JSON cannot be extracted or parsed from the response.
46
+ """
47
+ if not isinstance(response, str):
48
+ raise ValueError(f"Expected string response, got {type(response)}")
49
+
50
+ # Try 1: Direct JSON parse (response might already be clean)
51
+ try:
52
+ return json.loads(response.strip())
53
+ except json.JSONDecodeError:
54
+ pass
55
+
56
+ # Try 2: Extract from markdown code fences (most flexible)
57
+ # Match opening backticks (with optional json language specifier) and closing backticks
58
+ # Using non-greedy matching with DOTALL to handle multiline content
59
+ markdown_patterns = [
60
+ r'```(?:json)?\s*\n(.*)\n```', # ```json\n...\n``` (any content in middle)
61
+ r'```(?:json)?\s*\r?\n(.*?)\r?\n```', # Handle Windows line endings
62
+ r'```(?:json)?\s*(.*?)\s*```', # ```...``` (flexible whitespace)
63
+ r'`{3}\s*(?:json)?\s*(.*?)\s*`{3}', # Alternative triple backticks
64
+ ]
65
+ for pattern in markdown_patterns:
66
+ match = re.search(pattern, response, re.DOTALL | re.IGNORECASE)
67
+ if match:
68
+ try:
69
+ extracted = match.group(1).strip()
70
+ # Normalize line endings
71
+ extracted = extracted.replace('\r\n', '\n').replace('\r', '\n')
72
+ if extracted: # Only try if we got something
73
+ return json.loads(extracted)
74
+ except (json.JSONDecodeError, IndexError):
75
+ pass
76
+
77
+ # Try 3: Extract first { ... } block (handles extra text before/after)
78
+ first_brace = response.find('{')
79
+ last_brace = response.rfind('}')
80
+ if first_brace != -1 and last_brace > first_brace:
81
+ try:
82
+ extracted = response[first_brace:last_brace+1]
83
+ # Normalize line endings
84
+ extracted = extracted.replace('\r\n', '\n').replace('\r', '\n')
85
+ return json.loads(extracted)
86
+ except json.JSONDecodeError:
87
+ pass
88
+
89
+ # Try 4: Remove markdown fences and retry
90
+ # Aggressively strip all markdown code fence markers
91
+ cleaned = response.strip()
92
+ cleaned = re.sub(r'^```\s*(?:json)?\s*', '', cleaned, flags=re.IGNORECASE)
93
+ cleaned = re.sub(r'\s*```\s*$', '', cleaned)
94
+ cleaned = re.sub(r'^`+\s*', '', cleaned)
95
+ cleaned = re.sub(r'\s*`+$', '', cleaned)
96
+ cleaned = cleaned.strip()
97
+ # Normalize line endings
98
+ cleaned = cleaned.replace('\r\n', '\n').replace('\r', '\n')
99
+ try:
100
+ return json.loads(cleaned)
101
+ except json.JSONDecodeError:
102
+ pass
103
+
104
+ # Try 5: As a last resort, try ast.literal_eval (for Python-like dicts)
105
+ try:
106
+ import ast
107
+ # Normalize line endings
108
+ cleaned = cleaned.replace('\r\n', '\n').replace('\r', '\n')
109
+ return ast.literal_eval(cleaned)
110
+ except (ValueError, SyntaxError):
111
+ pass
112
+
113
+ # If all else fails, raise informative error
114
+ logger_safe('error', f'Failed to parse JSON from LLM response. Full response: {response}')
115
+ raise ValueError(f"Could not extract valid JSON from response. First 200 chars: {response[:200]}")
116
+
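+ # Illustrative usage sketch (not part of the original module): the parser accepts raw
+ # LLM text and falls back through several extraction strategies. The sample response
+ # below is hypothetical.
+ #
+ #     sample = '```json\n{"customer_id": "Primary key of the customer."}\n```'
+ #     parsed = _robust_json_parser(sample)
+ #     assert parsed == {"customer_id": "Primary key of the customer."}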
117
+
118
+ # HTML Styling Constants
119
+ HTML_STYLES = {
120
+ "container": "font-family: Arial, sans-serif; margin: 10px 0;",
121
+ "title": "color: #1f618d; margin-bottom: 6px;",
122
+ "heading": "color: #2c3e50; border-bottom: 2px solid #3498db; padding-bottom: 5px;",
123
+ "heading_margin": "color: #2c3e50; border-bottom: 2px solid #3498db; padding-bottom: 5px; margin-top: 15px;",
124
+ "content": "background-color: #ecf0f1; padding: 10px; border-radius: 5px; line-height: 1.6;",
125
+ "list": "background-color: #ecf0f1; padding: 15px 30px; border-radius: 5px; line-height: 1.8;",
126
+ }
127
+
128
+
129
+ def _is_notebook() -> bool:
130
+ """Check if code is running in a Jupyter notebook."""
131
+ try:
132
+ # Check if IPython is available
133
+ from IPython import get_ipython
134
+ ipython = get_ipython()
135
+ if ipython is None:
136
+ return False
137
+
138
+ # Check for notebook kernel
139
+ if hasattr(ipython, 'kernel') and ipython.kernel is not None:
140
+ return True
141
+
142
+ # Check config for IPKernelApp (notebook kernel)
143
+ if hasattr(ipython, 'config') and 'IPKernelApp' in ipython.config:
144
+ return True
145
+
146
+ # Check if it's a ZMQInteractiveShell (notebook shell)
147
+ if ipython.__class__.__name__ == 'ZMQInteractiveShell':
148
+ return True
149
+
150
+ # Check for ipykernel in sys.modules
151
+ import sys
152
+ if 'ipykernel' in sys.modules:
153
+ return True
154
+
155
+ return False
156
+ except Exception:
157
+ return False
158
+
159
+
160
+ def _build_provider_llm_caller(llm: ChatOpenAI, provider: str, schema: Optional[Dict] = None):
161
+ """
162
+ Build a provider-specific LLM call wrapper for constrained output.
163
+
164
+ Parameters
165
+ ----------
166
+ llm : ChatOpenAI
167
+ The language model interface.
168
+ provider : str
169
+ The LLM provider (vllm, openai, ollama, azure, etc).
170
+ schema : dict, optional
171
+ JSON schema for constrained output.
172
+
173
+ Returns
174
+ -------
175
+ callable
176
+ A function that invokes the LLM with appropriate constraints.
177
+ """
178
+ if schema is None:
179
+ return lambda messages: llm.invoke(messages)
180
+
181
+ provider_l = provider.lower()
182
+
183
+ if provider_l in ("vllm", "openai-compatible"):
184
+ return lambda messages: llm.invoke(messages, extra_body={"guided_json": schema})
185
+
186
+ if provider_l in ("openai", "azure", "azure-openai"):
187
+ return lambda messages: llm.invoke(messages, response_format=schema)
188
+
189
+ if provider_l == "ollama":
190
+ return lambda messages: llm.invoke(messages, format=schema)
191
+
192
+ # Fallback: no constraints
193
+ return lambda messages: llm.invoke(messages)
194
+
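+ # Illustrative sketch (assumed usage, not original code): the returned callable is
+ # invoked with a list of chat messages, e.g. the rendered output of a ChatPromptTemplate.
+ # For a vLLM backend the schema is forwarded as extra_body={"guided_json": schema}.
+ #
+ #     schema = build_documentation_json_schema(["customer_id"], provider="vllm")
+ #     call_llm = _build_provider_llm_caller(llm, "vllm", schema)
+ #     ai_msg = call_llm([("human", "Document the column customer_id.")])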
195
+
196
+ def _print_documentation(
197
+ documented_sql: str,
198
+ entity_description: str,
199
+ documented_entity_columns: Dict[str, str],
200
+ documented_feature_columns: Dict[str, str],
201
+ process_id: Optional[str] = None,
202
+ view_name: Optional[str] = None,
203
+ explain_analysis: Optional[str] = None,
204
+ optimization_score: Optional[int] = None,
205
+ explain_warnings: Optional[List[str]] = None,
206
+ explain_recommendations: Optional[List[str]] = None,
207
+ sql_query: Optional[str] = None,
208
+ explain_plan: Optional[str] = None,
209
+ ) -> None:
210
+ """
211
+ Pretty print documentation with context-aware formatting.
212
+ Uses HTML in notebooks, text format in regular scripts.
213
+
214
+ Parameters
215
+ ----------
216
+ documented_sql : str
217
+ The query business logic description.
218
+ entity_description : str
219
+ The entity description.
220
+ documented_entity_columns : dict
221
+ Mapping of entity column names to descriptions.
222
+ documented_feature_columns : dict
223
+ Mapping of feature column names to descriptions.
224
+ process_id : str, optional
225
+ The process identifier for the title.
226
+ view_name : str, optional
227
+ The view name for the title.
228
+ explain_analysis : str, optional
229
+ The EXPLAIN plan analysis description.
230
+ optimization_score : int, optional
231
+ Optimization score from 1 to 5.
232
+ explain_warnings : list, optional
233
+ List of warnings from EXPLAIN analysis.
234
+ explain_recommendations : list, optional
235
+ List of recommendations from EXPLAIN analysis.
236
+ sql_query : str, optional
237
+ The original SQL query to display.
238
+ explain_plan : str, optional
239
+ The raw EXPLAIN plan output to display.
240
+ """
241
+ title = ''
242
+ if process_id or view_name:
243
+ title_parts = []
244
+ if process_id:
245
+ title_parts.append(f"Process: {process_id}")
246
+ if view_name:
247
+ title_parts.append(f"View: {view_name}")
248
+ title = ' — '.join(title_parts)
249
+
250
+ # Helpers to parse structured items and clean markdown (available in both contexts)
251
+ def _try_parse_structured(value):
252
+ if value is None:
253
+ return None
254
+ if isinstance(value, (dict, list)):
255
+ return value
256
+ if not isinstance(value, str):
257
+ return value
258
+ s = value.strip()
259
+ # Try JSON
260
+ try:
261
+ return json.loads(s)
262
+ except Exception:
263
+ pass
264
+ # Try Python literal
265
+ try:
266
+ return ast.literal_eval(s)
267
+ except Exception:
268
+ pass
269
+ return s
270
+
271
+ def _flatten_to_list(parsed):
272
+ if parsed is None:
273
+ return []
274
+ if isinstance(parsed, list):
275
+ out = []
276
+ for it in parsed:
277
+ out.extend(_flatten_to_list(it))
278
+ return out
279
+ if isinstance(parsed, dict):
280
+ # Prefer obvious value keys, else format key: value pairs
281
+ for k in ("issue", "warning", "action", "recommendation", "msg", "message"):
282
+ if k in parsed:
283
+ return [str(parsed[k])]
284
+ return ["; ".join(f"{kk}: {vv}" for kk, vv in parsed.items())]
285
+ return [str(parsed)]
286
+
287
+ def _strip_md(s: str) -> str:
288
+ # Remove **bold** and inline markdown emphasis for plain text
289
+ s = re.sub(r"\*\*(.*?)\*\*", r"\1", s)
290
+ s = re.sub(r"\*(.*?)\*", r"\1", s)
291
+ return s
292
+
293
+ def _md_to_html(s: str) -> str:
294
+ # Convert **bold** to <strong>
295
+ s = re.sub(r"\*\*(.*?)\*\*", r"<strong>\1</strong>", s)
296
+ s = re.sub(r"\*(.*?)\*", r"<em>\1</em>", s)
297
+ # Escape simple < and > to avoid broken HTML (keep basic newlines)
298
+ s = s.replace("<", "&lt;").replace(">", "&gt;")
299
+ # Restore our strong/em tags
300
+ s = s.replace("&lt;strong&gt;", "<strong>").replace("&lt;/strong&gt;", "</strong>")
301
+ s = s.replace("&lt;em&gt;", "<em>").replace("&lt;/em&gt;", "</em>")
302
+ return s
303
+
304
+ # Build EXPLAIN section if available
305
+ parsed_explain = _try_parse_structured(explain_analysis)
306
+ parsed_warnings = _try_parse_structured(explain_warnings)
307
+ parsed_recs = _try_parse_structured(explain_recommendations)
308
+
309
+ warn_list = _flatten_to_list(parsed_warnings)
310
+ rec_list = _flatten_to_list(parsed_recs)
311
+
312
+ explain_section = ""
313
+ if parsed_explain or optimization_score or warn_list or rec_list:
314
+ score_color = "#27ae60" if optimization_score and optimization_score >= 4 else "#f39c12" if optimization_score and optimization_score == 3 else "#e74c3c"
315
+ explain_section = f"""
316
+ <h3 style="{HTML_STYLES['heading_margin']}">Query Optimization Analysis</h3>
317
+ <div style="background-color: #ecf0f1; padding: 10px; border-radius: 5px; margin-bottom: 10px;">
318
+ <p><strong>Optimization Score:</strong> <span style="color: {score_color}; font-size: 18px; font-weight: bold;">{optimization_score}/5</span></p>
319
+ </div>
320
+ """
321
+
322
+ if parsed_explain:
323
+ # Display explanation as plain text, preserving newlines
324
+ explain_text = parsed_explain if isinstance(parsed_explain, str) else str(parsed_explain)
325
+ explain_text_html = explain_text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('\n', '<br>')
326
+ explain_section += f'<div style="{HTML_STYLES["content"]}">{explain_text_html}</div>'
327
+
328
+ if warn_list:
329
+ warnings_html = '\n'.join('<li style="color: #c0392b;">' + _md_to_html(w).replace('\n', '<br>') + '</li>' for w in warn_list)
330
+ explain_section += f"""
331
+ <h4 style="color: #c0392b; margin-top: 10px;">⚠ Warnings</h4>
332
+ <ul style="{HTML_STYLES['list']}">{warnings_html}</ul>
333
+ """
334
+
335
+ if rec_list:
336
+ recommendations_html = '\n'.join('<li style="color: #27ae60;">' + _md_to_html(r).replace('\n', '<br>') + '</li>' for r in rec_list)
337
+ explain_section += f"""
338
+ <h4 style="color: #27ae60; margin-top: 10px;">✓ Recommendations</h4>
339
+ <ul style="{HTML_STYLES['list']}">{recommendations_html}</ul>
340
+ """
341
+
342
+ if _is_notebook():
343
+ title_html = f"<h2>{title}</h2>" if title else ""
344
+ entity_items = '\n'.join(f'<li><strong>{col}:</strong> {_md_to_html(desc)}</li>' for col, desc in documented_entity_columns.items())
345
+ feature_items = '\n'.join(f'<li><strong>{col}:</strong> {_md_to_html(desc)}</li>' for col, desc in documented_feature_columns.items())
346
+
347
+ # Build optional sections
348
+ sql_section = ""
349
+ if sql_query:
350
+ formatted_sql = sqlparse.format(sql_query, reindent=True, keyword_case='upper')
351
+ sql_section = f"""
352
+ <h3 style="{HTML_STYLES['heading_margin']}">Original SQL Query</h3>
353
+ <pre style="background-color: #f8f9fa; padding: 15px; border-radius: 5px; border: 1px solid #dee2e6; font-family: 'Courier New', monospace; font-size: 12px; overflow-x: auto; white-space: pre-wrap;">{formatted_sql}</pre>
354
+ """
355
+
356
+ explain_plan_section = ""
357
+ if explain_plan:
358
+ explain_plan_section = f"""
359
+ <h3 style="{HTML_STYLES['heading_margin']}">EXPLAIN Plan</h3>
360
+ <pre style="background-color: #f8f9fa; padding: 15px; border-radius: 5px; border: 1px solid #dee2e6; font-family: 'Courier New', monospace; font-size: 12px; overflow-x: auto; white-space: pre-wrap;">{explain_plan}</pre>
361
+ """
362
+
363
+ html_content = f"""
364
+ <div style="{HTML_STYLES['container']}">
365
+ {title_html}
366
+ <h3 style="{HTML_STYLES['heading']}">Query Business Logic</h3>
367
+ <p style="{HTML_STYLES['content']}">{documented_sql}</p>
368
+
369
+ <h3 style="{HTML_STYLES['heading_margin']}">Entity Description</h3>
370
+ <p style="{HTML_STYLES['content']}">{entity_description}</p>
371
+
372
+ <h3 style="{HTML_STYLES['heading_margin']}">Entity Columns</h3>
373
+ <ul style="{HTML_STYLES['list']}">{entity_items}</ul>
374
+
375
+ <h3 style="{HTML_STYLES['heading_margin']}">Feature Columns</h3>
376
+ <ul style="{HTML_STYLES['list']}">{feature_items}</ul>
377
+
378
+ {explain_section}
379
+ {sql_section}
380
+ {explain_plan_section}
381
+ </div>
382
+ """
383
+ display(HTML(html_content))
384
+ else:
385
+ # Text formatting for regular scripts
386
+ print("\n" + "="*100)
387
+ print(title if title else "DOCUMENTATION")
388
+ print("="*100)
389
+ print("\nQuery Business Logic:")
390
+ print(textwrap.fill(documented_sql, width=100))
391
+
392
+ print("\nEntity Description:")
393
+ print(textwrap.fill(entity_description, width=100))
394
+
395
+ print("\nEntity Columns Documentation:")
396
+ for col, desc in documented_entity_columns.items():
397
+ print(f"\n {col}:")
398
+ print(textwrap.fill(desc, width=95, initial_indent=" ", subsequent_indent=" "))
399
+
400
+ print("\nFeature Columns Documentation:")
401
+ for col, desc in documented_feature_columns.items():
402
+ print(f"\n {col}:")
403
+ print(textwrap.fill(desc, width=95, initial_indent=" ", subsequent_indent=" "))
404
+
405
+ # Print EXPLAIN analysis if available
406
+ if explain_analysis or optimization_score or explain_warnings or explain_recommendations:
407
+ print("\n" + "-"*100)
408
+ print("QUERY OPTIMIZATION ANALYSIS")
409
+ print("-"*100)
410
+
411
+ if optimization_score:
412
+ print(f"Optimization Score: {optimization_score}/5")
413
+
414
+ # Print parsed explanation, preserving carriage returns.
415
+ if parsed_explain:
416
+ print("\nExplanation:")
417
+ if isinstance(parsed_explain, str):
418
+ print(parsed_explain)
419
+ else:
420
+ print(str(parsed_explain))
421
+
422
+ # Print warnings (flattened) preserving carriage returns
423
+ if warn_list:
424
+ print("\nWarnings:")
425
+ for w in warn_list:
426
+ print(f" - {w}")
427
+
428
+ # Print recommendations (flattened) preserving carriage returns
429
+ if rec_list:
430
+ print("\nRecommendations:")
431
+ for r in rec_list:
432
+ print(f" - {r}")
433
+
434
+ # Print original SQL query if provided
435
+ if sql_query:
436
+ print("\n" + "-"*100)
437
+ print("ORIGINAL SQL QUERY")
438
+ print("-"*100)
439
+ formatted_sql = sqlparse.format(sql_query, reindent=True, keyword_case='upper')
440
+ print(textwrap.indent(formatted_sql, ' '))
441
+
442
+ # Print EXPLAIN plan if provided
443
+ if explain_plan:
444
+ print("\n" + "-"*100)
445
+ print("EXPLAIN PLAN")
446
+ print("-"*100)
447
+ print(explain_plan)
448
+
449
+ print("\n" + "="*100 + "\n")
450
+
451
+
452
+ def build_llm(
453
+ llm_service: str = "https://api-dmproject.myddns.me/v1",
454
+ api_key: str = "YOUR_API_KEY_HERE",
455
+ model_id: str = "mistralai/Ministral-3-14B-Instruct-2512",
456
+ temperature: float = 0.0,
457
+ timeout: int = 120,
458
+ ) -> ChatOpenAI:
459
+ """
460
+ Build and return a ChatOpenAI client pointed at your vLLM/OpenAI-compatible endpoint.
461
+
462
+ Parameters
463
+ ----------
464
+ llm_service : str
465
+ Base URL of the LLM service.
466
+ api_key : str
467
+ API key for authentication.
468
+ model_id : str
469
+ Model identifier.
470
+ temperature : float
471
+ Sampling temperature for response diversity.
472
+ timeout : int
473
+ Request timeout in seconds.
474
+
475
+ Returns
476
+ -------
477
+ ChatOpenAI
478
+ Configured LLM client.
479
+
480
+ Raises
481
+ ------
482
+ Exception
483
+ If LLM client creation fails.
484
+ """
485
+ logger_safe('info', f'build_llm: Using LLM service at {llm_service} with model {model_id}')
486
+ logger_safe('debug', f'build_llm: Temperature={temperature}, Timeout={timeout}s')
487
+
488
+ try:
489
+ return ChatOpenAI(
490
+ base_url=llm_service,
491
+ api_key=api_key,
492
+ model=model_id,
493
+ temperature=temperature,
494
+ timeout=timeout,
495
+ )
496
+ except Exception as e:
497
+ logger_safe('error', f'build_llm: Failed to create LLM client: {e}')
498
+ raise
499
+
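+ # Illustrative usage sketch; the endpoint, key and model below are placeholders,
+ # not values shipped with this package.
+ #
+ #     llm = build_llm(
+ #         llm_service="http://localhost:8000/v1",  # any OpenAI-compatible endpoint
+ #         api_key="EMPTY",
+ #         model_id="my-org/my-instruct-model",
+ #         temperature=0.0,
+ #     )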
500
+
501
+
502
+
503
+ def build_documentation_json_schema(columns: List[str], provider: str = "generic") -> Dict[str, Any]:
504
+ """
505
+ Build a provider-appropriate JSON Schema used to enforce strict JSON output
506
+ for SQL column documentation across multiple LLM backends.
507
+
508
+ This function returns different schema shapes depending on the LLM provider,
509
+ because each ecosystem uses a different structured-output mechanism:
510
+
511
+ Provider Modes
512
+ --------------
513
+ - provider="openai", "azure"
514
+ Returns the JSON Schema wrapped in OpenAI's `response_format={"type": "json_schema", ...}`
515
+ structure. Supported by GPT-4.1, GPT-4o, GPT-3.5-Turbo, and Azure OpenAI.
516
+
517
+ - provider="anthropic", "claude"
518
+ Returns an Anthropic *tool schema* definition. Claude 3.x models use tool
519
+ schemas to enforce strict JSON output.
520
+
521
+ - provider="ollama"
522
+ Returns the raw JSON schema that Ollama expects under the `format=` parameter
523
+ of the generate API. (Ollama 0.2+ supports response schemas.)
524
+
525
+ - provider="vllm"
526
+ Returns plain JSON Schema for use with vLLM's `guided_json` constrained decoding.
527
+
528
+ - provider="bedrock"
529
+ Bedrock Claude follows the Anthropic tool schema format.
530
+ Bedrock Llama / Titan accept plain JSON schema. This function returns the base
531
+ schema and leaves the final wrapping to the caller.
532
+
533
+ - provider="generic"
534
+ Returns plain JSON schema. Useful for LLM backends that do not support
535
+ constrained decoding and must rely on prompt-only JSON generation or post-processing repair.
536
+
537
+ Parameters
538
+ ----------
539
+ columns : list[str]
540
+ Column names to include as required JSON object keys. Each column will map
541
+ to a string description generated by the model.
542
+
543
+ provider : str, optional
544
+ The model provider or backend type. Determines the structural format
545
+ required for constrained generation. One of:
546
+ "openai", "anthropic", "ollama", "vllm", "bedrock", "generic".
547
+
548
+ Returns
549
+ -------
550
+ dict
551
+ A dictionary representing the JSON Schema or provider-specific wrapper
552
+ used to enforce strict JSON output during LLM generation.
553
+
554
+ Notes
555
+ -----
556
+ - All schemas require that:
557
+ * the output be a JSON object
558
+ * keys match exactly the column names
559
+ * all values be strings
560
+ * additional properties be disallowed
561
+
562
+ - Not all providers enforce schemas equally:
563
+ * OpenAI, Claude, and vLLM offer hard guarantees.
564
+ * Ollama enforces schema reasonably well.
565
+ * Generic models may require post-processing.
566
+ """
567
+ # Base JSON schema — used directly by vLLM, Ollama, Bedrock, fallback
568
+ base_schema = {
569
+ "type": "object",
570
+ "properties": {col: {"type": "string"} for col in columns},
571
+ "required": list(columns),
572
+ "additionalProperties": False,
573
+ }
574
+
575
+ # --- Provider-specific formats ---
576
+
577
+ if provider.lower() in ("openai", "azure", "azure-openai"):
578
+ # OpenAI's required wrapper structure
579
+ return {
580
+ "type": "json_schema",
581
+ "json_schema": {
582
+ "name": "ColumnDocumentation",
583
+ "schema": base_schema,
584
+ "strict": True,
585
+ }
586
+ }
587
+
588
+ if provider.lower() in ("anthropic", "claude"):
589
+ # Anthropic tool schema
590
+ # You embed this inside the "tools" field when calling the model
591
+ return {
592
+ "name": "column_documentation",
593
+ "description": "Generate documentation for SQL output columns.",
594
+ "input_schema": base_schema
595
+ }
596
+
597
+ if provider.lower() == "ollama":
598
+ # Ollama's output format schema (unwrapped JSON schema)
599
+ # Returned directly in: generate(..., format=schema)
600
+ return base_schema
601
+
602
+ if provider.lower() in ("vllm", "openai-compatible"):
603
+ # vLLM's guided_json uses *plain JSON Schema*
604
+ # so return base_schema exactly
605
+ return base_schema
606
+
607
+ if provider.lower() == "bedrock":
608
+ # Bedrock Claude uses Anthropic schema
609
+ # Bedrock Llama uses plain JSON schema
610
+ # Return base_schema and let caller choose
611
+ return base_schema
612
+
613
+ # Fallback: generic JSON schema
614
+ return base_schema
615
+
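+ # Illustrative sketch: for provider="vllm" (or "generic"), the plain JSON Schema built
+ # from the column list is returned as-is. For two hypothetical columns:
+ #
+ #     build_documentation_json_schema(["customer_id", "total_spend"], provider="vllm")
+ #     # -> {"type": "object",
+ #     #     "properties": {"customer_id": {"type": "string"},
+ #     #                    "total_spend": {"type": "string"}},
+ #     #     "required": ["customer_id", "total_spend"],
+ #     #     "additionalProperties": False}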
616
+
617
+ def build_sql_documentation_chain(
618
+ llm: ChatOpenAI,
619
+ entity_columns: Sequence[str],
620
+ feature_columns: Sequence[str],
621
+ provider: str = "vllm",
622
+ json_constraint: bool = True,
623
+ ) -> Runnable:
624
+ """
625
+ Build a LangChain Runnable that generates business-focused documentation
626
+ for lists of entity and feature columns from a SQL query output, with optional provider-specific JSON
627
+ constraints (vLLM, OpenAI, Ollama, etc.).
628
+
629
+ The resulting chain expects three input variables:
630
+ - sql_query: str → the SQL query whose output is being documented
631
+ - columns_str: str → formatted list of entity and feature columns (e.g. "Entity columns:\n- col1\n\nFeature columns:\n- col2")
+ - language: str → the target output language for the generated descriptions
632
+
633
+ Parameters
634
+ ----------
635
+ llm : ChatOpenAI
636
+ The language model interface (may point to vLLM, OpenAI, Ollama, etc.).
637
+ entity_columns : Sequence[str]
638
+ List of entity/identifier columns that must appear as keys in the output JSON.
639
+ feature_columns : Sequence[str]
640
+ List of feature columns that must appear as keys in the output JSON.
641
+ provider : str, optional (default="vllm")
642
+ Indicates which structured-output mechanism to use.
643
+ Supported values:
644
+ - "vllm" → uses `guided_json` for strict JSON output
645
+ - "openai" / "azure" → uses OpenAI JSON Schema via `response_format`
646
+ - "ollama" → uses Ollama's `format=` schema
647
+ - "openai-compatible" → alias for vLLM-style guided decoding
648
+ - any other value → fall back to unconstrained text output
649
+ json_constraint : bool, optional (default=True)
650
+ If True:
651
+ - a JSON Schema is generated from the column lists
652
+ - provider-specific constrained decoding is applied
653
+ If False:
654
+ - the chain does not enforce JSON structure at the LLM level
655
+ - the model is only guided by the prompt (weaker guarantees)
656
+
657
+ Returns
658
+ -------
659
+ Runnable
660
+ A LangChain Runnable that executes:
661
+ prompt → LLM (optionally schema-guided) → JSON parser
662
+
663
+ When invoked with:
664
+ {
665
+ "sql_query": "...",
666
+ "columns_str": "Entity columns:\n- column1\n\nFeature columns:\n- column2\n..."
667
+ }
668
+
669
+ It returns:
670
+ dict[str, str]
671
+ A mapping of each requested column name to a short,
672
+ business-oriented description (≤ 5 sentences), plus a 'query_business_logic' key
673
+ containing a high-level description of the query's business logic (5-10 sentences), and an 'entity_description' key
674
+ with a holistic description of the entity (3-5 sentences).
675
+
676
+ Notes
677
+ -----
678
+ - The chain enforces valid JSON when possible:
679
+ * vLLM → `guided_json`
680
+ * OpenAI → `response_format={"type": "json_schema", ...}`
681
+ * Ollama → `format=<schema>`
682
+ - For unsupported providers, the model may emit imperfect JSON.
683
+ - Descriptions focus on business meaning, business logic,
684
+ and optionally technical details only when relevant.
685
+ """
686
+ all_columns = list(entity_columns) + list(feature_columns) + ["query_business_logic", "entity_description"]
687
+ logger_safe('info', f'build_sql_documentation_chain: Building chain for provider {provider}, json_constraint={json_constraint}, entity_columns={list(entity_columns)}, feature_columns={list(feature_columns)}')
688
+ prompt = ChatPromptTemplate.from_template(
689
+ """
690
+ You are a data documentation assistant.
691
+
692
+ Your target audience is business users.
693
+ Your explanations must focus primarily on the business meaning and business logic of each column,
694
+ and you may add technical details only when they meaningfully clarify the business context.
695
+
696
+ Given:
697
+ 1. A SQL query.
698
+ 2. Lists of entity and feature columns that must be documented.
699
+
700
+ Your job:
701
+ - For entity columns: Provide a brief 1-sentence description of how this column contributes to identifying the entity described holistically under 'entity_description'. Do not repeat the full entity description here.
702
+ - For feature columns: Write a clear and concise explanation of what the column represents from a business perspective, describing the business logic behind how the value is derived or used within the context of the SQL query.
703
+ - Add technical details only if relevant and only to help a business audience understand the concept.
704
+ - Each description must be at most 5 sentences.
705
+ - Do not include any columns that are not in the provided lists.
706
+ - If a column name is ambiguous, infer its meaning from the SQL query as best as possible and say so.
707
+ - If you cannot infer anything meaningful, state that clearly (still within 3 sentences).
708
+ - Additionally, provide a high-level description of the business logic of the SQL query itself under the key 'query_business_logic'. This should explain what the query does from a business perspective, including the main purpose, data sources, transformations, and business value. Keep it to 5-10 sentences.
709
+ - Additionally, provide a description of the entity as a whole under the key 'entity_description'. This should describe the business object that the entity columns collectively identify, noting that this is the object the features describe. Keep it to 3-5 sentences.
710
+ - Avoid using double quotes (") in the explanation text; use single quotes or rephrase to prevent JSON parsing errors.
711
+ - Answer in {language}
712
+ Output format (very important):
713
+ - Return ONLY a valid JSON object.
714
+ - Each top-level key must be exactly a column name, 'entity_description', or 'query_business_logic'.
715
+ - Each value must be a single string with the description.
716
+
717
+ Example of the required format:
718
+ {{
719
+ "customer_id": "This column serves as the primary key for identifying individual customers in the entity.",
720
+ "order_date": "The business date when the order was created. It represents the transaction date used for reporting and may reflect the source system's timestamp.",
721
+ "entity_description": "The customer entity represents individual buyers in the business system, identified by customer_id and described by features like order history and demographics. This entity is the core object that the feature columns characterize for analysis and decision-making.",
722
+ "query_business_logic": "This query joins customer and order data to provide a comprehensive view of customer orders. It filters orders from 2024 onwards to focus on recent activity. The result helps business users understand customer purchasing patterns and regional distribution."
723
+ }}
724
+
725
+ Now generate documentation.
726
+
727
+ SQL query:
728
+ ```sql
729
+ {sql_query}
730
+ ```
731
+ Columns to document (only document these):
732
+ {columns_str}
733
+ """
734
+ )
735
+ parser = JsonOutputParser()
736
+ if not json_constraint:
737
+ return prompt | llm | parser
738
+
739
+ schema = build_documentation_json_schema(all_columns, provider=provider)
740
+ logger_safe('debug', f'build_sql_documentation_chain: Using provider {provider} with json_constraint={json_constraint}')
741
+
742
+ # Use helper to build provider-specific LLM caller
743
+ call_llm = _build_provider_llm_caller(llm, provider, schema)
744
+ constrained_llm = RunnableLambda(call_llm)
745
+
746
+ # Final chain: prompt -> LLM (schema-guided) -> JSON parser
747
+ def _parse(ai_msg: AIMessage):
748
+ raw = ai_msg.content
749
+ return parser.parse(raw)
750
+
751
+ return prompt | constrained_llm | RunnableLambda(_parse)
752
+
753
+ def run_sql_documentation(
754
+ chain: Runnable,
755
+ sql_query: str,
756
+ entity_columns: Sequence[str],
757
+ feature_columns: Sequence[str],
758
+ language: str = "English",
759
+ ) -> Dict[str, str]:
760
+ """
761
+ Execute a previously constructed SQL-documentation chain and return
762
+ business-friendly documentation for the specified SQL output columns.
763
+
764
+ This function prepares the chain inputs (SQL query, formatted column list,
765
+ target language) and invokes the chain. The chain itself must have been
766
+ created using `build_sql_documentation_chain()`, which ensures the model
767
+ produces structured JSON suitable for parsing.
768
+
769
+ Parameters
770
+ ----------
771
+ chain : Runnable
772
+ A LangChain Runnable returned by `build_sql_documentation_chain()`.
773
+ This Runnable encapsulates:
774
+ - the prompt template
775
+ - a provider-specific LLM invocation (with or without JSON constraints)
776
+ - a JSON output parser
777
+
778
+ sql_query : str
779
+ The SQL query whose resulting columns should be documented. This query is
780
+ shown to the model so it can infer business logic, derivation rules, and
781
+ column meaning.
782
+
783
+ entity_columns : Sequence[str]
784
+ The list of entity/identifier column names that must appear as keys in the output JSON.
785
+ Only these columns will be documented. The order does not matter.
786
+
787
+ feature_columns : Sequence[str]
788
+ The list of feature column names that must appear as keys in the output JSON.
789
+ Only these columns will be documented. The order does not matter.
790
+
791
+ language : str, optional (default="English")
792
+ The target output language for the generated documentation.
793
+ This value is passed into the prompt’s `{language}` variable.
794
+ Examples: "English", "French", "German", "Spanish", "Japanese".
795
+
796
+ Returns
797
+ -------
798
+ dict[str, str]
799
+ A dictionary mapping each column name to a human-readable, business-oriented
800
+ description generated by the model, plus a 'query_business_logic' key
801
+ with the query's business logic description, and an 'entity_description' key
802
+ with the holistic entity description. Example:
803
+ {
804
+ "customer_id": "Unique customer identifier used for ...",
805
+ "order_date": "Business date when the order was created ...",
806
+ "entity_description": "The customer entity represents...",
807
+ "query_business_logic": "This query provides a view of ..."
808
+ }
809
+
810
+ Notes
811
+ -----
812
+ - The output format is determined by the chain's JSON parser. If the model
813
+ fails to produce valid JSON (e.g., due to unsupported constraints),
814
+ an `OutputParserException` may be raised.
815
+ - The resulting descriptions are typically ≤ 5 sentences per column, unless
816
+ modified in the chain's prompt.
817
+ """
818
+ logger_safe('info', f'run_sql_documentation: Starting documentation for {len(entity_columns)} entity columns and {len(feature_columns)} feature columns in {language}')
819
+ columns_str = "Entity columns:\n" + "\n".join(f"- {col}" for col in entity_columns) + "\n\nFeature columns:\n" + "\n".join(f"- {col}" for col in feature_columns)
820
+
821
+ try:
822
+ result = chain.invoke({
823
+ "sql_query": sql_query,
824
+ "columns_str": columns_str,
825
+ "language" : language
826
+ })
827
+ logger_safe('info', f'run_sql_documentation: Successfully generated documentation for columns: {list(result.keys())}')
828
+ return result
829
+ except Exception as e:
830
+ logger_safe('error', f'run_sql_documentation: Failed to generate documentation: {e}')
831
+ raise
832
+
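+ # Illustrative end-to-end sketch; the endpoint, query and column names are hypothetical.
+ #
+ #     llm = build_llm(llm_service="http://localhost:8000/v1", api_key="EMPTY",
+ #                     model_id="my-org/my-instruct-model")
+ #     chain = build_sql_documentation_chain(
+ #         llm, entity_columns=["customer_id"], feature_columns=["total_spend"],
+ #         provider="vllm", json_constraint=True)
+ #     docs = run_sql_documentation(
+ #         chain,
+ #         sql_query="SELECT customer_id, SUM(amount) AS total_spend FROM orders GROUP BY 1",
+ #         entity_columns=["customer_id"], feature_columns=["total_spend"],
+ #         language="English")
+ #     # docs maps each column, plus 'entity_description' and 'query_business_logic',
+ #     # to a description string.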
833
+
834
+ def document_sql_query_columns(
835
+ sql_query: str,
836
+ entity_columns: Sequence[str],
837
+ feature_columns: Sequence[str],
838
+ language: str = "English",
839
+ provider: Optional[str] = None,
840
+ json_constraint: bool = True,
841
+ ) -> Dict[str, Any]:
842
+ """
843
+ Convenience function to generate business-focused documentation for SQL query output columns
844
+ using the configured instruction model from tdfs4ds settings.
845
+
846
+ This function automatically builds the LLM client using the tdfs4ds configuration variables
847
+ (INSTRUCT_MODEL_URL, INSTRUCT_MODEL_API_KEY, INSTRUCT_MODEL_MODEL), constructs the documentation
848
+ chain, and executes it to produce column descriptions.
849
+
850
+ Parameters
851
+ ----------
852
+ sql_query : str
853
+ The SQL query whose resulting columns should be documented. This query is
854
+ shown to the model so it can infer business logic, derivation rules, and
855
+ column meaning.
856
+
857
+ entity_columns : Sequence[str]
858
+ The list of entity/identifier column names that must appear as keys in the output JSON.
859
+ Only these columns will be documented. The order does not matter.
860
+
861
+ feature_columns : Sequence[str]
862
+ The list of feature column names that must appear as keys in the output JSON.
863
+ Only these columns will be documented. The order does not matter.
864
+
865
+ language : str, optional (default="English")
866
+ The target output language for the generated documentation.
867
+ This value is passed into the prompt's {language} variable.
868
+ Examples: "English", "French", "German", "Spanish", "Japanese".
869
+
870
+ provider : str, optional (default=None)
871
+ Indicates which structured-output mechanism to use for the LLM.
872
+ If None, uses INSTRUCT_MODEL_PROVIDER from tdfs4ds config.
873
+ Supported values:
874
+ - "vllm" → uses `guided_json` for strict JSON output
875
+ - "openai" / "azure" → uses OpenAI JSON Schema via `response_format`
876
+ - "ollama" → uses Ollama's `format=` schema
877
+ - "openai-compatible" → alias for vLLM-style guided decoding
878
+ - any other value → fall back to unconstrained text output
879
+
880
+ json_constraint : bool, optional (default=True)
881
+ If True:
882
+ - a JSON Schema is generated from the column lists
883
+ - provider-specific constrained decoding is applied
884
+ If False:
885
+ - the chain does not enforce JSON structure at the LLM level
886
+ - the model is only guided by the prompt (weaker guarantees)
887
+
888
+ Returns
889
+ -------
890
+ dict
891
+ A dictionary with four keys:
892
+ - "query_business_logic": str containing the high-level business logic description of the query
893
+ - "entity_description": str containing the holistic description of the entity
894
+ - "entity_columns": dict[str, str] mapping each entity column name to its description
895
+ - "feature_columns": dict[str, str] mapping each feature column name to its description
896
+
897
+ Raises
898
+ ------
899
+ ValueError
900
+ If any of the required tdfs4ds configuration variables (INSTRUCT_MODEL_URL,
901
+ INSTRUCT_MODEL_API_KEY, INSTRUCT_MODEL_MODEL) are not set.
902
+
903
+ Notes
904
+ -----
905
+ - This function requires that the tdfs4ds instruction model configuration is properly set.
906
+ - The resulting descriptions are typically ≤ 5 sentences per column, focusing on
907
+ business meaning and logic.
908
+ - If the model fails to produce valid JSON, an exception will be raised.
909
+ """
910
+ # Import the configuration variables
911
+ from tdfs4ds import INSTRUCT_MODEL_URL, INSTRUCT_MODEL_API_KEY, INSTRUCT_MODEL_MODEL, INSTRUCT_MODEL_PROVIDER
912
+
913
+ # Validate configuration
914
+ if not INSTRUCT_MODEL_URL or not INSTRUCT_MODEL_API_KEY or not INSTRUCT_MODEL_MODEL or not INSTRUCT_MODEL_PROVIDER:
915
+ raise ValueError(
916
+ "tdfs4ds instruction model configuration is incomplete. Please ensure "
917
+ "INSTRUCT_MODEL_URL, INSTRUCT_MODEL_API_KEY, INSTRUCT_MODEL_MODEL, and INSTRUCT_MODEL_PROVIDER are set."
918
+ )
919
+
920
+ logger_safe('info', f'document_sql_query_columns: Starting documentation for {len(entity_columns)} entity columns and {len(feature_columns)} feature columns in {language}')
921
+
922
+ if provider is None:
923
+ provider = INSTRUCT_MODEL_PROVIDER
924
+
925
+ # Build the LLM client
926
+ llm = build_llm(
927
+ llm_service=INSTRUCT_MODEL_URL,
928
+ api_key=INSTRUCT_MODEL_API_KEY,
929
+ model_id=INSTRUCT_MODEL_MODEL
930
+ )
931
+
932
+ # Build the documentation chain
933
+ sql_doc_chain = build_sql_documentation_chain(llm, entity_columns, feature_columns, provider=provider, json_constraint=json_constraint)
934
+
935
+ # Run the documentation
936
+ result = run_sql_documentation(sql_doc_chain, sql_query, entity_columns, feature_columns, language=language)
937
+
938
+ # Separate entity columns, feature columns, entity description, and query logic
939
+ entity_docs = {k: v for k, v in result.items() if k in entity_columns}
940
+ feature_docs = {k: v for k, v in result.items() if k in feature_columns}
941
+ entity_desc = result.get("entity_description", "")
942
+ query_logic = result.get("query_business_logic", "")
943
+
944
+ logger_safe('info', f'document_sql_query_columns: Successfully completed documentation for {len(entity_docs)} entity columns, {len(feature_docs)} feature columns, entity description and query logic')
945
+ return {
946
+ "query_business_logic": query_logic,
947
+ "entity_description": entity_desc,
948
+ "entity_columns": entity_docs,
949
+ "feature_columns": feature_docs
950
+ }
951
+
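+ # Illustrative sketch of the convenience entry point (assumes the tdfs4ds
+ # INSTRUCT_MODEL_* configuration is set; the query and columns are hypothetical):
+ #
+ #     docs = document_sql_query_columns(
+ #         sql_query="SELECT customer_id, SUM(amount) AS total_spend FROM orders GROUP BY 1",
+ #         entity_columns=["customer_id"],
+ #         feature_columns=["total_spend"],
+ #         language="English")
+ #     print(docs["query_business_logic"])
+ #     print(docs["feature_columns"]["total_spend"])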
952
+
953
+ def build_explain_documentation_chain(
954
+ llm: ChatOpenAI,
955
+ provider: str = "vllm",
956
+ json_constraint: bool = True,
957
+ ) -> Runnable:
958
+ """
959
+ Build a LangChain Runnable that analyzes SQL EXPLAIN plans and generates
960
+ optimization scores, warnings, and recommendations.
961
+
962
+ The resulting chain expects two input variables:
963
+ - sql_query: str → the original SQL query
964
+ - explain_plan: str → the EXPLAIN output from the database
965
+
966
+ Parameters
967
+ ----------
968
+ llm : ChatOpenAI
969
+ The language model interface (may point to vLLM, OpenAI, Ollama, etc.).
970
+ provider : str, optional (default="vllm")
971
+ Indicates which structured-output mechanism to use.
972
+ Supported values:
973
+ - "vllm" → uses `guided_json` for strict JSON output
974
+ - "openai" / "azure" → uses OpenAI JSON Schema via `response_format`
975
+ - "ollama" → uses Ollama's `format=` schema
976
+ - "openai-compatible" → alias for vLLM-style guided decoding
977
+ - any other value → fall back to unconstrained text output
978
+ json_constraint : bool, optional (default=True)
979
+ If True: a JSON Schema is generated and provider-specific constrained decoding is applied.
980
+ If False: the chain does not enforce JSON structure at the LLM level.
981
+
982
+ Returns
983
+ -------
984
+ Runnable
985
+ A LangChain Runnable that executes:
986
+ prompt → LLM (optionally schema-guided) → JSON parser
987
+
988
+ When invoked with:
989
+ {
990
+ "sql_query": "SELECT ...",
991
+ "explain_plan": "..."
992
+ }
993
+
994
+ It returns:
995
+ dict with keys:
996
+ - "explanation": str describing the EXPLAIN plan in business terms
997
+ - "optimization_score": int from 1 (poorly optimized) to 5 (well optimized)
998
+ - "warnings": list[str] of potential issues or concerns
999
+ - "recommendations": list[str] of actionable optimization suggestions
1000
+ """
1001
+ logger_safe('info', f'build_explain_documentation_chain: Building chain for provider {provider}, json_constraint={json_constraint}')
1002
+
1003
+ # JSON schema for EXPLAIN analysis output
1004
+ explain_schema = {
1005
+ "type": "object",
1006
+ "properties": {
1007
+ "explanation": {"type": "string"},
1008
+ "optimization_score": {
1009
+ "type": "integer",
1010
+ "minimum": 1,
1011
+ "maximum": 5,
1012
+ "description": "Score from 1 (poorly optimized) to 5 (well optimized)"
1013
+ },
1014
+ "warnings": {
1015
+ "type": "array",
1016
+ "items": {"type": "string"},
1017
+ "description": "List of potential issues or concerns"
1018
+ },
1019
+ "recommendations": {
1020
+ "type": "array",
1021
+ "items": {"type": "string"},
1022
+ "description": "List of actionable optimization suggestions"
1023
+ }
1024
+ },
1025
+ "required": ["explanation", "optimization_score", "warnings", "recommendations"],
1026
+ "additionalProperties": False
1027
+ }
1028
+
1029
+ prompt = ChatPromptTemplate.from_template(
1030
+ """
1031
+ You are an expert SQL query optimization analyst.
1032
+
1033
+ Your task is to analyze a SQL EXPLAIN plan and provide optimization guidance.
1034
+
1035
+ Provide your analysis in the following JSON format with these exact keys:
1036
+ - explanation: A clear, plain-text explanation of what the EXPLAIN plan shows. Include analysis of execution strategy, estimated costs, and any visible inefficiencies.
1037
+ - optimization_score: An integer from 1 to 5 (1 = poorly optimized, 5 = well optimized)
1038
+ - warnings: An array of warning strings about potential issues
1039
+ - recommendations: An array of actionable recommendation strings for improvement
1040
+
1041
+ Analysis Guidelines:
1042
+ - Focus on execution strategy, index usage, and join efficiency
1043
+ - Be detailed but business-friendly, avoiding unnecessary technical jargon
1044
+ - Consider factors like: full table scans vs index usage, join strategies, data distribution
1045
+ - Avoid using double quotes (") in the explanation text; use single quotes or rephrase to prevent JSON parsing errors
1046
+
1047
+ Scoring Guidelines:
1048
+ - Score 1: Multiple full table scans, no indexes, inefficient joins
1049
+ - Score 2: Some index usage but still room for improvement, potentially expensive operations
1050
+ - Score 3: Reasonable query plan, acceptable performance, some optimization opportunities
1051
+ - Score 4: Good query plan with mostly optimized joins and indexes, minor improvements possible
1052
+ - Score 5: Excellent plan with efficient execution, proper use of indexes, optimal join strategies
1053
+
1054
+ Warnings should highlight specific concerns (e.g., 'Full table scan on large table ORDERS', 'Missing index on customer_id column').
1055
+ Recommendations should be specific and actionable (e.g., 'Add index on orders.customer_id', 'Consider using a different join strategy').
1056
+
1057
+ Output format (very important):
1058
+ - Return ONLY a valid JSON object.
1059
+ - Each top-level key must be exactly 'explanation', 'optimization_score', 'warnings', or 'recommendations'.
1060
+ - The 'explanation' value must be a single string.
1061
+ - The 'optimization_score' value must be an integer from 1 to 5.
1062
+ - The 'warnings' value must be an array of strings.
1063
+ - The 'recommendations' value must be an array of strings.
1064
+
1065
+ Example of the required format:
1066
+ {{
1067
+ "explanation": "The EXPLAIN plan shows a nested loop join between the customers and orders tables. The query performs a full table scan on the orders table, which has an estimated 1 million rows. The join condition uses the customer_id column, but there is no index on this column in the orders table.",
1068
+ "optimization_score": 2,
1069
+ "warnings": ["Full table scan on large orders table", "Missing index on orders.customer_id"],
1070
+ "recommendations": ["Add index on orders.customer_id", "Consider using a hash join instead of nested loop"]
1071
+ }}
1072
+
1073
+ SQL Query:
1074
+ ```sql
1075
+ {sql_query}
1076
+ ```
1077
+
1078
+ EXPLAIN Plan:
1079
+ ```
1080
+ {explain_plan}
1081
+ ```
1082
+
1083
+ Return ONLY valid JSON with the four keys above.
1084
+ """
1085
+ )
1086
+ parser = JsonOutputParser()
1087
+ if not json_constraint:
1088
+ return prompt | llm | parser
1089
+
1090
+ logger_safe('debug', f'build_explain_documentation_chain: Using provider {provider} with json_constraint={json_constraint}')
1091
+
1092
+ # Wrap schema for OpenAI providers
1093
+ if provider.lower() in ("openai", "azure", "azure-openai"):
1094
+ wrapped_schema = {
1095
+ "type": "json_schema",
1096
+ "json_schema": {
1097
+ "name": "ExplainAnalysis",
1098
+ "schema": explain_schema,
1099
+ "strict": True,
1100
+ }
1101
+ }
1102
+ else:
1103
+ wrapped_schema = explain_schema
1104
+
1105
+ # Use helper to build provider-specific LLM caller
1106
+ call_llm = _build_provider_llm_caller(llm, provider, wrapped_schema)
1107
+ constrained_llm = RunnableLambda(call_llm)
1108
+
1109
+ # Final chain: prompt -> LLM (schema-guided) -> JSON parser
1110
+ def _parse(ai_msg: AIMessage):
1111
+ raw = ai_msg.content
1112
+ return parser.parse(raw)
1113
+
1114
+ return prompt | constrained_llm | RunnableLambda(_parse)
1115
+
1116
+
1117
+ def run_explain_documentation(
1118
+ chain: Runnable,
1119
+ sql_query: str,
1120
+ explain_plan: str,
1121
+ ) -> Dict[str, Any]:
1122
+ """
1123
+ Execute an EXPLAIN-documentation chain and return optimization analysis.
1124
+
1125
+ Parameters
1126
+ ----------
1127
+ chain : Runnable
1128
+ A LangChain Runnable returned by `build_explain_documentation_chain()`.
1129
+ sql_query : str
1130
+ The original SQL query.
1131
+ explain_plan : str
1132
+ The EXPLAIN output from the database.
1133
+
1134
+ Returns
1135
+ -------
1136
+ dict
1137
+ A dictionary with keys: "explanation", "optimization_score", "warnings", "recommendations"
1138
+ """
1139
+ logger_safe('info', 'run_explain_documentation: Starting EXPLAIN analysis')
1140
+
1141
+ try:
1142
+ result = chain.invoke({
1143
+ "sql_query": sql_query,
1144
+ "explain_plan": explain_plan
1145
+ })
1146
+ logger_safe('info', f'run_explain_documentation: Successfully analyzed EXPLAIN plan. Score: {result.get("optimization_score", "N/A")}/5')
1147
+ return result
1148
+ except Exception as e:
1149
+ logger_safe('error', f'run_explain_documentation: Failed to analyze EXPLAIN plan: {e}')
1150
+ raise
1151
+
1152
+
1153
+ def document_sql_query_explain(
1154
+ sql_query: str,
1155
+ provider: Optional[str] = None,
1156
+ json_constraint: bool = True,
1157
+ ) -> Dict[str, Any]:
1158
+ """
1159
+ Analyze a SQL query's EXPLAIN plan and return optimization recommendations.
1160
+
1161
+ This function automatically builds the LLM client using tdfs4ds configuration,
1162
+ constructs the EXPLAIN analysis chain, and executes it.
1163
+
1164
+ Parameters
1165
+ ----------
1166
+ sql_query : str
1167
+ The original SQL query. Its EXPLAIN plan is retrieved internally (via get_the_explain)
1168
+ before analysis, so no separate explain_plan argument is required.
1169
+
1170
+ provider : str, optional (default=None)
1171
+ Indicates which structured-output mechanism to use for the LLM.
1172
+ If None, uses INSTRUCT_MODEL_PROVIDER from tdfs4ds config.
1173
+ Supported values: "vllm", "openai", "azure", "ollama", etc.
1174
+ json_constraint : bool, optional (default=True)
1175
+ If True: use provider-specific constrained decoding.
1176
+ If False: rely on prompt guidance only.
1177
+
1178
+ Returns
1179
+ -------
1180
+ dict
1181
+ A dictionary with keys:
1182
+ - "explanation": str describing the EXPLAIN plan
1183
+ - "optimization_score": int from 1 to 5
1184
+ - "warnings": list[str] of potential issues
1185
+ - "recommendations": list[str] of actionable suggestions
1186
+
1187
+ Raises
1188
+ ------
1189
+ ValueError
1190
+ If tdfs4ds instruction model configuration is incomplete.
1191
+ """
1192
+ from tdfs4ds import INSTRUCT_MODEL_URL, INSTRUCT_MODEL_API_KEY, INSTRUCT_MODEL_MODEL, INSTRUCT_MODEL_PROVIDER
1193
+
1194
+ if not INSTRUCT_MODEL_URL or not INSTRUCT_MODEL_API_KEY or not INSTRUCT_MODEL_MODEL or not INSTRUCT_MODEL_PROVIDER:
1195
+ raise ValueError(
1196
+ "tdfs4ds instruction model configuration is incomplete. Please ensure "
1197
+ "INSTRUCT_MODEL_URL, INSTRUCT_MODEL_API_KEY, INSTRUCT_MODEL_MODEL, and INSTRUCT_MODEL_PROVIDER are set."
1198
+ )
1199
+
1200
+ logger_safe('info', 'document_sql_query_explain: Starting EXPLAIN analysis')
1201
+
1202
+ if provider is None:
1203
+ provider = INSTRUCT_MODEL_PROVIDER
1204
+
1205
+ # Build the LLM client
1206
+ llm = build_llm(
1207
+ llm_service=INSTRUCT_MODEL_URL,
1208
+ api_key=INSTRUCT_MODEL_API_KEY,
1209
+ model_id=INSTRUCT_MODEL_MODEL
1210
+ )
1211
+
1212
+ # get the explain plan:
1213
+ explain_plan = get_the_explain(sql_query)
1214
+ # Build and run the EXPLAIN analysis chain
1215
+ explain_chain = build_explain_documentation_chain(llm, provider=provider, json_constraint=json_constraint)
1216
+ result = run_explain_documentation(explain_chain, sql_query, explain_plan)
1217
+
1218
+ logger_safe('info', f'document_sql_query_explain: Successfully completed EXPLAIN analysis. Score: {result.get("optimization_score", "N/A")}/5')
1219
+ return result
1220
+
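+ # Illustrative sketch (assumes the tdfs4ds INSTRUCT_MODEL_* configuration and an active
+ # teradataml connection; the query is hypothetical):
+ #
+ #     analysis = document_sql_query_explain(
+ #         "SELECT customer_id, SUM(amount) AS total_spend FROM orders GROUP BY 1")
+ #     print(analysis["optimization_score"], analysis["warnings"])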
1221
+ def documentation_tables_creation():
1222
+ """
1223
+ Create the necessary documentation tables in the database if they do not already exist.
1224
+ Uses the active teradataml (tdml) connection; this function takes no parameters."""
1225
+ query_process_table = f"""
1226
+ CREATE MULTISET TABLE {tdfs4ds.SCHEMA}.{tdfs4ds.DOCUMENTATION_PROCESS_BUSINESS_LOGIC} ,FALLBACK ,
1227
+ NO BEFORE JOURNAL,
1228
+ NO AFTER JOURNAL,
1229
+ CHECKSUM = DEFAULT,
1230
+ DEFAULT MERGEBLOCKRATIO,
1231
+ MAP = TD_MAP1
1232
+ (
1233
+ PROCESS_ID VARCHAR(36) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
1234
+ BUSINESS_LOGIC_DESCRIPTION VARCHAR(32000) CHARACTER SET LATIN NOT CASESPECIFIC,
1235
+ ENTITY_DESCRIPTION VARCHAR(32000) CHARACTER SET LATIN NOT CASESPECIFIC,
1236
+ ENTITY_COLUMNS_JSON VARCHAR(32000) CHARACTER SET LATIN NOT CASESPECIFIC,
1237
+ ValidStart TIMESTAMP(0) WITH TIME ZONE NOT NULL,
1238
+ ValidEnd TIMESTAMP(0) WITH TIME ZONE NOT NULL,
1239
+ PERIOD FOR ValidPeriod (ValidStart, ValidEnd) AS VALIDTIME)
1240
+ PRIMARY INDEX ( PROCESS_ID )
1241
+ """
1242
+
1243
+ query_process_features_table = f"""
1244
+ CREATE MULTISET TABLE {tdfs4ds.SCHEMA}.{tdfs4ds.DOCUMENTATION_PROCESS_FEATURES} ,FALLBACK ,
1245
+ NO BEFORE JOURNAL,
1246
+ NO AFTER JOURNAL,
1247
+ CHECKSUM = DEFAULT,
1248
+ DEFAULT MERGEBLOCKRATIO,
1249
+ MAP = TD_MAP1
1250
+ (
1251
+ PROCESS_ID VARCHAR(36) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
1252
+ FEATURE_ID BIGINT NOT NULL,
1253
+ FEATURE_NAME VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
1254
+ FEATURE_DESCRIPTION VARCHAR(32000) CHARACTER SET LATIN NOT CASESPECIFIC,
1255
+ ValidStart TIMESTAMP(0) WITH TIME ZONE NOT NULL,
1256
+ ValidEnd TIMESTAMP(0) WITH TIME ZONE NOT NULL,
1257
+ PERIOD FOR ValidPeriod (ValidStart, ValidEnd) AS VALIDTIME)
1258
+ PRIMARY INDEX ( PROCESS_ID, FEATURE_ID )
1259
+ """
1260
+
1261
+ query_process_explain_table = f"""
1262
+ CREATE MULTISET TABLE {tdfs4ds.SCHEMA}.{tdfs4ds.DOCUMENTATION_PROCESS_EXPLAIN} ,FALLBACK ,
1263
+ NO BEFORE JOURNAL,
1264
+ NO AFTER JOURNAL,
1265
+ CHECKSUM = DEFAULT,
1266
+ DEFAULT MERGEBLOCKRATIO,
1267
+ MAP = TD_MAP1
1268
+ (
1269
+ PROCESS_ID VARCHAR(36) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
1270
+ EXPLAIN_ANALYSIS VARCHAR(32000) CHARACTER SET LATIN NOT CASESPECIFIC,
1271
+ OPTIMIZATION_SCORE INT,
1272
+ WARNINGS VARCHAR(32000) CHARACTER SET LATIN NOT CASESPECIFIC,
1273
+ RECOMMENDATIONS VARCHAR(32000) CHARACTER SET LATIN NOT CASESPECIFIC,
1274
+ ValidStart TIMESTAMP(0) WITH TIME ZONE NOT NULL,
1275
+ ValidEnd TIMESTAMP(0) WITH TIME ZONE NOT NULL,
1276
+ PERIOD FOR ValidPeriod (ValidStart, ValidEnd) AS VALIDTIME)
1277
+ PRIMARY INDEX ( PROCESS_ID )
1278
+ """
1279
+
1280
+ try:
1281
+ tdml.execute_sql(query_process_table)
1282
+ logger_safe('info', f'documentation_tables_creation: Created table {tdfs4ds.DOCUMENTATION_PROCESS_BUSINESS_LOGIC}')
1283
+ except Exception as e:
1284
+ logger_safe('error', f'documentation_tables_creation: Failed to create table {tdfs4ds.DOCUMENTATION_PROCESS_BUSINESS_LOGIC}: {e}')
1285
+ if 'already exists' in str(e).lower():
1286
+ logger_safe('info', f'documentation_tables_creation: Table {tdfs4ds.DOCUMENTATION_PROCESS_BUSINESS_LOGIC} already exists. Skipping creation.')
1287
+ pass
1288
+ else:
1289
+ raise
1290
+ try:
1291
+ tdml.execute_sql(query_process_features_table)
1292
+ logger_safe('info', f'documentation_tables_creation: Created table {tdfs4ds.DOCUMENTATION_PROCESS_FEATURES}')
1293
+ except Exception as e:
1294
+ logger_safe('error', f'documentation_tables_creation: Failed to create table {tdfs4ds.DOCUMENTATION_PROCESS_FEATURES}: {e}')
1295
+ if 'already exists' in str(e).lower():
1296
+ logger_safe('info', f'documentation_tables_creation: Table {tdfs4ds.DOCUMENTATION_PROCESS_FEATURES} already exists. Skipping creation.')
1297
+ pass
1298
+ else:
1299
+ raise
1300
+
1301
+ try:
1302
+ tdml.execute_sql(query_process_explain_table)
1303
+ logger_safe('info', f'documentation_tables_creation: Created table {tdfs4ds.DOCUMENTATION_PROCESS_EXPLAIN}')
1304
+ except Exception as e:
1305
+ logger_safe('error', f'documentation_tables_creation: Failed to create table {tdfs4ds.DOCUMENTATION_PROCESS_EXPLAIN}: {e}')
1306
+ if 'already exists' in str(e).lower():
1307
+ logger_safe('info', f'documentation_tables_creation: Table {tdfs4ds.DOCUMENTATION_PROCESS_EXPLAIN} already exists. Skipping creation.')
1308
+ pass
1309
+ else:
1310
+ raise
1311
+
1312
+ logger_safe('info', 'documentation_tables_creation: Documentation tables creation process completed.')
1313
+ return
1314
+
1315
+ def document_process(process_id: str, language: str = "English", json_constraint: bool = True, show_sql_query: bool = False, show_explain_plan: bool = False, display: bool = True, upload: bool = True) -> Optional[Dict[str, Any]]:
1316
+ """
1317
+ Generate and store documentation for a data process identified by process_id.
1318
+ This function retrieves the SQL query and output columns for the process,
1319
+ generates business-focused documentation using an LLM, and stores the results
1320
+ in the appropriate documentation tables.
1321
+
1322
+ Parameters
1323
+ ----------
1324
+ process_id : str
1325
+ The unique identifier of the data process to document.
1326
+
1327
+ language : str, optional (default="English")
1328
+ The target output language for the generated documentation. This value is
1329
+ passed into the prompt’s `{language}` variable. Examples: "English", "French", "German", "Spanish", "Japanese".
1339
+ json_constraint : bool, optional (default=True)
1340
+ If True:
1341
+ - a JSON Schema is generated from the column list
1342
+ - provider-specific constrained decoding is applied
1343
+ If False:
1344
+ - the chain does not enforce JSON structure at the LLM level
1345
+ - the model is only guided by the prompt (weaker guarantees)
1346
+ show_sql_query : bool, optional (default=False)
1347
+ If True, display the original SQL query at the end of the documentation report.
1348
+ show_explain_plan : bool, optional (default=False)
1349
+ If True, display the raw EXPLAIN plan output at the end of the documentation report.
1350
+ display : bool, optional (default=True)
1351
+ If True, print the generated documentation to the console.
1352
+ upload : bool, optional (default=True)
1353
+ If True, upload the generated documentation to the documentation tables.
1354
+
1355
+ Returns
1356
+ -------
1357
+ dict or None
1358
+ A dictionary containing the generated documentation and analysis, or None if an error occurred.
1359
+ The dictionary includes keys:
1360
+ - PROCESS_ID
1361
+ - DOCUMENTED_SQL
1362
+ - ENTITY_DESCRIPTION
1363
+ - DOCUMENTED_ENTITY_COLUMNS
1364
+ - DOCUMENTED_FEATURE_COLUMNS
1365
+ - EXPLAIN_ANALYSIS
1366
+ - OPTIMIZATION_SCORE
1367
+ - EXPLAIN_WARNINGS
1368
+ - EXPLAIN_RECOMMENDATIONS
1369
+ - RAW_EXPLAIN_PLAN (if show_explain_plan is True)
1370
+ Notes
1371
+ -----
1372
+ - This function requires that the tdfs4ds instruction model configuration is properly set.
1373
+ - If the model fails to produce valid JSON, an exception will be raised.
1374
+ - The resulting descriptions are typically ≤ 5 sentences per column, focusing on
1375
+ business meaning and logic.
1376
+ """
1377
+ logger_safe('info', f'document_process: Starting documentation for process_id {process_id} in {language}')
1378
+
1379
+ # Retrieve process SQL and columns
1380
+ try:
1381
+ process_info = tdfs4ds.process_store.process_store_catalog_management.get_process_info(process_id)
1382
+ except Exception as e:
1383
+ logger_safe('error', f"document_process: Error retrieving process info for process_id {process_id}: {e}")
1384
+ return
1385
+
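+ # Note: as written, the language and json_constraint arguments are not forwarded
+ # to this call; document_sql_query_columns is invoked with its own defaults.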
1386
+ documentation = document_sql_query_columns(
1387
+ sql_query = process_info['PROCESS_SQL'],
1388
+ entity_columns = process_info['ENTITY_COLUMNS'],
1389
+ feature_columns = process_info['FEATURE_COLUMNS']
1390
+ )
1391
+
1392
+ process_info['DOCUMENTED_SQL'] = documentation['query_business_logic']
1393
+ process_info['ENTITY_DESCRIPTION'] = documentation['entity_description']
1394
+ process_info['DOCUMENTED_ENTITY_COLUMNS'] = documentation['entity_columns']
1395
+ process_info['DOCUMENTED_FEATURE_COLUMNS'] = documentation['feature_columns']
1396
+
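+ # The EXPLAIN-based analysis below is always generated; show_explain_plan only
+ # controls whether the raw EXPLAIN plan is included in the displayed report.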
1397
+ if True:
1398
+ explain_documentation = document_sql_query_explain(
1399
+ sql_query = process_info['PROCESS_SQL']
1400
+ )
1401
+
1402
+ process_info['EXPLAIN_ANALYSIS'] = explain_documentation['explanation']
1403
+ process_info['OPTIMIZATION_SCORE'] = explain_documentation['optimization_score']
1404
+ process_info['EXPLAIN_WARNINGS'] = explain_documentation['warnings']
1405
+ process_info['EXPLAIN_RECOMMENDATIONS'] = explain_documentation['recommendations']
1406
+
1407
+ # Store the raw EXPLAIN plan if needed for display
1408
+ if show_explain_plan:
1409
+ process_info['RAW_EXPLAIN_PLAN'] = get_the_explain(process_info['PROCESS_SQL'])
1410
+
1411
+ # Upload the generated documentation to the documentation tables:
1412
+ if upload:
1413
+ upload_documentation(process_info)
1414
+ logger_safe('info', f'document_process: Uploaded documentation for process_id {process_id} to documentation tables.')
1415
+ upload_documentation_explain(process_info)
1416
+ logger_safe('info', f'document_process: Uploaded EXPLAIN analysis for process_id {process_id} to documentation tables.')
1417
+
1418
+ # pretty print documentation for info:
1419
+ logger_safe('info', f"document_process: Documentation for process_id {process_id}:")
1420
+
1421
+ if display:
1422
+ _print_documentation(
1423
+ documented_sql = process_info.get('DOCUMENTED_SQL', None),
1424
+ entity_description = process_info.get('ENTITY_DESCRIPTION', None),
1425
+ documented_entity_columns = process_info.get('DOCUMENTED_ENTITY_COLUMNS', None),
1426
+ documented_feature_columns = process_info.get('DOCUMENTED_FEATURE_COLUMNS', None),
1427
+ process_id = process_info.get('PROCESS_ID', process_id),
1428
+ view_name = process_info.get('VIEW_NAME', None),
1429
+ explain_analysis = process_info.get('EXPLAIN_ANALYSIS', None),
1430
+ optimization_score = process_info.get('OPTIMIZATION_SCORE', None),
1431
+ explain_warnings = process_info.get('EXPLAIN_WARNINGS', None),
1432
+ explain_recommendations = process_info.get('EXPLAIN_RECOMMENDATIONS', None),
1433
+ sql_query = process_info.get('PROCESS_SQL', None) if show_sql_query else None,
1434
+ explain_plan = process_info.get('RAW_EXPLAIN_PLAN', None) if show_explain_plan else None,
1435
+ )
1436
+
1437
+ return process_info
1438
+
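+ # Example (hedged sketch): document an existing process, assuming the tdfs4ds
+ # instruction model is configured and the process id exists in the process catalog.
+ #
+ #   info = document_process(
+ #       process_id="a1b2c3",          # hypothetical process id
+ #       language="English",
+ #       show_sql_query=True,
+ #       upload=False,                 # dry run: skip writing to the documentation tables
+ #   )
+ #   if info is not None:
+ #       print(info["DOCUMENTED_SQL"])
+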
1439
+ def get_the_explain(sql_query: str) -> str:
1440
+ """
1441
+ Get the EXPLAIN plan for a given SQL query using the tdfs4ds TDML connection.
1442
+
1443
+ Parameters
1444
+ ----------
1445
+ sql_query : str
1446
+ The SQL query to explain.
1447
+
1448
+ Returns
1449
+ -------
1450
+ str
1451
+ The EXPLAIN plan as a formatted string.
1452
+ """
1453
+ def _extract_inner_query_from_view(query: str) -> str:
1454
+ """
1455
+ If the provided SQL is a CREATE/REPLACE VIEW (or REPLACE VIEW), extract and
1456
+ return the inner SELECT/definition. Otherwise return the original query.
1457
+
1458
+ This helps when running EXPLAIN: we want to analyze the query inside the
1459
+ view definition rather than the DDL wrapper.
1460
+ """
1461
+ if not isinstance(query, str):
1462
+ return query
1463
+ pattern = r'^\s*(?:CREATE\s+(?:OR\s+REPLACE\s+)?|REPLACE\s+)?VIEW\b.*?\bAS\b\s*(?P<body>.*)$'
1464
+ m = re.search(pattern, query, flags=re.IGNORECASE | re.DOTALL)
1465
+ if not m:
1466
+ return query
1467
+ body = m.group('body').strip()
1468
+ # Strip outer parentheses if the definition is wrapped
1469
+ if body.startswith('(') and body.endswith(')'):
1470
+ body = body[1:-1].strip()
1471
+ # Remove trailing semicolon
1472
+ if body.endswith(';'):
1473
+ body = body[:-1].strip()
1474
+ # Remove trailing LOCK ROW FOR ACCESS (or similar) clauses that may appear
1475
+ # in view definitions (e.g., "LOCK ROW FOR ACCESS") so EXPLAIN focuses
1476
+ # on the inner SELECT statement.
1477
+ body = re.sub(r"\bLOCK\s+ROW\s+FOR\s+ACCESS\b\s*;?\s*$", "", body, flags=re.IGNORECASE)
1478
+ logger_safe('debug', 'get_the_explain: Extracted inner query from CREATE/REPLACE VIEW for EXPLAIN.')
1479
+ return body
1480
+
1481
+ inner_sql = _extract_inner_query_from_view(sql_query)
1482
+ try:
1483
+ explain_result = tdml.execute_sql(f"EXPLAIN {inner_sql}").fetchall()
1484
+ explain_lines = [row[0] for row in explain_result]
1485
+ explain_text = "\n".join(explain_lines)
1486
+ logger_safe('info', 'get_the_explain: Successfully retrieved EXPLAIN plan.')
1487
+ return explain_text
1488
+ except Exception as e:
1489
+ logger_safe('error', f'get_the_explain: Failed to retrieve EXPLAIN plan: {e}')
1490
+ raise
1491
+
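+ # Example (hedged sketch): when given a view DDL, get_the_explain analyses the
+ # inner SELECT rather than the REPLACE VIEW wrapper (names are hypothetical).
+ #
+ #   ddl = (
+ #       "REPLACE VIEW mydb.v_sales AS "
+ #       "SELECT customer_id, SUM(amount) AS total FROM mydb.sales GROUP BY customer_id;"
+ #   )
+ #   plan_text = get_the_explain(ddl)  # effectively runs: EXPLAIN SELECT customer_id, ...
+ #   print(plan_text)
+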
1492
+ def upload_documentation(process_info: Dict[str, Any]) -> None:
1493
+ """
1494
+ Upload the generated documentation for a data process into the documentation tables.
1495
+
1496
+ Parameters
1497
+ ----------
1498
+ process_info : dict
1499
+ A dictionary containing the process documentation information.
1500
+ Expected keys:
1501
+ - PROCESS_ID: str
1502
+ - DOCUMENTED_SQL: str
1503
+ - ENTITY_DESCRIPTION: str
1504
+ - DOCUMENTED_ENTITY_COLUMNS: dict[str, str]
1505
+ - DOCUMENTED_FEATURE_COLUMNS: dict[str, str]
+ - DATA_DOMAIN: str (used to resolve FEATURE_IDs against the feature catalog)
1506
+ """
1507
+
1508
+ process_id = process_info['PROCESS_ID']
1509
+ documented_sql = process_info['DOCUMENTED_SQL']
1510
+ entity_description = process_info['ENTITY_DESCRIPTION']
1511
+ entity_columns_json = json.dumps(process_info['DOCUMENTED_ENTITY_COLUMNS'])
1512
+ feature_columns = process_info['DOCUMENTED_FEATURE_COLUMNS']
1513
+
1514
+ # build a pandas dataframe containing the data to be uploaded into DOCUMENTATION_PROCESS_BUSINESS_LOGIC
1515
+ # that contains PROCESS_ID, BUSINESS_LOGIC_DESCRIPTION, ENTITY_DESCRIPTION, ENTITY_COLUMNS_JSON
1516
+ df_business_logic = pd.DataFrame([{
1517
+ 'PROCESS_ID': process_id,
1518
+ 'BUSINESS_LOGIC_DESCRIPTION': documented_sql,
1519
+ 'ENTITY_DESCRIPTION': entity_description,
1520
+ 'ENTITY_COLUMNS_JSON': entity_columns_json
1521
+ }])
1522
+
1523
+ # build a pandas dataframe containing the data to be uploaded into DOCUMENTATION_PROCESS_FEATURES
1524
+ # that contains PROCESS_ID, FEATURE_ID, FEATURE_DESCRIPTION
1525
+ # at this stage, FEATURE_ID is not known, so we will use the FEATURE_NAME as a placeholder
1526
+ # and resolve the actual FEATURE_ID at merge time via a join with the feature catalog
1527
+ # here we need to explode the feature_columns dict into multiple rows
1528
+ feature_rows = []
1529
+ for feature_name, feature_description in feature_columns.items():
1530
+ feature_rows.append({
1531
+ 'PROCESS_ID': process_id,
1532
+ 'FEATURE_NAME': feature_name, # placeholder for FEATURE_ID
1533
+ 'FEATURE_DESCRIPTION': feature_description
1534
+ })
1535
+ df_features = pd.DataFrame(feature_rows)
1536
+
1537
+ # Determine end period based on tdfs4ds configuration
1538
+ if tdfs4ds.END_PERIOD == 'UNTIL_CHANGED':
1539
+ end_period_ = '9999-01-01 00:00:00'
1540
+ else:
1541
+ end_period_ = tdfs4ds.END_PERIOD
1542
+
1543
+ # upload the df_business_logic dataframe into a staging volatile table
1544
+ logger_safe('info', f'upload_documentation: Uploading documentation for process_id {process_id} into staging tables.')
1545
+ tdml.copy_to_sql(
1546
+ df_business_logic,
1547
+ table_name = "DOCUMENTATION_PROCESS_BUSINESS_LOGIC_STAGING",
1548
+ if_exists = 'replace',
1549
+ temporary = True
1550
+ )
1551
+ logger_safe('info', f'upload_documentation: Uploaded business logic documentation for process_id {process_id} into staging table.')
1552
+
1553
+ # upload the df_features dataframe into a staging volatile table
1554
+ logger_safe('info', f'upload_documentation: Uploading feature documentation for process_id {process_id} into staging tables.')
1555
+ tdml.copy_to_sql(
1556
+ df_features,
1557
+ table_name = "DOCUMENTATION_PROCESS_FEATURES_STAGING",
1558
+ if_exists = 'replace',
1559
+ temporary = True
1560
+ )
1561
+ logger_safe('info', f'upload_documentation: Uploaded feature documentation for process_id {process_id} into staging table.')
1562
+
1563
+ # merge into DOCUMENTATION_PROCESS_BUSINESS_LOGIC from staging table
1564
+ query_insert_business_logic = f"""
1565
+ CURRENT VALIDTIME
1566
+ MERGE INTO {tdfs4ds.SCHEMA}.{tdfs4ds.DOCUMENTATION_PROCESS_BUSINESS_LOGIC} EXISTING
1567
+ USING (
1568
+ SELECT
1569
+ PROCESS_ID,
1570
+ BUSINESS_LOGIC_DESCRIPTION,
1571
+ ENTITY_DESCRIPTION,
1572
+ ENTITY_COLUMNS_JSON
1573
+ FROM {_get_database_username()}.DOCUMENTATION_PROCESS_BUSINESS_LOGIC_STAGING
1574
+ ) UPDATED
1575
+ ON EXISTING.PROCESS_ID = UPDATED.PROCESS_ID
1576
+ WHEN MATCHED THEN
1577
+ UPDATE
1578
+ SET
1579
+ BUSINESS_LOGIC_DESCRIPTION = UPDATED.BUSINESS_LOGIC_DESCRIPTION,
1580
+ ENTITY_DESCRIPTION = UPDATED.ENTITY_DESCRIPTION,
1581
+ ENTITY_COLUMNS_JSON = UPDATED.ENTITY_COLUMNS_JSON
1582
+ WHEN NOT MATCHED THEN
1583
+ INSERT (
1584
+ UPDATED.PROCESS_ID,
1585
+ UPDATED.BUSINESS_LOGIC_DESCRIPTION,
1586
+ UPDATED.ENTITY_DESCRIPTION,
1587
+ UPDATED.ENTITY_COLUMNS_JSON
1588
+ )
1589
+ """
1590
+
1591
+ # merge into DOCUMENTATION_PROCESS_FEATURES from staging table
1592
+ query_insert_features = f"""
1593
+ CURRENT VALIDTIME
1594
+ MERGE INTO {tdfs4ds.SCHEMA}.{tdfs4ds.DOCUMENTATION_PROCESS_FEATURES} EXISTING
1595
+ USING (
1596
+ SELECT
1597
+ A.PROCESS_ID,
1598
+ FC.FEATURE_ID,
1599
+ A.FEATURE_NAME,
1600
+ A.FEATURE_DESCRIPTION
1601
+ FROM {_get_database_username()}.DOCUMENTATION_PROCESS_FEATURES_STAGING A
1602
+ INNER JOIN {tdfs4ds.SCHEMA}.{tdfs4ds.FEATURE_CATALOG_NAME} FC
1603
+ ON UPPER(FC.FEATURE_NAME) = UPPER(A.FEATURE_NAME)
1604
+ AND UPPER(FC.DATA_DOMAIN) = '{process_info['DATA_DOMAIN'].upper()}'
1605
+ ) UPDATED
1606
+ ON EXISTING.PROCESS_ID = UPDATED.PROCESS_ID
1607
+ AND EXISTING.FEATURE_ID = UPDATED.FEATURE_ID
1608
+ WHEN MATCHED THEN
1609
+ UPDATE
1610
+ SET
1611
+ FEATURE_DESCRIPTION = UPDATED.FEATURE_DESCRIPTION,
1612
+ FEATURE_NAME = UPDATED.FEATURE_NAME
1613
+ WHEN NOT MATCHED THEN
1614
+ INSERT (
1615
+ UPDATED.PROCESS_ID,
1616
+ UPDATED.FEATURE_ID,
1617
+ UPDATED.FEATURE_NAME,
1618
+ UPDATED.FEATURE_DESCRIPTION
1619
+ )
1620
+ """
1621
+
1622
+ # Remove features that are no longer present in the documentation
1623
+ query_delete_missing_features = f"""
1624
+ CURRENT VALIDTIME
1625
+ DELETE FROM {tdfs4ds.SCHEMA}.{tdfs4ds.DOCUMENTATION_PROCESS_FEATURES}
1626
+ WHERE PROCESS_ID = '{process_id}'
1627
+ AND FEATURE_ID NOT IN (
1628
+ SELECT FC.FEATURE_ID
1629
+ FROM {_get_database_username()}.DOCUMENTATION_PROCESS_FEATURES_STAGING A
1630
+ INNER JOIN {tdfs4ds.SCHEMA}.{tdfs4ds.FEATURE_CATALOG_NAME} FC
1631
+ ON UPPER(FC.FEATURE_NAME) = UPPER(A.FEATURE_NAME)
1632
+ AND UPPER(FC.DATA_DOMAIN) = '{process_info['DATA_DOMAIN'].upper()}'
1633
+ )
1634
+ """
1635
+
1636
+ # Execute the merges
1637
+ try:
1638
+ tdml.execute_sql(query_insert_business_logic)
1639
+ logger_safe('info', f'upload_documentation: Merged business logic documentation for process_id {process_id} into main table.')
1640
+ except Exception as e:
1641
+ logger_safe('error', f'upload_documentation: Failed to merge business logic documentation for process_id {process_id}: {e}')
1642
+ print(query_insert_business_logic)
1643
+ raise
1644
+ try:
1645
+ tdml.execute_sql(query_insert_features)
1646
+ logger_safe('info', f'upload_documentation: Merged feature documentation for process_id {process_id} into main table.')
1647
+ except Exception as e:
1648
+ logger_safe('error', f'upload_documentation: Failed to merge feature documentation for process_id {process_id}: {e}')
1649
+ print(query_insert_features)
1650
+ raise
1651
+ try:
1652
+ tdml.execute_sql(query_delete_missing_features)
1653
+ logger_safe('info', f'upload_documentation: Removed missing features for process_id {process_id} from main table.')
1654
+ except Exception as e:
1655
+ logger_safe('error', f'upload_documentation: Failed to remove missing features for process_id {process_id}: {e}')
1656
+ print(query_delete_missing_features)
1657
+ raise
1658
+
1659
+ # remove staging tables
1660
+ tdml.execute_sql(f"DROP TABLE {_get_database_username()}.DOCUMENTATION_PROCESS_BUSINESS_LOGIC_STAGING")
1661
+ tdml.execute_sql(f"DROP TABLE {_get_database_username()}.DOCUMENTATION_PROCESS_FEATURES_STAGING")
1662
+ logger_safe('info', f'upload_documentation: Successfully uploaded documentation for process_id {process_id}.')
1663
+
1664
+ return
1665
+
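+ # Example (hedged sketch): the minimal dictionary shape upload_documentation expects;
+ # all values are hypothetical. DATA_DOMAIN must match the feature catalog so that
+ # FEATURE_IDs can be resolved during the merge.
+ #
+ #   upload_documentation({
+ #       "PROCESS_ID": "a1b2c3",
+ #       "DOCUMENTED_SQL": "Aggregates monthly sales per customer.",
+ #       "ENTITY_DESCRIPTION": "One row per customer.",
+ #       "DOCUMENTED_ENTITY_COLUMNS": {"customer_id": "Unique customer key."},
+ #       "DOCUMENTED_FEATURE_COLUMNS": {"monthly_sales": "Total sales for the month."},
+ #       "DATA_DOMAIN": "SALES",
+ #   })
+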
1666
+ def retrieve_documentation(process_id: str) -> Dict[str, Any]:
1667
+ """
1668
+ Retrieve the documentation for a data process from the documentation tables.
1669
+
1670
+ Parameters
1671
+ ----------
1672
+ process_id : str
1673
+ The unique identifier of the data process.
1674
+
1675
+ Returns
1676
+ -------
1677
+ dict
1678
+ A dictionary containing the documentation information with keys:
1679
+ - DOCUMENTED_SQL: str
1680
+ - ENTITY_DESCRIPTION: str
1681
+ - DOCUMENTED_ENTITY_COLUMNS: dict[str, str]
1682
+ - DOCUMENTED_FEATURE_COLUMNS: dict[str, str]
1683
+ """
1684
+ logger_safe('info', f'retrieve_documentation: Retrieving documentation for process_id {process_id}.')
1685
+
1686
+ # Retrieve business logic documentation
1687
+ query_business_logic = f"""
1688
+ CURRENT VALIDTIME
1689
+ SELECT
1690
+ BUSINESS_LOGIC_DESCRIPTION,
1691
+ ENTITY_DESCRIPTION,
1692
+ ENTITY_COLUMNS_JSON
1693
+ FROM {tdfs4ds.SCHEMA}.{tdfs4ds.DOCUMENTATION_PROCESS_BUSINESS_LOGIC}
1694
+ WHERE PROCESS_ID = '{process_id}'
1695
+ """
1696
+ result_bl = tdml.execute_sql(query_business_logic).fetchone()
1697
+ if not result_bl:
1698
+ logger_safe('warning', f'retrieve_documentation: No business logic documentation found for process_id {process_id}.')
1699
+ return {}
1700
+
1701
+ documented_sql = result_bl[0]
1702
+ entity_description = result_bl[1]
1703
+ entity_columns_json = result_bl[2]
1704
+ documented_entity_columns = json.loads(entity_columns_json) if entity_columns_json else {}
1705
+
1706
+ # Retrieve feature documentation
1707
+ query_features = f"""
1708
+ CURRENT VALIDTIME
1709
+ SELECT
1710
+ FC.FEATURE_NAME,
1711
+ DPF.FEATURE_DESCRIPTION
1712
+ FROM {tdfs4ds.SCHEMA}.{tdfs4ds.DOCUMENTATION_PROCESS_FEATURES} DPF
1713
+ INNER JOIN {tdfs4ds.SCHEMA}.{tdfs4ds.FEATURE_CATALOG_NAME} FC
1714
+ ON DPF.FEATURE_ID = FC.FEATURE_ID
1715
+ WHERE DPF.PROCESS_ID = '{process_id}'
1716
+ """
1717
+ result_features = tdml.execute_sql(query_features).fetchall()
1718
+ documented_feature_columns = {
1719
+ row[0]: row[1] for row in result_features
1720
+ }
1721
+
1722
+ logger_safe('info', f'retrieve_documentation: Successfully retrieved documentation for process_id {process_id}.')
1723
+ return {
1724
+ "DOCUMENTED_SQL" : documented_sql,
1725
+ "ENTITY_DESCRIPTION" : entity_description,
1726
+ "DOCUMENTED_ENTITY_COLUMNS" : documented_entity_columns,
1727
+ "DOCUMENTED_FEATURE_COLUMNS" : documented_feature_columns
1728
+ }
1729
+
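+ # Example (hedged sketch): fetch the stored column descriptions for a process and
+ # print them, assuming documentation was previously uploaded for this id.
+ #
+ #   docs = retrieve_documentation("a1b2c3")               # hypothetical process id
+ #   for name, description in docs.get("DOCUMENTED_FEATURE_COLUMNS", {}).items():
+ #       print(f"{name}: {description}")
+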
1730
+ def retrieve_explain_documentation(process_id: str) -> Dict[str, Any]:
1731
+ """
1732
+ Retrieve the EXPLAIN documentation for a data process from the documentation tables.
1733
+
1734
+ Parameters
1735
+ ----------
1736
+ process_id : str
1737
+ The unique identifier of the data process.
1738
+
1739
+ Returns
1740
+ -------
1741
+ dict
1742
+ A dictionary containing the EXPLAIN documentation information with keys:
1743
+ - EXPLAIN_ANALYSIS: str
1744
+ - OPTIMIZATION_SCORE: int
1745
+ - EXPLAIN_WARNINGS: list[str]
1746
+ - EXPLAIN_RECOMMENDATIONS: list[str]
1747
+ """
1748
+ logger_safe('info', f'retrieve_explain_documentation: Retrieving EXPLAIN documentation for process_id {process_id}.')
1749
+
1750
+ query_explain = f"""
1751
+ CURRENT VALIDTIME
1752
+ SELECT
1753
+ EXPLAIN_ANALYSIS,
1754
+ OPTIMIZATION_SCORE,
1755
+ WARNINGS,
1756
+ RECOMMENDATIONS
1757
+ FROM {tdfs4ds.SCHEMA}.{tdfs4ds.DOCUMENTATION_PROCESS_EXPLAIN}
1758
+ WHERE PROCESS_ID = '{process_id}'
1759
+ """
1760
+ result_explain = tdml.execute_sql(query_explain).fetchone()
1761
+ if not result_explain:
1762
+ logger_safe('warning', f'retrieve_explain_documentation: No EXPLAIN documentation found for process_id {process_id}.')
1763
+ return {}
1764
+
1765
+ explanation = result_explain[0]
1766
+ optimization_score = result_explain[1]
1767
+ warnings = json.loads(result_explain[2]) if result_explain[2] else []
1768
+ recommendations = json.loads(result_explain[3]) if result_explain[3] else []
1769
+
1770
+ logger_safe('info', f'retrieve_explain_documentation: Successfully retrieved EXPLAIN documentation for process_id {process_id}.')
1771
+ return {
1772
+ "EXPLAIN_ANALYSIS" : explanation,
1773
+ "OPTIMIZATION_SCORE" : optimization_score,
1774
+ "EXPLAIN_WARNINGS" : warnings,
1775
+ "EXPLAIN_RECOMMENDATIONS" : recommendations
1776
+ }
1777
+
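+ # Example (hedged sketch): surface warnings for processes whose stored EXPLAIN
+ # analysis suggests tuning. The 0-100 scale for OPTIMIZATION_SCORE is an assumption.
+ #
+ #   explain_doc = retrieve_explain_documentation("a1b2c3")  # hypothetical process id
+ #   if explain_doc and explain_doc["OPTIMIZATION_SCORE"] < 50:
+ #       for warning in explain_doc["EXPLAIN_WARNINGS"]:
+ #           print("WARNING:", warning)
+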
1778
+ def upload_documentation_explain(process_info: Dict[str, Any]) -> None:
1779
+ """
1780
+ Upload the EXPLAIN documentation for a data process into the documentation tables.
1781
+
1782
+ Parameters
1783
+ ----------
1784
+ process_info : dict
1785
+ A dictionary containing the process identifier and its EXPLAIN analysis.
1786
+ Expected keys:
1787
+ - PROCESS_ID: str
1788
+ - EXPLAIN_ANALYSIS: str
1789
+ - OPTIMIZATION_SCORE: int
1790
+ - EXPLAIN_WARNINGS: list[str]
1791
+ - EXPLAIN_RECOMMENDATIONS: list[str]
1792
+ """
1793
+
1794
+ explanation = process_info['EXPLAIN_ANALYSIS']
1795
+ optimization_score = process_info['OPTIMIZATION_SCORE']
1796
+ warnings_json = json.dumps(process_info['EXPLAIN_WARNINGS'])
1797
+ recommendations_json= json.dumps(process_info['EXPLAIN_RECOMMENDATIONS'])
1798
+ process_id = process_info['PROCESS_ID']
1799
+
1800
+ # merge into DOCUMENTATION_PROCESS_EXPLAIN
1801
+ query_insert_explain = f"""
1802
+ CURRENT VALIDTIME
1803
+ MERGE INTO {tdfs4ds.SCHEMA}.{tdfs4ds.DOCUMENTATION_PROCESS_EXPLAIN} EXISTING
1804
+ USING (
1805
+ SELECT
1806
+ '{process_id}' AS PROCESS_ID,
1807
+ '{explanation.replace("'", "''")}' AS EXPLAIN_ANALYSIS,
1808
+ {optimization_score} AS OPTIMIZATION_SCORE,
1809
+ '{warnings_json.replace("'", "''")}' AS WARNINGS,
1810
+ '{recommendations_json.replace("'", "''")}' AS RECOMMENDATIONS
1811
+ ) UPDATED
1812
+ ON EXISTING.PROCESS_ID = UPDATED.PROCESS_ID
1813
+ WHEN MATCHED THEN
1814
+ UPDATE
1815
+ SET
1816
+ EXPLAIN_ANALYSIS = UPDATED.EXPLAIN_ANALYSIS,
1817
+ OPTIMIZATION_SCORE = UPDATED.OPTIMIZATION_SCORE,
1818
+ WARNINGS = UPDATED.WARNINGS,
1819
+ RECOMMENDATIONS = UPDATED.RECOMMENDATIONS
1820
+ WHEN NOT MATCHED THEN
1821
+ INSERT (
1822
+ UPDATED.PROCESS_ID,
1823
+ UPDATED.EXPLAIN_ANALYSIS,
1824
+ UPDATED.OPTIMIZATION_SCORE,
1825
+ UPDATED.WARNINGS,
1826
+ UPDATED.RECOMMENDATIONS
1827
+ )
1828
+ """
1829
+
1830
+ # Execute the merge
1831
+ try:
1832
+ tdml.execute_sql(query_insert_explain)
1833
+ logger_safe('info', f'upload_documentation_explain: Uploaded EXPLAIN documentation for process_id {process_id}.')
1834
+ except Exception as e:
1835
+ logger_safe('error', f'upload_documentation_explain: Failed to upload EXPLAIN documentation for process_id {process_id}: {e}')
1836
+ raise
1837
+
1838
+ return
1839
+
1840
+ def display_process_info(process_info: Optional[Dict[str, Any]] = None, process_id: Optional[str] = None) -> None:
1841
+ """
1842
+ Pretty-print the documentation and EXPLAIN analysis for a data process, either from a provided process_info dict or by retrieving it with process_id.
1843
+
1844
+ Parameters
1845
+ ----------
1846
+ process_info : dict, optional (default=None)
1847
+ A dictionary containing the process documentation information.
1848
+ If None, process_id must be provided to retrieve the information.
1849
+ process_id : str, optional (default=None)
1850
+ The unique identifier of the data process.
1851
+ If process_info is None, this parameter is used to retrieve the documentation.
1852
+ Returns
1853
+ -------
1854
+ None
1855
+ """
1856
+
1857
+ if process_info is None:
1858
+ if process_id is None:
1859
+ raise ValueError("Either process_info or process_id must be provided.")
1860
+ logger_safe('info', f'display_process_info: Retrieving documentation for process_id {process_id}.')
1861
+ process_info = get_process_info(process_id)
+ # Merge in the stored documentation and EXPLAIN analysis so that the
+ # documented fields expected by _print_documentation below are populated.
+ process_info.update(retrieve_documentation(process_id))
+ process_info.update(retrieve_explain_documentation(process_id))
1862
+
1863
+ # pretty print documentation for info:
1864
+ _print_documentation(
1865
+ documented_sql = process_info.get('DOCUMENTED_SQL', None),
1866
+ entity_description = process_info.get('ENTITY_DESCRIPTION', None),
1867
+ documented_entity_columns = process_info.get('DOCUMENTED_ENTITY_COLUMNS', None),
1868
+ documented_feature_columns = process_info.get('DOCUMENTED_FEATURE_COLUMNS', None),
1869
+ process_id = process_info.get('PROCESS_ID', None),
1870
+ view_name = process_info.get('VIEW_NAME', None),
1871
+ explain_analysis = process_info.get('EXPLAIN_ANALYSIS', None),
1872
+ optimization_score = process_info.get('OPTIMIZATION_SCORE', None),
1873
+ explain_warnings = process_info.get('EXPLAIN_WARNINGS', None),
1874
+ explain_recommendations = process_info.get('EXPLAIN_RECOMMENDATIONS', None),
1875
+ sql_query = process_info.get('PROCESS_SQL', None),
1876
+ )
1877
+ return
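+
+ # Example (hedged sketch): display previously generated documentation without
+ # re-running the LLM, assuming the documentation tables are already populated.
+ #
+ #   display_process_info(process_id="a1b2c3")             # hypothetical process id
+ #
+ #   # Or assemble the pieces manually:
+ #   docs = retrieve_documentation("a1b2c3")
+ #   explain = retrieve_explain_documentation("a1b2c3")
+ #   print(docs.get("DOCUMENTED_SQL"), explain.get("OPTIMIZATION_SCORE"))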