tdfs4ds 0.2.4.41__py3-none-any.whl → 0.2.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1878 @@
2
+ from typing import Sequence, Optional, Dict, Any, List
3
+ import textwrap
4
+
5
+ from langchain_openai import ChatOpenAI
6
+ from langchain_core.prompts import ChatPromptTemplate
7
+ from langchain_core.output_parsers import JsonOutputParser
8
+ from langchain_core.runnables import Runnable, RunnableLambda
9
+ from langchain_core.messages import AIMessage
10
+ from IPython.display import HTML, display
11
+
12
+ import tdfs4ds
13
+ from tdfs4ds import logger_safe
14
+
15
+ import teradataml as tdml
16
+ import json
17
+ import ast
18
+ import re
19
+ import sqlparse
20
+
21
+ from teradataml.context.context import _get_database_username
22
+ import pandas as pd
23
+
24
+ from tdfs4ds.process_store.process_store_catalog_management import get_process_info
25
+
26
+
27
+ def _robust_json_parser(response: str) -> Dict[str, Any]:
28
+ """
29
+ Robustly extract and parse JSON from LLM responses.
30
+ Handles markdown code fences, escaped characters, and formatting variations.
31
+
32
+ Parameters
33
+ ----------
34
+ response : str
35
+ The raw response string from the LLM.
36
+
37
+ Returns
38
+ -------
39
+ dict
40
+ The parsed JSON as a dictionary.
41
+
42
+ Raises
43
+ ------
44
+ ValueError
45
+ If JSON cannot be extracted or parsed from the response.
46
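+ 
+     Examples
+     --------
+     A minimal sketch with a markdown-fenced response (illustrative value only):
+ 
+     >>> _robust_json_parser('```json {"customer_id": "Unique identifier"} ```')
+     {'customer_id': 'Unique identifier'}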
+ """
47
+ if not isinstance(response, str):
48
+ raise ValueError(f"Expected string response, got {type(response)}")
49
+
50
+ # Try 1: Direct JSON parse (response might already be clean)
51
+ try:
52
+ return json.loads(response.strip())
53
+ except json.JSONDecodeError:
54
+ pass
55
+
56
+ # Try 2: Extract from markdown code fences (most flexible)
57
+ # Match opening backticks (with optional json language specifier) and closing backticks
58
+ # Using non-greedy matching with DOTALL to handle multiline content
59
+ markdown_patterns = [
60
+ r'```(?:json)?\s*\n(.*)\n```', # ```json\n...\n``` (any content in middle)
61
+ r'```(?:json)?\s*\r?\n(.*?)\r?\n```', # Handle Windows line endings
62
+ r'```(?:json)?\s*(.*?)\s*```', # ```...``` (flexible whitespace)
63
+ r'`{3}\s*(?:json)?\s*(.*?)\s*`{3}', # Alternative triple backticks
64
+ ]
65
+ for pattern in markdown_patterns:
66
+ match = re.search(pattern, response, re.DOTALL | re.IGNORECASE)
67
+ if match:
68
+ try:
69
+ extracted = match.group(1).strip()
70
+ # Normalize line endings
71
+ extracted = extracted.replace('\r\n', '\n').replace('\r', '\n')
72
+ if extracted: # Only try if we got something
73
+ return json.loads(extracted)
74
+ except (json.JSONDecodeError, IndexError):
75
+ pass
76
+
77
+ # Try 3: Extract first { ... } block (handles extra text before/after)
78
+ first_brace = response.find('{')
79
+ last_brace = response.rfind('}')
80
+ if first_brace != -1 and last_brace > first_brace:
81
+ try:
82
+ extracted = response[first_brace:last_brace+1]
83
+ # Normalize line endings
84
+ extracted = extracted.replace('\r\n', '\n').replace('\r', '\n')
85
+ return json.loads(extracted)
86
+ except json.JSONDecodeError:
87
+ pass
88
+
89
+ # Try 4: Remove markdown fences and retry
90
+ # Aggressively strip all markdown code fence markers
91
+ cleaned = response.strip()
92
+ cleaned = re.sub(r'^```\s*(?:json)?\s*', '', cleaned, flags=re.IGNORECASE)
93
+ cleaned = re.sub(r'\s*```\s*$', '', cleaned)
94
+ cleaned = re.sub(r'^`+\s*', '', cleaned)
95
+ cleaned = re.sub(r'\s*`+$', '', cleaned)
96
+ cleaned = cleaned.strip()
97
+ # Normalize line endings
98
+ cleaned = cleaned.replace('\r\n', '\n').replace('\r', '\n')
99
+ try:
100
+ return json.loads(cleaned)
101
+ except json.JSONDecodeError:
102
+ pass
103
+
104
+ # Try 5: As a last resort, try ast.literal_eval (for Python-like dicts)
105
+ try:
107
+ # Normalize line endings
108
+ cleaned = cleaned.replace('\r\n', '\n').replace('\r', '\n')
109
+ return ast.literal_eval(cleaned)
110
+ except (ValueError, SyntaxError):
111
+ pass
112
+
113
+ # If all else fails, raise informative error
114
+ logger_safe('error', f'Failed to parse JSON from LLM response. Full response: {response}')
115
+ raise ValueError(f"Could not extract valid JSON from response. First 200 chars: {response[:200]}")
116
+
117
+
118
+ # HTML Styling Constants
119
+ HTML_STYLES = {
120
+ "container": "font-family: Arial, sans-serif; margin: 10px 0;",
121
+ "title": "color: #1f618d; margin-bottom: 6px;",
122
+ "heading": "color: #2c3e50; border-bottom: 2px solid #3498db; padding-bottom: 5px;",
123
+ "heading_margin": "color: #2c3e50; border-bottom: 2px solid #3498db; padding-bottom: 5px; margin-top: 15px;",
124
+ "content": "background-color: #ecf0f1; padding: 10px; border-radius: 5px; line-height: 1.6;",
125
+ "list": "background-color: #ecf0f1; padding: 15px 30px; border-radius: 5px; line-height: 1.8;",
126
+ }
127
+
128
+
129
+ def _is_notebook() -> bool:
130
+ """Check if code is running in a Jupyter notebook."""
131
+ try:
132
+ # Check if IPython is available
133
+ from IPython import get_ipython
134
+ ipython = get_ipython()
135
+ if ipython is None:
136
+ return False
137
+
138
+ # Check for notebook kernel
139
+ if hasattr(ipython, 'kernel') and ipython.kernel is not None:
140
+ return True
141
+
142
+ # Check config for IPKernelApp (notebook kernel)
143
+ if hasattr(ipython, 'config') and 'IPKernelApp' in ipython.config:
144
+ return True
145
+
146
+ # Check if it's a ZMQInteractiveShell (notebook shell)
147
+ if ipython.__class__.__name__ == 'ZMQInteractiveShell':
148
+ return True
149
+
150
+ # Check for ipykernel in sys.modules
151
+ import sys
152
+ if 'ipykernel' in sys.modules:
153
+ return True
154
+
155
+ return False
156
+     except Exception:
157
+ return False
158
+
159
+
160
+ def _build_provider_llm_caller(llm: ChatOpenAI, provider: str, schema: Optional[Dict] = None):
161
+ """
162
+ Build a provider-specific LLM call wrapper for constrained output.
163
+
164
+ Parameters
165
+ ----------
166
+ llm : ChatOpenAI
167
+ The language model interface.
168
+ provider : str
169
+ The LLM provider (vllm, openai, ollama, azure, etc).
170
+ schema : dict, optional
171
+ JSON schema for constrained output.
172
+
173
+ Returns
174
+ -------
175
+ callable
176
+ A function that invokes the LLM with appropriate constraints.
177
+ """
178
+ if schema is None:
179
+ return lambda messages: llm.invoke(messages)
180
+
181
+ provider_l = provider.lower()
182
+
183
+ if provider_l in ("vllm", "openai-compatible"):
184
+ return lambda messages: llm.invoke(messages, extra_body={"guided_json": schema})
185
+
186
+ if provider_l in ("openai", "azure", "azure-openai"):
187
+ return lambda messages: llm.invoke(messages, response_format=schema)
188
+
189
+ if provider_l == "ollama":
190
+ return lambda messages: llm.invoke(messages, format=schema)
191
+
192
+ # Fallback: no constraints
193
+ return lambda messages: llm.invoke(messages)
194
+
195
+
196
+ def _print_documentation(
197
+ documented_sql: str,
198
+ entity_description: str,
199
+ documented_entity_columns: Dict[str, str],
200
+ documented_feature_columns: Dict[str, str],
201
+ process_id: Optional[str] = None,
202
+ view_name: Optional[str] = None,
203
+ explain_analysis: Optional[str] = None,
204
+ optimization_score: Optional[int] = None,
205
+ explain_warnings: Optional[List[str]] = None,
206
+ explain_recommendations: Optional[List[str]] = None,
207
+ sql_query: Optional[str] = None,
208
+ explain_plan: Optional[str] = None,
209
+ ) -> None:
210
+ """
211
+ Pretty print documentation with context-aware formatting.
212
+ Uses HTML in notebooks, text format in regular scripts.
213
+
214
+ Parameters
215
+ ----------
216
+ documented_sql : str
217
+ The query business logic description.
218
+ entity_description : str
219
+ The entity description.
220
+ documented_entity_columns : dict
221
+ Mapping of entity column names to descriptions.
222
+ documented_feature_columns : dict
223
+ Mapping of feature column names to descriptions.
224
+ process_id : str, optional
225
+ The process identifier for the title.
226
+ view_name : str, optional
227
+ The view name for the title.
228
+ explain_analysis : str, optional
229
+ The EXPLAIN plan analysis description.
230
+ optimization_score : int, optional
231
+ Optimization score from 1 to 5.
232
+ explain_warnings : list, optional
233
+ List of warnings from EXPLAIN analysis.
234
+ explain_recommendations : list, optional
235
+ List of recommendations from EXPLAIN analysis.
236
+ sql_query : str, optional
237
+ The original SQL query to display.
238
+ explain_plan : str, optional
239
+ The raw EXPLAIN plan output to display.
240
+ """
241
+ title = ''
242
+ if process_id or view_name:
243
+ title_parts = []
244
+ if process_id:
245
+ title_parts.append(f"Process: {process_id}")
246
+ if view_name:
247
+ title_parts.append(f"View: {view_name}")
248
+ title = ' — '.join(title_parts)
249
+
250
+ # Helpers to parse structured items and clean markdown (available in both contexts)
251
+ def _try_parse_structured(value):
252
+ if value is None:
253
+ return None
254
+ if isinstance(value, (dict, list)):
255
+ return value
256
+ if not isinstance(value, str):
257
+ return value
258
+ s = value.strip()
259
+ # Try JSON
260
+ try:
261
+ return json.loads(s)
262
+ except Exception:
263
+ pass
264
+ # Try Python literal
265
+ try:
266
+ return ast.literal_eval(s)
267
+ except Exception:
268
+ pass
269
+ return s
270
+
271
+ def _flatten_to_list(parsed):
272
+ if parsed is None:
273
+ return []
274
+ if isinstance(parsed, list):
275
+ out = []
276
+ for it in parsed:
277
+ out.extend(_flatten_to_list(it))
278
+ return out
279
+ if isinstance(parsed, dict):
280
+ # Prefer obvious value keys, else format key: value pairs
281
+ for k in ("issue", "warning", "action", "recommendation", "msg", "message"):
282
+ if k in parsed:
283
+ return [str(parsed[k])]
284
+ return ["; ".join(f"{kk}: {vv}" for kk, vv in parsed.items())]
285
+ return [str(parsed)]
286
+
287
+ def _strip_md(s: str) -> str:
288
+ # Remove **bold** and inline markdown emphasis for plain text
289
+ s = re.sub(r"\*\*(.*?)\*\*", r"\1", s)
290
+ s = re.sub(r"\*(.*?)\*", r"\1", s)
291
+ return s
292
+
293
+ def _md_to_html(s: str) -> str:
294
+ # Convert **bold** to <strong>
295
+ s = re.sub(r"\*\*(.*?)\*\*", r"<strong>\1</strong>", s)
296
+         s = re.sub(r"\*(.*?)\*", r"<em>\1</em>", s)
297
+ # Escape simple < and > to avoid broken HTML (keep basic newlines)
298
+ s = s.replace("<", "&lt;").replace(">", "&gt;")
299
+ # Restore our strong/em tags
300
+ s = s.replace("&lt;strong&gt;", "<strong>").replace("&lt;/strong&gt;", "</strong>")
301
+ s = s.replace("&lt;em&gt;", "<em>").replace("&lt;/em&gt;", "</em>")
302
+ return s
303
+
304
+ # Build EXPLAIN section if available
305
+ parsed_explain = _try_parse_structured(explain_analysis)
306
+ parsed_warnings = _try_parse_structured(explain_warnings)
307
+ parsed_recs = _try_parse_structured(explain_recommendations)
308
+
309
+ warn_list = _flatten_to_list(parsed_warnings)
310
+ rec_list = _flatten_to_list(parsed_recs)
311
+
312
+ explain_section = ""
313
+ newline = '\n'
314
+ if parsed_explain or optimization_score or warn_list or rec_list:
315
+ score_color = "#27ae60" if optimization_score and optimization_score >= 4 else "#f39c12" if optimization_score and optimization_score == 3 else "#e74c3c"
316
+ explain_section = f"""
317
+ <h3 style="{HTML_STYLES['heading_margin']}">Query Optimization Analysis</h3>
318
+ <div style="background-color: #ecf0f1; padding: 10px; border-radius: 5px; margin-bottom: 10px;">
319
+ <p><strong>Optimization Score:</strong> <span style="color: {score_color}; font-size: 18px; font-weight: bold;">{optimization_score}/5</span></p>
320
+ </div>
321
+ """
322
+
323
+ if parsed_explain:
324
+ # Display explanation as plain text, preserving newlines
325
+ explain_text = parsed_explain if isinstance(parsed_explain, str) else str(parsed_explain)
326
+ explain_text_html = explain_text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace(newline, '<br>')
327
+ explain_section += f'<div style="{HTML_STYLES["content"]}">{explain_text_html}</div>'
328
+
329
+ if warn_list:
330
+             warnings_html = '\n'.join(f'<li style="color: #c0392b;">{_md_to_html(w).replace(newline, "<br>")}</li>' for w in warn_list)
331
+ explain_section += f"""
332
+ <h4 style="color: #c0392b; margin-top: 10px;">⚠ Warnings</h4>
333
+ <ul style="{HTML_STYLES['list']}">{warnings_html}</ul>
334
+ """
335
+
336
+ if rec_list:
337
+             recommendations_html = '\n'.join(f'<li style="color: #27ae60;">{_md_to_html(r).replace(newline, "<br>")}</li>' for r in rec_list)
338
+ explain_section += f"""
339
+ <h4 style="color: #27ae60; margin-top: 10px;">✓ Recommendations</h4>
340
+ <ul style="{HTML_STYLES['list']}">{recommendations_html}</ul>
341
+ """
342
+
343
+ if _is_notebook():
344
+ title_html = f"<h2>{title}</h2>" if title else ""
345
+ entity_items = '\n'.join(f'<li><strong>{col}:</strong> {_md_to_html(desc)}</li>' for col, desc in documented_entity_columns.items())
346
+ feature_items = '\n'.join(f'<li><strong>{col}:</strong> {_md_to_html(desc)}</li>' for col, desc in documented_feature_columns.items())
347
+
348
+ # Build optional sections
349
+ sql_section = ""
350
+ if sql_query:
351
+ formatted_sql = sqlparse.format(sql_query, reindent=True, keyword_case='upper')
352
+ sql_section = f"""
353
+ <h3 style="{HTML_STYLES['heading_margin']}">Original SQL Query</h3>
354
+ <pre style="background-color: #f8f9fa; padding: 15px; border-radius: 5px; border: 1px solid #dee2e6; font-family: 'Courier New', monospace; font-size: 12px; overflow-x: auto; white-space: pre-wrap;">{formatted_sql}</pre>
355
+ """
356
+
357
+ explain_plan_section = ""
358
+ if explain_plan:
359
+ explain_plan_section = f"""
360
+ <h3 style="{HTML_STYLES['heading_margin']}">EXPLAIN Plan</h3>
361
+ <pre style="background-color: #f8f9fa; padding: 15px; border-radius: 5px; border: 1px solid #dee2e6; font-family: 'Courier New', monospace; font-size: 12px; overflow-x: auto; white-space: pre-wrap;">{explain_plan}</pre>
362
+ """
363
+
364
+ html_content = f"""
365
+ <div style="{HTML_STYLES['container']}">
366
+ {title_html}
367
+ <h3 style="{HTML_STYLES['heading']}">Query Business Logic</h3>
368
+ <p style="{HTML_STYLES['content']}">{documented_sql}</p>
369
+
370
+ <h3 style="{HTML_STYLES['heading_margin']}">Entity Description</h3>
371
+ <p style="{HTML_STYLES['content']}">{entity_description}</p>
372
+
373
+ <h3 style="{HTML_STYLES['heading_margin']}">Entity Columns</h3>
374
+ <ul style="{HTML_STYLES['list']}">{entity_items}</ul>
375
+
376
+ <h3 style="{HTML_STYLES['heading_margin']}">Feature Columns</h3>
377
+ <ul style="{HTML_STYLES['list']}">{feature_items}</ul>
378
+
379
+ {explain_section}
380
+ {sql_section}
381
+ {explain_plan_section}
382
+ </div>
383
+ """
384
+ display(HTML(html_content))
385
+ else:
386
+ # Text formatting for regular scripts
387
+ print("\n" + "="*100)
388
+ print(title if title else "DOCUMENTATION")
389
+ print("="*100)
390
+ print("\nQuery Business Logic:")
391
+ print(textwrap.fill(documented_sql, width=100))
392
+
393
+ print("\nEntity Description:")
394
+ print(textwrap.fill(entity_description, width=100))
395
+
396
+ print("\nEntity Columns Documentation:")
397
+ for col, desc in documented_entity_columns.items():
398
+ print(f"\n {col}:")
399
+ print(textwrap.fill(desc, width=95, initial_indent=" ", subsequent_indent=" "))
400
+
401
+ print("\nFeature Columns Documentation:")
402
+ for col, desc in documented_feature_columns.items():
403
+ print(f"\n {col}:")
404
+ print(textwrap.fill(desc, width=95, initial_indent=" ", subsequent_indent=" "))
405
+
406
+ # Print EXPLAIN analysis if available
407
+ if explain_analysis or optimization_score or explain_warnings or explain_recommendations:
408
+ print("\n" + "-"*100)
409
+ print("QUERY OPTIMIZATION ANALYSIS")
410
+ print("-"*100)
411
+
412
+ if optimization_score:
413
+ print(f"Optimization Score: {optimization_score}/5")
414
+
415
+ # Print parsed explanation, preserving carriage returns.
416
+ if parsed_explain:
417
+ print("\nExplanation:")
418
+ if isinstance(parsed_explain, str):
419
+ print(parsed_explain)
420
+ else:
421
+ print(str(parsed_explain))
422
+
423
+ # Print warnings (flattened) preserving carriage returns
424
+ if warn_list:
425
+ print("\nWarnings:")
426
+ for w in warn_list:
427
+ print(f" - {w}")
428
+
429
+ # Print recommendations (flattened) preserving carriage returns
430
+ if rec_list:
431
+ print("\nRecommendations:")
432
+ for r in rec_list:
433
+ print(f" - {r}")
434
+
435
+ # Print original SQL query if provided
436
+ if sql_query:
437
+ print("\n" + "-"*100)
438
+ print("ORIGINAL SQL QUERY")
439
+ print("-"*100)
440
+ formatted_sql = sqlparse.format(sql_query, reindent=True, keyword_case='upper')
441
+ print(textwrap.indent(formatted_sql, ' '))
442
+
443
+ # Print EXPLAIN plan if provided
444
+ if explain_plan:
445
+ print("\n" + "-"*100)
446
+ print("EXPLAIN PLAN")
447
+ print("-"*100)
448
+ print(explain_plan)
449
+
450
+ print("\n" + "="*100 + "\n")
451
+
452
+
453
+ def build_llm(
454
+ llm_service: str = "https://api-dmproject.myddns.me/v1",
455
+ api_key: str = "YOUR_API_KEY_HERE",
456
+ model_id: str = "mistralai/Ministral-3-14B-Instruct-2512",
457
+ temperature: float = 0.0,
458
+ timeout: int = 120,
459
+ ) -> ChatOpenAI:
460
+ """
461
+ Build and return a ChatOpenAI client pointed at your vLLM/OpenAI-compatible endpoint.
462
+
463
+ Parameters
464
+ ----------
465
+ llm_service : str
466
+ Base URL of the LLM service.
467
+ api_key : str
468
+ API key for authentication.
469
+ model_id : str
470
+ Model identifier.
471
+ temperature : float
472
+ Sampling temperature for response diversity.
473
+ timeout : int
474
+ Request timeout in seconds.
475
+
476
+ Returns
477
+ -------
478
+ ChatOpenAI
479
+ Configured LLM client.
480
+
481
+ Raises
482
+ ------
483
+ Exception
484
+ If LLM client creation fails.
485
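+ 
+     Examples
+     --------
+     A minimal sketch with placeholder values (hypothetical endpoint and model
+     identifiers; constructing the client does not send any request):
+ 
+     >>> llm = build_llm(
+     ...     llm_service="https://my-llm-host/v1",
+     ...     api_key="YOUR_API_KEY_HERE",
+     ...     model_id="my-model-id",
+     ...     temperature=0.0,
+     ... )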
+ """
486
+ logger_safe('info', f'build_llm: Using LLM service at {llm_service} with model {model_id}')
487
+ logger_safe('debug', f'build_llm: Temperature={temperature}, Timeout={timeout}s')
488
+
489
+ try:
490
+ return ChatOpenAI(
491
+ base_url=llm_service,
492
+ api_key=api_key,
493
+ model=model_id,
494
+ temperature=temperature,
495
+ timeout=timeout,
496
+ )
497
+ except Exception as e:
498
+ logger_safe('error', f'build_llm: Failed to create LLM client: {e}')
499
+ raise
500
+
501
+
504
+ def build_documentation_json_schema(columns: List[str], provider: str = "generic") -> Dict[str, Any]:
505
+ """
506
+ Build a provider-appropriate JSON Schema used to enforce strict JSON output
507
+ for SQL column documentation across multiple LLM backends.
508
+
509
+ This function returns different schema shapes depending on the LLM provider,
510
+ because each ecosystem uses a different structured-output mechanism:
511
+
512
+ Provider Modes
513
+ --------------
514
+ - provider="openai", "azure"
515
+ Returns the JSON Schema wrapped in OpenAI's `response_format={"type": "json_schema", ...}`
516
+         structure. Supported by OpenAI models with structured-output support (e.g. GPT-4o and later) and Azure OpenAI.
517
+
518
+ - provider="anthropic", "claude"
519
+ Returns an Anthropic *tool schema* definition. Claude 3.x models use tool
520
+ schemas to enforce strict JSON output.
521
+
522
+ - provider="ollama"
523
+ Returns the raw JSON schema that Ollama expects under the `format=` parameter
524
+         of the generate API. (Recent Ollama releases support schema-constrained responses.)
525
+
526
+ - provider="vllm"
527
+ Returns plain JSON Schema for use with vLLM's `guided_json` constrained decoding.
528
+
529
+ - provider="bedrock"
530
+ Bedrock Claude follows the Anthropic tool schema format.
531
+ Bedrock Llama / Titan accept plain JSON schema. This function returns the base
532
+ schema and leaves the final wrapping to the caller.
533
+
534
+ - provider="generic"
535
+ Returns plain JSON schema. Useful for LLM backends that do not support
536
+ constrained decoding, prompt-only JSON generation, or post-processing repair.
537
+
538
+ Parameters
539
+ ----------
540
+ columns : list[str]
541
+ Column names to include as required JSON object keys. Each column will map
542
+ to a string description generated by the model.
543
+
544
+ provider : str, optional
545
+ The model provider or backend type. Determines the structural format
546
+ required for constrained generation. One of:
547
+ "openai", "anthropic", "ollama", "vllm", "bedrock", "generic".
548
+
549
+ Returns
550
+ -------
551
+ dict
552
+ A dictionary representing the JSON Schema or provider-specific wrapper
553
+ used to enforce strict JSON output during LLM generation.
554
+
555
+ Notes
556
+ -----
557
+ - All schemas require that:
558
+ * the output be a JSON object
559
+ * keys match exactly the column names
560
+ * all values be strings
561
+ * additional properties be disallowed
562
+
563
+ - Not all providers enforce schemas equally:
564
+ * OpenAI, Claude, and vLLM offer hard guarantees.
565
+ * Ollama enforces schema reasonably well.
566
+ * Generic models may require post-processing.
567
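+ 
+     Examples
+     --------
+     A minimal sketch with illustrative column names:
+ 
+     >>> schema = build_documentation_json_schema(["customer_id", "order_date"], provider="vllm")
+     >>> schema["required"]
+     ['customer_id', 'order_date']
+     >>> build_documentation_json_schema(["customer_id"], provider="openai")["type"]
+     'json_schema'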
+ """
568
+ # Base JSON schema — used directly by vLLM, Ollama, Bedrock, fallback
569
+ base_schema = {
570
+ "type": "object",
571
+ "properties": {col: {"type": "string"} for col in columns},
572
+ "required": list(columns),
573
+ "additionalProperties": False,
574
+ }
575
+
576
+ # --- Provider-specific formats ---
577
+
578
+ if provider.lower() in ("openai", "azure", "azure-openai"):
579
+ # OpenAI's required wrapper structure
580
+ return {
581
+ "type": "json_schema",
582
+ "json_schema": {
583
+ "name": "ColumnDocumentation",
584
+ "schema": base_schema,
585
+ "strict": True,
586
+ }
587
+ }
588
+
589
+ if provider.lower() in ("anthropic", "claude"):
590
+ # Anthropic tool schema
591
+ # You embed this inside the "tools" field when calling the model
592
+ return {
593
+ "name": "column_documentation",
594
+ "description": "Generate documentation for SQL output columns.",
595
+ "input_schema": base_schema
596
+ }
597
+
598
+ if provider.lower() == "ollama":
599
+ # Ollama's output format schema (unwrapped JSON schema)
600
+ # Returned directly in: generate(..., format=schema)
601
+ return base_schema
602
+
603
+ if provider.lower() in ("vllm", "openai-compatible"):
604
+ # vLLM's guided_json uses *plain JSON Schema*
605
+ # so return base_schema exactly
606
+ return base_schema
607
+
608
+ if provider.lower() == "bedrock":
609
+ # Bedrock Claude uses Anthropic schema
610
+ # Bedrock Llama uses plain JSON schema
611
+ # Return base_schema and let caller choose
612
+ return base_schema
613
+
614
+ # Fallback: generic JSON schema
615
+ return base_schema
616
+
617
+
618
+ def build_sql_documentation_chain(
619
+ llm: ChatOpenAI,
620
+ entity_columns: Sequence[str],
621
+ feature_columns: Sequence[str],
622
+ provider: str = "vllm",
623
+ json_constraint: bool = True,
624
+ ) -> Runnable:
625
+ """
626
+ Build a LangChain Runnable that generates business-focused documentation
627
+ for lists of entity and feature columns from a SQL query output, with optional provider-specific JSON
628
+ constraints (vLLM, OpenAI, Ollama, etc.).
629
+
630
+ The resulting chain expects two input variables:
631
+ - sql_query: str → the SQL query whose output is being documented
632
+ - columns_str: str → formatted list of entity and feature columns (e.g. "Entity columns:\n- col1\n\nFeature columns:\n- col2")
633
+
634
+ Parameters
635
+ ----------
636
+ llm : ChatOpenAI
637
+ The language model interface (may point to vLLM, OpenAI, Ollama, etc.).
638
+ entity_columns : Sequence[str]
639
+ List of entity/identifier columns that must appear as keys in the output JSON.
640
+ feature_columns : Sequence[str]
641
+ List of feature columns that must appear as keys in the output JSON.
642
+ provider : str, optional (default="vllm")
643
+ Indicates which structured-output mechanism to use.
644
+ Supported values:
645
+ - "vllm" → uses `guided_json` for strict JSON output
646
+ - "openai" / "azure" → uses OpenAI JSON Schema via `response_format`
647
+ - "ollama" → uses Ollama's `format=` schema
648
+ - "openai-compatible" → alias for vLLM-style guided decoding
649
+ - any other value → fall back to unconstrained text output
650
+ json_constraint : bool, optional (default=True)
651
+ If True:
652
+ - a JSON Schema is generated from the column lists
653
+ - provider-specific constrained decoding is applied
654
+ If False:
655
+ - the chain does not enforce JSON structure at the LLM level
656
+ - the model is only guided by the prompt (weaker guarantees)
657
+
658
+ Returns
659
+ -------
660
+ Runnable
661
+ A LangChain Runnable that executes:
662
+ prompt → LLM (optionally schema-guided) → JSON parser
663
+
664
+ When invoked with:
665
+ {
666
+ "sql_query": "...",
667
+ "columns_str": "Entity columns:\n- column1\n\nFeature columns:\n- column2\n..."
668
+ }
669
+
670
+ It returns:
671
+ dict[str, str]
672
+ A mapping of each requested column name to a short,
673
+ business-oriented description (≤ 5 sentences), plus a 'query_business_logic' key
674
+ containing a high-level description of the query's business logic (5-10 sentences), and an 'entity_description' key
675
+ with a holistic description of the entity (3-5 sentences).
676
+
677
+ Notes
678
+ -----
679
+ - The chain enforces valid JSON when possible:
680
+ * vLLM → `guided_json`
681
+ * OpenAI → `response_format={"type": "json_schema", ...}`
682
+ * Ollama → `format=<schema>`
683
+ - For unsupported providers, the model may emit imperfect JSON.
684
+ - Descriptions focus on business meaning, business logic,
685
+ and optionally technical details only when relevant.
686
+ """
687
+     all_columns = list(entity_columns) + list(feature_columns) + ["query_business_logic", "entity_description"]
688
+ logger_safe('info', f'build_sql_documentation_chain: Building chain for provider {provider}, json_constraint={json_constraint}, entity_columns={list(entity_columns)}, feature_columns={list(feature_columns)}')
689
+ prompt = ChatPromptTemplate.from_template(
690
+ """
691
+ You are a data documentation assistant.
692
+
693
+ Your target audience is business users.
694
+ Your explanations must focus primarily on the business meaning and business logic of each column,
695
+ and you may add technical details only when they meaningfully clarify the business context.
696
+
697
+ Given:
698
+ 1. A SQL query.
699
+ 2. Lists of entity and feature columns that must be documented.
700
+
701
+ Your job:
702
+ - For entity columns: Provide a brief 1-sentence description of how this column contributes to identifying the entity described holistically under 'entity_description'. Do not repeat the full entity description here.
703
+ - For feature columns: Write a clear and concise explanation of what the column represents from a business perspective, describing the business logic behind how the value is derived or used within the context of the SQL query.
704
+ - Add technical details only if relevant and only to help a business audience understand the concept.
705
+ - Each description must be at most 5 sentences.
706
+ - Do not include any columns that are not in the provided lists.
707
+ - If a column name is ambiguous, infer its meaning from the SQL query as best as possible and say so.
708
+ - If you cannot infer anything meaningful, state that clearly (still within 3 sentences).
709
+ - Additionally, provide a high-level description of the business logic of the SQL query itself under the key 'query_business_logic'. This should explain what the query does from a business perspective, including the main purpose, data sources, transformations, and business value. Keep it to 5-10 sentences.
710
+ - Additionally, provide a description of the entity as a whole under the key 'entity_description'. This should describe the business object that the entity columns collectively identify, noting that this is the object the features describe. Keep it to 3-5 sentences.
711
+ - Avoid using double quotes (") in the explanation text; use single quotes or rephrase to prevent JSON parsing errors.
712
+ - Answer in {language}
713
+ Output format (very important):
714
+ - Return ONLY a valid JSON object.
715
+         - Each top-level key must be exactly a column name, 'entity_description', or 'query_business_logic'.
716
+ - Each value must be a single string with the description.
717
+
718
+ Example of the required format:
719
+ {{
720
+ "customer_id": "This column serves as the primary key for identifying individual customers in the entity.",
721
+ "order_date": "The business date when the order was created. It represents the transaction date used for reporting and may reflect the source system's timestamp.",
722
+ "entity_description": "The customer entity represents individual buyers in the business system, identified by customer_id and described by features like order history and demographics. This entity is the core object that the feature columns characterize for analysis and decision-making.",
723
+ "query_business_logic": "This query joins customer and order data to provide a comprehensive view of customer orders. It filters orders from 2024 onwards to focus on recent activity. The result helps business users understand customer purchasing patterns and regional distribution."
724
+ }}
725
+
726
+ Now generate documentation.
727
+
728
+ SQL query:
729
+ ```sql
730
+ {sql_query}
731
+ ```
732
+ Columns to document (only document these):
733
+ {columns_str}
734
+ """
735
+ )
736
+ parser = JsonOutputParser()
737
+ if not json_constraint:
738
+ return prompt | llm | parser
739
+
740
+ schema = build_documentation_json_schema(all_columns, provider=provider)
741
+ logger_safe('debug', f'build_sql_documentation_chain: Using provider {provider} with json_constraint={json_constraint}')
742
+
743
+ # Use helper to build provider-specific LLM caller
744
+ call_llm = _build_provider_llm_caller(llm, provider, schema)
745
+ constrained_llm = RunnableLambda(call_llm)
746
+
747
+ # Final chain: prompt -> LLM (schema-guided) -> JSON parser
748
+ def _parse(ai_msg: AIMessage):
749
+ raw = ai_msg.content
750
+ return parser.parse(raw)
751
+
752
+ return prompt | constrained_llm | RunnableLambda(_parse)
753
+
754
+ def run_sql_documentation(
755
+ chain: Runnable,
756
+ sql_query: str,
757
+ entity_columns: Sequence[str],
758
+ feature_columns: Sequence[str],
759
+ language: str = "English",
760
+ ) -> Dict[str, str]:
761
+ """
762
+ Execute a previously constructed SQL-documentation chain and return
763
+ business-friendly documentation for the specified SQL output columns.
764
+
765
+ This function prepares the chain inputs (SQL query, formatted column list,
766
+ target language) and invokes the chain. The chain itself must have been
767
+ created using `build_sql_documentation_chain()`, which ensures the model
768
+ produces structured JSON suitable for parsing.
769
+
770
+ Parameters
771
+ ----------
772
+ chain : Runnable
773
+ A LangChain Runnable returned by `build_sql_documentation_chain()`.
774
+ This Runnable encapsulates:
775
+ - the prompt template
776
+ - a provider-specific LLM invocation (with or without JSON constraints)
777
+ - a JSON output parser
778
+
779
+ sql_query : str
780
+ The SQL query whose resulting columns should be documented. This query is
781
+ shown to the model so it can infer business logic, derivation rules, and
782
+ column meaning.
783
+
784
+ entity_columns : Sequence[str]
785
+ The list of entity/identifier column names that must appear as keys in the output JSON.
786
+ Only these columns will be documented. The order does not matter.
787
+
788
+ feature_columns : Sequence[str]
789
+ The list of feature column names that must appear as keys in the output JSON.
790
+ Only these columns will be documented. The order does not matter.
791
+
792
+ language : str, optional (default="English")
793
+ The target output language for the generated documentation.
794
+ This value is passed into the prompt’s `{language}` variable.
795
+ Examples: "English", "French", "German", "Spanish", "Japanese".
796
+
797
+ Returns
798
+ -------
799
+ dict[str, str]
800
+ A dictionary mapping each column name to a human-readable, business-oriented
801
+ description generated by the model, plus a 'query_business_logic' key
802
+ with the query's business logic description, and an 'entity_description' key
803
+ with the holistic entity description. Example:
804
+ {
805
+ "customer_id": "Unique customer identifier used for ...",
806
+ "order_date": "Business date when the order was created ...",
807
+ "entity_description": "The customer entity represents...",
808
+ "query_business_logic": "This query provides a view of ..."
809
+ }
810
+
811
+ Notes
812
+ -----
813
+ - The output format is determined by the chain's JSON parser. If the model
814
+ fails to produce valid JSON (e.g., due to unsupported constraints),
815
+ a `OutputParserException` may be raised.
816
+ - The resulting descriptions are typically ≤ 5 sentences per column, unless
817
+ modified in the chain's prompt.
818
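+ 
+     Examples
+     --------
+     A hedged sketch (illustrative table and column names; assumes `llm` was built
+     with `build_llm()` and that the endpoint is reachable, so it is not executed here):
+ 
+     >>> chain = build_sql_documentation_chain(llm, ["customer_id"], ["total_spend"])
+     >>> docs = run_sql_documentation(
+     ...     chain,
+     ...     sql_query="SELECT customer_id, SUM(amount) AS total_spend FROM sales GROUP BY customer_id",
+     ...     entity_columns=["customer_id"],
+     ...     feature_columns=["total_spend"],
+     ... )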
+ """
819
+ logger_safe('info', f'run_sql_documentation: Starting documentation for {len(entity_columns)} entity columns and {len(feature_columns)} feature columns in {language}')
820
+ columns_str = "Entity columns:\n" + "\n".join(f"- {col}" for col in entity_columns) + "\n\nFeature columns:\n" + "\n".join(f"- {col}" for col in feature_columns)
821
+
822
+ try:
823
+ result = chain.invoke({
824
+ "sql_query": sql_query,
825
+ "columns_str": columns_str,
826
+ "language" : language
827
+ })
828
+ logger_safe('info', f'run_sql_documentation: Successfully generated documentation for columns: {list(result.keys())}')
829
+ return result
830
+ except Exception as e:
831
+ logger_safe('error', f'run_sql_documentation: Failed to generate documentation: {e}')
832
+ raise
833
+
834
+
835
+ def document_sql_query_columns(
836
+ sql_query: str,
837
+ entity_columns: Sequence[str],
838
+ feature_columns: Sequence[str],
839
+ language: str = "English",
840
+ provider: Optional[str] = None,
841
+ json_constraint: bool = True,
842
+ ) -> Dict[str, Any]:
843
+ """
844
+ Convenience function to generate business-focused documentation for SQL query output columns
845
+ using the configured instruction model from tdfs4ds settings.
846
+
847
+ This function automatically builds the LLM client using the tdfs4ds configuration variables
848
+ (INSTRUCT_MODEL_URL, INSTRUCT_MODEL_API_KEY, INSTRUCT_MODEL_MODEL), constructs the documentation
849
+ chain, and executes it to produce column descriptions.
850
+
851
+ Parameters
852
+ ----------
853
+ sql_query : str
854
+ The SQL query whose resulting columns should be documented. This query is
855
+ shown to the model so it can infer business logic, derivation rules, and
856
+ column meaning.
857
+
858
+ entity_columns : Sequence[str]
859
+ The list of entity/identifier column names that must appear as keys in the output JSON.
860
+ Only these columns will be documented. The order does not matter.
861
+
862
+ feature_columns : Sequence[str]
863
+ The list of feature column names that must appear as keys in the output JSON.
864
+ Only these columns will be documented. The order does not matter.
865
+
866
+ language : str, optional (default="English")
867
+ The target output language for the generated documentation.
868
+ This value is passed into the prompt's {language} variable.
869
+ Examples: "English", "French", "German", "Spanish", "Japanese".
870
+
871
+ provider : str, optional (default=None)
872
+ Indicates which structured-output mechanism to use for the LLM.
873
+ If None, uses INSTRUCT_MODEL_PROVIDER from tdfs4ds config.
874
+ Supported values:
875
+ - "vllm" → uses `guided_json` for strict JSON output
876
+ - "openai" / "azure" → uses OpenAI JSON Schema via `response_format`
877
+ - "ollama" → uses Ollama's `format=` schema
878
+ - "openai-compatible" → alias for vLLM-style guided decoding
879
+ - any other value → fall back to unconstrained text output
880
+
881
+ json_constraint : bool, optional (default=True)
882
+ If True:
883
+ - a JSON Schema is generated from the column lists
884
+ - provider-specific constrained decoding is applied
885
+ If False:
886
+ - the chain does not enforce JSON structure at the LLM level
887
+ - the model is only guided by the prompt (weaker guarantees)
888
+
889
+ Returns
890
+ -------
891
+ dict
892
+ A dictionary with four keys:
893
+ - "query_business_logic": str containing the high-level business logic description of the query
894
+ - "entity_description": str containing the holistic description of the entity
895
+ - "entity_columns": dict[str, str] mapping each entity column name to its description
896
+ - "feature_columns": dict[str, str] mapping each feature column name to its description
897
+
898
+ Raises
899
+ ------
900
+ ValueError
901
+ If any of the required tdfs4ds configuration variables (INSTRUCT_MODEL_URL,
902
+ INSTRUCT_MODEL_API_KEY, INSTRUCT_MODEL_MODEL) are not set.
903
+
904
+ Notes
905
+ -----
906
+ - This function requires that the tdfs4ds instruction model configuration is properly set.
907
+ - The resulting descriptions are typically ≤ 5 sentences per column, focusing on
908
+ business meaning and logic.
909
+ - If the model fails to produce valid JSON, an exception will be raised.
910
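+ 
+     Examples
+     --------
+     A hedged sketch (illustrative query and column names; assumes the tdfs4ds
+     instruction-model configuration is set, so it is not executed here):
+ 
+     >>> docs = document_sql_query_columns(
+     ...     sql_query="SELECT customer_id, SUM(amount) AS total_spend FROM sales GROUP BY customer_id",
+     ...     entity_columns=["customer_id"],
+     ...     feature_columns=["total_spend"],
+     ...     language="English",
+     ... )
+     >>> # docs["feature_columns"]["total_spend"] then holds the generated description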
+ """
911
+ # Import the configuration variables
912
+ from tdfs4ds import INSTRUCT_MODEL_URL, INSTRUCT_MODEL_API_KEY, INSTRUCT_MODEL_MODEL, INSTRUCT_MODEL_PROVIDER
913
+
914
+ # Validate configuration
915
+ if not INSTRUCT_MODEL_URL or not INSTRUCT_MODEL_API_KEY or not INSTRUCT_MODEL_MODEL or not INSTRUCT_MODEL_PROVIDER:
916
+ raise ValueError(
917
+ "tdfs4ds instruction model configuration is incomplete. Please ensure "
918
+ "INSTRUCT_MODEL_URL, INSTRUCT_MODEL_API_KEY, INSTRUCT_MODEL_MODEL, and INSTRUCT_MODEL_PROVIDER are set."
919
+ )
920
+
921
+ logger_safe('info', f'document_sql_query_columns: Starting documentation for {len(entity_columns)} entity columns and {len(feature_columns)} feature columns in {language}')
922
+
923
+ if provider is None:
924
+ provider = INSTRUCT_MODEL_PROVIDER
925
+
926
+ # Build the LLM client
927
+ llm = build_llm(
928
+ llm_service=INSTRUCT_MODEL_URL,
929
+ api_key=INSTRUCT_MODEL_API_KEY,
930
+ model_id=INSTRUCT_MODEL_MODEL
931
+ )
932
+
933
+ # Build the documentation chain
934
+ sql_doc_chain = build_sql_documentation_chain(llm, entity_columns, feature_columns, provider=provider, json_constraint=json_constraint)
935
+
936
+ # Run the documentation
937
+ result = run_sql_documentation(sql_doc_chain, sql_query, entity_columns, feature_columns, language=language)
938
+
939
+ # Separate entity columns, feature columns, entity description, and query logic
940
+ entity_docs = {k: v for k, v in result.items() if k in entity_columns}
941
+ feature_docs = {k: v for k, v in result.items() if k in feature_columns}
942
+ entity_desc = result.get("entity_description", "")
943
+ query_logic = result.get("query_business_logic", "")
944
+
945
+ logger_safe('info', f'document_sql_query_columns: Successfully completed documentation for {len(entity_docs)} entity columns, {len(feature_docs)} feature columns, entity description and query logic')
946
+ return {
947
+ "query_business_logic": query_logic,
948
+ "entity_description": entity_desc,
949
+ "entity_columns": entity_docs,
950
+ "feature_columns": feature_docs
951
+ }
952
+
953
+
954
+ def build_explain_documentation_chain(
955
+ llm: ChatOpenAI,
956
+ provider: str = "vllm",
957
+ json_constraint: bool = True,
958
+ ) -> Runnable:
959
+ """
960
+ Build a LangChain Runnable that analyzes SQL EXPLAIN plans and generates
961
+ optimization scores, warnings, and recommendations.
962
+
963
+ The resulting chain expects two input variables:
964
+ - sql_query: str → the original SQL query
965
+ - explain_plan: str → the EXPLAIN output from the database
966
+
967
+ Parameters
968
+ ----------
969
+ llm : ChatOpenAI
970
+ The language model interface (may point to vLLM, OpenAI, Ollama, etc.).
971
+ provider : str, optional (default="vllm")
972
+ Indicates which structured-output mechanism to use.
973
+ Supported values:
974
+ - "vllm" → uses `guided_json` for strict JSON output
975
+ - "openai" / "azure" → uses OpenAI JSON Schema via `response_format`
976
+ - "ollama" → uses Ollama's `format=` schema
977
+ - "openai-compatible" → alias for vLLM-style guided decoding
978
+ - any other value → fall back to unconstrained text output
979
+ json_constraint : bool, optional (default=True)
980
+ If True: a JSON Schema is generated and provider-specific constrained decoding is applied.
981
+ If False: the chain does not enforce JSON structure at the LLM level.
982
+
983
+ Returns
984
+ -------
985
+ Runnable
986
+ A LangChain Runnable that executes:
987
+ prompt → LLM (optionally schema-guided) → JSON parser
988
+
989
+ When invoked with:
990
+ {
991
+ "sql_query": "SELECT ...",
992
+ "explain_plan": "..."
993
+ }
994
+
995
+ It returns:
996
+ dict with keys:
997
+ - "explanation": str describing the EXPLAIN plan in business terms
998
+ - "optimization_score": int from 1 (poorly optimized) to 5 (well optimized)
999
+ - "warnings": list[str] of potential issues or concerns
1000
+ - "recommendations": list[str] of actionable optimization suggestions
1001
+ """
1002
+ logger_safe('info', f'build_explain_documentation_chain: Building chain for provider {provider}, json_constraint={json_constraint}')
1003
+
1004
+ # JSON schema for EXPLAIN analysis output
1005
+ explain_schema = {
1006
+ "type": "object",
1007
+ "properties": {
1008
+ "explanation": {"type": "string"},
1009
+ "optimization_score": {
1010
+ "type": "integer",
1011
+ "minimum": 1,
1012
+ "maximum": 5,
1013
+ "description": "Score from 1 (poorly optimized) to 5 (well optimized)"
1014
+ },
1015
+ "warnings": {
1016
+ "type": "array",
1017
+ "items": {"type": "string"},
1018
+ "description": "List of potential issues or concerns"
1019
+ },
1020
+ "recommendations": {
1021
+ "type": "array",
1022
+ "items": {"type": "string"},
1023
+ "description": "List of actionable optimization suggestions"
1024
+ }
1025
+ },
1026
+ "required": ["explanation", "optimization_score", "warnings", "recommendations"],
1027
+ "additionalProperties": False
1028
+ }
1029
+
1030
+ prompt = ChatPromptTemplate.from_template(
1031
+ """
1032
+ You are an expert SQL query optimization analyst.
1033
+
1034
+ Your task is to analyze a SQL EXPLAIN plan and provide optimization guidance.
1035
+
1036
+ Provide your analysis in the following JSON format with these exact keys:
1037
+ - explanation: A clear, plain-text explanation of what the EXPLAIN plan shows. Include analysis of execution strategy, estimated costs, and any visible inefficiencies.
1038
+ - optimization_score: An integer from 1 to 5 (1 = poorly optimized, 5 = well optimized)
1039
+ - warnings: An array of warning strings about potential issues
1040
+ - recommendations: An array of actionable recommendation strings for improvement
1041
+
1042
+ Analysis Guidelines:
1043
+ - Focus on execution strategy, index usage, and join efficiency
1044
+ - Be detailed but business-friendly, avoiding unnecessary technical jargon
1045
+ - Consider factors like: full table scans vs index usage, join strategies, data distribution
1046
+ - Avoid using double quotes (") in the explanation text; use single quotes or rephrase to prevent JSON parsing errors
1047
+
1048
+ Scoring Guidelines:
1049
+ - Score 1: Multiple full table scans, no indexes, inefficient joins
1050
+ - Score 2: Some index usage but still room for improvement, potentially expensive operations
1051
+ - Score 3: Reasonable query plan, acceptable performance, some optimization opportunities
1052
+ - Score 4: Good query plan with mostly optimized joins and indexes, minor improvements possible
1053
+ - Score 5: Excellent plan with efficient execution, proper use of indexes, optimal join strategies
1054
+
1055
+ Warnings should highlight specific concerns (e.g., 'Full table scan on large table ORDERS', 'Missing index on customer_id column').
1056
+ Recommendations should be specific and actionable (e.g., 'Add index on orders.customer_id', 'Consider using a different join strategy').
1057
+
1058
+ Output format (very important):
1059
+ - Return ONLY a valid JSON object.
1060
+ - Each top-level key must be exactly 'explanation', 'optimization_score', 'warnings', or 'recommendations'.
1061
+ - The 'explanation' value must be a single string.
1062
+ - The 'optimization_score' value must be an integer from 1 to 5.
1063
+ - The 'warnings' value must be an array of strings.
1064
+ - The 'recommendations' value must be an array of strings.
1065
+
1066
+ Example of the required format:
1067
+ {{
1068
+ "explanation": "The EXPLAIN plan shows a nested loop join between the customers and orders tables. The query performs a full table scan on the orders table, which has an estimated 1 million rows. The join condition uses the customer_id column, but there is no index on this column in the orders table.",
1069
+ "optimization_score": 2,
1070
+ "warnings": ["Full table scan on large orders table", "Missing index on orders.customer_id"],
1071
+ "recommendations": ["Add index on orders.customer_id", "Consider using a hash join instead of nested loop"]
1072
+ }}
1073
+
1074
+ SQL Query:
1075
+ ```sql
1076
+ {sql_query}
1077
+ ```
1078
+
1079
+ EXPLAIN Plan:
1080
+ ```
1081
+ {explain_plan}
1082
+ ```
1083
+
1084
+ Return ONLY valid JSON with the four keys above.
1085
+ """
1086
+ )
1087
+ parser = JsonOutputParser()
1088
+ if not json_constraint:
1089
+ return prompt | llm | parser
1090
+
1091
+ logger_safe('debug', f'build_explain_documentation_chain: Using provider {provider} with json_constraint={json_constraint}')
1092
+
1093
+ # Wrap schema for OpenAI providers
1094
+ if provider.lower() in ("openai", "azure", "azure-openai"):
1095
+ wrapped_schema = {
1096
+ "type": "json_schema",
1097
+ "json_schema": {
1098
+ "name": "ExplainAnalysis",
1099
+ "schema": explain_schema,
1100
+ "strict": True,
1101
+ }
1102
+ }
1103
+ else:
1104
+ wrapped_schema = explain_schema
1105
+
1106
+ # Use helper to build provider-specific LLM caller
1107
+ call_llm = _build_provider_llm_caller(llm, provider, wrapped_schema)
1108
+ constrained_llm = RunnableLambda(call_llm)
1109
+
1110
+ # Final chain: prompt -> LLM (schema-guided) -> JSON parser
1111
+ def _parse(ai_msg: AIMessage):
1112
+ raw = ai_msg.content
1113
+ return parser.parse(raw)
1114
+
1115
+ return prompt | constrained_llm | RunnableLambda(_parse)
1116
+
1117
+
1118
+ def run_explain_documentation(
1119
+ chain: Runnable,
1120
+ sql_query: str,
1121
+ explain_plan: str,
1122
+ ) -> Dict[str, Any]:
1123
+ """
1124
+ Execute an EXPLAIN-documentation chain and return optimization analysis.
1125
+
1126
+ Parameters
1127
+ ----------
1128
+ chain : Runnable
1129
+ A LangChain Runnable returned by `build_explain_documentation_chain()`.
1130
+ sql_query : str
1131
+ The original SQL query.
1132
+ explain_plan : str
1133
+ The EXPLAIN output from the database.
1134
+
1135
+ Returns
1136
+ -------
1137
+ dict
1138
+ A dictionary with keys: "explanation", "optimization_score", "warnings", "recommendations"
1139
+ """
1140
+ logger_safe('info', 'run_explain_documentation: Starting EXPLAIN analysis')
1141
+
1142
+ try:
1143
+ result = chain.invoke({
1144
+ "sql_query": sql_query,
1145
+ "explain_plan": explain_plan
1146
+ })
1147
+ logger_safe('info', f'run_explain_documentation: Successfully analyzed EXPLAIN plan. Score: {result.get("optimization_score", "N/A")}/5')
1148
+ return result
1149
+ except Exception as e:
1150
+ logger_safe('error', f'run_explain_documentation: Failed to analyze EXPLAIN plan: {e}')
1151
+ raise
1152
+
1153
+
1154
+ def document_sql_query_explain(
1155
+ sql_query: str,
1156
+ provider: Optional[str] = None,
1157
+ json_constraint: bool = True,
1158
+ ) -> Dict[str, Any]:
1159
+ """
1160
+ Analyze a SQL query's EXPLAIN plan and return optimization recommendations.
1161
+
1162
+ This function automatically builds the LLM client using tdfs4ds configuration,
1163
+     retrieves the query's EXPLAIN plan from the database, constructs the EXPLAIN analysis chain, and executes it.
1164
+
1165
+ Parameters
1166
+ ----------
1167
+ sql_query : str
1168
+ The original SQL query.
1169
1171
+ provider : str, optional (default=None)
1172
+ Indicates which structured-output mechanism to use for the LLM.
1173
+ If None, uses INSTRUCT_MODEL_PROVIDER from tdfs4ds config.
1174
+ Supported values: "vllm", "openai", "azure", "ollama", etc.
1175
+ json_constraint : bool, optional (default=True)
1176
+ If True: use provider-specific constrained decoding.
1177
+ If False: rely on prompt guidance only.
1178
+
1179
+ Returns
1180
+ -------
1181
+ dict
1182
+ A dictionary with keys:
1183
+ - "explanation": str describing the EXPLAIN plan
1184
+ - "optimization_score": int from 1 to 5
1185
+ - "warnings": list[str] of potential issues
1186
+ - "recommendations": list[str] of actionable suggestions
1187
+
1188
+ Raises
1189
+ ------
1190
+ ValueError
1191
+ If tdfs4ds instruction model configuration is incomplete.
1192
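+ 
+     Examples
+     --------
+     A hedged sketch (illustrative query; assumes the instruction-model configuration
+     and a live teradataml connection are available, so it is not executed here):
+ 
+     >>> analysis = document_sql_query_explain(
+     ...     sql_query="SELECT customer_id, order_date FROM orders WHERE order_date >= DATE '2024-01-01'"
+     ... )
+     >>> # analysis["optimization_score"] is an integer from 1 (poor) to 5 (well optimized)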
+ """
1193
+ from tdfs4ds import INSTRUCT_MODEL_URL, INSTRUCT_MODEL_API_KEY, INSTRUCT_MODEL_MODEL, INSTRUCT_MODEL_PROVIDER
1194
+
1195
+ if not INSTRUCT_MODEL_URL or not INSTRUCT_MODEL_API_KEY or not INSTRUCT_MODEL_MODEL or not INSTRUCT_MODEL_PROVIDER:
1196
+ raise ValueError(
1197
+ "tdfs4ds instruction model configuration is incomplete. Please ensure "
1198
+ "INSTRUCT_MODEL_URL, INSTRUCT_MODEL_API_KEY, INSTRUCT_MODEL_MODEL, and INSTRUCT_MODEL_PROVIDER are set."
1199
+ )
1200
+
1201
+ logger_safe('info', 'document_sql_query_explain: Starting EXPLAIN analysis')
1202
+
1203
+ if provider is None:
1204
+ provider = INSTRUCT_MODEL_PROVIDER
1205
+
1206
+ # Build the LLM client
1207
+ llm = build_llm(
1208
+ llm_service=INSTRUCT_MODEL_URL,
1209
+ api_key=INSTRUCT_MODEL_API_KEY,
1210
+ model_id=INSTRUCT_MODEL_MODEL
1211
+ )
1212
+
1213
+ # get the explain plan:
1214
+ explain_plan = get_the_explain(sql_query)
1215
+ # Build and run the EXPLAIN analysis chain
1216
+ explain_chain = build_explain_documentation_chain(llm, provider=provider, json_constraint=json_constraint)
1217
+ result = run_explain_documentation(explain_chain, sql_query, explain_plan)
1218
+
1219
+ logger_safe('info', f'document_sql_query_explain: Successfully completed EXPLAIN analysis. Score: {result.get("optimization_score", "N/A")}/5')
1220
+ return result
1221
+
1222
+ def documentation_tables_creation():
1223
+ """
1224
+ Create the necessary documentation tables in the database if they do not already exist.
1225
+     Uses the active teradataml connection and the tdfs4ds schema configuration; takes no arguments.
+     """
1226
+ query_process_table = f"""
1227
+ CREATE MULTISET TABLE {tdfs4ds.SCHEMA}.{tdfs4ds.DOCUMENTATION_PROCESS_BUSINESS_LOGIC} ,FALLBACK ,
1228
+ NO BEFORE JOURNAL,
1229
+ NO AFTER JOURNAL,
1230
+ CHECKSUM = DEFAULT,
1231
+ DEFAULT MERGEBLOCKRATIO,
1232
+ MAP = TD_MAP1
1233
+ (
1234
+ PROCESS_ID VARCHAR(36) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
1235
+ BUSINESS_LOGIC_DESCRIPTION VARCHAR(32000) CHARACTER SET LATIN NOT CASESPECIFIC,
1236
+ ENTITY_DESCRIPTION VARCHAR(32000) CHARACTER SET LATIN NOT CASESPECIFIC,
1237
+ ENTITY_COLUMNS_JSON VARCHAR(32000) CHARACTER SET LATIN NOT CASESPECIFIC,
1238
+ ValidStart TIMESTAMP(0) WITH TIME ZONE NOT NULL,
1239
+ ValidEnd TIMESTAMP(0) WITH TIME ZONE NOT NULL,
1240
+ PERIOD FOR ValidPeriod (ValidStart, ValidEnd) AS VALIDTIME)
1241
+ PRIMARY INDEX ( PROCESS_ID )
1242
+ """
1243
+
1244
+ query_process_features_table = f"""
1245
+ CREATE MULTISET TABLE {tdfs4ds.SCHEMA}.{tdfs4ds.DOCUMENTATION_PROCESS_FEATURES} ,FALLBACK ,
1246
+ NO BEFORE JOURNAL,
1247
+ NO AFTER JOURNAL,
1248
+ CHECKSUM = DEFAULT,
1249
+ DEFAULT MERGEBLOCKRATIO,
1250
+ MAP = TD_MAP1
1251
+ (
1252
+ PROCESS_ID VARCHAR(36) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
1253
+ FEATURE_ID BIGINT NOT NULL,
1254
+ FEATURE_NAME VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
1255
+ FEATURE_DESCRIPTION VARCHAR(32000) CHARACTER SET LATIN NOT CASESPECIFIC,
1256
+ ValidStart TIMESTAMP(0) WITH TIME ZONE NOT NULL,
1257
+ ValidEnd TIMESTAMP(0) WITH TIME ZONE NOT NULL,
1258
+ PERIOD FOR ValidPeriod (ValidStart, ValidEnd) AS VALIDTIME)
1259
+ PRIMARY INDEX ( PROCESS_ID, FEATURE_ID )
1260
+ """
1261
+
1262
+ query_process_explain_table = f"""
1263
+ CREATE MULTISET TABLE {tdfs4ds.SCHEMA}.{tdfs4ds.DOCUMENTATION_PROCESS_EXPLAIN} ,FALLBACK ,
1264
+ NO BEFORE JOURNAL,
1265
+ NO AFTER JOURNAL,
1266
+ CHECKSUM = DEFAULT,
1267
+ DEFAULT MERGEBLOCKRATIO,
1268
+ MAP = TD_MAP1
1269
+ (
1270
+ PROCESS_ID VARCHAR(36) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
1271
+ EXPLAIN_ANALYSIS VARCHAR(32000) CHARACTER SET LATIN NOT CASESPECIFIC,
1272
+ OPTIMIZATION_SCORE INT,
1273
+ WARNINGS VARCHAR(32000) CHARACTER SET LATIN NOT CASESPECIFIC,
1274
+ RECOMMENDATIONS VARCHAR(32000) CHARACTER SET LATIN NOT CASESPECIFIC,
1275
+ ValidStart TIMESTAMP(0) WITH TIME ZONE NOT NULL,
1276
+ ValidEnd TIMESTAMP(0) WITH TIME ZONE NOT NULL,
1277
+ PERIOD FOR ValidPeriod (ValidStart, ValidEnd) AS VALIDTIME)
1278
+ PRIMARY INDEX ( PROCESS_ID )
1279
+ """
1280
+
1281
+ try:
1282
+ tdml.execute_sql(query_process_table)
1283
+ logger_safe('info', f'documentation_tables_creation: Created table {tdfs4ds.DOCUMENTATION_PROCESS_BUSINESS_LOGIC}')
1284
+ except Exception as e:
1285
+ logger_safe('error', f'documentation_tables_creation: Failed to create table {tdfs4ds.DOCUMENTATION_PROCESS_BUSINESS_LOGIC}: {e}')
1286
+ if 'already exists' in str(e).lower():
1287
+ logger_safe('info', f'documentation_tables_creation: Table {tdfs4ds.DOCUMENTATION_PROCESS_BUSINESS_LOGIC} already exists. Skipping creation.')
1288
+ pass
1289
+ else:
1290
+ raise
1291
+ try:
1292
+ tdml.execute_sql(query_process_features_table)
1293
+ logger_safe('info', f'documentation_tables_creation: Created table {tdfs4ds.DOCUMENTATION_PROCESS_FEATURES}')
1294
+ except Exception as e:
1295
+ logger_safe('error', f'documentation_tables_creation: Failed to create table {tdfs4ds.DOCUMENTATION_PROCESS_FEATURES}: {e}')
1296
+ if 'already exists' in str(e).lower():
1297
+ logger_safe('info', f'documentation_tables_creation: Table {tdfs4ds.DOCUMENTATION_PROCESS_FEATURES} already exists. Skipping creation.')
1298
+ pass
1299
+ else:
1300
+ raise
1301
+
1302
+ try:
1303
+ tdml.execute_sql(query_process_explain_table)
1304
+ logger_safe('info', f'documentation_tables_creation: Created table {tdfs4ds.DOCUMENTATION_PROCESS_EXPLAIN}')
1305
+ except Exception as e:
1306
+ logger_safe('error', f'documentation_tables_creation: Failed to create table {tdfs4ds.DOCUMENTATION_PROCESS_EXPLAIN}: {e}')
1307
+ if 'already exists' in str(e).lower():
1308
+ logger_safe('info', f'documentation_tables_creation: Table {tdfs4ds.DOCUMENTATION_PROCESS_EXPLAIN} already exists. Skipping creation.')
1309
+ pass
1310
+ else:
1311
+ raise
1312
+
1313
+ logger_safe('info', 'documentation_tables_creation: Documentation tables creation process completed.')
1314
+ return
1315
+
1316
+ def document_process(process_id: str, language: str = "English", json_constraint: bool = True, show_sql_query: bool = False, show_explain_plan: bool = False, display: bool = True, upload: bool = True) -> Optional[Dict[str, Any]]:
1317
+ """
1318
+ Generate and store documentation for a data process identified by process_id.
1319
+ This function retrieves the SQL query and output columns for the process,
1320
+ generates business-focused documentation using an LLM, and stores the results
1321
+ in the appropriate documentation tables.
1322
+
1323
+ Parameters
1324
+ ----------
1325
+ process_id : str
1326
+ The unique identifier of the data process to document.
1327
+
1328
+ language : str, optional (default="English")
1329
+ The target output language for the generated documentation. This value is
1330
+ passed into the prompt’s `{language}` variable. Examples: "English", "French", "German", "Spanish", "Japanese".
1331
1340
+ json_constraint : bool, optional (default=True)
1341
+ If True:
1342
+ - a JSON Schema is generated from the column list
1343
+ - provider-specific constrained decoding is applied
1344
+ If False:
1345
+ - the chain does not enforce JSON structure at the LLM level
1346
+ - the model is only guided by the prompt (weaker guarantees)
1347
+ show_sql_query : bool, optional (default=False)
1348
+ If True, display the original SQL query at the end of the documentation report.
1349
+ show_explain_plan : bool, optional (default=False)
1350
+ If True, display the raw EXPLAIN plan output at the end of the documentation report.
1351
+ display : bool, optional (default=True)
1352
+ If True, print the generated documentation to the console.
1353
+ upload : bool, optional (default=True)
1354
+ If True, upload the generated documentation to the documentation tables.
1355
+
1356
+ Returns
1357
+ -------
1358
+ dict or None
1359
+ A dictionary containing the generated documentation and analysis, or None if an error occurred.
1360
+ The dictionary includes keys:
1361
+ - PROCESS_ID
1362
+ - DOCUMENTED_SQL
1363
+ - ENTITY_DESCRIPTION
1364
+ - DOCUMENTED_ENTITY_COLUMNS
1365
+ - DOCUMENTED_FEATURE_COLUMNS
1366
+ - EXPLAIN_ANALYSIS
1367
+ - OPTIMIZATION_SCORE
1368
+ - EXPLAIN_WARNINGS
1369
+ - EXPLAIN_RECOMMENDATIONS
1370
+ - RAW_EXPLAIN_PLAN (only if show_explain_plan is True)
1371
+ Notes
1372
+ -----
1373
+ - This function requires that the tdfs4ds instruction model configuration is properly set.
1374
+ - If the model fails to produce valid JSON, an exception will be raised.
1375
+ - The resulting descriptions are typically ≤ 5 sentences per column, focusing on
1376
+ business meaning and logic.
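+
+ Examples
+ --------
+ Illustrative call, assuming an active teradataml connection and a fully
+ configured tdfs4ds feature store; the process id below is a placeholder.
+
+ >>> doc = document_process("my_process_id", display=False, upload=False)  # doctest: +SKIP
+ >>> doc["ENTITY_DESCRIPTION"]  # doctest: +SKIP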
1377
+ """
1378
+ logger_safe('info', f'document_process: Starting documentation for process_id {process_id} in {language}')
1379
+
1380
+ # Retrieve process SQL and columns
1381
+ try:
1382
+ process_info = tdfs4ds.process_store.process_store_catalog_management.get_process_info(process_id)
1383
+ except Exception as e:
1384
+ logger_safe('error', f"document_process: Error retrieving process info for process_id {process_id}: {e}")
1385
+ return
1386
+
1387
+ documentation = document_sql_query_columns(
1388
+ sql_query = process_info['PROCESS_SQL'],
1389
+ entity_columns = process_info['ENTITY_COLUMNS'],
1390
+ feature_columns = process_info['FEATURE_COLUMNS']
1391
+ )
1392
+
1393
+ process_info['DOCUMENTED_SQL'] = documentation['query_business_logic']
1394
+ process_info['ENTITY_DESCRIPTION'] = documentation['entity_description']
1395
+ process_info['DOCUMENTED_ENTITY_COLUMNS'] = documentation['entity_columns']
1396
+ process_info['DOCUMENTED_FEATURE_COLUMNS'] = documentation['feature_columns']
1397
+
1398
+ if True:  # EXPLAIN analysis is always generated; show_explain_plan only gates display of the raw plan below
1399
+ explain_documentation = document_sql_query_explain(
1400
+ sql_query = process_info['PROCESS_SQL']
1401
+ )
1402
+
1403
+ process_info['EXPLAIN_ANALYSIS'] = explain_documentation['explanation']
1404
+ process_info['OPTIMIZATION_SCORE'] = explain_documentation['optimization_score']
1405
+ process_info['EXPLAIN_WARNINGS'] = explain_documentation['warnings']
1406
+ process_info['EXPLAIN_RECOMMENDATIONS'] = explain_documentation['recommendations']
1407
+
1408
+ # Store the raw EXPLAIN plan if needed for display
1409
+ if show_explain_plan:
1410
+ process_info['RAW_EXPLAIN_PLAN'] = get_the_explain(process_info['PROCESS_SQL'])
1411
+
1412
+ # Upload the generated documentation to the documentation tables:
1413
+ if upload:
1414
+ upload_documentation(process_info)
1415
+ logger_safe('info', f'document_process: Uploaded documentation for process_id {process_id} to documentation tables.')
1416
+ upload_documentation_explain(process_info)
1417
+ logger_safe('info', f'document_process: Uploaded EXPLAIN analysis for process_id {process_id} to documentation tables.')
1418
+
1419
+ # pretty print documentation for info:
1420
+ logger_safe('info', f"document_process: Documentation for process_id {process_id}:")
1421
+
1422
+ if display:
1423
+ _print_documentation(
1424
+ documented_sql = process_info.get('DOCUMENTED_SQL', None),
1425
+ entity_description = process_info.get('ENTITY_DESCRIPTION', None),
1426
+ documented_entity_columns = process_info.get('DOCUMENTED_ENTITY_COLUMNS', None),
1427
+ documented_feature_columns = process_info.get('DOCUMENTED_FEATURE_COLUMNS', None),
1428
+ process_id = process_info.get('PROCESS_ID', process_id),
1429
+ view_name = process_info.get('VIEW_NAME', None),
1430
+ explain_analysis = process_info.get('EXPLAIN_ANALYSIS', None),
1431
+ optimization_score = process_info.get('OPTIMIZATION_SCORE', None),
1432
+ explain_warnings = process_info.get('EXPLAIN_WARNINGS', None),
1433
+ explain_recommendations = process_info.get('EXPLAIN_RECOMMENDATIONS', None),
1434
+ sql_query = process_info.get('PROCESS_SQL', None) if show_sql_query else None,
1435
+ explain_plan = process_info.get('RAW_EXPLAIN_PLAN', None) if show_explain_plan else None,
1436
+ )
1437
+
1438
+ return process_info
1439
+
1440
+ def get_the_explain(sql_query: str) -> str:
1441
+ """
1442
+ Get the EXPLAIN plan for a given SQL query using the tdfs4ds TDML connection.
1443
+
1444
+ Parameters
1445
+ ----------
1446
+ sql_query : str
1447
+ The SQL query to explain.
1448
+
1449
+ Returns
1450
+ -------
1451
+ str
1452
+ The EXPLAIN plan as a formatted string.
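+
+ Examples
+ --------
+ Illustrative sketch, assuming an active teradataml connection; the table
+ name below is a placeholder.
+
+ >>> plan = get_the_explain("SELECT * FROM my_db.my_table")  # doctest: +SKIP
+ >>> print(plan.splitlines()[0])  # doctest: +SKIP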
1453
+ """
1454
+ def _extract_inner_query_from_view(query: str) -> str:
1455
+ """
1456
+ If the provided SQL is a CREATE/REPLACE VIEW (or REPLACE VIEW), extract and
1457
+ return the inner SELECT/definition. Otherwise return the original query.
1458
+
1459
+ This helps when running EXPLAIN: we want to analyze the query inside the
1460
+ view definition rather than the DDL wrapper.
1461
+ """
1462
+ if not isinstance(query, str):
1463
+ return query
1464
+ pattern = r'^\s*(?:CREATE\s+(?:OR\s+REPLACE\s+)?|REPLACE\s+)?VIEW\b.*?\bAS\b\s*(?P<body>.*)$'
1465
+ m = re.search(pattern, query, flags=re.IGNORECASE | re.DOTALL)
1466
+ if not m:
1467
+ return query
1468
+ body = m.group('body').strip()
1469
+ # Strip outer parentheses if the definition is wrapped
1470
+ if body.startswith('(') and body.endswith(')'):
1471
+ body = body[1:-1].strip()
1472
+ # Remove trailing semicolon
1473
+ if body.endswith(';'):
1474
+ body = body[:-1].strip()
1475
+ # Remove trailing LOCK ROW FOR ACCESS (or similar) clauses that may appear
1476
+ # in view definitions (e.g., "LOCK ROW FOR ACCESS") so EXPLAIN focuses
1477
+ # on the inner SELECT statement.
1478
+ body = re.sub(r"\bLOCK\s+ROW\s+FOR\s+ACCESS\b\s*;?\s*$", "", body, flags=re.IGNORECASE)
1479
+ logger_safe('debug', 'get_the_explain: Extracted inner query from CREATE/REPLACE VIEW for EXPLAIN.')
1480
+ return body
1481
+
1482
+ inner_sql = _extract_inner_query_from_view(sql_query)
1483
+ try:
1484
+ explain_result = tdml.execute_sql(f"EXPLAIN {inner_sql}").fetchall()
1485
+ explain_lines = [row[0] for row in explain_result]
1486
+ explain_text = "\n".join(explain_lines)
1487
+ logger_safe('info', 'get_the_explain: Successfully retrieved EXPLAIN plan.')
1488
+ return explain_text
1489
+ except Exception as e:
1490
+ logger_safe('error', f'get_the_explain: Failed to retrieve EXPLAIN plan: {e}')
1491
+ raise
1492
+
1493
+ def upload_documentation(process_info: Dict[str, Any]) -> None:
1494
+ """
1495
+ Upload the generated documentation for a data process into the documentation tables.
1496
+
1497
+ Parameters
1498
+ ----------
1499
+ process_info : dict
1500
+ A dictionary containing the process documentation information.
1501
+ Expected keys:
1502
+ - PROCESS_ID: str
1503
+ - DOCUMENTED_SQL: str
1504
+ - ENTITY_DESCRIPTION: str
1505
+ - DOCUMENTED_ENTITY_COLUMNS: dict[str, str]
1506
+ - DOCUMENTED_FEATURE_COLUMNS: dict[str, str]
+ - DATA_DOMAIN: str
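+
+ Examples
+ --------
+ Illustrative payload, assuming the documentation tables already exist and a
+ teradataml connection is active; all values below are placeholders.
+
+ >>> upload_documentation({  # doctest: +SKIP
+ ...     "PROCESS_ID": "my_process_id",
+ ...     "DATA_DOMAIN": "SALES",
+ ...     "DOCUMENTED_SQL": "Aggregates daily sales per customer.",
+ ...     "ENTITY_DESCRIPTION": "One row per customer.",
+ ...     "DOCUMENTED_ENTITY_COLUMNS": {"customer_id": "Unique customer identifier."},
+ ...     "DOCUMENTED_FEATURE_COLUMNS": {"total_sales": "Total sales amount per customer."},
+ ... })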
1507
+ """
1508
+
1509
+ process_id = process_info['PROCESS_ID']
1510
+ documented_sql = process_info['DOCUMENTED_SQL']
1511
+ entity_description = process_info['ENTITY_DESCRIPTION']
1512
+ entity_columns_json = json.dumps(process_info['DOCUMENTED_ENTITY_COLUMNS'])
1513
+ feature_columns = process_info['DOCUMENTED_FEATURE_COLUMNS']
1514
+
1515
+ # build a pandas dataframe containing the data to be uploaded in DOCUMENTATION_PROCESS_BUSINESS_LOGIC
1516
+ # that contains PROCESS_ID, BUSINESS_LOGIC_DESCRIPTION, ENTITY_DESCRIPTION, ENTITY_COLUMNS_JSON
1517
+ df_business_logic = pd.DataFrame([{
1518
+ 'PROCESS_ID': process_id,
1519
+ 'BUSINESS_LOGIC_DESCRIPTION': documented_sql,
1520
+ 'ENTITY_DESCRIPTION': entity_description,
1521
+ 'ENTITY_COLUMNS_JSON': entity_columns_json
1522
+ }])
1523
+
1524
+ # build a pandas dataframe containing the data to be uploaded in DOCUMENTATION_PROCESS_FEATURES
1525
+ # that contains PROCESS_ID, FEATURE_ID, FEATURE_DESCRIPTION
1526
+ # at this stage, FEATURE_ID is not known, so FEATURE_NAME is used as a placeholder;
1527
+ # the actual FEATURE_ID is resolved at merge time via a join with the feature catalog.
1528
+ # the feature_columns dict is exploded into one row per feature.
1529
+ feature_rows = []
1530
+ for feature_name, feature_description in feature_columns.items():
1531
+ feature_rows.append({
1532
+ 'PROCESS_ID': process_id,
1533
+ 'FEATURE_NAME': feature_name, # placeholder for FEATURE_ID
1534
+ 'FEATURE_DESCRIPTION': feature_description
1535
+ })
1536
+ df_features = pd.DataFrame(feature_rows)
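+ # Illustrative shape of df_features (values are placeholders):
+ #   PROCESS_ID      FEATURE_NAME   FEATURE_DESCRIPTION
+ #   my_process_id   total_sales    Total sales amount per customer.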
1537
+
1538
+ # Determine end period based on tdfs4ds configuration
1539
+ if tdfs4ds.END_PERIOD == 'UNTIL_CHANGED':
1540
+ end_period_ = '9999-01-01 00:00:00'
1541
+ else:
1542
+ end_period_ = tdfs4ds.END_PERIOD
1543
+
1544
+ # upload the df_business_logic dataframe into a staging volatile table
1545
+ logger_safe('info', f'upload_documentation: Uploading documentation for process_id {process_id} into staging tables.')
1546
+ tdml.copy_to_sql(
1547
+ df_business_logic,
1548
+ table_name = "DOCUMENTATION_PROCESS_BUSINESS_LOGIC_STAGING",
1549
+ if_exists = 'replace',
1550
+ temporary = True
1551
+ )
1552
+ logger_safe('info', f'upload_documentation: Uploaded business logic documentation for process_id {process_id} into staging table.')
1553
+
1554
+ # upload the df_features dataframe into a staging volatile table
1555
+ logger_safe('info', f'upload_documentation: Uploading feature documentation for process_id {process_id} into staging tables.')
1556
+ tdml.copy_to_sql(
1557
+ df_features,
1558
+ table_name = "DOCUMENTATION_PROCESS_FEATURES_STAGING",
1559
+ if_exists = 'replace',
1560
+ temporary = True
1561
+ )
1562
+ logger_safe('info', f'upload_documentation: Uploaded feature documentation for process_id {process_id} into staging table.')
1563
+
1564
+ # merge into DOCUMENTATION_PROCESS_BUSINESS_LOGIC from staging table
1565
+ query_insert_business_logic = f"""
1566
+ CURRENT VALIDTIME
1567
+ MERGE INTO {tdfs4ds.SCHEMA}.{tdfs4ds.DOCUMENTATION_PROCESS_BUSINESS_LOGIC} EXISTING
1568
+ USING (
1569
+ SELECT
1570
+ PROCESS_ID,
1571
+ BUSINESS_LOGIC_DESCRIPTION,
1572
+ ENTITY_DESCRIPTION,
1573
+ ENTITY_COLUMNS_JSON
1574
+ FROM {_get_database_username()}.DOCUMENTATION_PROCESS_BUSINESS_LOGIC_STAGING
1575
+ ) UPDATED
1576
+ ON EXISTING.PROCESS_ID = UPDATED.PROCESS_ID
1577
+ WHEN MATCHED THEN
1578
+ UPDATE
1579
+ SET
1580
+ BUSINESS_LOGIC_DESCRIPTION = UPDATED.BUSINESS_LOGIC_DESCRIPTION,
1581
+ ENTITY_DESCRIPTION = UPDATED.ENTITY_DESCRIPTION,
1582
+ ENTITY_COLUMNS_JSON = UPDATED.ENTITY_COLUMNS_JSON
1583
+ WHEN NOT MATCHED THEN
1584
+ INSERT (
1585
+ UPDATED.PROCESS_ID,
1586
+ UPDATED.BUSINESS_LOGIC_DESCRIPTION,
1587
+ UPDATED.ENTITY_DESCRIPTION,
1588
+ UPDATED.ENTITY_COLUMNS_JSON
1589
+ )
1590
+ """
1591
+
1592
+ # merge into DOCUMENTATION_PROCESS_FEATURES from staging table
1593
+ query_insert_features = f"""
1594
+ CURRENT VALIDTIME
1595
+ MERGE INTO {tdfs4ds.SCHEMA}.{tdfs4ds.DOCUMENTATION_PROCESS_FEATURES} EXISTING
1596
+ USING (
1597
+ SELECT
1598
+ A.PROCESS_ID,
1599
+ FC.FEATURE_ID,
1600
+ A.FEATURE_NAME,
1601
+ A.FEATURE_DESCRIPTION
1602
+ FROM {_get_database_username()}.DOCUMENTATION_PROCESS_FEATURES_STAGING A
1603
+ INNER JOIN {tdfs4ds.SCHEMA}.{tdfs4ds.FEATURE_CATALOG_NAME} FC
1604
+ ON UPPER(FC.FEATURE_NAME) = UPPER(A.FEATURE_NAME)
1605
+ AND UPPER(FC.DATA_DOMAIN) = '{process_info['DATA_DOMAIN'].upper()}'
1606
+ ) UPDATED
1607
+ ON EXISTING.PROCESS_ID = UPDATED.PROCESS_ID
1608
+ AND EXISTING.FEATURE_ID = UPDATED.FEATURE_ID
1609
+ WHEN MATCHED THEN
1610
+ UPDATE
1611
+ SET
1612
+ FEATURE_DESCRIPTION = UPDATED.FEATURE_DESCRIPTION,
1613
+ FEATURE_NAME = UPDATED.FEATURE_NAME
1614
+ WHEN NOT MATCHED THEN
1615
+ INSERT (
1616
+ UPDATED.PROCESS_ID,
1617
+ UPDATED.FEATURE_ID,
1618
+ UPDATED.FEATURE_NAME,
1619
+ UPDATED.FEATURE_DESCRIPTION
1620
+ )
1621
+ """
1622
+
1623
+ # Remove features that are no longer present in the documentation
1624
+ query_delete_missing_features = f"""
1625
+ CURRENT VALIDTIME
1626
+ DELETE FROM {tdfs4ds.SCHEMA}.{tdfs4ds.DOCUMENTATION_PROCESS_FEATURES}
1627
+ WHERE PROCESS_ID = '{process_id}'
1628
+ AND FEATURE_ID NOT IN (
1629
+ SELECT FC.FEATURE_ID
1630
+ FROM {_get_database_username()}.DOCUMENTATION_PROCESS_FEATURES_STAGING A
1631
+ INNER JOIN {tdfs4ds.SCHEMA}.{tdfs4ds.FEATURE_CATALOG_NAME} FC
1632
+ ON UPPER(FC.FEATURE_NAME) = UPPER(A.FEATURE_NAME)
1633
+ AND UPPER(FC.DATA_DOMAIN) = '{process_info['DATA_DOMAIN'].upper()}'
1634
+ )
1635
+ """
1636
+
1637
+ # Execute the merges
1638
+ try:
1639
+ tdml.execute_sql(query_insert_business_logic)
1640
+ logger_safe('info', f'upload_documentation: Merged business logic documentation for process_id {process_id} into main table.')
1641
+ except Exception as e:
1642
+ logger_safe('error', f'upload_documentation: Failed to merge business logic documentation for process_id {process_id}: {e}')
1643
+ print(query_insert_business_logic)
1644
+ raise
1645
+ try:
1646
+ tdml.execute_sql(query_insert_features)
1647
+ logger_safe('info', f'upload_documentation: Merged feature documentation for process_id {process_id} into main table.')
1648
+ except Exception as e:
1649
+ logger_safe('error', f'upload_documentation: Failed to merge feature documentation for process_id {process_id}: {e}')
1650
+ print(query_insert_features)
1651
+ raise
1652
+ try:
1653
+ tdml.execute_sql(query_delete_missing_features)
1654
+ logger_safe('info', f'upload_documentation: Removed missing features for process_id {process_id} from main table.')
1655
+ except Exception as e:
1656
+ logger_safe('error', f'upload_documentation: Failed to remove missing features for process_id {process_id}: {e}')
1657
+ print(query_delete_missing_features)
1658
+ raise
1659
+
1660
+ # remove staging tables
1661
+ tdml.execute_sql(f"DROP TABLE {_get_database_username()}.DOCUMENTATION_PROCESS_BUSINESS_LOGIC_STAGING")
1662
+ tdml.execute_sql(f"DROP TABLE {_get_database_username()}.DOCUMENTATION_PROCESS_FEATURES_STAGING")
1663
+ logger_safe('info', f'upload_documentation: Successfully uploaded documentation for process_id {process_id}.')
1664
+
1665
+ return
1666
+
1667
+ def retrieve_documentation(process_id: str) -> Dict[str, Any]:
1668
+ """
1669
+ Retrieve the documentation for a data process from the documentation tables.
1670
+
1671
+ Parameters
1672
+ ----------
1673
+ process_id : str
1674
+ The unique identifier of the data process.
1675
+
1676
+ Returns
1677
+ -------
1678
+ dict
1679
+ A dictionary containing the documentation information with keys:
1680
+ - DOCUMENTED_SQL: str
1681
+ - ENTITY_DESCRIPTION: str
1682
+ - DOCUMENTED_ENTITY_COLUMNS: dict[str, str]
1683
+ - DOCUMENTED_FEATURE_COLUMNS: dict[str, str]
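+
+ Examples
+ --------
+ Illustrative call, assuming documentation was previously uploaded for the
+ placeholder process id below.
+
+ >>> docs = retrieve_documentation("my_process_id")  # doctest: +SKIP
+ >>> docs["DOCUMENTED_FEATURE_COLUMNS"]  # doctest: +SKIP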
1684
+ """
1685
+ logger_safe('info', f'retrieve_documentation: Retrieving documentation for process_id {process_id}.')
1686
+
1687
+ # Retrieve business logic documentation
1688
+ query_business_logic = f"""
1689
+ CURRENT VALIDTIME
1690
+ SELECT
1691
+ BUSINESS_LOGIC_DESCRIPTION,
1692
+ ENTITY_DESCRIPTION,
1693
+ ENTITY_COLUMNS_JSON
1694
+ FROM {tdfs4ds.SCHEMA}.{tdfs4ds.DOCUMENTATION_PROCESS_BUSINESS_LOGIC}
1695
+ WHERE PROCESS_ID = '{process_id}'
1696
+ """
1697
+ result_bl = tdml.execute_sql(query_business_logic).fetchone()
1698
+ if not result_bl:
1699
+ logger_safe('warning', f'retrieve_documentation: No business logic documentation found for process_id {process_id}.')
1700
+ return {}
1701
+
1702
+ documented_sql = result_bl[0]
1703
+ entity_description = result_bl[1]
1704
+ entity_columns_json = result_bl[2]
1705
+ documented_entity_columns = json.loads(entity_columns_json) if entity_columns_json else {}
1706
+
1707
+ # Retrieve feature documentation
1708
+ query_features = f"""
1709
+ CURRENT VALIDTIME
1710
+ SELECT
1711
+ FC.FEATURE_NAME,
1712
+ DPF.FEATURE_DESCRIPTION
1713
+ FROM {tdfs4ds.SCHEMA}.{tdfs4ds.DOCUMENTATION_PROCESS_FEATURES} DPF
1714
+ INNER JOIN {tdfs4ds.SCHEMA}.{tdfs4ds.FEATURE_CATALOG_NAME} FC
1715
+ ON DPF.FEATURE_ID = FC.FEATURE_ID
1716
+ WHERE DPF.PROCESS_ID = '{process_id}'
1717
+ """
1718
+ result_features = tdml.execute_sql(query_features).fetchall()
1719
+ documented_feature_columns = {
1720
+ row[0]: row[1] for row in result_features
1721
+ }
1722
+
1723
+ logger_safe('info', f'retrieve_documentation: Successfully retrieved documentation for process_id {process_id}.')
1724
+ return {
1725
+ "DOCUMENTED_SQL" : documented_sql,
1726
+ "ENTITY_DESCRIPTION" : entity_description,
1727
+ "DOCUMENTED_ENTITY_COLUMNS" : documented_entity_columns,
1728
+ "DOCUMENTED_FEATURE_COLUMNS" : documented_feature_columns
1729
+ }
1730
+
1731
+ def retrieve_explain_documentation(process_id: str) -> Dict[str, Any]:
1732
+ """
1733
+ Retrieve the EXPLAIN documentation for a data process from the documentation tables.
1734
+
1735
+ Parameters
1736
+ ----------
1737
+ process_id : str
1738
+ The unique identifier of the data process.
1739
+
1740
+ Returns
1741
+ -------
1742
+ dict
1743
+ A dictionary containing the EXPLAIN documentation information with keys:
1744
+ - EXPLAIN_ANALYSIS: str
1745
+ - OPTIMIZATION_SCORE: int
1746
+ - EXPLAIN_WARNINGS: list[str]
1747
+ - EXPLAIN_RECOMMENDATIONS: list[str]
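+
+ Examples
+ --------
+ Illustrative call; the process id is a placeholder and assumes an EXPLAIN
+ analysis was previously uploaded.
+
+ >>> explain_docs = retrieve_explain_documentation("my_process_id")  # doctest: +SKIP
+ >>> explain_docs["OPTIMIZATION_SCORE"]  # doctest: +SKIP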
1748
+ """
1749
+ logger_safe('info', f'retrieve_explain_documentation: Retrieving EXPLAIN documentation for process_id {process_id}.')
1750
+
1751
+ query_explain = f"""
1752
+ CURRENT VALIDTIME
1753
+ SELECT
1754
+ EXPLAIN_ANALYSIS,
1755
+ OPTIMIZATION_SCORE,
1756
+ WARNINGS,
1757
+ RECOMMENDATIONS
1758
+ FROM {tdfs4ds.SCHEMA}.{tdfs4ds.DOCUMENTATION_PROCESS_EXPLAIN}
1759
+ WHERE PROCESS_ID = '{process_id}'
1760
+ """
1761
+ result_explain = tdml.execute_sql(query_explain).fetchone()
1762
+ if not result_explain:
1763
+ logger_safe('warning', f'retrieve_explain_documentation: No EXPLAIN documentation found for process_id {process_id}.')
1764
+ return {}
1765
+
1766
+ explanation = result_explain[0]
1767
+ optimization_score = result_explain[1]
1768
+ warnings = json.loads(result_explain[2]) if result_explain[2] else []
1769
+ recommendations = json.loads(result_explain[3]) if result_explain[3] else []
1770
+
1771
+ logger_safe('info', f'retrieve_explain_documentation: Successfully retrieved EXPLAIN documentation for process_id {process_id}.')
1772
+ return {
1773
+ "EXPLAIN_ANALYSIS" : explanation,
1774
+ "OPTIMIZATION_SCORE" : optimization_score,
1775
+ "EXPLAIN_WARNINGS" : warnings,
1776
+ "EXPLAIN_RECOMMENDATIONS" : recommendations
1777
+ }
1778
+
1779
+ def upload_documentation_explain(process_info: Dict[str, Any]) -> None:
1780
+ """
1781
+ Upload the EXPLAIN documentation for a data process into the documentation tables.
1782
+
1783
+ Parameters
1784
+ ----------
1785
+ process_info : dict
1786
+ A dictionary containing the process and EXPLAIN documentation information.
1787
+ Expected keys:
1788
+ - PROCESS_ID: str
1789
+ - EXPLAIN_ANALYSIS: str
1790
+ - OPTIMIZATION_SCORE: int
1791
+ - EXPLAIN_WARNINGS: list[str]
1792
+ - EXPLAIN_RECOMMENDATIONS: list[str]
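+
+ Examples
+ --------
+ Illustrative payload; all values are placeholders and assume the
+ DOCUMENTATION_PROCESS_EXPLAIN table already exists.
+
+ >>> upload_documentation_explain({  # doctest: +SKIP
+ ...     "PROCESS_ID": "my_process_id",
+ ...     "EXPLAIN_ANALYSIS": "The query performs a single AMP retrieval.",
+ ...     "OPTIMIZATION_SCORE": 8,
+ ...     "EXPLAIN_WARNINGS": [],
+ ...     "EXPLAIN_RECOMMENDATIONS": ["Collect statistics on the join columns."],
+ ... })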
1793
+ """
1794
+
1795
+ explanation = process_info['EXPLAIN_ANALYSIS']
1796
+ optimization_score = process_info['OPTIMIZATION_SCORE']
1797
+ warnings_json = json.dumps(process_info['EXPLAIN_WARNINGS'])
1798
+ recommendations_json= json.dumps(process_info['EXPLAIN_RECOMMENDATIONS'])
1799
+ process_id = process_info['PROCESS_ID']
1800
+
1801
+ # merge into DOCUMENTATION_PROCESS_EXPLAIN
1802
+ query_insert_explain = f"""
1803
+ CURRENT VALIDTIME
1804
+ MERGE INTO {tdfs4ds.SCHEMA}.{tdfs4ds.DOCUMENTATION_PROCESS_EXPLAIN} EXISTING
1805
+ USING (
1806
+ SELECT
1807
+ '{process_id}' AS PROCESS_ID,
1808
+ '{explanation.replace("'", "''")}' AS EXPLAIN_ANALYSIS,
1809
+ {optimization_score} AS OPTIMIZATION_SCORE,
1810
+ '{warnings_json.replace("'", "''")}' AS WARNINGS,
1811
+ '{recommendations_json.replace("'", "''")}' AS RECOMMENDATIONS
1812
+ ) UPDATED
1813
+ ON EXISTING.PROCESS_ID = UPDATED.PROCESS_ID
1814
+ WHEN MATCHED THEN
1815
+ UPDATE
1816
+ SET
1817
+ EXPLAIN_ANALYSIS = UPDATED.EXPLAIN_ANALYSIS,
1818
+ OPTIMIZATION_SCORE = UPDATED.OPTIMIZATION_SCORE,
1819
+ WARNINGS = UPDATED.WARNINGS,
1820
+ RECOMMENDATIONS = UPDATED.RECOMMENDATIONS
1821
+ WHEN NOT MATCHED THEN
1822
+ INSERT (
1823
+ UPDATED.PROCESS_ID,
1824
+ UPDATED.EXPLAIN_ANALYSIS,
1825
+ UPDATED.OPTIMIZATION_SCORE,
1826
+ UPDATED.WARNINGS,
1827
+ UPDATED.RECOMMENDATIONS
1828
+ )
1829
+ """
1830
+
1831
+ # Execute the merge
1832
+ try:
1833
+ tdml.execute_sql(query_insert_explain)
1834
+ logger_safe('info', f'upload_documentation_explain: Uploaded EXPLAIN documentation for process_id {process_id}.')
1835
+ except Exception as e:
1836
+ logger_safe('error', f'upload_documentation_explain: Failed to upload EXPLAIN documentation for process_id {process_id}: {e}')
1837
+ raise
1838
+
1839
+ return
1840
+
1841
+ def display_process_info(process_info: Dict[str, Any] = None, process_id : str = None) -> None:
1842
+ """
1843
+ Pretty-print the documentation and EXPLAIN analysis for a data process, either from a provided process_info dict or by retrieving it using process_id.
1844
+
1845
+ Parameters
1846
+ ----------
1847
+ process_info : dict, optional (default=None)
1848
+ A dictionary containing the process documentation information.
1849
+ If None, process_id must be provided to retrieve the information.
1850
+ process_id : str, optional (default=None)
1851
+ The unique identifier of the data process.
1852
+ If process_info is None, this parameter is used to retrieve the documentation.
1853
+ Returns
1854
+ -------
1855
+ None
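+
+ Examples
+ --------
+ Illustrative calls; the process id and the process_info dict are placeholders.
+
+ >>> display_process_info(process_id="my_process_id")  # doctest: +SKIP
+ >>> display_process_info(process_info=my_process_info_dict)  # doctest: +SKIP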
1856
+ """
1857
+
1858
+ if process_info is None:
1859
+ if process_id is None:
1860
+ raise ValueError("Either process_info or process_id must be provided.")
1861
+ logger_safe('info', f'display_process_info: Retrieving documentation for process_id {process_id}.')
1862
+ process_info = get_process_info(process_id)
+ # get_process_info returns catalog metadata only; merge in the stored documentation and EXPLAIN analysis.
+ process_info.update(retrieve_documentation(process_id))
+ process_info.update(retrieve_explain_documentation(process_id))
1863
+
1864
+ # pretty print documentation for info:
1865
+ _print_documentation(
1866
+ documented_sql = process_info.get('DOCUMENTED_SQL', None),
1867
+ entity_description = process_info.get('ENTITY_DESCRIPTION', None),
1868
+ documented_entity_columns = process_info.get('DOCUMENTED_ENTITY_COLUMNS', None),
1869
+ documented_feature_columns = process_info.get('DOCUMENTED_FEATURE_COLUMNS', None),
1870
+ process_id = process_info.get('PROCESS_ID', None),
1871
+ view_name = process_info.get('VIEW_NAME', None),
1872
+ explain_analysis = process_info.get('EXPLAIN_ANALYSIS', None),
1873
+ optimization_score = process_info.get('OPTIMIZATION_SCORE', None),
1874
+ explain_warnings = process_info.get('EXPLAIN_WARNINGS', None),
1875
+ explain_recommendations = process_info.get('EXPLAIN_RECOMMENDATIONS', None),
1876
+ sql_query = process_info.get('PROCESS_SQL', None),
1877
+ )
1878
+ return