tdfs4ds 0.2.4.47__py3-none-any.whl → 0.2.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tdfs4ds/__init__.py +216 -40
- tdfs4ds/feature_store/feature_data_processing.py +9 -28
- tdfs4ds/feature_store/feature_store_management.py +1 -1
- tdfs4ds/genai/__init__.py +27 -351
- tdfs4ds/genai/documentation.py +1877 -0
- tdfs4ds/process_store/process_store_catalog_management.py +77 -24
- tdfs4ds/utils/filter_management.py +21 -12
- tdfs4ds/utils/time_management.py +22 -12
- {tdfs4ds-0.2.4.47.dist-info → tdfs4ds-0.2.5.0.dist-info}/METADATA +1 -1
- {tdfs4ds-0.2.4.47.dist-info → tdfs4ds-0.2.5.0.dist-info}/RECORD +12 -19
- tdfs/__init__.py +0 -1
- tdfs/data/curves.csv +0 -5086
- tdfs/datasets.py +0 -27
- tdfs/feature_store.py +0 -723
- tdfs4ds/feature_engineering.py +0 -152
- tdfs4ds/feature_store.py +0 -1529
- tdfs4ds/process_store.py +0 -387
- tdfs4ds/utils.py +0 -579
- {tdfs4ds-0.2.4.47.dist-info → tdfs4ds-0.2.5.0.dist-info}/WHEEL +0 -0
- {tdfs4ds-0.2.4.47.dist-info → tdfs4ds-0.2.5.0.dist-info}/top_level.txt +0 -0
tdfs4ds/genai/documentation.py (new file)
@@ -0,0 +1,1877 @@
from typing import Sequence, Optional, Dict, Any, List
import textwrap

from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.runnables import Runnable, RunnableLambda
from langchain_core.messages import AIMessage
from IPython.display import HTML, display

import tdfs4ds
from tdfs4ds import logger_safe

import teradataml as tdml
import json
import ast
import re
import sqlparse

from teradataml.context.context import _get_database_username
import pandas as pd

from tdfs4ds.process_store.process_store_catalog_management import get_process_info

def _robust_json_parser(response: str) -> Dict[str, Any]:
    """
    Robustly extract and parse JSON from LLM responses.
    Handles markdown code fences, escaped characters, and formatting variations.

    Parameters
    ----------
    response : str
        The raw response string from the LLM.

    Returns
    -------
    dict
        The parsed JSON as a dictionary.

    Raises
    ------
    ValueError
        If JSON cannot be extracted or parsed from the response.
    """
    if not isinstance(response, str):
        raise ValueError(f"Expected string response, got {type(response)}")

    # Try 1: Direct JSON parse (response might already be clean)
    try:
        return json.loads(response.strip())
    except json.JSONDecodeError:
        pass

    # Try 2: Extract from markdown code fences (most flexible)
    # Match opening backticks (with optional json language specifier) and closing backticks
    # Using non-greedy matching with DOTALL to handle multiline content
    markdown_patterns = [
        r'```(?:json)?\s*\n(.*)\n```',         # ```json\n...\n``` (any content in middle)
        r'```(?:json)?\s*\r?\n(.*?)\r?\n```',  # Handle Windows line endings
        r'```(?:json)?\s*(.*?)\s*```',         # ```...``` (flexible whitespace)
        r'`{3}\s*(?:json)?\s*(.*?)\s*`{3}',    # Alternative triple backticks
    ]
    for pattern in markdown_patterns:
        match = re.search(pattern, response, re.DOTALL | re.IGNORECASE)
        if match:
            try:
                extracted = match.group(1).strip()
                # Normalize line endings
                extracted = extracted.replace('\r\n', '\n').replace('\r', '\n')
                if extracted:  # Only try if we got something
                    return json.loads(extracted)
            except (json.JSONDecodeError, IndexError):
                pass

    # Try 3: Extract first { ... } block (handles extra text before/after)
    first_brace = response.find('{')
    last_brace = response.rfind('}')
    if first_brace != -1 and last_brace > first_brace:
        try:
            extracted = response[first_brace:last_brace+1]
            # Normalize line endings
            extracted = extracted.replace('\r\n', '\n').replace('\r', '\n')
            return json.loads(extracted)
        except json.JSONDecodeError:
            pass

    # Try 4: Remove markdown fences and retry
    # Aggressively strip all markdown code fence markers
    cleaned = response.strip()
    cleaned = re.sub(r'^```\s*(?:json)?\s*', '', cleaned, flags=re.IGNORECASE)
    cleaned = re.sub(r'\s*```\s*$', '', cleaned)
    cleaned = re.sub(r'^`+\s*', '', cleaned)
    cleaned = re.sub(r'\s*`+$', '', cleaned)
    cleaned = cleaned.strip()
    # Normalize line endings
    cleaned = cleaned.replace('\r\n', '\n').replace('\r', '\n')
    try:
        return json.loads(cleaned)
    except json.JSONDecodeError:
        pass

    # Try 5: As a last resort, try ast.literal_eval (for Python-like dicts)
    try:
        import ast
        # Normalize line endings
        cleaned = cleaned.replace('\r\n', '\n').replace('\r', '\n')
        return ast.literal_eval(cleaned)
    except (ValueError, SyntaxError):
        pass

    # If all else fails, raise informative error
    logger_safe('error', f'Failed to parse JSON from LLM response. Full response: {response}')
    raise ValueError(f"Could not extract valid JSON from response. First 200 chars: {response[:200]}")

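# --- Editor's illustrative sketch (not part of the released module) ---
# Minimal check of _robust_json_parser on a typical fenced LLM reply; the sample
# payload is invented for illustration only.
def _demo_robust_json_parser() -> None:
    fenced_reply = '```json\n{"customer_id": "Primary key of the customer entity."}\n```'
    parsed = _robust_json_parser(fenced_reply)
    assert parsed == {"customer_id": "Primary key of the customer entity."}
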
# HTML Styling Constants
HTML_STYLES = {
    "container": "font-family: Arial, sans-serif; margin: 10px 0;",
    "title": "color: #1f618d; margin-bottom: 6px;",
    "heading": "color: #2c3e50; border-bottom: 2px solid #3498db; padding-bottom: 5px;",
    "heading_margin": "color: #2c3e50; border-bottom: 2px solid #3498db; padding-bottom: 5px; margin-top: 15px;",
    "content": "background-color: #ecf0f1; padding: 10px; border-radius: 5px; line-height: 1.6;",
    "list": "background-color: #ecf0f1; padding: 15px 30px; border-radius: 5px; line-height: 1.8;",
}


def _is_notebook() -> bool:
    """Check if code is running in a Jupyter notebook."""
    try:
        # Check if IPython is available
        from IPython import get_ipython
        ipython = get_ipython()
        if ipython is None:
            return False

        # Check for notebook kernel
        if hasattr(ipython, 'kernel') and ipython.kernel is not None:
            return True

        # Check config for IPKernelApp (notebook kernel)
        if hasattr(ipython, 'config') and 'IPKernelApp' in ipython.config:
            return True

        # Check if it's a ZMQInteractiveShell (notebook shell)
        if ipython.__class__.__name__ == 'ZMQInteractiveShell':
            return True

        # Check for ipykernel in sys.modules
        import sys
        if 'ipykernel' in sys.modules:
            return True

        return False
    except (ImportError, AttributeError, Exception):
        return False

def _build_provider_llm_caller(llm: ChatOpenAI, provider: str, schema: Optional[Dict] = None):
    """
    Build a provider-specific LLM call wrapper for constrained output.

    Parameters
    ----------
    llm : ChatOpenAI
        The language model interface.
    provider : str
        The LLM provider (vllm, openai, ollama, azure, etc).
    schema : dict, optional
        JSON schema for constrained output.

    Returns
    -------
    callable
        A function that invokes the LLM with appropriate constraints.
    """
    if schema is None:
        return lambda messages: llm.invoke(messages)

    provider_l = provider.lower()

    if provider_l in ("vllm", "openai-compatible"):
        return lambda messages: llm.invoke(messages, extra_body={"guided_json": schema})

    if provider_l in ("openai", "azure", "azure-openai"):
        return lambda messages: llm.invoke(messages, response_format=schema)

    if provider_l == "ollama":
        return lambda messages: llm.invoke(messages, format=schema)

    # Fallback: no constraints
    return lambda messages: llm.invoke(messages)

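# --- Editor's illustrative sketch (not part of the released module) ---
# Shows how the provider-specific caller is meant to be used: the same messages are
# passed through and only the constrained-decoding keyword changes per provider.
# The endpoint, API key, and model name below are placeholders, not package values.
def _demo_provider_caller() -> None:
    llm = build_llm(llm_service="http://localhost:8000/v1", api_key="dummy", model_id="my-model")
    schema = {"type": "object", "properties": {"answer": {"type": "string"}}, "required": ["answer"]}
    call_llm = _build_provider_llm_caller(llm, "vllm", schema)
    # call_llm(messages) would run llm.invoke(messages, extra_body={"guided_json": schema})
    assert callable(call_llm)
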
def _print_documentation(
    documented_sql: str,
    entity_description: str,
    documented_entity_columns: Dict[str, str],
    documented_feature_columns: Dict[str, str],
    process_id: Optional[str] = None,
    view_name: Optional[str] = None,
    explain_analysis: Optional[str] = None,
    optimization_score: Optional[int] = None,
    explain_warnings: Optional[List[str]] = None,
    explain_recommendations: Optional[List[str]] = None,
    sql_query: Optional[str] = None,
    explain_plan: Optional[str] = None,
) -> None:
    """
    Pretty print documentation with context-aware formatting.
    Uses HTML in notebooks, text format in regular scripts.

    Parameters
    ----------
    documented_sql : str
        The query business logic description.
    entity_description : str
        The entity description.
    documented_entity_columns : dict
        Mapping of entity column names to descriptions.
    documented_feature_columns : dict
        Mapping of feature column names to descriptions.
    process_id : str, optional
        The process identifier for the title.
    view_name : str, optional
        The view name for the title.
    explain_analysis : str, optional
        The EXPLAIN plan analysis description.
    optimization_score : int, optional
        Optimization score from 1 to 5.
    explain_warnings : list, optional
        List of warnings from EXPLAIN analysis.
    explain_recommendations : list, optional
        List of recommendations from EXPLAIN analysis.
    sql_query : str, optional
        The original SQL query to display.
    explain_plan : str, optional
        The raw EXPLAIN plan output to display.
    """
    title = ''
    if process_id or view_name:
        title_parts = []
        if process_id:
            title_parts.append(f"Process: {process_id}")
        if view_name:
            title_parts.append(f"View: {view_name}")
        title = ' — '.join(title_parts)

    # Helpers to parse structured items and clean markdown (available in both contexts)
    def _try_parse_structured(value):
        if value is None:
            return None
        if isinstance(value, (dict, list)):
            return value
        if not isinstance(value, str):
            return value
        s = value.strip()
        # Try JSON
        try:
            return json.loads(s)
        except Exception:
            pass
        # Try Python literal
        try:
            return ast.literal_eval(s)
        except Exception:
            pass
        return s

    def _flatten_to_list(parsed):
        if parsed is None:
            return []
        if isinstance(parsed, list):
            out = []
            for it in parsed:
                out.extend(_flatten_to_list(it))
            return out
        if isinstance(parsed, dict):
            # Prefer obvious value keys, else format key: value pairs
            for k in ("issue", "warning", "action", "recommendation", "msg", "message"):
                if k in parsed:
                    return [str(parsed[k])]
            return ["; ".join(f"{kk}: {vv}" for kk, vv in parsed.items())]
        return [str(parsed)]

    def _strip_md(s: str) -> str:
        # Remove **bold** and inline markdown emphasis for plain text
        s = re.sub(r"\*\*(.*?)\*\*", r"\1", s)
        s = re.sub(r"\*(.*?)\*", r"\1", s)
        return s

    def _md_to_html(s: str) -> str:
        # Convert **bold** to <strong> and *emphasis* to <em>
        s = re.sub(r"\*\*(.*?)\*\*", r"<strong>\1</strong>", s)
        s = re.sub(r"\*(.*?)\*", r"<em>\1</em>", s)
        # Escape stray < and > to avoid broken HTML (keep basic newlines)
        s = s.replace("<", "&lt;").replace(">", "&gt;")
        # Restore our strong/em tags
        s = s.replace("&lt;strong&gt;", "<strong>").replace("&lt;/strong&gt;", "</strong>")
        s = s.replace("&lt;em&gt;", "<em>").replace("&lt;/em&gt;", "</em>")
        return s

    # Build EXPLAIN section if available
    parsed_explain = _try_parse_structured(explain_analysis)
    parsed_warnings = _try_parse_structured(explain_warnings)
    parsed_recs = _try_parse_structured(explain_recommendations)

    warn_list = _flatten_to_list(parsed_warnings)
    rec_list = _flatten_to_list(parsed_recs)

    explain_section = ""
    if parsed_explain or optimization_score or warn_list or rec_list:
        score_color = "#27ae60" if optimization_score and optimization_score >= 4 else "#f39c12" if optimization_score and optimization_score == 3 else "#e74c3c"
        explain_section = f"""
        <h3 style="{HTML_STYLES['heading_margin']}">Query Optimization Analysis</h3>
        <div style="background-color: #ecf0f1; padding: 10px; border-radius: 5px; margin-bottom: 10px;">
            <p><strong>Optimization Score:</strong> <span style="color: {score_color}; font-size: 18px; font-weight: bold;">{optimization_score}/5</span></p>
        </div>
        """

    if parsed_explain:
        # Display explanation as plain text, preserving newlines
        explain_text = parsed_explain if isinstance(parsed_explain, str) else str(parsed_explain)
        explain_text_html = explain_text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('\n', '<br>')
        explain_section += f'<div style="{HTML_STYLES["content"]}">{explain_text_html}</div>'

    if warn_list:
        warnings_html = '\n'.join(
            '<li style="color: #c0392b;">{}</li>'.format(_md_to_html(w).replace('\n', '<br>'))
            for w in warn_list
        )
        explain_section += f"""
        <h4 style="color: #c0392b; margin-top: 10px;">⚠ Warnings</h4>
        <ul style="{HTML_STYLES['list']}">{warnings_html}</ul>
        """

    if rec_list:
        recommendations_html = '\n'.join(
            '<li style="color: #27ae60;">{}</li>'.format(_md_to_html(r).replace('\n', '<br>'))
            for r in rec_list
        )
        explain_section += f"""
        <h4 style="color: #27ae60; margin-top: 10px;">✓ Recommendations</h4>
        <ul style="{HTML_STYLES['list']}">{recommendations_html}</ul>
        """

    if _is_notebook():
        title_html = f"<h2>{title}</h2>" if title else ""
        entity_items = '\n'.join(f'<li><strong>{col}:</strong> {_md_to_html(desc)}</li>' for col, desc in documented_entity_columns.items())
        feature_items = '\n'.join(f'<li><strong>{col}:</strong> {_md_to_html(desc)}</li>' for col, desc in documented_feature_columns.items())

        # Build optional sections
        sql_section = ""
        if sql_query:
            formatted_sql = sqlparse.format(sql_query, reindent=True, keyword_case='upper')
            sql_section = f"""
            <h3 style="{HTML_STYLES['heading_margin']}">Original SQL Query</h3>
            <pre style="background-color: #f8f9fa; padding: 15px; border-radius: 5px; border: 1px solid #dee2e6; font-family: 'Courier New', monospace; font-size: 12px; overflow-x: auto; white-space: pre-wrap;">{formatted_sql}</pre>
            """

        explain_plan_section = ""
        if explain_plan:
            explain_plan_section = f"""
            <h3 style="{HTML_STYLES['heading_margin']}">EXPLAIN Plan</h3>
            <pre style="background-color: #f8f9fa; padding: 15px; border-radius: 5px; border: 1px solid #dee2e6; font-family: 'Courier New', monospace; font-size: 12px; overflow-x: auto; white-space: pre-wrap;">{explain_plan}</pre>
            """

        html_content = f"""
        <div style="{HTML_STYLES['container']}">
            {title_html}
            <h3 style="{HTML_STYLES['heading']}">Query Business Logic</h3>
            <p style="{HTML_STYLES['content']}">{documented_sql}</p>

            <h3 style="{HTML_STYLES['heading_margin']}">Entity Description</h3>
            <p style="{HTML_STYLES['content']}">{entity_description}</p>

            <h3 style="{HTML_STYLES['heading_margin']}">Entity Columns</h3>
            <ul style="{HTML_STYLES['list']}">{entity_items}</ul>

            <h3 style="{HTML_STYLES['heading_margin']}">Feature Columns</h3>
            <ul style="{HTML_STYLES['list']}">{feature_items}</ul>

            {explain_section}
            {sql_section}
            {explain_plan_section}
        </div>
        """
        display(HTML(html_content))
    else:
        # Text formatting for regular scripts
        print("\n" + "="*100)
        print(title if title else "DOCUMENTATION")
        print("="*100)
        print("\nQuery Business Logic:")
        print(textwrap.fill(documented_sql, width=100))

        print("\nEntity Description:")
        print(textwrap.fill(entity_description, width=100))

        print("\nEntity Columns Documentation:")
        for col, desc in documented_entity_columns.items():
            print(f"\n {col}:")
            print(textwrap.fill(desc, width=95, initial_indent=" ", subsequent_indent=" "))

        print("\nFeature Columns Documentation:")
        for col, desc in documented_feature_columns.items():
            print(f"\n {col}:")
            print(textwrap.fill(desc, width=95, initial_indent=" ", subsequent_indent=" "))

        # Print EXPLAIN analysis if available
        if explain_analysis or optimization_score or explain_warnings or explain_recommendations:
            print("\n" + "-"*100)
            print("QUERY OPTIMIZATION ANALYSIS")
            print("-"*100)

            if optimization_score:
                print(f"Optimization Score: {optimization_score}/5")

            # Print parsed explanation, preserving carriage returns.
            if parsed_explain:
                print("\nExplanation:")
                if isinstance(parsed_explain, str):
                    print(parsed_explain)
                else:
                    print(str(parsed_explain))

            # Print warnings (flattened) preserving carriage returns
            if warn_list:
                print("\nWarnings:")
                for w in warn_list:
                    print(f" - {w}")

            # Print recommendations (flattened) preserving carriage returns
            if rec_list:
                print("\nRecommendations:")
                for r in rec_list:
                    print(f" - {r}")

        # Print original SQL query if provided
        if sql_query:
            print("\n" + "-"*100)
            print("ORIGINAL SQL QUERY")
            print("-"*100)
            formatted_sql = sqlparse.format(sql_query, reindent=True, keyword_case='upper')
            print(textwrap.indent(formatted_sql, ' '))

        # Print EXPLAIN plan if provided
        if explain_plan:
            print("\n" + "-"*100)
            print("EXPLAIN PLAN")
            print("-"*100)
            print(explain_plan)

        print("\n" + "="*100 + "\n")

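# --- Editor's illustrative sketch (not part of the released module) ---
# Renders a tiny documentation payload: HTML when running in a notebook, plain text
# otherwise. All values below are invented sample data.
def _demo_print_documentation() -> None:
    _print_documentation(
        documented_sql="Aggregates order amounts per customer.",
        entity_description="The customer entity groups all orders belonging to one buyer.",
        documented_entity_columns={"customer_id": "Identifies the customer."},
        documented_feature_columns={"total_amount": "Sum of order amounts for the customer."},
        process_id="demo-process",
        view_name="demo_view",
    )
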
def build_llm(
    llm_service: str = "https://api-dmproject.myddns.me/v1",
    api_key: str = "YOUR_API_KEY_HERE",
    model_id: str = "mistralai/Ministral-3-14B-Instruct-2512",
    temperature: float = 0.0,
    timeout: int = 120,
) -> ChatOpenAI:
    """
    Build and return a ChatOpenAI client pointed at your vLLM/OpenAI-compatible endpoint.

    Parameters
    ----------
    llm_service : str
        Base URL of the LLM service.
    api_key : str
        API key for authentication.
    model_id : str
        Model identifier.
    temperature : float
        Sampling temperature for response diversity.
    timeout : int
        Request timeout in seconds.

    Returns
    -------
    ChatOpenAI
        Configured LLM client.

    Raises
    ------
    Exception
        If LLM client creation fails.
    """
    logger_safe('info', f'build_llm: Using LLM service at {llm_service} with model {model_id}')
    logger_safe('debug', f'build_llm: Temperature={temperature}, Timeout={timeout}s')

    try:
        return ChatOpenAI(
            base_url=llm_service,
            api_key=api_key,
            model=model_id,
            temperature=temperature,
            timeout=timeout,
        )
    except Exception as e:
        logger_safe('error', f'build_llm: Failed to create LLM client: {e}')
        raise


from typing import Sequence

def build_documentation_json_schema(columns: List[str], provider: str = "generic") -> Dict[str, Any]:
    """
    Build a provider-appropriate JSON Schema used to enforce strict JSON output
    for SQL column documentation across multiple LLM backends.

    This function returns different schema shapes depending on the LLM provider,
    because each ecosystem uses a different structured-output mechanism:

    Provider Modes
    --------------
    - provider="openai", "azure"
        Returns the JSON Schema wrapped in OpenAI's `response_format={"type": "json_schema", ...}`
        structure. Supported by GPT-4.1, GPT-4o, GPT-3.5-Turbo, and Azure OpenAI.

    - provider="anthropic", "claude"
        Returns an Anthropic *tool schema* definition. Claude 3.x models use tool
        schemas to enforce strict JSON output.

    - provider="ollama"
        Returns the raw JSON schema that Ollama expects under the `format=` parameter
        of the generate API. (Ollama 0.2+ supports response schemas.)

    - provider="vllm"
        Returns plain JSON Schema for use with vLLM's `guided_json` constrained decoding.

    - provider="bedrock"
        Bedrock Claude follows the Anthropic tool schema format.
        Bedrock Llama / Titan accept plain JSON schema. This function returns the base
        schema and leaves the final wrapping to the caller.

    - provider="generic"
        Returns plain JSON schema. Useful for LLM backends that do not support
        constrained decoding, prompt-only JSON generation, or post-processing repair.

    Parameters
    ----------
    columns : list[str]
        Column names to include as required JSON object keys. Each column will map
        to a string description generated by the model.

    provider : str, optional
        The model provider or backend type. Determines the structural format
        required for constrained generation. One of:
        "openai", "anthropic", "ollama", "vllm", "bedrock", "generic".

    Returns
    -------
    dict
        A dictionary representing the JSON Schema or provider-specific wrapper
        used to enforce strict JSON output during LLM generation.

    Notes
    -----
    - All schemas require that:
        * the output be a JSON object
        * keys match exactly the column names
        * all values be strings
        * additional properties be disallowed

    - Not all providers enforce schemas equally:
        * OpenAI, Claude, and vLLM offer hard guarantees.
        * Ollama enforces schema reasonably well.
        * Generic models may require post-processing.
    """
    # Base JSON schema — used directly by vLLM, Ollama, Bedrock, fallback
    base_schema = {
        "type": "object",
        "properties": {col: {"type": "string"} for col in columns},
        "required": list(columns),
        "additionalProperties": False,
    }

    # --- Provider-specific formats ---

    if provider.lower() in ("openai", "azure", "azure-openai"):
        # OpenAI's required wrapper structure
        return {
            "type": "json_schema",
            "json_schema": {
                "name": "ColumnDocumentation",
                "schema": base_schema,
                "strict": True,
            }
        }

    if provider.lower() in ("anthropic", "claude"):
        # Anthropic tool schema
        # You embed this inside the "tools" field when calling the model
        return {
            "name": "column_documentation",
            "description": "Generate documentation for SQL output columns.",
            "input_schema": base_schema
        }

    if provider.lower() == "ollama":
        # Ollama's output format schema (unwrapped JSON schema)
        # Returned directly in: generate(..., format=schema)
        return base_schema

    if provider.lower() in ("vllm", "openai-compatible"):
        # vLLM's guided_json uses *plain JSON Schema*
        # so return base_schema exactly
        return base_schema

    if provider.lower() == "bedrock":
        # Bedrock Claude uses Anthropic schema
        # Bedrock Llama uses plain JSON schema
        # Return base_schema and let caller choose
        return base_schema

    # Fallback: generic JSON schema
    return base_schema

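# --- Editor's illustrative sketch (not part of the released module) ---
# For the vllm/generic providers the helper returns a plain JSON Schema whose required
# keys are exactly the requested columns; the column names here are made up.
def _demo_documentation_schema() -> None:
    schema = build_documentation_json_schema(["customer_id", "total_spend"], provider="vllm")
    assert schema["type"] == "object"
    assert set(schema["required"]) == {"customer_id", "total_spend"}
    assert schema["additionalProperties"] is False
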
def build_sql_documentation_chain(
    llm: ChatOpenAI,
    entity_columns: Sequence[str],
    feature_columns: Sequence[str],
    provider: str = "vllm",
    json_constraint: bool = True,
) -> Runnable:
    """
    Build a LangChain Runnable that generates business-focused documentation
    for lists of entity and feature columns from a SQL query output, with optional
    provider-specific JSON constraints (vLLM, OpenAI, Ollama, etc.).

    The resulting chain expects two input variables:
    - sql_query: str → the SQL query whose output is being documented
    - columns_str: str → formatted list of entity and feature columns (e.g. "Entity columns:\n- col1\n\nFeature columns:\n- col2")

    Parameters
    ----------
    llm : ChatOpenAI
        The language model interface (may point to vLLM, OpenAI, Ollama, etc.).
    entity_columns : Sequence[str]
        List of entity/identifier columns that must appear as keys in the output JSON.
    feature_columns : Sequence[str]
        List of feature columns that must appear as keys in the output JSON.
    provider : str, optional (default="vllm")
        Indicates which structured-output mechanism to use.
        Supported values:
        - "vllm" → uses `guided_json` for strict JSON output
        - "openai" / "azure" → uses OpenAI JSON Schema via `response_format`
        - "ollama" → uses Ollama's `format=` schema
        - "openai-compatible" → alias for vLLM-style guided decoding
        - any other value → fall back to unconstrained text output
    json_constraint : bool, optional (default=True)
        If True:
        - a JSON Schema is generated from the column lists
        - provider-specific constrained decoding is applied
        If False:
        - the chain does not enforce JSON structure at the LLM level
        - the model is only guided by the prompt (weaker guarantees)

    Returns
    -------
    Runnable
        A LangChain Runnable that executes:
            prompt → LLM (optionally schema-guided) → JSON parser

        When invoked with:
            {
                "sql_query": "...",
                "columns_str": "Entity columns:\n- column1\n\nFeature columns:\n- column2\n..."
            }

        It returns:
            dict[str, str]
            A mapping of each requested column name to a short,
            business-oriented description (≤ 5 sentences), plus a 'query_business_logic' key
            containing a high-level description of the query's business logic (5-10 sentences),
            and an 'entity_description' key with a holistic description of the entity (3-5 sentences).

    Notes
    -----
    - The chain enforces valid JSON when possible:
        * vLLM → `guided_json`
        * OpenAI → `response_format={"type": "json_schema", ...}`
        * Ollama → `format=<schema>`
    - For unsupported providers, the model may emit imperfect JSON.
    - Descriptions focus on business meaning, business logic,
      and optionally technical details only when relevant.
    """
    all_columns = entity_columns + feature_columns + ["query_business_logic", "entity_description"]
    logger_safe('info', f'build_sql_documentation_chain: Building chain for provider {provider}, json_constraint={json_constraint}, entity_columns={list(entity_columns)}, feature_columns={list(feature_columns)}')
    prompt = ChatPromptTemplate.from_template(
        """
        You are a data documentation assistant.

        Your target audience is business users.
        Your explanations must focus primarily on the business meaning and business logic of each column,
        and you may add technical details only when they meaningfully clarify the business context.

        Given:
        1. A SQL query.
        2. Lists of entity and feature columns that must be documented.

        Your job:
        - For entity columns: Provide a brief 1-sentence description of how this column contributes to identifying the entity described holistically under 'entity_description'. Do not repeat the full entity description here.
        - For feature columns: Write a clear and concise explanation of what the column represents from a business perspective, describing the business logic behind how the value is derived or used within the context of the SQL query.
        - Add technical details only if relevant and only to help a business audience understand the concept.
        - Each description must be at most 5 sentences.
        - Do not include any columns that are not in the provided lists.
        - If a column name is ambiguous, infer its meaning from the SQL query as best as possible and say so.
        - If you cannot infer anything meaningful, state that clearly (still within 3 sentences).
        - Additionally, provide a high-level description of the business logic of the SQL query itself under the key 'query_business_logic'. This should explain what the query does from a business perspective, including the main purpose, data sources, transformations, and business value. Keep it to 5-10 sentences.
        - Additionally, provide a description of the entity as a whole under the key 'entity_description'. This should describe the business object that the entity columns collectively identify, noting that this is the object the features describe. Keep it to 3-5 sentences.
        - Avoid using double quotes (") in the explanation text; use single quotes or rephrase to prevent JSON parsing errors.
        - Answer in {language}
        Output format (very important):
        - Return ONLY a valid JSON object.
        - Each top-level key must be exactly the column name or 'query_business_logic'.
        - Each value must be a single string with the description.

        Example of the required format:
        {{
        "customer_id": "This column serves as the primary key for identifying individual customers in the entity.",
        "order_date": "The business date when the order was created. It represents the transaction date used for reporting and may reflect the source system's timestamp.",
        "entity_description": "The customer entity represents individual buyers in the business system, identified by customer_id and described by features like order history and demographics. This entity is the core object that the feature columns characterize for analysis and decision-making.",
        "query_business_logic": "This query joins customer and order data to provide a comprehensive view of customer orders. It filters orders from 2024 onwards to focus on recent activity. The result helps business users understand customer purchasing patterns and regional distribution."
        }}

        Now generate documentation.

        SQL query:
        ```sql
        {sql_query}
        ```
        Columns to document (only document these):
        {columns_str}
        """
    )
    parser = JsonOutputParser()
    if not json_constraint:
        return prompt | llm | parser

    schema = build_documentation_json_schema(all_columns, provider=provider)
    logger_safe('debug', f'build_sql_documentation_chain: Using provider {provider} with json_constraint={json_constraint}')

    # Use helper to build provider-specific LLM caller
    call_llm = _build_provider_llm_caller(llm, provider, schema)
    constrained_llm = RunnableLambda(call_llm)

    # Final chain: prompt -> LLM (schema-guided) -> JSON parser
    def _parse(ai_msg: AIMessage):
        raw = ai_msg.content
        return parser.parse(raw)

    return prompt | constrained_llm | RunnableLambda(_parse)

def run_sql_documentation(
    chain: Runnable,
    sql_query: str,
    entity_columns: Sequence[str],
    feature_columns: Sequence[str],
    language: str = "English",
) -> Dict[str, str]:
    """
    Execute a previously constructed SQL-documentation chain and return
    business-friendly documentation for the specified SQL output columns.

    This function prepares the chain inputs (SQL query, formatted column list,
    target language) and invokes the chain. The chain itself must have been
    created using `build_sql_documentation_chain()`, which ensures the model
    produces structured JSON suitable for parsing.

    Parameters
    ----------
    chain : Runnable
        A LangChain Runnable returned by `build_sql_documentation_chain()`.
        This Runnable encapsulates:
        - the prompt template
        - a provider-specific LLM invocation (with or without JSON constraints)
        - a JSON output parser

    sql_query : str
        The SQL query whose resulting columns should be documented. This query is
        shown to the model so it can infer business logic, derivation rules, and
        column meaning.

    entity_columns : Sequence[str]
        The list of entity/identifier column names that must appear as keys in the output JSON.
        Only these columns will be documented. The order does not matter.

    feature_columns : Sequence[str]
        The list of feature column names that must appear as keys in the output JSON.
        Only these columns will be documented. The order does not matter.

    language : str, optional (default="English")
        The target output language for the generated documentation.
        This value is passed into the prompt's `{language}` variable.
        Examples: "English", "French", "German", "Spanish", "Japanese".

    Returns
    -------
    dict[str, str]
        A dictionary mapping each column name to a human-readable, business-oriented
        description generated by the model, plus a 'query_business_logic' key
        with the query's business logic description, and an 'entity_description' key
        with the holistic entity description. Example:
        {
            "customer_id": "Unique customer identifier used for ...",
            "order_date": "Business date when the order was created ...",
            "entity_description": "The customer entity represents...",
            "query_business_logic": "This query provides a view of ..."
        }

    Notes
    -----
    - The output format is determined by the chain's JSON parser. If the model
      fails to produce valid JSON (e.g., due to unsupported constraints),
      an `OutputParserException` may be raised.
    - The resulting descriptions are typically ≤ 5 sentences per column, unless
      modified in the chain's prompt.
    """
    logger_safe('info', f'run_sql_documentation: Starting documentation for {len(entity_columns)} entity columns and {len(feature_columns)} feature columns in {language}')
    columns_str = "Entity columns:\n" + "\n".join(f"- {col}" for col in entity_columns) + "\n\nFeature columns:\n" + "\n".join(f"- {col}" for col in feature_columns)

    try:
        result = chain.invoke({
            "sql_query": sql_query,
            "columns_str": columns_str,
            "language": language
        })
        logger_safe('info', f'run_sql_documentation: Successfully generated documentation for columns: {list(result.keys())}')
        return result
    except Exception as e:
        logger_safe('error', f'run_sql_documentation: Failed to generate documentation: {e}')
        raise

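# --- Editor's illustrative sketch (not part of the released module) ---
# The two-step API: build the chain once, then run it per query. Endpoint, model,
# query, and column names are placeholders; a reachable LLM service is required, so
# the function is defined here but not executed at import time.
def _demo_two_step_documentation() -> Dict[str, str]:
    llm = build_llm(llm_service="http://localhost:8000/v1", api_key="dummy", model_id="my-model")
    chain = build_sql_documentation_chain(llm, ["customer_id"], ["total_spend"], provider="vllm")
    return run_sql_documentation(
        chain,
        sql_query="SELECT customer_id, SUM(amount) AS total_spend FROM orders GROUP BY customer_id",
        entity_columns=["customer_id"],
        feature_columns=["total_spend"],
        language="English",
    )
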
def document_sql_query_columns(
    sql_query: str,
    entity_columns: Sequence[str],
    feature_columns: Sequence[str],
    language: str = "English",
    provider: Optional[str] = None,
    json_constraint: bool = True,
) -> Dict[str, Any]:
    """
    Convenience function to generate business-focused documentation for SQL query output columns
    using the configured instruction model from tdfs4ds settings.

    This function automatically builds the LLM client using the tdfs4ds configuration variables
    (INSTRUCT_MODEL_URL, INSTRUCT_MODEL_API_KEY, INSTRUCT_MODEL_MODEL), constructs the documentation
    chain, and executes it to produce column descriptions.

    Parameters
    ----------
    sql_query : str
        The SQL query whose resulting columns should be documented. This query is
        shown to the model so it can infer business logic, derivation rules, and
        column meaning.

    entity_columns : Sequence[str]
        The list of entity/identifier column names that must appear as keys in the output JSON.
        Only these columns will be documented. The order does not matter.

    feature_columns : Sequence[str]
        The list of feature column names that must appear as keys in the output JSON.
        Only these columns will be documented. The order does not matter.

    language : str, optional (default="English")
        The target output language for the generated documentation.
        This value is passed into the prompt's {language} variable.
        Examples: "English", "French", "German", "Spanish", "Japanese".

    provider : str, optional (default=None)
        Indicates which structured-output mechanism to use for the LLM.
        If None, uses INSTRUCT_MODEL_PROVIDER from tdfs4ds config.
        Supported values:
        - "vllm" → uses `guided_json` for strict JSON output
        - "openai" / "azure" → uses OpenAI JSON Schema via `response_format`
        - "ollama" → uses Ollama's `format=` schema
        - "openai-compatible" → alias for vLLM-style guided decoding
        - any other value → fall back to unconstrained text output

    json_constraint : bool, optional (default=True)
        If True:
        - a JSON Schema is generated from the column lists
        - provider-specific constrained decoding is applied
        If False:
        - the chain does not enforce JSON structure at the LLM level
        - the model is only guided by the prompt (weaker guarantees)

    Returns
    -------
    dict
        A dictionary with four keys:
        - "query_business_logic": str containing the high-level business logic description of the query
        - "entity_description": str containing the holistic description of the entity
        - "entity_columns": dict[str, str] mapping each entity column name to its description
        - "feature_columns": dict[str, str] mapping each feature column name to its description

    Raises
    ------
    ValueError
        If any of the required tdfs4ds configuration variables (INSTRUCT_MODEL_URL,
        INSTRUCT_MODEL_API_KEY, INSTRUCT_MODEL_MODEL) are not set.

    Notes
    -----
    - This function requires that the tdfs4ds instruction model configuration is properly set.
    - The resulting descriptions are typically ≤ 5 sentences per column, focusing on
      business meaning and logic.
    - If the model fails to produce valid JSON, an exception will be raised.
    """
    # Import the configuration variables
    from tdfs4ds import INSTRUCT_MODEL_URL, INSTRUCT_MODEL_API_KEY, INSTRUCT_MODEL_MODEL, INSTRUCT_MODEL_PROVIDER

    # Validate configuration
    if not INSTRUCT_MODEL_URL or not INSTRUCT_MODEL_API_KEY or not INSTRUCT_MODEL_MODEL or not INSTRUCT_MODEL_PROVIDER:
        raise ValueError(
            "tdfs4ds instruction model configuration is incomplete. Please ensure "
            "INSTRUCT_MODEL_URL, INSTRUCT_MODEL_API_KEY, INSTRUCT_MODEL_MODEL, and INSTRUCT_MODEL_PROVIDER are set."
        )

    logger_safe('info', f'document_sql_query_columns: Starting documentation for {len(entity_columns)} entity columns and {len(feature_columns)} feature columns in {language}')

    if provider is None:
        provider = INSTRUCT_MODEL_PROVIDER

    # Build the LLM client
    llm = build_llm(
        llm_service=INSTRUCT_MODEL_URL,
        api_key=INSTRUCT_MODEL_API_KEY,
        model_id=INSTRUCT_MODEL_MODEL
    )

    # Build the documentation chain
    sql_doc_chain = build_sql_documentation_chain(llm, entity_columns, feature_columns, provider=provider, json_constraint=json_constraint)

    # Run the documentation
    result = run_sql_documentation(sql_doc_chain, sql_query, entity_columns, feature_columns, language=language)

    # Separate entity columns, feature columns, entity description, and query logic
    entity_docs = {k: v for k, v in result.items() if k in entity_columns}
    feature_docs = {k: v for k, v in result.items() if k in feature_columns}
    entity_desc = result.get("entity_description", "")
    query_logic = result.get("query_business_logic", "")

    logger_safe('info', f'document_sql_query_columns: Successfully completed documentation for {len(entity_docs)} entity columns, {len(feature_docs)} feature columns, entity description and query logic')
    return {
        "query_business_logic": query_logic,
        "entity_description": entity_desc,
        "entity_columns": entity_docs,
        "feature_columns": feature_docs
    }

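# --- Editor's illustrative sketch (not part of the released module) ---
# The convenience wrapper reads INSTRUCT_MODEL_URL / _API_KEY / _MODEL / _PROVIDER from
# the tdfs4ds configuration; the query and column names below are invented.
def _demo_document_sql_query_columns() -> Dict[str, Any]:
    return document_sql_query_columns(
        sql_query="SELECT customer_id, COUNT(*) AS nb_orders FROM orders GROUP BY customer_id",
        entity_columns=["customer_id"],
        feature_columns=["nb_orders"],
        language="English",
    )
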
def build_explain_documentation_chain(
    llm: ChatOpenAI,
    provider: str = "vllm",
    json_constraint: bool = True,
) -> Runnable:
    """
    Build a LangChain Runnable that analyzes SQL EXPLAIN plans and generates
    optimization scores, warnings, and recommendations.

    The resulting chain expects two input variables:
    - sql_query: str → the original SQL query
    - explain_plan: str → the EXPLAIN output from the database

    Parameters
    ----------
    llm : ChatOpenAI
        The language model interface (may point to vLLM, OpenAI, Ollama, etc.).
    provider : str, optional (default="vllm")
        Indicates which structured-output mechanism to use.
        Supported values:
        - "vllm" → uses `guided_json` for strict JSON output
        - "openai" / "azure" → uses OpenAI JSON Schema via `response_format`
        - "ollama" → uses Ollama's `format=` schema
        - "openai-compatible" → alias for vLLM-style guided decoding
        - any other value → fall back to unconstrained text output
    json_constraint : bool, optional (default=True)
        If True: a JSON Schema is generated and provider-specific constrained decoding is applied.
        If False: the chain does not enforce JSON structure at the LLM level.

    Returns
    -------
    Runnable
        A LangChain Runnable that executes:
            prompt → LLM (optionally schema-guided) → JSON parser

        When invoked with:
            {
                "sql_query": "SELECT ...",
                "explain_plan": "..."
            }

        It returns:
            dict with keys:
            - "explanation": str describing the EXPLAIN plan in business terms
            - "optimization_score": int from 1 (poorly optimized) to 5 (well optimized)
            - "warnings": list[str] of potential issues or concerns
            - "recommendations": list[str] of actionable optimization suggestions
    """
    logger_safe('info', f'build_explain_documentation_chain: Building chain for provider {provider}, json_constraint={json_constraint}')

    # JSON schema for EXPLAIN analysis output
    explain_schema = {
        "type": "object",
        "properties": {
            "explanation": {"type": "string"},
            "optimization_score": {
                "type": "integer",
                "minimum": 1,
                "maximum": 5,
                "description": "Score from 1 (poorly optimized) to 5 (well optimized)"
            },
            "warnings": {
                "type": "array",
                "items": {"type": "string"},
                "description": "List of potential issues or concerns"
            },
            "recommendations": {
                "type": "array",
                "items": {"type": "string"},
                "description": "List of actionable optimization suggestions"
            }
        },
        "required": ["explanation", "optimization_score", "warnings", "recommendations"],
        "additionalProperties": False
    }

    prompt = ChatPromptTemplate.from_template(
        """
        You are an expert SQL query optimization analyst.

        Your task is to analyze a SQL EXPLAIN plan and provide optimization guidance.

        Provide your analysis in the following JSON format with these exact keys:
        - explanation: A clear, plain-text explanation of what the EXPLAIN plan shows. Include analysis of execution strategy, estimated costs, and any visible inefficiencies.
        - optimization_score: An integer from 1 to 5 (1 = poorly optimized, 5 = well optimized)
        - warnings: An array of warning strings about potential issues
        - recommendations: An array of actionable recommendation strings for improvement

        Analysis Guidelines:
        - Focus on execution strategy, index usage, and join efficiency
        - Be detailed but business-friendly, avoiding unnecessary technical jargon
        - Consider factors like: full table scans vs index usage, join strategies, data distribution
        - Avoid using double quotes (") in the explanation text; use single quotes or rephrase to prevent JSON parsing errors

        Scoring Guidelines:
        - Score 1: Multiple full table scans, no indexes, inefficient joins
        - Score 2: Some index usage but still room for improvement, potentially expensive operations
        - Score 3: Reasonable query plan, acceptable performance, some optimization opportunities
        - Score 4: Good query plan with mostly optimized joins and indexes, minor improvements possible
        - Score 5: Excellent plan with efficient execution, proper use of indexes, optimal join strategies

        Warnings should highlight specific concerns (e.g., 'Full table scan on large table ORDERS', 'Missing index on customer_id column').
        Recommendations should be specific and actionable (e.g., 'Add index on orders.customer_id', 'Consider using a different join strategy').

        Output format (very important):
        - Return ONLY a valid JSON object.
        - Each top-level key must be exactly 'explanation', 'optimization_score', 'warnings', or 'recommendations'.
        - The 'explanation' value must be a single string.
        - The 'optimization_score' value must be an integer from 1 to 5.
        - The 'warnings' value must be an array of strings.
        - The 'recommendations' value must be an array of strings.

        Example of the required format:
        {{
        "explanation": "The EXPLAIN plan shows a nested loop join between the customers and orders tables. The query performs a full table scan on the orders table, which has an estimated 1 million rows. The join condition uses the customer_id column, but there is no index on this column in the orders table.",
        "optimization_score": 2,
        "warnings": ["Full table scan on large orders table", "Missing index on orders.customer_id"],
        "recommendations": ["Add index on orders.customer_id", "Consider using a hash join instead of nested loop"]
        }}

        SQL Query:
        ```sql
        {sql_query}
        ```

        EXPLAIN Plan:
        ```
        {explain_plan}
        ```

        Return ONLY valid JSON with the four keys above.
        """
    )
    parser = JsonOutputParser()
    if not json_constraint:
        return prompt | llm | parser

    logger_safe('debug', f'build_explain_documentation_chain: Using provider {provider} with json_constraint={json_constraint}')

    # Wrap schema for OpenAI providers
    if provider.lower() in ("openai", "azure", "azure-openai"):
        wrapped_schema = {
            "type": "json_schema",
            "json_schema": {
                "name": "ExplainAnalysis",
                "schema": explain_schema,
                "strict": True,
            }
        }
    else:
        wrapped_schema = explain_schema

    # Use helper to build provider-specific LLM caller
    call_llm = _build_provider_llm_caller(llm, provider, wrapped_schema)
    constrained_llm = RunnableLambda(call_llm)

    # Final chain: prompt -> LLM (schema-guided) -> JSON parser
    def _parse(ai_msg: AIMessage):
        raw = ai_msg.content
        return parser.parse(raw)

    return prompt | constrained_llm | RunnableLambda(_parse)

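# --- Editor's illustrative sketch (not part of the released module) ---
# Runs the EXPLAIN-analysis chain on a plan string captured elsewhere (for example from
# a Teradata EXPLAIN statement); endpoint, model, query, and plan text are placeholders,
# and run_explain_documentation is defined just below.
def _demo_explain_chain() -> Dict[str, Any]:
    llm = build_llm(llm_service="http://localhost:8000/v1", api_key="dummy", model_id="my-model")
    chain = build_explain_documentation_chain(llm, provider="vllm")
    return run_explain_documentation(
        chain,
        sql_query="SELECT * FROM orders o JOIN customers c ON o.customer_id = c.customer_id",
        explain_plan="... EXPLAIN output captured from the database ...",
    )
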
def run_explain_documentation(
    chain: Runnable,
    sql_query: str,
    explain_plan: str,
) -> Dict[str, Any]:
    """
    Execute an EXPLAIN-documentation chain and return optimization analysis.

    Parameters
    ----------
    chain : Runnable
        A LangChain Runnable returned by `build_explain_documentation_chain()`.
    sql_query : str
        The original SQL query.
    explain_plan : str
        The EXPLAIN output from the database.

    Returns
    -------
    dict
        A dictionary with keys: "explanation", "optimization_score", "warnings", "recommendations"
    """
    logger_safe('info', 'run_explain_documentation: Starting EXPLAIN analysis')

    try:
        result = chain.invoke({
            "sql_query": sql_query,
            "explain_plan": explain_plan
        })
        logger_safe('info', f'run_explain_documentation: Successfully analyzed EXPLAIN plan. Score: {result.get("optimization_score", "N/A")}/5')
        return result
    except Exception as e:
        logger_safe('error', f'run_explain_documentation: Failed to analyze EXPLAIN plan: {e}')
        raise

def document_sql_query_explain(
|
|
1154
|
+
sql_query: str,
|
|
1155
|
+
provider: Optional[str] = None,
|
|
1156
|
+
json_constraint: bool = True,
|
|
1157
|
+
) -> Dict[str, Any]:
|
|
1158
|
+
"""
|
|
1159
|
+
Analyze a SQL query's EXPLAIN plan and return optimization recommendations.
|
|
1160
|
+
|
|
1161
|
+
This function automatically builds the LLM client using tdfs4ds configuration,
|
|
1162
|
+
constructs the EXPLAIN analysis chain, and executes it.
|
|
1163
|
+
|
|
1164
|
+
Parameters
|
|
1165
|
+
----------
|
|
1166
|
+
sql_query : str
|
|
1167
|
+
The original SQL query.
|
|
1168
|
+
explain_plan : str
|
|
1169
|
+
The EXPLAIN output from the database.
|
|
1170
|
+
provider : str, optional (default=None)
|
|
1171
|
+
Indicates which structured-output mechanism to use for the LLM.
|
|
1172
|
+
If None, uses INSTRUCT_MODEL_PROVIDER from tdfs4ds config.
|
|
1173
|
+
Supported values: "vllm", "openai", "azure", "ollama", etc.
|
|
1174
|
+
json_constraint : bool, optional (default=True)
|
|
1175
|
+
If True: use provider-specific constrained decoding.
|
|
1176
|
+
If False: rely on prompt guidance only.
|
|
1177
|
+
|
|
1178
|
+
Returns
|
|
1179
|
+
-------
|
|
1180
|
+
dict
|
|
1181
|
+
A dictionary with keys:
|
|
1182
|
+
- "explanation": str describing the EXPLAIN plan
|
|
1183
|
+
- "optimization_score": int from 1 to 5
|
|
1184
|
+
- "warnings": list[str] of potential issues
|
|
1185
|
+
- "recommendations": list[str] of actionable suggestions
|
|
1186
|
+
|
|
1187
|
+
Raises
|
|
1188
|
+
------
|
|
1189
|
+
ValueError
|
|
1190
|
+
If tdfs4ds instruction model configuration is incomplete.
|
|
1191
|
+
"""
|
|
1192
|
+
from tdfs4ds import INSTRUCT_MODEL_URL, INSTRUCT_MODEL_API_KEY, INSTRUCT_MODEL_MODEL, INSTRUCT_MODEL_PROVIDER
|
|
1193
|
+
|
|
1194
|
+
if not INSTRUCT_MODEL_URL or not INSTRUCT_MODEL_API_KEY or not INSTRUCT_MODEL_MODEL or not INSTRUCT_MODEL_PROVIDER:
|
|
1195
|
+
raise ValueError(
|
|
1196
|
+
"tdfs4ds instruction model configuration is incomplete. Please ensure "
|
|
1197
|
+
"INSTRUCT_MODEL_URL, INSTRUCT_MODEL_API_KEY, INSTRUCT_MODEL_MODEL, and INSTRUCT_MODEL_PROVIDER are set."
|
|
1198
|
+
)
|
|
1199
|
+
|
|
1200
|
+
logger_safe('info', 'document_sql_query_explain: Starting EXPLAIN analysis')
|
|
1201
|
+
|
|
1202
|
+
if provider is None:
|
|
1203
|
+
provider = INSTRUCT_MODEL_PROVIDER
|
|
1204
|
+
|
|
1205
|
+
# Build the LLM client
|
|
1206
|
+
llm = build_llm(
|
|
1207
|
+
llm_service=INSTRUCT_MODEL_URL,
|
|
1208
|
+
api_key=INSTRUCT_MODEL_API_KEY,
|
|
1209
|
+
model_id=INSTRUCT_MODEL_MODEL
|
|
1210
|
+
)
|
|
1211
|
+
|
|
1212
|
+
# get the explain plan:
|
|
1213
|
+
explain_plan = get_the_explain(sql_query)
|
|
1214
|
+
# Build and run the EXPLAIN analysis chain
|
|
1215
|
+
explain_chain = build_explain_documentation_chain(llm, provider=provider, json_constraint=json_constraint)
|
|
1216
|
+
result = run_explain_documentation(explain_chain, sql_query, explain_plan)
|
|
1217
|
+
|
|
1218
|
+
logger_safe('info', f'document_sql_query_explain: Successfully completed EXPLAIN analysis. Score: {result.get("optimization_score", "N/A")}/5')
|
|
1219
|
+
return result
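# Hedged usage sketch for the convenience wrapper above; the query text and the
# database/table names are placeholders.
def _example_one_shot_explain() -> Dict[str, Any]:
    analysis = document_sql_query_explain(
        "SELECT customer_id, SUM(amount) AS total_amount FROM my_db.sales GROUP BY customer_id",
        json_constraint=True,
    )
    logger_safe('info', f"Optimization score: {analysis['optimization_score']}/5")
    return analysis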
|
|
1220
|
+
|
|
1221
|
+
def documentation_tables_creation():
|
|
1222
|
+
"""
|
|
1223
|
+
Create the necessary documentation tables in the database if they do not already exist.
|
|
1224
|
+
Uses the active teradataml connection and the tdfs4ds SCHEMA and table-name settings."""
|
|
1225
|
+
query_process_table = f"""
|
|
1226
|
+
CREATE MULTISET TABLE {tdfs4ds.SCHEMA}.{tdfs4ds.DOCUMENTATION_PROCESS_BUSINESS_LOGIC} ,FALLBACK ,
|
|
1227
|
+
NO BEFORE JOURNAL,
|
|
1228
|
+
NO AFTER JOURNAL,
|
|
1229
|
+
CHECKSUM = DEFAULT,
|
|
1230
|
+
DEFAULT MERGEBLOCKRATIO,
|
|
1231
|
+
MAP = TD_MAP1
|
|
1232
|
+
(
|
|
1233
|
+
PROCESS_ID VARCHAR(36) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
|
|
1234
|
+
BUSINESS_LOGIC_DESCRIPTION VARCHAR(32000) CHARACTER SET LATIN NOT CASESPECIFIC,
|
|
1235
|
+
ENTITY_DESCRIPTION VARCHAR(32000) CHARACTER SET LATIN NOT CASESPECIFIC,
|
|
1236
|
+
ENTITY_COLUMNS_JSON VARCHAR(32000) CHARACTER SET LATIN NOT CASESPECIFIC,
|
|
1237
|
+
ValidStart TIMESTAMP(0) WITH TIME ZONE NOT NULL,
|
|
1238
|
+
ValidEnd TIMESTAMP(0) WITH TIME ZONE NOT NULL,
|
|
1239
|
+
PERIOD FOR ValidPeriod (ValidStart, ValidEnd) AS VALIDTIME)
|
|
1240
|
+
PRIMARY INDEX ( PROCESS_ID )
|
|
1241
|
+
"""
|
|
1242
|
+
|
|
1243
|
+
query_process_features_table = f"""
|
|
1244
|
+
CREATE MULTISET TABLE {tdfs4ds.SCHEMA}.{tdfs4ds.DOCUMENTATION_PROCESS_FEATURES} ,FALLBACK ,
|
|
1245
|
+
NO BEFORE JOURNAL,
|
|
1246
|
+
NO AFTER JOURNAL,
|
|
1247
|
+
CHECKSUM = DEFAULT,
|
|
1248
|
+
DEFAULT MERGEBLOCKRATIO,
|
|
1249
|
+
MAP = TD_MAP1
|
|
1250
|
+
(
|
|
1251
|
+
PROCESS_ID VARCHAR(36) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
|
|
1252
|
+
FEATURE_ID BIGINT NOT NULL,
|
|
1253
|
+
FEATURE_NAME VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
|
|
1254
|
+
FEATURE_DESCRIPTION VARCHAR(32000) CHARACTER SET LATIN NOT CASESPECIFIC,
|
|
1255
|
+
ValidStart TIMESTAMP(0) WITH TIME ZONE NOT NULL,
|
|
1256
|
+
ValidEnd TIMESTAMP(0) WITH TIME ZONE NOT NULL,
|
|
1257
|
+
PERIOD FOR ValidPeriod (ValidStart, ValidEnd) AS VALIDTIME)
|
|
1258
|
+
PRIMARY INDEX ( PROCESS_ID, FEATURE_ID )
|
|
1259
|
+
"""
|
|
1260
|
+
|
|
1261
|
+
query_process_explain_table = f"""
|
|
1262
|
+
CREATE MULTISET TABLE {tdfs4ds.SCHEMA}.{tdfs4ds.DOCUMENTATION_PROCESS_EXPLAIN} ,FALLBACK ,
|
|
1263
|
+
NO BEFORE JOURNAL,
|
|
1264
|
+
NO AFTER JOURNAL,
|
|
1265
|
+
CHECKSUM = DEFAULT,
|
|
1266
|
+
DEFAULT MERGEBLOCKRATIO,
|
|
1267
|
+
MAP = TD_MAP1
|
|
1268
|
+
(
|
|
1269
|
+
PROCESS_ID VARCHAR(36) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
|
|
1270
|
+
EXPLAIN_ANALYSIS VARCHAR(32000) CHARACTER SET LATIN NOT CASESPECIFIC,
|
|
1271
|
+
OPTIMIZATION_SCORE INT,
|
|
1272
|
+
WARNINGS VARCHAR(32000) CHARACTER SET LATIN NOT CASESPECIFIC,
|
|
1273
|
+
RECOMMENDATIONS VARCHAR(32000) CHARACTER SET LATIN NOT CASESPECIFIC,
|
|
1274
|
+
ValidStart TIMESTAMP(0) WITH TIME ZONE NOT NULL,
|
|
1275
|
+
ValidEnd TIMESTAMP(0) WITH TIME ZONE NOT NULL,
|
|
1276
|
+
PERIOD FOR ValidPeriod (ValidStart, ValidEnd) AS VALIDTIME)
|
|
1277
|
+
PRIMARY INDEX ( PROCESS_ID )
|
|
1278
|
+
"""
|
|
1279
|
+
|
|
1280
|
+
try:
|
|
1281
|
+
tdml.execute_sql(query_process_table)
|
|
1282
|
+
logger_safe('info', f'documentation_tables_creation: Created table {tdfs4ds.DOCUMENTATION_PROCESS_BUSINESS_LOGIC}')
|
|
1283
|
+
except Exception as e:
|
|
1284
|
+
logger_safe('error', f'documentation_tables_creation: Failed to create table {tdfs4ds.DOCUMENTATION_PROCESS_BUSINESS_LOGIC}: {e}')
|
|
1285
|
+
if 'already exists' in str(e).lower():
|
|
1286
|
+
logger_safe('info', f'documentation_tables_creation: Table {tdfs4ds.DOCUMENTATION_PROCESS_BUSINESS_LOGIC} already exists. Skipping creation.')
|
|
1287
|
+
pass
|
|
1288
|
+
else:
|
|
1289
|
+
raise
|
|
1290
|
+
try:
|
|
1291
|
+
tdml.execute_sql(query_process_features_table)
|
|
1292
|
+
logger_safe('info', f'documentation_tables_creation: Created table {tdfs4ds.DOCUMENTATION_PROCESS_FEATURES}')
|
|
1293
|
+
except Exception as e:
|
|
1294
|
+
logger_safe('error', f'documentation_tables_creation: Failed to create table {tdfs4ds.DOCUMENTATION_PROCESS_FEATURES}: {e}')
|
|
1295
|
+
if 'already exists' in str(e).lower():
|
|
1296
|
+
logger_safe('info', f'documentation_tables_creation: Table {tdfs4ds.DOCUMENTATION_PROCESS_FEATURES} already exists. Skipping creation.')
|
|
1297
|
+
pass
|
|
1298
|
+
else:
|
|
1299
|
+
raise
|
|
1300
|
+
|
|
1301
|
+
try:
|
|
1302
|
+
tdml.execute_sql(query_process_explain_table)
|
|
1303
|
+
logger_safe('info', f'documentation_tables_creation: Created table {tdfs4ds.DOCUMENTATION_PROCESS_EXPLAIN}')
|
|
1304
|
+
except Exception as e:
|
|
1305
|
+
logger_safe('error', f'documentation_tables_creation: Failed to create table {tdfs4ds.DOCUMENTATION_PROCESS_EXPLAIN}: {e}')
|
|
1306
|
+
if 'already exists' in str(e).lower():
|
|
1307
|
+
logger_safe('info', f'documentation_tables_creation: Table {tdfs4ds.DOCUMENTATION_PROCESS_EXPLAIN} already exists. Skipping creation.')
|
|
1308
|
+
pass
|
|
1309
|
+
else:
|
|
1310
|
+
raise
|
|
1311
|
+
|
|
1312
|
+
logger_safe('info', 'documentation_tables_creation: Documentation tables creation process completed.')
|
|
1313
|
+
return
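# One-time setup sketch: the three documentation tables above are created in
# tdfs4ds.SCHEMA, assuming the DOCUMENTATION_* table-name settings are configured
# and a teradataml connection is open.
def _example_setup_documentation_tables() -> None:
    documentation_tables_creation()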
|
|
1314
|
+
|
|
1315
|
+
def document_process(process_id: str, language: str = "English", json_constraint: bool = True, show_sql_query: bool = False, show_explain_plan: bool = False, display: bool = True, upload: bool = True) -> Optional[Dict[str, Any]]:
|
|
1316
|
+
"""
|
|
1317
|
+
Generate and store documentation for a data process identified by process_id.
|
|
1318
|
+
This function retrieves the SQL query and output columns for the process,
|
|
1319
|
+
generates business-focused documentation using an LLM, and stores the results
|
|
1320
|
+
in the appropriate documentation tables.
|
|
1321
|
+
|
|
1322
|
+
Parameters
|
|
1323
|
+
----------
|
|
1324
|
+
process_id : str
|
|
1325
|
+
The unique identifier of the data process to document.
|
|
1326
|
+
|
|
1327
|
+
language : str, optional (default="English")
|
|
1328
|
+
The target output language for the generated documentation. This value is
|
|
1329
|
+
passed into the prompt’s `{language}` variable. Examples: "English", "French", "German", "Spanish", "Japanese".
|
|
1330
|
+
Note: the LLM provider is not a parameter of this function. It is read
|
|
1331
|
+
from INSTRUCT_MODEL_PROVIDER in the tdfs4ds configuration and determines
|
|
1332
|
+
which structured-output mechanism is used for the LLM.
|
|
1333
|
+
Supported values:
|
|
1334
|
+
- "vllm" → uses `guided_json` for strict JSON output
|
|
1335
|
+
- "openai" / "azure" → uses OpenAI JSON Schema via `response_format`
|
|
1336
|
+
- "ollama" → uses Ollama's `format=` schema
|
|
1337
|
+
- "openai-compatible" → alias for vLLM-style guided decoding
|
|
1338
|
+
- any other value → fall back to unconstrained text output
|
|
1339
|
+
json_constraint : bool, optional (default=True)
|
|
1340
|
+
If True:
|
|
1341
|
+
- a JSON Schema is generated from the column list
|
|
1342
|
+
- provider-specific constrained decoding is applied
|
|
1343
|
+
If False:
|
|
1344
|
+
- the chain does not enforce JSON structure at the LLM level
|
|
1345
|
+
- the model is only guided by the prompt (weaker guarantees)
|
|
1346
|
+
show_sql_query : bool, optional (default=False)
|
|
1347
|
+
If True, display the original SQL query at the end of the documentation report.
|
|
1348
|
+
show_explain_plan : bool, optional (default=False)
|
|
1349
|
+
If True, display the raw EXPLAIN plan output at the end of the documentation report.
|
|
1350
|
+
display : bool, optional (default=True)
|
|
1351
|
+
If True, print the generated documentation to the console.
|
|
1352
|
+
upload : bool, optional (default=True)
|
|
1353
|
+
If True, upload the generated documentation to the documentation tables.
|
|
1354
|
+
|
|
1355
|
+
Returns
|
|
1356
|
+
-------
|
|
1357
|
+
dict or None
|
|
1358
|
+
A dictionary containing the generated documentation and analysis, or None if an error occurred.
|
|
1359
|
+
The dictionary includes keys:
|
|
1360
|
+
- PROCESS_ID
|
|
1361
|
+
- DOCUMENTED_SQL
|
|
1362
|
+
- ENTITY_DESCRIPTION
|
|
1363
|
+
- DOCUMENTED_ENTITY_COLUMNS
|
|
1364
|
+
- DOCUMENTED_FEATURE_COLUMNS
|
|
1365
|
+
- EXPLAIN_ANALYSIS (if show_explain_plan is True)
|
|
1366
|
+
- OPTIMIZATION_SCORE (if show_explain_plan is True)
|
|
1367
|
+
- EXPLAIN_WARNINGS (if show_explain_plan is True)
|
|
1368
|
+
- EXPLAIN_RECOMMENDATIONS (if show_explain_plan is True)
|
|
1369
|
+
- RAW_EXPLAIN_PLAN (if show_explain_plan is True)
|
|
1370
|
+
Notes
|
|
1371
|
+
-----
|
|
1372
|
+
- This function requires that the tdfs4ds instruction model configuration is properly set.
|
|
1373
|
+
- If the model fails to produce valid JSON, an exception will be raised.
|
|
1374
|
+
- The resulting descriptions are typically ≤ 5 sentences per column, focusing on
|
|
1375
|
+
business meaning and logic.
|
|
1376
|
+
"""
|
|
1377
|
+
logger_safe('info', f'document_process: Starting documentation for process_id {process_id} in {language}')
|
|
1378
|
+
|
|
1379
|
+
# Retrieve process SQL and columns
|
|
1380
|
+
try:
|
|
1381
|
+
process_info = tdfs4ds.process_store.process_store_catalog_management.get_process_info(process_id)
|
|
1382
|
+
except Exception as e:
|
|
1383
|
+
logger_safe('error', f"document_process: Error retrieving process info for process_id {process_id}: {e}")
|
|
1384
|
+
return
|
|
1385
|
+
|
|
1386
|
+
documentation = document_sql_query_columns(
|
|
1387
|
+
sql_query = process_info['PROCESS_SQL'],
|
|
1388
|
+
entity_columns = process_info['ENTITY_COLUMNS'],
|
|
1389
|
+
feature_columns = process_info['FEATURE_COLUMNS']
|
|
1390
|
+
)
|
|
1391
|
+
|
|
1392
|
+
process_info['DOCUMENTED_SQL'] = documentation['query_business_logic']
|
|
1393
|
+
process_info['ENTITY_DESCRIPTION'] = documentation['entity_description']
|
|
1394
|
+
process_info['DOCUMENTED_ENTITY_COLUMNS'] = documentation['entity_columns']
|
|
1395
|
+
process_info['DOCUMENTED_FEATURE_COLUMNS'] = documentation['feature_columns']
|
|
1396
|
+
|
|
1397
|
+
if True:
|
|
1398
|
+
explain_documentation = document_sql_query_explain(
|
|
1399
|
+
sql_query = process_info['PROCESS_SQL']
|
|
1400
|
+
)
|
|
1401
|
+
|
|
1402
|
+
process_info['EXPLAIN_ANALYSIS'] = explain_documentation['explanation']
|
|
1403
|
+
process_info['OPTIMIZATION_SCORE'] = explain_documentation['optimization_score']
|
|
1404
|
+
process_info['EXPLAIN_WARNINGS'] = explain_documentation['warnings']
|
|
1405
|
+
process_info['EXPLAIN_RECOMMENDATIONS'] = explain_documentation['recommendations']
|
|
1406
|
+
|
|
1407
|
+
# Store the raw EXPLAIN plan if needed for display
|
|
1408
|
+
if show_explain_plan:
|
|
1409
|
+
process_info['RAW_EXPLAIN_PLAN'] = get_the_explain(process_info['PROCESS_SQL'])
|
|
1410
|
+
|
|
1411
|
+
# Upload the generated documentation to the documentation tables:
|
|
1412
|
+
if upload:
|
|
1413
|
+
upload_documentation(process_info)
|
|
1414
|
+
logger_safe('info', f'document_process: Uploaded documentation for process_id {process_id} to documentation tables.')
|
|
1415
|
+
upload_documentation_explain(process_info)
|
|
1416
|
+
logger_safe('info', f'document_process: Uploaded EXPLAIN analysis for process_id {process_id} to documentation tables.')
|
|
1417
|
+
|
|
1418
|
+
# pretty print documentation for info:
|
|
1419
|
+
logger_safe('info', f"document_process: Documentation for process_id {process_id}:")
|
|
1420
|
+
|
|
1421
|
+
if display:
|
|
1422
|
+
_print_documentation(
|
|
1423
|
+
documented_sql = process_info.get('DOCUMENTED_SQL', None),
|
|
1424
|
+
entity_description = process_info.get('ENTITY_DESCRIPTION', None),
|
|
1425
|
+
documented_entity_columns = process_info.get('DOCUMENTED_ENTITY_COLUMNS', None),
|
|
1426
|
+
documented_feature_columns = process_info.get('DOCUMENTED_FEATURE_COLUMNS', None),
|
|
1427
|
+
process_id = process_info.get('PROCESS_ID', process_id),
|
|
1428
|
+
view_name = process_info.get('VIEW_NAME', None),
|
|
1429
|
+
explain_analysis = process_info.get('EXPLAIN_ANALYSIS', None),
|
|
1430
|
+
optimization_score = process_info.get('OPTIMIZATION_SCORE', None),
|
|
1431
|
+
explain_warnings = process_info.get('EXPLAIN_WARNINGS', None),
|
|
1432
|
+
explain_recommendations = process_info.get('EXPLAIN_RECOMMENDATIONS', None),
|
|
1433
|
+
sql_query = process_info.get('PROCESS_SQL', None) if show_sql_query else None,
|
|
1434
|
+
explain_plan = process_info.get('RAW_EXPLAIN_PLAN', None) if show_explain_plan else None,
|
|
1435
|
+
)
|
|
1436
|
+
|
|
1437
|
+
return process_info
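# End-to-end sketch for the workflow above. "my-process-id" is a placeholder
# identifier; upload=False keeps this a dry run that only builds and prints the report.
def _example_document_one_process() -> Optional[Dict[str, Any]]:
    return document_process(
        process_id="my-process-id",
        language="English",
        json_constraint=True,
        show_sql_query=False,
        show_explain_plan=True,
        display=True,
        upload=False,
    )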
|
|
1438
|
+
|
|
1439
|
+
def get_the_explain(sql_query: str) -> str:
|
|
1440
|
+
"""
|
|
1441
|
+
Get the EXPLAIN plan for a given SQL query using the tdfs4ds TDML connection.
|
|
1442
|
+
|
|
1443
|
+
Parameters
|
|
1444
|
+
----------
|
|
1445
|
+
sql_query : str
|
|
1446
|
+
The SQL query to explain.
|
|
1447
|
+
|
|
1448
|
+
Returns
|
|
1449
|
+
-------
|
|
1450
|
+
str
|
|
1451
|
+
The EXPLAIN plan as a formatted string.
|
|
1452
|
+
"""
|
|
1453
|
+
def _extract_inner_query_from_view(query: str) -> str:
|
|
1454
|
+
"""
|
|
1455
|
+
If the provided SQL is a CREATE/REPLACE VIEW (or REPLACE VIEW), extract and
|
|
1456
|
+
return the inner SELECT/definition. Otherwise return the original query.
|
|
1457
|
+
|
|
1458
|
+
This helps when running EXPLAIN: we want to analyze the query inside the
|
|
1459
|
+
view definition rather than the DDL wrapper.
|
|
1460
|
+
"""
|
|
1461
|
+
if not isinstance(query, str):
|
|
1462
|
+
return query
|
|
1463
|
+
pattern = r'^\s*(?:CREATE\s+(?:OR\s+REPLACE\s+)?|REPLACE\s+)?VIEW\b.*?\bAS\b\s*(?P<body>.*)$'
|
|
1464
|
+
m = re.search(pattern, query, flags=re.IGNORECASE | re.DOTALL)
|
|
1465
|
+
if not m:
|
|
1466
|
+
return query
|
|
1467
|
+
body = m.group('body').strip()
|
|
1468
|
+
# Strip outer parentheses if the definition is wrapped
|
|
1469
|
+
if body.startswith('(') and body.endswith(')'):
|
|
1470
|
+
body = body[1:-1].strip()
|
|
1471
|
+
# Remove trailing semicolon
|
|
1472
|
+
if body.endswith(';'):
|
|
1473
|
+
body = body[:-1].strip()
|
|
1474
|
+
# Remove trailing LOCK ROW FOR ACCESS (or similar) clauses that may appear
|
|
1475
|
+
# in view definitions (e.g., "LOCK ROW FOR ACCESS") so EXPLAIN focuses
|
|
1476
|
+
# on the inner SELECT statement.
|
|
1477
|
+
body = re.sub(r"\bLOCK\s+ROW\s+FOR\s+ACCESS\b\s*;?\s*$", "", body, flags=re.IGNORECASE)
|
|
1478
|
+
logger_safe('debug', 'get_the_explain: Extracted inner query from CREATE/REPLACE VIEW for EXPLAIN.')
|
|
1479
|
+
return body
|
|
1480
|
+
|
|
1481
|
+
inner_sql = _extract_inner_query_from_view(sql_query)
|
|
1482
|
+
try:
|
|
1483
|
+
explain_result = tdml.execute_sql(f"EXPLAIN {inner_sql}").fetchall()
|
|
1484
|
+
explain_lines = [row[0] for row in explain_result]
|
|
1485
|
+
explain_text = "\n".join(explain_lines)
|
|
1486
|
+
logger_safe('info', 'get_the_explain: Successfully retrieved EXPLAIN plan.')
|
|
1487
|
+
return explain_text
|
|
1488
|
+
except Exception as e:
|
|
1489
|
+
logger_safe('error', f'get_the_explain: Failed to retrieve EXPLAIN plan: {e}')
|
|
1490
|
+
raise
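# Sketch of the view handling above: when a CREATE/REPLACE VIEW statement is
# passed in, EXPLAIN runs on the inner SELECT rather than on the DDL wrapper.
# The view and table names are placeholders.
def _example_explain_view_definition() -> str:
    ddl = (
        "REPLACE VIEW my_db.v_sales AS "
        "SELECT region, SUM(amount) AS total_amount FROM my_db.sales GROUP BY region;"
    )
    return get_the_explain(ddl)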
|
|
1491
|
+
|
|
1492
|
+
def upload_documentation(process_info: Dict[str, Any]) -> None:
|
|
1493
|
+
"""
|
|
1494
|
+
Upload the generated documentation for a data process into the documentation tables.
|
|
1495
|
+
|
|
1496
|
+
Parameters
|
|
1497
|
+
----------
|
|
1498
|
+
process_info : dict
|
|
1499
|
+
A dictionary containing the process documentation information.
|
|
1500
|
+
Expected keys:
|
|
1501
|
+
- PROCESS_ID: str
|
|
1502
|
+
- DOCUMENTED_SQL: str
|
|
1503
|
+
- ENTITY_DESCRIPTION: str
|
|
1504
|
+
- DOCUMENTED_ENTITY_COLUMNS: dict[str, str]
|
|
1505
|
+
- DOCUMENTED_FEATURE_COLUMNS: dict[str, str]
|
|
1506
|
+
"""
|
|
1507
|
+
|
|
1508
|
+
process_id = process_info['PROCESS_ID']
|
|
1509
|
+
documented_sql = process_info['DOCUMENTED_SQL']
|
|
1510
|
+
entity_description = process_info['ENTITY_DESCRIPTION']
|
|
1511
|
+
entity_columns_json = json.dumps(process_info['DOCUMENTED_ENTITY_COLUMNS'])
|
|
1512
|
+
feature_columns = process_info['DOCUMENTED_FEATURE_COLUMNS']
|
|
1513
|
+
|
|
1514
|
+
# build a pandas dataframe containing the data to be uploaded in DOCUMENTATION_PROCESS_BUSINESS_LOGIC
|
|
1515
|
+
# that contains PROCESS_ID, BUSINESS_LOGIC_DESCRIPTION, ENTITY_DESCRIPTION, ENTITY_COLUMNS_JSON
|
|
1516
|
+
df_business_logic = pd.DataFrame([{
|
|
1517
|
+
'PROCESS_ID': process_id,
|
|
1518
|
+
'BUSINESS_LOGIC_DESCRIPTION': documented_sql,
|
|
1519
|
+
'ENTITY_DESCRIPTION': entity_description,
|
|
1520
|
+
'ENTITY_COLUMNS_JSON': entity_columns_json
|
|
1521
|
+
}])
|
|
1522
|
+
|
|
1523
|
+
# build a pandas dataframe containing the data to be uploaded in DOCUMENTATION_PROCESS_FEATURES
|
|
1524
|
+
# that contains PROCESS_ID, FEATURE_ID, FEATURE_DESCRIPTION
|
|
1525
|
+
# at this stage, FEATURE_ID is not known, so we will use the FEATURE_NAME as a placeholder
|
|
1526
|
+
# and later replace it with the actual FEATURE_ID after insertion with a join with the FS_FEATURE_CATALOG
|
|
1527
|
+
# here we need to explode the feature_columns dict into multiple rows
|
|
1528
|
+
feature_rows = []
|
|
1529
|
+
for feature_name, feature_description in feature_columns.items():
|
|
1530
|
+
feature_rows.append({
|
|
1531
|
+
'PROCESS_ID': process_id,
|
|
1532
|
+
'FEATURE_NAME': feature_name, # placeholder for FEATURE_ID
|
|
1533
|
+
'FEATURE_DESCRIPTION': feature_description
|
|
1534
|
+
})
|
|
1535
|
+
df_features = pd.DataFrame(feature_rows)
|
|
1536
|
+
|
|
1537
|
+
# Determine end period based on tdfs4ds configuration
|
|
1538
|
+
if tdfs4ds.END_PERIOD == 'UNTIL_CHANGED':
|
|
1539
|
+
end_period_ = '9999-01-01 00:00:00'
|
|
1540
|
+
else:
|
|
1541
|
+
end_period_ = tdfs4ds.END_PERIOD
|
|
1542
|
+
|
|
1543
|
+
# upload the df_business_logic dataframe into a staging volatile table
|
|
1544
|
+
logger_safe('info', f'upload_documentation: Uploading documentation for process_id {process_id} into staging tables.')
|
|
1545
|
+
tdml.copy_to_sql(
|
|
1546
|
+
df_business_logic,
|
|
1547
|
+
table_name = "DOCUMENTATION_PROCESS_BUSINESS_LOGIC_STAGING",
|
|
1548
|
+
if_exists = 'replace',
|
|
1549
|
+
temporary = True
|
|
1550
|
+
)
|
|
1551
|
+
logger_safe('info', f'upload_documentation: Uploaded business logic documentation for process_id {process_id} into staging table.')
|
|
1552
|
+
|
|
1553
|
+
# upload the df_features dataframe into a staging volatile table
|
|
1554
|
+
logger_safe('info', f'upload_documentation: Uploading feature documentation for process_id {process_id} into staging tables.')
|
|
1555
|
+
tdml.copy_to_sql(
|
|
1556
|
+
df_features,
|
|
1557
|
+
table_name = "DOCUMENTATION_PROCESS_FEATURES_STAGING",
|
|
1558
|
+
if_exists = 'replace',
|
|
1559
|
+
temporary = True
|
|
1560
|
+
)
|
|
1561
|
+
logger_safe('info', f'upload_documentation: Uploaded feature documentation for process_id {process_id} into staging table.')
|
|
1562
|
+
|
|
1563
|
+
# merge into DOCUMENTATION_PROCESS_BUSINESS_LOGIC from staging table
|
|
1564
|
+
query_insert_business_logic = f"""
|
|
1565
|
+
CURRENT VALIDTIME
|
|
1566
|
+
MERGE INTO {tdfs4ds.SCHEMA}.{tdfs4ds.DOCUMENTATION_PROCESS_BUSINESS_LOGIC} EXISTING
|
|
1567
|
+
USING (
|
|
1568
|
+
SELECT
|
|
1569
|
+
PROCESS_ID,
|
|
1570
|
+
BUSINESS_LOGIC_DESCRIPTION,
|
|
1571
|
+
ENTITY_DESCRIPTION,
|
|
1572
|
+
ENTITY_COLUMNS_JSON
|
|
1573
|
+
FROM {_get_database_username()}.DOCUMENTATION_PROCESS_BUSINESS_LOGIC_STAGING
|
|
1574
|
+
) UPDATED
|
|
1575
|
+
ON EXISTING.PROCESS_ID = UPDATED.PROCESS_ID
|
|
1576
|
+
WHEN MATCHED THEN
|
|
1577
|
+
UPDATE
|
|
1578
|
+
SET
|
|
1579
|
+
BUSINESS_LOGIC_DESCRIPTION = UPDATED.BUSINESS_LOGIC_DESCRIPTION,
|
|
1580
|
+
ENTITY_DESCRIPTION = UPDATED.ENTITY_DESCRIPTION,
|
|
1581
|
+
ENTITY_COLUMNS_JSON = UPDATED.ENTITY_COLUMNS_JSON
|
|
1582
|
+
WHEN NOT MATCHED THEN
|
|
1583
|
+
INSERT (
|
|
1584
|
+
UPDATED.PROCESS_ID,
|
|
1585
|
+
UPDATED.BUSINESS_LOGIC_DESCRIPTION,
|
|
1586
|
+
UPDATED.ENTITY_DESCRIPTION,
|
|
1587
|
+
UPDATED.ENTITY_COLUMNS_JSON
|
|
1588
|
+
)
|
|
1589
|
+
"""
|
|
1590
|
+
|
|
1591
|
+
# merge into DOCUMENTATION_PROCESS_FEATURES from staging table
|
|
1592
|
+
query_insert_features = f"""
|
|
1593
|
+
CURRENT VALIDTIME
|
|
1594
|
+
MERGE INTO {tdfs4ds.SCHEMA}.{tdfs4ds.DOCUMENTATION_PROCESS_FEATURES} EXISTING
|
|
1595
|
+
USING (
|
|
1596
|
+
SELECT
|
|
1597
|
+
A.PROCESS_ID,
|
|
1598
|
+
FC.FEATURE_ID,
|
|
1599
|
+
A.FEATURE_NAME,
|
|
1600
|
+
A.FEATURE_DESCRIPTION
|
|
1601
|
+
FROM {_get_database_username()}.DOCUMENTATION_PROCESS_FEATURES_STAGING A
|
|
1602
|
+
INNER JOIN {tdfs4ds.SCHEMA}.{tdfs4ds.FEATURE_CATALOG_NAME} FC
|
|
1603
|
+
ON UPPER(FC.FEATURE_NAME) = UPPER(A.FEATURE_NAME)
|
|
1604
|
+
AND UPPER(FC.DATA_DOMAIN) = '{process_info['DATA_DOMAIN'].upper()}'
|
|
1605
|
+
) UPDATED
|
|
1606
|
+
ON EXISTING.PROCESS_ID = UPDATED.PROCESS_ID
|
|
1607
|
+
AND EXISTING.FEATURE_ID = UPDATED.FEATURE_ID
|
|
1608
|
+
WHEN MATCHED THEN
|
|
1609
|
+
UPDATE
|
|
1610
|
+
SET
|
|
1611
|
+
FEATURE_DESCRIPTION = UPDATED.FEATURE_DESCRIPTION,
|
|
1612
|
+
FEATURE_NAME = UPDATED.FEATURE_NAME
|
|
1613
|
+
WHEN NOT MATCHED THEN
|
|
1614
|
+
INSERT (
|
|
1615
|
+
UPDATED.PROCESS_ID,
|
|
1616
|
+
UPDATED.FEATURE_ID,
|
|
1617
|
+
UPDATED.FEATURE_NAME,
|
|
1618
|
+
UPDATED.FEATURE_DESCRIPTION
|
|
1619
|
+
)
|
|
1620
|
+
"""
|
|
1621
|
+
|
|
1622
|
+
# Remove features that are no longer present in the documentation
|
|
1623
|
+
query_delete_missing_features = f"""
|
|
1624
|
+
CURRENT VALIDTIME
|
|
1625
|
+
DELETE FROM {tdfs4ds.SCHEMA}.{tdfs4ds.DOCUMENTATION_PROCESS_FEATURES}
|
|
1626
|
+
WHERE PROCESS_ID = '{process_id}'
|
|
1627
|
+
AND FEATURE_ID NOT IN (
|
|
1628
|
+
SELECT FC.FEATURE_ID
|
|
1629
|
+
FROM {_get_database_username()}.DOCUMENTATION_PROCESS_FEATURES_STAGING A
|
|
1630
|
+
INNER JOIN {tdfs4ds.SCHEMA}.{tdfs4ds.FEATURE_CATALOG_NAME} FC
|
|
1631
|
+
ON UPPER(FC.FEATURE_NAME) = UPPER(A.FEATURE_NAME)
|
|
1632
|
+
AND UPPER(FC.DATA_DOMAIN) = '{process_info['DATA_DOMAIN'].upper()}'
|
|
1633
|
+
)
|
|
1634
|
+
"""
|
|
1635
|
+
|
|
1636
|
+
# Execute the merges
|
|
1637
|
+
try:
|
|
1638
|
+
tdml.execute_sql(query_insert_business_logic)
|
|
1639
|
+
logger_safe('info', f'upload_documentation: Merged business logic documentation for process_id {process_id} into main table.')
|
|
1640
|
+
except Exception as e:
|
|
1641
|
+
logger_safe('error', f'upload_documentation: Failed to merge business logic documentation for process_id {process_id}: {e}')
|
|
1642
|
+
print(query_insert_business_logic)
|
|
1643
|
+
raise
|
|
1644
|
+
try:
|
|
1645
|
+
tdml.execute_sql(query_insert_features)
|
|
1646
|
+
logger_safe('info', f'upload_documentation: Merged feature documentation for process_id {process_id} into main table.')
|
|
1647
|
+
except Exception as e:
|
|
1648
|
+
logger_safe('error', f'upload_documentation: Failed to merge feature documentation for process_id {process_id}: {e}')
|
|
1649
|
+
print(query_insert_features)
|
|
1650
|
+
raise
|
|
1651
|
+
try:
|
|
1652
|
+
tdml.execute_sql(query_delete_missing_features)
|
|
1653
|
+
logger_safe('info', f'upload_documentation: Removed missing features for process_id {process_id} from main table.')
|
|
1654
|
+
except Exception as e:
|
|
1655
|
+
logger_safe('error', f'upload_documentation: Failed to remove missing features for process_id {process_id}: {e}')
|
|
1656
|
+
print(query_delete_missing_features)
|
|
1657
|
+
raise
|
|
1658
|
+
|
|
1659
|
+
# remove staging tables
|
|
1660
|
+
tdml.execute_sql(f"DROP TABLE {_get_database_username()}.DOCUMENTATION_PROCESS_BUSINESS_LOGIC_STAGING")
|
|
1661
|
+
tdml.execute_sql(f"DROP TABLE {_get_database_username()}.DOCUMENTATION_PROCESS_FEATURES_STAGING")
|
|
1662
|
+
logger_safe('info', f'upload_documentation: Successfully uploaded documentation for process_id {process_id}.')
|
|
1663
|
+
|
|
1664
|
+
return
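# Sketch of the dictionary shape consumed above; every value is a placeholder.
# DATA_DOMAIN is required because the feature merge joins against the feature
# catalog of that domain, e.g. upload_documentation(_example_upload_payload()).
def _example_upload_payload() -> Dict[str, Any]:
    return {
        'PROCESS_ID': 'my-process-id',
        'DATA_DOMAIN': 'sales',
        'DOCUMENTED_SQL': 'Aggregates sales amounts per customer.',
        'ENTITY_DESCRIPTION': 'One row per customer.',
        'DOCUMENTED_ENTITY_COLUMNS': {'CUSTOMER_ID': 'Unique customer identifier.'},
        'DOCUMENTED_FEATURE_COLUMNS': {'TOTAL_AMOUNT': 'Total purchase amount per customer.'},
    }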
|
|
1665
|
+
|
|
1666
|
+
def retrieve_documentation(process_id: str) -> Dict[str, Any]:
|
|
1667
|
+
"""
|
|
1668
|
+
Retrieve the documentation for a data process from the documentation tables.
|
|
1669
|
+
|
|
1670
|
+
Parameters
|
|
1671
|
+
----------
|
|
1672
|
+
process_id : str
|
|
1673
|
+
The unique identifier of the data process.
|
|
1674
|
+
|
|
1675
|
+
Returns
|
|
1676
|
+
-------
|
|
1677
|
+
dict
|
|
1678
|
+
A dictionary containing the documentation information with keys:
|
|
1679
|
+
- DOCUMENTED_SQL: str
|
|
1680
|
+
- ENTITY_DESCRIPTION: str
|
|
1681
|
+
- DOCUMENTED_ENTITY_COLUMNS: dict[str, str]
|
|
1682
|
+
- DOCUMENTED_FEATURE_COLUMNS: dict[str, str]
|
|
1683
|
+
"""
|
|
1684
|
+
logger_safe('info', f'retrieve_documentation: Retrieving documentation for process_id {process_id}.')
|
|
1685
|
+
|
|
1686
|
+
# Retrieve business logic documentation
|
|
1687
|
+
query_business_logic = f"""
|
|
1688
|
+
CURRENT VALIDTIME
|
|
1689
|
+
SELECT
|
|
1690
|
+
BUSINESS_LOGIC_DESCRIPTION,
|
|
1691
|
+
ENTITY_DESCRIPTION,
|
|
1692
|
+
ENTITY_COLUMNS_JSON
|
|
1693
|
+
FROM {tdfs4ds.SCHEMA}.{tdfs4ds.DOCUMENTATION_PROCESS_BUSINESS_LOGIC}
|
|
1694
|
+
WHERE PROCESS_ID = '{process_id}'
|
|
1695
|
+
"""
|
|
1696
|
+
result_bl = tdml.execute_sql(query_business_logic).fetchone()
|
|
1697
|
+
if not result_bl:
|
|
1698
|
+
logger_safe('warning', f'retrieve_documentation: No business logic documentation found for process_id {process_id}.')
|
|
1699
|
+
return {}
|
|
1700
|
+
|
|
1701
|
+
documented_sql = result_bl[0]
|
|
1702
|
+
entity_description = result_bl[1]
|
|
1703
|
+
entity_columns_json = result_bl[2]
|
|
1704
|
+
documented_entity_columns = json.loads(entity_columns_json) if entity_columns_json else {}
|
|
1705
|
+
|
|
1706
|
+
# Retrieve feature documentation
|
|
1707
|
+
query_features = f"""
|
|
1708
|
+
CURRENT VALIDTIME
|
|
1709
|
+
SELECT
|
|
1710
|
+
FC.FEATURE_NAME,
|
|
1711
|
+
DPF.FEATURE_DESCRIPTION
|
|
1712
|
+
FROM {tdfs4ds.SCHEMA}.{tdfs4ds.DOCUMENTATION_PROCESS_FEATURES} DPF
|
|
1713
|
+
INNER JOIN {tdfs4ds.SCHEMA}.{tdfs4ds.FEATURE_CATALOG_NAME} FC
|
|
1714
|
+
ON DPF.FEATURE_ID = FC.FEATURE_ID
|
|
1715
|
+
WHERE DPF.PROCESS_ID = '{process_id}'
|
|
1716
|
+
"""
|
|
1717
|
+
result_features = tdml.execute_sql(query_features).fetchall()
|
|
1718
|
+
documented_feature_columns = {
|
|
1719
|
+
row[0]: row[1] for row in result_features
|
|
1720
|
+
}
|
|
1721
|
+
|
|
1722
|
+
logger_safe('info', f'retrieve_documentation: Successfully retrieved documentation for process_id {process_id}.')
|
|
1723
|
+
return {
|
|
1724
|
+
"DOCUMENTED_SQL" : documented_sql,
|
|
1725
|
+
"ENTITY_DESCRIPTION" : entity_description,
|
|
1726
|
+
"DOCUMENTED_ENTITY_COLUMNS" : documented_entity_columns,
|
|
1727
|
+
"DOCUMENTED_FEATURE_COLUMNS" : documented_feature_columns
|
|
1728
|
+
}
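# Retrieval sketch for the function above; the process id is a placeholder.
def _example_print_documentation(process_id: str = 'my-process-id') -> None:
    docs = retrieve_documentation(process_id)
    if not docs:
        return
    print(docs['DOCUMENTED_SQL'])
    for feature_name, description in docs['DOCUMENTED_FEATURE_COLUMNS'].items():
        print(f"{feature_name}: {description}")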
|
|
1729
|
+
|
|
1730
|
+
def retrieve_explain_documentation(process_id: str) -> Dict[str, Any]:
|
|
1731
|
+
"""
|
|
1732
|
+
Retrieve the EXPLAIN documentation for a data process from the documentation tables.
|
|
1733
|
+
|
|
1734
|
+
Parameters
|
|
1735
|
+
----------
|
|
1736
|
+
process_id : str
|
|
1737
|
+
The unique identifier of the data process.
|
|
1738
|
+
|
|
1739
|
+
Returns
|
|
1740
|
+
-------
|
|
1741
|
+
dict
|
|
1742
|
+
A dictionary containing the EXPLAIN documentation information with keys:
|
|
1743
|
+
- EXPLAIN_ANALYSIS: str
|
|
1744
|
+
- OPTIMIZATION_SCORE: int
|
|
1745
|
+
- EXPLAIN_WARNINGS: list[str]
|
|
1746
|
+
- EXPLAIN_RECOMMENDATIONS: list[str]
|
|
1747
|
+
"""
|
|
1748
|
+
logger_safe('info', f'retrieve_explain_documentation: Retrieving EXPLAIN documentation for process_id {process_id}.')
|
|
1749
|
+
|
|
1750
|
+
query_explain = f"""
|
|
1751
|
+
CURRENT VALIDTIME
|
|
1752
|
+
SELECT
|
|
1753
|
+
EXPLAIN_ANALYSIS,
|
|
1754
|
+
OPTIMIZATION_SCORE,
|
|
1755
|
+
WARNINGS,
|
|
1756
|
+
RECOMMENDATIONS
|
|
1757
|
+
FROM {tdfs4ds.SCHEMA}.{tdfs4ds.DOCUMENTATION_PROCESS_EXPLAIN}
|
|
1758
|
+
WHERE PROCESS_ID = '{process_id}'
|
|
1759
|
+
"""
|
|
1760
|
+
result_explain = tdml.execute_sql(query_explain).fetchone()
|
|
1761
|
+
if not result_explain:
|
|
1762
|
+
logger_safe('warning', f'retrieve_explain_documentation: No EXPLAIN documentation found for process_id {process_id}.')
|
|
1763
|
+
return {}
|
|
1764
|
+
|
|
1765
|
+
explanation = result_explain[0]
|
|
1766
|
+
optimization_score = result_explain[1]
|
|
1767
|
+
warnings = json.loads(result_explain[2]) if result_explain[2] else []
|
|
1768
|
+
recommendations = json.loads(result_explain[3]) if result_explain[3] else []
|
|
1769
|
+
|
|
1770
|
+
logger_safe('info', f'retrieve_explain_documentation: Successfully retrieved EXPLAIN documentation for process_id {process_id}.')
|
|
1771
|
+
return {
|
|
1772
|
+
"EXPLAIN_ANALYSIS" : explanation,
|
|
1773
|
+
"OPTIMIZATION_SCORE" : optimization_score,
|
|
1774
|
+
"EXPLAIN_WARNINGS" : warnings,
|
|
1775
|
+
"EXPLAIN_RECOMMENDATIONS" : recommendations
|
|
1776
|
+
}
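# Retrieval sketch for the EXPLAIN documentation above; the process id is a placeholder.
def _example_print_explain_documentation(process_id: str = 'my-process-id') -> None:
    explain_doc = retrieve_explain_documentation(process_id)
    if not explain_doc:
        return
    print(f"Score: {explain_doc['OPTIMIZATION_SCORE']}/5")
    for warning in explain_doc['EXPLAIN_WARNINGS']:
        print(f"Warning: {warning}")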
|
|
1777
|
+
|
|
1778
|
+
def upload_documentation_explain(process_info: Dict[str, Any]) -> None:
|
|
1779
|
+
"""
|
|
1780
|
+
Upload the EXPLAIN documentation for a data process into the documentation tables.
|
|
1781
|
+
|
|
1782
|
+
Parameters
|
|
1783
|
+
----------
|
|
1784
|
+
process_info : dict
|
|
1785
|
+
A dictionary containing the process documentation information.
|
|
1786
|
+
Expected keys:
|
|
1787
|
+
- PROCESS_ID: str
|
|
1788
|
+
- EXPLAIN_ANALYSIS: str
|
|
1789
|
+
- OPTIMIZATION_SCORE: int
|
|
1790
|
+
- EXPLAIN_WARNINGS: list[str]
|
|
1791
|
+
- EXPLAIN_RECOMMENDATIONS: list[str]
|
|
1792
|
+
"""
|
|
1793
|
+
|
|
1794
|
+
explanation = process_info['EXPLAIN_ANALYSIS']
|
|
1795
|
+
optimization_score = process_info['OPTIMIZATION_SCORE']
|
|
1796
|
+
warnings_json = json.dumps(process_info['EXPLAIN_WARNINGS'])
|
|
1797
|
+
recommendations_json= json.dumps(process_info['EXPLAIN_RECOMMENDATIONS'])
|
|
1798
|
+
process_id = process_info['PROCESS_ID']
|
|
1799
|
+
|
|
1800
|
+
# merge into DOCUMENTATION_PROCESS_EXPLAIN
|
|
1801
|
+
query_insert_explain = f"""
|
|
1802
|
+
CURRENT VALIDTIME
|
|
1803
|
+
MERGE INTO {tdfs4ds.SCHEMA}.{tdfs4ds.DOCUMENTATION_PROCESS_EXPLAIN} EXISTING
|
|
1804
|
+
USING (
|
|
1805
|
+
SELECT
|
|
1806
|
+
'{process_id}' AS PROCESS_ID,
|
|
1807
|
+
'{explanation.replace("'", "''")}' AS EXPLAIN_ANALYSIS,
|
|
1808
|
+
{optimization_score} AS OPTIMIZATION_SCORE,
|
|
1809
|
+
'{warnings_json.replace("'", "''")}' AS WARNINGS,
|
|
1810
|
+
'{recommendations_json.replace("'", "''")}' AS RECOMMENDATIONS
|
|
1811
|
+
) UPDATED
|
|
1812
|
+
ON EXISTING.PROCESS_ID = UPDATED.PROCESS_ID
|
|
1813
|
+
WHEN MATCHED THEN
|
|
1814
|
+
UPDATE
|
|
1815
|
+
SET
|
|
1816
|
+
EXPLAIN_ANALYSIS = UPDATED.EXPLAIN_ANALYSIS,
|
|
1817
|
+
OPTIMIZATION_SCORE = UPDATED.OPTIMIZATION_SCORE,
|
|
1818
|
+
WARNINGS = UPDATED.WARNINGS,
|
|
1819
|
+
RECOMMENDATIONS = UPDATED.RECOMMENDATIONS
|
|
1820
|
+
WHEN NOT MATCHED THEN
|
|
1821
|
+
INSERT (
|
|
1822
|
+
UPDATED.PROCESS_ID,
|
|
1823
|
+
UPDATED.EXPLAIN_ANALYSIS,
|
|
1824
|
+
UPDATED.OPTIMIZATION_SCORE,
|
|
1825
|
+
UPDATED.WARNINGS,
|
|
1826
|
+
UPDATED.RECOMMENDATIONS
|
|
1827
|
+
)
|
|
1828
|
+
"""
|
|
1829
|
+
|
|
1830
|
+
# Execute the merge
|
|
1831
|
+
try:
|
|
1832
|
+
tdml.execute_sql(query_insert_explain)
|
|
1833
|
+
logger_safe('info', f'upload_documentation_explain: Uploaded EXPLAIN documentation for process_id {process_id}.')
|
|
1834
|
+
except Exception as e:
|
|
1835
|
+
logger_safe('error', f'upload_documentation_explain: Failed to upload EXPLAIN documentation for process_id {process_id}: {e}')
|
|
1836
|
+
raise
|
|
1837
|
+
|
|
1838
|
+
return
|
|
1839
|
+
|
|
1840
|
+
def display_process_info(process_info: Dict[str, Any] = None, process_id : str = None) -> None:
|
|
1841
|
+
"""
|
|
1842
|
+
Pretty print the documentation and EXPLAIN analysis for a data process from process_info dict or by retrieving it using process_id.
|
|
1843
|
+
|
|
1844
|
+
Parameters
|
|
1845
|
+
----------
|
|
1846
|
+
process_info : dict, optional (default=None)
|
|
1847
|
+
A dictionary containing the process documentation information.
|
|
1848
|
+
If None, process_id must be provided to retrieve the information.
|
|
1849
|
+
process_id : str, optional (default=None)
|
|
1850
|
+
The unique identifier of the data process.
|
|
1851
|
+
If process_info is None, this parameter is used to retrieve the documentation.
|
|
1852
|
+
Returns
|
|
1853
|
+
-------
|
|
1854
|
+
None
|
|
1855
|
+
"""
|
|
1856
|
+
|
|
1857
|
+
if process_info is None:
|
|
1858
|
+
if process_id is None:
|
|
1859
|
+
raise ValueError("Either process_info or process_id must be provided.")
|
|
1860
|
+
logger_safe('info', f'display_process_info: Retrieving documentation for process_id {process_id}.')
|
|
1861
|
+
process_info = get_process_info(process_id)
|
|
1862
|
+
|
|
1863
|
+
# pretty print documentation for info:
|
|
1864
|
+
_print_documentation(
|
|
1865
|
+
documented_sql = process_info.get('DOCUMENTED_SQL', None),
|
|
1866
|
+
entity_description = process_info.get('ENTITY_DESCRIPTION', None),
|
|
1867
|
+
documented_entity_columns = process_info.get('DOCUMENTED_ENTITY_COLUMNS', None),
|
|
1868
|
+
documented_feature_columns = process_info.get('DOCUMENTED_FEATURE_COLUMNS', None),
|
|
1869
|
+
process_id = process_info.get('PROCESS_ID', None),
|
|
1870
|
+
view_name = process_info.get('VIEW_NAME', None),
|
|
1871
|
+
explain_analysis = process_info.get('EXPLAIN_ANALYSIS', None),
|
|
1872
|
+
optimization_score = process_info.get('OPTIMIZATION_SCORE', None),
|
|
1873
|
+
explain_warnings = process_info.get('EXPLAIN_WARNINGS', None),
|
|
1874
|
+
explain_recommendations = process_info.get('EXPLAIN_RECOMMENDATIONS', None),
|
|
1875
|
+
sql_query = process_info.get('PROCESS_SQL', None),
|
|
1876
|
+
)
|
|
1877
|
+
return
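# Display sketch: pass either the dict returned by document_process(), or just a
# process id so the process information is looked up first; the id is a placeholder.
def _example_display_stored_documentation() -> None:
    display_process_info(process_id='my-process-id')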
|