tdfs4ds 0.2.4.47__py3-none-any.whl → 0.2.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tdfs4ds/genai/__init__.py CHANGED
@@ -1,351 +1,27 @@
1
- from typing import Sequence, List
2
-
3
- from langchain_openai import ChatOpenAI
4
- from langchain_core.prompts import ChatPromptTemplate
5
- from langchain_core.output_parsers import JsonOutputParser
6
- from langchain_core.runnables import Runnable, RunnableLambda
7
- from langchain_core.messages import AIMessage
8
-
9
-
10
def build_llm(
    llm_service: str = "https://api-dmproject.myddns.me/v1",
    api_key: str = "YOUR_API_KEY_HERE",  # placeholder only: supply the real key at call time
    model_id: str = "mistralai/Ministral-8B-Instruct-2410",
    temperature: float = 0.2,
    timeout: int = 120,
) -> ChatOpenAI:
    """Create a ChatOpenAI client bound to an OpenAI-compatible endpoint.

    Parameters
    ----------
    llm_service : str
        Base URL of the endpoint; forwarded to ChatOpenAI as ``base_url``.
    api_key : str
        API key forwarded to ChatOpenAI. The default is a placeholder, not a
        working credential.
    model_id : str
        Model identifier; forwarded to ChatOpenAI as ``model``.
    temperature : float
        Sampling temperature forwarded unchanged.
    timeout : int
        Request timeout forwarded unchanged (presumably seconds; confirm
        against the ChatOpenAI documentation).

    Returns
    -------
    ChatOpenAI
        The configured client.
    """
    # Map this function's parameter names onto the ChatOpenAI keyword names.
    client_options = {
        "base_url": llm_service,
        "api_key": api_key,
        "model": model_id,
        "temperature": temperature,
        "timeout": timeout,
    }
    return ChatOpenAI(**client_options)
27
-
28
-
29
- from typing import Sequence
30
-
31
def build_documentation_json_schema(columns, provider="generic"):
    """Build the JSON Schema (or provider wrapper) describing column documentation.

    The base schema is a JSON object with one required string property per
    column and no additional properties allowed. Depending on ``provider`` the
    base schema is returned as-is or embedded in a provider-specific envelope.

    Parameters
    ----------
    columns : iterable of str
        Column names that become the required keys of the JSON object.
        Any iterable is accepted, including one-shot iterators; duplicate
        names are collapsed while keeping first-seen order.
    provider : str, optional
        Backend name, matched case-insensitively. ``None`` is treated like
        ``"generic"``.

        - ``"openai"``, ``"azure"``, ``"azure-openai"``: the base schema wrapped
          as ``{"type": "json_schema", "json_schema": {...}}`` (the shape used
          for a ``response_format`` argument).
        - ``"anthropic"``, ``"claude"``: a tool definition with the base schema
          under ``"input_schema"``.
        - anything else (``"vllm"``, ``"openai-compatible"``, ``"ollama"``,
          ``"bedrock"``, ``"generic"``, unknown values): the plain base schema.
          For ``"bedrock"`` any further wrapping is left to the caller.

    Returns
    -------
    dict
        The plain JSON Schema or the provider-specific wrapper around it.
    """
    # Materialise the input exactly once: iterating `columns` twice would leave
    # `required` empty for a generator. dict.fromkeys drops duplicates in order,
    # because JSON Schema requires the entries of `required` to be unique.
    column_names = list(dict.fromkeys(columns))

    base_schema = {
        "type": "object",
        "properties": {name: {"type": "string"} for name in column_names},
        "required": column_names,
        "additionalProperties": False,
    }

    # Normalise once instead of lower-casing in every comparison.
    provider_key = (provider or "generic").lower()

    if provider_key in ("openai", "azure", "azure-openai"):
        # OpenAI-style response_format envelope.
        return {
            "type": "json_schema",
            "json_schema": {
                "name": "ColumnDocumentation",
                "schema": base_schema,
                "strict": True,
            },
        }

    if provider_key in ("anthropic", "claude"):
        # Anthropic-style tool definition; meant to be placed in the "tools" field.
        return {
            "name": "column_documentation",
            "description": "Generate documentation for SQL output columns.",
            "input_schema": base_schema,
        }

    # vLLM (guided_json), Ollama (format=), Bedrock and every other backend
    # receive the unwrapped schema.
    return base_schema
143
-
144
-
145
def build_sql_documentation_chain(llm: ChatOpenAI, columns: Sequence[str], provider: str="vllm", json_constraint=True) -> Runnable:
    """Build a Runnable that documents SQL output columns for business users.

    The chain is ``prompt -> LLM -> JSON parser``. When ``json_constraint`` is
    true, the LLM call additionally receives a provider-specific argument
    carrying a JSON Schema derived from ``columns``.

    The chain must be invoked with three input variables:

    - ``sql_query``: the SQL query whose output is being documented
    - ``columns_str``: the formatted column list (e.g. ``"- col1\\n- col2"``)
    - ``language``: the language the descriptions should be written in

    Parameters
    ----------
    llm : ChatOpenAI
        The chat model client used for generation.
    columns : Sequence[str]
        Column names used to build the JSON Schema. Only read when
        ``json_constraint`` is true.
    provider : str, optional (default="vllm")
        Selects how the schema is handed to ``llm.invoke`` (case-insensitive):

        - ``"vllm"`` / ``"openai-compatible"``: ``extra_body={"guided_json": schema}``
        - ``"openai"`` / ``"azure"`` / ``"azure-openai"``: ``response_format=schema``
        - ``"ollama"``: ``format=schema``
        - any other value: no extra argument; only the prompt guides the output
    json_constraint : bool, optional (default=True)
        If false, the schema is not built and the chain is simply
        ``prompt | llm | parser``.

    Returns
    -------
    Runnable
        A chain whose result is the parsed JSON object produced by the model,
        expected to map each column name to its description.
    """
    prompt = ChatPromptTemplate.from_template(
        """
You are a data documentation assistant.

Your target audience is business users.
Your explanations must focus primarily on the business meaning and business logic of each column,
and you may add technical details only when they meaningfully clarify the business context.

Given:
1. A SQL query.
2. A list of output columns that must be documented.

Your job:
- For each column in the provided list, write a clear and concise explanation of what the column represents from a business perspective.
- Describe the business logic behind how the value is derived or used within the context of the SQL query.
- Add technical details only if relevant and only to help a business audience understand the concept.
- Each description must be at most 5 sentences.
- Do not include any columns that are not in the provided list.
- If a column name is ambiguous, infer its meaning from the SQL query as best as possible and say so.
- If you cannot infer anything meaningful, state that clearly (still within 5 sentences).
- Answer in {language}
Output format (very important):
- Return ONLY a valid JSON object.
- Each top-level key must be exactly the column name.
- Each value must be a single string with the description.

Example of the required format:
{{
"customer_id": "Unique identifier of the customer placing the order, used to track customers at a business level. Also serves as the technical key linking orders to customer records.",
"order_date": "The business date when the order was created. It represents the transaction date used for reporting and may reflect the source system’s timestamp."
}}

Now generate documentation.

SQL query:
```sql
{sql_query}
```
Columns to document (only document these):
{columns_str}
"""
    )
    parser = JsonOutputParser()

    if not json_constraint:
        # Unconstrained chain: the prompt alone asks for JSON output.
        return prompt | llm | parser

    schema = build_documentation_json_schema(columns, provider=provider)
    provider_key = provider.lower()

    # Pick the keyword argument through which this backend receives the schema.
    if provider_key in ("vllm", "openai-compatible"):
        invoke_kwargs = {"extra_body": {"guided_json": schema}}
    elif provider_key in ("openai", "azure", "azure-openai"):
        invoke_kwargs = {"response_format": schema}
    elif provider_key == "ollama":
        invoke_kwargs = {"format": schema}
    else:
        # No known structured-output mechanism: rely on prompt and parser only.
        invoke_kwargs = {}

    def _call_llm(messages):
        return llm.invoke(messages, **invoke_kwargs)

    def _parse(ai_msg: AIMessage):
        # Parse the text content of the model reply as JSON.
        return parser.parse(ai_msg.content)

    # Schema-guided chain: prompt -> LLM (with provider argument) -> JSON parser.
    return prompt | RunnableLambda(_call_llm) | RunnableLambda(_parse)
293
-
294
def run_sql_documentation(chain: Runnable, sql_query: str, columns_to_document: Sequence[str], language: str = "English"):
    """Invoke a SQL-documentation chain and return its result.

    Formats the column names as a dash-prefixed bullet list, then calls
    ``chain.invoke`` with the three inputs ``sql_query``, ``columns_str`` and
    ``language``.

    Parameters
    ----------
    chain : Runnable
        The chain to invoke, typically the one returned by
        ``build_sql_documentation_chain()``.
    sql_query : str
        The SQL query whose output columns should be documented.
    columns_to_document : Sequence[str]
        Column names to document. A single bare string is treated as one
        column name rather than as a sequence of characters.
    language : str, optional (default="English")
        Target language, passed through as the ``language`` input.

    Returns
    -------
    Whatever ``chain.invoke`` returns. For a chain built by
    ``build_sql_documentation_chain()`` this is expected to be a dict mapping
    column names to descriptions.

    Notes
    -----
    If the model output is not valid JSON, the chain's parser may raise an
    ``OutputParserException``.
    """
    # A str is itself iterable: without this guard "total" would be rendered
    # as "- t\n- o\n- t\n- a\n- l" instead of "- total".
    if isinstance(columns_to_document, str):
        columns_to_document = [columns_to_document]

    columns_str = "\n".join(f"- {col}" for col in columns_to_document)

    return chain.invoke({
        "sql_query": sql_query,
        "columns_str": columns_str,
        "language": language,
    })
1
+ from .documentation import (
2
+ document_sql_query_columns,
3
+ document_process,
4
+ documentation_tables_creation,
5
+ document_sql_query_explain,
6
+ build_explain_documentation_chain,
7
+ run_explain_documentation,
8
+ build_sql_documentation_chain,
9
+ run_sql_documentation,
10
+ build_llm,
11
+ get_the_explain,
12
+ display_process_info
13
+ )
14
+
15
# Public API of this package: exactly the names imported from .documentation.
__all__ = [
    "document_sql_query_columns",
    "document_process",
    "documentation_tables_creation",
    "document_sql_query_explain",
    "build_explain_documentation_chain",
    "run_explain_documentation",
    "build_sql_documentation_chain",
    "run_sql_documentation",
    "build_llm",
    "get_the_explain",
    "display_process_info"
]