PyPI - tdfs4ds - Versions diffs - 0.2.4.47__py3-none-any.whl → 0.2.5.0__py3-none-any.whl - Mend

tdfs4ds 0.2.4.47__py3-none-any.whl → 0.2.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20)

tdfs4ds/__init__.py +216 -40
tdfs4ds/feature_store/feature_data_processing.py +9 -28
tdfs4ds/feature_store/feature_store_management.py +1 -1
tdfs4ds/genai/__init__.py +27 -351
tdfs4ds/genai/documentation.py +1877 -0
tdfs4ds/process_store/process_store_catalog_management.py +77 -24
tdfs4ds/utils/filter_management.py +21 -12
tdfs4ds/utils/time_management.py +22 -12
{tdfs4ds-0.2.4.47.dist-info → tdfs4ds-0.2.5.0.dist-info}/METADATA +1 -1
{tdfs4ds-0.2.4.47.dist-info → tdfs4ds-0.2.5.0.dist-info}/RECORD +12 -19
tdfs/__init__.py +0 -1
tdfs/data/curves.csv +0 -5086
tdfs/datasets.py +0 -27
tdfs/feature_store.py +0 -723
tdfs4ds/feature_engineering.py +0 -152
tdfs4ds/feature_store.py +0 -1529
tdfs4ds/process_store.py +0 -387
tdfs4ds/utils.py +0 -579
{tdfs4ds-0.2.4.47.dist-info → tdfs4ds-0.2.5.0.dist-info}/WHEEL +0 -0
{tdfs4ds-0.2.4.47.dist-info → tdfs4ds-0.2.5.0.dist-info}/top_level.txt +0 -0

tdfs4ds/genai/__init__.py CHANGED Viewed

@@ -1,351 +1,27 @@
-from typing import Sequence, List
-from langchain_openai import ChatOpenAI
-from langchain_core.prompts import ChatPromptTemplate
-from langchain_core.output_parsers import JsonOutputParser
-from langchain_core.runnables import Runnable, RunnableLambda
-from langchain_core.messages import AIMessage
-def build_llm(
-    llm_service: str = "https://api-dmproject.myddns.me/v1",
-    api_key: str = "YOUR_API_KEY_HERE",  # 🔒 Better not ship the real key in code
-    model_id: str = "mistralai/Ministral-8B-Instruct-2410",
-    temperature: float = 0.2,
-    timeout: int = 120,
-) -> ChatOpenAI:
-    """
-    Build and return a ChatOpenAI client pointed at your vLLM/OpenAI-compatible endpoint.
-    """
-    return ChatOpenAI(
-        base_url=llm_service,
-        api_key=api_key,
-        model=model_id,
-        temperature=temperature,
-        timeout=timeout,
-    )
-from typing import Sequence
-def build_documentation_json_schema(columns, provider="generic"):
-    """
-    Build a provider-appropriate JSON Schema used to enforce strict JSON output
-    for SQL column documentation across multiple LLM backends.
-    This function returns different schema shapes depending on the LLM provider,
-    because each ecosystem uses a different structured-output mechanism:
-    Provider Modes
-    --------------
-    - provider="openai", "azure"
-        Returns the JSON Schema wrapped in OpenAI's `response_format={"type": "json_schema", ...}`
-        structure. Supported by GPT-4.1, GPT-4o, GPT-3.5-Turbo, and Azure OpenAI.
-    - provider="anthropic", "claude"
-        Returns an Anthropic *tool schema* definition. Claude 3.x models use tool
-        schemas to enforce strict JSON output.
-    - provider="ollama"
-        Returns the raw JSON schema that Ollama expects under the `format=` parameter
-        of the generate API. (Ollama 0.2+ supports response schemas.)
-    - provider="vllm"
-        Returns plain JSON Schema for use with vLLM's `guided_json` constrained decoding.
-    - provider="bedrock"
-        Bedrock Claude follows the Anthropic tool schema format.
-        Bedrock Llama / Titan accept plain JSON schema. This function returns the base
-        schema and leaves the final wrapping to the caller.
-    - provider="generic"
-        Returns plain JSON schema. Useful for LLM backends that do not support
-        constrained decoding, prompt-only JSON generation, or post-processing repair.
-    Parameters
-    ----------
-    columns : list[str]
-        Column names to include as required JSON object keys. Each column will map
-        to a string description generated by the model.
-    provider : str, optional
-        The model provider or backend type. Determines the structural format
-        required for constrained generation. One of:
-        "openai", "anthropic", "ollama", "vllm", "bedrock", "generic".
-    Returns
-    -------
-    dict
-        A dictionary representing the JSON Schema or provider-specific wrapper
-        used to enforce strict JSON output during LLM generation.
-    Notes
-    -----
-    - All schemas require that:
-        * the output be a JSON object
-        * keys match exactly the column names
-        * all values be strings
-        * additional properties be disallowed
-    - Not all providers enforce schemas equally:
-        * OpenAI, Claude, and vLLM offer hard guarantees.
-        * Ollama enforces schema reasonably well.
-        * Generic models may require post-processing.
-    """
-    # Base JSON schema — used directly by vLLM, Ollama, Bedrock, fallback
-    base_schema = {
-        "type": "object",
-        "properties": {col: {"type": "string"} for col in columns},
-        "required": list(columns),
-        "additionalProperties": False,
-    }
-    # --- Provider-specific formats ---
-    if provider.lower() in ("openai", "azure", "azure-openai"):
-        # OpenAI's required wrapper structure
-        return {
-            "type": "json_schema",
-            "json_schema": {
-                "name": "ColumnDocumentation",
-                "schema": base_schema,
-                "strict": True,
-            }
-        }
-    if provider.lower() in ("anthropic", "claude"):
-        # Anthropic tool schema
-        # You embed this inside the "tools" field when calling the model
-        return {
-            "name": "column_documentation",
-            "description": "Generate documentation for SQL output columns.",
-            "input_schema": base_schema
-        }
-    if provider.lower() == "ollama":
-        # Ollama's output format schema (unwrapped JSON schema)
-        # Returned directly in: generate(..., format=schema)
-        return base_schema
-    if provider.lower() in ("vllm", "openai-compatible"):
-        # vLLM's guided_json uses *plain JSON Schema*
-        # so return base_schema exactly
-        return base_schema
-    if provider.lower() == "bedrock":
-        # Bedrock Claude uses Anthropic schema
-        # Bedrock Llama uses plain JSON schema
-        # Return base_schema and let caller choose
-        return base_schema
-    # Fallback: generic JSON schema
-    return base_schema
-def build_sql_documentation_chain(llm: ChatOpenAI, columns: Sequence[str], provider: str="vllm", json_constraint=True) -> Runnable:
-    """
-    Build a LangChain Runnable that generates business-focused documentation
-    for a list of SQL output columns, with optional provider-specific JSON
-    constraints (vLLM, OpenAI, Ollama, etc.).
-    The resulting chain expects two input variables:
-        - sql_query: str         → the SQL query whose output is being documented
-        - columns_str: str       → formatted list of columns (e.g. "- col1\n- col2")
-    Parameters
-    ----------
-    llm : ChatOpenAI
-        The language model interface (may point to vLLM, OpenAI, Ollama, etc.).
-    columns : Sequence[str]
-        List of columns that must appear as keys in the output JSON.
-    provider : str, optional (default="vllm")
-        Indicates which structured-output mechanism to use.
-        Supported values:
-            - "vllm"               → uses `guided_json` for strict JSON output
-            - "openai" / "azure"   → uses OpenAI JSON Schema via `response_format`
-            - "ollama"             → uses Ollama's `format=` schema
-            - "openai-compatible"  → alias for vLLM-style guided decoding
-            - any other value      → fall back to unconstrained text output
-    json_constraint : bool, optional (default=True)
-        If True:
-            - a JSON Schema is generated from the column list
-            - provider-specific constrained decoding is applied
-        If False:
-            - the chain does not enforce JSON structure at the LLM level
-            - the model is only guided by the prompt (weaker guarantees)
-    Returns
-    -------
-    Runnable
-        A LangChain Runnable that executes:
-            prompt → LLM (optionally schema-guided) → JSON parser
-        When invoked with:
-            {
-                "sql_query": "...",
-                "columns_str": "- column1\n- column2\n..."
-            }
-        It returns:
-            dict[str, str]
-                A mapping of each requested column name to a short,
-                business-oriented description (≤ 5 sentences).
-    Notes
-    -----
-    - The chain enforces valid JSON when possible:
-        * vLLM → `guided_json`
-        * OpenAI → `response_format={"type": "json_schema", ...}`
-        * Ollama → `format=<schema>`
-    - For unsupported providers, the model may emit imperfect JSON.
-    - Descriptions focus on business meaning, business logic,
-      and optionally technical details only when relevant.
-    """
-    prompt = ChatPromptTemplate.from_template(
-        """
-You are a data documentation assistant.
-Your target audience is business users.
-Your explanations must focus primarily on the business meaning and business logic of each column,
-and you may add technical details only when they meaningfully clarify the business context.
-Given:
-1. A SQL query.
-2. A list of output columns that must be documented.
-Your job:
-- For each column in the provided list, write a clear and concise explanation of what the column represents from a business perspective.
-- Describe the business logic behind how the value is derived or used within the context of the SQL query.
-- Add technical details only if relevant and only to help a business audience understand the concept.
-- Each description must be at most 5 sentences.
-- Do not include any columns that are not in the provided list.
-- If a column name is ambiguous, infer its meaning from the SQL query as best as possible and say so.
-- If you cannot infer anything meaningful, state that clearly (still within 3 sentences).
-- Answer in {language}
-Output format (very important):
-- Return ONLY a valid JSON object.
-- Each top-level key must be exactly the column name.
-- Each value must be a single string with the description.
-Example of the required format:
-{{
-  "customer_id": "Unique identifier of the customer placing the order, used to track customers at a business level. Also serves as the technical key linking orders to customer records.",
-  "order_date": "The business date when the order was created. It represents the transaction date used for reporting and may reflect the source system’s timestamp."
-}}
-Now generate documentation.
-SQL query:
-```sql
-{sql_query}
-```
-Columns to document (only document these):
-{columns_str}
-"""
-    )
-    parser = JsonOutputParser()
-    # ✅ Final chain: prompt -> grammar-constrained LLM -> JSON parser
-    if not json_constraint:
-        chain: Runnable = prompt | llm | parser
-        return chain
-    schema = build_documentation_json_schema(columns, provider=provider)
-    # ---- provider-specific LLM call wrapper ----
-    provider_l = provider.lower()
-    if provider_l in ("vllm", "openai-compatible"):
-        # vLLM guided_json
-        def _call_llm(messages):
-            return llm.invoke(
-                messages,
-                extra_body={"guided_json": schema},
-            )
-    elif provider_l in ("openai", "azure", "azure-openai"):
-        # OpenAI / Azure OpenAI JSON schema via response_format
-        def _call_llm(messages):
-            return llm.invoke(
-                messages,
-                response_format=schema,
-            )
-    elif provider_l == "ollama":
-        # Ollama's `format` parameter (schema directly)
-        def _call_llm(messages):
-            return llm.invoke(
-                messages,
-                format=schema,
-            )
-    else:
-        # Fallback: no hard structure, rely on prompt & parser
-        def _call_llm(messages):
-            return llm.invoke(messages)
-    constrained_llm = RunnableLambda(_call_llm)
-    # Final chain: prompt -> LLM (schema-guided) -> JSON parser
-    def _parse(ai_msg: AIMessage):
-        raw = ai_msg.content
-        return parser.parse(raw)
-    chain: Runnable = prompt | constrained_llm | RunnableLambda(_parse)
-    return chain
-def run_sql_documentation(chain: Runnable, sql_query: str, columns_to_document: Sequence[str], language: str = "English"):
-    """
-    Execute a previously constructed SQL-documentation chain and return
-    business-friendly documentation for the specified SQL output columns.
-    This function prepares the chain inputs (SQL query, formatted column list,
-    target language) and invokes the chain. The chain itself must have been
-    created using `build_sql_documentation_chain()`, which ensures the model
-    produces structured JSON suitable for parsing.
-    Parameters
-    ----------
-    chain : Runnable
-        A LangChain Runnable returned by `build_sql_documentation_chain()`.
-        This Runnable encapsulates:
-            - the prompt template
-            - a provider-specific LLM invocation (with or without JSON constraints)
-            - a JSON output parser
-    sql_query : str
-        The SQL query whose resulting columns should be documented. This query is
-        shown to the model so it can infer business logic, derivation rules, and
-        column meaning.
-    columns_to_document : Sequence[str]
-        The list of column names that must appear as keys in the output JSON.
-        Only these columns will be documented. The order does not matter.
-    language : str, optional (default="English")
-        The target output language for the generated documentation.
-        This value is passed into the prompt’s `{language}` variable.
-        Examples: "English", "French", "German", "Spanish", "Japanese".
-    Returns
-    -------
-    dict[str, str]
-        A dictionary mapping each column name to a human-readable, business-oriented
-        description generated by the model. Example:
-            {
-                "customer_id": "Unique customer identifier used for ...",
-                "order_date": "Business date when the order was created ..."
-            }
-    Notes
-    -----
-    - The output format is determined by the chain's JSON parser. If the model
-      fails to produce valid JSON (e.g., due to unsupported constraints),
-      a `OutputParserException` may be raised.
-    - The resulting descriptions are typically ≤ 5 sentences per column, unless
-      modified in the chain's prompt.
-    """
-    columns_str = "\n".join(f"- {col}" for col in columns_to_document)
-    return chain.invoke({
-        "sql_query": sql_query,
-        "columns_str": columns_str,
-        "language" : language
-    })
+from .documentation import (
+    document_sql_query_columns,
+    document_process,
+    documentation_tables_creation,
+    document_sql_query_explain,
+    build_explain_documentation_chain,
+    run_explain_documentation,
+    build_sql_documentation_chain,
+    run_sql_documentation,
+    build_llm,
+    get_the_explain,
+    display_process_info
+)
+__all__ = [
+    "document_sql_query_columns",
+    "document_process",
+    "documentation_tables_creation",
+    "document_sql_query_explain",
+    "build_explain_documentation_chain",
+    "run_explain_documentation",
+    "build_sql_documentation_chain",
+    "run_sql_documentation",
+    "build_llm",
+    "get_the_explain",
+    "display_process_info"
+]

tdfs4ds 0.2.4.47__py3-none-any.whl → 0.2.5.0__py3-none-any.whl

tdfs4ds 0.2.4.47__py3-none-any.whl → 0.2.5.0__py3-none-any.whl