PyPI - undatum - Versions diffs - 1.0.17__py2.py3-none-any.whl - Mend

undatum 1.0.17__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37) hide show

undatum/__init__.py +9 -0
undatum/__main__.py +25 -0
undatum/ai/__init__.py +145 -0
undatum/ai/base.py +85 -0
undatum/ai/config.py +184 -0
undatum/ai/perplexity.py +79 -0
undatum/ai/providers.py +1002 -0
undatum/ai/schemas.py +42 -0
undatum/cmds/__init__.py +6 -0
undatum/cmds/analyzer.py +697 -0
undatum/cmds/converter.py +646 -0
undatum/cmds/ingester.py +116 -0
undatum/cmds/query.py +68 -0
undatum/cmds/schemer.py +328 -0
undatum/cmds/selector.py +437 -0
undatum/cmds/statistics.py +158 -0
undatum/cmds/textproc.py +59 -0
undatum/cmds/transformer.py +81 -0
undatum/cmds/validator.py +137 -0
undatum/common/__init__.py +6 -0
undatum/common/functions.py +81 -0
undatum/common/iterable.py +222 -0
undatum/common/scheme.py +261 -0
undatum/constants.py +21 -0
undatum/core.py +616 -0
undatum/formats/__init__.py +6 -0
undatum/formats/docx.py +160 -0
undatum/utils.py +298 -0
undatum/validate/__init__.py +11 -0
undatum/validate/commonrules.py +15 -0
undatum/validate/ruscodes.py +202 -0
undatum-1.0.17.dist-info/METADATA +610 -0
undatum-1.0.17.dist-info/RECORD +37 -0
undatum-1.0.17.dist-info/WHEEL +6 -0
undatum-1.0.17.dist-info/entry_points.txt +3 -0
undatum-1.0.17.dist-info/licenses/LICENSE +21 -0
undatum-1.0.17.dist-info/top_level.txt +1 -0

undatum/__init__.py ADDED Viewed

@@ -0,0 +1,9 @@
+# -*- coding: utf8 -*-
+"""
+undatum: a command-line tool for data processing. Brings CSV simplicity to JSON lines and BSON
+"""
+__version__ = "1.0.17"
+__author__ = 'Ivan Begtin'
+__licence__ = 'MIT'

undatum/__main__.py ADDED Viewed

@@ -0,0 +1,25 @@
+#!/usr/bin/env python
+# -*- coding: utf8 -*-
+"""The main entry point. Invoke as `undatum` or `python -m undatum`.
+This module provides the CLI entry point for the undatum package.
+"""
+import sys
+from .core import app
+def main():
+    """Run the undatum command-line application and exit the process.
+    Calls ``app`` (imported from ``.core``). A ``KeyboardInterrupt`` raised
+    while it runs is reported with a short message on stdout instead of a
+    traceback. Whenever ``app()`` returns or is interrupted, the process
+    exits with status 0.
+    """
+    try:
+        app()
+    except KeyboardInterrupt:
+        # Ctrl-C is treated as a normal way to stop: print a notice, no traceback.
+        print("Ctrl-C pressed. Aborting")
+    # Reached after both a normal return and an interrupt.
+    sys.exit(0)
+if __name__ == '__main__':
+    main()

undatum/ai/__init__.py ADDED Viewed

@@ -0,0 +1,145 @@
+"""AI service module for dataset documentation."""
+from typing import Optional, Dict, Any
+from .base import AIService, AIServiceError, AIConfigurationError, AIAPIError
+from .config import get_ai_config, get_provider_config
+from .providers import (
+    OpenAIProvider,
+    OpenRouterProvider,
+    OllamaProvider,
+    LMStudioProvider,
+    PerplexityProvider
+)
+# Provider registry
+PROVIDERS = {
+    'openai': OpenAIProvider,
+    'openrouter': OpenRouterProvider,
+    'ollama': OllamaProvider,
+    'lmstudio': LMStudioProvider,
+    'perplexity': PerplexityProvider,
+}
+def get_ai_service(provider: Optional[str] = None,
+                   config: Optional[Dict[str, Any]] = None) -> AIService:
+    """Get AI service instance based on configuration.
+    Args:
+        provider: Provider name (openai, openrouter, ollama, lmstudio, perplexity)
+                  If None, will be auto-detected from config
+        config: Optional configuration dictionary. If None, will load from
+                environment variables and config files
+    Returns:
+        Configured AI service instance
+    Raises:
+        AIConfigurationError: If provider is not configured or invalid
+    Examples:
+        >>> # Auto-detect from environment
+        >>> service = get_ai_service()
+        >>> # Explicit provider
+        >>> service = get_ai_service('openai', {'api_key': '...', 'model': 'gpt-4'})
+        >>> # From config file
+        >>> service = get_ai_service('ollama')
+    """
+    # Load configuration
+    full_config = get_ai_config(config or {})
+    # Determine provider
+    if provider:
+        provider_name = provider.lower()
+    else:
+        provider_name = full_config.get('provider', '').lower()
+    # Backward compatibility: if PERPLEXITY_API_KEY is set and no provider specified
+    if not provider_name:
+        import os
+        if os.getenv('PERPLEXITY_API_KEY'):
+            provider_name = 'perplexity'
+            full_config['provider'] = 'perplexity'
+    if not provider_name:
+        raise AIConfigurationError(
+            "No AI provider specified. Set UNDATUM_AI_PROVIDER environment variable, "
+            "configure in undatum.yaml, or pass provider argument."
+        )
+    if provider_name not in PROVIDERS:
+        raise AIConfigurationError(
+            f"Unknown provider: {provider_name}. "
+            f"Available providers: {', '.join(PROVIDERS.keys())}"
+        )
+    # Get provider class
+    provider_class = PROVIDERS[provider_name]
+    # Get provider-specific configuration
+    provider_config = get_provider_config(full_config, provider_name)
+    # Instantiate provider
+    try:
+        return provider_class(**provider_config)
+    except AIConfigurationError as e:
+        raise AIConfigurationError(
+            f"Failed to configure {provider_name} provider: {str(e)}"
+        )
+# Backward compatibility: export old function signatures
+def get_fields_info(fields, language='English', ai_service: Optional[AIService] = None):
+    """Get field descriptions (backward compatibility wrapper).
+    Args:
+        fields: List of field names or comma-separated string
+        language: Language for descriptions
+        ai_service: Optional AI service instance. If None, will auto-detect.
+    Returns:
+        Dictionary mapping field names to descriptions
+    """
+    if ai_service is None:
+        ai_service = get_ai_service()
+    # Handle both list and string input
+    if isinstance(fields, str):
+        fields = [f.strip() for f in fields.split(',')]
+    return ai_service.get_fields_info(fields, language)
+def get_description(data, language='English', ai_service: Optional[AIService] = None):
+    """Get dataset description (backward compatibility wrapper).
+    Args:
+        data: Sample data as CSV string
+        language: Language for description
+        ai_service: Optional AI service instance. If None, will auto-detect.
+    Returns:
+        String description of the dataset
+    """
+    if ai_service is None:
+        ai_service = get_ai_service()
+    return ai_service.get_description(data, language)
+__all__ = [
+    'AIService',
+    'AIServiceError',
+    'AIConfigurationError',
+    'AIAPIError',
+    'get_ai_service',
+    'get_fields_info',
+    'get_description',
+    'OpenAIProvider',
+    'OpenRouterProvider',
+    'OllamaProvider',
+    'LMStudioProvider',
+    'PerplexityProvider',
+]

undatum/ai/base.py ADDED Viewed

@@ -0,0 +1,85 @@
+"""Base AI service interface for dataset documentation."""
+from abc import ABC, abstractmethod
+from typing import Dict, Optional
+class AIServiceError(Exception):
+    """Base exception for AI service errors."""
+    pass
+class AIConfigurationError(AIServiceError):
+    """Raised when AI service configuration is invalid."""
+    pass
+class AIAPIError(AIServiceError):
+    """Raised when AI API call fails."""
+    def __init__(self, message: str, status_code: Optional[int] = None, response: Optional[str] = None):
+        super().__init__(message)
+        self.status_code = status_code
+        self.response = response
+class AIService(ABC):
+    """Abstract base class for AI service providers."""
+    def __init__(self, api_key: Optional[str] = None, base_url: Optional[str] = None,
+                 model: Optional[str] = None, timeout: int = 30):
+        """Initialize AI service.
+        Args:
+            api_key: API key for authentication
+            base_url: Base URL for API endpoint (optional, provider-specific defaults)
+            model: Model name to use
+            timeout: Request timeout in seconds
+        """
+        self.api_key = api_key
+        self.base_url = base_url
+        self.model = model
+        self.timeout = timeout
+    @abstractmethod
+    def get_fields_info(self, fields: list[str], language: str = 'English') -> Dict[str, str]:
+        """Get descriptions for a list of field names.
+        Args:
+            fields: List of field names to describe
+            language: Language for descriptions (default: 'English')
+        Returns:
+            Dictionary mapping field names to their descriptions
+        Raises:
+            AIConfigurationError: If service is not properly configured
+            AIAPIError: If API call fails
+        """
+        pass
+    @abstractmethod
+    def get_description(self, data: str, language: str = 'English') -> str:
+        """Get a description of the dataset.
+        Args:
+            data: Sample data as CSV string
+            language: Language for description (default: 'English')
+        Returns:
+            String description of the dataset
+        Raises:
+            AIConfigurationError: If service is not properly configured
+            AIAPIError: If API call fails
+        """
+        pass
+    def _validate_config(self) -> None:
+        """Validate that required configuration is present.
+        Raises:
+            AIConfigurationError: If configuration is invalid
+        """
+        if not self.api_key:
+            raise AIConfigurationError(f"API key is required for {self.__class__.__name__}")
+        if not self.model:
+            raise AIConfigurationError(f"Model is required for {self.__class__.__name__}")

undatum/ai/config.py ADDED Viewed

@@ -0,0 +1,184 @@
+"""Configuration management for AI services."""
+import os
+import yaml
+from pathlib import Path
+from typing import Optional, Dict, Any
+def find_config_file() -> Optional[Path]:
+    """Find configuration file in standard locations.
+    Checks:
+    1. Current directory: undatum.yaml
+    2. Home directory: ~/.undatum/config.yaml
+    Returns:
+        Path to config file if found, None otherwise
+    """
+    # Check current directory
+    current_dir_config = Path.cwd() / 'undatum.yaml'
+    if current_dir_config.exists():
+        return current_dir_config
+    # Check home directory
+    home_config = Path.home() / '.undatum' / 'config.yaml'
+    if home_config.exists():
+        return home_config
+    return None
+def load_config_file() -> Dict[str, Any]:
+    """Load configuration from YAML file.
+    Returns:
+        Dictionary with configuration, empty dict if no file found
+    """
+    config_file = find_config_file()
+    if not config_file:
+        return {}
+    try:
+        with open(config_file, 'r', encoding='utf-8') as f:
+            config = yaml.safe_load(f) or {}
+            return config.get('ai', {})
+    except (yaml.YAMLError, IOError, KeyError):
+        return {}
+def get_env_config() -> Dict[str, Any]:
+    """Load configuration from environment variables.
+    Environment variables:
+    - UNDATUM_AI_PROVIDER: Provider name (openai, openrouter, ollama, lmstudio, perplexity)
+    - {PROVIDER}_API_KEY: API key for the provider
+    - OLLAMA_BASE_URL: Base URL for Ollama (defaults to http://localhost:11434)
+    - LMSTUDIO_BASE_URL: Base URL for LM Studio (defaults to http://localhost:1234/v1)
+    Returns:
+        Dictionary with configuration from environment
+    """
+    config = {}
+    provider = os.getenv('UNDATUM_AI_PROVIDER')
+    if provider:
+        config['provider'] = provider
+    # Check for provider-specific API keys
+    api_keys = {
+        'openai': os.getenv('OPENAI_API_KEY'),
+        'openrouter': os.getenv('OPENROUTER_API_KEY'),
+        'perplexity': os.getenv('PERPLEXITY_API_KEY'),
+    }
+    # Use the first available API key if provider not specified
+    if not provider:
+        for prov, key in api_keys.items():
+            if key:
+                config['provider'] = prov
+                config['api_key'] = key
+                break
+    else:
+        # Use provider-specific key
+        key = api_keys.get(provider.lower())
+        if key:
+            config['api_key'] = key
+    # Provider-specific base URLs
+    ollama_url = os.getenv('OLLAMA_BASE_URL')
+    if ollama_url:
+        config['ollama_base_url'] = ollama_url
+    lmstudio_url = os.getenv('LMSTUDIO_BASE_URL')
+    if lmstudio_url:
+        config['lmstudio_base_url'] = lmstudio_url
+    return config
+def merge_config(cli_config: Optional[Dict[str, Any]] = None,
+                 file_config: Optional[Dict[str, Any]] = None,
+                 env_config: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
+    """Merge configurations with precedence: CLI > File > Environment.
+    Args:
+        cli_config: Configuration from CLI arguments
+        file_config: Configuration from config file
+        env_config: Configuration from environment variables
+    Returns:
+        Merged configuration dictionary
+    """
+    if env_config is None:
+        env_config = get_env_config()
+    if file_config is None:
+        file_config = load_config_file()
+    if cli_config is None:
+        cli_config = {}
+    # Start with environment config (lowest precedence)
+    merged = env_config.copy()
+    # Override with file config
+    merged.update(file_config)
+    # Override with CLI config (highest precedence)
+    merged.update(cli_config)
+    return merged
+def get_ai_config(cli_config: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
+    """Get AI configuration with proper precedence.
+    Args:
+        cli_config: Optional CLI configuration to override defaults
+    Returns:
+        Complete AI configuration dictionary
+    """
+    return merge_config(cli_config=cli_config)
+def get_provider_config(config: Dict[str, Any], provider: str) -> Dict[str, Any]:
+    """Extract provider-specific configuration.
+    Args:
+        config: Full configuration dictionary
+        provider: Provider name
+    Returns:
+        Provider-specific configuration
+    """
+    # Map providers to their environment variable API keys
+    provider_api_keys = {
+        'openai': os.getenv('OPENAI_API_KEY'),
+        'openrouter': os.getenv('OPENROUTER_API_KEY'),
+        'perplexity': os.getenv('PERPLEXITY_API_KEY'),
+    }
+    # Determine API key: use provider-specific env var if available, otherwise use config
+    api_key = None
+    if provider.lower() in provider_api_keys:
+        # Always prefer provider-specific environment variable
+        api_key = provider_api_keys[provider.lower()]
+    # Fall back to config api_key if provider-specific env var not set
+    if not api_key:
+        api_key = config.get('api_key')
+    provider_config = {
+        'api_key': api_key,
+        'model': config.get('model'),
+        'timeout': config.get('timeout', 30),
+    }
+    # Provider-specific base URLs
+    if provider == 'ollama':
+        provider_config['base_url'] = config.get('ollama_base_url') or config.get('base_url')
+    elif provider == 'lmstudio':
+        provider_config['base_url'] = config.get('lmstudio_base_url') or config.get('base_url')
+    else:
+        provider_config['base_url'] = config.get('base_url')
+    return provider_config

undatum/ai/perplexity.py ADDED Viewed

@@ -0,0 +1,79 @@
+"""AI-powered data analysis using Perplexity API."""
+import requests
+import csv
+import sys
+import os
+from io import StringIO
+PERPLEXITY_API_KEY = os.getenv('PERPLEXITY_API_KEY', )
+def find_between( s, first, last ):
+    try:
+        start = s.index( first ) + len( first )
+        end = s.index( last, start )
+        return s[start:end]
+    except ValueError:
+        return ""
+def get_fields_info(fields, language='English'):
+    """Returns information about data fields"""
+    url = "https://api.perplexity.ai/chat/completions"
+    headers = {"Authorization": f"Bearer {PERPLEXITY_API_KEY}"}
+    payload = {
+        "model": "sonar",
+        "messages": [
+            {"role": "system", "content": "Be precise and concise, provide data output only CSV or JSON, accrording to request"},
+            {"role": "user", "content": (
+                f"Please describe in {language} these fields delimited by comma: {fields}"
+                "Please output as single csv table only with following fields: name and description"
+            )},
+        ],
+        "response_format": {
+                "type": "text",
+        },
+    }
+    response = requests.post(url, headers=headers, json=payload).json()
+    text = response["choices"][0]["message"]["content"]
+    a_text = find_between(text, "```csv", "```").strip()
+    if len(a_text) == 0:
+        a_text = find_between(text, "```", "```").strip()
+    f = StringIO()
+    f.write(a_text)
+    f.seek(0)
+    table = {}
+    dr = csv.reader(f, delimiter=',')
+    n = 0
+    for r in dr:
+        n += 1
+        if n == 1: continue
+        table[r[0]] = r[1]
+    return table
+def get_description(data, language='English'):
+    url = "https://api.perplexity.ai/chat/completions"
+    headers = {"Authorization": f"Bearer {PERPLEXITY_API_KEY}"}
+    payload = {
+        "model": "sonar",
+        "messages": [
+            {"role": "system", "content": "Be precise and concise, provide data output only CSV or JSON, accrording to request"},
+            {"role": "user", "content": (
+                f"""
+I have the following CSV data:
+{data}
+Please provide short description in about this data in {language}. Consider this data as sample of the bigger dataset.Don't generate any code and data examples""")},
+        ],
+        "response_format": {
+                "type": "text",
+        },
+    }
+    response = requests.post(url, headers=headers, json=payload).json()
+    return response["choices"][0]["message"]["content"]
+if __name__ == "__main__":
+    print(get_fields_info(sys.argv[1], sys.argv[2]))