undatum 1.0.17__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
undatum/__init__.py ADDED
@@ -0,0 +1,9 @@
1
+ # -*- coding: utf8 -*-
2
+ """
3
+ undatum: a command-line tool for data processing. Brings CSV simplicity to JSON lines and BSON
4
+
5
+ """
6
+
7
+ __version__ = "1.0.17"
8
+ __author__ = 'Ivan Begtin'
9
+ __licence__ = 'MIT'
undatum/__main__.py ADDED
@@ -0,0 +1,25 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf8 -*-
3
+ """The main entry point. Invoke as `undatum` or `python -m undatum`.
4
+
5
+ This module provides the CLI entry point for the undatum package.
6
+ """
7
+ import sys
8
+
9
+ from .core import app
10
+
11
+
12
def main():
    """Run the undatum command-line application.

    Calls ``app`` and, if the user interrupts it from the keyboard, prints a
    short notice and terminates the process with exit status 0.
    """
    try:
        app()
    except KeyboardInterrupt:
        print("Ctrl-C pressed. Aborting")
        raise SystemExit(0)


if __name__ == '__main__':
    main()
undatum/ai/__init__.py ADDED
@@ -0,0 +1,145 @@
1
+ """AI service module for dataset documentation."""
2
+ from typing import Optional, Dict, Any
3
+
4
+ from .base import AIService, AIServiceError, AIConfigurationError, AIAPIError
5
+ from .config import get_ai_config, get_provider_config
6
+ from .providers import (
7
+ OpenAIProvider,
8
+ OpenRouterProvider,
9
+ OllamaProvider,
10
+ LMStudioProvider,
11
+ PerplexityProvider
12
+ )
13
+
14
# Provider registry: maps the lowercase provider name to the class that
# implements it. get_ai_service() lowercases the requested name before
# looking it up here.
PROVIDERS = {
    'openai': OpenAIProvider,
    'openrouter': OpenRouterProvider,
    'ollama': OllamaProvider,
    'lmstudio': LMStudioProvider,
    'perplexity': PerplexityProvider,
}
22
+
23
+
24
def get_ai_service(provider: Optional[str] = None,
                   config: Optional[Dict[str, Any]] = None) -> AIService:
    """Get AI service instance based on configuration.

    Args:
        provider: Provider name (openai, openrouter, ollama, lmstudio, perplexity).
            Matching is case-insensitive and ignores surrounding whitespace.
            If None, the provider is taken from the loaded configuration.
        config: Optional configuration dictionary. If None, configuration is
            loaded from environment variables and config files.

    Returns:
        Configured AI service instance

    Raises:
        AIConfigurationError: If no provider can be determined, the provider
            name is not registered in PROVIDERS, or the provider rejects its
            configuration.

    Examples:
        >>> # Auto-detect from environment
        >>> service = get_ai_service()

        >>> # Explicit provider
        >>> service = get_ai_service('openai', {'api_key': '...', 'model': 'gpt-4'})

        >>> # From config file
        >>> service = get_ai_service('ollama')
    """
    import os  # function-scope import, only needed for the legacy fallback below

    # Load configuration
    full_config = get_ai_config(config or {})

    # An explicit argument wins over the configured provider. The configured
    # value may be absent or None (e.g. an empty ``provider:`` key in YAML),
    # so coerce to a string before normalizing instead of calling .lower()
    # on it directly.
    raw_provider = provider or full_config.get('provider') or ''
    provider_name = str(raw_provider).strip().lower()

    # Backward compatibility: if PERPLEXITY_API_KEY is set and no provider specified
    if not provider_name and os.getenv('PERPLEXITY_API_KEY'):
        provider_name = 'perplexity'
        full_config['provider'] = 'perplexity'

    if not provider_name:
        raise AIConfigurationError(
            "No AI provider specified. Set UNDATUM_AI_PROVIDER environment variable, "
            "configure in undatum.yaml, or pass provider argument."
        )

    if provider_name not in PROVIDERS:
        raise AIConfigurationError(
            f"Unknown provider: {provider_name}. "
            f"Available providers: {', '.join(PROVIDERS.keys())}"
        )

    provider_class = PROVIDERS[provider_name]
    provider_config = get_provider_config(full_config, provider_name)

    try:
        return provider_class(**provider_config)
    except AIConfigurationError as e:
        # Re-raise with the provider name for context, keeping the original
        # error attached as the explicit cause.
        raise AIConfigurationError(
            f"Failed to configure {provider_name} provider: {str(e)}"
        ) from e
91
+
92
+
93
+ # Backward compatibility: export old function signatures
94
def get_fields_info(fields, language='English', ai_service: Optional[AIService] = None):
    """Get field descriptions (backward compatibility wrapper).

    Args:
        fields: List of field names or comma-separated string
        language: Language for descriptions
        ai_service: Optional AI service instance. If None, one is obtained
            from get_ai_service().

    Returns:
        Whatever ``ai_service.get_fields_info`` returns for the field list
        (a dictionary mapping field names to descriptions per the AIService
        interface).
    """
    if ai_service is None:
        ai_service = get_ai_service()

    # Handle both list and string input
    if isinstance(fields, str):
        # Drop empty entries so stray or trailing commas ("a,,b,") do not
        # become blank field names sent to the provider.
        stripped = (part.strip() for part in fields.split(','))
        fields = [name for name in stripped if name]

    return ai_service.get_fields_info(fields, language)
113
+
114
+
115
def get_description(data, language='English', ai_service: Optional[AIService] = None):
    """Describe a dataset (wrapper kept for backward compatibility).

    Args:
        data: Sample data as CSV string
        language: Language the description should be written in
        ai_service: AI service to use; when None, get_ai_service() supplies one.

    Returns:
        The result of ``get_description(data, language)`` on the service.
    """
    service = get_ai_service() if ai_service is None else ai_service
    return service.get_description(data, language)
130
+
131
+
132
# Public API of the package: base class and exceptions, the factory and
# backward-compatibility helpers, and the concrete provider classes.
__all__ = [
    'AIService',
    'AIServiceError',
    'AIConfigurationError',
    'AIAPIError',
    'get_ai_service',
    'get_fields_info',
    'get_description',
    'OpenAIProvider',
    'OpenRouterProvider',
    'OllamaProvider',
    'LMStudioProvider',
    'PerplexityProvider',
]
undatum/ai/base.py ADDED
@@ -0,0 +1,85 @@
1
+ """Base AI service interface for dataset documentation."""
2
+ from abc import ABC, abstractmethod
3
+ from typing import Dict, Optional
4
+
5
+
6
class AIServiceError(Exception):
    """Common base class for errors raised by AI services."""
9
+
10
+
11
class AIConfigurationError(AIServiceError):
    """Signals that the configuration of an AI service is invalid."""
14
+
15
+
16
class AIAPIError(AIServiceError):
    """Signals that a call to an AI API failed.

    Attributes:
        status_code: Status code supplied by the raiser, or None.
        response: Response text supplied by the raiser, or None.
    """

    def __init__(self, message: str, status_code: Optional[int] = None,
                 response: Optional[str] = None):
        super().__init__(message)
        self.status_code, self.response = status_code, response
22
+
23
+
24
class AIService(ABC):
    """Interface shared by all AI service providers."""

    def __init__(self, api_key: Optional[str] = None, base_url: Optional[str] = None,
                 model: Optional[str] = None, timeout: int = 30):
        """Store the connection settings on the instance.

        Args:
            api_key: API key for authentication
            base_url: Base URL for API endpoint (optional, provider-specific defaults)
            model: Model name to use
            timeout: Request timeout in seconds
        """
        self.api_key, self.base_url = api_key, base_url
        self.model, self.timeout = model, timeout

    @abstractmethod
    def get_fields_info(self, fields: list[str], language: str = 'English') -> Dict[str, str]:
        """Describe each of the given field names.

        Args:
            fields: List of field names to describe
            language: Language for descriptions (default: 'English')

        Returns:
            Dictionary mapping field names to their descriptions

        Raises:
            AIConfigurationError: If service is not properly configured
            AIAPIError: If API call fails
        """

    @abstractmethod
    def get_description(self, data: str, language: str = 'English') -> str:
        """Describe the dataset that ``data`` is a sample of.

        Args:
            data: Sample data as CSV string
            language: Language for description (default: 'English')

        Returns:
            String description of the dataset

        Raises:
            AIConfigurationError: If service is not properly configured
            AIAPIError: If API call fails
        """

    def _validate_config(self) -> None:
        """Check that both an API key and a model are set.

        The API key is checked first, then the model.

        Raises:
            AIConfigurationError: If either value is missing or empty
        """
        required = (("api_key", "API key"), ("model", "Model"))
        for attribute, label in required:
            if not getattr(self, attribute):
                raise AIConfigurationError(
                    f"{label} is required for {self.__class__.__name__}"
                )
undatum/ai/config.py ADDED
@@ -0,0 +1,184 @@
1
+ """Configuration management for AI services."""
2
+ import os
3
+ import yaml
4
+ from pathlib import Path
5
+ from typing import Optional, Dict, Any
6
+
7
+
8
def find_config_file() -> Optional[Path]:
    """Find configuration file in standard locations.

    Checks, in order:
    1. Current directory: undatum.yaml
    2. Home directory: ~/.undatum/config.yaml

    Only regular files count: a directory that happens to carry one of these
    names is skipped so it cannot shadow a real config file further down the
    list.

    Returns:
        Path to config file if found, None otherwise
    """
    current_dir_config = Path.cwd() / 'undatum.yaml'
    if current_dir_config.is_file():
        return current_dir_config

    try:
        home_config = Path.home() / '.undatum' / 'config.yaml'
    except RuntimeError:
        # Path.home() raises RuntimeError when the home directory cannot be
        # resolved; treat that as "no home config" rather than failing.
        return None

    if home_config.is_file():
        return home_config

    return None
29
+
30
+
31
def load_config_file() -> Dict[str, Any]:
    """Load the ``ai`` section of the YAML configuration file.

    Loading is best effort: a missing, unreadable, undecodable or malformed
    file, a file whose top level is not a mapping, or an ``ai`` entry that is
    not a mapping all yield an empty dict, so callers can always treat the
    result as a dict.

    Returns:
        Dictionary with the ``ai`` configuration, empty dict if unavailable
    """
    config_file = find_config_file()
    if not config_file:
        return {}

    try:
        with open(config_file, 'r', encoding='utf-8') as f:
            config = yaml.safe_load(f)
    except (yaml.YAMLError, OSError, UnicodeDecodeError):
        return {}

    # safe_load returns None for an empty document and may return a list or
    # scalar; only a mapping can carry an ``ai`` section.
    if not isinstance(config, dict):
        return {}

    ai_section = config.get('ai')
    # ``ai:`` with no value parses to None; never hand that to dict.update().
    return ai_section if isinstance(ai_section, dict) else {}
47
+
48
+
49
def get_env_config() -> Dict[str, Any]:
    """Build a configuration dictionary from environment variables.

    Environment variables:
    - UNDATUM_AI_PROVIDER: Provider name (openai, openrouter, ollama, lmstudio, perplexity)
    - {PROVIDER}_API_KEY: API key for the provider
    - OLLAMA_BASE_URL: Base URL for Ollama (defaults to http://localhost:11434)
    - LMSTUDIO_BASE_URL: Base URL for LM Studio (defaults to http://localhost:1234/v1)

    When UNDATUM_AI_PROVIDER is unset or empty, the first provider (openai,
    openrouter, perplexity, in that order) with a non-empty API key is
    selected together with its key.

    Returns:
        Dictionary with configuration from environment
    """
    settings: Dict[str, Any] = {}

    selected = os.getenv('UNDATUM_AI_PROVIDER')
    if selected:
        settings['provider'] = selected

    # API keys known per provider, in detection order
    env_keys = {
        name: os.getenv(variable)
        for name, variable in (
            ('openai', 'OPENAI_API_KEY'),
            ('openrouter', 'OPENROUTER_API_KEY'),
            ('perplexity', 'PERPLEXITY_API_KEY'),
        )
    }

    if selected:
        # Explicit provider: only its own key is relevant
        own_key = env_keys.get(selected.lower())
        if own_key:
            settings['api_key'] = own_key
    else:
        # No provider given: fall back to the first one that has a key
        first_hit = next(((name, key) for name, key in env_keys.items() if key), None)
        if first_hit is not None:
            settings['provider'], settings['api_key'] = first_hit

    # Provider-specific base URLs
    for option, variable in (
        ('ollama_base_url', 'OLLAMA_BASE_URL'),
        ('lmstudio_base_url', 'LMSTUDIO_BASE_URL'),
    ):
        value = os.getenv(variable)
        if value:
            settings[option] = value

    return settings
97
+
98
+
99
def merge_config(cli_config: Optional[Dict[str, Any]] = None,
                 file_config: Optional[Dict[str, Any]] = None,
                 env_config: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    """Combine the three configuration layers; CLI beats file beats environment.

    Args:
        cli_config: Configuration from CLI arguments (empty when None)
        file_config: Configuration from config file (loaded when None)
        env_config: Configuration from environment variables (loaded when None)

    Returns:
        Merged configuration dictionary
    """
    env_layer = get_env_config() if env_config is None else env_config
    file_layer = load_config_file() if file_config is None else file_config
    cli_layer = {} if cli_config is None else cli_config

    # Copy the lowest-precedence layer, then let each higher layer overwrite it
    merged = env_layer.copy()
    for layer in (file_layer, cli_layer):
        merged.update(layer)
    return merged
129
+
130
+
131
def get_ai_config(cli_config: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    """Get AI configuration with proper precedence.

    Thin wrapper around merge_config(): only ``cli_config`` is forwarded, so
    merge_config() loads the file and environment layers itself.

    Args:
        cli_config: Optional CLI configuration to override defaults

    Returns:
        Complete AI configuration dictionary
    """
    return merge_config(cli_config=cli_config)
141
+
142
+
143
def get_provider_config(config: Dict[str, Any], provider: str) -> Dict[str, Any]:
    """Extract provider-specific configuration.

    The provider name is matched case-insensitively for both the API-key
    lookup and the base-URL selection.

    Args:
        config: Full configuration dictionary
        provider: Provider name

    Returns:
        Dictionary with the keys ``api_key``, ``model``, ``timeout``
        (default 30) and ``base_url``.
    """
    provider_key = provider.lower()

    # Environment variable holding the API key for each hosted provider
    api_key_env_vars = {
        'openai': 'OPENAI_API_KEY',
        'openrouter': 'OPENROUTER_API_KEY',
        'perplexity': 'PERPLEXITY_API_KEY',
    }

    # The provider-specific environment variable takes priority over the
    # generic ``api_key`` entry in config; config is only the fallback.
    api_key = None
    env_var = api_key_env_vars.get(provider_key)
    if env_var:
        api_key = os.getenv(env_var)
    if not api_key:
        api_key = config.get('api_key')

    provider_config = {
        'api_key': api_key,
        'model': config.get('model'),
        'timeout': config.get('timeout', 30),
    }

    # Ollama and LM Studio have a dedicated base-URL setting that wins over
    # the generic ``base_url``; every other provider uses the generic one.
    generic_base_url = config.get('base_url')
    if provider_key in ('ollama', 'lmstudio'):
        provider_config['base_url'] = (
            config.get(f'{provider_key}_base_url') or generic_base_url
        )
    else:
        provider_config['base_url'] = generic_base_url

    return provider_config
@@ -0,0 +1,79 @@
1
+ """AI-powered data analysis using Perplexity API."""
2
+ import requests
3
+ import csv
4
+ import sys
5
+ import os
6
+ from io import StringIO
7
+
8
# API key read once at import time; None when the variable is not set.
PERPLEXITY_API_KEY = os.getenv('PERPLEXITY_API_KEY')
9
+
10
+
11
def find_between(s, first, last):
    """Return the text of ``s`` between ``first`` and the next ``last``.

    The search for ``last`` starts right after the first occurrence of
    ``first``. An empty string is returned when either marker is not found.
    """
    opening = s.find(first)
    if opening < 0:
        return ""
    begin = opening + len(first)
    closing = s.find(last, begin)
    if closing < 0:
        return ""
    return s[begin:closing]
18
+
19
def get_fields_info(fields, language='English'):
    """Return descriptions of data fields obtained from the Perplexity API.

    Sends the field list to the chat-completions endpoint, extracts the first
    fenced block (```csv ... ``` or, failing that, ``` ... ```) from the reply
    and parses it as CSV.

    Args:
        fields: Field names, delimited by commas
        language: Language the descriptions should be written in

    Returns:
        Dictionary mapping the first CSV column (name) to the second
        (description). The first CSV row is treated as a header and skipped;
        rows with fewer than two columns are ignored.

    Raises:
        requests.RequestException: If the request fails, times out or the API
            answers with an HTTP error status.
    """
    url = "https://api.perplexity.ai/chat/completions"
    headers = {"Authorization": f"Bearer {PERPLEXITY_API_KEY}"}
    payload = {
        "model": "sonar",
        "messages": [
            {"role": "system", "content": "Be precise and concise, provide data output only CSV or JSON, accrording to request"},
            {"role": "user", "content": (
                # The separator keeps the field list from running straight
                # into the next sentence of the prompt.
                f"Please describe in {language} these fields delimited by comma: {fields}. "
                "Please output as single csv table only with following fields: name and description"
            )},
        ],
        "response_format": {
            "type": "text",
        },
    }
    # A timeout prevents hanging forever on a stalled connection, and
    # raise_for_status surfaces HTTP errors instead of a KeyError further down.
    reply = requests.post(url, headers=headers, json=payload, timeout=30)
    reply.raise_for_status()
    text = reply.json()["choices"][0]["message"]["content"]

    a_text = find_between(text, "```csv", "```").strip()
    if not a_text:
        a_text = find_between(text, "```", "```").strip()

    table = {}
    rows = csv.reader(StringIO(a_text), delimiter=',')
    next(rows, None)  # skip the header row
    for row in rows:
        if len(row) < 2:
            # blank or malformed line in the model output
            continue
        table[row[0]] = row[1]
    return table
52
+
53
+
54
+
55
def get_description(data, language='English'):
    """Return a short description of a dataset obtained from the Perplexity API.

    Args:
        data: Sample of the dataset as CSV text
        language: Language the description should be written in

    Returns:
        The content of the first choice in the API reply.

    Raises:
        requests.RequestException: If the request fails, times out or the API
            answers with an HTTP error status.
    """
    url = "https://api.perplexity.ai/chat/completions"
    headers = {"Authorization": f"Bearer {PERPLEXITY_API_KEY}"}
    payload = {
        "model": "sonar",
        "messages": [
            {"role": "system", "content": "Be precise and concise, provide data output only CSV or JSON, accrording to request"},
            {"role": "user", "content": (
                f"""
I have the following CSV data:
{data}
Please provide short description in about this data in {language}. Consider this data as sample of the bigger dataset.Don't generate any code and data examples""")},
        ],
        "response_format": {
            "type": "text",
        },
    }
    # A timeout prevents hanging forever on a stalled connection, and
    # raise_for_status surfaces HTTP errors instead of a KeyError below.
    reply = requests.post(url, headers=headers, json=payload, timeout=30)
    reply.raise_for_status()
    return reply.json()["choices"][0]["message"]["content"]
74
+
75
+
76
+
77
+
78
if __name__ == "__main__":
    # Usage: <script> FIELDS [LANGUAGE]
    # LANGUAGE is optional; when omitted, get_fields_info's default is used.
    if len(sys.argv) < 2:
        sys.exit(f"Usage: {sys.argv[0]} FIELDS [LANGUAGE]")
    print(get_fields_info(sys.argv[1], *sys.argv[2:3]))