undatum 1.0.17__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- undatum/__init__.py +9 -0
- undatum/__main__.py +25 -0
- undatum/ai/__init__.py +145 -0
- undatum/ai/base.py +85 -0
- undatum/ai/config.py +184 -0
- undatum/ai/perplexity.py +79 -0
- undatum/ai/providers.py +1002 -0
- undatum/ai/schemas.py +42 -0
- undatum/cmds/__init__.py +6 -0
- undatum/cmds/analyzer.py +697 -0
- undatum/cmds/converter.py +646 -0
- undatum/cmds/ingester.py +116 -0
- undatum/cmds/query.py +68 -0
- undatum/cmds/schemer.py +328 -0
- undatum/cmds/selector.py +437 -0
- undatum/cmds/statistics.py +158 -0
- undatum/cmds/textproc.py +59 -0
- undatum/cmds/transformer.py +81 -0
- undatum/cmds/validator.py +137 -0
- undatum/common/__init__.py +6 -0
- undatum/common/functions.py +81 -0
- undatum/common/iterable.py +222 -0
- undatum/common/scheme.py +261 -0
- undatum/constants.py +21 -0
- undatum/core.py +616 -0
- undatum/formats/__init__.py +6 -0
- undatum/formats/docx.py +160 -0
- undatum/utils.py +298 -0
- undatum/validate/__init__.py +11 -0
- undatum/validate/commonrules.py +15 -0
- undatum/validate/ruscodes.py +202 -0
- undatum-1.0.17.dist-info/METADATA +610 -0
- undatum-1.0.17.dist-info/RECORD +37 -0
- undatum-1.0.17.dist-info/WHEEL +6 -0
- undatum-1.0.17.dist-info/entry_points.txt +3 -0
- undatum-1.0.17.dist-info/licenses/LICENSE +21 -0
- undatum-1.0.17.dist-info/top_level.txt +1 -0
undatum/ai/providers.py
ADDED
|
@@ -0,0 +1,1002 @@
|
|
|
1
|
+
"""AI service provider implementations."""
|
|
2
|
+
import json
|
|
3
|
+
import os
|
|
4
|
+
import re
|
|
5
|
+
import time
|
|
6
|
+
from typing import Dict, Optional, Callable
|
|
7
|
+
|
|
8
|
+
import requests
|
|
9
|
+
|
|
10
|
+
from .base import AIService, AIConfigurationError, AIAPIError
|
|
11
|
+
from .schemas import FIELD_INFO_SCHEMA, DATASET_DESCRIPTION_SCHEMA
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def retry_with_backoff(func: Callable, max_retries: int = 3, initial_delay: float = 1.0,
                       backoff_factor: float = 2.0, retry_statuses: tuple = (429, 500, 502, 503, 504)):
    """Call *func*, retrying with exponential backoff on transient HTTP failures.

    Args:
        func: Zero-argument callable; expected to raise
            requests.exceptions.RequestException on failure.
        max_retries: Maximum number of retry attempts after the first call.
        initial_delay: Seconds to wait before the first retry.
        backoff_factor: Multiplier applied to the delay after each retry.
        retry_statuses: HTTP status codes treated as transient and retryable.

    Returns:
        Whatever *func* returns on success.

    Raises:
        AIAPIError: When every attempt has failed.
    """
    wait = initial_delay
    final_err = None

    for try_no in range(max_retries + 1):
        try:
            return func()
        except requests.exceptions.RequestException as exc:
            final_err = exc
            resp = getattr(exc, 'response', None)
            code = getattr(resp, 'status_code', None)

            # Network-level failures carry no status code and are always
            # retried; any other non-retryable status propagates immediately.
            if code is not None and code not in retry_statuses:
                raise

            # No point sleeping after the final attempt.
            if try_no >= max_retries:
                break

            # Honour an explicit Retry-After header when the server sent one
            # (typical for 429 responses).
            if resp is not None:
                hinted = resp.headers.get('Retry-After')
                if hinted:
                    try:
                        wait = float(hinted)
                    except ValueError:
                        pass

            time.sleep(wait)
            wait *= backoff_factor

    # Every attempt failed: surface one descriptive error to the caller.
    resp = getattr(final_err, 'response', None)
    code = getattr(resp, 'status_code', None)
    error_msg = f"API request failed after {max_retries + 1} attempts: {str(final_err)}"

    if code == 429:
        error_msg += "\nRate limit exceeded. Please wait a moment and try again, or check your API usage limits."
    elif code in (500, 502, 503, 504):
        error_msg += "\nServer error. The API may be temporarily unavailable. Please try again later."

    raise AIAPIError(error_msg,
                     status_code=code,
                     response=getattr(resp, 'text', None))
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
class OpenAIProvider(AIService):
    """OpenAI API provider."""

    def __init__(self, api_key: Optional[str] = None, base_url: Optional[str] = None,
                 model: Optional[str] = None, timeout: int = 30):
        """Initialize OpenAI provider.

        Args:
            api_key: OpenAI API key (defaults to OPENAI_API_KEY env var)
            base_url: Base URL (defaults to https://api.openai.com/v1)
            model: Model name (defaults to gpt-4o-mini)
            timeout: Request timeout in seconds
        """
        api_key = api_key or os.getenv('OPENAI_API_KEY')
        base_url = base_url or 'https://api.openai.com/v1'
        model = model or 'gpt-4o-mini'
        super().__init__(api_key, base_url, model, timeout)
        self._validate_config()

    def get_fields_info(self, fields: list[str], language: str = 'English') -> Dict[str, str]:
        """Get field descriptions using OpenAI API.

        Args:
            fields: Field names to describe.
            language: Target language for the descriptions.

        Returns:
            Mapping of field name to description. Every requested field is
            guaranteed to be present (a placeholder is used when the model
            omitted one).

        Raises:
            AIAPIError: On request failure or an unparseable response.
        """
        fields_str = ', '.join(fields)
        url = f"{self.base_url}/chat/completions"
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json"
        }

        payload = {
            "model": self.model,
            "messages": [
                {
                    "role": "system",
                    "content": f"You are a data documentation assistant. Provide clear, concise descriptions of data fields in {language}. Always respond with valid JSON."
                },
                {
                    "role": "user",
                    "content": f"Please describe these data fields in {language}: {fields_str}. Provide a description for each field explaining what it represents. Return your response as a JSON object with a 'fields' array containing objects with 'name' and 'description' keys."
                }
            ],
            "response_format": {
                "type": "json_object"
            },
            "temperature": 0.3
        }

        def _make_request():
            response = requests.post(url, headers=headers, json=payload, timeout=self.timeout)
            response.raise_for_status()
            return response

        try:
            response = retry_with_backoff(_make_request)
            data = response.json()

            content = data["choices"][0]["message"]["content"]
            result = json.loads(content)

            # Validate and convert to expected format
            if "fields" not in result:
                raise AIAPIError("Invalid response format: missing 'fields' key")

            field_dict = {}
            for field_info in result["fields"]:
                if "name" not in field_info or "description" not in field_info:
                    continue
                field_dict[field_info["name"]] = field_info["description"]

            # Ensure all requested fields are in the result
            for field in fields:
                if field not in field_dict:
                    field_dict[field] = f"Field: {field}"

            return field_dict

        except AIAPIError:
            raise
        except requests.exceptions.RequestException as e:
            raise AIAPIError(f"OpenAI API request failed: {str(e)}",
                             status_code=getattr(e.response, 'status_code', None),
                             response=getattr(e.response, 'text', None))
        except json.JSONDecodeError as e:
            raise AIAPIError(f"Failed to parse OpenAI response: {str(e)}")

    def get_description(self, data: str, language: str = 'English') -> str:
        """Get dataset description using OpenAI API.

        Args:
            data: Raw CSV sample text; truncated to stay within token limits.
            language: Target language for the description.

        Returns:
            A short natural-language description of the dataset.

        Raises:
            AIAPIError: On request failure or an unusable response.
        """
        # Truncate data if too large (OpenAI has token limits).
        # Use a conservative limit to account for prompt overhead.
        MAX_DATA_LENGTH = 3000
        if len(data) > MAX_DATA_LENGTH:
            data = data[:MAX_DATA_LENGTH] + "\n... (truncated)"

        url = f"{self.base_url}/chat/completions"
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json"
        }

        user_content = f"""I have the following CSV data sample:
{data}
Please provide a short description of this dataset in {language}. Consider this as a sample of a larger dataset. Don't generate code or data examples.
Return your response as a JSON object with a "description" key."""

        payload = {
            "model": self.model,
            "messages": [
                {
                    "role": "system",
                    "content": f"You are a data documentation assistant. Provide concise dataset descriptions in {language}. Always respond with valid JSON."
                },
                {
                    "role": "user",
                    "content": user_content
                }
            ],
            "response_format": {
                "type": "json_object"
            },
            "temperature": 0.3
        }

        def _make_request():
            response = requests.post(url, headers=headers, json=payload, timeout=self.timeout)
            response.raise_for_status()
            return response

        try:
            response = retry_with_backoff(_make_request)
            response_data = response.json()

            content = response_data["choices"][0]["message"]["content"]
            result = json.loads(content)

            if "description" in result:
                return result["description"]
            # Fallback: return the content as-is if structure is different
            return content

        except AIAPIError:
            raise
        except requests.exceptions.RequestException as e:
            error_msg = f"OpenAI API request failed: {str(e)}"
            # Enrich the message with the structured error body when the
            # server returned one.
            if hasattr(e, 'response') and e.response is not None:
                try:
                    error_detail = e.response.json()
                    if 'error' in error_detail:
                        error_info = error_detail['error']
                        if 'message' in error_info:
                            error_msg += f"\nError details: {error_info['message']}"
                        if 'code' in error_info:
                            error_msg += f"\nError code: {error_info['code']}"
                except (ValueError, KeyError):
                    # If we can't parse the error response, include the raw text
                    error_text = getattr(e.response, 'text', None)
                    if error_text:
                        error_msg += f"\nResponse: {error_text[:500]}"
            raise AIAPIError(error_msg,
                             status_code=getattr(e.response, 'status_code', None),
                             response=getattr(e.response, 'text', None))
        except json.JSONDecodeError:
            # json.loads(content) failed: fall back to the raw message text.
            # NameError is included because response_data is unbound when
            # response.json() itself raised (older requests versions raise a
            # plain json.JSONDecodeError there), which previously crashed
            # this handler with UnboundLocalError.
            try:
                content = response_data["choices"][0]["message"]["content"]
                return content
            except (KeyError, IndexError, NameError):
                raise AIAPIError("Failed to extract description from OpenAI response")
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
class OpenRouterProvider(AIService):
    """OpenRouter API provider (OpenAI-compatible)."""

    def __init__(self, api_key: Optional[str] = None, base_url: Optional[str] = None,
                 model: Optional[str] = None, timeout: int = 30):
        """Initialize OpenRouter provider.

        Args:
            api_key: OpenRouter API key (defaults to OPENROUTER_API_KEY env var)
            base_url: Base URL (defaults to https://openrouter.ai/api/v1)
            model: Model name (defaults to openai/gpt-4o-mini)
            timeout: Request timeout in seconds
        """
        api_key = api_key or os.getenv('OPENROUTER_API_KEY')
        base_url = base_url or 'https://openrouter.ai/api/v1'
        model = model or 'openai/gpt-4o-mini'
        super().__init__(api_key, base_url, model, timeout)
        self._validate_config()

    def get_fields_info(self, fields: list[str], language: str = 'English') -> Dict[str, str]:
        """Get field descriptions using OpenRouter API.

        Args:
            fields: Field names to describe.
            language: Target language for the descriptions.

        Returns:
            Mapping of field name to description; every requested field is
            present (with a placeholder when the model omitted it).

        Raises:
            AIAPIError: On request failure or an unparseable response.
        """
        fields_str = ', '.join(fields)
        url = f"{self.base_url}/chat/completions"
        # OpenRouter recommends identifying the calling app via these headers.
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
            "HTTP-Referer": "https://github.com/datacoon/undatum",
            "X-Title": "Undatum Data Analysis"
        }

        payload = {
            "model": self.model,
            "messages": [
                {
                    "role": "system",
                    "content": f"You are a data documentation assistant. Provide clear, concise descriptions of data fields in {language}. Always respond with valid JSON."
                },
                {
                    "role": "user",
                    "content": f"Please describe these data fields in {language}: {fields_str}. Provide a description for each field explaining what it represents. Return your response as a JSON object with a 'fields' array containing objects with 'name' and 'description' keys."
                }
            ],
            "response_format": {
                "type": "json_object"
            },
            "temperature": 0.3
        }

        def _make_request():
            response = requests.post(url, headers=headers, json=payload, timeout=self.timeout)
            response.raise_for_status()
            return response

        try:
            response = retry_with_backoff(_make_request)
            data = response.json()

            content = data["choices"][0]["message"]["content"]
            result = json.loads(content)

            field_dict = {}
            if "fields" in result:
                for field_info in result["fields"]:
                    if "name" in field_info and "description" in field_info:
                        field_dict[field_info["name"]] = field_info["description"]

            # Ensure all requested fields are in the result
            for field in fields:
                if field not in field_dict:
                    field_dict[field] = f"Field: {field}"

            return field_dict

        except AIAPIError:
            raise
        except requests.exceptions.RequestException as e:
            raise AIAPIError(f"OpenRouter API request failed: {str(e)}",
                             status_code=getattr(e.response, 'status_code', None),
                             response=getattr(e.response, 'text', None))
        except json.JSONDecodeError as e:
            raise AIAPIError(f"Failed to parse OpenRouter response: {str(e)}")

    def get_description(self, data: str, language: str = 'English') -> str:
        """Get dataset description using OpenRouter API.

        Args:
            data: Raw CSV sample text; truncated to stay within token limits.
            language: Target language for the description.

        Returns:
            A short natural-language description of the dataset.

        Raises:
            AIAPIError: On request failure or an unusable response.
        """
        # Truncate BEFORE building the payload so a single prompt template is
        # used. (Previously the message was rebuilt after truncation with
        # slightly different wording, making truncated and non-truncated
        # requests inconsistent.)
        MAX_DATA_LENGTH = 5000
        if len(data) > MAX_DATA_LENGTH:
            data = data[:MAX_DATA_LENGTH] + "\n... (truncated)"

        url = f"{self.base_url}/chat/completions"
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
            "HTTP-Referer": "https://github.com/datacoon/undatum",
            "X-Title": "Undatum Data Analysis"
        }

        user_content = f"""I have the following CSV data sample:
{data}
Please provide a short description of this dataset in {language}. Consider this as a sample of a larger dataset. Don't generate code or data examples.
Return your response as a JSON object with a "description" key."""

        payload = {
            "model": self.model,
            "messages": [
                {
                    "role": "system",
                    "content": f"You are a data documentation assistant. Provide concise dataset descriptions in {language}. Always respond with valid JSON."
                },
                {
                    "role": "user",
                    "content": user_content
                }
            ],
            "response_format": {
                "type": "json_object"
            },
            "temperature": 0.3
        }

        def _make_request():
            response = requests.post(url, headers=headers, json=payload, timeout=self.timeout)
            response.raise_for_status()
            return response

        try:
            response = retry_with_backoff(_make_request)
            response_data = response.json()

            content = response_data["choices"][0]["message"]["content"]
            result = json.loads(content)

            if "description" in result:
                return result["description"]
            # Fallback: return the content as-is if structure is different.
            return content

        except AIAPIError:
            raise
        except requests.exceptions.RequestException as e:
            raise AIAPIError(f"OpenRouter API request failed: {str(e)}",
                             status_code=getattr(e.response, 'status_code', None),
                             response=getattr(e.response, 'text', None))
        except json.JSONDecodeError:
            # json.loads(content) failed: fall back to the raw message text.
            # NameError is included because response_data is unbound when
            # response.json() itself raised, which previously crashed this
            # handler with UnboundLocalError.
            try:
                content = response_data["choices"][0]["message"]["content"]
                return content
            except (KeyError, IndexError, NameError):
                raise AIAPIError("Failed to extract description from OpenRouter response")
|
|
398
|
+
|
|
399
|
+
|
|
400
|
+
class OllamaProvider(AIService):
    """Ollama local API provider."""

    def __init__(self, api_key: Optional[str] = None, base_url: Optional[str] = None,
                 model: Optional[str] = None, timeout: int = 30):
        """Initialize Ollama provider.

        Args:
            api_key: Not used for Ollama (kept for interface compatibility)
            base_url: Base URL (defaults to http://localhost:11434)
            model: Model name (defaults to llama3.2)
            timeout: Request timeout in seconds

        Raises:
            AIConfigurationError: If no model name is available.
        """
        base_url = base_url or os.getenv('OLLAMA_BASE_URL', 'http://localhost:11434')
        model = model or 'llama3.2'
        super().__init__(api_key, base_url, model, timeout)
        # Consistent with the other providers: delegate validation instead of
        # duplicating the model check inline.
        self._validate_config()

    def _validate_config(self) -> None:
        """Ollama doesn't require an API key; only a model name is mandatory."""
        if not self.model:
            raise AIConfigurationError("Model is required for OllamaProvider")

    def _parse_chat_response(self, response) -> dict:
        """Parse an /api/chat HTTP response body into a dict.

        Ollama may return multiple JSON objects (one per line, streaming
        style) even with stream=False, or wrap the object in stray text, so
        this falls back from plain JSON to line-wise parsing to a regex scan.

        Args:
            response: A requests.Response from the /api/chat endpoint.

        Returns:
            The parsed chat-response dict.

        Raises:
            AIAPIError: If no JSON object can be recovered from the body.
        """
        response_text = response.text.strip()
        try:
            return response.json()
        except (ValueError, json.JSONDecodeError) as e:
            if not response_text:
                raise AIAPIError(f"Empty response from Ollama: {str(e)}")

            # Streaming format has one JSON object per line; scan from the
            # end for the last complete chat object.
            for line in reversed(response_text.split('\n')):
                line = line.strip()
                if not line:
                    continue
                try:
                    parsed = json.loads(line)
                except json.JSONDecodeError:
                    continue
                if isinstance(parsed, dict) and ('message' in parsed or 'content' in parsed or 'response' in parsed):
                    return parsed

            # Last resort: extract the first brace-delimited span.
            json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
            if json_match:
                try:
                    return json.loads(json_match.group(0))
                except json.JSONDecodeError:
                    pass

            raise AIAPIError(f"Failed to parse Ollama response: {str(e)}. Response: {response_text[:500]}")

    @staticmethod
    def _extract_json_text(content: str) -> str:
        """Strip markdown fences / surrounding prose from model output.

        Returns the innermost JSON-looking span, or the input unchanged when
        nothing brace-delimited is found.
        """
        # Prefer an explicit ```json fenced block when present.
        json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', content, re.DOTALL)
        if json_match:
            return json_match.group(1)
        # Otherwise take the widest brace-delimited span.
        json_match = re.search(r'\{.*\}', content, re.DOTALL)
        if json_match:
            return json_match.group(0)
        return content

    def get_fields_info(self, fields: list[str], language: str = 'English') -> Dict[str, str]:
        """Get field descriptions using Ollama API.

        Args:
            fields: Field names to describe.
            language: Target language for the descriptions.

        Returns:
            Mapping of field name to description; every requested field is
            present (with a placeholder when the model omitted it).

        Raises:
            AIAPIError: On request failure or an unparseable response.
        """
        fields_str = ', '.join(fields)
        url = f"{self.base_url}/api/chat"

        payload = {
            "model": self.model,
            "messages": [
                {
                    "role": "system",
                    "content": f"You are a data documentation assistant. Provide clear, concise descriptions of data fields in {language}. Always respond with valid JSON only."
                },
                {
                    "role": "user",
                    "content": f"""Please describe these data fields in {language}: {fields_str}.
Return a JSON object with a "fields" array. Each item should have "name" and "description" keys.
Example format: {{"fields": [{{"name": "field1", "description": "..."}}, {{"name": "field2", "description": "..."}}]}}"""
                }
            ],
            "format": "json",
            "stream": False,  # Explicitly disable streaming
            "options": {
                "temperature": 0.3
            }
        }

        content = ""
        try:
            response = requests.post(url, json=payload, timeout=self.timeout)
            response.raise_for_status()

            data = self._parse_chat_response(response)
            content = self._extract_json_text(data.get("message", {}).get("content", ""))
            result = json.loads(content)

            field_dict = {}
            if "fields" in result:
                for field_info in result["fields"]:
                    if "name" in field_info and "description" in field_info:
                        field_dict[field_info["name"]] = field_info["description"]

            # Ensure all requested fields are in the result
            for field in fields:
                if field not in field_dict:
                    field_dict[field] = f"Field: {field}"

            return field_dict

        except AIAPIError:
            raise
        except requests.exceptions.RequestException as e:
            raise AIAPIError(f"Ollama API request failed: {str(e)}",
                             status_code=getattr(e.response, 'status_code', None),
                             response=getattr(e.response, 'text', None))
        except json.JSONDecodeError as e:
            raise AIAPIError(f"Failed to parse Ollama response: {str(e)}. Content: {content[:200]}")

    def get_description(self, data: str, language: str = 'English') -> str:
        """Get dataset description using Ollama API.

        Args:
            data: Raw CSV sample text.
            language: Target language for the description.

        Returns:
            A short natural-language description. If the model answers in
            plain text rather than JSON, the raw text is returned as-is.

        Raises:
            AIAPIError: On request failure or an unusable response.
        """
        url = f"{self.base_url}/api/chat"

        payload = {
            "model": self.model,
            "messages": [
                {
                    "role": "system",
                    "content": f"You are a data documentation assistant. Provide concise dataset descriptions in {language}. Always respond with valid JSON only."
                },
                {
                    "role": "user",
                    "content": f"""I have the following CSV data sample:
{data}
Please provide a short description of this dataset in {language}. Consider this as a sample of a larger dataset. Don't generate code or data examples.
Return JSON with format: {{"description": "..."}}"""
                }
            ],
            "format": "json",
            "stream": False,  # Explicitly disable streaming
            "options": {
                "temperature": 0.3
            }
        }

        try:
            response = requests.post(url, json=payload, timeout=self.timeout)
            response.raise_for_status()

            parsed = self._parse_chat_response(response)
            content = self._extract_json_text(parsed.get("message", {}).get("content", ""))

            try:
                result = json.loads(content)
            except json.JSONDecodeError as json_err:
                # The model returned plain text instead of JSON: fall back to
                # the raw content rather than failing.
                if content:
                    return content
                raise AIAPIError(f"Failed to parse Ollama JSON response: {str(json_err)}. Content: {content[:200]}")

            if "description" in result:
                return result["description"]
            # If JSON is valid but doesn't have description, return content as-is
            return content

        except AIAPIError:
            raise
        except requests.exceptions.RequestException as e:
            raise AIAPIError(f"Ollama API request failed: {str(e)}",
                             status_code=getattr(e.response, 'status_code', None),
                             response=getattr(e.response, 'text', None))
|
|
642
|
+
|
|
643
|
+
|
|
644
|
+
class LMStudioProvider(AIService):
|
|
645
|
+
"""LM Studio local API provider (OpenAI-compatible)."""
|
|
646
|
+
|
|
647
|
+
    def __init__(self, api_key: Optional[str] = None, base_url: Optional[str] = None,
                 model: Optional[str] = None, timeout: int = 30):
        """Initialize LM Studio provider.

        Args:
            api_key: Not used for LM Studio (kept for interface compatibility, can be "lm-studio")
            base_url: Base URL (defaults to http://localhost:1234/v1)
            model: Model name (REQUIRED - must match a model loaded in LM Studio)
            timeout: Request timeout in seconds

        Raises:
            AIConfigurationError: If no model name was supplied (raised by
                _validate_config below).
        """
        # LMSTUDIO_BASE_URL env var overrides the default local endpoint.
        base_url = base_url or os.getenv('LMSTUDIO_BASE_URL', 'http://localhost:1234/v1')
        super().__init__(api_key, base_url, model, timeout)
        self._validate_config()
|
|
660
|
+
|
|
661
|
+
    def _validate_config(self) -> None:
        """Validate LM Studio configuration.

        Only the model name is checked: LM Studio runs locally and does not
        require an API key, but it cannot pick a model on the caller's behalf.

        Raises:
            AIConfigurationError: If self.model is empty or None.
        """
        if not self.model:
            raise AIConfigurationError(
                "Model name is required for LM Studio. "
                "Please specify a model name that matches a model loaded in LM Studio. "
                "Example: --ai-model 'your-model-name'"
            )
|
|
669
|
+
|
|
670
|
+
    def _get_available_models(self) -> list[str]:
        """Get list of available models from LM Studio.

        Queries the OpenAI-compatible GET /models endpoint.

        Returns:
            Model id strings, or an empty list when the endpoint is
            unreachable or the response does not have the expected shape.
        """
        try:
            models_url = f"{self.base_url}/models"
            response = requests.get(models_url, timeout=self.timeout)
            if response.status_code == 200:
                data = response.json()
                # OpenAI-compatible shape: {"data": [{"id": ...}, ...]}
                if isinstance(data, dict) and 'data' in data:
                    return [model.get('id', '') for model in data['data'] if 'id' in model]
        except Exception:
            # Deliberate best-effort: model listing is advisory only, so any
            # failure (connection, timeout, bad JSON) degrades to "unknown".
            pass
        return []
|
|
682
|
+
|
|
683
|
+
def get_fields_info(self, fields: list[str], language: str = 'English') -> Dict[str, str]:
|
|
684
|
+
"""Get field descriptions using LM Studio API."""
|
|
685
|
+
fields_str = ', '.join(fields)
|
|
686
|
+
url = f"{self.base_url}/chat/completions"
|
|
687
|
+
headers = {
|
|
688
|
+
"Content-Type": "application/json"
|
|
689
|
+
}
|
|
690
|
+
|
|
691
|
+
payload = {
|
|
692
|
+
"model": self.model,
|
|
693
|
+
"messages": [
|
|
694
|
+
{
|
|
695
|
+
"role": "system",
|
|
696
|
+
"content": f"You are a data documentation assistant. Provide clear, concise descriptions of data fields in {language}. Always respond with valid JSON only, no markdown, no code blocks."
|
|
697
|
+
},
|
|
698
|
+
{
|
|
699
|
+
"role": "user",
|
|
700
|
+
"content": f"""Please describe these data fields in {language}: {fields_str}.
|
|
701
|
+
Return ONLY a JSON object with a "fields" array. Each item must have "name" and "description" keys.
|
|
702
|
+
Format: {{"fields": [{{"name": "field1", "description": "..."}}, {{"name": "field2", "description": "..."}}]}}
|
|
703
|
+
Return only the JSON, nothing else."""
|
|
704
|
+
}
|
|
705
|
+
],
|
|
706
|
+
"temperature": 0.3
|
|
707
|
+
}
|
|
708
|
+
|
|
709
|
+
# Try with json_object format first (some models support it)
|
|
710
|
+
# If that fails, fall back to text parsing
|
|
711
|
+
try:
|
|
712
|
+
payload["response_format"] = {"type": "json_object"}
|
|
713
|
+
response = requests.post(url, headers=headers, json=payload, timeout=self.timeout)
|
|
714
|
+
response.raise_for_status()
|
|
715
|
+
except requests.exceptions.RequestException:
|
|
716
|
+
# Remove response_format if not supported
|
|
717
|
+
payload.pop("response_format", None)
|
|
718
|
+
response = requests.post(url, headers=headers, json=payload, timeout=self.timeout)
|
|
719
|
+
response.raise_for_status()
|
|
720
|
+
|
|
721
|
+
try:
|
|
722
|
+
data = response.json()
|
|
723
|
+
content = data["choices"][0]["message"]["content"].strip()
|
|
724
|
+
|
|
725
|
+
# Try to extract JSON from markdown code blocks if present
|
|
726
|
+
json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', content, re.DOTALL)
|
|
727
|
+
if json_match:
|
|
728
|
+
content = json_match.group(1)
|
|
729
|
+
else:
|
|
730
|
+
# Try to find JSON object in the text
|
|
731
|
+
json_match = re.search(r'\{.*\}', content, re.DOTALL)
|
|
732
|
+
if json_match:
|
|
733
|
+
content = json_match.group(0)
|
|
734
|
+
|
|
735
|
+
result = json.loads(content)
|
|
736
|
+
|
|
737
|
+
field_dict = {}
|
|
738
|
+
if "fields" in result:
|
|
739
|
+
for field_info in result["fields"]:
|
|
740
|
+
if "name" in field_info and "description" in field_info:
|
|
741
|
+
field_dict[field_info["name"]] = field_info["description"]
|
|
742
|
+
|
|
743
|
+
# Ensure all requested fields are in the result
|
|
744
|
+
for field in fields:
|
|
745
|
+
if field not in field_dict:
|
|
746
|
+
field_dict[field] = f"Field: {field}"
|
|
747
|
+
|
|
748
|
+
return field_dict
|
|
749
|
+
|
|
750
|
+
except requests.exceptions.RequestException as e:
|
|
751
|
+
available_models = self._get_available_models()
|
|
752
|
+
error_msg = f"LM Studio API request failed: {str(e)}"
|
|
753
|
+
if available_models:
|
|
754
|
+
error_msg += f"\nAvailable models: {', '.join(available_models)}"
|
|
755
|
+
elif e.response and e.response.status_code == 404:
|
|
756
|
+
error_msg += "\nMake sure LM Studio server is running and a model is loaded."
|
|
757
|
+
raise AIAPIError(error_msg,
|
|
758
|
+
status_code=getattr(e.response, 'status_code', None),
|
|
759
|
+
response=getattr(e.response, 'text', None))
|
|
760
|
+
except json.JSONDecodeError as e:
|
|
761
|
+
raise AIAPIError(f"Failed to parse LM Studio response: {str(e)}. Response content: {content[:200]}")
|
|
762
|
+
|
|
763
|
+
def get_description(self, data: str, language: str = 'English') -> str:
|
|
764
|
+
"""Get dataset description using LM Studio API."""
|
|
765
|
+
# Truncate data if too large
|
|
766
|
+
MAX_DATA_LENGTH = 5000
|
|
767
|
+
if len(data) > MAX_DATA_LENGTH:
|
|
768
|
+
data = data[:MAX_DATA_LENGTH] + "\n... (truncated)"
|
|
769
|
+
|
|
770
|
+
url = f"{self.base_url}/chat/completions"
|
|
771
|
+
headers = {
|
|
772
|
+
"Content-Type": "application/json"
|
|
773
|
+
}
|
|
774
|
+
|
|
775
|
+
payload = {
|
|
776
|
+
"model": self.model,
|
|
777
|
+
"messages": [
|
|
778
|
+
{
|
|
779
|
+
"role": "system",
|
|
780
|
+
"content": f"You are a data documentation assistant. Provide concise dataset descriptions in {language}. Always respond with valid JSON only, no markdown, no code blocks."
|
|
781
|
+
},
|
|
782
|
+
{
|
|
783
|
+
"role": "user",
|
|
784
|
+
"content": f"""I have the following CSV data sample:
|
|
785
|
+
{data}
|
|
786
|
+
Please provide a short description of this dataset in {language}. Consider this as a sample of a larger dataset. Don't generate code or data examples.
|
|
787
|
+
Return ONLY a JSON object with format: {{"description": "..."}}
|
|
788
|
+
Return only the JSON, nothing else."""
|
|
789
|
+
}
|
|
790
|
+
],
|
|
791
|
+
"temperature": 0.3
|
|
792
|
+
}
|
|
793
|
+
|
|
794
|
+
# Try with json_object format first (some models support it)
|
|
795
|
+
# If that fails, fall back to text parsing
|
|
796
|
+
try:
|
|
797
|
+
payload["response_format"] = {"type": "json_object"}
|
|
798
|
+
response = requests.post(url, headers=headers, json=payload, timeout=self.timeout)
|
|
799
|
+
response.raise_for_status()
|
|
800
|
+
except requests.exceptions.RequestException:
|
|
801
|
+
# Remove response_format if not supported
|
|
802
|
+
payload.pop("response_format", None)
|
|
803
|
+
response = requests.post(url, headers=headers, json=payload, timeout=self.timeout)
|
|
804
|
+
response.raise_for_status()
|
|
805
|
+
|
|
806
|
+
try:
|
|
807
|
+
response_data = response.json()
|
|
808
|
+
content = response_data["choices"][0]["message"]["content"].strip()
|
|
809
|
+
|
|
810
|
+
# Try to extract JSON from markdown code blocks if present
|
|
811
|
+
json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', content, re.DOTALL)
|
|
812
|
+
if json_match:
|
|
813
|
+
content = json_match.group(1)
|
|
814
|
+
else:
|
|
815
|
+
# Try to find JSON object in the text
|
|
816
|
+
json_match = re.search(r'\{.*\}', content, re.DOTALL)
|
|
817
|
+
if json_match:
|
|
818
|
+
content = json_match.group(0)
|
|
819
|
+
|
|
820
|
+
# Try to parse as JSON first
|
|
821
|
+
try:
|
|
822
|
+
result = json.loads(content)
|
|
823
|
+
if "description" in result:
|
|
824
|
+
return result["description"]
|
|
825
|
+
except json.JSONDecodeError:
|
|
826
|
+
pass
|
|
827
|
+
|
|
828
|
+
# If JSON parsing fails, return the content as-is (might be plain text description)
|
|
829
|
+
return content
|
|
830
|
+
|
|
831
|
+
except requests.exceptions.RequestException as e:
|
|
832
|
+
available_models = self._get_available_models()
|
|
833
|
+
error_msg = f"LM Studio API request failed: {str(e)}"
|
|
834
|
+
if available_models:
|
|
835
|
+
error_msg += f"\nAvailable models: {', '.join(available_models)}"
|
|
836
|
+
elif e.response and e.response.status_code == 404:
|
|
837
|
+
error_msg += "\nMake sure LM Studio server is running and a model is loaded."
|
|
838
|
+
raise AIAPIError(error_msg,
|
|
839
|
+
status_code=getattr(e.response, 'status_code', None),
|
|
840
|
+
response=getattr(e.response, 'text', None))
|
|
841
|
+
except (KeyError, IndexError) as e:
|
|
842
|
+
raise AIAPIError(f"Failed to extract description from LM Studio response: {str(e)}")
|
|
843
|
+
|
|
844
|
+
|
|
845
|
+
class PerplexityProvider(AIService):
    """Perplexity API provider with structured output."""

    def __init__(self, api_key: Optional[str] = None, base_url: Optional[str] = None,
                 model: Optional[str] = None, timeout: int = 30):
        """Initialize Perplexity provider.

        Args:
            api_key: Perplexity API key (defaults to PERPLEXITY_API_KEY env var)
            base_url: Base URL (defaults to https://api.perplexity.ai)
            model: Model name (defaults to sonar)
            timeout: Request timeout in seconds
        """
        # Always prioritize PERPLEXITY_API_KEY environment variable
        perplexity_key = os.getenv('PERPLEXITY_API_KEY')
        if perplexity_key:
            # If PERPLEXITY_API_KEY is set, always use it (ignore passed api_key)
            api_key = perplexity_key
        # If PERPLEXITY_API_KEY is not set, use the passed api_key (which may be None)

        base_url = base_url or 'https://api.perplexity.ai'
        model = model or 'sonar'
        super().__init__(api_key, base_url, model, timeout)
        self._validate_config()

    def get_fields_info(self, fields: list[str], language: str = 'English') -> Dict[str, str]:
        """Get field descriptions using Perplexity API.

        Args:
            fields: Field names to describe.
            language: Language the descriptions should be written in.

        Returns:
            Mapping of field name to description; every requested field is
            present (fields the model skipped get a "Field: <name>" stub).

        Raises:
            AIAPIError: If the request fails or the response cannot be parsed.
        """
        fields_str = ', '.join(fields)
        url = f"{self.base_url}/chat/completions"
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json"
        }

        payload = {
            "model": self.model,
            "messages": [
                {
                    "role": "system",
                    "content": f"You are a data documentation assistant. Provide clear, concise descriptions of data fields in {language}. Always respond with valid JSON only, no markdown, no code blocks."
                },
                {
                    "role": "user",
                    "content": f"""Please describe these data fields in {language}: {fields_str}.
Return ONLY a JSON object with a "fields" array. Each item must have "name" and "description" keys.
Format: {{"fields": [{{"name": "field1", "description": "..."}}, {{"name": "field2", "description": "..."}}]}}
Return only the JSON, nothing else."""
                }
            ],
            "temperature": 0.3
        }

        # BUG FIX: pre-bind so the JSONDecodeError handler below can safely
        # show a snippet even when decoding fails before ``content`` is
        # assigned (previously that path raised NameError instead).
        content = ""
        try:
            response = requests.post(url, headers=headers, json=payload, timeout=self.timeout)
            response.raise_for_status()
            data = response.json()

            content = data["choices"][0]["message"]["content"].strip()

            # Try to extract JSON from markdown code blocks if present
            json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', content, re.DOTALL)
            if json_match:
                content = json_match.group(1)
            else:
                # Try to find JSON object in the text
                json_match = re.search(r'\{.*\}', content, re.DOTALL)
                if json_match:
                    content = json_match.group(0)

            result = json.loads(content)

            field_dict = {}
            if "fields" in result:
                for field_info in result["fields"]:
                    if "name" in field_info and "description" in field_info:
                        field_dict[field_info["name"]] = field_info["description"]

            # Ensure all requested fields are in the result
            for field in fields:
                if field not in field_dict:
                    field_dict[field] = f"Field: {field}"

            return field_dict

        except requests.exceptions.RequestException as e:
            raise AIAPIError(f"Perplexity API request failed: {str(e)}",
                             status_code=getattr(e.response, 'status_code', None),
                             response=getattr(e.response, 'text', None))
        except json.JSONDecodeError as e:
            raise AIAPIError(f"Failed to parse Perplexity response: {str(e)}. Response content: {content[:200]}")
        except (KeyError, IndexError) as e:
            # Consistent with get_description: a malformed envelope (missing
            # "choices"/"message") becomes an AIAPIError, not a raw KeyError.
            raise AIAPIError(f"Failed to extract fields from Perplexity response: {str(e)}")

    def get_description(self, data: str, language: str = 'English') -> str:
        """Get dataset description using Perplexity API.

        Args:
            data: CSV sample of the dataset (truncated to 5000 characters).
            language: Language the description should be written in.

        Returns:
            A short description string; falls back to the raw model output
            when the model did not return the requested JSON envelope.

        Raises:
            AIAPIError: If the request fails or the response cannot be parsed.
        """
        # Truncate data if too large (Perplexity has token limits)
        # Keep first ~5000 characters to ensure we stay within limits
        MAX_DATA_LENGTH = 5000
        if len(data) > MAX_DATA_LENGTH:
            data = data[:MAX_DATA_LENGTH] + "\n... (truncated)"

        url = f"{self.base_url}/chat/completions"
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json"
        }

        payload = {
            "model": self.model,
            "messages": [
                {
                    "role": "system",
                    "content": f"You are a data documentation assistant. Provide concise dataset descriptions in {language}. Always respond with valid JSON only, no markdown, no code blocks."
                },
                {
                    "role": "user",
                    "content": f"""I have the following CSV data sample:
{data}
Please provide a short description of this dataset in {language}. Consider this as a sample of a larger dataset. Don't generate code or data examples.
Return ONLY a JSON object with format: {{"description": "..."}}
Return only the JSON, nothing else."""
                }
            ],
            "temperature": 0.3
        }

        try:
            response = requests.post(url, headers=headers, json=payload, timeout=self.timeout)
            response.raise_for_status()
            response_data = response.json()

            content = response_data["choices"][0]["message"]["content"].strip()

            # Try to extract JSON from markdown code blocks if present
            json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', content, re.DOTALL)
            if json_match:
                content = json_match.group(1)
            else:
                # Try to find JSON object in the text
                json_match = re.search(r'\{.*\}', content, re.DOTALL)
                if json_match:
                    content = json_match.group(0)

            # Try to parse as JSON first
            try:
                result = json.loads(content)
                if "description" in result:
                    return result["description"]
            except json.JSONDecodeError:
                pass

            # If JSON parsing fails, return the content as-is (might be plain text description)
            return content

        except requests.exceptions.RequestException as e:
            raise AIAPIError(f"Perplexity API request failed: {str(e)}",
                             status_code=getattr(e.response, 'status_code', None),
                             response=getattr(e.response, 'text', None))
        except (KeyError, IndexError) as e:
            raise AIAPIError(f"Failed to extract description from Perplexity response: {str(e)}")
|