undatum 1.0.17__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1002 @@
1
+ """AI service provider implementations."""
2
+ import json
3
+ import os
4
+ import re
5
+ import time
6
+ from typing import Dict, Optional, Callable
7
+
8
+ import requests
9
+
10
+ from .base import AIService, AIConfigurationError, AIAPIError
11
+ from .schemas import FIELD_INFO_SCHEMA, DATASET_DESCRIPTION_SCHEMA
12
+
13
+
14
def retry_with_backoff(func: Callable, max_retries: int = 3, initial_delay: float = 1.0,
                       backoff_factor: float = 2.0, retry_statuses: tuple = (429, 500, 502, 503, 504)):
    """Call ``func`` and retry it with exponential backoff on transient failures.

    A failure is retried when it is a ``requests`` exception that either carries
    no HTTP response (network-level error) or carries a status code listed in
    ``retry_statuses``. Any other ``requests`` exception is re-raised unchanged.

    Args:
        func: Zero-argument callable to invoke (should raise
            requests.exceptions.RequestException on failure)
        max_retries: Maximum number of retry attempts; negative values are
            treated as 0 so that ``func`` is always called at least once
        initial_delay: Initial delay in seconds before first retry
        backoff_factor: Factor to multiply delay by for each retry
        retry_statuses: HTTP status codes that should trigger a retry

    Returns:
        Result of the function call

    Raises:
        AIAPIError: If all retries are exhausted
        requests.exceptions.RequestException: If the failure carries a status
            code that is not in ``retry_statuses``
    """
    max_retries = max(max_retries, 0)
    delay = initial_delay
    last_exception = None

    for attempt in range(max_retries + 1):
        try:
            return func()
        except requests.exceptions.RequestException as e:
            last_exception = e
            # Compare against None explicitly: a requests Response is falsy for 4xx/5xx.
            response = getattr(e, 'response', None)
            status_code = getattr(response, 'status_code', None)

            # Only retry on specific status codes or if status_code is None (network error)
            if status_code is not None and status_code not in retry_statuses:
                raise

            # Don't sleep after the last attempt
            if attempt >= max_retries:
                break

            # A numeric Retry-After header overrides the wait for THIS retry only.
            # It must not replace ``delay`` itself, otherwise "Retry-After: 0"
            # would zero out every subsequent backoff step.
            wait = delay
            if response is not None:
                retry_after = response.headers.get('Retry-After')
                if retry_after:
                    try:
                        hinted = float(retry_after)
                    except ValueError:
                        # HTTP-date form (or garbage): fall back to our own backoff
                        hinted = None
                    # Rejects negatives, inf and nan, all of which time.sleep refuses
                    if hinted is not None and 0 <= hinted < float('inf'):
                        wait = hinted

            time.sleep(wait)
            delay *= backoff_factor

    # All retries exhausted: convert the last failure into an AIAPIError
    response = getattr(last_exception, 'response', None)
    status_code = getattr(response, 'status_code', None)
    error_msg = f"API request failed after {max_retries + 1} attempts: {str(last_exception)}"

    if status_code == 429:
        error_msg += "\nRate limit exceeded. Please wait a moment and try again, or check your API usage limits."
    elif status_code in (500, 502, 503, 504):
        error_msg += "\nServer error. The API may be temporarily unavailable. Please try again later."

    raise AIAPIError(error_msg,
                     status_code=status_code,
                     response=getattr(response, 'text', None)) from last_exception
75
+
76
+
77
class OpenAIProvider(AIService):
    """OpenAI API provider.

    Sends two-message (system + user) requests to ``{base_url}/chat/completions``
    in JSON mode and converts every transport or payload problem into AIAPIError.
    """

    def __init__(self, api_key: Optional[str] = None, base_url: Optional[str] = None,
                 model: Optional[str] = None, timeout: int = 30):
        """Initialize OpenAI provider.

        Args:
            api_key: OpenAI API key (defaults to OPENAI_API_KEY env var)
            base_url: Base URL (defaults to https://api.openai.com/v1)
            model: Model name (defaults to gpt-4o-mini)
            timeout: Request timeout in seconds
        """
        api_key = api_key or os.getenv('OPENAI_API_KEY')
        base_url = base_url or 'https://api.openai.com/v1'
        model = model or 'gpt-4o-mini'
        super().__init__(api_key, base_url, model, timeout)
        self._validate_config()

    @staticmethod
    def _wrap_request_error(e):
        """Build an AIAPIError from a requests exception, adding API error details when present."""
        error_msg = f"OpenAI API request failed: {str(e)}"
        # Compare against None explicitly: a requests Response is falsy for 4xx/5xx.
        response = getattr(e, 'response', None)
        if response is not None:
            try:
                error_detail = response.json()
            except ValueError:
                # Body is not JSON: include the raw text instead
                error_text = getattr(response, 'text', None)
                if error_text:
                    error_msg += f"\nResponse: {error_text[:500]}"
            else:
                error_info = error_detail.get('error') if isinstance(error_detail, dict) else None
                if isinstance(error_info, dict):
                    if 'message' in error_info:
                        error_msg += f"\nError details: {error_info['message']}"
                    if 'code' in error_info:
                        error_msg += f"\nError code: {error_info['code']}"
        return AIAPIError(error_msg,
                          status_code=getattr(response, 'status_code', None),
                          response=getattr(response, 'text', None))

    def _complete_chat(self, system_prompt: str, user_prompt: str, purpose: str) -> str:
        """Run one chat completion and return the assistant message text.

        Args:
            system_prompt: Content of the system message
            user_prompt: Content of the user message
            purpose: Short noun phrase used in error messages (e.g. "description")

        Returns:
            The ``choices[0].message.content`` string of the response

        Raises:
            AIAPIError: On request failure, non-JSON body, or unexpected payload shape
        """
        url = f"{self.base_url}/chat/completions"
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json"
        }
        payload = {
            "model": self.model,
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            "response_format": {
                "type": "json_object"
            },
            "temperature": 0.3
        }

        def _make_request():
            response = requests.post(url, headers=headers, json=payload, timeout=self.timeout)
            response.raise_for_status()
            return response

        try:
            response = retry_with_backoff(_make_request)
        except AIAPIError:
            raise
        except requests.exceptions.RequestException as e:
            raise self._wrap_request_error(e) from e

        try:
            content = response.json()["choices"][0]["message"]["content"]
        except ValueError as e:
            # Covers json.JSONDecodeError and requests' own JSONDecodeError
            raise AIAPIError(f"Failed to parse OpenAI response: {str(e)}") from e
        except (KeyError, IndexError, TypeError) as e:
            raise AIAPIError(f"Failed to extract {purpose} from OpenAI response") from e
        if not isinstance(content, str):
            # e.g. a null content; json.loads would otherwise fail with TypeError
            raise AIAPIError(f"Failed to extract {purpose} from OpenAI response")
        return content

    @staticmethod
    def _collect_field_descriptions(entries, fields: list[str]) -> Dict[str, str]:
        """Map field names to descriptions, filling a placeholder for every field the model omitted."""
        field_dict = {}
        if isinstance(entries, list):
            for field_info in entries:
                # Skip anything that is not a {"name": ..., "description": ...} object
                if isinstance(field_info, dict) and "name" in field_info and "description" in field_info:
                    field_dict[field_info["name"]] = field_info["description"]

        # Ensure all requested fields are in the result
        for field in fields:
            if field not in field_dict:
                field_dict[field] = f"Field: {field}"
        return field_dict

    def get_fields_info(self, fields: list[str], language: str = 'English') -> Dict[str, str]:
        """Get field descriptions using OpenAI API.

        Args:
            fields: Names of the data fields to describe
            language: Language the descriptions should be written in

        Returns:
            Mapping of field name to description; requested fields missing from
            the model output get the placeholder ``"Field: <name>"``

        Raises:
            AIAPIError: On request failure or when the response is not a JSON
                object containing a 'fields' key
        """
        fields_str = ', '.join(fields)
        content = self._complete_chat(
            f"You are a data documentation assistant. Provide clear, concise descriptions of data fields in {language}. Always respond with valid JSON.",
            f"Please describe these data fields in {language}: {fields_str}. Provide a description for each field explaining what it represents. Return your response as a JSON object with a 'fields' array containing objects with 'name' and 'description' keys.",
            "field descriptions",
        )

        try:
            result = json.loads(content)
        except json.JSONDecodeError as e:
            raise AIAPIError(f"Failed to parse OpenAI response: {str(e)}") from e

        if not isinstance(result, dict) or "fields" not in result:
            raise AIAPIError("Invalid response format: missing 'fields' key")

        return self._collect_field_descriptions(result["fields"], fields)

    def get_description(self, data: str, language: str = 'English') -> str:
        """Get dataset description using OpenAI API.

        Args:
            data: CSV sample of the dataset (truncated to 3000 characters)
            language: Language the description should be written in

        Returns:
            The 'description' value of the model's JSON answer, or the raw
            answer text when it is not JSON or has no 'description' key

        Raises:
            AIAPIError: On request failure or unusable response payload
        """
        # Truncate data if too large (OpenAI has token limits)
        # Use a more conservative limit to account for prompt overhead
        MAX_DATA_LENGTH = 3000
        if len(data) > MAX_DATA_LENGTH:
            data = data[:MAX_DATA_LENGTH] + "\n... (truncated)"

        user_content = (
            "I have the following CSV data sample:\n"
            f"{data}\n"
            f"Please provide a short description of this dataset in {language}. "
            "Consider this as a sample of a larger dataset. "
            "Don't generate code or data examples.\n"
            'Return your response as a JSON object with a "description" key.'
        )
        content = self._complete_chat(
            f"You are a data documentation assistant. Provide concise dataset descriptions in {language}. Always respond with valid JSON.",
            user_content,
            "description",
        )

        try:
            result = json.loads(content)
        except json.JSONDecodeError:
            # Model answered with plain text: use it as the description
            return content

        if isinstance(result, dict) and "description" in result:
            return result["description"]
        # Fallback: return the content as-is if structure is different
        return content
244
+
245
+
246
class OpenRouterProvider(AIService):
    """OpenRouter API provider (OpenAI-compatible).

    Sends two-message (system + user) requests to ``{base_url}/chat/completions``
    in JSON mode and converts every transport or payload problem into AIAPIError.
    """

    def __init__(self, api_key: Optional[str] = None, base_url: Optional[str] = None,
                 model: Optional[str] = None, timeout: int = 30):
        """Initialize OpenRouter provider.

        Args:
            api_key: OpenRouter API key (defaults to OPENROUTER_API_KEY env var)
            base_url: Base URL (defaults to https://openrouter.ai/api/v1)
            model: Model name (defaults to openai/gpt-4o-mini)
            timeout: Request timeout in seconds
        """
        api_key = api_key or os.getenv('OPENROUTER_API_KEY')
        base_url = base_url or 'https://openrouter.ai/api/v1'
        model = model or 'openai/gpt-4o-mini'
        super().__init__(api_key, base_url, model, timeout)
        self._validate_config()

    def _complete_chat(self, system_prompt: str, user_prompt: str, purpose: str) -> str:
        """Run one chat completion and return the assistant message text.

        Args:
            system_prompt: Content of the system message
            user_prompt: Content of the user message
            purpose: Short noun phrase used in error messages (e.g. "description")

        Returns:
            The ``choices[0].message.content`` string of the response

        Raises:
            AIAPIError: On request failure, non-JSON body, or unexpected payload shape
        """
        url = f"{self.base_url}/chat/completions"
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
            "HTTP-Referer": "https://github.com/datacoon/undatum",
            "X-Title": "Undatum Data Analysis"
        }
        payload = {
            "model": self.model,
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            "response_format": {
                "type": "json_object"
            },
            "temperature": 0.3
        }

        def _make_request():
            response = requests.post(url, headers=headers, json=payload, timeout=self.timeout)
            response.raise_for_status()
            return response

        try:
            response = retry_with_backoff(_make_request)
        except AIAPIError:
            raise
        except requests.exceptions.RequestException as e:
            # Compare against None explicitly: a requests Response is falsy for 4xx/5xx.
            failed = getattr(e, 'response', None)
            raise AIAPIError(f"OpenRouter API request failed: {str(e)}",
                             status_code=getattr(failed, 'status_code', None),
                             response=getattr(failed, 'text', None)) from e

        try:
            content = response.json()["choices"][0]["message"]["content"]
        except ValueError as e:
            # Covers json.JSONDecodeError and requests' own JSONDecodeError
            raise AIAPIError(f"Failed to parse OpenRouter response: {str(e)}") from e
        except (KeyError, IndexError, TypeError) as e:
            raise AIAPIError(f"Failed to extract {purpose} from OpenRouter response") from e
        if not isinstance(content, str):
            # e.g. a null content; json.loads would otherwise fail with TypeError
            raise AIAPIError(f"Failed to extract {purpose} from OpenRouter response")
        return content

    def get_fields_info(self, fields: list[str], language: str = 'English') -> Dict[str, str]:
        """Get field descriptions using OpenRouter API.

        Args:
            fields: Names of the data fields to describe
            language: Language the descriptions should be written in

        Returns:
            Mapping of field name to description; requested fields missing from
            the model output get the placeholder ``"Field: <name>"``

        Raises:
            AIAPIError: On request failure or when the answer is not valid JSON
        """
        fields_str = ', '.join(fields)
        content = self._complete_chat(
            f"You are a data documentation assistant. Provide clear, concise descriptions of data fields in {language}. Always respond with valid JSON.",
            f"Please describe these data fields in {language}: {fields_str}. Provide a description for each field explaining what it represents. Return your response as a JSON object with a 'fields' array containing objects with 'name' and 'description' keys.",
            "field descriptions",
        )

        try:
            result = json.loads(content)
        except json.JSONDecodeError as e:
            raise AIAPIError(f"Failed to parse OpenRouter response: {str(e)}") from e

        # A missing or malformed 'fields' value is tolerated: placeholders fill the gaps
        entries = result.get("fields") if isinstance(result, dict) else None
        field_dict = {}
        if isinstance(entries, list):
            for field_info in entries:
                if isinstance(field_info, dict) and "name" in field_info and "description" in field_info:
                    field_dict[field_info["name"]] = field_info["description"]

        # Ensure all requested fields are in the result
        for field in fields:
            if field not in field_dict:
                field_dict[field] = f"Field: {field}"

        return field_dict

    def get_description(self, data: str, language: str = 'English') -> str:
        """Get dataset description using OpenRouter API.

        Args:
            data: CSV sample of the dataset (truncated to 5000 characters)
            language: Language the description should be written in

        Returns:
            The 'description' value of the model's JSON answer, or the raw
            answer text when it is not JSON or has no 'description' key

        Raises:
            AIAPIError: On request failure or unusable response payload
        """
        # Truncate BEFORE building the prompt so that one prompt wording is used
        # for both short and long samples.
        MAX_DATA_LENGTH = 5000
        if len(data) > MAX_DATA_LENGTH:
            data = data[:MAX_DATA_LENGTH] + "\n... (truncated)"

        user_content = (
            "I have the following CSV data sample:\n"
            f"{data}\n"
            f"Please provide a short description of this dataset in {language}. "
            "Consider this as a sample of a larger dataset. "
            "Don't generate code or data examples.\n"
            'Return your response as a JSON object with a "description" key.'
        )
        content = self._complete_chat(
            f"You are a data documentation assistant. Provide concise dataset descriptions in {language}. Always respond with valid JSON.",
            user_content,
            "description",
        )

        try:
            result = json.loads(content)
        except json.JSONDecodeError:
            # Model answered with plain text: use it as the description
            return content

        if isinstance(result, dict) and "description" in result:
            return result["description"]
        return content
398
+
399
+
400
class OllamaProvider(AIService):
    """Ollama local API provider.

    Sends non-streaming requests to ``{base_url}/api/chat`` with ``format: json``
    and tolerates line-delimited or text-wrapped JSON in the reply.
    """

    def __init__(self, api_key: Optional[str] = None, base_url: Optional[str] = None,
                 model: Optional[str] = None, timeout: int = 30):
        """Initialize Ollama provider.

        Args:
            api_key: Not used for Ollama (kept for interface compatibility)
            base_url: Base URL (defaults to OLLAMA_BASE_URL env var, then http://localhost:11434)
            model: Model name (defaults to llama3.2)
            timeout: Request timeout in seconds

        Raises:
            AIConfigurationError: If no model name is configured
        """
        base_url = base_url or os.getenv('OLLAMA_BASE_URL', 'http://localhost:11434')
        model = model or 'llama3.2'
        super().__init__(api_key, base_url, model, timeout)
        self._validate_config()

    def _validate_config(self) -> None:
        """Ollama doesn't require API key; only the model name is checked."""
        if not self.model:
            raise AIConfigurationError("Model is required for OllamaProvider")

    @staticmethod
    def _decode_envelope(response):
        """Parse the HTTP body into the Ollama response object.

        Falls back, in order, to the last line that parses as a JSON object with
        a 'message', 'content' or 'response' key (one-object-per-line replies),
        then to the outermost ``{...}`` span found in the body.

        Raises:
            AIAPIError: If the body is empty or no JSON can be recovered from it
        """
        response_text = response.text.strip()
        try:
            return response.json()
        except ValueError as e:
            if not response_text:
                raise AIAPIError(f"Empty response from Ollama: {str(e)}") from e

            # Start from the last line: it is the most complete object
            for line in reversed(response_text.split('\n')):
                line = line.strip()
                if not line:
                    continue
                try:
                    parsed = json.loads(line)
                except json.JSONDecodeError:
                    continue
                if isinstance(parsed, dict) and ('message' in parsed or 'content' in parsed or 'response' in parsed):
                    return parsed

            json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
            if json_match:
                try:
                    return json.loads(json_match.group(0))
                except json.JSONDecodeError:
                    pass

            raise AIAPIError(f"Failed to parse Ollama response: {str(e)}. Response: {response_text[:500]}") from e

    def _chat(self, system_prompt: str, user_prompt: str) -> str:
        """Run one chat request and return the JSON-looking part of the answer text.

        Args:
            system_prompt: Content of the system message
            user_prompt: Content of the user message

        Returns:
            The assistant message content, reduced to the JSON object inside a
            markdown code fence or the outermost ``{...}`` span when one is
            present; an empty string when the reply carries no message content

        Raises:
            AIAPIError: On request failure or an unparseable HTTP body
        """
        url = f"{self.base_url}/api/chat"
        payload = {
            "model": self.model,
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            "format": "json",
            "stream": False,  # Explicitly disable streaming
            "options": {
                "temperature": 0.3
            }
        }

        try:
            response = requests.post(url, json=payload, timeout=self.timeout)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            # Compare against None explicitly: a requests Response is falsy for 4xx/5xx.
            failed = getattr(e, 'response', None)
            raise AIAPIError(f"Ollama API request failed: {str(e)}",
                             status_code=getattr(failed, 'status_code', None),
                             response=getattr(failed, 'text', None)) from e

        envelope = self._decode_envelope(response)
        # Guard each level: the envelope may be a list and 'message' may be null
        message = envelope.get("message") if isinstance(envelope, dict) else None
        content = message.get("content", "") if isinstance(message, dict) else ""
        if not isinstance(content, str):
            content = ""

        # Try to extract JSON from content if it contains markdown code blocks
        json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', content, re.DOTALL)
        if json_match:
            return json_match.group(1)
        # Otherwise try to find a JSON object in the text
        json_match = re.search(r'\{.*\}', content, re.DOTALL)
        if json_match:
            return json_match.group(0)
        return content

    def get_fields_info(self, fields: list[str], language: str = 'English') -> Dict[str, str]:
        """Get field descriptions using Ollama API.

        Args:
            fields: Names of the data fields to describe
            language: Language the descriptions should be written in

        Returns:
            Mapping of field name to description; requested fields missing from
            the model output get the placeholder ``"Field: <name>"``

        Raises:
            AIAPIError: On request failure or when the answer is not valid JSON
        """
        fields_str = ', '.join(fields)
        content = self._chat(
            f"You are a data documentation assistant. Provide clear, concise descriptions of data fields in {language}. Always respond with valid JSON only.",
            (
                f"Please describe these data fields in {language}: {fields_str}.\n"
                'Return a JSON object with a "fields" array. Each item should have "name" and "description" keys.\n'
                'Example format: {"fields": [{"name": "field1", "description": "..."}, {"name": "field2", "description": "..."}]}'
            ),
        )

        try:
            result = json.loads(content)
        except json.JSONDecodeError as e:
            raise AIAPIError(f"Failed to parse Ollama response: {str(e)}. Content: {content[:200]}") from e

        entries = result.get("fields") if isinstance(result, dict) else None
        field_dict = {}
        if isinstance(entries, list):
            for field_info in entries:
                if isinstance(field_info, dict) and "name" in field_info and "description" in field_info:
                    field_dict[field_info["name"]] = field_info["description"]

        # Ensure all requested fields are in the result
        for field in fields:
            if field not in field_dict:
                field_dict[field] = f"Field: {field}"

        return field_dict

    def get_description(self, data: str, language: str = 'English') -> str:
        """Get dataset description using Ollama API.

        Args:
            data: CSV sample of the dataset (sent as-is, not truncated)
            language: Language the description should be written in

        Returns:
            The 'description' value of the model's JSON answer, or the raw
            answer text when it is not JSON or has no 'description' key

        Raises:
            AIAPIError: On request failure or when the model returned no content
        """
        content = self._chat(
            f"You are a data documentation assistant. Provide concise dataset descriptions in {language}. Always respond with valid JSON only.",
            (
                "I have the following CSV data sample:\n"
                f"{data}\n"
                f"Please provide a short description of this dataset in {language}. "
                "Consider this as a sample of a larger dataset. "
                "Don't generate code or data examples.\n"
                'Return JSON with format: {"description": "..."}'
            ),
        )

        try:
            result = json.loads(content)
        except json.JSONDecodeError as json_err:
            # Handles models that return plain text instead of JSON
            if content:
                return content
            raise AIAPIError(f"Failed to parse Ollama JSON response: {str(json_err)}. Content: {content[:200]}") from json_err

        if isinstance(result, dict) and "description" in result:
            return result["description"]
        # If JSON is valid but doesn't have description, return content as-is
        return content
642
+
643
+
644
class LMStudioProvider(AIService):
    """LM Studio local API provider (OpenAI-compatible).

    Sends requests to ``{base_url}/chat/completions``, first in JSON mode and,
    if the server rejects that with an HTTP error, once more without it.
    """

    def __init__(self, api_key: Optional[str] = None, base_url: Optional[str] = None,
                 model: Optional[str] = None, timeout: int = 30):
        """Initialize LM Studio provider.

        Args:
            api_key: Not used for LM Studio (kept for interface compatibility, can be "lm-studio")
            base_url: Base URL (defaults to LMSTUDIO_BASE_URL env var, then http://localhost:1234/v1)
            model: Model name (REQUIRED - must match a model loaded in LM Studio)
            timeout: Request timeout in seconds

        Raises:
            AIConfigurationError: If no model name is given
        """
        base_url = base_url or os.getenv('LMSTUDIO_BASE_URL', 'http://localhost:1234/v1')
        super().__init__(api_key, base_url, model, timeout)
        self._validate_config()

    def _validate_config(self) -> None:
        """Validate LM Studio configuration."""
        if not self.model:
            raise AIConfigurationError(
                "Model name is required for LM Studio. "
                "Please specify a model name that matches a model loaded in LM Studio. "
                "Example: --ai-model 'your-model-name'"
            )

    def _get_available_models(self) -> list[str]:
        """Get list of available models from LM Studio.

        Best-effort diagnostic helper: any request or parsing problem yields an
        empty list instead of an exception.
        """
        try:
            response = requests.get(f"{self.base_url}/models", timeout=self.timeout)
            if response.status_code != 200:
                return []
            data = response.json()
        except (requests.exceptions.RequestException, ValueError):
            return []

        if isinstance(data, dict) and isinstance(data.get('data'), list):
            return [str(model['id']) for model in data['data']
                    if isinstance(model, dict) and 'id' in model]
        return []

    def _wrap_request_error(self, e):
        """Build an AIAPIError from a requests exception, with model/server hints."""
        available_models = self._get_available_models()
        error_msg = f"LM Studio API request failed: {str(e)}"
        # Must compare against None: a requests Response is falsy for 4xx/5xx,
        # so a plain truthiness test would never see the 404.
        response = getattr(e, 'response', None)
        if available_models:
            error_msg += f"\nAvailable models: {', '.join(available_models)}"
        elif response is not None and response.status_code == 404:
            error_msg += "\nMake sure LM Studio server is running and a model is loaded."
        return AIAPIError(error_msg,
                          status_code=getattr(response, 'status_code', None),
                          response=getattr(response, 'text', None))

    def _chat(self, system_prompt: str, user_prompt: str, purpose: str) -> str:
        """Run one chat completion and return the JSON-looking part of the answer text.

        Args:
            system_prompt: Content of the system message
            user_prompt: Content of the user message
            purpose: Short noun phrase used in error messages (e.g. "description")

        Returns:
            The stripped assistant message content, reduced to the JSON object
            inside a markdown code fence or the outermost ``{...}`` span when
            one is present

        Raises:
            AIAPIError: On request failure, non-JSON body, or unexpected payload shape
        """
        url = f"{self.base_url}/chat/completions"
        headers = {
            "Content-Type": "application/json"
        }
        payload = {
            "model": self.model,
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            "temperature": 0.3,
            # Try with json_object format first (some models support it)
            "response_format": {"type": "json_object"}
        }

        try:
            try:
                response = requests.post(url, headers=headers, json=payload, timeout=self.timeout)
                response.raise_for_status()
            except requests.exceptions.HTTPError:
                # The server answered with an error status: assume response_format
                # is unsupported and retry once without it. Connection errors and
                # timeouts are not retried; a second identical wait would not help.
                payload.pop("response_format", None)
                response = requests.post(url, headers=headers, json=payload, timeout=self.timeout)
                response.raise_for_status()
        except requests.exceptions.RequestException as e:
            raise self._wrap_request_error(e) from e

        try:
            content = response.json()["choices"][0]["message"]["content"].strip()
        except ValueError as e:
            # Covers json.JSONDecodeError and requests' own JSONDecodeError
            raise AIAPIError(f"Failed to parse LM Studio response: {str(e)}") from e
        except (KeyError, IndexError, TypeError, AttributeError) as e:
            raise AIAPIError(f"Failed to extract {purpose} from LM Studio response: {str(e)}") from e

        # Try to extract JSON from markdown code blocks if present
        json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', content, re.DOTALL)
        if json_match:
            return json_match.group(1)
        # Otherwise try to find a JSON object in the text
        json_match = re.search(r'\{.*\}', content, re.DOTALL)
        if json_match:
            return json_match.group(0)
        return content

    def get_fields_info(self, fields: list[str], language: str = 'English') -> Dict[str, str]:
        """Get field descriptions using LM Studio API.

        Args:
            fields: Names of the data fields to describe
            language: Language the descriptions should be written in

        Returns:
            Mapping of field name to description; requested fields missing from
            the model output get the placeholder ``"Field: <name>"``

        Raises:
            AIAPIError: On request failure or when the answer is not valid JSON
        """
        fields_str = ', '.join(fields)
        content = self._chat(
            f"You are a data documentation assistant. Provide clear, concise descriptions of data fields in {language}. Always respond with valid JSON only, no markdown, no code blocks.",
            (
                f"Please describe these data fields in {language}: {fields_str}.\n"
                'Return ONLY a JSON object with a "fields" array. Each item must have "name" and "description" keys.\n'
                'Format: {"fields": [{"name": "field1", "description": "..."}, {"name": "field2", "description": "..."}]}\n'
                "Return only the JSON, nothing else."
            ),
            "field descriptions",
        )

        try:
            result = json.loads(content)
        except json.JSONDecodeError as e:
            raise AIAPIError(f"Failed to parse LM Studio response: {str(e)}. Response content: {content[:200]}") from e

        entries = result.get("fields") if isinstance(result, dict) else None
        field_dict = {}
        if isinstance(entries, list):
            for field_info in entries:
                if isinstance(field_info, dict) and "name" in field_info and "description" in field_info:
                    field_dict[field_info["name"]] = field_info["description"]

        # Ensure all requested fields are in the result
        for field in fields:
            if field not in field_dict:
                field_dict[field] = f"Field: {field}"

        return field_dict

    def get_description(self, data: str, language: str = 'English') -> str:
        """Get dataset description using LM Studio API.

        Args:
            data: CSV sample of the dataset (truncated to 5000 characters)
            language: Language the description should be written in

        Returns:
            The 'description' value of the model's JSON answer, or the raw
            answer text when it is not JSON or has no 'description' key

        Raises:
            AIAPIError: On request failure or unusable response payload
        """
        # Truncate data if too large
        MAX_DATA_LENGTH = 5000
        if len(data) > MAX_DATA_LENGTH:
            data = data[:MAX_DATA_LENGTH] + "\n... (truncated)"

        content = self._chat(
            f"You are a data documentation assistant. Provide concise dataset descriptions in {language}. Always respond with valid JSON only, no markdown, no code blocks.",
            (
                "I have the following CSV data sample:\n"
                f"{data}\n"
                f"Please provide a short description of this dataset in {language}. "
                "Consider this as a sample of a larger dataset. "
                "Don't generate code or data examples.\n"
                'Return ONLY a JSON object with format: {"description": "..."}\n'
                "Return only the JSON, nothing else."
            ),
            "description",
        )

        try:
            result = json.loads(content)
        except json.JSONDecodeError:
            # Not JSON: the content might be a plain text description
            return content

        if isinstance(result, dict) and "description" in result:
            return result["description"]
        return content
843
+
844
+
845
class PerplexityProvider(AIService):
    """Perplexity API provider that asks the model for JSON-formatted answers.

    Both public methods POST to ``{base_url}/chat/completions``, read the
    first choice's message text, and try to recover a JSON object from it
    (optionally wrapped in a markdown code fence).  Every transport, parsing
    or response-shape failure is reported as ``AIAPIError``.
    """

    # Maximum number of characters of sample data sent by get_description();
    # longer input is cut and marked as truncated (the original comment cites
    # Perplexity token limits as the reason).
    MAX_DATA_LENGTH = 5000

    def __init__(self, api_key: Optional[str] = None, base_url: Optional[str] = None,
                 model: Optional[str] = None, timeout: int = 30):
        """Initialize Perplexity provider.

        Args:
            api_key: Perplexity API key.  Only used when the
                PERPLEXITY_API_KEY environment variable is unset or empty;
                when that variable is set it takes precedence over this
                argument.
            base_url: Base URL (defaults to https://api.perplexity.ai)
            model: Model name (defaults to sonar)
            timeout: Request timeout in seconds
        """
        # The environment variable deliberately wins over the passed key.
        env_key = os.getenv('PERPLEXITY_API_KEY')
        if env_key:
            api_key = env_key

        super().__init__(
            api_key,
            base_url or 'https://api.perplexity.ai',
            model or 'sonar',
            timeout,
        )
        # Defined on the base class (not visible here); presumably rejects an
        # incomplete configuration such as a missing API key.
        self._validate_config()

    def _perplexity_chat(self, system_prompt: str, user_prompt: str):
        """POST one system+user exchange and return the decoded JSON body.

        Args:
            system_prompt: Content of the ``system`` message.
            user_prompt: Content of the ``user`` message.

        Returns:
            The response body as decoded by ``response.json()``.

        Raises:
            AIAPIError: If the HTTP request fails, returns an error status,
                or the body is not valid JSON.
        """
        # Tolerate a configured base URL that ends with a slash.
        url = f"{self.base_url.rstrip('/')}/chat/completions"
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json"
        }
        payload = {
            "model": self.model,
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            "temperature": 0.3
        }

        try:
            response = requests.post(url, headers=headers, json=payload, timeout=self.timeout)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            raise AIAPIError(f"Perplexity API request failed: {str(e)}",
                             status_code=getattr(e.response, 'status_code', None),
                             response=getattr(e.response, 'text', None)) from e

        # Decoded separately so that `response` is guaranteed to be bound when
        # the error message below quotes the body.  ValueError covers both the
        # stdlib json.JSONDecodeError and the requests-specific subclass.
        try:
            return response.json()
        except ValueError as e:
            raise AIAPIError(
                f"Failed to parse Perplexity response: {str(e)}. "
                f"Response content: {response.text[:200]}",
                status_code=response.status_code,
                response=response.text) from e

    @staticmethod
    def _first_message_text(body) -> str:
        """Return the stripped text of the first choice in a completion body.

        Raises KeyError, IndexError, TypeError or AttributeError when the body
        does not have the expected ``choices[0].message.content`` string;
        callers translate these into ``AIAPIError``.
        """
        return body["choices"][0]["message"]["content"].strip()

    @staticmethod
    def _json_candidate(content: str) -> str:
        """Return the part of *content* most likely to be a JSON object.

        Preference order: an object inside a markdown code fence, then the
        widest ``{...}`` span in the text, then the text unchanged.
        """
        fenced = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', content, re.DOTALL)
        if fenced:
            return fenced.group(1)
        bare = re.search(r'\{.*\}', content, re.DOTALL)
        return bare.group(0) if bare else content

    def get_fields_info(self, fields: list[str], language: str = 'English') -> Dict[str, str]:
        """Get field descriptions using Perplexity API.

        Args:
            fields: Names of the fields to describe.
            language: Language the descriptions should be written in.

        Returns:
            Mapping of field name to description.  Every requested field is
            present; fields the model did not describe get the placeholder
            ``"Field: <name>"``.

        Raises:
            AIAPIError: If the request fails, the response lacks the expected
                structure, or the model's answer is not parseable JSON.
        """
        fields_str = ', '.join(fields)
        system_prompt = (
            "You are a data documentation assistant. Provide clear, concise "
            f"descriptions of data fields in {language}. Always respond with "
            "valid JSON only, no markdown, no code blocks."
        )
        user_prompt = (
            f"Please describe these data fields in {language}: {fields_str}.\n"
            'Return ONLY a JSON object with a "fields" array. '
            'Each item must have "name" and "description" keys.\n'
            'Format: {"fields": [{"name": "field1", "description": "..."}, '
            '{"name": "field2", "description": "..."}]}\n'
            "Return only the JSON, nothing else."
        )

        body = self._perplexity_chat(system_prompt, user_prompt)
        try:
            content = self._first_message_text(body)
        except (KeyError, IndexError, TypeError, AttributeError) as e:
            raise AIAPIError(
                f"Failed to extract field descriptions from Perplexity response: {str(e)}") from e

        json_text = self._json_candidate(content)
        try:
            result = json.loads(json_text)
        except json.JSONDecodeError as e:
            raise AIAPIError(
                f"Failed to parse Perplexity response: {str(e)}. "
                f"Response content: {json_text[:200]}") from e

        field_dict = {}
        # Ignore anything that is not the requested {"fields": [{...}, ...]}
        # shape instead of failing on it; the placeholders below fill the gaps.
        entries = result.get("fields") if isinstance(result, dict) else None
        if isinstance(entries, list):
            for entry in entries:
                if isinstance(entry, dict) and "name" in entry and "description" in entry:
                    field_dict[entry["name"]] = entry["description"]

        # Ensure all requested fields are in the result
        for field in fields:
            field_dict.setdefault(field, f"Field: {field}")

        return field_dict

    def get_description(self, data: str, language: str = 'English') -> str:
        """Get dataset description using Perplexity API.

        Args:
            data: CSV sample of the dataset; input longer than
                ``MAX_DATA_LENGTH`` characters is truncated before sending.
            language: Language the description should be written in.

        Returns:
            The ``description`` value of the model's JSON answer, or the
            answer text itself when it is not a JSON object with that key
            (the model may have replied in plain text).

        Raises:
            AIAPIError: If the request fails or the response lacks the
                expected structure.
        """
        if len(data) > self.MAX_DATA_LENGTH:
            data = data[:self.MAX_DATA_LENGTH] + "\n... (truncated)"

        system_prompt = (
            "You are a data documentation assistant. Provide concise dataset "
            f"descriptions in {language}. Always respond with valid JSON only, "
            "no markdown, no code blocks."
        )
        user_prompt = (
            "I have the following CSV data sample:\n"
            f"{data}\n"
            f"Please provide a short description of this dataset in {language}. "
            "Consider this as a sample of a larger dataset. "
            "Don't generate code or data examples.\n"
            'Return ONLY a JSON object with format: {"description": "..."}\n'
            "Return only the JSON, nothing else."
        )

        body = self._perplexity_chat(system_prompt, user_prompt)
        try:
            content = self._first_message_text(body)
        except (KeyError, IndexError, TypeError, AttributeError) as e:
            raise AIAPIError(
                f"Failed to extract description from Perplexity response: {str(e)}") from e

        json_text = self._json_candidate(content)
        try:
            result = json.loads(json_text)
        except json.JSONDecodeError:
            # Not JSON after all: treat the text as a plain description.
            return json_text

        if isinstance(result, dict) and "description" in result:
            return result["description"]
        return json_text