dtSpark 1.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. dtSpark/__init__.py +0 -0
  2. dtSpark/_description.txt +1 -0
  3. dtSpark/_full_name.txt +1 -0
  4. dtSpark/_licence.txt +21 -0
  5. dtSpark/_metadata.yaml +6 -0
  6. dtSpark/_name.txt +1 -0
  7. dtSpark/_version.txt +1 -0
  8. dtSpark/aws/__init__.py +7 -0
  9. dtSpark/aws/authentication.py +296 -0
  10. dtSpark/aws/bedrock.py +578 -0
  11. dtSpark/aws/costs.py +318 -0
  12. dtSpark/aws/pricing.py +580 -0
  13. dtSpark/cli_interface.py +2645 -0
  14. dtSpark/conversation_manager.py +3050 -0
  15. dtSpark/core/__init__.py +12 -0
  16. dtSpark/core/application.py +3355 -0
  17. dtSpark/core/context_compaction.py +735 -0
  18. dtSpark/daemon/__init__.py +104 -0
  19. dtSpark/daemon/__main__.py +10 -0
  20. dtSpark/daemon/action_monitor.py +213 -0
  21. dtSpark/daemon/daemon_app.py +730 -0
  22. dtSpark/daemon/daemon_manager.py +289 -0
  23. dtSpark/daemon/execution_coordinator.py +194 -0
  24. dtSpark/daemon/pid_file.py +169 -0
  25. dtSpark/database/__init__.py +482 -0
  26. dtSpark/database/autonomous_actions.py +1191 -0
  27. dtSpark/database/backends.py +329 -0
  28. dtSpark/database/connection.py +122 -0
  29. dtSpark/database/conversations.py +520 -0
  30. dtSpark/database/credential_prompt.py +218 -0
  31. dtSpark/database/files.py +205 -0
  32. dtSpark/database/mcp_ops.py +355 -0
  33. dtSpark/database/messages.py +161 -0
  34. dtSpark/database/schema.py +673 -0
  35. dtSpark/database/tool_permissions.py +186 -0
  36. dtSpark/database/usage.py +167 -0
  37. dtSpark/files/__init__.py +4 -0
  38. dtSpark/files/manager.py +322 -0
  39. dtSpark/launch.py +39 -0
  40. dtSpark/limits/__init__.py +10 -0
  41. dtSpark/limits/costs.py +296 -0
  42. dtSpark/limits/tokens.py +342 -0
  43. dtSpark/llm/__init__.py +17 -0
  44. dtSpark/llm/anthropic_direct.py +446 -0
  45. dtSpark/llm/base.py +146 -0
  46. dtSpark/llm/context_limits.py +438 -0
  47. dtSpark/llm/manager.py +177 -0
  48. dtSpark/llm/ollama.py +578 -0
  49. dtSpark/mcp_integration/__init__.py +5 -0
  50. dtSpark/mcp_integration/manager.py +653 -0
  51. dtSpark/mcp_integration/tool_selector.py +225 -0
  52. dtSpark/resources/config.yaml.template +631 -0
  53. dtSpark/safety/__init__.py +22 -0
  54. dtSpark/safety/llm_service.py +111 -0
  55. dtSpark/safety/patterns.py +229 -0
  56. dtSpark/safety/prompt_inspector.py +442 -0
  57. dtSpark/safety/violation_logger.py +346 -0
  58. dtSpark/scheduler/__init__.py +20 -0
  59. dtSpark/scheduler/creation_tools.py +599 -0
  60. dtSpark/scheduler/execution_queue.py +159 -0
  61. dtSpark/scheduler/executor.py +1152 -0
  62. dtSpark/scheduler/manager.py +395 -0
  63. dtSpark/tools/__init__.py +4 -0
  64. dtSpark/tools/builtin.py +833 -0
  65. dtSpark/web/__init__.py +20 -0
  66. dtSpark/web/auth.py +152 -0
  67. dtSpark/web/dependencies.py +37 -0
  68. dtSpark/web/endpoints/__init__.py +17 -0
  69. dtSpark/web/endpoints/autonomous_actions.py +1125 -0
  70. dtSpark/web/endpoints/chat.py +621 -0
  71. dtSpark/web/endpoints/conversations.py +353 -0
  72. dtSpark/web/endpoints/main_menu.py +547 -0
  73. dtSpark/web/endpoints/streaming.py +421 -0
  74. dtSpark/web/server.py +578 -0
  75. dtSpark/web/session.py +167 -0
  76. dtSpark/web/ssl_utils.py +195 -0
  77. dtSpark/web/static/css/dark-theme.css +427 -0
  78. dtSpark/web/static/js/actions.js +1101 -0
  79. dtSpark/web/static/js/chat.js +614 -0
  80. dtSpark/web/static/js/main.js +496 -0
  81. dtSpark/web/static/js/sse-client.js +242 -0
  82. dtSpark/web/templates/actions.html +408 -0
  83. dtSpark/web/templates/base.html +93 -0
  84. dtSpark/web/templates/chat.html +814 -0
  85. dtSpark/web/templates/conversations.html +350 -0
  86. dtSpark/web/templates/goodbye.html +81 -0
  87. dtSpark/web/templates/login.html +90 -0
  88. dtSpark/web/templates/main_menu.html +983 -0
  89. dtSpark/web/templates/new_conversation.html +191 -0
  90. dtSpark/web/web_interface.py +137 -0
  91. dtspark-1.0.4.dist-info/METADATA +187 -0
  92. dtspark-1.0.4.dist-info/RECORD +96 -0
  93. dtspark-1.0.4.dist-info/WHEEL +5 -0
  94. dtspark-1.0.4.dist-info/entry_points.txt +3 -0
  95. dtspark-1.0.4.dist-info/licenses/LICENSE +21 -0
  96. dtspark-1.0.4.dist-info/top_level.txt +1 -0
@@ -0,0 +1,186 @@
1
+ """
2
+ Tool permissions operations module.
3
+
4
+ This module handles:
5
+ - Checking tool permissions for conversations
6
+ - Setting tool permission states
7
+ - Managing first-time tool usage prompts
8
+ """
9
+
10
+ import sqlite3
11
+ import logging
12
+ from datetime import datetime
13
+ from typing import Optional, Dict, List
14
+
15
+
16
+ # Permission states
17
+ PERMISSION_ALLOWED = 'allowed' # Run this time and all future times
18
+ PERMISSION_DENIED = 'denied' # Never run this tool
19
+
20
+
21
def check_tool_permission(conn: sqlite3.Connection, conversation_id: int,
                          tool_name: str, user_guid: Optional[str] = None) -> Optional[str]:
    """
    Check the permission state for a tool in a conversation.
    Returns None if no record exists (first-time usage, should prompt).

    Args:
        conn: Database connection (rows must support access by column name,
            e.g. sqlite3.Row)
        conversation_id: ID of the conversation
        tool_name: Name of the tool
        user_guid: User GUID for multi-user support. None matches records
            that were stored without a user (NULL user_guid).

    Returns:
        Permission state ('allowed', 'denied') or None if no record exists
    """
    # In SQL, "user_guid = NULL" is never true, so binding None to
    # "user_guid = ?" could never find a record. Use IS NULL in that case.
    # Only one of two constant fragments is interpolated; values stay bound.
    if user_guid is None:
        user_clause = 'user_guid IS NULL'
        params = (conversation_id, tool_name)
    else:
        user_clause = 'user_guid = ?'
        params = (conversation_id, tool_name, user_guid)

    cursor = conn.cursor()
    cursor.execute(f'''
        SELECT permission_state
        FROM conversation_tool_permissions
        WHERE conversation_id = ? AND tool_name = ? AND {user_clause}
    ''', params)

    row = cursor.fetchone()
    if row is None:
        # No record exists - first time usage, should prompt user
        return None
    return row['permission_state']
48
+
49
+
50
def set_tool_permission(conn: sqlite3.Connection, conversation_id: int,
                        tool_name: str, permission_state: str,
                        user_guid: Optional[str] = None) -> bool:
    """
    Store (insert or update) the permission state of a tool for a conversation.

    An existing record for the same (conversation_id, tool_name) pair keeps
    its granted_at and user_guid; only permission_state and updated_at are
    overwritten.

    Args:
        conn: Database connection
        conversation_id: ID of the conversation
        tool_name: Name of the tool
        permission_state: Permission state ('allowed' or 'denied')
        user_guid: User GUID for multi-user support

    Returns:
        True if the record was written and committed, False if the state
        was invalid or the write failed (the transaction is rolled back)
    """
    # Reject anything that is not one of the two known states up front.
    if permission_state not in (PERMISSION_ALLOWED, PERMISSION_DENIED):
        logging.error(f"Invalid permission state: {permission_state}")
        return False

    upsert_sql = '''
        INSERT INTO conversation_tool_permissions
        (conversation_id, tool_name, permission_state, granted_at, updated_at, user_guid)
        VALUES (?, ?, ?, ?, ?, ?)
        ON CONFLICT(conversation_id, tool_name) DO UPDATE SET
            permission_state = excluded.permission_state,
            updated_at = excluded.updated_at
    '''

    try:
        db_cursor = conn.cursor()
        # Same timestamp is used for granted_at and updated_at on first insert.
        stamp = datetime.now().isoformat()
        values = (conversation_id, tool_name, permission_state, stamp, stamp, user_guid)
        db_cursor.execute(upsert_sql, values)

        conn.commit()
        logging.info(f"Tool permission '{tool_name}' set to '{permission_state}' for conversation {conversation_id}")
        return True

    except Exception as e:
        logging.error(f"Failed to set tool permission: {e}")
        conn.rollback()
        return False
91
+
92
+
93
def get_all_tool_permissions(conn: sqlite3.Connection, conversation_id: int,
                             user_guid: Optional[str] = None) -> List[Dict]:
    """
    Get all tool permissions for a conversation, most recently updated first.

    Args:
        conn: Database connection (rows must support access by column name,
            e.g. sqlite3.Row)
        conversation_id: ID of the conversation
        user_guid: User GUID for multi-user support. None matches records
            that were stored without a user (NULL user_guid).

    Returns:
        List of dicts with 'tool_name', 'permission_state', 'granted_at', 'updated_at'
    """
    # "user_guid = NULL" is never true in SQL, so a missing user needs IS NULL.
    # Only one of two constant fragments is interpolated; values stay bound.
    if user_guid is None:
        user_clause = 'user_guid IS NULL'
        params = (conversation_id,)
    else:
        user_clause = 'user_guid = ?'
        params = (conversation_id, user_guid)

    cursor = conn.cursor()
    cursor.execute(f'''
        SELECT tool_name, permission_state, granted_at, updated_at
        FROM conversation_tool_permissions
        WHERE conversation_id = ? AND {user_clause}
        ORDER BY updated_at DESC
    ''', params)

    return [
        {
            'tool_name': row['tool_name'],
            'permission_state': row['permission_state'],
            'granted_at': row['granted_at'],
            'updated_at': row['updated_at'],
        }
        for row in cursor.fetchall()
    ]
124
+
125
+
126
def delete_tool_permission(conn: sqlite3.Connection, conversation_id: int,
                           tool_name: str, user_guid: Optional[str] = None) -> bool:
    """
    Delete a tool permission record (reset to first-time usage behavior).

    Args:
        conn: Database connection
        conversation_id: ID of the conversation
        tool_name: Name of the tool
        user_guid: User GUID for multi-user support. None targets records
            that were stored without a user (NULL user_guid).

    Returns:
        True if the statement ran and was committed (even when no record
        matched), False if it failed and was rolled back
    """
    # "user_guid = NULL" is never true in SQL, so a missing user needs IS NULL.
    # Only one of two constant fragments is interpolated; values stay bound.
    if user_guid is None:
        user_clause = 'user_guid IS NULL'
        params = (conversation_id, tool_name)
    else:
        user_clause = 'user_guid = ?'
        params = (conversation_id, tool_name, user_guid)

    try:
        cursor = conn.cursor()
        cursor.execute(f'''
            DELETE FROM conversation_tool_permissions
            WHERE conversation_id = ? AND tool_name = ? AND {user_clause}
        ''', params)

        conn.commit()
        deleted_count = cursor.rowcount
        # Only log when something was actually removed.
        if deleted_count > 0:
            logging.info(f"Deleted tool permission for '{tool_name}' in conversation {conversation_id}")
        return True

    except Exception as e:
        logging.error(f"Failed to delete tool permission: {e}")
        conn.rollback()
        return False
157
+
158
+
159
def is_tool_allowed(conn: sqlite3.Connection, conversation_id: int,
                    tool_name: str, user_guid: Optional[str] = None) -> Optional[bool]:
    """
    Translate the stored permission state of a tool into a tri-state answer.

    Args:
        conn: Database connection
        conversation_id: ID of the conversation
        tool_name: Name of the tool
        user_guid: User GUID for multi-user support

    Returns:
        True if the tool is allowed, False if it is denied, and None when
        the user should be prompted (no record yet, or an unrecognised
        stored state, which is also logged as a warning)
    """
    state = check_tool_permission(conn, conversation_id, tool_name, user_guid)

    # No record exists - should prompt user
    if state is None:
        return None

    if state == PERMISSION_ALLOWED:
        return True

    if state == PERMISSION_DENIED:
        return False

    # Anything else is unexpected data; treat it like "ask again".
    logging.warning(f"Unknown permission state: {state}")
    return None
@@ -0,0 +1,167 @@
1
+ """
2
+ Usage tracking module.
3
+
4
+ This module handles:
5
+ - Recording token and cost usage
6
+ - Retrieving usage within time windows
7
+ - Usage summary and reporting
8
+ - Cleanup of old usage data
9
+ """
10
+
11
+ import sqlite3
12
+ import logging
13
+ from datetime import datetime
14
+ from typing import List, Dict, Optional, Tuple
15
+
16
+
17
def record_usage(conn: sqlite3.Connection, conversation_id: int, model_id: str,
                 region: str, input_tokens: int, output_tokens: int, cost: float,
                 timestamp: datetime, user_guid: str = None):
    """
    Insert one usage row (tokens and cost) into usage_tracking and commit.

    Args:
        conn: Database connection
        conversation_id: ID of the conversation
        model_id: Bedrock model ID
        region: AWS region
        input_tokens: Number of input tokens
        output_tokens: Number of output tokens
        cost: Cost in USD
        timestamp: Timestamp of usage
        user_guid: User GUID for multi-user support
    """
    insert_sql = '''
        INSERT INTO usage_tracking
        (conversation_id, model_id, region, input_tokens, output_tokens, cost, timestamp, user_guid)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
    '''
    # Value order mirrors the column list in the statement above.
    values = (
        conversation_id,
        model_id,
        region,
        input_tokens,
        output_tokens,
        cost,
        timestamp,
        user_guid,
    )

    db_cursor = conn.cursor()
    db_cursor.execute(insert_sql, values)
    conn.commit()
41
+
42
+
43
def get_usage_in_window(conn: sqlite3.Connection, window_start: datetime,
                        user_guid: Optional[str] = None) -> float:
    """
    Get total cost for usage since window_start.

    Args:
        conn: Database connection (rows must support access by column name,
            e.g. sqlite3.Row)
        window_start: Start of rolling window (inclusive)
        user_guid: User GUID for multi-user support. None matches usage
            that was recorded without a user (NULL user_guid).

    Returns:
        Total cost in USD, 0.0 when no usage matches
    """
    # "user_guid = NULL" is never true in SQL, so a missing user needs IS NULL.
    # Only one of two constant fragments is interpolated; values stay bound.
    if user_guid is None:
        user_clause = 'user_guid IS NULL'
        params = (window_start,)
    else:
        user_clause = 'user_guid = ?'
        params = (window_start, user_guid)

    cursor = conn.cursor()
    cursor.execute(f'''
        SELECT SUM(cost) as total_cost
        FROM usage_tracking
        WHERE timestamp >= ? AND {user_clause}
    ''', params)

    result = cursor.fetchone()
    # SUM over zero rows yields NULL, which is reported as zero cost.
    total_cost = result['total_cost']
    return total_cost if total_cost is not None else 0.0
65
+
66
+
67
def get_oldest_usage_in_window(conn: sqlite3.Connection,
                               window_start: datetime, user_guid: Optional[str] = None) -> Optional[datetime]:
    """
    Get timestamp of oldest usage in the rolling window.

    Args:
        conn: Database connection (rows must support access by column name,
            e.g. sqlite3.Row)
        window_start: Start of rolling window (inclusive)
        user_guid: User GUID for multi-user support. None matches usage
            that was recorded without a user (NULL user_guid).

    Returns:
        Timestamp of oldest usage or None when no usage matches
    """
    # "user_guid = NULL" is never true in SQL, so a missing user needs IS NULL.
    # Only one of two constant fragments is interpolated; values stay bound.
    if user_guid is None:
        user_clause = 'user_guid IS NULL'
        params = (window_start,)
    else:
        user_clause = 'user_guid = ?'
        params = (window_start, user_guid)

    cursor = conn.cursor()
    cursor.execute(f'''
        SELECT MIN(timestamp) as oldest_timestamp
        FROM usage_tracking
        WHERE timestamp >= ? AND {user_clause}
    ''', params)

    result = cursor.fetchone()
    if result and result['oldest_timestamp']:
        oldest = result['oldest_timestamp']
        # A connection with a timestamp converter may already hand back a
        # datetime; fromisoformat() only accepts strings.
        if isinstance(oldest, datetime):
            return oldest
        return datetime.fromisoformat(oldest)
    return None
91
+
92
+
93
def get_token_usage_in_window(conn: sqlite3.Connection,
                              window_start: datetime, user_guid: Optional[str] = None) -> Tuple[int, int]:
    """
    Get total token usage (input and output separately) since window_start.

    Args:
        conn: Database connection (rows must support access by column name,
            e.g. sqlite3.Row)
        window_start: Start of rolling window (inclusive)
        user_guid: User GUID for multi-user support. None matches usage
            that was recorded without a user (NULL user_guid).

    Returns:
        Tuple of (total_input_tokens, total_output_tokens); (0, 0) when no
        usage matches
    """
    # "user_guid = NULL" is never true in SQL, so a missing user needs IS NULL.
    # Only one of two constant fragments is interpolated; values stay bound.
    if user_guid is None:
        user_clause = 'user_guid IS NULL'
        params = (window_start,)
    else:
        user_clause = 'user_guid = ?'
        params = (window_start, user_guid)

    cursor = conn.cursor()
    cursor.execute(f'''
        SELECT
            SUM(input_tokens) as total_input,
            SUM(output_tokens) as total_output
        FROM usage_tracking
        WHERE timestamp >= ? AND {user_clause}
    ''', params)

    result = cursor.fetchone()
    # SUM over zero rows yields NULL; report that as zero tokens.
    total_input = result['total_input'] if result['total_input'] is not None else 0
    total_output = result['total_output'] if result['total_output'] is not None else 0
    return int(total_input), int(total_output)
119
+
120
+
121
def get_usage_summary(conn: sqlite3.Connection, window_start: datetime,
                      user_guid: Optional[str] = None) -> List[Dict]:
    """
    Get detailed usage summary for the rolling window, grouped by model and
    region and ordered by total cost (highest first).

    Args:
        conn: Database connection (rows must be convertible with dict(),
            e.g. sqlite3.Row)
        window_start: Start of rolling window (inclusive)
        user_guid: User GUID for multi-user support. None matches usage
            that was recorded without a user (NULL user_guid).

    Returns:
        List of dicts with 'model_id', 'region', 'total_input_tokens',
        'total_output_tokens', 'total_cost' and 'request_count'
    """
    # "user_guid = NULL" is never true in SQL, so a missing user needs IS NULL.
    # Only one of two constant fragments is interpolated; values stay bound.
    if user_guid is None:
        user_clause = 'user_guid IS NULL'
        params = (window_start,)
    else:
        user_clause = 'user_guid = ?'
        params = (window_start, user_guid)

    cursor = conn.cursor()
    cursor.execute(f'''
        SELECT model_id, region,
            SUM(input_tokens) as total_input_tokens,
            SUM(output_tokens) as total_output_tokens,
            SUM(cost) as total_cost,
            COUNT(*) as request_count
        FROM usage_tracking
        WHERE timestamp >= ? AND {user_clause}
        GROUP BY model_id, region
        ORDER BY total_cost DESC
    ''', params)

    return [dict(row) for row in cursor.fetchall()]
148
+
149
+
150
def cleanup_old_usage(conn: sqlite3.Connection, cutoff_date: datetime,
                      user_guid: Optional[str] = None):
    """
    Clean up usage records older than cutoff_date.

    Args:
        conn: Database connection
        cutoff_date: Delete records strictly older than this date
        user_guid: User GUID for multi-user support (for safety filtering).
            None targets only usage recorded without a user (NULL
            user_guid); other users' rows are never touched.
    """
    # "user_guid = NULL" is never true in SQL, so without this branch rows
    # recorded without a user could never be cleaned up.
    # Only one of two constant fragments is interpolated; values stay bound.
    if user_guid is None:
        user_clause = 'user_guid IS NULL'
        params = (cutoff_date,)
    else:
        user_clause = 'user_guid = ?'
        params = (cutoff_date, user_guid)

    cursor = conn.cursor()
    cursor.execute(f'''
        DELETE FROM usage_tracking
        WHERE timestamp < ? AND {user_clause}
    ''', params)
    deleted_count = cursor.rowcount
    conn.commit()
    logging.info(f"Cleaned up {deleted_count} old usage records for user {user_guid}")
@@ -0,0 +1,4 @@
1
+ """File management module."""
2
+ from .manager import FileManager
3
+
4
+ __all__ = ['FileManager']
@@ -0,0 +1,322 @@
1
+ """
2
+ File Manager module for processing various file types for conversation context.
3
+
4
+ This module provides functionality for:
5
+ - Extracting text from documents (PDF, DOCX, TXT, MD, CSV)
6
+ - Processing images as base64-encoded data
7
+ - Token counting for file content
8
+ """
9
+
10
+ import base64
11
+ import logging
12
+ import mimetypes
13
+ from pathlib import Path
14
+ from typing import Dict, Optional, Tuple
15
+
16
+
17
class FileManager:
    """
    Manages file processing for conversation context.

    Text and code files are read as text, PDF/DOCX documents are converted
    to text, and images are returned base64-encoded. Every processed file
    also gets a token count (exact when a token-counting service is
    supplied, otherwise a rough estimate).
    """

    # Supported file extensions
    SUPPORTED_TEXT_FILES = {'.txt', '.md', '.csv', '.json', '.yaml', '.yml', '.xml', '.log'}
    SUPPORTED_CODE_FILES = {
        '.py', '.js', '.ts', '.jsx', '.tsx',  # Python, JavaScript, TypeScript
        '.java', '.kt', '.scala',  # JVM languages
        '.c', '.cpp', '.cc', '.h', '.hpp',  # C/C++
        '.cs',  # C#
        '.go', '.rs', '.rb',  # Go, Rust, Ruby
        '.php', '.swift', '.m',  # PHP, Swift, Objective-C
        '.sh', '.bash', '.zsh', '.ps1',  # Shell scripts
        '.sql', '.r',  # SQL, R
        '.html', '.css', '.scss', '.sass',  # Web
        '.vue', '.svelte',  # Web frameworks
        '.toml', '.ini', '.cfg', '.conf',  # Config files
        '.dockerfile', '.tf', '.hcl',  # Infrastructure
    }
    SUPPORTED_DOCUMENT_FILES = {'.pdf', '.docx', '.doc'}
    SUPPORTED_IMAGE_FILES = {'.png', '.jpg', '.jpeg', '.gif', '.bmp', '.webp'}

    def __init__(self, bedrock_service=None):
        """
        Initialise the file manager.

        Args:
            bedrock_service: Optional BedrockService instance for token counting
        """
        self.bedrock_service = bedrock_service

    @classmethod
    def _all_supported_extensions(cls) -> set:
        """Return the union of every supported extension category."""
        return (
            cls.SUPPORTED_TEXT_FILES
            | cls.SUPPORTED_CODE_FILES
            | cls.SUPPORTED_DOCUMENT_FILES
            | cls.SUPPORTED_IMAGE_FILES
        )

    @classmethod
    def is_supported(cls, file_path: str) -> bool:
        """
        Check if a file type is supported.

        The check is based on the (case-insensitive) file extension only.

        Args:
            file_path: Path to the file

        Returns:
            True if the file type is supported
        """
        ext = Path(file_path).suffix.lower()
        return ext in cls._all_supported_extensions()

    def process_file(self, file_path: str) -> Dict:
        """
        Process a file and extract its content.

        Args:
            file_path: Path to the file

        Returns:
            Dictionary containing:
            - filename: Original filename
            - file_type: File extension
            - file_size: Size in bytes
            - content_text: Extracted text content (for documents)
            - content_base64: Base64 encoded content (for images)
            - mime_type: MIME type (for images)
            - token_count: Token count of extracted content
            - error: Error message if processing failed

            When the file is missing or unsupported, only 'error' is present.
        """
        path = Path(file_path)

        if not path.exists():
            return {'error': f"File not found: {file_path}"}

        if not self.is_supported(file_path):
            return {'error': f"Unsupported file type: {path.suffix}"}

        ext = path.suffix.lower()

        result = {
            'filename': path.name,
            'file_type': ext,
            'file_size': path.stat().st_size,
            'content_text': None,
            'content_base64': None,
            'mime_type': None,
            'token_count': 0
        }

        try:
            # Process text and code files
            if ext in (self.SUPPORTED_TEXT_FILES | self.SUPPORTED_CODE_FILES):
                result['content_text'] = self._extract_text_file(file_path)
                if result['content_text']:
                    result['token_count'] = self._count_tokens(result['content_text'])

            # Process documents
            elif ext in self.SUPPORTED_DOCUMENT_FILES:
                if ext == '.pdf':
                    result['content_text'] = self._extract_pdf(file_path)
                elif ext in {'.docx', '.doc'}:
                    # NOTE(review): legacy .doc goes through the same
                    # python-docx path as .docx - confirm that library can
                    # actually open such files.
                    result['content_text'] = self._extract_docx(file_path)

                if result['content_text']:
                    result['token_count'] = self._count_tokens(result['content_text'])

            # Process images
            elif ext in self.SUPPORTED_IMAGE_FILES:
                result['content_base64'], result['mime_type'] = self._encode_image(file_path)
                # Images don't have text token count, but we could estimate based on size
                # For now, we'll use a simple heuristic: ~1 token per 750 bytes
                result['token_count'] = max(1, result['file_size'] // 750)

            logging.info(f"Processed file: {path.name} ({result['file_type']}, {result['token_count']} tokens)")

        except Exception as e:
            # Failures are reported to the caller through the 'error' key
            # rather than raised.
            logging.error(f"Error processing file {file_path}: {e}")
            result['error'] = str(e)

        return result

    def _extract_text_file(self, file_path: str) -> str:
        """
        Extract text from a plain text file.

        Args:
            file_path: Path to the file

        Returns:
            Extracted text content
        """
        try:
            # Try UTF-8 first
            with open(file_path, 'r', encoding='utf-8') as f:
                return f.read()
        except UnicodeDecodeError:
            # Fall back to latin-1 if UTF-8 fails
            try:
                with open(file_path, 'r', encoding='latin-1') as f:
                    return f.read()
            except Exception as e:
                logging.error(f"Failed to read text file {file_path}: {e}")
                raise

    def _extract_pdf(self, file_path: str) -> str:
        """
        Extract text from a PDF file.

        Pages without extractable text are skipped; the rest are joined
        with a "--- Page N ---" header before each page.

        Args:
            file_path: Path to the PDF file

        Returns:
            Extracted text content

        Raises:
            ImportError: If the pypdf library is not installed
        """
        try:
            import pypdf

            text_parts = []
            with open(file_path, 'rb') as f:
                pdf_reader = pypdf.PdfReader(f)
                for page_num, page in enumerate(pdf_reader.pages):
                    # Guard against a page yielding no text object at all.
                    text = page.extract_text() or ''
                    if text.strip():
                        text_parts.append(f"--- Page {page_num + 1} ---\n{text}")

            return '\n\n'.join(text_parts)

        except ImportError as exc:
            raise ImportError(
                "pypdf library is required for PDF processing. Install with: pip install pypdf"
            ) from exc
        except Exception as e:
            logging.error(f"Failed to extract PDF {file_path}: {e}")
            raise

    def _extract_docx(self, file_path: str) -> str:
        """
        Extract text from a DOCX file.

        Non-empty paragraphs come first, followed by table rows rendered as
        "cell | cell | ...".

        Args:
            file_path: Path to the DOCX file

        Returns:
            Extracted text content

        Raises:
            ImportError: If the python-docx library is not installed
        """
        try:
            import docx

            doc = docx.Document(file_path)
            text_parts = []

            # Extract paragraphs
            for para in doc.paragraphs:
                if para.text.strip():
                    text_parts.append(para.text)

            # Extract tables
            for table in doc.tables:
                for row in table.rows:
                    row_text = ' | '.join(cell.text.strip() for cell in row.cells)
                    if row_text.strip():
                        text_parts.append(row_text)

            return '\n\n'.join(text_parts)

        except ImportError as exc:
            raise ImportError(
                "python-docx library is required for DOCX processing. Install with: pip install python-docx"
            ) from exc
        except Exception as e:
            logging.error(f"Failed to extract DOCX {file_path}: {e}")
            raise

    def _encode_image(self, file_path: str) -> Tuple[str, str]:
        """
        Encode an image as base64.

        Args:
            file_path: Path to the image file

        Returns:
            Tuple of (base64_string, mime_type)
        """
        try:
            # Read and encode image
            with open(file_path, 'rb') as f:
                image_data = f.read()

            base64_string = base64.b64encode(image_data).decode('utf-8')

            # Determine MIME type
            mime_type, _ = mimetypes.guess_type(file_path)
            if not mime_type:
                # Default to common image types
                ext = Path(file_path).suffix.lower()
                mime_map = {
                    '.jpg': 'image/jpeg',
                    '.jpeg': 'image/jpeg',
                    '.png': 'image/png',
                    '.gif': 'image/gif',
                    '.bmp': 'image/bmp',
                    '.webp': 'image/webp'
                }
                mime_type = mime_map.get(ext, 'application/octet-stream')

            return base64_string, mime_type

        except Exception as e:
            logging.error(f"Failed to encode image {file_path}: {e}")
            raise

    def _count_tokens(self, text: str) -> int:
        """
        Count tokens in text.

        Args:
            text: Text to count tokens for

        Returns:
            Token count (from the configured service when available,
            otherwise an estimate of at least 1)
        """
        if self.bedrock_service:
            return self.bedrock_service.count_tokens(text)
        # Rough estimate: ~4 characters per token
        return max(1, len(text) // 4)

    @classmethod
    def get_supported_extensions(cls) -> str:
        """
        Get a formatted string of supported file extensions.

        Includes every category accepted by is_supported (text, code,
        document and image files) so the two never disagree.

        Returns:
            Comma-separated, alphabetically sorted list of supported extensions
        """
        return ', '.join(sorted(cls._all_supported_extensions()))

    @classmethod
    def scan_directory(cls, directory_path: str, recursive: bool = False) -> list:
        """
        Scan a directory for supported files.

        Args:
            directory_path: Path to the directory to scan
            recursive: If True, scan subdirectories as well

        Returns:
            Sorted list of absolute file paths for supported files

        Raises:
            FileNotFoundError: If directory_path does not exist
            NotADirectoryError: If directory_path is not a directory
        """
        dir_path = Path(directory_path)

        if not dir_path.exists():
            raise FileNotFoundError(f"Directory not found: {directory_path}")

        if not dir_path.is_dir():
            raise NotADirectoryError(f"Not a directory: {directory_path}")

        # rglob walks the whole tree; iterdir stays in the immediate directory.
        candidates = dir_path.rglob('*') if recursive else dir_path.iterdir()

        return sorted(
            str(file_path.absolute())
            for file_path in candidates
            if file_path.is_file() and cls.is_supported(str(file_path))
        )