dtSpark 1.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dtSpark/__init__.py +0 -0
- dtSpark/_description.txt +1 -0
- dtSpark/_full_name.txt +1 -0
- dtSpark/_licence.txt +21 -0
- dtSpark/_metadata.yaml +6 -0
- dtSpark/_name.txt +1 -0
- dtSpark/_version.txt +1 -0
- dtSpark/aws/__init__.py +7 -0
- dtSpark/aws/authentication.py +296 -0
- dtSpark/aws/bedrock.py +578 -0
- dtSpark/aws/costs.py +318 -0
- dtSpark/aws/pricing.py +580 -0
- dtSpark/cli_interface.py +2645 -0
- dtSpark/conversation_manager.py +3050 -0
- dtSpark/core/__init__.py +12 -0
- dtSpark/core/application.py +3355 -0
- dtSpark/core/context_compaction.py +735 -0
- dtSpark/daemon/__init__.py +104 -0
- dtSpark/daemon/__main__.py +10 -0
- dtSpark/daemon/action_monitor.py +213 -0
- dtSpark/daemon/daemon_app.py +730 -0
- dtSpark/daemon/daemon_manager.py +289 -0
- dtSpark/daemon/execution_coordinator.py +194 -0
- dtSpark/daemon/pid_file.py +169 -0
- dtSpark/database/__init__.py +482 -0
- dtSpark/database/autonomous_actions.py +1191 -0
- dtSpark/database/backends.py +329 -0
- dtSpark/database/connection.py +122 -0
- dtSpark/database/conversations.py +520 -0
- dtSpark/database/credential_prompt.py +218 -0
- dtSpark/database/files.py +205 -0
- dtSpark/database/mcp_ops.py +355 -0
- dtSpark/database/messages.py +161 -0
- dtSpark/database/schema.py +673 -0
- dtSpark/database/tool_permissions.py +186 -0
- dtSpark/database/usage.py +167 -0
- dtSpark/files/__init__.py +4 -0
- dtSpark/files/manager.py +322 -0
- dtSpark/launch.py +39 -0
- dtSpark/limits/__init__.py +10 -0
- dtSpark/limits/costs.py +296 -0
- dtSpark/limits/tokens.py +342 -0
- dtSpark/llm/__init__.py +17 -0
- dtSpark/llm/anthropic_direct.py +446 -0
- dtSpark/llm/base.py +146 -0
- dtSpark/llm/context_limits.py +438 -0
- dtSpark/llm/manager.py +177 -0
- dtSpark/llm/ollama.py +578 -0
- dtSpark/mcp_integration/__init__.py +5 -0
- dtSpark/mcp_integration/manager.py +653 -0
- dtSpark/mcp_integration/tool_selector.py +225 -0
- dtSpark/resources/config.yaml.template +631 -0
- dtSpark/safety/__init__.py +22 -0
- dtSpark/safety/llm_service.py +111 -0
- dtSpark/safety/patterns.py +229 -0
- dtSpark/safety/prompt_inspector.py +442 -0
- dtSpark/safety/violation_logger.py +346 -0
- dtSpark/scheduler/__init__.py +20 -0
- dtSpark/scheduler/creation_tools.py +599 -0
- dtSpark/scheduler/execution_queue.py +159 -0
- dtSpark/scheduler/executor.py +1152 -0
- dtSpark/scheduler/manager.py +395 -0
- dtSpark/tools/__init__.py +4 -0
- dtSpark/tools/builtin.py +833 -0
- dtSpark/web/__init__.py +20 -0
- dtSpark/web/auth.py +152 -0
- dtSpark/web/dependencies.py +37 -0
- dtSpark/web/endpoints/__init__.py +17 -0
- dtSpark/web/endpoints/autonomous_actions.py +1125 -0
- dtSpark/web/endpoints/chat.py +621 -0
- dtSpark/web/endpoints/conversations.py +353 -0
- dtSpark/web/endpoints/main_menu.py +547 -0
- dtSpark/web/endpoints/streaming.py +421 -0
- dtSpark/web/server.py +578 -0
- dtSpark/web/session.py +167 -0
- dtSpark/web/ssl_utils.py +195 -0
- dtSpark/web/static/css/dark-theme.css +427 -0
- dtSpark/web/static/js/actions.js +1101 -0
- dtSpark/web/static/js/chat.js +614 -0
- dtSpark/web/static/js/main.js +496 -0
- dtSpark/web/static/js/sse-client.js +242 -0
- dtSpark/web/templates/actions.html +408 -0
- dtSpark/web/templates/base.html +93 -0
- dtSpark/web/templates/chat.html +814 -0
- dtSpark/web/templates/conversations.html +350 -0
- dtSpark/web/templates/goodbye.html +81 -0
- dtSpark/web/templates/login.html +90 -0
- dtSpark/web/templates/main_menu.html +983 -0
- dtSpark/web/templates/new_conversation.html +191 -0
- dtSpark/web/web_interface.py +137 -0
- dtspark-1.0.4.dist-info/METADATA +187 -0
- dtspark-1.0.4.dist-info/RECORD +96 -0
- dtspark-1.0.4.dist-info/WHEEL +5 -0
- dtspark-1.0.4.dist-info/entry_points.txt +3 -0
- dtspark-1.0.4.dist-info/licenses/LICENSE +21 -0
- dtspark-1.0.4.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Tool permissions operations module.
|
|
3
|
+
|
|
4
|
+
This module handles:
|
|
5
|
+
- Checking tool permissions for conversations
|
|
6
|
+
- Setting tool permission states
|
|
7
|
+
- Managing first-time tool usage prompts
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import sqlite3
|
|
11
|
+
import logging
|
|
12
|
+
from datetime import datetime
|
|
13
|
+
from typing import Optional, Dict, List
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# Permission states
|
|
17
|
+
PERMISSION_ALLOWED = 'allowed' # Run this time and all future times
|
|
18
|
+
PERMISSION_DENIED = 'denied' # Never run this tool
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def check_tool_permission(conn: sqlite3.Connection, conversation_id: int,
                          tool_name: str, user_guid: Optional[str] = None) -> Optional[str]:
    """
    Check the permission state for a tool in a conversation.

    Returns None if no record exists (first-time usage, should prompt).

    Args:
        conn: Database connection (row_factory must support name-based
              access, e.g. sqlite3.Row)
        conversation_id: ID of the conversation
        tool_name: Name of the tool
        user_guid: User GUID for multi-user support (None in single-user mode)

    Returns:
        Permission state ('allowed', 'denied') or None if no record exists
    """
    cursor = conn.cursor()
    # Fix: use IS (not =) for user_guid so that user_guid=None matches rows
    # stored with a NULL guid; in SQL, '= NULL' never matches anything, which
    # made permissions saved in single-user mode (NULL guid) invisible here.
    cursor.execute('''
        SELECT permission_state
        FROM conversation_tool_permissions
        WHERE conversation_id = ? AND tool_name = ? AND user_guid IS ?
    ''', (conversation_id, tool_name, user_guid))

    row = cursor.fetchone()
    if row is None:
        # No record exists - first time usage, should prompt user
        return None
    return row['permission_state']
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def set_tool_permission(conn: sqlite3.Connection, conversation_id: int,
                        tool_name: str, permission_state: str,
                        user_guid: Optional[str] = None) -> bool:
    """
    Persist the permission state for a tool within a conversation.

    Performs an insert-or-update keyed on (conversation_id, tool_name):
    a new record gets both granted_at and updated_at set to now, while an
    existing record keeps its granted_at and only refreshes updated_at.

    NOTE(review): the conflict target omits user_guid - this assumes the
    table's uniqueness constraint covers (conversation_id, tool_name) only;
    verify against the schema definition.

    Args:
        conn: Database connection
        conversation_id: ID of the conversation
        tool_name: Name of the tool
        permission_state: Permission state ('allowed' or 'denied')
        user_guid: User GUID for multi-user support

    Returns:
        True if successful, False otherwise
    """
    if permission_state not in (PERMISSION_ALLOWED, PERMISSION_DENIED):
        logging.error(f"Invalid permission state: {permission_state}")
        return False

    moment = datetime.now().isoformat()
    try:
        conn.cursor().execute(
            '''
            INSERT INTO conversation_tool_permissions
            (conversation_id, tool_name, permission_state, granted_at, updated_at, user_guid)
            VALUES (?, ?, ?, ?, ?, ?)
            ON CONFLICT(conversation_id, tool_name) DO UPDATE SET
                permission_state = excluded.permission_state,
                updated_at = excluded.updated_at
            ''',
            (conversation_id, tool_name, permission_state, moment, moment, user_guid),
        )
        conn.commit()
        logging.info(f"Tool permission '{tool_name}' set to '{permission_state}' for conversation {conversation_id}")
        return True
    except Exception as exc:
        logging.error(f"Failed to set tool permission: {exc}")
        conn.rollback()
        return False
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def get_all_tool_permissions(conn: sqlite3.Connection, conversation_id: int,
                             user_guid: Optional[str] = None) -> List[Dict]:
    """
    Get all tool permissions for a conversation, most recently updated first.

    Args:
        conn: Database connection (row_factory must support name-based
              access, e.g. sqlite3.Row)
        conversation_id: ID of the conversation
        user_guid: User GUID for multi-user support (None in single-user mode)

    Returns:
        List of dicts with 'tool_name', 'permission_state', 'granted_at', 'updated_at'
    """
    cursor = conn.cursor()
    # Fix: IS (not =) so that user_guid=None matches rows stored with a NULL
    # guid; '= NULL' never matches in SQL, hiding single-user-mode records.
    cursor.execute('''
        SELECT tool_name, permission_state, granted_at, updated_at
        FROM conversation_tool_permissions
        WHERE conversation_id = ? AND user_guid IS ?
        ORDER BY updated_at DESC
    ''', (conversation_id, user_guid))

    return [
        {
            'tool_name': row['tool_name'],
            'permission_state': row['permission_state'],
            'granted_at': row['granted_at'],
            'updated_at': row['updated_at'],
        }
        for row in cursor.fetchall()
    ]
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def delete_tool_permission(conn: sqlite3.Connection, conversation_id: int,
                           tool_name: str, user_guid: Optional[str] = None) -> bool:
    """
    Delete a tool permission record (reset to first-time usage behavior).

    Args:
        conn: Database connection
        conversation_id: ID of the conversation
        tool_name: Name of the tool
        user_guid: User GUID for multi-user support (None in single-user mode)

    Returns:
        True if a record was deleted, False otherwise
    """
    try:
        cursor = conn.cursor()
        # Fix: IS (not =) so that user_guid=None matches rows stored with a
        # NULL guid; '= NULL' never matches in SQL.
        cursor.execute('''
            DELETE FROM conversation_tool_permissions
            WHERE conversation_id = ? AND tool_name = ? AND user_guid IS ?
        ''', (conversation_id, tool_name, user_guid))

        conn.commit()
        if cursor.rowcount > 0:
            logging.info(f"Deleted tool permission for '{tool_name}' in conversation {conversation_id}")
            return True
        # Fix: the original fell off the end here and implicitly returned
        # None when no row matched; return False explicitly per the contract.
        return False

    except Exception as e:
        logging.error(f"Failed to delete tool permission: {e}")
        conn.rollback()
        return False
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def is_tool_allowed(conn: sqlite3.Connection, conversation_id: int,
                    tool_name: str, user_guid: Optional[str] = None) -> Optional[bool]:
    """
    Check if a tool is allowed to run.

    Returns None if permission should be requested from user (first-time usage).
    Returns True if allowed, False if denied.

    Args:
        conn: Database connection
        conversation_id: ID of the conversation
        tool_name: Name of the tool
        user_guid: User GUID for multi-user support

    Returns:
        True if allowed, False if denied, None if should prompt user
    """
    state = check_tool_permission(conn, conversation_id, tool_name, user_guid)

    if state is None:
        # No record yet - the caller should prompt the user.
        return None
    if state == PERMISSION_ALLOWED:
        return True
    if state == PERMISSION_DENIED:
        return False

    # Unexpected value stored in the database - treat like a missing record.
    logging.warning(f"Unknown permission state: {state}")
    return None
|
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Usage tracking module.
|
|
3
|
+
|
|
4
|
+
This module handles:
|
|
5
|
+
- Recording token and cost usage
|
|
6
|
+
- Retrieving usage within time windows
|
|
7
|
+
- Usage summary and reporting
|
|
8
|
+
- Cleanup of old usage data
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import sqlite3
|
|
12
|
+
import logging
|
|
13
|
+
from datetime import datetime
|
|
14
|
+
from typing import List, Dict, Optional, Tuple
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def record_usage(conn: sqlite3.Connection, conversation_id: int, model_id: str,
                 region: str, input_tokens: int, output_tokens: int, cost: float,
                 timestamp: datetime, user_guid: str = None):
    """
    Record a single usage event for token management and billing.

    Inserts one row into usage_tracking and commits immediately.

    Args:
        conn: Database connection
        conversation_id: ID of the conversation
        model_id: Bedrock model ID
        region: AWS region
        input_tokens: Number of input tokens
        output_tokens: Number of output tokens
        cost: Cost in USD
        timestamp: Timestamp of usage
        user_guid: User GUID for multi-user support
    """
    values = (conversation_id, model_id, region, input_tokens,
              output_tokens, cost, timestamp, user_guid)
    conn.cursor().execute(
        '''
        INSERT INTO usage_tracking
        (conversation_id, model_id, region, input_tokens, output_tokens, cost, timestamp, user_guid)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
        ''',
        values,
    )
    conn.commit()
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def get_usage_in_window(conn: sqlite3.Connection, window_start: datetime,
                        user_guid: str = None) -> float:
    """
    Get total cost for usage since window_start.

    Args:
        conn: Database connection (row_factory must support name-based
              access, e.g. sqlite3.Row)
        window_start: Start of rolling window
        user_guid: User GUID for multi-user support (None in single-user mode)

    Returns:
        Total cost in USD (0.0 when no matching records exist)
    """
    cursor = conn.cursor()
    # Fix: IS (not =) so that user_guid=None matches rows recorded with a
    # NULL guid; '= NULL' never matches in SQL, so single-user usage was
    # silently excluded from the total.
    cursor.execute('''
        SELECT SUM(cost) as total_cost
        FROM usage_tracking
        WHERE timestamp >= ? AND user_guid IS ?
    ''', (window_start, user_guid))

    result = cursor.fetchone()
    # SUM() yields NULL when no rows match; normalise to 0.0.
    return result['total_cost'] if result['total_cost'] is not None else 0.0
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def get_oldest_usage_in_window(conn: sqlite3.Connection,
                               window_start: datetime, user_guid: str = None) -> Optional[datetime]:
    """
    Get timestamp of oldest usage in the rolling window.

    Args:
        conn: Database connection (row_factory must support name-based
              access, e.g. sqlite3.Row)
        window_start: Start of rolling window
        user_guid: User GUID for multi-user support (None in single-user mode)

    Returns:
        Timestamp of oldest usage or None when the window is empty
    """
    cursor = conn.cursor()
    # Fix: IS (not =) so that user_guid=None matches rows recorded with a
    # NULL guid; '= NULL' never matches in SQL.
    cursor.execute('''
        SELECT MIN(timestamp) as oldest_timestamp
        FROM usage_tracking
        WHERE timestamp >= ? AND user_guid IS ?
    ''', (window_start, user_guid))

    result = cursor.fetchone()
    if result and result['oldest_timestamp']:
        # Timestamps are stored as ISO-format strings; convert back.
        return datetime.fromisoformat(result['oldest_timestamp'])
    return None
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def get_token_usage_in_window(conn: sqlite3.Connection,
                              window_start: datetime, user_guid: str = None) -> Tuple[int, int]:
    """
    Get total token usage (input and output separately) since window_start.

    Args:
        conn: Database connection (row_factory must support name-based
              access, e.g. sqlite3.Row)
        window_start: Start of rolling window
        user_guid: User GUID for multi-user support (None in single-user mode)

    Returns:
        Tuple of (total_input_tokens, total_output_tokens); (0, 0) when the
        window contains no records
    """
    cursor = conn.cursor()
    # Fix: IS (not =) so that user_guid=None matches rows recorded with a
    # NULL guid; '= NULL' never matches in SQL.
    cursor.execute('''
        SELECT
            SUM(input_tokens) as total_input,
            SUM(output_tokens) as total_output
        FROM usage_tracking
        WHERE timestamp >= ? AND user_guid IS ?
    ''', (window_start, user_guid))

    result = cursor.fetchone()
    # SUM() yields NULL when no rows match; normalise to 0.
    total_input = result['total_input'] if result['total_input'] is not None else 0
    total_output = result['total_output'] if result['total_output'] is not None else 0
    return int(total_input), int(total_output)
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def get_usage_summary(conn: sqlite3.Connection, window_start: datetime,
                      user_guid: str = None) -> List[Dict]:
    """
    Get detailed usage summary for the rolling window, grouped by model
    and region and ordered by total cost (highest first).

    Args:
        conn: Database connection (row_factory must be sqlite3.Row so rows
              can be converted to dicts)
        window_start: Start of rolling window
        user_guid: User GUID for multi-user support (None in single-user mode)

    Returns:
        List of dicts with keys 'model_id', 'region', 'total_input_tokens',
        'total_output_tokens', 'total_cost', 'request_count'
    """
    cursor = conn.cursor()
    # Fix: IS (not =) so that user_guid=None matches rows recorded with a
    # NULL guid; '= NULL' never matches in SQL, yielding an empty summary
    # in single-user mode.
    cursor.execute('''
        SELECT model_id, region,
               SUM(input_tokens) as total_input_tokens,
               SUM(output_tokens) as total_output_tokens,
               SUM(cost) as total_cost,
               COUNT(*) as request_count
        FROM usage_tracking
        WHERE timestamp >= ? AND user_guid IS ?
        GROUP BY model_id, region
        ORDER BY total_cost DESC
    ''', (window_start, user_guid))

    return [dict(row) for row in cursor.fetchall()]
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def cleanup_old_usage(conn: sqlite3.Connection, cutoff_date: datetime,
                      user_guid: str = None):
    """
    Clean up usage records older than cutoff_date.

    Args:
        conn: Database connection
        cutoff_date: Delete records older than this date
        user_guid: User GUID for multi-user support (for safety filtering;
                   None targets single-user records stored with a NULL guid)
    """
    cursor = conn.cursor()
    # Fix: IS (not =) so that user_guid=None matches rows recorded with a
    # NULL guid; with '=' the cleanup silently deleted nothing in
    # single-user mode and the table grew without bound.
    cursor.execute('''
        DELETE FROM usage_tracking
        WHERE timestamp < ? AND user_guid IS ?
    ''', (cutoff_date, user_guid))
    deleted_count = cursor.rowcount
    conn.commit()
    logging.info(f"Cleaned up {deleted_count} old usage records for user {user_guid}")
|
dtSpark/files/manager.py
ADDED
|
@@ -0,0 +1,322 @@
|
|
|
1
|
+
"""
|
|
2
|
+
File Manager module for processing various file types for conversation context.
|
|
3
|
+
|
|
4
|
+
This module provides functionality for:
|
|
5
|
+
- Extracting text from documents (PDF, DOCX, TXT, MD, CSV)
|
|
6
|
+
- Processing images as base64-encoded data
|
|
7
|
+
- Token counting for file content
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import base64
|
|
11
|
+
import logging
|
|
12
|
+
import mimetypes
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import Dict, Optional, Tuple
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class FileManager:
    """Manages file processing for conversation context.

    Extracts text from plain-text/code/document files, base64-encodes
    images, and estimates token counts for the extracted content.
    """

    # Supported file extensions, grouped by how they are processed.
    SUPPORTED_TEXT_FILES = {'.txt', '.md', '.csv', '.json', '.yaml', '.yml', '.xml', '.log'}
    SUPPORTED_CODE_FILES = {
        '.py', '.js', '.ts', '.jsx', '.tsx',  # Python, JavaScript, TypeScript
        '.java', '.kt', '.scala',  # JVM languages
        '.c', '.cpp', '.cc', '.h', '.hpp',  # C/C++
        '.cs',  # C#
        '.go', '.rs', '.rb',  # Go, Rust, Ruby
        '.php', '.swift', '.m',  # PHP, Swift, Objective-C
        '.sh', '.bash', '.zsh', '.ps1',  # Shell scripts
        '.sql', '.r',  # SQL, R
        '.html', '.css', '.scss', '.sass',  # Web
        '.vue', '.svelte',  # Web frameworks
        '.toml', '.ini', '.cfg', '.conf',  # Config files
        '.dockerfile', '.tf', '.hcl',  # Infrastructure
    }
    SUPPORTED_DOCUMENT_FILES = {'.pdf', '.docx', '.doc'}
    SUPPORTED_IMAGE_FILES = {'.png', '.jpg', '.jpeg', '.gif', '.bmp', '.webp'}

    def __init__(self, bedrock_service=None):
        """
        Initialise the file manager.

        Args:
            bedrock_service: Optional BedrockService instance for token counting
        """
        self.bedrock_service = bedrock_service

    @classmethod
    def is_supported(cls, file_path: str) -> bool:
        """
        Check if a file type is supported.

        Args:
            file_path: Path to the file

        Returns:
            True if the file type is supported
        """
        ext = Path(file_path).suffix.lower()
        return ext in (cls.SUPPORTED_TEXT_FILES | cls.SUPPORTED_CODE_FILES
                       | cls.SUPPORTED_DOCUMENT_FILES | cls.SUPPORTED_IMAGE_FILES)

    def process_file(self, file_path: str) -> Dict:
        """
        Process a file and extract its content.

        Args:
            file_path: Path to the file

        Returns:
            Dictionary containing:
            - filename: Original filename
            - file_type: File extension
            - file_size: Size in bytes
            - content_text: Extracted text content (for documents)
            - content_base64: Base64 encoded content (for images)
            - mime_type: MIME type (for images)
            - token_count: Token count of extracted content
            - error: Error message if processing failed
        """
        path = Path(file_path)

        if not path.exists():
            return {'error': f"File not found: {file_path}"}

        if not self.is_supported(file_path):
            return {'error': f"Unsupported file type: {path.suffix}"}

        result = {
            'filename': path.name,
            'file_type': path.suffix.lower(),
            'file_size': path.stat().st_size,
            'content_text': None,
            'content_base64': None,
            'mime_type': None,
            'token_count': 0
        }

        ext = path.suffix.lower()

        try:
            # Process text and code files
            if ext in (self.SUPPORTED_TEXT_FILES | self.SUPPORTED_CODE_FILES):
                result['content_text'] = self._extract_text_file(file_path)
                if result['content_text']:
                    result['token_count'] = self._count_tokens(result['content_text'])

            # Process documents
            elif ext in self.SUPPORTED_DOCUMENT_FILES:
                if ext == '.pdf':
                    result['content_text'] = self._extract_pdf(file_path)
                elif ext in {'.docx', '.doc'}:
                    result['content_text'] = self._extract_docx(file_path)

                if result['content_text']:
                    result['token_count'] = self._count_tokens(result['content_text'])

            # Process images
            elif ext in self.SUPPORTED_IMAGE_FILES:
                result['content_base64'], result['mime_type'] = self._encode_image(file_path)
                # Images don't have text token count, but we could estimate based on size
                # For now, we'll use a simple heuristic: ~1 token per 750 bytes
                result['token_count'] = max(1, result['file_size'] // 750)

            logging.info(f"Processed file: {path.name} ({result['file_type']}, {result['token_count']} tokens)")

        except Exception as e:
            logging.error(f"Error processing file {file_path}: {e}")
            result['error'] = str(e)

        return result

    def _extract_text_file(self, file_path: str) -> str:
        """
        Extract text from a plain text file.

        Args:
            file_path: Path to the file

        Returns:
            Extracted text content

        Raises:
            OSError: If the file cannot be read with either encoding
        """
        try:
            # Try UTF-8 first
            with open(file_path, 'r', encoding='utf-8') as f:
                return f.read()
        except UnicodeDecodeError:
            # Fall back to latin-1 (decodes any byte sequence) if UTF-8 fails
            try:
                with open(file_path, 'r', encoding='latin-1') as f:
                    return f.read()
            except Exception as e:
                logging.error(f"Failed to read text file {file_path}: {e}")
                raise

    def _extract_pdf(self, file_path: str) -> str:
        """
        Extract text from a PDF file.

        Args:
            file_path: Path to the PDF file

        Returns:
            Extracted text content, with '--- Page N ---' separators

        Raises:
            ImportError: If the pypdf library is not installed
        """
        try:
            import pypdf

            text_parts = []
            with open(file_path, 'rb') as f:
                pdf_reader = pypdf.PdfReader(f)
                for page_num, page in enumerate(pdf_reader.pages):
                    text = page.extract_text()
                    # Skip pages that yield no text (e.g. scanned images)
                    if text.strip():
                        text_parts.append(f"--- Page {page_num + 1} ---\n{text}")

            return '\n\n'.join(text_parts)

        except ImportError:
            raise ImportError("pypdf library is required for PDF processing. Install with: pip install pypdf")
        except Exception as e:
            logging.error(f"Failed to extract PDF {file_path}: {e}")
            raise

    def _extract_docx(self, file_path: str) -> str:
        """
        Extract text from a DOCX file.

        Args:
            file_path: Path to the DOCX file

        Returns:
            Extracted text content (paragraphs followed by table rows
            joined with ' | ')

        Raises:
            ImportError: If the python-docx library is not installed
        """
        try:
            import docx

            doc = docx.Document(file_path)
            text_parts = []

            # Extract paragraphs
            for para in doc.paragraphs:
                if para.text.strip():
                    text_parts.append(para.text)

            # Extract tables
            for table in doc.tables:
                for row in table.rows:
                    row_text = ' | '.join(cell.text.strip() for cell in row.cells)
                    if row_text.strip():
                        text_parts.append(row_text)

            return '\n\n'.join(text_parts)

        except ImportError:
            raise ImportError("python-docx library is required for DOCX processing. Install with: pip install python-docx")
        except Exception as e:
            logging.error(f"Failed to extract DOCX {file_path}: {e}")
            raise

    def _encode_image(self, file_path: str) -> Tuple[str, str]:
        """
        Encode an image as base64.

        Args:
            file_path: Path to the image file

        Returns:
            Tuple of (base64_string, mime_type)
        """
        try:
            # Read and encode image
            with open(file_path, 'rb') as f:
                image_data = f.read()

            base64_string = base64.b64encode(image_data).decode('utf-8')

            # Determine MIME type
            mime_type, _ = mimetypes.guess_type(file_path)
            if not mime_type:
                # Default to common image types when guess_type cannot tell
                ext = Path(file_path).suffix.lower()
                mime_map = {
                    '.jpg': 'image/jpeg',
                    '.jpeg': 'image/jpeg',
                    '.png': 'image/png',
                    '.gif': 'image/gif',
                    '.bmp': 'image/bmp',
                    '.webp': 'image/webp'
                }
                mime_type = mime_map.get(ext, 'application/octet-stream')

            return base64_string, mime_type

        except Exception as e:
            logging.error(f"Failed to encode image {file_path}: {e}")
            raise

    def _count_tokens(self, text: str) -> int:
        """
        Count tokens in text.

        Args:
            text: Text to count tokens for

        Returns:
            Token count (exact via bedrock_service when available,
            otherwise a ~4-characters-per-token estimate, minimum 1)
        """
        if self.bedrock_service:
            return self.bedrock_service.count_tokens(text)
        # Rough estimate: ~4 characters per token
        return max(1, len(text) // 4)

    @classmethod
    def get_supported_extensions(cls) -> str:
        """
        Get a formatted string of supported file extensions.

        Returns:
            Comma-separated list of supported extensions
        """
        # Fix: include SUPPORTED_CODE_FILES, which the original omitted even
        # though is_supported() accepts those extensions - the user-facing
        # listing was inconsistent with actual behaviour.
        all_extensions = sorted(
            cls.SUPPORTED_TEXT_FILES | cls.SUPPORTED_CODE_FILES
            | cls.SUPPORTED_DOCUMENT_FILES | cls.SUPPORTED_IMAGE_FILES
        )
        return ', '.join(all_extensions)

    @classmethod
    def scan_directory(cls, directory_path: str, recursive: bool = False) -> list:
        """
        Scan a directory for supported files.

        Args:
            directory_path: Path to the directory to scan
            recursive: If True, scan subdirectories as well

        Returns:
            Sorted list of absolute file paths for supported files

        Raises:
            FileNotFoundError: If the directory does not exist
            NotADirectoryError: If the path is not a directory
        """
        dir_path = Path(directory_path)

        if not dir_path.exists():
            raise FileNotFoundError(f"Directory not found: {directory_path}")

        if not dir_path.is_dir():
            raise NotADirectoryError(f"Not a directory: {directory_path}")

        # rglob walks subdirectories; iterdir stays in the immediate directory.
        # (The redundant local 'from pathlib import Path' was removed - Path is
        # already imported at module level.)
        entries = dir_path.rglob('*') if recursive else dir_path.iterdir()
        supported_files = [
            str(entry.absolute())
            for entry in entries
            if entry.is_file() and cls.is_supported(str(entry))
        ]

        return sorted(supported_files)
|