PikoAi 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- Agents/Executor/__init__.py +0 -0
- Agents/Executor/executor.py +242 -0
- Agents/Executor/prompts.py +99 -0
- Agents/__init__.py +0 -0
- Env/__init__.py +5 -0
- Env/base_env.py +44 -0
- Env/base_executor.py +24 -0
- Env/env.py +5 -0
- Env/js_executor.py +63 -0
- Env/python_executor.py +96 -0
- Env/shell.py +55 -0
- Tools/__init__.py +0 -0
- Tools/file_task.py +240 -0
- Tools/system_details.py +76 -0
- Tools/tool_manager.py +51 -0
- Tools/userinp.py +34 -0
- Tools/web_loader.py +173 -0
- Tools/web_search.py +30 -0
- llm_interface/__init__.py +0 -0
- llm_interface/llm.py +148 -0
- pikoai-0.1.0.dist-info/METADATA +101 -0
- pikoai-0.1.0.dist-info/RECORD +26 -0
- pikoai-0.1.0.dist-info/WHEEL +5 -0
- pikoai-0.1.0.dist-info/entry_points.txt +2 -0
- pikoai-0.1.0.dist-info/licenses/LICENSE +21 -0
- pikoai-0.1.0.dist-info/top_level.txt +4 -0
Tools/file_task.py
ADDED
@@ -0,0 +1,240 @@
|
|
1
|
+
import os
|
2
|
+
from PyPDF2 import PdfReader
|
3
|
+
from PyPDF2 import errors as PyPDF2Errors
|
4
|
+
import docx
|
5
|
+
from docx.opc import exceptions as DocxOpcExceptions
|
6
|
+
|
7
|
+
# Directories whose contents must never be exposed. Entries are lowercase and
# end with a separator so a simple startswith() test matches files *inside* them.
_FORBIDDEN_DIRS = [
    "/etc/", "/root/", "/sys/", "/proc/", "/dev/", "/boot/", "/sbin/", "/usr/sbin/",
    "c:\\windows\\", "c:\\program files\\", "c:\\program files (x86)\\",
    "c:\\users\\default\\",
    "/system/", "/library/", "/private/", "/applications/", "/usr/bin/"
]

# Sensitive entries directly under the user's home directory.
_SENSITIVE_HOME_FILES = (".gitconfig", ".bash_history", ".zsh_history",
                         ".python_history", ".npmrc", ".yarnrc", ".gemrc")
_SENSITIVE_HOME_DIRS = (".ssh", ".aws", ".gcloud", ".gnupg", ".docker", ".kube")


def _security_check(abs_file_path):
    """Return an error dict if access to abs_file_path is restricted, else None."""
    normalized = abs_file_path.lower()
    if any(normalized.startswith(d) for d in _FORBIDDEN_DIRS):
        return {"success": False, "output": f"Error: Access to system or restricted directory '{abs_file_path}' is not allowed."}
    try:
        home = os.path.expanduser("~").lower()
        sensitive_files = [os.path.join(home, name).lower() for name in _SENSITIVE_HOME_FILES]
        sensitive_dirs = [os.path.join(home, name).lower() for name in _SENSITIVE_HOME_DIRS]
        if normalized in sensitive_files:
            return {"success": False, "output": f"Error: Access to sensitive user configuration file '{normalized}' is restricted."}
        # Files *inside* a sensitive directory (path starts with dir + separator).
        if any(normalized.startswith(d + os.sep) for d in sensitive_dirs):
            return {"success": False, "output": f"Error: Access to files within sensitive user directory '{os.path.dirname(normalized)}' is restricted."}
        # The sensitive directory itself (e.g. trying to read ~/.ssh as a file).
        if normalized in sensitive_dirs:
            return {"success": False, "output": f"Error: Direct access to sensitive user directory '{normalized}' is restricted."}
    except Exception:
        # NOTE(review): if resolving the home directory fails, the home-path
        # checks are silently skipped (matches the original behavior); the
        # forbidden-directory check above has already run regardless.
        pass
    return None


def _read_pdf(file_path):
    """Extract text from a PDF file; returns the standard success/output dict."""
    try:
        with open(file_path, "rb") as f:
            reader = PdfReader(f)
            if reader.is_encrypted:
                # Encrypted PDFs are rejected rather than attempting decryption.
                return {"success": False, "output": f"Error: PDF file '{file_path}' is encrypted and cannot be read without a password."}
            # extract_text() can return None for pages without a text layer.
            content = "".join(page.extract_text() or "" for page in reader.pages)
        return {"success": True, "output": content}
    except PyPDF2Errors.FileNotDecryptedError:
        return {"success": False, "output": f"Error: PDF file '{file_path}' is encrypted and cannot be read."}
    except PyPDF2Errors.PdfReadError as pe:
        return {"success": False, "output": f"Error: Could not read PDF file '{file_path}'. It may be corrupted, not a valid PDF, or an unsupported format. Details: {str(pe)}"}
    except Exception as e:  # general fallback for other PDF issues
        return {"success": False, "output": f"Error processing PDF file '{file_path}': {str(e)}"}


def _read_docx(file_path):
    """Extract paragraph text from a DOCX file; returns the standard dict."""
    try:
        doc = docx.Document(file_path)
        content = "".join(para.text + "\n" for para in doc.paragraphs)
        return {"success": True, "output": content}
    except DocxOpcExceptions.PackageNotFoundError:
        return {"success": False, "output": f"Error: File '{file_path}' is not a valid DOCX file, is corrupted, or is not a compatible OOXML package."}
    except Exception as e:  # general fallback for other DOCX issues
        return {"success": False, "output": f"Error processing DOCX file '{file_path}': {str(e)}"}


def _read_text(file_path):
    """Read a plain UTF-8 text file; returns the standard dict."""
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            return {"success": True, "output": f.read()}
    except UnicodeDecodeError as ude:
        return {"success": False, "output": f"Error: Could not decode file '{file_path}' using UTF-8. It might be a binary file or use a different text encoding. Details: {str(ude)}"}
    except Exception as e:  # general fallback for text files
        return {"success": False, "output": f"Error reading text file '{file_path}': {str(e)}"}


def file_reader(**kwargs) -> dict:
    """Reads the content of a specified file and returns it.

    Supports PDF (.pdf), DOCX (.docx) and plain UTF-8 text files. Access to
    system directories and sensitive user configuration paths is refused.

    Args:
        **kwargs: Keyword arguments with 'file_path' specifying the file to read.

    Returns:
        Dictionary with 'success' (bool), 'output' (file content or error message).
    """
    try:
        # Validate input
        if "file_path" not in kwargs:
            return {"success": False, "output": "Error: 'file_path' is required."}
        file_path = kwargs["file_path"]

        # Security checks run before any filesystem probing.
        denial = _security_check(os.path.abspath(file_path))
        if denial is not None:
            return denial

        # Existence / readability checks (after security checks).
        if not os.path.isfile(file_path):
            return {"success": False, "output": f"Error: File '{file_path}' does not exist."}
        if not os.access(file_path, os.R_OK):
            return {"success": False, "output": f"Error: No read permission for '{file_path}'."}

        # Dispatch on the (lowercased) file extension.
        _, file_extension = os.path.splitext(file_path)
        file_extension = file_extension.lower()
        if file_extension == ".pdf":
            return _read_pdf(file_path)
        if file_extension == ".docx":
            return _read_docx(file_path)
        return _read_text(file_path)

    except Exception as e:
        return {"success": False, "output": f"An unexpected error occurred while trying to read '{file_path}': {str(e)}"}
|
134
|
+
|
135
|
+
def file_maker(**kwargs) -> dict:
    """Creates an empty file at the specified path.

    Args:
        **kwargs: Keyword arguments with 'file_path' specifying the file to create.

    Returns:
        Dictionary with 'success' (bool), 'output' (confirmation or error message).
    """
    try:
        # Validate input
        if "file_path" not in kwargs:
            return {"success": False, "output": "Error: 'file_path' is required."}

        file_path = kwargs["file_path"]

        # Security check: prevent creation in sensitive directories.
        forbidden_dirs = ["/etc", "/root", "/sys", "/proc"]
        if any(file_path.startswith(d) for d in forbidden_dirs):
            return {"success": False, "output": "Error: Creation in system directories is restricted."}

        # Creating over an existing path is treated as an error.
        if os.path.exists(file_path):
            return {"success": False, "output": f"Error: File '{file_path}' already exists."}

        # Create the parent directory if needed. Guard against a bare
        # filename: os.path.dirname() then returns "" and os.makedirs("")
        # raises, which previously made relative filenames fail.
        parent_dir = os.path.dirname(file_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        # Create empty file.
        with open(file_path, "w", encoding="utf-8"):
            pass

        return {"success": True, "output": f"File '{file_path}' created successfully."}

    except Exception as e:
        return {"success": False, "output": f"Error: {str(e)}"}
|
171
|
+
|
172
|
+
def file_writer(**kwargs) -> dict:
    """Writes or appends content to a specified file.

    Args:
        **kwargs: Keyword arguments with 'file_path' (str), 'content' (str),
            and optional 'append' (bool, default False) to append instead of
            overwrite.

    Returns:
        Dictionary with 'success' (bool), 'output' (confirmation or error message).
    """
    try:
        # Validate input
        if "file_path" not in kwargs or "content" not in kwargs:
            return {"success": False, "output": "Error: 'file_path' and 'content' are required."}

        file_path = kwargs["file_path"]
        content = kwargs["content"]
        append_mode = kwargs.get("append", False)

        # Security check: prevent writing to sensitive directories.
        forbidden_dirs = ["/etc", "/root", "/sys", "/proc"]
        if any(file_path.startswith(d) for d in forbidden_dirs):
            return {"success": False, "output": "Error: Writing to system directories is restricted."}

        # Create the parent directory if needed. Guard against a bare
        # filename: os.path.dirname() then returns "" and os.makedirs("")
        # raises, which previously made relative filenames fail.
        parent_dir = os.path.dirname(file_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        # Write or append to file.
        mode = "a" if append_mode else "w"
        with open(file_path, mode, encoding="utf-8") as f:
            f.write(content)

        action = "appended to" if append_mode else "written to"
        return {"success": True, "output": f"Content {action} '{file_path}' successfully."}

    except Exception as e:
        return {"success": False, "output": f"Error: {str(e)}"}
|
208
|
+
|
209
|
+
def directory_maker(**kwargs) -> dict:
    """Creates a directory at the specified path.

    Intermediate directories are created as needed (os.makedirs semantics).

    Args:
        **kwargs: Keyword arguments with 'dir_path' specifying the directory to create.

    Returns:
        Dictionary with 'success' (bool), 'output' (confirmation or error message).
    """
    try:
        if "dir_path" not in kwargs:
            return {"success": False, "output": "Error: 'dir_path' is required."}

        target = kwargs["dir_path"]

        # Refuse to create anything under protected system locations.
        for restricted in ("/etc", "/root", "/sys", "/proc"):
            if target.startswith(restricted):
                return {"success": False, "output": "Error: Creation in system directories is restricted."}

        # Creating over an existing path is treated as an error.
        if os.path.exists(target):
            return {"success": False, "output": f"Error: Directory '{target}' already exists."}

        os.makedirs(target)
        return {"success": True, "output": f"Directory '{target}' created successfully."}

    except Exception as e:
        return {"success": False, "output": f"Error: {str(e)}"}
|
Tools/system_details.py
ADDED
@@ -0,0 +1,76 @@
|
|
1
|
+
import platform
|
2
|
+
import psutil
|
3
|
+
import datetime
|
4
|
+
import json
|
5
|
+
|
6
|
+
def get_os_details():
    """Return operating system identification as a dict.

    Keys: system, release, version, machine, processor — each populated by
    the platform function of the same name.
    """
    fields = ("system", "release", "version", "machine", "processor")
    return {field: getattr(platform, field)() for field in fields}
|
15
|
+
|
16
|
+
def get_datetime():
    """Get the current local date, time and timezone name.

    Returns:
        dict: 'date' (YYYY-MM-DD), 'time' (HH:MM:SS) and 'timezone' (local
        tz abbreviation). All three fields derive from a single clock read
        so they are mutually consistent — the original read the clock twice,
        which could straddle a midnight or DST boundary.
    """
    now = datetime.datetime.now().astimezone()
    return {
        "date": now.strftime("%Y-%m-%d"),
        "time": now.strftime("%H:%M:%S"),
        "timezone": now.tzname()
    }
|
24
|
+
|
25
|
+
def get_memory_usage():
    """Return virtual-memory usage as human-readable strings.

    Keys: total/available/used (GB, two decimals) and percent (e.g. '42.0%'),
    all taken from one psutil.virtual_memory() snapshot.
    """
    snapshot = psutil.virtual_memory()
    gib = 1024 ** 3
    return {
        "total": f"{snapshot.total / gib:.2f} GB",
        "available": f"{snapshot.available / gib:.2f} GB",
        "used": f"{snapshot.used / gib:.2f} GB",
        "percent": f"{snapshot.percent}%"
    }
|
34
|
+
|
35
|
+
def get_cpu_info():
    """Get CPU information.

    Returns:
        dict: 'physical_cores', 'total_cores', 'cpu_freq' (dict of formatted
        MHz strings, or None when frequency data is unavailable) and
        'cpu_usage' (percentage string).
    """
    # Call cpu_freq() once (the original called it three times) and guard
    # against None — psutil documents that frequency info can be unavailable
    # on some platforms, which previously raised AttributeError here.
    freq = psutil.cpu_freq()
    if freq is not None:
        cpu_freq = {
            "current": f"{freq.current:.2f} MHz",
            "min": f"{freq.min:.2f} MHz",
            "max": f"{freq.max:.2f} MHz"
        }
    else:
        cpu_freq = None
    return {
        "physical_cores": psutil.cpu_count(logical=False),
        "total_cores": psutil.cpu_count(logical=True),
        "cpu_freq": cpu_freq,
        "cpu_usage": f"{psutil.cpu_percent()}%"
    }
|
47
|
+
|
48
|
+
def system_details(detail_type="all"):
|
49
|
+
"""
|
50
|
+
Get system details based on the requested type.
|
51
|
+
|
52
|
+
Args:
|
53
|
+
detail_type (str): Type of system detail to retrieve (os, datetime, memory, cpu, all)
|
54
|
+
|
55
|
+
Returns:
|
56
|
+
dict: Requested system details
|
57
|
+
"""
|
58
|
+
detail_type = detail_type.lower()
|
59
|
+
|
60
|
+
if detail_type == "all":
|
61
|
+
return {
|
62
|
+
"os": get_os_details(),
|
63
|
+
"datetime": get_datetime(),
|
64
|
+
"memory": get_memory_usage(),
|
65
|
+
"cpu": get_cpu_info()
|
66
|
+
}
|
67
|
+
elif detail_type == "os":
|
68
|
+
return get_os_details()
|
69
|
+
elif detail_type == "datetime":
|
70
|
+
return get_datetime()
|
71
|
+
elif detail_type == "memory":
|
72
|
+
return get_memory_usage()
|
73
|
+
elif detail_type == "cpu":
|
74
|
+
return get_cpu_info()
|
75
|
+
else:
|
76
|
+
raise ValueError(f"Invalid detail type: {detail_type}. Must be one of: os, datetime, memory, cpu, all")
|
Tools/tool_manager.py
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
import sys
import os
# Make the project root importable so the absolute `Src.Tools.*` imports
# below resolve regardless of where this module is loaded from.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../')))
from Src.Tools.web_loader import load_data
from Src.Tools.web_search import web_search
from Src.Tools.file_task import file_reader, file_maker, file_writer, directory_maker
from Src.Tools.system_details import get_os_details, get_datetime, get_memory_usage, get_cpu_info
from Src.Tools.userinp import get_user_input

# TODO: transform this into a map of dictionaries:
#   name : {"function": xyz, "description": "..."}

# Registry of available tools: maps a tool's public name to the callable
# implementing it. call_tool() resolves tool names through this dict.
tools_function_map = {
    "web_loader": load_data,
    "web_search": web_search,
    "file_maker": file_maker,
    "file_reader":file_reader,
    "directory_maker":directory_maker,
    "file_writer":file_writer,
    "get_os_details": get_os_details,
    "get_datetime": get_datetime,
    "get_memory_usage": get_memory_usage,
    "get_cpu_info": get_cpu_info,
    "get_user_input": get_user_input
}
|
26
|
+
|
27
|
+
|
28
|
+
|
29
|
+
def call_tool(tool_name, tool_input):
    """
    Calls the appropriate tool function with the given input.

    Args:
        tool_name (str): Name of the tool to call
        tool_input (dict): Input parameters for the tool

    Raises:
        ValueError: If tool_name is not registered in tools_function_map.
    """
    try:
        tool_fn = tools_function_map[tool_name]
    except KeyError:
        raise ValueError(f"Tool '{tool_name}' not found. Check the tools available in the tool directory") from None
    # Expand the input dictionary into keyword arguments for the tool.
    return tool_fn(**tool_input)
|
42
|
+
|
43
|
+
|
44
|
+
# print(call_tool("web_loader","https://www.toastmasters.org"))
|
45
|
+
# print(call_tool("web_search","manus ai"))
|
46
|
+
# print(call_tool("web_loader",{"url":"https://www.toastmasters.org"}))
|
47
|
+
# print(call_tool("file_reader",{"file_path":"/Users/niharshettigar/Web Dev Projects/Jsprograms/Arrays.js"}))
|
48
|
+
|
49
|
+
|
50
|
+
|
51
|
+
|
Tools/userinp.py
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
def get_user_input(prompt="Enter input: ", input_type="text"):
    """
    Get input from the user with optional type validation.

    Re-prompts until the user supplies a value convertible to the requested
    type.

    Args:
        prompt (str): The message to display to the user
        input_type (str): Type of input to validate (text, number, boolean)

    Returns:
        The user's input, converted to the appropriate type: str for 'text',
        float for 'number', bool for 'boolean'.

    Raises:
        ValueError: If input_type is not 'text', 'number' or 'boolean'.
    """
    # Validate the requested type up front. The original raised this error
    # *inside* the retry loop, where its own `except ValueError` swallowed it
    # and re-prompted forever — an infinite loop on a programming error.
    if input_type not in ("text", "number", "boolean"):
        raise ValueError(f"Invalid input type: {input_type}")

    while True:
        try:
            user_input = input(prompt)

            if input_type == "text":
                return user_input
            if input_type == "number":
                return float(user_input)

            # input_type == "boolean"
            normalized = user_input.lower()
            if normalized in ("true", "yes", "y", "1"):
                return True
            if normalized in ("false", "no", "n", "0"):
                return False
            print("Please enter 'yes' or 'no'")

        except ValueError as e:
            # e.g. float() failed — tell the user and re-prompt.
            print(f"Invalid input. Please try again. ({str(e)})")
|
Tools/web_loader.py
ADDED
@@ -0,0 +1,173 @@
|
|
1
|
+
import hashlib
|
2
|
+
import logging
|
3
|
+
import re
|
4
|
+
import requests
|
5
|
+
import pdfplumber
|
6
|
+
from io import BytesIO
|
7
|
+
try:
|
8
|
+
from bs4 import BeautifulSoup
|
9
|
+
except ImportError:
|
10
|
+
raise ImportError(
|
11
|
+
'Webpage requires extra dependencies. Install with `pip install --upgrade "embedchain[dataloaders]"`'
|
12
|
+
) from None
|
13
|
+
|
14
|
+
|
15
|
+
def clean_string(text):
    """
    Normalise raw extracted text.

    Applies, in order: newline -> space, whitespace collapse + strip,
    backslash removal, '#' -> space, then collapsing any run of a repeated
    non-word/non-space character to a single occurrence (e.g. "!!!" -> "!").

    Args:
        text (str): The text to be cleaned. This is expected to be a string.

    Returns:
        str: The cleaned text after all the cleaning operations.
    """
    without_newlines = text.replace("\n", " ")
    collapsed = re.sub(r"\s+", " ", without_newlines.strip())
    no_backslashes = collapsed.replace("\\", "")
    no_hashes = no_backslashes.replace("#", " ")
    # Each group of identical consecutive punctuation characters becomes one.
    return re.sub(r"([^\w\s])\1*", r"\1", no_hashes)
|
47
|
+
|
48
|
+
|
49
|
+
def get_clean_content(html, url) -> str:
    """
    Clean and extract text from HTML content.

    Strips boilerplate tags, navigation/sidebar elements (by id and class),
    then normalises the remaining text via clean_string().

    Args:
        html (bytes): The HTML content to be cleaned.
        url (str): The URL of the webpage (for logging purposes).

    Returns:
        str: The cleaned text content.
    """
    soup = BeautifulSoup(html, "html.parser")
    original_size = len(soup.get_text())

    # Remove structural / non-content tags outright.
    for tag in soup(["nav", "aside", "form", "header", "noscript",
                     "svg", "canvas", "footer", "script", "style"]):
        tag.decompose()

    # Remove elements commonly used for navigation and sidebars: first by
    # element id, then by CSS class (same order as before).
    boilerplate_ids = ["sidebar", "main-navigation", "menu-main-menu"]
    boilerplate_classes = [
        "elementor-location-header",
        "navbar-header",
        "nav",
        "header-sidebar-wrapper",
        "blog-sidebar-wrapper",
        "related-posts",
    ]
    for element_id in boilerplate_ids:
        for tag in soup.find_all(id=element_id):
            tag.decompose()
    for class_name in boilerplate_classes:
        for tag in soup.find_all(class_=class_name):
            tag.decompose()

    content = clean_string(soup.get_text())

    cleaned_size = len(content)
    if original_size != 0:
        logging.info(
            f"[{url}] Cleaned page size: {cleaned_size} characters, down from {original_size} (shrunk: {original_size-cleaned_size} chars, {round((1-(cleaned_size/original_size)) * 100, 2)}%)"
        )

    return content
|
107
|
+
|
108
|
+
|
109
|
+
def load_data(**kwargs):
    """Fetch a URL and return its extracted text content.

    HTML responses are cleaned via get_clean_content(); PDF responses are
    extracted with pdfplumber. Other content types yield empty content.

    Args:
        **kwargs: Must contain 'url' (str): the page or document to load.

    Returns:
        dict: On success, {'doc_id': sha256 of content+url,
        'data': [{'content': str, 'meta_data': {'url': url}}]}. On failure,
        'data' holds empty content and there is no 'doc_id' key.

    Raises:
        ValueError: If 'url' is not provided in kwargs.
    """
    if 'url' not in kwargs:
        raise ValueError("URL is required but not provided in kwargs")

    url = kwargs['url']

    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML like Gecko) Chrome/52.0.2743.116 Safari/537.36"
    }
    web_data = {}
    content = ""
    try:
        # The session is a context manager, so it is always closed — the
        # original created a Session and leaked it on every call.
        with requests.Session() as session:
            response = session.get(url, headers=headers, timeout=30)
            response.raise_for_status()
            # Dispatch on the response content type.
            content_type = response.headers.get("Content-Type", "")
            if "html" in content_type:
                content = get_clean_content(response.content, url)
            elif "pdf" in content_type:
                # Extract text from each PDF page and combine it.
                with pdfplumber.open(BytesIO(response.content)) as pdf:
                    content = "\n".join([page.extract_text() for page in pdf.pages if page.extract_text()])

        meta_data = {"url": url}
        # Deterministic document id derived from content + source URL.
        doc_id = hashlib.sha256((content + url).encode()).hexdigest()
        web_data = {
            "doc_id": doc_id,
            "data": [
                {
                    "content": content,
                    "meta_data": meta_data,
                }
            ],
        }
    except Exception as e:
        logging.error(f"Error loading data from {url}: {e}")
        web_data = {
            "data": [
                {
                    "content": "",
                    "meta_data": "",
                }
            ],
        }
    return web_data
|
164
|
+
|
165
|
+
|
166
|
+
def close_session(session):
    """
    Release the resources held by a requests session.

    Args:
        session (requests.Session): The session to close.

    Returns:
        None
    """
    session.close()
|
Tools/web_search.py
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
from duckduckgo_search import DDGS
|
2
|
+
|
3
|
+
def web_search(max_results: int = 10, **kwargs) -> str:
    """
    Performs a DuckDuckGo web search based on your query (think a Google search) then returns the top search results.

    Args:
        max_results (int, optional): Maximum number of results to return. Defaults to 10.
        **kwargs: Must contain 'query' (str): the search query to perform.

    Returns:
        str: Formatted string containing search results (markdown links plus snippets).

    Raises:
        ValueError: If 'query' is not provided in kwargs.
        Exception: If no results are found for the given query.
    """
    # Validate input before constructing the client: previously a missing
    # query surfaced as an opaque KeyError only after DDGS() was created.
    if 'query' not in kwargs:
        raise ValueError("'query' is required but was not provided in kwargs")
    query = kwargs['query']

    # NOTE: the original wrapped DDGS() in try/except ImportError, but the
    # import happens at module load time, so that handler was unreachable.
    ddgs = DDGS()
    results = ddgs.text(query, max_results=max_results)
    if len(results) == 0:
        raise Exception("No results found! Try a less restrictive/shorter query.")

    postprocessed_results = [f"[{result['title']}]({result['href']})\n{result['body']}" for result in results]
    return "## Search Results\n\n" + "\n\n".join(postprocessed_results)
|
File without changes
|