janito 0.11.0__py3-none-any.whl → 0.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- janito/__init__.py +1 -1
- janito/__main__.py +6 -204
- janito/callbacks.py +34 -132
- janito/cli/__init__.py +6 -0
- janito/cli/agent.py +400 -0
- janito/cli/app.py +94 -0
- janito/cli/commands.py +329 -0
- janito/cli/output.py +29 -0
- janito/cli/utils.py +22 -0
- janito/config.py +358 -121
- janito/data/instructions_template.txt +28 -0
- janito/token_report.py +154 -145
- janito/tools/__init__.py +38 -21
- janito/tools/bash/bash.py +84 -0
- janito/tools/bash/unix_persistent_bash.py +184 -0
- janito/tools/bash/win_persistent_bash.py +308 -0
- janito/tools/decorators.py +2 -13
- janito/tools/delete_file.py +27 -9
- janito/tools/fetch_webpage/__init__.py +34 -0
- janito/tools/fetch_webpage/chunking.py +76 -0
- janito/tools/fetch_webpage/core.py +155 -0
- janito/tools/fetch_webpage/extractors.py +276 -0
- janito/tools/fetch_webpage/news.py +137 -0
- janito/tools/fetch_webpage/utils.py +108 -0
- janito/tools/find_files.py +106 -44
- janito/tools/move_file.py +72 -0
- janito/tools/prompt_user.py +37 -6
- janito/tools/replace_file.py +31 -4
- janito/tools/rich_console.py +176 -0
- janito/tools/search_text.py +35 -22
- janito/tools/str_replace_editor/editor.py +7 -4
- janito/tools/str_replace_editor/handlers/__init__.py +16 -0
- janito/tools/str_replace_editor/handlers/create.py +60 -0
- janito/tools/str_replace_editor/handlers/insert.py +100 -0
- janito/tools/str_replace_editor/handlers/str_replace.py +94 -0
- janito/tools/str_replace_editor/handlers/undo.py +64 -0
- janito/tools/str_replace_editor/handlers/view.py +159 -0
- janito/tools/str_replace_editor/utils.py +0 -1
- janito/tools/usage_tracker.py +136 -0
- janito-0.13.0.dist-info/METADATA +300 -0
- janito-0.13.0.dist-info/RECORD +47 -0
- janito/chat_history.py +0 -117
- janito/data/instructions.txt +0 -4
- janito/tools/bash.py +0 -22
- janito/tools/str_replace_editor/handlers.py +0 -335
- janito-0.11.0.dist-info/METADATA +0 -86
- janito-0.11.0.dist-info/RECORD +0 -26
- {janito-0.11.0.dist-info → janito-0.13.0.dist-info}/WHEEL +0 -0
- {janito-0.11.0.dist-info → janito-0.13.0.dist-info}/entry_points.txt +0 -0
- {janito-0.11.0.dist-info → janito-0.13.0.dist-info}/licenses/LICENSE +0 -0
janito/tools/bash/win_persistent_bash.py
ADDED
@@ -0,0 +1,308 @@
+import subprocess
+import os
+import platform
+import time
+import uuid
+import sys
+import io
+import codecs
+
+class PersistentBash:
+    """
+    A wrapper class that maintains a persistent Bash session.
+    Allows sending commands and collecting output without restarting Bash.
+    """
+
+    def __init__(self, bash_path=None):
+        """
+        Initialize a persistent Bash session.
+
+        Args:
+            bash_path (str, optional): Path to the Bash executable. If None, tries to detect automatically.
+                This can be configured in Janito's config using the gitbash_path setting.
+        """
+        self.process = None
+        self.bash_path = bash_path
+
+        # Configure UTF-8 support for Windows
+        if platform.system() == "Windows":
+            # Force UTF-8 mode in Python 3.7+
+            os.environ["PYTHONUTF8"] = "1"
+
+            # Set Python's standard IO encoding to UTF-8
+            if hasattr(sys.stdout, 'reconfigure'):
+                sys.stdout.reconfigure(encoding='utf-8')
+            if hasattr(sys.stderr, 'reconfigure'):
+                sys.stderr.reconfigure(encoding='utf-8')
+            if hasattr(sys.stdin, 'reconfigure'):
+                sys.stdin.reconfigure(encoding='utf-8')
+
+            # Ensure Windows console is in UTF-8 mode
+            try:
+                # Try to set console mode to UTF-8
+                os.system("chcp 65001 > nul")
+
+                # Redirect stdout through a UTF-8 writer
+                sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace', line_buffering=True)
+                sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8', errors='replace', line_buffering=True)
+            except Exception as e:
+                print(f"Warning: Failed to set up UTF-8 encoding: {str(e)}")
+
+        # If bash_path is not provided, try to detect it
+        if self.bash_path is None:
+            if platform.system() == "Windows":
+                # Common paths for Git Bash on Windows
+                possible_paths = [
+                    r"C:\Program Files\Git\bin\bash.exe",
+                    r"C:\Program Files (x86)\Git\bin\bash.exe",
+                ]
+                for path in possible_paths:
+                    if os.path.exists(path):
+                        self.bash_path = path
+                        break
+                if self.bash_path is None:
+                    raise FileNotFoundError("Could not find Git Bash executable. Please specify the path manually.")
+            else:
+                # On Unix-like systems, bash is usually in the PATH
+                self.bash_path = "bash"
+
+        # Start the bash process
+        self.start_process()
+
+    def start_process(self):
+        """Start the Bash process."""
+        # Create a subprocess with pipe for stdin, stdout, and stderr
+        bash_args = [self.bash_path]
+
+        # Set UTF-8 codepage for Windows
+        env = os.environ.copy()
+        if platform.system() == "Windows":
+            # Set codepage to UTF-8 (65001) - run this before starting the process
+            os.system("chcp 65001 > nul")
+            # Set environment variables for proper UTF-8 handling
+            env["PYTHONIOENCODING"] = "utf-8"
+            env["PYTHONUTF8"] = "1"
+            # Add additional environment variables for Windows CMD
+            env["LANG"] = "en_US.UTF-8"
+            env["LC_ALL"] = "en_US.UTF-8"
+
+        # Create the process with binary pipes for better control over encoding
+        if platform.system() == "Windows":
+            # On Windows, we need special handling for UTF-8
+            self.process = subprocess.Popen(
+                bash_args,
+                stdin=subprocess.PIPE,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.STDOUT,  # Redirect stderr to stdout
+                bufsize=0,  # Unbuffered
+                universal_newlines=False,  # Use binary mode
+                env=env  # Pass the modified environment
+            )
+
+            # Create UTF-8 wrappers for stdin/stdout
+            self.stdin = io.TextIOWrapper(self.process.stdin, encoding='utf-8', errors='replace', line_buffering=True)
+            self.stdout = io.TextIOWrapper(self.process.stdout, encoding='utf-8', errors='replace', line_buffering=True)
+        else:
+            # On Unix systems, the standard approach works fine
+            self.process = subprocess.Popen(
+                bash_args,
+                stdin=subprocess.PIPE,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.STDOUT,  # Redirect stderr to stdout
+                text=True,  # Use text mode for input/output
+                bufsize=0,  # Unbuffered
+                universal_newlines=True,  # Universal newlines mode
+                env=env,  # Pass the modified environment
+                encoding='utf-8',  # Explicitly set encoding to UTF-8
+                errors='replace'  # Replace invalid characters instead of failing
+            )
+            self.stdin = self.process.stdin
+            self.stdout = self.process.stdout
+
+        # Set up a more reliable environment
+        setup_commands = [
+            "export PS1='$ '",  # Simple prompt to avoid parsing issues
+            "export TERM=dumb",  # Disable color codes and other terminal features
+            "set +o history",  # Disable history
+            "shopt -s expand_aliases",  # Enable alias expansion
+            "export LANG=en_US.UTF-8",  # Set UTF-8 locale
+            "export LC_ALL=en_US.UTF-8",  # Set all locale categories to UTF-8
+        ]
+
+        # Additional setup for Windows to handle UTF-8
+        if platform.system() == "Windows":
+            setup_commands.extend([
+                # Force Git Bash to use UTF-8
+                "export LESSCHARSET=utf-8",
+                # Ensure proper display of Unicode characters
+                "export PYTHONIOENCODING=utf-8"
+            ])
+
+        # Send setup commands
+        for cmd in setup_commands:
+            self._send_command(cmd)
+
+        # Clear initial output with a marker
+        marker = f"INIT_COMPLETE_{uuid.uuid4().hex}"
+        self._send_command(f"echo {marker}")
+
+        while True:
+            line = self.stdout.readline().strip()
+            if marker in line:
+                break
+
+    def _send_command(self, command):
+        """Send a command to the Bash process without reading the output."""
+        if self.process is None or self.process.poll() is not None:
+            self.start_process()
+
+        # Use our stdin wrapper instead of process.stdin directly
+        self.stdin.write(command + "\n")
+        self.stdin.flush()
+
+    def execute(self, command, timeout=None):
+        """
+        Execute a command in the Bash session and return the output.
+
+        Args:
+            command (str): The command to execute.
+            timeout (int, optional): Timeout in seconds. If None, no timeout is applied.
+
+        Returns:
+            str: The command output.
+        """
+        if self.process is None or self.process.poll() is not None:
+            # Process has terminated, restart it
+            self.start_process()
+
+        # Create a unique marker to identify the end of output
+        end_marker = f"END_OF_COMMAND_{uuid.uuid4().hex}"
+
+        # Construct the wrapped command with echo markers
+        # Only use timeout when explicitly requested
+        if timeout is not None and timeout > 0:
+            # Check if timeout command is available
+            is_timeout_available = False
+            try:
+                check_cmd = "command -v timeout > /dev/null 2>&1 && echo available || echo unavailable"
+                self._send_command(check_cmd)
+                for _ in range(10):  # Read up to 10 lines to find the result
+                    line = self.stdout.readline().strip()
+                    if "available" in line:
+                        is_timeout_available = True
+                        break
+                    elif "unavailable" in line:
+                        is_timeout_available = False
+                        break
+            except:
+                is_timeout_available = False
+
+            if is_timeout_available:
+                # For timeout to work with shell syntax, we need to use bash -c
+                wrapped_command = f"timeout {timeout}s bash -c \"{command.replace('"', '\\"')}\" 2>&1; echo '{end_marker}'"
+            else:
+                wrapped_command = f"{command} 2>&1; echo '{end_marker}'"
+        else:
+            wrapped_command = f"{command} 2>&1; echo '{end_marker}'"
+
+        # Send the command
+        self._send_command(wrapped_command)
+
+        # Import the console here to avoid circular imports
+        from janito.tools.rich_console import console
+
+        # Collect output until the end marker is found
+        output_lines = []
+        start_time = time.time()
+        max_wait = timeout if timeout is not None else 3600  # Default to 1 hour if no timeout
+
+        while time.time() - start_time < max_wait + 5:  # Add buffer time
+            try:
+                line = self.stdout.readline().rstrip('\r\n')
+                if end_marker in line:
+                    break
+
+                # Print the output to the console in real-time if not in trust mode
+                if line:
+                    from janito.config import get_config
+                    if not get_config().trust_mode:
+                        console.print(line)
+
+                output_lines.append(line)
+            except UnicodeDecodeError as e:
+                # Handle potential UTF-8 decoding errors
+                error_msg = f"[Warning: Unicode decode error occurred: {str(e)}]"
+                console.print(error_msg, style="yellow")
+                output_lines.append(error_msg)
+                # Just continue with replacement character
+                continue
+            except Exception as e:
+                error_msg = f"[Error reading output: {str(e)}]"
+                console.print(error_msg, style="red")
+                output_lines.append(error_msg)
+                continue
+
+        # Check for timeout
+        if time.time() - start_time >= max_wait + 5:
+            timeout_msg = f"Error: Command timed out after {max_wait} seconds"
+            console.print(timeout_msg, style="red bold")
+            output_lines.append(timeout_msg)
+
+            # Try to reset the bash session after a timeout
+            self.close()
+            self.start_process()
+
+        return "\n".join(output_lines)
+
+    def windows_to_bash_path(self, windows_path):
+        """
+        Convert a Windows path to a Git Bash compatible path.
+
+        Args:
+            windows_path (str): A Windows path like 'C:\\folder\\file.txt'
+
+        Returns:
+            str: Git Bash compatible path like '/c/folder/file.txt'
+        """
+        if not windows_path or not platform.system() == "Windows":
+            return windows_path
+
+        # Handle drive letter (e.g., C: -> /c)
+        if ":" in windows_path:
+            drive, path = windows_path.split(":", 1)
+            unix_path = f"/{drive.lower()}{path}"
+        else:
+            unix_path = windows_path
+
+        # Convert backslashes to forward slashes
+        unix_path = unix_path.replace("\\", "/")
+
+        # Remove any double slashes
+        while "//" in unix_path:
+            unix_path = unix_path.replace("//", "/")
+
+        # If the path contains spaces, we need to escape them or quote the entire path
+        if " " in unix_path:
+            unix_path = f'"{unix_path}"'
+
+        return unix_path
+
+    def close(self):
+        """Close the Bash session."""
+        if self.process and self.process.poll() is None:
+            try:
+                self._send_command("exit")
+                self.process.wait(timeout=2)
+            except:
+                pass
+            finally:
+                try:
+                    self.process.terminate()
+                except:
+                    pass
+
+        self.process = None
+
+    def __del__(self):
+        """Destructor to ensure the process is closed."""
+        self.close()
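A minimal usage sketch of the PersistentBash session above (illustrative only, not part of the diff; the import path follows the module location shown in the file list):

from janito.tools.bash.win_persistent_bash import PersistentBash

session = PersistentBash()                      # auto-detects Git Bash on Windows, falls back to "bash" elsewhere
print(session.execute("cd /tmp && pwd"))        # state such as the working directory persists across calls
print(session.execute("sleep 30", timeout=5))   # wrapped with the `timeout` command when it is available
session.close()                                 # exits and terminates the underlying process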
janito/tools/decorators.py
CHANGED
@@ -2,9 +2,8 @@
 Decorators for janito tools.
 """
 import functools
-import inspect
 import string
-from typing import Any, Callable, Dict, Optional
+from typing import Any, Callable, Dict, Optional
 
 
 class ToolMetaFormatter(string.Formatter):
@@ -83,19 +82,9 @@ def format_tool_label(func: Callable, tool_input: Dict[str, Any]) -> Optional[str]:
     # Get the label template
     label_template = func._tool_meta['label']
 
-    # Special handling for str_replace_editor which uses **kwargs
-    if func.__name__ == 'str_replace_editor':
-        # Extract command and file_path from tool_input if they exist
-        command = tool_input.get('command', 'unknown')
-        file_path = tool_input.get('file_path', '')
-
-        # Simple string replacement for the common case
-        if '{command}' in label_template and '{file_path}' in label_template:
-            return label_template.replace('{command}', command).replace('{file_path}', file_path)
-
     # Format the label with the parameters
     try:
         formatter = ToolMetaFormatter()
         return formatter.format(label_template, **tool_input)
-    except Exception
+    except Exception:
         return f"{func.__name__}"
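A small illustrative sketch of what the simplified format_tool_label path does after this change; the _tool_meta attribute is normally attached by the tool decorator elsewhere in this module, so the example fakes it by hand:

from janito.tools.decorators import format_tool_label

def delete_file(file_path: str):
    ...

delete_file._tool_meta = {'label': 'Deleting file {file_path}'}

print(format_tool_label(delete_file, {'file_path': 'notes/tmp.txt'}))
# "Deleting file notes/tmp.txt"; if formatting raises, the function name is returned instead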
janito/tools/delete_file.py
CHANGED
@@ -1,15 +1,14 @@
 """
 Tool for deleting files through the claudine agent.
 """
-import os
 from pathlib import Path
-from typing import
-from janito.config import get_config
+from typing import Tuple
 from janito.tools.str_replace_editor.utils import normalize_path
-from janito.tools.
+from janito.tools.rich_console import print_info, print_success, print_error
+from janito.tools.usage_tracker import track_usage, get_tracker
 
 
-@
+@track_usage('files_deleted')
 def delete_file(
     file_path: str,
 ) -> Tuple[str, bool]:
@@ -22,6 +21,7 @@ def delete_file(
     Returns:
         A tuple containing (message, is_error)
     """
+    print_info(f"Deleting file {file_path}", "Delete Operation")
     # Store the original path for display purposes
     original_path = file_path
 
@@ -33,15 +33,33 @@ def delete_file(
 
     # Check if the file exists
     if not path_obj.exists():
-
+        error_msg = f"File {original_path} does not exist."
+        print_error(error_msg, "Error")
+        return (error_msg, True)
 
     # Check if it's a directory
     if path_obj.is_dir():
-
+        error_msg = f"{original_path} is a directory, not a file. Use delete_directory for directories."
+        print_error(error_msg, "Error")
+        return (error_msg, True)
 
     # Delete the file
     try:
+        # Count the number of lines in the file before deleting
+        try:
+            with open(path_obj, 'r', encoding='utf-8') as f:
+                line_count = len(f.readlines())
+            # Track negative line delta for deleted file
+            get_tracker().increment('lines_delta', -line_count)
+        except Exception:
+            # If we can't read the file, we can't count lines
+            pass
+
         path_obj.unlink()
-
+        success_msg = f"Successfully deleted file {original_path}"
+        print_success("", "Success")
+        return (success_msg, False)
     except Exception as e:
-
+        error_msg = f"Error deleting file {original_path}: {str(e)}"
+        print_error(error_msg, "Error")
+        return (error_msg, True)
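An illustrative call (not part of the diff), assuming the tool is invoked directly; the (message, is_error) contract matches the signature above and the file path is made up:

from janito.tools.delete_file import delete_file

message, is_error = delete_file("notes/tmp.txt")
print(message)   # e.g. "Successfully deleted file notes/tmp.txt", or an error message when is_error is True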
janito/tools/fetch_webpage/__init__.py
ADDED
@@ -0,0 +1,34 @@
+"""
+Webpage Content Extractor Package
+
+A comprehensive tool for extracting clean, relevant content from web pages
+for processing with LLMs. Features include:
+- General content extraction with multiple methods
+- Specialized handling for news aggregator sites
+- Targeted extraction based on specific search strings
+- Chunking for large content
+- Structured content extraction
+
+Dependencies:
+- requests
+- beautifulsoup4
+- trafilatura
+- newspaper3k
+
+Author: Claude (Anthropic)
+"""
+
+from janito.tools.fetch_webpage.core import fetch_webpage, fetch_and_extract
+from janito.tools.fetch_webpage.news import fetch_and_extract_news_aggregator
+from janito.tools.fetch_webpage.extractors import extract_clean_text, extract_targeted_content, extract_structured_content
+from janito.tools.fetch_webpage.chunking import chunk_large_content
+
+__all__ = [
+    'fetch_webpage',
+    'fetch_and_extract',
+    'fetch_and_extract_news_aggregator',
+    'extract_clean_text',
+    'extract_targeted_content',
+    'extract_structured_content',
+    'chunk_large_content'
+]
janito/tools/fetch_webpage/chunking.py
ADDED
@@ -0,0 +1,76 @@
+"""
+Functions for chunking large content into manageable pieces.
+"""
+
+from typing import List
+from janito.tools.rich_console import print_info, print_success
+
+
+def chunk_large_content(text: str, chunk_size: int = 4000, overlap: int = 500) -> List[str]:
+    """
+    Split very large text content into manageable chunks suitable for LLM processing.
+
+    Args:
+        text: The text to chunk
+        chunk_size: Target size for each chunk in characters
+        overlap: Number of characters to overlap between chunks
+
+    Returns:
+        List of text chunks
+    """
+    if not text or len(text) <= chunk_size:
+        return [text] if text else []
+
+    print_info(f"Chunking {len(text)} characters of text into ~{chunk_size} character chunks", "Content Chunking")
+
+    # Try to split on paragraph breaks first
+    paragraphs = text.split('\n\n')
+    chunks = []
+    current_chunk = ""
+
+    for para in paragraphs:
+        # If adding this paragraph would exceed chunk size
+        if len(current_chunk) + len(para) + 2 > chunk_size:
+            # If current chunk is not empty, add it to chunks
+            if current_chunk:
+                chunks.append(current_chunk)
+                # Start new chunk with overlap from previous chunk
+                if overlap > 0 and len(current_chunk) > overlap:
+                    current_chunk = current_chunk[-overlap:] + "\n\n" + para
+                else:
+                    current_chunk = para
+            else:
+                # If paragraph itself is bigger than chunk size, split it
+                if len(para) > chunk_size:
+                    words = para.split()
+                    temp_chunk = ""
+                    for word in words:
+                        if len(temp_chunk) + len(word) + 1 > chunk_size:
+                            chunks.append(temp_chunk)
+                            # Start new chunk with overlap
+                            if overlap > 0 and len(temp_chunk) > overlap:
+                                temp_chunk = temp_chunk[-overlap:] + " " + word
+                            else:
+                                temp_chunk = word
+                        else:
+                            if temp_chunk:
+                                temp_chunk += " " + word
+                            else:
+                                temp_chunk = word
+                    if temp_chunk:
+                        current_chunk = temp_chunk
+                else:
+                    chunks.append(para)
+        else:
+            # Add paragraph to current chunk
+            if current_chunk:
+                current_chunk += "\n\n" + para
+            else:
+                current_chunk = para
+
+    # Don't forget the last chunk
+    if current_chunk:
+        chunks.append(current_chunk)
+
+    print_success(f"Text chunked into {len(chunks)} segments", "Content Chunking")
+    return chunks
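A brief usage sketch of chunk_large_content (illustrative, not from the diff; the sample text is made up):

from janito.tools.fetch_webpage.chunking import chunk_large_content

article = "Some long article text.\n\n" * 1000                # any large string
chunks = chunk_large_content(article, chunk_size=4000, overlap=500)
for i, chunk in enumerate(chunks):
    print(f"chunk {i}: {len(chunk)} characters")               # each chunk stays near the target size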
janito/tools/fetch_webpage/core.py
ADDED
@@ -0,0 +1,155 @@
+"""
+Core functionality for fetching web pages and extracting content.
+"""
+
+import requests
+from typing import Tuple, List, Optional
+from urllib.parse import urlparse
+from janito.tools.rich_console import print_info, print_success, print_error, print_warning
+from janito.tools.usage_tracker import track_usage
+
+from janito.tools.fetch_webpage.extractors import extract_clean_text
+# Import moved to fetch_and_extract function to avoid circular imports
+from janito.tools.fetch_webpage.utils import SITE_SPECIFIC_STRATEGIES
+
+
+@track_usage('web_requests')
+def fetch_webpage(url: str, headers: dict = None, timeout: int = 30, max_size: int = 5000000,
+                  target_strings: List[str] = None) -> Tuple[str, bool]:
+    """
+    Fetch the content of a web page from a given URL.
+
+    Args:
+        url: The URL of the web page to fetch
+        headers: Optional HTTP headers to include in the request (default: None)
+        timeout: Request timeout in seconds (default: 30)
+        max_size: Maximum size in bytes to download (default: 5MB)
+        target_strings: Optional list of strings to target specific content sections
+
+    Returns:
+        A tuple containing (message, is_error)
+    """
+    print_info(f"Fetching content from URL: {url}", "Web Fetch")
+
+    try:
+        # Set default headers if none provided
+        if headers is None:
+            headers = {
+                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+            }
+
+        # Make the HTTP request with streaming enabled
+        response = requests.get(url, headers=headers, timeout=timeout, stream=True)
+
+        # Raise an exception for HTTP errors
+        response.raise_for_status()
+
+        # Check content length before downloading fully
+        content_length = response.headers.get('Content-Length')
+        if content_length and int(content_length) > max_size:
+            warning_msg = f"Web Fetch: Content size ({int(content_length)/1000000:.1f}MB) exceeds max size ({max_size/1000000:.1f}MB). Aborting download."
+            print_warning(warning_msg)
+            return warning_msg, True
+
+        # Download content with size limit
+        content_bytes = b''
+        for chunk in response.iter_content(chunk_size=1024 * 1024):  # 1MB chunks
+            content_bytes += chunk
+            if len(content_bytes) > max_size:
+                warning_msg = f"Web Fetch: Download exceeded max size ({max_size/1000000:.1f}MB). Truncating."
+                print_warning(warning_msg)
+                break
+
+        # Get the content
+        content = content_bytes.decode('utf-8', errors='replace')
+
+        # If target strings are provided, extract only the relevant sections
+        if target_strings and len(target_strings) > 0:
+            print_info(f"Targeting specific content using {len(target_strings)} search strings", "Web Fetch")
+            from janito.tools.fetch_webpage.extractors import extract_targeted_content
+            targeted_content = extract_targeted_content(content, target_strings)
+
+            if targeted_content:
+                print_success(f"Successfully targeted specific content based on search strings", "Web Fetch")
+                # Create a summary with first 300 chars of targeted content
+                content_preview = targeted_content[:300] + "..." if len(targeted_content) > 300 else targeted_content
+                summary = f"Successfully fetched targeted content from {url}\n\nContent preview:\n{content_preview}"
+                print_success(f"Successfully fetched targeted content from {url} ({len(targeted_content)} bytes)", "Web Fetch")
+                return targeted_content, False
+            else:
+                print_warning(f"Web Fetch: Could not find content matching the target strings. Returning full content.")
+
+        # Create a summary message with first 300 chars of content
+        content_preview = content[:300] + "..." if len(content) > 300 else content
+
+        print_success(f"({len(content)} bytes)", "Web Fetch")
+
+        # Return the full content
+        return content, False
+
+    except requests.exceptions.RequestException as e:
+        error_msg = f"Error fetching web page: {str(e)}"
+        print_error(error_msg, "Web Fetch Error")
+        return error_msg, True
+
+
+@track_usage('web_content')
+def fetch_and_extract(url: str, extract_method: str = 'trafilatura',
+                      max_length: int = 10000,
+                      target_strings: List[str] = None) -> Tuple[str, bool]:
+    """
+    Fetch a webpage and extract its main content in a format suitable for LLM processing.
+
+    Args:
+        url: The URL to fetch
+        extract_method: Content extraction method ('trafilatura', 'newspaper', 'beautifulsoup', 'all')
+        max_length: Maximum length of text to return
+        target_strings: Optional list of strings to target specific content sections
+
+    Returns:
+        A tuple containing (extracted_content, is_error)
+    """
+    # Check if this is a news aggregator site that needs special handling
+    domain = urlparse(url).netloc
+    for site_domain in SITE_SPECIFIC_STRATEGIES.keys():
+        if site_domain in domain:
+            print_info(f"Detected news aggregator site: {domain}. Using specialized extraction.", "Content Extraction")
+            # Import here to avoid circular imports
+            from janito.tools.fetch_webpage.news import fetch_and_extract_news_aggregator
+            return fetch_and_extract_news_aggregator(url)
+
+    # If target strings are provided, pass them directly to fetch_webpage for efficiency
+    if target_strings and len(target_strings) > 0:
+        html_content, is_error = fetch_webpage(url, target_strings=target_strings)
+    else:
+        html_content, is_error = fetch_webpage(url)
+
+    if is_error:
+        return html_content, True
+
+    extracted_text = extract_clean_text(html_content, method=extract_method, url=url)
+
+    if not extracted_text or len(extracted_text) < 100:
+        return f"Could not extract meaningful content from {url}", True
+
+    # If target strings were provided but not already handled by fetch_webpage
+    if target_strings and len(target_strings) > 0 and not any(target in extracted_text for target in target_strings if len(target) > 3):
+        from janito.tools.fetch_webpage.extractors import extract_targeted_content
+        targeted_content = extract_targeted_content(html_content, target_strings)
+        if targeted_content:
+            print_success(f"Successfully extracted targeted content based on {len(target_strings)} search strings",
+                          "Targeted Extraction")
+            extracted_text = targeted_content
+
+    # Truncate if needed
+    if len(extracted_text) > max_length:
+        print_info(f"Truncating content from {len(extracted_text)} to {max_length} characters", "Content Extraction")
+        extracted_text = extracted_text[:max_length] + "..."
+
+    # Check if the content is still too large for an LLM (rough estimate)
+    estimated_tokens = len(extracted_text.split())
+    if estimated_tokens > 10000:  # Conservative estimate for token limits
+        print_warning(f"Content Extraction: Extracted content still very large (~{estimated_tokens} words). Consider using chunk_large_content()")
+
+    print_success(f"Successfully extracted {len(extracted_text)} characters of content", "Content Extraction")
+    return extracted_text, False
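A short illustrative call of the package-level helpers (names and parameters come from the code above; the URL and target string are made up):

from janito.tools.fetch_webpage import fetch_and_extract

text, is_error = fetch_and_extract("https://example.com/article",
                                   extract_method="trafilatura",
                                   max_length=10000,
                                   target_strings=["release notes"])
if is_error:
    print(f"Fetch failed: {text}")
else:
    print(text[:300])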