janito 0.11.0__py3-none-any.whl → 0.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. janito/__init__.py +1 -1
  2. janito/__main__.py +6 -204
  3. janito/callbacks.py +34 -132
  4. janito/cli/__init__.py +6 -0
  5. janito/cli/agent.py +400 -0
  6. janito/cli/app.py +94 -0
  7. janito/cli/commands.py +329 -0
  8. janito/cli/output.py +29 -0
  9. janito/cli/utils.py +22 -0
  10. janito/config.py +358 -121
  11. janito/data/instructions_template.txt +28 -0
  12. janito/token_report.py +154 -145
  13. janito/tools/__init__.py +38 -21
  14. janito/tools/bash/bash.py +84 -0
  15. janito/tools/bash/unix_persistent_bash.py +184 -0
  16. janito/tools/bash/win_persistent_bash.py +308 -0
  17. janito/tools/decorators.py +2 -13
  18. janito/tools/delete_file.py +27 -9
  19. janito/tools/fetch_webpage/__init__.py +34 -0
  20. janito/tools/fetch_webpage/chunking.py +76 -0
  21. janito/tools/fetch_webpage/core.py +155 -0
  22. janito/tools/fetch_webpage/extractors.py +276 -0
  23. janito/tools/fetch_webpage/news.py +137 -0
  24. janito/tools/fetch_webpage/utils.py +108 -0
  25. janito/tools/find_files.py +106 -44
  26. janito/tools/move_file.py +72 -0
  27. janito/tools/prompt_user.py +37 -6
  28. janito/tools/replace_file.py +31 -4
  29. janito/tools/rich_console.py +176 -0
  30. janito/tools/search_text.py +35 -22
  31. janito/tools/str_replace_editor/editor.py +7 -4
  32. janito/tools/str_replace_editor/handlers/__init__.py +16 -0
  33. janito/tools/str_replace_editor/handlers/create.py +60 -0
  34. janito/tools/str_replace_editor/handlers/insert.py +100 -0
  35. janito/tools/str_replace_editor/handlers/str_replace.py +94 -0
  36. janito/tools/str_replace_editor/handlers/undo.py +64 -0
  37. janito/tools/str_replace_editor/handlers/view.py +159 -0
  38. janito/tools/str_replace_editor/utils.py +0 -1
  39. janito/tools/usage_tracker.py +136 -0
  40. janito-0.13.0.dist-info/METADATA +300 -0
  41. janito-0.13.0.dist-info/RECORD +47 -0
  42. janito/chat_history.py +0 -117
  43. janito/data/instructions.txt +0 -4
  44. janito/tools/bash.py +0 -22
  45. janito/tools/str_replace_editor/handlers.py +0 -335
  46. janito-0.11.0.dist-info/METADATA +0 -86
  47. janito-0.11.0.dist-info/RECORD +0 -26
  48. {janito-0.11.0.dist-info → janito-0.13.0.dist-info}/WHEEL +0 -0
  49. {janito-0.11.0.dist-info → janito-0.13.0.dist-info}/entry_points.txt +0 -0
  50. {janito-0.11.0.dist-info → janito-0.13.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,308 @@
1
+ import subprocess
2
+ import os
3
+ import platform
4
+ import time
5
+ import uuid
6
+ import sys
7
+ import io
8
+ import codecs
9
+
10
class PersistentBash:
    """
    A wrapper class that maintains a persistent Bash session.

    Commands are written to one long-lived Bash subprocess over a pipe, and the
    combined stdout/stderr is read back until a unique marker line appears.
    This allows sending commands and collecting output without restarting Bash.
    """

    def __init__(self, bash_path=None):
        """
        Initialize a persistent Bash session.

        Args:
            bash_path (str, optional): Path to the Bash executable. If None, tries to detect automatically.
                This can be configured in Janito's config using the gitbash_path setting.

        Raises:
            FileNotFoundError: On Windows, when no path is given and Git Bash is
                not found in any of the default install locations.
            RuntimeError: If Bash exits before the session finishes initializing.
        """
        self.process = None
        self.stdin = None
        self.stdout = None
        self.bash_path = bash_path

        # Configure UTF-8 support for Windows
        if platform.system() == "Windows":
            self._configure_windows_utf8()

        # If bash_path is not provided, try to detect it
        if self.bash_path is None:
            self.bash_path = self._detect_bash_path()

        # Start the bash process
        self.start_process()

    @staticmethod
    def _configure_windows_utf8():
        """Switch this Python process and the Windows console to UTF-8."""
        # Force UTF-8 mode in Python 3.7+
        os.environ["PYTHONUTF8"] = "1"

        try:
            if hasattr(sys.stdin, 'reconfigure'):
                sys.stdin.reconfigure(encoding='utf-8')

            # Try to set console mode to UTF-8
            os.system("chcp 65001 > nul")

            for stream_name in ("stdout", "stderr"):
                stream = getattr(sys, stream_name)
                if hasattr(stream, 'reconfigure'):
                    # Reconfigure in place rather than installing a new
                    # TextIOWrapper on every instantiation: when a replaced
                    # wrapper is garbage-collected it closes the underlying
                    # buffer, which would break the stream for everyone.
                    stream.reconfigure(encoding='utf-8', errors='replace', line_buffering=True)
                else:
                    setattr(sys, stream_name, io.TextIOWrapper(
                        stream.buffer, encoding='utf-8', errors='replace', line_buffering=True))
        except Exception as e:
            print(f"Warning: Failed to set up UTF-8 encoding: {str(e)}")

    @staticmethod
    def _detect_bash_path():
        """Return a default Bash executable for the current platform."""
        if platform.system() != "Windows":
            # On Unix-like systems, bash is usually in the PATH
            return "bash"

        # Common paths for Git Bash on Windows
        possible_paths = [
            r"C:\Program Files\Git\bin\bash.exe",
            r"C:\Program Files (x86)\Git\bin\bash.exe",
        ]
        for path in possible_paths:
            if os.path.exists(path):
                return path
        raise FileNotFoundError("Could not find Git Bash executable. Please specify the path manually.")

    def start_process(self):
        """
        Start the Bash process and wait until it is ready for commands.

        Raises:
            RuntimeError: If Bash closes its output before the readiness marker
                is echoed back (for example because it exited immediately).
        """
        bash_args = [self.bash_path]
        env = os.environ.copy()

        if platform.system() == "Windows":
            # Set codepage to UTF-8 (65001) - run this before starting the process
            os.system("chcp 65001 > nul")
            # Set environment variables for proper UTF-8 handling
            env["PYTHONIOENCODING"] = "utf-8"
            env["PYTHONUTF8"] = "1"
            env["LANG"] = "en_US.UTF-8"
            env["LC_ALL"] = "en_US.UTF-8"

            # Binary pipes wrapped manually for better control over encoding
            self.process = subprocess.Popen(
                bash_args,
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,  # Redirect stderr to stdout
                bufsize=0,  # Unbuffered
                universal_newlines=False,  # Use binary mode
                env=env,
            )
            self.stdin = io.TextIOWrapper(self.process.stdin, encoding='utf-8', errors='replace', line_buffering=True)
            self.stdout = io.TextIOWrapper(self.process.stdout, encoding='utf-8', errors='replace', line_buffering=True)
        else:
            # On Unix systems, the standard text-mode pipes work fine
            self.process = subprocess.Popen(
                bash_args,
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,  # Redirect stderr to stdout
                text=True,  # Use text mode for input/output
                bufsize=0,  # Unbuffered
                env=env,
                encoding='utf-8',  # Explicitly set encoding to UTF-8
                errors='replace',  # Replace invalid characters instead of failing
            )
            self.stdin = self.process.stdin
            self.stdout = self.process.stdout

        # Set up a more reliable environment
        setup_commands = [
            "export PS1='$ '",  # Simple prompt to avoid parsing issues
            "export TERM=dumb",  # Disable color codes and other terminal features
            "set +o history",  # Disable history
            "shopt -s expand_aliases",  # Enable alias expansion
            "export LANG=en_US.UTF-8",  # Set UTF-8 locale
            "export LC_ALL=en_US.UTF-8",  # Set all locale categories to UTF-8
        ]

        # Additional setup for Windows to handle UTF-8
        if platform.system() == "Windows":
            setup_commands.extend([
                "export LESSCHARSET=utf-8",  # Force Git Bash to use UTF-8
                "export PYTHONIOENCODING=utf-8",  # Ensure proper display of Unicode characters
            ])

        # Write directly instead of going through _send_command(): that method
        # restarts a dead process, which would recurse back into this one if
        # Bash exits during start-up.
        for cmd in setup_commands:
            self._write_line(cmd)

        # Clear initial output with a marker
        marker = f"INIT_COMPLETE_{uuid.uuid4().hex}"
        self._write_line(f"echo {marker}")

        while True:
            line = self.stdout.readline()
            if line == "":
                # readline() returns "" only at EOF; without this check the
                # loop would spin forever once Bash has gone away.
                raise RuntimeError("Bash process exited before initialization completed")
            if marker in line:
                break

    def _write_line(self, command):
        """Write one command line to Bash's stdin and flush it."""
        self.stdin.write(command + "\n")
        self.stdin.flush()

    def _send_command(self, command):
        """Send a command to the Bash process without reading the output."""
        if self.process is None or self.process.poll() is not None:
            self.start_process()

        self._write_line(command)

    @staticmethod
    def _parse_timeout_probe(line):
        """
        Interpret one output line of the `timeout` availability probe.

        Returns:
            True for "available", False for "unavailable", None for any other line.
        """
        # "unavailable" contains "available", so it must be tested first.
        if "unavailable" in line:
            return False
        if "available" in line:
            return True
        return None

    def _probe_timeout_command(self):
        """Return True if the `timeout` utility can be used in the session."""
        try:
            self._send_command("command -v timeout > /dev/null 2>&1 && echo available || echo unavailable")
            for _ in range(10):  # Read up to 10 lines to find the result
                verdict = self._parse_timeout_probe(self.stdout.readline().strip())
                if verdict is not None:
                    return verdict
        except Exception:
            return False
        return False

    @staticmethod
    def _wrap_command(command, end_marker, timeout, use_timeout):
        """Build the line sent to Bash: the command followed by an end-marker echo."""
        if use_timeout:
            import shlex

            # For timeout to work with shell syntax the command runs under
            # `bash -c`. shlex.quote() passes it through verbatim; escaping
            # only double quotes would let the outer shell expand `$` and
            # backticks before the inner shell ever sees them.
            return f"timeout {timeout}s bash -c {shlex.quote(command)} 2>&1; echo '{end_marker}'"
        return f"{command} 2>&1; echo '{end_marker}'"

    def execute(self, command, timeout=None):
        """
        Execute a command in the Bash session and return the output.

        Output lines are echoed to the console as they arrive unless the
        configuration's trust_mode is set. The deadline is only checked between
        lines because reading a line blocks; when `timeout` is given and the
        `timeout` utility exists, the command is additionally run under
        `timeout <n>s bash -c ...`, i.e. in a child Bash process.

        Args:
            command (str): The command to execute.
            timeout (int, optional): Timeout in seconds. If None, no timeout is applied
                to the command itself and reading gives up after one hour.

        Returns:
            str: The command output, with any warning/error notes appended as
            extra lines.
        """
        if self.process is None or self.process.poll() is not None:
            # Process has terminated, restart it
            self.start_process()

        # Create a unique marker to identify the end of output
        end_marker = f"END_OF_COMMAND_{uuid.uuid4().hex}"

        # Only use timeout when explicitly requested and actually available
        use_timeout = (
            timeout is not None
            and timeout > 0
            and self._probe_timeout_command()
        )
        self._send_command(self._wrap_command(command, end_marker, timeout, use_timeout))

        # Import here to avoid circular imports
        from janito.tools.rich_console import console
        from janito.config import get_config

        # Collect output until the end marker is found
        output_lines = []
        stream_closed = False
        start_time = time.time()
        max_wait = timeout if timeout is not None else 3600  # Default to 1 hour if no timeout

        while time.time() - start_time < max_wait + 5:  # Add buffer time
            try:
                raw_line = self.stdout.readline()
                if raw_line == "":
                    # EOF: Bash is gone, the marker will never arrive.
                    stream_closed = True
                    break

                line = raw_line.rstrip('\r\n')
                if end_marker in line:
                    break

                # Record the line before printing so that a console failure
                # cannot drop it from the returned output.
                output_lines.append(line)

                # Print the output to the console in real-time if not in trust mode
                if line and not get_config().trust_mode:
                    console.print(line)
            except UnicodeDecodeError as e:
                # Handle potential UTF-8 decoding errors
                error_msg = f"[Warning: Unicode decode error occurred: {str(e)}]"
                console.print(error_msg, style="yellow")
                output_lines.append(error_msg)
            except Exception as e:
                error_msg = f"[Error reading output: {str(e)}]"
                console.print(error_msg, style="red")
                output_lines.append(error_msg)
        else:
            # The loop ended without seeing the marker: timed out.
            timeout_msg = f"Error: Command timed out after {max_wait} seconds"
            console.print(timeout_msg, style="red bold")
            output_lines.append(timeout_msg)

            # Try to reset the bash session after a timeout
            self.close()
            self.start_process()

        if stream_closed:
            eof_msg = "[Error: Bash process closed its output before the command finished]"
            console.print(eof_msg, style="red")
            output_lines.append(eof_msg)

        return "\n".join(output_lines)

    def windows_to_bash_path(self, windows_path):
        """
        Convert a Windows path to a Git Bash compatible path.

        On non-Windows platforms, or for an empty path, the input is returned
        unchanged.

        Args:
            windows_path (str): A Windows path like 'C:\\folder\\file.txt'

        Returns:
            str: Git Bash compatible path like '/c/folder/file.txt'. Paths
            containing spaces are returned wrapped in double quotes.
        """
        if not windows_path or platform.system() != "Windows":
            return windows_path

        # Handle drive letter (e.g., C: -> /c). Only a leading "<letter>:" is a
        # drive; a colon elsewhere in the path is left untouched.
        has_drive = (
            len(windows_path) >= 2
            and windows_path[1] == ":"
            and "a" <= windows_path[0].lower() <= "z"
        )
        if has_drive:
            unix_path = f"/{windows_path[0].lower()}{windows_path[2:]}"
        else:
            unix_path = windows_path

        # Convert backslashes to forward slashes
        unix_path = unix_path.replace("\\", "/")

        # Remove any double slashes
        while "//" in unix_path:
            unix_path = unix_path.replace("//", "/")

        # If the path contains spaces, quote the entire path
        if " " in unix_path:
            unix_path = f'"{unix_path}"'

        return unix_path

    def close(self):
        """Close the Bash session, asking Bash to exit and then terminating it."""
        # getattr: __del__ may run on an instance whose __init__ never ran.
        process = getattr(self, "process", None)
        if process is not None and process.poll() is None:
            try:
                # Write directly; _send_command() would restart a dead process.
                self._write_line("exit")
                process.wait(timeout=2)
            except Exception:
                # Best effort: fall through to terminate() below.
                pass
            finally:
                try:
                    process.terminate()
                except Exception:
                    pass

        self.process = None

    def __del__(self):
        """Destructor to ensure the process is closed."""
        self.close()
@@ -2,9 +2,8 @@
2
2
  Decorators for janito tools.
3
3
  """
4
4
  import functools
5
- import inspect
6
5
  import string
7
- from typing import Any, Callable, Dict, Optional, Tuple
6
+ from typing import Any, Callable, Dict, Optional
8
7
 
9
8
 
10
9
  class ToolMetaFormatter(string.Formatter):
@@ -83,19 +82,9 @@ def format_tool_label(func: Callable, tool_input: Dict[str, Any]) -> Optional[st
83
82
  # Get the label template
84
83
  label_template = func._tool_meta['label']
85
84
 
86
- # Special handling for str_replace_editor which uses **kwargs
87
- if func.__name__ == 'str_replace_editor':
88
- # Extract command and file_path from tool_input if they exist
89
- command = tool_input.get('command', 'unknown')
90
- file_path = tool_input.get('file_path', '')
91
-
92
- # Simple string replacement for the common case
93
- if '{command}' in label_template and '{file_path}' in label_template:
94
- return label_template.replace('{command}', command).replace('{file_path}', file_path)
95
-
96
85
  # Format the label with the parameters
97
86
  try:
98
87
  formatter = ToolMetaFormatter()
99
88
  return formatter.format(label_template, **tool_input)
100
- except Exception as e:
89
+ except Exception:
101
90
  return f"{func.__name__}"
@@ -1,15 +1,14 @@
1
1
  """
2
2
  Tool for deleting files through the claudine agent.
3
3
  """
4
- import os
5
4
  from pathlib import Path
6
- from typing import Dict, Any, Tuple
7
- from janito.config import get_config
5
+ from typing import Tuple
8
6
  from janito.tools.str_replace_editor.utils import normalize_path
9
- from janito.tools.decorators import tool_meta
7
+ from janito.tools.rich_console import print_info, print_success, print_error
8
+ from janito.tools.usage_tracker import track_usage, get_tracker
10
9
 
11
10
 
12
- @tool_meta(label="Deleting file {file_path}")
11
+ @track_usage('files_deleted')
13
12
  def delete_file(
14
13
  file_path: str,
15
14
  ) -> Tuple[str, bool]:
@@ -22,6 +21,7 @@ def delete_file(
22
21
  Returns:
23
22
  A tuple containing (message, is_error)
24
23
  """
24
+ print_info(f"Deleting file {file_path}", "Delete Operation")
25
25
  # Store the original path for display purposes
26
26
  original_path = file_path
27
27
 
@@ -33,15 +33,33 @@ def delete_file(
33
33
 
34
34
  # Check if the file exists
35
35
  if not path_obj.exists():
36
- return (f"File {original_path} does not exist.", True)
36
+ error_msg = f"File {original_path} does not exist."
37
+ print_error(error_msg, "Error")
38
+ return (error_msg, True)
37
39
 
38
40
  # Check if it's a directory
39
41
  if path_obj.is_dir():
40
- return (f"{original_path} is a directory, not a file. Use delete_directory for directories.", True)
42
+ error_msg = f"{original_path} is a directory, not a file. Use delete_directory for directories."
43
+ print_error(error_msg, "Error")
44
+ return (error_msg, True)
41
45
 
42
46
  # Delete the file
43
47
  try:
48
+ # Count the number of lines in the file before deleting
49
+ try:
50
+ with open(path_obj, 'r', encoding='utf-8') as f:
51
+ line_count = len(f.readlines())
52
+ # Track negative line delta for deleted file
53
+ get_tracker().increment('lines_delta', -line_count)
54
+ except Exception:
55
+ # If we can't read the file, we can't count lines
56
+ pass
57
+
44
58
  path_obj.unlink()
45
- return (f"Successfully deleted file {original_path}", False)
59
+ success_msg = f"Successfully deleted file {original_path}"
60
+ print_success("", "Success")
61
+ return (success_msg, False)
46
62
  except Exception as e:
47
- return (f"Error deleting file {original_path}: {str(e)}", True)
63
+ error_msg = f"Error deleting file {original_path}: {str(e)}"
64
+ print_error(error_msg, "Error")
65
+ return (error_msg, True)
@@ -0,0 +1,34 @@
1
+ """
2
+ Webpage Content Extractor Package
3
+
4
+ A comprehensive tool for extracting clean, relevant content from web pages
5
+ for processing with LLMs. Features include:
6
+ - General content extraction with multiple methods
7
+ - Specialized handling for news aggregator sites
8
+ - Targeted extraction based on specific search strings
9
+ - Chunking for large content
10
+ - Structured content extraction
11
+
12
+ Dependencies:
13
+ - requests
14
+ - beautifulsoup4
15
+ - trafilatura
16
+ - newspaper3k
17
+
18
+ Author: Claude (Anthropic)
19
+ """
20
+
21
+ from janito.tools.fetch_webpage.core import fetch_webpage, fetch_and_extract
22
+ from janito.tools.fetch_webpage.news import fetch_and_extract_news_aggregator
23
+ from janito.tools.fetch_webpage.extractors import extract_clean_text, extract_targeted_content, extract_structured_content
24
+ from janito.tools.fetch_webpage.chunking import chunk_large_content
25
+
26
+ __all__ = [
27
+ 'fetch_webpage',
28
+ 'fetch_and_extract',
29
+ 'fetch_and_extract_news_aggregator',
30
+ 'extract_clean_text',
31
+ 'extract_targeted_content',
32
+ 'extract_structured_content',
33
+ 'chunk_large_content'
34
+ ]
@@ -0,0 +1,76 @@
1
+ """
2
+ Functions for chunking large content into manageable pieces.
3
+ """
4
+
5
+ from typing import List
6
+ from janito.tools.rich_console import print_info, print_success
7
+
8
+
9
def chunk_large_content(text: str, chunk_size: int = 4000, overlap: int = 500) -> List[str]:
    """
    Split very large text content into manageable chunks suitable for LLM processing.

    Text is split on blank-line paragraph breaks first; a paragraph that is
    longer than ``chunk_size`` on its own is split on whitespace instead.
    ``chunk_size`` is a target, not a hard limit: the overlap carried over from
    the previous chunk, or a single word longer than ``chunk_size``, can make a
    chunk larger.

    Args:
        text: The text to chunk
        chunk_size: Target size for each chunk in characters
        overlap: Number of characters to overlap between chunks

    Returns:
        List of text chunks; empty if ``text`` is empty, and a single-element
        list if ``text`` already fits in one chunk.
    """
    if not text:
        return []
    if len(text) <= chunk_size:
        return [text]

    print_info(f"Chunking {len(text)} characters of text into ~{chunk_size} character chunks", "Content Chunking")

    chunks = []
    current_chunk = ""

    # Try to split on paragraph breaks first
    for para in text.split('\n\n'):
        # The paragraph still fits (the +2 accounts for the "\n\n" separator)
        if len(current_chunk) + len(para) + 2 <= chunk_size:
            current_chunk = current_chunk + "\n\n" + para if current_chunk else para
            continue

        if current_chunk:
            # Flush the chunk built so far and start the next one, seeded with
            # the tail of the flushed chunk when an overlap is requested.
            chunks.append(current_chunk)
            if overlap > 0 and len(current_chunk) > overlap:
                current_chunk = current_chunk[-overlap:] + "\n\n" + para
            else:
                current_chunk = para
            continue

        if len(para) <= chunk_size:
            # Nothing pending and the paragraph is (almost) a full chunk itself
            chunks.append(para)
            continue

        # The paragraph alone is bigger than a chunk: split it on whitespace
        temp_chunk = ""
        for word in para.split():
            if len(temp_chunk) + len(word) + 1 > chunk_size:
                # Guard against emitting an empty chunk when the very first
                # word is already longer than chunk_size.
                if temp_chunk:
                    chunks.append(temp_chunk)
                if overlap > 0 and len(temp_chunk) > overlap:
                    temp_chunk = temp_chunk[-overlap:] + " " + word
                else:
                    temp_chunk = word
            elif temp_chunk:
                temp_chunk += " " + word
            else:
                temp_chunk = word
        if temp_chunk:
            # The remainder becomes the chunk that following paragraphs extend
            current_chunk = temp_chunk

    # Don't forget the last chunk
    if current_chunk:
        chunks.append(current_chunk)

    print_success(f"Text chunked into {len(chunks)} segments", "Content Chunking")
    return chunks
@@ -0,0 +1,155 @@
1
+ """
2
+ Core functionality for fetching web pages and extracting content.
3
+ """
4
+
5
+ import requests
6
+ from typing import Tuple, List, Optional
7
+ from urllib.parse import urlparse
8
+ from janito.tools.rich_console import print_info, print_success, print_error, print_warning
9
+ from janito.tools.usage_tracker import track_usage
10
+
11
+ from janito.tools.fetch_webpage.extractors import extract_clean_text
12
+ # Import moved to fetch_and_extract function to avoid circular imports
13
+ from janito.tools.fetch_webpage.utils import SITE_SPECIFIC_STRATEGIES
14
+
15
+
16
@track_usage('web_requests')
def fetch_webpage(url: str, headers: Optional[dict] = None, timeout: int = 30, max_size: int = 5000000,
                  target_strings: Optional[List[str]] = None) -> Tuple[str, bool]:
    """
    Fetch the content of a web page from a given URL.

    The body is streamed and decoded as UTF-8 (invalid bytes are replaced).
    When ``target_strings`` is given and matching sections are found, only
    those sections are returned; otherwise the full content is returned.

    Args:
        url: The URL of the web page to fetch
        headers: Optional HTTP headers to include in the request (default: None,
            in which case a browser-like User-Agent is sent)
        timeout: Request timeout in seconds (default: 30)
        max_size: Maximum size in bytes to download (default: 5MB). A response
            announcing a larger Content-Length is rejected; a response that
            turns out to be larger is cut off at this size.
        target_strings: Optional list of strings to target specific content sections

    Returns:
        A tuple containing (message, is_error)
    """
    print_info(f"Fetching content from URL: {url}", "Web Fetch")

    # Set default headers if none provided
    if headers is None:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

    try:
        # With stream=True the connection is held until the response is closed,
        # so use it as a context manager to release it on every exit path.
        with requests.get(url, headers=headers, timeout=timeout, stream=True) as response:
            # Raise an exception for HTTP errors
            response.raise_for_status()

            # Check content length before downloading fully. The header is
            # server-controlled, so ignore it unless it is a plain number.
            content_length = response.headers.get('Content-Length')
            if content_length and content_length.strip().isdigit() and int(content_length) > max_size:
                warning_msg = f"Web Fetch: Content size ({int(content_length)/1000000:.1f}MB) exceeds max size ({max_size/1000000:.1f}MB). Aborting download."
                print_warning(warning_msg)
                return warning_msg, True

            # Download content with size limit
            received = []
            received_size = 0
            for chunk in response.iter_content(chunk_size=1024 * 1024):  # 1MB chunks
                received.append(chunk)
                received_size += len(chunk)
                if received_size > max_size:
                    warning_msg = f"Web Fetch: Download exceeded max size ({max_size/1000000:.1f}MB). Truncating."
                    print_warning(warning_msg)
                    break

        content_bytes = b''.join(received)
        if len(content_bytes) > max_size:
            # Honour the limit exactly instead of keeping the whole last chunk
            content_bytes = content_bytes[:max_size]

        # Get the content
        content = content_bytes.decode('utf-8', errors='replace')

        # If target strings are provided, extract only the relevant sections
        if target_strings and len(target_strings) > 0:
            print_info(f"Targeting specific content using {len(target_strings)} search strings", "Web Fetch")
            from janito.tools.fetch_webpage.extractors import extract_targeted_content
            targeted_content = extract_targeted_content(content, target_strings)

            if targeted_content:
                print_success("Successfully targeted specific content based on search strings", "Web Fetch")
                print_success(f"Successfully fetched targeted content from {url} ({len(targeted_content)} bytes)", "Web Fetch")
                return targeted_content, False
            else:
                print_warning("Web Fetch: Could not find content matching the target strings. Returning full content.")

        print_success(f"({len(content)} bytes)", "Web Fetch")

        # Return the full content
        return content, False

    except requests.exceptions.RequestException as e:
        error_msg = f"Error fetching web page: {str(e)}"
        print_error(error_msg, "Web Fetch Error")
        return error_msg, True
94
+
95
+
96
@track_usage('web_content')
def fetch_and_extract(url: str, extract_method: str = 'trafilatura',
                      max_length: int = 10000,
                      target_strings: List[str] = None) -> Tuple[str, bool]:
    """
    Download a page and reduce it to its main text for LLM consumption.

    URLs whose host contains one of the SITE_SPECIFIC_STRATEGIES keys are
    delegated to the news-aggregator extractor. All other URLs are fetched,
    cleaned with the chosen extraction method, optionally narrowed to the
    target strings, and truncated to ``max_length`` characters.

    Args:
        url: The URL to fetch
        extract_method: Content extraction method ('trafilatura', 'newspaper', 'beautifulsoup', 'all')
        max_length: Maximum length of text to return
        target_strings: Optional list of strings to target specific content sections

    Returns:
        A tuple containing (extracted_content, is_error)
    """
    host = urlparse(url).netloc

    # News aggregator sites get their own extraction path
    if any(known_site in host for known_site in SITE_SPECIFIC_STRATEGIES.keys()):
        print_info(f"Detected news aggregator site: {host}. Using specialized extraction.", "Content Extraction")
        # Imported lazily to avoid circular imports
        from janito.tools.fetch_webpage.news import fetch_and_extract_news_aggregator
        return fetch_and_extract_news_aggregator(url)

    wants_targets = bool(target_strings) and len(target_strings) > 0

    # Hand the target strings to fetch_webpage so it can narrow the content itself
    if wants_targets:
        fetched = fetch_webpage(url, target_strings=target_strings)
    else:
        fetched = fetch_webpage(url)
    page_content, failed = fetched

    if failed:
        return page_content, True

    result_text = extract_clean_text(page_content, method=extract_method, url=url)

    if not result_text or len(result_text) < 100:
        return f"Could not extract meaningful content from {url}", True

    # Fall back to targeted extraction when none of the (longer than 3 chars)
    # target strings made it into the cleaned text
    if wants_targets:
        significant_hits = [needle for needle in target_strings if len(needle) > 3 and needle in result_text]
        if not significant_hits:
            from janito.tools.fetch_webpage.extractors import extract_targeted_content
            narrowed = extract_targeted_content(page_content, target_strings)
            if narrowed:
                print_success(f"Successfully extracted targeted content based on {len(target_strings)} search strings",
                              "Targeted Extraction")
                result_text = narrowed

    # Enforce the length cap
    current_length = len(result_text)
    if current_length > max_length:
        print_info(f"Truncating content from {current_length} to {max_length} characters", "Content Extraction")
        result_text = result_text[:max_length] + "..."

    # Rough size check: whitespace-separated words stand in for tokens
    estimated_tokens = len(result_text.split())
    if estimated_tokens > 10000:
        print_warning(f"Content Extraction: Extracted content still very large (~{estimated_tokens} words). Consider using chunk_large_content()")

    print_success(f"Successfully extracted {len(result_text)} characters of content", "Content Extraction")
    return result_text, False