janito 0.10.1__py3-none-any.whl → 0.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- janito/__init__.py +1 -1
- janito/__main__.py +3 -147
- janito/callbacks.py +13 -109
- janito/cli/__init__.py +6 -0
- janito/cli/agent.py +287 -0
- janito/cli/app.py +86 -0
- janito/cli/commands.py +329 -0
- janito/cli/output.py +29 -0
- janito/cli/utils.py +22 -0
- janito/config.py +338 -63
- janito/data/instructions_template.txt +27 -0
- janito/token_report.py +124 -43
- janito/tools/__init__.py +29 -1
- janito/tools/bash/bash.py +82 -0
- janito/tools/bash/unix_persistent_bash.py +182 -0
- janito/tools/bash/win_persistent_bash.py +306 -0
- janito/tools/decorators.py +90 -84
- janito/tools/delete_file.py +65 -44
- janito/tools/fetch_webpage/__init__.py +34 -0
- janito/tools/fetch_webpage/chunking.py +76 -0
- janito/tools/fetch_webpage/core.py +155 -0
- janito/tools/fetch_webpage/extractors.py +276 -0
- janito/tools/fetch_webpage/news.py +137 -0
- janito/tools/fetch_webpage/utils.py +108 -0
- janito/tools/find_files.py +108 -42
- janito/tools/move_file.py +72 -0
- janito/tools/prompt_user.py +57 -0
- janito/tools/replace_file.py +63 -0
- janito/tools/rich_console.py +139 -0
- janito/tools/search_text.py +33 -21
- janito/tools/str_replace_editor/editor.py +55 -43
- janito/tools/str_replace_editor/handlers/__init__.py +16 -0
- janito/tools/str_replace_editor/handlers/create.py +60 -0
- janito/tools/str_replace_editor/handlers/insert.py +100 -0
- janito/tools/str_replace_editor/handlers/str_replace.py +92 -0
- janito/tools/str_replace_editor/handlers/undo.py +64 -0
- janito/tools/str_replace_editor/handlers/view.py +153 -0
- janito/tools/str_replace_editor/utils.py +7 -62
- janito/tools/usage_tracker.py +136 -0
- janito-0.12.0.dist-info/METADATA +203 -0
- janito-0.12.0.dist-info/RECORD +47 -0
- janito/cli.py +0 -202
- janito/data/instructions.txt +0 -4
- janito/tools/str_replace_editor/handlers.py +0 -338
- janito-0.10.1.dist-info/METADATA +0 -86
- janito-0.10.1.dist-info/RECORD +0 -23
- {janito-0.10.1.dist-info → janito-0.12.0.dist-info}/WHEEL +0 -0
- {janito-0.10.1.dist-info → janito-0.12.0.dist-info}/entry_points.txt +0 -0
- {janito-0.10.1.dist-info → janito-0.12.0.dist-info}/licenses/LICENSE +0 -0
janito/tools/decorators.py
CHANGED
@@ -1,84 +1,90 @@
|
|
1
|
-
"""
|
2
|
-
Decorators for janito tools.
|
3
|
-
"""
|
4
|
-
import functools
|
5
|
-
import
|
6
|
-
import
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
1
|
+
"""
|
2
|
+
Decorators for janito tools.
|
3
|
+
"""
|
4
|
+
import functools
|
5
|
+
import string
|
6
|
+
from typing import Any, Callable, Dict, Optional
|
7
|
+
|
8
|
+
|
9
|
+
class ToolMetaFormatter(string.Formatter):
    """Custom string formatter that handles conditional expressions in format strings."""

    def get_value(self, key, args, kwargs):
        """Resolve *key* from the tool parameters, or evaluate it as an expression."""
        try:
            return kwargs[key]
        except (KeyError, TypeError):
            pass

        try:
            # Evaluate the field as a Python expression against the tool
            # parameters only; builtins are stripped to limit what it can do.
            return eval(key, {"__builtins__": {}}, kwargs)
        except Exception:
            # Unresolvable field: render it as a bracketed placeholder.
            return f"[{key}]"
|
23
|
+
|
24
|
+
|
25
|
+
def tool_meta(label: str):
    """
    Decorator factory that attaches display metadata to a tool function.

    Args:
        label: A format string that can reference function parameters.
            Example: "Finding files {pattern}, on {root_dir}"

    Returns:
        A decorator producing a wrapper with the label stored in its
        ``_tool_meta`` attribute.
    """
    def decorator(fn: Callable):
        @functools.wraps(fn)
        def inner(*args, **kwargs):
            return fn(*args, **kwargs)

        # Metadata consumed later by format_tool_label().
        inner._tool_meta = {'label': label}
        return inner

    return decorator
|
49
|
+
|
50
|
+
|
51
|
+
def tool(func: Callable):
    """
    Basic decorator for tool functions.

    Marks a function as a tool; intended for simpler tools that do not
    need additional metadata attached.

    Returns:
        The wrapped function (behavior unchanged).
    """
    @functools.wraps(func)
    def inner(*args, **kwargs):
        return func(*args, **kwargs)

    return inner
|
66
|
+
|
67
|
+
|
68
|
+
def format_tool_label(func: Callable, tool_input: Dict[str, Any]) -> Optional[str]:
    """
    Format the tool label using the function's parameters.

    Args:
        func: The tool function
        tool_input: Input parameters for the tool

    Returns:
        Formatted label string, or None if no label is defined.
    """
    if not hasattr(func, '_tool_meta') or 'label' not in func._tool_meta:
        return None

    template = func._tool_meta['label']

    try:
        # Fill the template with the call's parameters; unknown fields are
        # resolved (or bracketed) by the custom formatter.
        return ToolMetaFormatter().format(template, **tool_input)
    except Exception:
        # Fall back to the bare function name if formatting fails.
        return f"{func.__name__}"
|
janito/tools/delete_file.py
CHANGED
@@ -1,44 +1,65 @@
|
|
1
|
-
"""
|
2
|
-
Tool for deleting files through the claudine agent.
|
3
|
-
"""
|
4
|
-
import
|
5
|
-
from
|
6
|
-
from
|
7
|
-
from janito.
|
8
|
-
from janito.tools.
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
"""
|
25
|
-
#
|
26
|
-
|
27
|
-
|
28
|
-
#
|
29
|
-
|
30
|
-
|
31
|
-
#
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
return (
|
1
|
+
"""
|
2
|
+
Tool for deleting files through the claudine agent.
|
3
|
+
"""
|
4
|
+
from pathlib import Path
|
5
|
+
from typing import Tuple
|
6
|
+
from janito.tools.str_replace_editor.utils import normalize_path
|
7
|
+
from janito.tools.rich_console import print_info, print_success, print_error
|
8
|
+
from janito.tools.usage_tracker import track_usage, get_tracker
|
9
|
+
|
10
|
+
|
11
|
+
@track_usage('files_deleted')
def delete_file(
    file_path: str,
) -> Tuple[str, bool]:
    """
    Delete an existing file.

    Args:
        file_path: Path to the file to delete, relative to the workspace directory

    Returns:
        A tuple containing (message, is_error)
    """
    print_info(f"Deleting file {file_path}", "Delete Operation")
    # Store the original path for display purposes
    original_path = file_path

    # Normalize the file path (converts to absolute path)
    path = normalize_path(file_path)

    # Convert to Path object for better path handling
    path_obj = Path(path)

    # Check if the file exists
    if not path_obj.exists():
        error_msg = f"File {original_path} does not exist."
        print_error(error_msg, "Error")
        return (error_msg, True)

    # Check if it's a directory
    if path_obj.is_dir():
        error_msg = f"{original_path} is a directory, not a file. Use delete_directory for directories."
        print_error(error_msg, "Error")
        return (error_msg, True)

    # Delete the file
    try:
        # Count the number of lines in the file before deleting, so the
        # usage tracker can record the negative line delta.
        try:
            with open(path_obj, 'r', encoding='utf-8') as f:
                line_count = len(f.readlines())
            get_tracker().increment('lines_delta', -line_count)
        except Exception:
            # If we can't read the file (e.g. binary content), skip the
            # line accounting but still delete the file.
            pass

        path_obj.unlink()
        success_msg = f"Successfully deleted file {original_path}"
        # Fix: previously printed an empty message; report the actual result
        # (consistent with how print_error is used in the failure branches).
        print_success(success_msg, "Success")
        return (success_msg, False)
    except Exception as e:
        error_msg = f"Error deleting file {original_path}: {str(e)}"
        print_error(error_msg, "Error")
        return (error_msg, True)
|
@@ -0,0 +1,34 @@
|
|
1
|
+
"""
|
2
|
+
Webpage Content Extractor Package
|
3
|
+
|
4
|
+
A comprehensive tool for extracting clean, relevant content from web pages
|
5
|
+
for processing with LLMs. Features include:
|
6
|
+
- General content extraction with multiple methods
|
7
|
+
- Specialized handling for news aggregator sites
|
8
|
+
- Targeted extraction based on specific search strings
|
9
|
+
- Chunking for large content
|
10
|
+
- Structured content extraction
|
11
|
+
|
12
|
+
Dependencies:
|
13
|
+
- requests
|
14
|
+
- beautifulsoup4
|
15
|
+
- trafilatura
|
16
|
+
- newspaper3k
|
17
|
+
|
18
|
+
Author: Claude (Anthropic)
|
19
|
+
"""
|
20
|
+
|
21
|
+
from janito.tools.fetch_webpage.core import fetch_webpage, fetch_and_extract
|
22
|
+
from janito.tools.fetch_webpage.news import fetch_and_extract_news_aggregator
|
23
|
+
from janito.tools.fetch_webpage.extractors import extract_clean_text, extract_targeted_content, extract_structured_content
|
24
|
+
from janito.tools.fetch_webpage.chunking import chunk_large_content
|
25
|
+
|
26
|
+
# Public API of the fetch_webpage package: names re-exported from the
# core, news, extractors, and chunking submodules above.
__all__ = [
    'fetch_webpage',
    'fetch_and_extract',
    'fetch_and_extract_news_aggregator',
    'extract_clean_text',
    'extract_targeted_content',
    'extract_structured_content',
    'chunk_large_content'
]
|
@@ -0,0 +1,76 @@
|
|
1
|
+
"""
|
2
|
+
Functions for chunking large content into manageable pieces.
|
3
|
+
"""
|
4
|
+
|
5
|
+
from typing import List
|
6
|
+
from janito.tools.rich_console import print_info, print_success
|
7
|
+
|
8
|
+
|
9
|
+
def chunk_large_content(text: str, chunk_size: int = 4000, overlap: int = 500) -> List[str]:
    """
    Split very large text content into manageable chunks suitable for LLM processing.

    Splits on paragraph boundaries (blank lines) first; a single paragraph
    longer than ``chunk_size`` is further split on word boundaries. Adjacent
    chunks share up to ``overlap`` trailing characters for context.

    Args:
        text: The text to chunk
        chunk_size: Target size for each chunk in characters
        overlap: Number of characters to overlap between chunks

    Returns:
        List of text chunks (empty list for empty input; a single-element
        list when the text already fits in one chunk)
    """
    # Fast path: nothing to split.
    if not text or len(text) <= chunk_size:
        return [text] if text else []

    print_info(f"Chunking {len(text)} characters of text into ~{chunk_size} character chunks", "Content Chunking")

    # Try to split on paragraph breaks first
    paragraphs = text.split('\n\n')
    chunks = []
    current_chunk = ""

    for para in paragraphs:
        # If adding this paragraph would exceed chunk size
        # (+2 accounts for the "\n\n" separator that would be inserted)
        if len(current_chunk) + len(para) + 2 > chunk_size:
            # If current chunk is not empty, add it to chunks
            if current_chunk:
                chunks.append(current_chunk)
                # Start new chunk with overlap from previous chunk
                if overlap > 0 and len(current_chunk) > overlap:
                    current_chunk = current_chunk[-overlap:] + "\n\n" + para
                else:
                    current_chunk = para
            else:
                # If paragraph itself is bigger than chunk size, split it
                # on word boundaries instead.
                if len(para) > chunk_size:
                    words = para.split()
                    temp_chunk = ""
                    for word in words:
                        # +1 accounts for the joining space
                        if len(temp_chunk) + len(word) + 1 > chunk_size:
                            chunks.append(temp_chunk)
                            # Start new chunk with overlap
                            if overlap > 0 and len(temp_chunk) > overlap:
                                temp_chunk = temp_chunk[-overlap:] + " " + word
                            else:
                                temp_chunk = word
                        else:
                            if temp_chunk:
                                temp_chunk += " " + word
                            else:
                                temp_chunk = word
                    # Carry the tail of the oversized paragraph forward as
                    # the current chunk so it can merge with what follows.
                    if temp_chunk:
                        current_chunk = temp_chunk
                else:
                    # Paragraph is close to chunk_size on its own: emit it
                    # directly as a chunk.
                    chunks.append(para)
        else:
            # Add paragraph to current chunk
            if current_chunk:
                current_chunk += "\n\n" + para
            else:
                current_chunk = para

    # Don't forget the last chunk
    if current_chunk:
        chunks.append(current_chunk)

    print_success(f"Text chunked into {len(chunks)} segments", "Content Chunking")
    return chunks
|
@@ -0,0 +1,155 @@
|
|
1
|
+
"""
|
2
|
+
Core functionality for fetching web pages and extracting content.
|
3
|
+
"""
|
4
|
+
|
5
|
+
import requests
|
6
|
+
from typing import Tuple, List, Optional
|
7
|
+
from urllib.parse import urlparse
|
8
|
+
from janito.tools.rich_console import print_info, print_success, print_error, print_warning
|
9
|
+
from janito.tools.usage_tracker import track_usage
|
10
|
+
|
11
|
+
from janito.tools.fetch_webpage.extractors import extract_clean_text
|
12
|
+
# Import moved to fetch_and_extract function to avoid circular imports
|
13
|
+
from janito.tools.fetch_webpage.utils import SITE_SPECIFIC_STRATEGIES
|
14
|
+
|
15
|
+
|
16
|
+
@track_usage('web_requests')
def fetch_webpage(url: str, headers: dict = None, timeout: int = 30, max_size: int = 5000000,
                  target_strings: List[str] = None) -> Tuple[str, bool]:
    """
    Fetch the content of a web page from a given URL.

    Args:
        url: The URL of the web page to fetch
        headers: Optional HTTP headers to include in the request (default: None)
        timeout: Request timeout in seconds (default: 30)
        max_size: Maximum size in bytes to download (default: 5MB)
        target_strings: Optional list of strings to target specific content sections

    Returns:
        A tuple containing (message, is_error)
    """
    print_info(f"Fetching content from URL: {url}", "Web Fetch")

    try:
        # Set default headers if none provided
        if headers is None:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            }

        # Stream the response so the size limit can be enforced during download
        response = requests.get(url, headers=headers, timeout=timeout, stream=True)

        # Raise an exception for HTTP errors
        response.raise_for_status()

        # Check the declared content length before downloading fully
        content_length = response.headers.get('Content-Length')
        if content_length and int(content_length) > max_size:
            warning_msg = f"Web Fetch: Content size ({int(content_length)/1000000:.1f}MB) exceeds max size ({max_size/1000000:.1f}MB). Aborting download."
            print_warning(warning_msg)
            return warning_msg, True

        # Download content with size limit (servers may omit or lie about
        # Content-Length, so the limit is enforced again here)
        content_bytes = b''
        for chunk in response.iter_content(chunk_size=1024 * 1024):  # 1MB chunks
            content_bytes += chunk
            if len(content_bytes) > max_size:
                warning_msg = f"Web Fetch: Download exceeded max size ({max_size/1000000:.1f}MB). Truncating."
                print_warning(warning_msg)
                break

        # Decode, replacing any undecodable byte sequences
        content = content_bytes.decode('utf-8', errors='replace')

        # If target strings are provided, extract only the relevant sections
        if target_strings and len(target_strings) > 0:
            print_info(f"Targeting specific content using {len(target_strings)} search strings", "Web Fetch")
            # Imported here to avoid a circular import with the extractors module
            from janito.tools.fetch_webpage.extractors import extract_targeted_content
            targeted_content = extract_targeted_content(content, target_strings)

            if targeted_content:
                # Cleanup: removed unused locals (content preview / summary)
                # that were computed here but never used.
                print_success(f"Successfully targeted specific content based on search strings", "Web Fetch")
                print_success(f"Successfully fetched targeted content from {url} ({len(targeted_content)} bytes)", "Web Fetch")
                return targeted_content, False
            else:
                print_warning(f"Web Fetch: Could not find content matching the target strings. Returning full content.")

        print_success(f"({len(content)} bytes)", "Web Fetch")

        # Return the full content
        return content, False

    except requests.exceptions.RequestException as e:
        error_msg = f"Error fetching web page: {str(e)}"
        print_error(error_msg, "Web Fetch Error")
        return error_msg, True
|
94
|
+
|
95
|
+
|
96
|
+
@track_usage('web_content')
def fetch_and_extract(url: str, extract_method: str = 'trafilatura',
                      max_length: int = 10000,
                      target_strings: List[str] = None) -> Tuple[str, bool]:
    """
    Fetch a webpage and extract its main content in a format suitable for LLM processing.

    Args:
        url: The URL to fetch
        extract_method: Content extraction method ('trafilatura', 'newspaper', 'beautifulsoup', 'all')
        max_length: Maximum length of text to return
        target_strings: Optional list of strings to target specific content sections

    Returns:
        A tuple containing (extracted_content, is_error)
    """
    # Check if this is a news aggregator site that needs special handling;
    # substring match means subdomains of a known site also qualify.
    domain = urlparse(url).netloc
    for site_domain in SITE_SPECIFIC_STRATEGIES.keys():
        if site_domain in domain:
            print_info(f"Detected news aggregator site: {domain}. Using specialized extraction.", "Content Extraction")
            # Import here to avoid circular imports
            from janito.tools.fetch_webpage.news import fetch_and_extract_news_aggregator
            return fetch_and_extract_news_aggregator(url)

    # If target strings are provided, pass them directly to fetch_webpage for efficiency
    if target_strings and len(target_strings) > 0:
        html_content, is_error = fetch_webpage(url, target_strings=target_strings)
    else:
        html_content, is_error = fetch_webpage(url)

    # Propagate fetch failures unchanged (message, error flag)
    if is_error:
        return html_content, True

    extracted_text = extract_clean_text(html_content, method=extract_method, url=url)

    # Fewer than 100 characters is treated as a failed extraction
    if not extracted_text or len(extracted_text) < 100:
        return f"Could not extract meaningful content from {url}", True

    # If target strings were provided but not already handled by fetch_webpage
    # (i.e. none of the longer targets appear in the cleaned text), retry the
    # targeted extractor directly on the raw HTML.
    if target_strings and len(target_strings) > 0 and not any(target in extracted_text for target in target_strings if len(target) > 3):
        from janito.tools.fetch_webpage.extractors import extract_targeted_content
        targeted_content = extract_targeted_content(html_content, target_strings)
        if targeted_content:
            print_success(f"Successfully extracted targeted content based on {len(target_strings)} search strings",
                          "Targeted Extraction")
            extracted_text = targeted_content

    # Truncate if needed
    if len(extracted_text) > max_length:
        print_info(f"Truncating content from {len(extracted_text)} to {max_length} characters", "Content Extraction")
        extracted_text = extracted_text[:max_length] + "..."

    # Check if the content is still too large for an LLM (rough estimate:
    # whitespace-separated word count as a proxy for tokens)
    estimated_tokens = len(extracted_text.split())
    if estimated_tokens > 10000:  # Conservative estimate for token limits
        print_warning(f"Content Extraction: Extracted content still very large (~{estimated_tokens} words). Consider using chunk_large_content()")

    print_success(f"Successfully extracted {len(extracted_text)} characters of content", "Content Extraction")
    return extracted_text, False
|