devduck 1.1.0-py3-none-any.whl → 1.1.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of devduck might be problematic.
- devduck/__init__.py +99 -15
- devduck/_version.py +2 -2
- devduck/tools/__init__.py +41 -49
- devduck/tools/fetch_github_tool.py +201 -0
- devduck/tools/scraper.py +935 -0
- devduck/tools/speech_to_speech.py +109 -9
- devduck/tools/system_prompt.py +276 -153
- {devduck-1.1.0.dist-info → devduck-1.1.3.dist-info}/METADATA +2 -1
- {devduck-1.1.0.dist-info → devduck-1.1.3.dist-info}/RECORD +13 -11
- {devduck-1.1.0.dist-info → devduck-1.1.3.dist-info}/WHEEL +0 -0
- {devduck-1.1.0.dist-info → devduck-1.1.3.dist-info}/entry_points.txt +0 -0
- {devduck-1.1.0.dist-info → devduck-1.1.3.dist-info}/licenses/LICENSE +0 -0
- {devduck-1.1.0.dist-info → devduck-1.1.3.dist-info}/top_level.txt +0 -0
devduck/tools/scraper.py
ADDED
@@ -0,0 +1,935 @@
"""BeautifulSoup4 tool for comprehensive HTML/XML parsing and web scraping.

Provides full access to BeautifulSoup4's capabilities including parsing,
searching, navigating, and modifying HTML/XML documents.
"""

import json
from typing import Any

import requests
from bs4 import BeautifulSoup, Tag
from rich.console import Console
from rich.panel import Panel
from rich.progress import Progress
from rich.table import Table
from rich.tree import Tree
from strands import tool

console = Console()


def clean_text(text: Any) -> str:
    """Clean and normalize text content."""
    if not text:
        return ""
    if isinstance(text, Tag):
        text = text.get_text()
    if not isinstance(text, str):
        text = str(text)
    # Remove extra whitespace and normalize
    return " ".join(text.split())


def get_tag_text(tag: Tag) -> str:
    """Get clean text from a tag."""
    return clean_text(tag.string if tag else "")


def extract_element_data(element: Tag) -> dict:
    """Extract relevant data from a BS4 element."""
    if not element or not isinstance(element, Tag):
        return None

    data = {
        "tag": element.name,
        "text": clean_text(element.get_text()),
        "html": str(element),
    }
    if element.attrs:
        data["attributes"] = dict(element.attrs)
    return data


def create_element_tree(element: Tag, tree: Tree) -> None:
    """Create a tree visualization of HTML structure."""
    if not isinstance(element, Tag):
        return

    # Create node label with tag name and classes
    classes = element.get("class", [])
    class_str = f" .{'.'.join(classes)}" if classes else ""
    id_str = f" #{element['id']}" if element.get("id") else ""
    label = f"{element.name}{class_str}{id_str}"

    # Add text preview if it exists
    text = clean_text(element.string)
    if text:
        label += f": {text[:30]}..." if len(text) > 30 else f": {text}"

    # Create branch
    branch = tree.add(label)

    # Recursively add children
    for child in element.children:
        if isinstance(child, Tag):
            create_element_tree(child, branch)


@tool
def scraper(
    action: str,
    content: str | None = None,
    url: str | None = None,
    parser: str = "html.parser",
    find_params: dict[str, Any] | None = None,
    navigation: dict[str, Any] | None = None,
    modifications: list[dict[str, Any]] | None = None,
) -> dict[str, Any]:
    """Advanced HTML/XML parsing and web scraping tool using BeautifulSoup4.

    This tool provides comprehensive HTML/XML parsing and web scraping capabilities
    using BeautifulSoup4. It supports various actions like parsing, finding elements,
    extracting text, modifying content, and navigating document structures.

    Args:
        action: The BeautifulSoup action to perform. One of:
            - "parse": Parse HTML/XML content
            - "find": Find elements using various methods
            - "extract_text": Extract text from elements
            - "extract_attrs": Extract attributes from elements
            - "modify": Modify HTML content
            - "navigate": Navigate through document tree
            - "scrape_url": Scrape content from URL
        content: HTML/XML content to parse (for parse/modify actions)
        url: URL to scrape (for scrape_url action)
        parser: Parser to use (default: html.parser). Options: html.parser, lxml, xml, html5lib
        find_params: Parameters for find/find_all operations
        navigation: Navigation parameters
        modifications: List of modifications to apply. Each modification requires a CSS selector as target.

    Returns:
        Dict containing status and response content:
        {
            "status": "success|error",
            "content": [{"text": "Response message"}]
        }
    """
    # Set default values for optional parameters
    if find_params is None:
        find_params = {}
    if navigation is None:
        navigation = {}
    if modifications is None:
        modifications = []

    try:
        if action == "scrape_url":
            if not url:
                raise ValueError("URL is required for scrape_url action")
            headers = {
                "User-Agent": (
                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                    "AppleWebKit/537.36 (KHTML, like Gecko) "
                    "Chrome/91.0.4472.124 Safari/537.36"
                ),
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                "Accept-Language": "en-US,en;q=0.5",
                "DNT": "1",
                "Connection": "keep-alive",
                "Upgrade-Insecure-Requests": "1",
            }
            console.print(Panel.fit(f"[bold blue]Scraping URL: {url}", style="blue"))

            with Progress() as progress:
                # Initialize progress tasks
                fetch_task = progress.add_task("[green]Fetching URL...", total=100)
                parse_task = progress.add_task(
                    "[cyan]Parsing content...", total=100, visible=False
                )
                analyze_task = progress.add_task(
                    "[yellow]Analyzing elements...", total=100, visible=False
                )

                try:
                    # Fetch URL
                    response = requests.get(
                        url, headers=headers, timeout=10, verify=True
                    )
                    response.raise_for_status()
                    content = response.text
                    progress.update(fetch_task, completed=100)

                    # Parse content
                    progress.update(parse_task, visible=True)
                    soup = BeautifulSoup(content, parser)
                    progress.update(parse_task, advance=50)

                    # Create document overview
                    title = get_tag_text(soup.find("title"))
                    console.print(f"\n[bold cyan]Document Title:[/] {title}")

                    # Extract meta information
                    meta_tags = {
                        tag.get("name", tag.get("property", "")): tag.get("content", "")
                        for tag in soup.find_all("meta")
                        if tag.get("name") or tag.get("property")
                    }

                    if meta_tags:
                        meta_table = Table(
                            show_header=True, header_style="bold magenta"
                        )
                        meta_table.add_column("Meta Property")
                        meta_table.add_column("Content")
                        for prop, content in meta_tags.items():
                            if content:  # Only show non-empty meta tags
                                meta_table.add_row(
                                    prop,
                                    (
                                        content[:100] + "..."
                                        if len(content) > 100
                                        else content
                                    ),
                                )
                        console.print("\n[bold cyan]Meta Information:[/]")
                        console.print(meta_table)

                    progress.update(parse_task, completed=100)

                    # Analyze elements
                    progress.update(analyze_task, visible=True)

                    # Extract and display links
                    links = [
                        {"href": link.get("href"), "text": clean_text(link.text)}
                        for link in soup.find_all("a", href=True)
                    ]
                    progress.update(analyze_task, advance=25)

                    if links:
                        link_table = Table(
                            show_header=True, header_style="bold magenta"
                        )
                        link_table.add_column("Link Text")
                        link_table.add_column("URL")
                        for link in links[:10]:  # Show first 10 links
                            link_table.add_row(
                                (
                                    link["text"][:50] + "..."
                                    if len(link["text"]) > 50
                                    else link["text"]
                                ),
                                link["href"],
                            )
                        console.print("\n[bold cyan]Links Found:[/] (showing first 10)")
                        console.print(link_table)

                    # Extract and display headings
                    headings = [
                        {"level": int(h.name[1]), "text": clean_text(h.text)}
                        for h in soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"])
                    ]
                    progress.update(analyze_task, advance=25)

                    if headings:
                        heading_tree = Tree("[bold cyan]📑 Document Structure[/]")
                        current_level = 0
                        current_node = heading_tree
                        for h in headings:
                            level = h["level"]
                            text = h["text"]
                            if level > current_level:
                                current_node = current_node.add(
                                    f"[bold]H{level}:[/] {text}"
                                )
                            else:
                                current_node = heading_tree.add(
                                    f"[bold]H{level}:[/] {text}"
                                )
                            current_level = level
                        console.print("\n")
                        console.print(heading_tree)

                    # Extract and display images
                    images = [
                        {"src": img.get("src"), "alt": img.get("alt", "")}
                        for img in soup.find_all("img", src=True)
                    ]
                    progress.update(analyze_task, advance=25)

                    if images:
                        image_table = Table(
                            show_header=True, header_style="bold magenta"
                        )
                        image_table.add_column("Image Source")
                        image_table.add_column("Alt Text")
                        for img in images:
                            image_table.add_row(
                                img["src"], img["alt"] or "[grey](no alt text)"
                            )
                        console.print("\n[bold cyan]Images Found:[/]")
                        console.print(image_table)

                    # Show text preview
                    text_content = clean_text(soup.get_text(separator=" "))
                    text_preview = (
                        text_content[:200] + "..."
                        if len(text_content) > 200
                        else text_content
                    )
                    console.print("\n[bold cyan]Text Preview:[/]")
                    console.print(Panel(text_preview, style="green"))

                    progress.update(analyze_task, completed=100)

                    # Create final result
                    result = {
                        "url": url,
                        "status_code": response.status_code,
                        "content_type": response.headers.get("content-type", ""),
                        "title": title,
                        "text": text_content,
                        "meta": meta_tags,
                        "links": links,
                        "headings": headings,
                        "images": images,
                    }

                    # Show statistics
                    stats_table = Table(show_header=True, header_style="bold magenta")
                    stats_table.add_column("Element Type")
                    stats_table.add_column("Count")

                    stats_table.add_row("Links", str(len(links)))
                    stats_table.add_row("Headings", str(len(headings)))
                    stats_table.add_row("Images", str(len(images)))
                    stats_table.add_row("Meta Tags", str(len(meta_tags)))

                    console.print("\n[bold cyan]Page Statistics:[/]")
                    console.print(stats_table)

                    console.print("\n[bold green]✓ Scraping complete![/]")

                    return {
                        "status": "success",
                        "content": [
                            {"text": f"Successfully scraped {url}"},
                            {"text": f"Results: {json.dumps(result, indent=2)}"},
                        ],
                    }
                except requests.RequestException as e:
                    raise ValueError(f"Failed to fetch URL: {str(e)}")
        else:
            if not content and action != "scrape_url":
                raise ValueError("Content is required for this action")
            soup = BeautifulSoup(content, parser)
            result = None

            if action == "parse":
                console.print(Panel.fit("[bold blue]Parsing HTML Document", style="blue"))

                with Progress() as progress:
                    parse_task = progress.add_task(
                        "[green]Analyzing document structure...", total=100
                    )

                    # Parse title
                    progress.update(parse_task, advance=10)
                    title_tag = soup.find("title")
                    title = get_tag_text(title_tag)
                    console.print(f"\n[bold cyan]Document Title:[/] {title}")

                    # Create document tree
                    progress.update(parse_task, advance=20)
                    console.print("\n[bold cyan]Document Structure:[/]")
                    doc_tree = Tree("🌐 HTML Document")
                    create_element_tree(soup.find("html"), doc_tree)
                    console.print(doc_tree)

                    # Parse links
                    progress.update(parse_task, advance=20)
                    links = [
                        {"href": link.get("href"), "text": clean_text(link.text)}
                        for link in soup.find_all("a", href=True)
                    ]

                    if links:
                        link_table = Table(show_header=True, header_style="bold magenta")
                        link_table.add_column("Link Text")
                        link_table.add_column("URL")
                        for link in links:
                            link_table.add_row(
                                (
                                    link["text"][:50] + "..."
                                    if len(link["text"]) > 50
                                    else link["text"]
                                ),
                                link["href"],
                            )
                        console.print("\n[bold cyan]Links Found:[/]")
                        console.print(link_table)

                    # Parse headings
                    progress.update(parse_task, advance=20)
                    headings = [
                        {"level": int(h.name[1]), "text": clean_text(h.text)}
                        for h in soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"])
                    ]

                    if headings:
                        heading_table = Table(show_header=True, header_style="bold magenta")
                        heading_table.add_column("Level")
                        heading_table.add_column("Heading Text")
                        for h in headings:
                            heading_table.add_row(f"H{h['level']}", h["text"])
                        console.print("\n[bold cyan]Document Headings:[/]")
                        console.print(heading_table)

                    # Parse lists
                    progress.update(parse_task, advance=20)
                    lists = [
                        {
                            "type": ul.name,
                            "items": [clean_text(li.text) for li in ul.find_all("li")],
                        }
                        for ul in soup.find_all(["ul", "ol"])
                    ]

                    if lists:
                        console.print("\n[bold cyan]Lists Found:[/]")
                        for lst in lists:
                            list_panel = Panel.fit(
                                "\n".join([f"• {item}" for item in lst["items"]]),
                                title=f"{lst['type'].upper()} List",
                                style="green",
                            )
                            console.print(list_panel)

                    # Complete progress
                    progress.update(parse_task, advance=10)

                    result = {
                        "title": title,
                        "text": clean_text(soup.get_text(separator=" ")),
                        "links": links,
                        "structure": {
                            "headings": headings,
                            "paragraphs": [
                                clean_text(p.text) for p in soup.find_all("p") if p.text.strip()
                            ],
                            "lists": lists,
                        },
                    }

                    console.print("\n[bold green]✓ Parsing complete![/]")

            elif action == "find":
                console.print(Panel.fit("[bold blue]Finding Elements", style="blue"))

                # find_params is now a direct parameter (no need to extract)

                # Show search parameters
                search_table = Table(show_header=True, header_style="bold magenta")
                search_table.add_column("Parameter")
                search_table.add_column("Value")

                if "selector" in find_params:
                    search_table.add_row("CSS Selector", find_params["selector"])
                else:
                    for param, value in find_params.items():
                        if value is not None:
                            search_table.add_row(param, str(value))

                console.print("\n[bold cyan]Search Parameters:[/]")
                console.print(search_table)

                with Progress() as progress:
                    find_task = progress.add_task(
                        "[green]Searching for elements...", total=100
                    )

                    # Find elements
                    progress.update(find_task, advance=40)
                    if "selector" in find_params:
                        elements = soup.select(find_params["selector"])
                    else:
                        name = find_params.get("name")
                        attrs = find_params.get("attrs", {})
                        recursive = find_params.get("recursive", True)
                        string = find_params.get("string")
                        limit = find_params.get("limit")

                        elements = soup.find_all(
                            name=name,
                            attrs=attrs,
                            recursive=recursive,
                            string=string,
                            limit=limit,
                        )

                    # Process results
                    progress.update(find_task, advance=40)
                    result = [
                        extract_element_data(el) for el in elements if isinstance(el, Tag)
                    ]

                    # Show results
                    if result:
                        console.print(f"\n[bold green]✓ Found {len(result)} element(s)[/]")

                        # Create results table
                        results_table = Table(show_header=True, header_style="bold magenta")
                        results_table.add_column("Tag")
                        results_table.add_column("Attributes")
                        results_table.add_column("Text Preview")

                        for item in result:
                            tag = item["tag"]
                            attrs = json.dumps(item.get("attributes", {}), indent=2)
                            text = (
                                item.get("text", "")[:50] + "..."
                                if item.get("text", "")
                                else ""
                            )

                            results_table.add_row(f"[bold]{tag}[/]", attrs, text)

                        console.print("\n[bold cyan]Found Elements:[/]")
                        console.print(results_table)
                    else:
                        console.print(
                            "\n[bold yellow]! No elements found matching the criteria[/]"
                        )

                    # Complete progress
                    progress.update(find_task, advance=20)

                    console.print("\n[bold green]✓ Search complete![/]")

            elif action == "extract_text":
                console.print(Panel.fit("[bold blue]Extracting Text Content", style="blue"))

                with Progress() as progress:
                    extract_task = progress.add_task(
                        "[green]Processing text content...", total=100
                    )

                    # Extract full text
                    progress.update(extract_task, advance=25)
                    full_text = clean_text(soup.get_text(separator=" "))
                    console.print("\n[bold cyan]Full Text Preview:[/]")
                    console.print(
                        Panel(
                            full_text[:200] + "..." if len(full_text) > 200 else full_text,
                            style="green",
                        )
                    )

                    # Extract stripped strings
                    progress.update(extract_task, advance=25)
                    stripped_strings = [clean_text(s) for s in soup.stripped_strings]

                    # Extract structured text
                    progress.update(extract_task, advance=25)
                    headings = [
                        clean_text(h.text)
                        for h in soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"])
                    ]
                    paragraphs = [clean_text(p.text) for p in soup.find_all("p")]
                    lists = [clean_text(li.text) for li in soup.find_all("li")]

                    # Show structured content
                    if headings:
                        heading_tree = Tree("[bold cyan]📑 Headings[/]")
                        for h in headings:
                            heading_tree.add(h)
                        console.print("\n")
                        console.print(heading_tree)

                    if paragraphs:
                        console.print("\n[bold cyan]📝 Paragraphs:[/]")
                        for i, p in enumerate(paragraphs, 1):
                            preview = p[:100] + "..." if len(p) > 100 else p
                            console.print(f"{i}. {preview}")

                    if lists:
                        list_tree = Tree("[bold cyan]📋 List Items[/]")
                        for item in lists:
                            list_tree.add(item)
                        console.print("\n")
                        console.print(list_tree)

                    # Create result
                    result = {
                        "full_text": full_text,
                        "stripped_strings": stripped_strings,
                        "structured_text": {
                            "headings": headings,
                            "paragraphs": paragraphs,
                            "lists": lists,
                        },
                    }

                    # Complete progress
                    progress.update(extract_task, advance=25)

                    # Show statistics
                    stats_table = Table(show_header=True, header_style="bold magenta")
                    stats_table.add_column("Content Type")
                    stats_table.add_column("Count")
                    stats_table.add_column("Total Length")

                    stats_table.add_row(
                        "Headings", str(len(headings)), str(sum(len(h) for h in headings))
                    )
                    stats_table.add_row(
                        "Paragraphs", str(len(paragraphs)), str(sum(len(p) for p in paragraphs))
                    )
                    stats_table.add_row(
                        "List Items", str(len(lists)), str(sum(len(li) for li in lists))
                    )

                    console.print("\n[bold cyan]Content Statistics:[/]")
                    console.print(stats_table)

                    console.print("\n[bold green]✓ Text extraction complete![/]")

            elif action == "extract_attrs":
                console.print(
                    Panel.fit("[bold blue]Extracting Element Attributes", style="blue")
                )

                with Progress() as progress:
                    attr_task = progress.add_task("[green]Analyzing elements...", total=100)

                    # Find all elements with attributes
                    progress.update(attr_task, advance=30)
                    elements = soup.find_all(True)
                    elements_with_attrs = [el for el in elements if el.attrs]

                    # Process attributes
                    progress.update(attr_task, advance=40)
                    result = []
                    attr_stats = {}

                    for el in elements_with_attrs:
                        attr_data = {
                            "tag": el.name,
                            "attributes": dict(el.attrs),
                            "text": clean_text(el.text) if el.text.strip() else None,
                        }
                        result.append(attr_data)

                        # Collect statistics
                        for attr in el.attrs:
                            if attr not in attr_stats:
                                attr_stats[attr] = {"count": 0, "tags": set()}
                            attr_stats[attr]["count"] += 1
                            attr_stats[attr]["tags"].add(el.name)

                    # Show results
                    if result:
                        console.print(
                            f"\n[bold green]✓ Found {len(result)} elements with attributes[/]"
                        )

                        # Create attribute summary table
                        attr_table = Table(show_header=True, header_style="bold magenta")
                        attr_table.add_column("Attribute")
                        attr_table.add_column("Count")
                        attr_table.add_column("Tags Using It")

                        for attr, stats in sorted(
                            attr_stats.items(), key=lambda x: x[1]["count"], reverse=True
                        ):
                            attr_table.add_row(
                                f"[bold]{attr}[/]",
                                str(stats["count"]),
                                ", ".join(sorted(stats["tags"])),
                            )

                        console.print("\n[bold cyan]Attribute Usage Summary:[/]")
                        console.print(attr_table)

                        # Show detailed results tree
                        results_tree = Tree("[bold cyan]🏷️ Elements with Attributes[/]")
                        for item in result:
                            tag_node = results_tree.add(f"[bold]{item['tag']}[/]")
                            for attr, value in item["attributes"].items():
                                tag_node.add(f"[green]{attr}[/]: {value}")
                            if item["text"]:
                                preview = (
                                    item["text"][:50] + "..."
                                    if len(item["text"]) > 50
                                    else item["text"]
                                )
                                tag_node.add(f"[yellow]Text: {preview}[/]")

                        console.print("\n[bold cyan]Detailed Element Analysis:[/]")
                        console.print(results_tree)
                    else:
                        console.print(
                            "\n[bold yellow]! No elements with attributes found[/]"
                        )

                    # Complete progress
                    progress.update(attr_task, advance=30)

                    console.print("\n[bold green]✓ Attribute extraction complete![/]")

            elif action == "modify":
                # modifications is now a direct parameter
                if not modifications:
                    raise ValueError("No modifications specified")

                changes_made = []

                # Create a progress bar for modifications
                with Progress() as progress:
                    modify_task = progress.add_task(
                        "[green]Processing modifications...", total=len(modifications)
                    )
                    console.print(Panel.fit("Starting HTML modifications", style="blue"))

                    for mod in modifications:
                        if not mod.get("target"):
                            raise ValueError("Target selector is required for modification")

                        try:
                            elements = soup.select(mod["target"])
                            if not elements:
                                changes_made.append(
                                    f"Warning: No elements found for selector '{mod['target']}'"
                                )
                                continue

                            action_type = mod["action"]
                            content_required = action_type in ["insert", "append", "replace"]

                            if content_required and not mod.get("content"):
                                raise ValueError(
                                    f"Content is required for {action_type} action"
                                )

                            for element in elements:
                                try:
                                    if action_type == "insert":
                                        new_tag = BeautifulSoup(mod["content"], parser).find()
                                        if new_tag:
                                            element.insert_before(new_tag)
                                            changes_made.append(
                                                f"Inserted content before {mod['target']}"
                                            )
                                        else:
                                            changes_made.append(
                                                f"Warning: Invalid content for insertion at {mod['target']}"
                                            )

                                    elif action_type == "append":
                                        new_content = BeautifulSoup(mod["content"], parser)
                                        element.append(new_content)
                                        changes_made.append(
                                            f"Appended content to {mod['target']}"
                                        )

                                    elif action_type == "replace":
                                        new_content = BeautifulSoup(
                                            mod["content"], parser
                                        ).find()
                                        if new_content:
                                            element.replace_with(new_content)
                                            changes_made.append(f"Replaced {mod['target']}")
                                        else:
                                            changes_made.append(
                                                f"Warning: Invalid content for replacement at {mod['target']}"
                                            )

                                    elif action_type == "clear":
                                        element.clear()
                                        changes_made.append(
                                            f"Cleared contents of {mod['target']}"
                                        )

                                    elif action_type == "unwrap":
                                        element.unwrap()
                                        changes_made.append(f"Unwrapped {mod['target']}")

                                    progress.update(modify_task, advance=1)
                                    console.print(
                                        f"✓ Completed: {action_type} on {mod['target']}",
                                        style="green",
                                    )

                                except Exception as elem_error:
                                    changes_made.append(
                                        f"Error modifying element {mod['target']}: {str(elem_error)}"
                                    )

                        except Exception as selector_error:
                            changes_made.append(
                                f"Invalid selector '{mod['target']}': {str(selector_error)}"
                            )

                # Create a summary table
                table = Table(show_header=True, header_style="bold magenta")
                table.add_column("Action")
                table.add_column("Target")
                table.add_column("Status")

                for change in changes_made:
                    if ":" in change:
                        action, detail = change.split(":", 1)
                        table.add_row(
                            action.strip(),
                            detail.strip(),
                            "✓" if "Error" not in change else "✗",
                            style="green" if "Error" not in change else "red",
                        )

                console.print("\nModification Summary:")
                console.print(table)

                result = {
                    "modified_html": str(soup),
                    "changes": changes_made,
                    "status": "complete" if changes_made else "no_changes",
                }

            elif action == "navigate":
                console.print(
                    Panel.fit("[bold blue]Navigating Document Structure", style="blue")
                )
                # navigation is now a direct parameter
                direction = navigation.get("direction", "children")

                with Progress() as progress:
                    nav_task = progress.add_task(
                        "[green]Analyzing document structure...", total=100
                    )

                    # Initialize navigation
                    progress.update(nav_task, advance=20)
                    start_element = soup.find() if direction != "parent" else soup

                    # Create direction info panel
                    direction_info = Panel.fit(
                        (
                            f"[bold]Direction:[/] {direction}\n"
                            f"[bold]Starting from:[/] {start_element.name if start_element else 'root'}"
                        ),
                        title="Navigation Parameters",
                        style="cyan",
                    )
                    console.print("\n")
                    console.print(direction_info)

                    # Navigate and collect elements
                    progress.update(nav_task, advance=30)
                    if direction == "parent":
                        elements = [start_element.parent] if start_element.parent else []
                    elif direction == "children":
                        elements = list(
                            filter(lambda x: isinstance(x, Tag), start_element.children)
                        )
                    elif direction == "siblings":
                        elements = list(
                            filter(
                                lambda x: isinstance(x, Tag),
                                list(start_element.next_siblings)
                                + list(start_element.previous_siblings),
                            )
                        )
                    elif direction == "descendants":
                        elements = list(
                            filter(lambda x: isinstance(x, Tag), start_element.descendants)
                        )
                    elif direction == "ancestors":
                        elements = list(
                            filter(lambda x: isinstance(x, Tag), start_element.parents)
                        )

                    # Process elements
                    progress.update(nav_task, advance=30)
                    result = [extract_element_data(el) for el in elements if el]

                    # Create visual representation
                    if result:
                        # Create element tree
                        nav_tree = Tree(
                            f"[bold cyan]🌐 {direction.title()} of {start_element.name}[/]"
                        )

                        for idx, element in enumerate(elements, 1):
                            if isinstance(element, Tag):
                                # Create node label
                                classes = element.get("class", [])
                                class_str = f" .{'.'.join(classes)}" if classes else ""
                                id_str = f" #{element['id']}" if element.get("id") else ""

                                # Add text preview if available
                                text = clean_text(element.string)
                                text_preview = (
                                    f": {text[:30]}..."
                                    if text and len(text) > 30
                                    else f": {text}" if text else ""
                                )

                                node_label = (
                                    f"{element.name}{class_str}{id_str}{text_preview}"
                                )
                                element_node = nav_tree.add(f"[bold]{idx}.[/] {node_label}")

                                # Add attribute information
                                if element.attrs:
                                    attrs_node = element_node.add("[yellow]Attributes[/]")
                                    for attr, value in element.attrs.items():
                                        attrs_node.add(f"[green]{attr}[/]: {value}")

                        console.print("\n[bold cyan]Navigation Results:[/]")
                        console.print(nav_tree)

                        # Create statistics table
                        stats_table = Table(show_header=True, header_style="bold magenta")
                        stats_table.add_column("Statistic")
                        stats_table.add_column("Value")

                        tag_counts = {}
                        total_attrs = 0
                        total_text_length = 0

                        for el in elements:
                            if isinstance(el, Tag):
                                tag_counts[el.name] = tag_counts.get(el.name, 0) + 1
                                total_attrs += len(el.attrs)
                                total_text_length += len(clean_text(el.text))

                        stats_table.add_row("Total Elements", str(len(elements)))
                        stats_table.add_row("Unique Tags", str(len(tag_counts)))
                        stats_table.add_row("Total Attributes", str(total_attrs))
                        stats_table.add_row("Total Text Length", str(total_text_length))

                        console.print("\n[bold cyan]Element Statistics:[/]")
                        console.print(stats_table)
                    else:
                        console.print(
                            "\n[bold yellow]! No elements found in specified direction[/]"
                        )

                    # Complete progress
                    progress.update(nav_task, advance=20)

                    console.print("\n[bold green]✓ Navigation complete![/]")

        return {
            "status": "success",
            "content": [
                {"text": f"Action '{action}' completed successfully"},
                {"text": f"Results: {json.dumps(result, indent=2)}"},
            ],
        }

    except Exception as e:
        return {
            "status": "error",
            "content": [{"text": f"Error: {str(e)}"}],
        }
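
For orientation, the following is a minimal, hypothetical sketch of how the new scraper tool might be invoked, based only on the docstring and return shape shown in the listing above. The import path devduck.tools.scraper, direct callability of the @tool-decorated function, and the modifications keys ("target", "action", "content") are assumptions inferred from the code, not something this diff confirms.

# Hypothetical usage sketch; not part of the published package.
from devduck.tools.scraper import scraper  # assumed import path

html = "<html><body><h1>Title</h1><p>Hello</p><a href='/docs'>Docs</a></body></html>"

# Find elements with a CSS selector (find_params as described in the docstring).
found = scraper(action="find", content=html, find_params={"selector": "a[href]"})
print(found["status"])  # "success" or "error"

# Apply a modification; each entry carries a CSS selector "target" plus an "action"
# and, for insert/append/replace, a "content" fragment (inferred from the modify branch).
modified = scraper(
    action="modify",
    content=html,
    modifications=[{"target": "p", "action": "replace", "content": "<p>Bye</p>"}],
)

# Fetch and analyze a live page (requires network access).
page = scraper(action="scrape_url", url="https://example.com")
print(page["content"][0]["text"])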