agentle 0.9.25__py3-none-any.whl → 0.9.26__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentle/agents/apis/api.py +25 -7
- agentle/agents/apis/endpoint.py +14 -8
- agentle/web/extractor.py +277 -164
- {agentle-0.9.25.dist-info → agentle-0.9.26.dist-info}/METADATA +1 -1
- {agentle-0.9.25.dist-info → agentle-0.9.26.dist-info}/RECORD +7 -7
- {agentle-0.9.25.dist-info → agentle-0.9.26.dist-info}/WHEEL +0 -0
- {agentle-0.9.25.dist-info → agentle-0.9.26.dist-info}/licenses/LICENSE +0 -0
agentle/agents/apis/api.py
CHANGED
@@ -14,6 +14,7 @@ Provides advanced features for managing collections of related endpoints with:
 from __future__ import annotations
 
 import logging
+import re
 from collections.abc import (
     Coroutine,
     Mapping,
@@ -513,13 +514,30 @@ class API(BaseModel):
                 continue
 
             # Create endpoint
-
-
-
-
-
-
-
+            # Generate a valid function name from the path
+            if operation_id:
+                endpoint_name = operation_id
+            else:
+                # Clean the path to create a valid function name
+                # Remove leading/trailing slashes and replace special chars
+                clean_path = (
+                    path.strip("/")
+                    .replace("/", "_")
+                    .replace("{", "")
+                    .replace("}", "")
+                    .replace("-", "_")
+                )
+                # Remove any consecutive underscores
+                clean_path = re.sub(r"_+", "_", clean_path)
+                # Ensure it doesn't start with a number
+                if clean_path and clean_path[0].isdigit():
+                    clean_path = f"n{clean_path}"
+                # If empty after cleaning, use a generic name
+                if not clean_path:
+                    clean_path = "root"
+                endpoint_name = f"{method.lower()}_{clean_path}"
+
+            endpoint_name = cast(str, endpoint_name)
 
             endpoint_description: str = cast(
                 str,
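The fallback naming added here is pure string manipulation, so its behavior can be checked in isolation. A minimal standalone sketch of the same cleaning steps (the endpoint_name_for helper is hypothetical, for illustration only):

import re


def endpoint_name_for(method: str, path: str, operation_id: str | None = None) -> str:
    # Mirrors the fallback added in api.py: an explicit operation_id wins,
    # otherwise the URL path is sanitized into a valid Python identifier.
    if operation_id:
        return operation_id
    clean_path = (
        path.strip("/")
        .replace("/", "_")
        .replace("{", "")
        .replace("}", "")
        .replace("-", "_")
    )
    clean_path = re.sub(r"_+", "_", clean_path)  # collapse consecutive underscores
    if clean_path and clean_path[0].isdigit():  # identifiers cannot start with a digit
        clean_path = f"n{clean_path}"
    if not clean_path:  # e.g. the path was just "/"
        clean_path = "root"
    return f"{method.lower()}_{clean_path}"


assert endpoint_name_for("GET", "/users/{user-id}/posts") == "get_users_user_id_posts"
assert endpoint_name_for("POST", "/") == "post_root"
assert endpoint_name_for("GET", "/2fa/verify") == "get_n2fa_verify"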
agentle/agents/apis/endpoint.py
CHANGED
@@ -359,7 +359,7 @@ class Endpoint(BaseModel):
         await self._auth_handler.refresh_if_needed()
         await self._auth_handler.apply_auth(None, url, headers, query_params)  # type: ignore
 
-        # Prepare connector
+        # Prepare connector kwargs (will be used to create fresh connector for each attempt)
         connector_kwargs: dict[str, Any] = {
             "limit": 10,
             "limit_per_host": 5,
@@ -369,8 +369,6 @@ class Endpoint(BaseModel):
         if not self.request_config.verify_ssl:
             connector_kwargs["ssl"] = False
 
-        connector = aiohttp.TCPConnector(**connector_kwargs)
-
         # Prepare timeout
         timeout = aiohttp.ClientTimeout(
             total=self.request_config.timeout,
@@ -381,9 +379,11 @@ class Endpoint(BaseModel):
         # Define the request function for circuit breaker
         async def make_single_request() -> Any:
             """Make a single request attempt."""
-
-
-
+            # Create a fresh connector for each request attempt to avoid "Session is closed" errors on retries
+            connector = aiohttp.TCPConnector(**connector_kwargs)
+            session = None
+            try:
+                session = aiohttp.ClientSession(connector=connector, timeout=timeout)
                 # Prepare request kwargs
                 request_kwargs: dict[str, Any] = {
                     "headers": headers,
@@ -486,6 +486,12 @@ class Endpoint(BaseModel):
                     await self._response_cache.set(url, kwargs, result)
 
                 return result
+            finally:
+                # Always close the session to prevent "Session is closed" errors on retries
+                if session is not None:
+                    await session.close()
+                # Give the connector time to close properly
+                await asyncio.sleep(0.01)
 
         # Execute with retries
         last_exception = None
@@ -569,11 +575,11 @@ class Endpoint(BaseModel):
 
         if hasattr(param, "enum") and param.enum:
             param_info["enum"] = list(param.enum)
-
+
         # Add constraints for number/primitive types
         if hasattr(param, "parameter_schema") and param.parameter_schema:
             from agentle.agents.apis.primitive_schema import PrimitiveSchema
-
+
             schema = param.parameter_schema
             # Only PrimitiveSchema has minimum, maximum, format
             if isinstance(schema, PrimitiveSchema):
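The point of the change above is that an aiohttp.ClientSession cannot be reused once closed, so a retry loop sharing one session raises "Session is closed" on the second attempt. A minimal sketch of the fresh-connector-per-attempt pattern in isolation (fetch_with_retries is a hypothetical helper, not Agentle's API; the real make_single_request also handles caching and the circuit breaker):

import asyncio

import aiohttp


async def fetch_with_retries(url: str, max_retries: int = 3) -> str:
    last_exc: Exception | None = None
    for _attempt in range(max_retries):
        # Fresh connector and session per attempt, as in the diff above.
        connector = aiohttp.TCPConnector(limit=10, limit_per_host=5)
        session = None
        try:
            session = aiohttp.ClientSession(
                connector=connector,
                timeout=aiohttp.ClientTimeout(total=30),
            )
            async with session.get(url) as resp:
                resp.raise_for_status()
                return await resp.text()
        except (aiohttp.ClientError, asyncio.TimeoutError) as e:
            last_exc = e
        finally:
            # Always close, so a failed attempt never leaks a half-open session.
            if session is not None:
                await session.close()
            await asyncio.sleep(0.01)  # give the connector time to close properly
    raise last_exc if last_exc is not None else RuntimeError("no attempts made")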
agentle/web/extractor.py
CHANGED
@@ -1,8 +1,11 @@
+from __future__ import annotations
+
+import asyncio
 from collections.abc import Sequence
 from textwrap import dedent
+from typing import TYPE_CHECKING
 
 from html_to_markdown import convert
-from playwright.async_api import Geolocation, ViewportSize
 from rsb.coroutines.run_sync import run_sync
 from rsb.models import Field
 from rsb.models.base_model import BaseModel
@@ -18,6 +21,10 @@ from agentle.web.actions.action import Action
 from agentle.web.extraction_preferences import ExtractionPreferences
 from agentle.web.extraction_result import ExtractionResult
 
+if TYPE_CHECKING:
+    from playwright.async_api import Browser, Geolocation, ViewportSize
+
+
 _INSTRUCTIONS = Prompt.from_text(
     dedent("""\
     <character>
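Moving the playwright imports behind TYPE_CHECKING (combined with the new `from __future__ import annotations`) lets this module be imported without playwright installed; the dependency is only touched when a Browser is actually used, which matches the @needs("playwright") guard added further down. A small sketch of the pattern, independent of this module:

from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Seen only by type checkers; playwright need not be installed
    # just to import this module.
    from playwright.async_api import Browser


async def open_page(browser: Browser, url: str) -> str:
    # With postponed evaluation, the "Browser" annotation stays a string at
    # runtime, so no playwright import happens until a caller provides one.
    page = await browser.new_page()
    await page.goto(url)
    return await page.content()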
@@ -64,31 +71,28 @@ class Extractor(BaseModel):
 
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
-    def
+    def extract_markdown(
         self,
+        browser: Browser,
         urls: Sequence[str],
-        output: type[T],
-        prompt: str | None = None,
         extraction_preferences: ExtractionPreferences | None = None,
         ignore_invalid_urls: bool = True,
-    ) ->
+    ) -> tuple[str, str]:
         return run_sync(
-            self.
-
-
+            self.extract_markdown_async,
+            browser=browser,
+            urls=urls,
+            extraction_preferences=extraction_preferences,
+            ignore_invalid_urls=ignore_invalid_urls,
         )
 
-
-    async def extract_async[T: BaseModel](
+    async def extract_markdown_async(
         self,
+        browser: Browser,
         urls: Sequence[str],
-        output: type[T],
-        prompt: str | None = None,
         extraction_preferences: ExtractionPreferences | None = None,
         ignore_invalid_urls: bool = True,
-    ) ->
-        from playwright import async_api
-
+    ) -> tuple[str, str]:
         _preferences = extraction_preferences or ExtractionPreferences()
         _actions: Sequence[Action] = _preferences.actions or []
 
@@ -98,171 +102,272 @@ class Extractor(BaseModel):
             # This is a placeholder for proxy configuration
             pass
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        geolocation
-
-
-        geolocation = Geolocation(
-            latitude=getattr(_preferences.location, "latitude", 0),
-            longitude=getattr(_preferences.location, "longitude", 0),
-        )
-        permissions = ["geolocation"]
-
-        context = await browser.new_context(
-            viewport=viewport,
-            user_agent=user_agent,
-            is_mobile=is_mobile,
-            extra_http_headers=_preferences.headers,
-            ignore_https_errors=_preferences.skip_tls_verification,
-            geolocation=geolocation,
-            permissions=permissions,
+        # Build context options properly based on preferences
+        if _preferences.mobile:
+            viewport: ViewportSize | None = ViewportSize(width=375, height=667)
+            user_agent = "Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X) AppleWebKit/605.1.15"
+            is_mobile = True
+        else:
+            viewport = None
+            user_agent = None
+            is_mobile = None
+
+        # Handle geolocation
+        geolocation: Geolocation | None = None
+        permissions = None
+        if _preferences.location:
+            geolocation = Geolocation(
+                latitude=getattr(_preferences.location, "latitude", 0),
+                longitude=getattr(_preferences.location, "longitude", 0),
             )
+            permissions = ["geolocation"]
+
+        context = await browser.new_context(
+            viewport=viewport,
+            user_agent=user_agent,
+            is_mobile=is_mobile,
+            extra_http_headers=_preferences.headers,
+            ignore_https_errors=_preferences.skip_tls_verification,
+            geolocation=geolocation,
+            permissions=permissions,
+        )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            )
-            else route.continue_(),
+        # Block ads if specified
+        if _preferences.block_ads:
+            await context.route(
+                "**/*",
+                lambda route: route.abort()
+                if route.request.resource_type in ["image", "media", "font"]
+                and any(
+                    ad_domain in route.request.url
+                    for ad_domain in [
+                        "doubleclick.net",
+                        "googlesyndication.com",
+                        "adservice.google.com",
+                        "ads",
+                        "analytics",
+                        "tracking",
+                    ]
                 )
+                else route.continue_(),
+            )
+
+        page = await context.new_page()
+
+        for url in urls:
+            # Set timeout if specified
+            timeout = _preferences.timeout_ms if _preferences.timeout_ms else 30000
 
-
+            try:
+                await page.goto(url, timeout=timeout)
 
-
-
-
+                # Wait for specified time if configured
+                if _preferences.wait_for_ms:
+                    await page.wait_for_timeout(_preferences.wait_for_ms)
 
-
-
+                # Execute actions
+                for action in _actions:
+                    await action.execute(page)
 
-
-
-
+            except Exception as e:
+                if ignore_invalid_urls:
+                    print(f"Warning: Failed to load {url}: {e}")
+                    continue
+                else:
+                    raise
 
-
-        for action in _actions:
-            await action.execute(page)
+        html = await page.content()
 
-
-
-
-
-
-
+        # Process HTML based on preferences - consolidate all BeautifulSoup operations
+        if (
+            _preferences.remove_base_64_images
+            or _preferences.include_tags
+            or _preferences.exclude_tags
+            or _preferences.only_main_content
+        ):
+            from bs4 import BeautifulSoup
 
-
+            soup = BeautifulSoup(html, "html.parser")
 
-            #
+            # Remove base64 images first
             if _preferences.remove_base_64_images:
                 import re
 
-
-
-
-
-
+                # Debug: Check what we have before processing
+                all_imgs = soup.find_all("img")
+                print(f"DEBUG: Found {len(all_imgs)} img tags total")
+                base64_count = 0
+                for img in all_imgs:
+                    src = img.attrs.get("src") if hasattr(img, "attrs") else None  # type: ignore[union-attr]
+                    if isinstance(src, str) and "data:image/" in src:
+                        base64_count += 1
+                        print(f"DEBUG: Found base64 img: {src[:100]}...")
+                print(f"DEBUG: {base64_count} images have base64 data")
+
+                # First, remove any anchor tags that contain img children with base64
+                # (must be done before removing img tags themselves)
+                removed_anchors = 0
+                for a_tag in soup.find_all("a"):
+                    imgs = a_tag.find_all("img")  # type: ignore[union-attr]
+                    for img in imgs:
+                        src = img.attrs.get("src") if hasattr(img, "attrs") else None  # type: ignore[union-attr]
+                        if isinstance(src, str) and src.startswith("data:image/"):
+                            # Remove the entire anchor tag if it contains base64 image
+                            a_tag.decompose()
+                            removed_anchors += 1
+                            break
+                print(
+                    f"DEBUG: Removed {removed_anchors} anchor tags with base64 images"
                 )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        # Prepare and send prompt
-        _prompt = _PROMPT.compile(
-            user_instructions=prompt or "Not provided.", markdown=markdown
-        )
-
-        if isinstance(self.llm, GenerationProvider):
-            response = await self.llm.generate_by_prompt_async(
-                prompt=_prompt,
-                model=self.model,
-                developer_prompt=_INSTRUCTIONS,
-                response_schema=output,
+                # Remove standalone img tags with base64 src
+                removed_imgs = 0
+                for img in soup.find_all("img"):
+                    src = img.attrs.get("src") if hasattr(img, "attrs") else None  # type: ignore[union-attr]
+                    if isinstance(src, str) and src.startswith("data:image/"):
+                        img.decompose()
+                        removed_imgs += 1
+                print(f"DEBUG: Removed {removed_imgs} standalone img tags")
+
+                # Remove any element with base64 in href (like anchor tags with image data)
+                for elem in soup.find_all(attrs={"href": True}):
+                    href = elem.attrs.get("href") if hasattr(elem, "attrs") else None  # type: ignore[union-attr]
+                    if isinstance(href, str) and href.startswith("data:image/"):
+                        elem.decompose()
+
+                # Remove any element with base64 in style attribute
+                for elem in soup.find_all(attrs={"style": True}):
+                    style = elem.attrs.get("style") if hasattr(elem, "attrs") else None  # type: ignore[union-attr]
+                    if isinstance(style, str) and "data:image/" in style:
+                        elem.decompose()
+
+                # Remove SVG tags (they often contain base64 or are converted to base64 by markdown)
+                for svg in soup.find_all("svg"):
+                    svg.decompose()
+
+                # Remove any anchor tags that contain SVG children
+                for a_tag in soup.find_all("a"):
+                    if a_tag.find("svg"):  # type: ignore[union-attr]
+                        a_tag.decompose()
+
+                # Final check: see if any base64 remains in the HTML string
+                html_str = str(soup)
+                remaining = len(re.findall(r'data:image/[^"\')\s]+', html_str))
+                print(
+                    f"DEBUG: After processing, {remaining} base64 data URIs remain in HTML"
                 )
-
-
-
-
-
-
-
+
+            # Extract main content if requested
+            if _preferences.only_main_content:
+                main_content = (
+                    soup.find("main")
+                    or soup.find("article")
+                    or soup.find("div", {"id": "content"})
+                    or soup.find("div", {"class": "content"})
                 )
+                if main_content:
+                    soup = main_content  # type: ignore[assignment]
 
-
-
-
-
-
+            # Exclude specific tags
+            if _preferences.exclude_tags:
+                for tag in _preferences.exclude_tags:
+                    for element in soup.find_all(tag):  # type: ignore[union-attr]
+                        element.decompose()
+
+            # Include only specific tags
+            if _preferences.include_tags:
+                new_soup = BeautifulSoup("", "html.parser")
+                for tag in _preferences.include_tags:
+                    for element in soup.find_all(tag):  # type: ignore[union-attr]
+                        new_soup.append(element)  # type: ignore[arg-type]
+                soup = new_soup
+
+            html = str(soup)
 
-
+        # Convert to markdown
+        markdown = convert(html)
+        return html, markdown
 
-
+    def extract[T: BaseModel](
+        self,
+        browser: Browser,
+        urls: Sequence[str],
+        output: type[T],
+        prompt: str | None = None,
+        extraction_preferences: ExtractionPreferences | None = None,
+        ignore_invalid_urls: bool = True,
+    ) -> ExtractionResult[T]:
+        return run_sync(
+            self.extract_async(
+                browser=browser,
                 urls=urls,
-
-
-            extraction_preferences=
-
+                output=output,
+                prompt=prompt,
+                extraction_preferences=extraction_preferences,
+                ignore_invalid_urls=ignore_invalid_urls,
             )
+        )
 
+    @needs("playwright")
+    async def extract_async[T: BaseModel](
+        self,
+        browser: Browser,
+        urls: Sequence[str],
+        output: type[T],
+        prompt: str | None = None,
+        extraction_preferences: ExtractionPreferences | None = None,
+        ignore_invalid_urls: bool = True,
+    ) -> ExtractionResult[T]:
+        _preferences = extraction_preferences or ExtractionPreferences()
 
-
+        html, markdown = await self.extract_markdown_async(
+            browser=browser,
+            urls=urls,
+            extraction_preferences=_preferences,
+            ignore_invalid_urls=ignore_invalid_urls,
+        )
+
+        # Prepare and send prompt
+        _prompt = _PROMPT.compile(
+            user_instructions=prompt or "Not provided.", markdown=markdown
+        )
+
+        if isinstance(self.llm, GenerationProvider):
+            response = await self.llm.generate_by_prompt_async(
+                prompt=_prompt,
+                model=self.model,
+                developer_prompt=_INSTRUCTIONS,
+                response_schema=output,
+            )
+        else:
+            response = await self.llm.respond_async(
+                input=_prompt,
+                model=self.model,
+                instructions=_INSTRUCTIONS,
+                reasoning=self.reasoning,
+                text_format=output,
+            )
+
+        output_parsed = (
+            response.parsed
+            if isinstance(response, Generation)
+            else response.output_parsed
+        )
+
+        await browser.close()
+
+        return ExtractionResult[T](
+            urls=urls,
+            html=html,
+            markdown=markdown,
+            extraction_preferences=_preferences,
+            output_parsed=output_parsed,
+        )
+
+
+async def test() -> None:
     from dotenv import load_dotenv
+    from playwright import async_api
 
     load_dotenv()
 
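The base64-cleanup pass above is easy to exercise on a small document. A sketch of just the removal steps, without the DEBUG prints, assuming bs4 is installed (the sample HTML is illustrative):

from bs4 import BeautifulSoup

html = (
    '<main><a href="#"><img src="data:image/png;base64,iVBOR..."/></a>'
    '<img src="https://example.com/logo.png"/>'
    '<svg><circle r="1"/></svg><p>kept text</p></main>'
)
soup = BeautifulSoup(html, "html.parser")

# Order matters: anchors wrapping base64 images go first, then standalone
# base64 imgs, then SVG elements (often rendered to base64 by markdown).
for a_tag in soup.find_all("a"):
    if any(
        str(img.get("src") or "").startswith("data:image/")
        for img in a_tag.find_all("img")
    ):
        a_tag.decompose()
for img in soup.find_all("img"):
    if str(img.get("src") or "").startswith("data:image/"):
        img.decompose()
for svg in soup.find_all("svg"):
    svg.decompose()

print(soup)  # only the external image and the paragraph should remain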
@@ -272,8 +377,8 @@ if __name__ == "__main__":
     possiveis_redirecionamentos: list[str]
 
     extractor = Extractor(
-        llm=Responder.
-        model="
+        llm=Responder.openrouter(),
+        model="google/gemini-2.5-flash",
     )
 
     # Example with custom extraction preferences
@@ -285,12 +390,20 @@
         timeout_ms=15000,
     )
 
-
-
-
-
-
-
+    async with async_api.async_playwright() as p:
+        browser = await p.chromium.launch(headless=True)
+
+        result = await extractor.extract_async(
+            browser=browser,
+            urls=[site_uniube],
+            output=PossiveisRedirecionamentos,
+            prompt="Extract the possible redirects from the page.",
+            extraction_preferences=preferences,
+        )
+
+        for link in result.output_parsed.possiveis_redirecionamentos:
+            print(f"Link: {link}")
 
-
-
+
+if __name__ == "__main__":
+    asyncio.run(test())
{agentle-0.9.25.dist-info → agentle-0.9.26.dist-info}/RECORD
CHANGED
@@ -63,7 +63,7 @@ agentle/agents/a2a/tasks/managment/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQe
 agentle/agents/a2a/tasks/managment/in_memory.py,sha256=_G5VuXqEPBMtE6XJg1d7WmqFr1qzd0-99FoqM_qMwAE,23841
 agentle/agents/a2a/tasks/managment/task_manager.py,sha256=rBCuzu4DqIs55xDnwXY0w5Rs9ybv6OJpgpugAQLhtoU,3112
 agentle/agents/apis/__init__.py,sha256=PX7oAe0hRGvyLB295DrBF1VBsqgp5ZmGI4BCZvLUozo,2811
-agentle/agents/apis/api.py,sha256=
+agentle/agents/apis/api.py,sha256=PyYcZJJR0f8c7VceQMLzUfnvj3belNbCJJNmVdXfdXo,26610
 agentle/agents/apis/api_key_authentication.py,sha256=MtMA4qkCjM3ou42a1fDgKI4u3NQkB_Zr-gEA5_oysZ0,1311
 agentle/agents/apis/api_key_location.py,sha256=0pj_8rTkd0pkUJ2eP_Kur3AvT3JD8JpFIxQsWDzeg_c,188
 agentle/agents/apis/api_metrics.py,sha256=SyvJdvEMKp7rGij-tDZ6vxjwc26MIFrKrcckMc4Q1Zg,380
@@ -78,7 +78,7 @@ agentle/agents/apis/cache_strategy.py,sha256=uoAvmUm1EE8426anuLR4PoygLgO4SPN1Qm_
 agentle/agents/apis/circuit_breaker.py,sha256=9yopLPoZ6WMlPMWPoacayORoGN1CZq2EGWB0gCu3GOY,2371
 agentle/agents/apis/circuit_breaker_error.py,sha256=I5XyCWwFCXTDzhvS7CpMZwBRxMyCk96g7MfBLBqAvhg,127
 agentle/agents/apis/circuit_breaker_state.py,sha256=6IwcWWKNmE0cnpogcfsnhpy_EVIk7crQ7WiDN9YkkbE,277
-agentle/agents/apis/endpoint.py,sha256=
+agentle/agents/apis/endpoint.py,sha256=4skIMje2oEbc6ONb-Ww66wh8VGP5tj1JZACE3p7U62E,22600
 agentle/agents/apis/endpoint_parameter.py,sha256=A_SVje6AyNeeJNDxWL__uGc4ZNZ2se6GvawS--kUYEU,20230
 agentle/agents/apis/endpoints_to_tools.py,sha256=5KzxRLjfUYx7MGKHe65NMPHyTKOcd8tdK0uYzfGnO5g,999
 agentle/agents/apis/file_upload.py,sha256=PzJ1197EKLCdBHrLDztgifM3WWFQZ8K2CKkTeeYpJas,587
@@ -1003,7 +1003,7 @@ agentle/voice_cloning/voice_cloner.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJW
 agentle/web/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 agentle/web/extraction_preferences.py,sha256=Xb4X6ZgnbDuu4Pp7cI0sdPcv6LaR1Q94FPTNEoHVTGg,985
 agentle/web/extraction_result.py,sha256=IsbRdT_wA9RVYGToCiz17XRoWMTtiFzxky96Zwqa4ZY,318
-agentle/web/extractor.py,sha256=
+agentle/web/extractor.py,sha256=ISNYVoofry47HtB0oDhmb2Eof15ZhTL-qdyes1mYSbQ,15585
 agentle/web/location.py,sha256=RZgqb2rW7wUdcbw3PnmDtfr4FkTSSovW0j70ZOvoRiw,64
 agentle/web/actions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 agentle/web/actions/action.py,sha256=krxW5vXaqB1_JfnPpuo5cVJyANrlElu9P0B0TrF_aZs,723
@@ -1017,7 +1017,7 @@ agentle/web/actions/scroll.py,sha256=WqVVAORNDK3BL1oASZBPmXJYeSVkPgAOmWA8ibYO82I
 agentle/web/actions/viewport.py,sha256=KCwm88Pri19Qc6GLHC69HsRxmdJz1gEEAODfggC_fHo,287
 agentle/web/actions/wait.py,sha256=IKEywjf-KC4ni9Gkkv4wgc7bY-hk7HwD4F-OFWlyf2w,571
 agentle/web/actions/write_text.py,sha256=9mxfHcpKs_L7BsDnJvOYHQwG8M0GWe61SRJAsKk3xQ8,748
-agentle-0.9.
-agentle-0.9.
-agentle-0.9.
-agentle-0.9.
+agentle-0.9.26.dist-info/METADATA,sha256=ZCnSN_aDQlrUPevRNIgjpDU64lRUcusv_5bx3ZcDFP0,86849
+agentle-0.9.26.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+agentle-0.9.26.dist-info/licenses/LICENSE,sha256=T90S9vqRS6qP-voULxAcvwEs558wRRo6dHuZrjgcOUI,1085
+agentle-0.9.26.dist-info/RECORD,,
{agentle-0.9.25.dist-info → agentle-0.9.26.dist-info}/WHEEL
File without changes

{agentle-0.9.25.dist-info → agentle-0.9.26.dist-info}/licenses/LICENSE
File without changes