agentle 0.9.25__py3-none-any.whl → 0.9.27__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentle/agents/apis/api.py +25 -7
- agentle/agents/apis/endpoint.py +14 -8
- agentle/agents/whatsapp/models/whatsapp_response_base.py +31 -0
- agentle/agents/whatsapp/whatsapp_bot.py +53 -12
- agentle/web/extractor.py +277 -164
- {agentle-0.9.25.dist-info → agentle-0.9.27.dist-info}/METADATA +1 -1
- {agentle-0.9.25.dist-info → agentle-0.9.27.dist-info}/RECORD +9 -8
- {agentle-0.9.25.dist-info → agentle-0.9.27.dist-info}/WHEEL +0 -0
- {agentle-0.9.25.dist-info → agentle-0.9.27.dist-info}/licenses/LICENSE +0 -0
agentle/agents/apis/api.py
CHANGED
|
@@ -14,6 +14,7 @@ Provides advanced features for managing collections of related endpoints with:
|
|
|
14
14
|
from __future__ import annotations
|
|
15
15
|
|
|
16
16
|
import logging
|
|
17
|
+
import re
|
|
17
18
|
from collections.abc import (
|
|
18
19
|
Coroutine,
|
|
19
20
|
Mapping,
|
|
@@ -513,13 +514,30 @@ class API(BaseModel):
|
|
|
513
514
|
continue
|
|
514
515
|
|
|
515
516
|
# Create endpoint
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
517
|
+
# Generate a valid function name from the path
|
|
518
|
+
if operation_id:
|
|
519
|
+
endpoint_name = operation_id
|
|
520
|
+
else:
|
|
521
|
+
# Clean the path to create a valid function name
|
|
522
|
+
# Remove leading/trailing slashes and replace special chars
|
|
523
|
+
clean_path = (
|
|
524
|
+
path.strip("/")
|
|
525
|
+
.replace("/", "_")
|
|
526
|
+
.replace("{", "")
|
|
527
|
+
.replace("}", "")
|
|
528
|
+
.replace("-", "_")
|
|
529
|
+
)
|
|
530
|
+
# Remove any consecutive underscores
|
|
531
|
+
clean_path = re.sub(r"_+", "_", clean_path)
|
|
532
|
+
# Ensure it doesn't start with a number
|
|
533
|
+
if clean_path and clean_path[0].isdigit():
|
|
534
|
+
clean_path = f"n{clean_path}"
|
|
535
|
+
# If empty after cleaning, use a generic name
|
|
536
|
+
if not clean_path:
|
|
537
|
+
clean_path = "root"
|
|
538
|
+
endpoint_name = f"{method.lower()}_{clean_path}"
|
|
539
|
+
|
|
540
|
+
endpoint_name = cast(str, endpoint_name)
|
|
523
541
|
|
|
524
542
|
endpoint_description: str = cast(
|
|
525
543
|
str,
|
agentle/agents/apis/endpoint.py
CHANGED
|
@@ -359,7 +359,7 @@ class Endpoint(BaseModel):
|
|
|
359
359
|
await self._auth_handler.refresh_if_needed()
|
|
360
360
|
await self._auth_handler.apply_auth(None, url, headers, query_params) # type: ignore
|
|
361
361
|
|
|
362
|
-
# Prepare connector
|
|
362
|
+
# Prepare connector kwargs (will be used to create fresh connector for each attempt)
|
|
363
363
|
connector_kwargs: dict[str, Any] = {
|
|
364
364
|
"limit": 10,
|
|
365
365
|
"limit_per_host": 5,
|
|
@@ -369,8 +369,6 @@ class Endpoint(BaseModel):
|
|
|
369
369
|
if not self.request_config.verify_ssl:
|
|
370
370
|
connector_kwargs["ssl"] = False
|
|
371
371
|
|
|
372
|
-
connector = aiohttp.TCPConnector(**connector_kwargs)
|
|
373
|
-
|
|
374
372
|
# Prepare timeout
|
|
375
373
|
timeout = aiohttp.ClientTimeout(
|
|
376
374
|
total=self.request_config.timeout,
|
|
@@ -381,9 +379,11 @@ class Endpoint(BaseModel):
|
|
|
381
379
|
# Define the request function for circuit breaker
|
|
382
380
|
async def make_single_request() -> Any:
|
|
383
381
|
"""Make a single request attempt."""
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
382
|
+
# Create a fresh connector for each request attempt to avoid "Session is closed" errors on retries
|
|
383
|
+
connector = aiohttp.TCPConnector(**connector_kwargs)
|
|
384
|
+
session = None
|
|
385
|
+
try:
|
|
386
|
+
session = aiohttp.ClientSession(connector=connector, timeout=timeout)
|
|
387
387
|
# Prepare request kwargs
|
|
388
388
|
request_kwargs: dict[str, Any] = {
|
|
389
389
|
"headers": headers,
|
|
@@ -486,6 +486,12 @@ class Endpoint(BaseModel):
|
|
|
486
486
|
await self._response_cache.set(url, kwargs, result)
|
|
487
487
|
|
|
488
488
|
return result
|
|
489
|
+
finally:
|
|
490
|
+
# Always close the session to prevent "Session is closed" errors on retries
|
|
491
|
+
if session is not None:
|
|
492
|
+
await session.close()
|
|
493
|
+
# Give the connector time to close properly
|
|
494
|
+
await asyncio.sleep(0.01)
|
|
489
495
|
|
|
490
496
|
# Execute with retries
|
|
491
497
|
last_exception = None
|
|
@@ -569,11 +575,11 @@ class Endpoint(BaseModel):
|
|
|
569
575
|
|
|
570
576
|
if hasattr(param, "enum") and param.enum:
|
|
571
577
|
param_info["enum"] = list(param.enum)
|
|
572
|
-
|
|
578
|
+
|
|
573
579
|
# Add constraints for number/primitive types
|
|
574
580
|
if hasattr(param, "parameter_schema") and param.parameter_schema:
|
|
575
581
|
from agentle.agents.apis.primitive_schema import PrimitiveSchema
|
|
576
|
-
|
|
582
|
+
|
|
577
583
|
schema = param.parameter_schema
|
|
578
584
|
# Only PrimitiveSchema has minimum, maximum, format
|
|
579
585
|
if isinstance(schema, PrimitiveSchema):
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# Em: agentle/agents/whatsapp/models/whatsapp_response_base.py
|
|
2
|
+
|
|
3
|
+
from rsb.models.base_model import BaseModel
|
|
4
|
+
from rsb.models.field import Field
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class WhatsAppResponseBase(BaseModel):
|
|
8
|
+
"""
|
|
9
|
+
Base class for WhatsApp bot structured responses.
|
|
10
|
+
|
|
11
|
+
This class ensures that all structured outputs from the WhatsApp bot
|
|
12
|
+
contain a 'response' field with the text to be sent to the user.
|
|
13
|
+
|
|
14
|
+
Developers can extend this class to add additional structured data
|
|
15
|
+
that they want to extract from the conversation.
|
|
16
|
+
|
|
17
|
+
Example:
|
|
18
|
+
```python
|
|
19
|
+
class CustomerServiceResponse(WhatsAppResponseBase):
|
|
20
|
+
response: str # Inherited - text to send to user
|
|
21
|
+
sentiment: Literal["happy", "neutral", "frustrated", "angry"]
|
|
22
|
+
urgency: int = Field(ge=1, le=5, description="Urgency level 1-5")
|
|
23
|
+
requires_human: bool = False
|
|
24
|
+
suggested_actions: list[str] = Field(default_factory=list)
|
|
25
|
+
```
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
response: str = Field(
|
|
29
|
+
...,
|
|
30
|
+
description="The text response that will be sent to the WhatsApp user. This field is required.",
|
|
31
|
+
)
|
|
@@ -33,6 +33,7 @@ from agentle.agents.whatsapp.models.whatsapp_document_message import (
|
|
|
33
33
|
from agentle.agents.whatsapp.models.whatsapp_image_message import WhatsAppImageMessage
|
|
34
34
|
from agentle.agents.whatsapp.models.whatsapp_media_message import WhatsAppMediaMessage
|
|
35
35
|
from agentle.agents.whatsapp.models.whatsapp_message import WhatsAppMessage
|
|
36
|
+
from agentle.agents.whatsapp.models.whatsapp_response_base import WhatsAppResponseBase
|
|
36
37
|
from agentle.agents.whatsapp.models.whatsapp_session import WhatsAppSession
|
|
37
38
|
from agentle.agents.whatsapp.models.whatsapp_text_message import WhatsAppTextMessage
|
|
38
39
|
from agentle.agents.whatsapp.models.whatsapp_video_message import WhatsAppVideoMessage
|
|
@@ -128,14 +129,42 @@ class CallbackWithContext:
|
|
|
128
129
|
context: dict[str, Any] = field(default_factory=dict)
|
|
129
130
|
|
|
130
131
|
|
|
131
|
-
class WhatsAppBot(BaseModel):
|
|
132
|
+
class WhatsAppBot[T_Schema: WhatsAppResponseBase = WhatsAppResponseBase](BaseModel):
|
|
132
133
|
"""
|
|
133
134
|
WhatsApp bot that wraps an Agentle agent with enhanced message batching and spam protection.
|
|
134
135
|
|
|
135
|
-
Now
|
|
136
|
+
Now supports structured outputs through generic type parameter T_Schema.
|
|
137
|
+
The schema must extend WhatsAppResponseBase to ensure a 'response' field is always present.
|
|
138
|
+
|
|
139
|
+
Examples:
|
|
140
|
+
```python
|
|
141
|
+
# Basic usage (no structured output)
|
|
142
|
+
agent = Agent(...)
|
|
143
|
+
bot = WhatsAppBot(agent=agent, provider=provider)
|
|
144
|
+
|
|
145
|
+
# With structured output
|
|
146
|
+
class MyResponse(WhatsAppResponseBase):
|
|
147
|
+
sentiment: Literal["happy", "sad", "neutral"]
|
|
148
|
+
urgency_level: int
|
|
149
|
+
|
|
150
|
+
agent = Agent[MyResponse](
|
|
151
|
+
response_schema=MyResponse,
|
|
152
|
+
instructions="Extract sentiment and urgency from the conversation..."
|
|
153
|
+
)
|
|
154
|
+
bot = WhatsAppBot[MyResponse](agent=agent, provider=provider)
|
|
155
|
+
|
|
156
|
+
# Access structured data in callbacks
|
|
157
|
+
async def my_callback(phone, chat_id, response, context):
|
|
158
|
+
if response and response.parsed:
|
|
159
|
+
print(f"Sentiment: {response.parsed.sentiment}")
|
|
160
|
+
print(f"Urgency: {response.parsed.urgency_level}")
|
|
161
|
+
# response.parsed.response is automatically sent to WhatsApp
|
|
162
|
+
|
|
163
|
+
bot.add_response_callback(my_callback)
|
|
164
|
+
```
|
|
136
165
|
"""
|
|
137
166
|
|
|
138
|
-
agent: Agent[
|
|
167
|
+
agent: Agent[T_Schema]
|
|
139
168
|
provider: WhatsAppProvider
|
|
140
169
|
tts_provider: TtsProvider | None = Field(default=None)
|
|
141
170
|
file_storage_manager: FileStorageManager | None = Field(default=None)
|
|
@@ -1302,7 +1331,7 @@ class WhatsAppBot(BaseModel):
|
|
|
1302
1331
|
|
|
1303
1332
|
async def _process_message_batch(
|
|
1304
1333
|
self, phone_number: PhoneNumber, session: WhatsAppSession, processing_token: str
|
|
1305
|
-
) -> GeneratedAssistantMessage[
|
|
1334
|
+
) -> GeneratedAssistantMessage[T_Schema] | None:
|
|
1306
1335
|
"""Process a batch of messages for a user with enhanced timeout protection.
|
|
1307
1336
|
|
|
1308
1337
|
This method processes multiple messages that were received in quick succession
|
|
@@ -1504,7 +1533,7 @@ class WhatsAppBot(BaseModel):
|
|
|
1504
1533
|
message: WhatsAppMessage,
|
|
1505
1534
|
session: WhatsAppSession,
|
|
1506
1535
|
chat_id: ChatId | None = None,
|
|
1507
|
-
) -> GeneratedAssistantMessage[
|
|
1536
|
+
) -> GeneratedAssistantMessage[T_Schema]:
|
|
1508
1537
|
"""Process a single message immediately with quote message support."""
|
|
1509
1538
|
logger.info(
|
|
1510
1539
|
"[SINGLE_MESSAGE] ═══════════ SINGLE MESSAGE PROCESSING START ═══════════"
|
|
@@ -2207,7 +2236,7 @@ class WhatsAppBot(BaseModel):
|
|
|
2207
2236
|
async def _send_response(
|
|
2208
2237
|
self,
|
|
2209
2238
|
to: PhoneNumber,
|
|
2210
|
-
response: GeneratedAssistantMessage[
|
|
2239
|
+
response: GeneratedAssistantMessage[T_Schema] | str,
|
|
2211
2240
|
reply_to: str | None = None,
|
|
2212
2241
|
) -> None:
|
|
2213
2242
|
"""Send response message(s) to user with enhanced error handling and retry logic.
|
|
@@ -2255,12 +2284,24 @@ class WhatsAppBot(BaseModel):
|
|
|
2255
2284
|
... reply_to="msg_123"
|
|
2256
2285
|
... )
|
|
2257
2286
|
"""
|
|
2258
|
-
|
|
2259
|
-
|
|
2260
|
-
|
|
2261
|
-
if
|
|
2262
|
-
|
|
2263
|
-
|
|
2287
|
+
response_text = ""
|
|
2288
|
+
|
|
2289
|
+
if isinstance(response, GeneratedAssistantMessage):
|
|
2290
|
+
# Check if we have structured output (parsed)
|
|
2291
|
+
if response.parsed:
|
|
2292
|
+
# Use the 'response' field from structured output
|
|
2293
|
+
response_text = response.parsed.response
|
|
2294
|
+
logger.debug(
|
|
2295
|
+
"[SEND_RESPONSE] Using structured output 'response' field "
|
|
2296
|
+
+ f"(schema: {type(response.parsed).__name__})"
|
|
2297
|
+
)
|
|
2298
|
+
else:
|
|
2299
|
+
# Fallback to text field
|
|
2300
|
+
response_text = response.text
|
|
2301
|
+
logger.debug("[SEND_RESPONSE] Using standard text response")
|
|
2302
|
+
else:
|
|
2303
|
+
# Direct string
|
|
2304
|
+
response_text = response
|
|
2264
2305
|
|
|
2265
2306
|
# Apply WhatsApp-specific markdown formatting
|
|
2266
2307
|
response_text = self._format_whatsapp_markdown(response_text)
|
agentle/web/extractor.py
CHANGED
|
@@ -1,8 +1,11 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
1
4
|
from collections.abc import Sequence
|
|
2
5
|
from textwrap import dedent
|
|
6
|
+
from typing import TYPE_CHECKING
|
|
3
7
|
|
|
4
8
|
from html_to_markdown import convert
|
|
5
|
-
from playwright.async_api import Geolocation, ViewportSize
|
|
6
9
|
from rsb.coroutines.run_sync import run_sync
|
|
7
10
|
from rsb.models import Field
|
|
8
11
|
from rsb.models.base_model import BaseModel
|
|
@@ -18,6 +21,10 @@ from agentle.web.actions.action import Action
|
|
|
18
21
|
from agentle.web.extraction_preferences import ExtractionPreferences
|
|
19
22
|
from agentle.web.extraction_result import ExtractionResult
|
|
20
23
|
|
|
24
|
+
if TYPE_CHECKING:
|
|
25
|
+
from playwright.async_api import Browser, Geolocation, ViewportSize
|
|
26
|
+
|
|
27
|
+
|
|
21
28
|
_INSTRUCTIONS = Prompt.from_text(
|
|
22
29
|
dedent("""\
|
|
23
30
|
<character>
|
|
@@ -64,31 +71,28 @@ class Extractor(BaseModel):
|
|
|
64
71
|
|
|
65
72
|
model_config = ConfigDict(arbitrary_types_allowed=True)
|
|
66
73
|
|
|
67
|
-
def
|
|
74
|
+
def extract_markdown(
|
|
68
75
|
self,
|
|
76
|
+
browser: Browser,
|
|
69
77
|
urls: Sequence[str],
|
|
70
|
-
output: type[T],
|
|
71
|
-
prompt: str | None = None,
|
|
72
78
|
extraction_preferences: ExtractionPreferences | None = None,
|
|
73
79
|
ignore_invalid_urls: bool = True,
|
|
74
|
-
) ->
|
|
80
|
+
) -> tuple[str, str]:
|
|
75
81
|
return run_sync(
|
|
76
|
-
self.
|
|
77
|
-
|
|
78
|
-
|
|
82
|
+
self.extract_markdown_async,
|
|
83
|
+
browser=browser,
|
|
84
|
+
urls=urls,
|
|
85
|
+
extraction_preferences=extraction_preferences,
|
|
86
|
+
ignore_invalid_urls=ignore_invalid_urls,
|
|
79
87
|
)
|
|
80
88
|
|
|
81
|
-
|
|
82
|
-
async def extract_async[T: BaseModel](
|
|
89
|
+
async def extract_markdown_async(
|
|
83
90
|
self,
|
|
91
|
+
browser: Browser,
|
|
84
92
|
urls: Sequence[str],
|
|
85
|
-
output: type[T],
|
|
86
|
-
prompt: str | None = None,
|
|
87
93
|
extraction_preferences: ExtractionPreferences | None = None,
|
|
88
94
|
ignore_invalid_urls: bool = True,
|
|
89
|
-
) ->
|
|
90
|
-
from playwright import async_api
|
|
91
|
-
|
|
95
|
+
) -> tuple[str, str]:
|
|
92
96
|
_preferences = extraction_preferences or ExtractionPreferences()
|
|
93
97
|
_actions: Sequence[Action] = _preferences.actions or []
|
|
94
98
|
|
|
@@ -98,171 +102,272 @@ class Extractor(BaseModel):
|
|
|
98
102
|
# This is a placeholder for proxy configuration
|
|
99
103
|
pass
|
|
100
104
|
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
geolocation
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
geolocation = Geolocation(
|
|
119
|
-
latitude=getattr(_preferences.location, "latitude", 0),
|
|
120
|
-
longitude=getattr(_preferences.location, "longitude", 0),
|
|
121
|
-
)
|
|
122
|
-
permissions = ["geolocation"]
|
|
123
|
-
|
|
124
|
-
context = await browser.new_context(
|
|
125
|
-
viewport=viewport,
|
|
126
|
-
user_agent=user_agent,
|
|
127
|
-
is_mobile=is_mobile,
|
|
128
|
-
extra_http_headers=_preferences.headers,
|
|
129
|
-
ignore_https_errors=_preferences.skip_tls_verification,
|
|
130
|
-
geolocation=geolocation,
|
|
131
|
-
permissions=permissions,
|
|
105
|
+
# Build context options properly based on preferences
|
|
106
|
+
if _preferences.mobile:
|
|
107
|
+
viewport: ViewportSize | None = ViewportSize(width=375, height=667)
|
|
108
|
+
user_agent = "Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X) AppleWebKit/605.1.15"
|
|
109
|
+
is_mobile = True
|
|
110
|
+
else:
|
|
111
|
+
viewport = None
|
|
112
|
+
user_agent = None
|
|
113
|
+
is_mobile = None
|
|
114
|
+
|
|
115
|
+
# Handle geolocation
|
|
116
|
+
geolocation: Geolocation | None = None
|
|
117
|
+
permissions = None
|
|
118
|
+
if _preferences.location:
|
|
119
|
+
geolocation = Geolocation(
|
|
120
|
+
latitude=getattr(_preferences.location, "latitude", 0),
|
|
121
|
+
longitude=getattr(_preferences.location, "longitude", 0),
|
|
132
122
|
)
|
|
123
|
+
permissions = ["geolocation"]
|
|
124
|
+
|
|
125
|
+
context = await browser.new_context(
|
|
126
|
+
viewport=viewport,
|
|
127
|
+
user_agent=user_agent,
|
|
128
|
+
is_mobile=is_mobile,
|
|
129
|
+
extra_http_headers=_preferences.headers,
|
|
130
|
+
ignore_https_errors=_preferences.skip_tls_verification,
|
|
131
|
+
geolocation=geolocation,
|
|
132
|
+
permissions=permissions,
|
|
133
|
+
)
|
|
133
134
|
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
)
|
|
151
|
-
else route.continue_(),
|
|
135
|
+
# Block ads if specified
|
|
136
|
+
if _preferences.block_ads:
|
|
137
|
+
await context.route(
|
|
138
|
+
"**/*",
|
|
139
|
+
lambda route: route.abort()
|
|
140
|
+
if route.request.resource_type in ["image", "media", "font"]
|
|
141
|
+
and any(
|
|
142
|
+
ad_domain in route.request.url
|
|
143
|
+
for ad_domain in [
|
|
144
|
+
"doubleclick.net",
|
|
145
|
+
"googlesyndication.com",
|
|
146
|
+
"adservice.google.com",
|
|
147
|
+
"ads",
|
|
148
|
+
"analytics",
|
|
149
|
+
"tracking",
|
|
150
|
+
]
|
|
152
151
|
)
|
|
152
|
+
else route.continue_(),
|
|
153
|
+
)
|
|
154
|
+
|
|
155
|
+
page = await context.new_page()
|
|
156
|
+
|
|
157
|
+
for url in urls:
|
|
158
|
+
# Set timeout if specified
|
|
159
|
+
timeout = _preferences.timeout_ms if _preferences.timeout_ms else 30000
|
|
153
160
|
|
|
154
|
-
|
|
161
|
+
try:
|
|
162
|
+
await page.goto(url, timeout=timeout)
|
|
155
163
|
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
164
|
+
# Wait for specified time if configured
|
|
165
|
+
if _preferences.wait_for_ms:
|
|
166
|
+
await page.wait_for_timeout(_preferences.wait_for_ms)
|
|
159
167
|
|
|
160
|
-
|
|
161
|
-
|
|
168
|
+
# Execute actions
|
|
169
|
+
for action in _actions:
|
|
170
|
+
await action.execute(page)
|
|
162
171
|
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
172
|
+
except Exception as e:
|
|
173
|
+
if ignore_invalid_urls:
|
|
174
|
+
print(f"Warning: Failed to load {url}: {e}")
|
|
175
|
+
continue
|
|
176
|
+
else:
|
|
177
|
+
raise
|
|
166
178
|
|
|
167
|
-
|
|
168
|
-
for action in _actions:
|
|
169
|
-
await action.execute(page)
|
|
179
|
+
html = await page.content()
|
|
170
180
|
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
181
|
+
# Process HTML based on preferences - consolidate all BeautifulSoup operations
|
|
182
|
+
if (
|
|
183
|
+
_preferences.remove_base_64_images
|
|
184
|
+
or _preferences.include_tags
|
|
185
|
+
or _preferences.exclude_tags
|
|
186
|
+
or _preferences.only_main_content
|
|
187
|
+
):
|
|
188
|
+
from bs4 import BeautifulSoup
|
|
177
189
|
|
|
178
|
-
|
|
190
|
+
soup = BeautifulSoup(html, "html.parser")
|
|
179
191
|
|
|
180
|
-
#
|
|
192
|
+
# Remove base64 images first
|
|
181
193
|
if _preferences.remove_base_64_images:
|
|
182
194
|
import re
|
|
183
195
|
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
196
|
+
# Debug: Check what we have before processing
|
|
197
|
+
all_imgs = soup.find_all("img")
|
|
198
|
+
print(f"DEBUG: Found {len(all_imgs)} img tags total")
|
|
199
|
+
base64_count = 0
|
|
200
|
+
for img in all_imgs:
|
|
201
|
+
src = img.attrs.get("src") if hasattr(img, "attrs") else None # type: ignore[union-attr]
|
|
202
|
+
if isinstance(src, str) and "data:image/" in src:
|
|
203
|
+
base64_count += 1
|
|
204
|
+
print(f"DEBUG: Found base64 img: {src[:100]}...")
|
|
205
|
+
print(f"DEBUG: {base64_count} images have base64 data")
|
|
206
|
+
|
|
207
|
+
# First, remove any anchor tags that contain img children with base64
|
|
208
|
+
# (must be done before removing img tags themselves)
|
|
209
|
+
removed_anchors = 0
|
|
210
|
+
for a_tag in soup.find_all("a"):
|
|
211
|
+
imgs = a_tag.find_all("img") # type: ignore[union-attr]
|
|
212
|
+
for img in imgs:
|
|
213
|
+
src = img.attrs.get("src") if hasattr(img, "attrs") else None # type: ignore[union-attr]
|
|
214
|
+
if isinstance(src, str) and src.startswith("data:image/"):
|
|
215
|
+
# Remove the entire anchor tag if it contains base64 image
|
|
216
|
+
a_tag.decompose()
|
|
217
|
+
removed_anchors += 1
|
|
218
|
+
break
|
|
219
|
+
print(
|
|
220
|
+
f"DEBUG: Removed {removed_anchors} anchor tags with base64 images"
|
|
189
221
|
)
|
|
190
222
|
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
# Prepare and send prompt
|
|
227
|
-
_prompt = _PROMPT.compile(
|
|
228
|
-
user_instructions=prompt or "Not provided.", markdown=markdown
|
|
229
|
-
)
|
|
230
|
-
|
|
231
|
-
if isinstance(self.llm, GenerationProvider):
|
|
232
|
-
response = await self.llm.generate_by_prompt_async(
|
|
233
|
-
prompt=_prompt,
|
|
234
|
-
model=self.model,
|
|
235
|
-
developer_prompt=_INSTRUCTIONS,
|
|
236
|
-
response_schema=output,
|
|
223
|
+
# Remove standalone img tags with base64 src
|
|
224
|
+
removed_imgs = 0
|
|
225
|
+
for img in soup.find_all("img"):
|
|
226
|
+
src = img.attrs.get("src") if hasattr(img, "attrs") else None # type: ignore[union-attr]
|
|
227
|
+
if isinstance(src, str) and src.startswith("data:image/"):
|
|
228
|
+
img.decompose()
|
|
229
|
+
removed_imgs += 1
|
|
230
|
+
print(f"DEBUG: Removed {removed_imgs} standalone img tags")
|
|
231
|
+
|
|
232
|
+
# Remove any element with base64 in href (like anchor tags with image data)
|
|
233
|
+
for elem in soup.find_all(attrs={"href": True}):
|
|
234
|
+
href = elem.attrs.get("href") if hasattr(elem, "attrs") else None # type: ignore[union-attr]
|
|
235
|
+
if isinstance(href, str) and href.startswith("data:image/"):
|
|
236
|
+
elem.decompose()
|
|
237
|
+
|
|
238
|
+
# Remove any element with base64 in style attribute
|
|
239
|
+
for elem in soup.find_all(attrs={"style": True}):
|
|
240
|
+
style = elem.attrs.get("style") if hasattr(elem, "attrs") else None # type: ignore[union-attr]
|
|
241
|
+
if isinstance(style, str) and "data:image/" in style:
|
|
242
|
+
elem.decompose()
|
|
243
|
+
|
|
244
|
+
# Remove SVG tags (they often contain base64 or are converted to base64 by markdown)
|
|
245
|
+
for svg in soup.find_all("svg"):
|
|
246
|
+
svg.decompose()
|
|
247
|
+
|
|
248
|
+
# Remove any anchor tags that contain SVG children
|
|
249
|
+
for a_tag in soup.find_all("a"):
|
|
250
|
+
if a_tag.find("svg"): # type: ignore[union-attr]
|
|
251
|
+
a_tag.decompose()
|
|
252
|
+
|
|
253
|
+
# Final check: see if any base64 remains in the HTML string
|
|
254
|
+
html_str = str(soup)
|
|
255
|
+
remaining = len(re.findall(r'data:image/[^"\')\s]+', html_str))
|
|
256
|
+
print(
|
|
257
|
+
f"DEBUG: After processing, {remaining} base64 data URIs remain in HTML"
|
|
237
258
|
)
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
259
|
+
|
|
260
|
+
# Extract main content if requested
|
|
261
|
+
if _preferences.only_main_content:
|
|
262
|
+
main_content = (
|
|
263
|
+
soup.find("main")
|
|
264
|
+
or soup.find("article")
|
|
265
|
+
or soup.find("div", {"id": "content"})
|
|
266
|
+
or soup.find("div", {"class": "content"})
|
|
245
267
|
)
|
|
268
|
+
if main_content:
|
|
269
|
+
soup = main_content # type: ignore[assignment]
|
|
246
270
|
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
271
|
+
# Exclude specific tags
|
|
272
|
+
if _preferences.exclude_tags:
|
|
273
|
+
for tag in _preferences.exclude_tags:
|
|
274
|
+
for element in soup.find_all(tag): # type: ignore[union-attr]
|
|
275
|
+
element.decompose()
|
|
276
|
+
|
|
277
|
+
# Include only specific tags
|
|
278
|
+
if _preferences.include_tags:
|
|
279
|
+
new_soup = BeautifulSoup("", "html.parser")
|
|
280
|
+
for tag in _preferences.include_tags:
|
|
281
|
+
for element in soup.find_all(tag): # type: ignore[union-attr]
|
|
282
|
+
new_soup.append(element) # type: ignore[arg-type]
|
|
283
|
+
soup = new_soup
|
|
284
|
+
|
|
285
|
+
html = str(soup)
|
|
252
286
|
|
|
253
|
-
|
|
287
|
+
# Convert to markdown
|
|
288
|
+
markdown = convert(html)
|
|
289
|
+
return html, markdown
|
|
254
290
|
|
|
255
|
-
|
|
291
|
+
def extract[T: BaseModel](
|
|
292
|
+
self,
|
|
293
|
+
browser: Browser,
|
|
294
|
+
urls: Sequence[str],
|
|
295
|
+
output: type[T],
|
|
296
|
+
prompt: str | None = None,
|
|
297
|
+
extraction_preferences: ExtractionPreferences | None = None,
|
|
298
|
+
ignore_invalid_urls: bool = True,
|
|
299
|
+
) -> ExtractionResult[T]:
|
|
300
|
+
return run_sync(
|
|
301
|
+
self.extract_async(
|
|
302
|
+
browser=browser,
|
|
256
303
|
urls=urls,
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
extraction_preferences=
|
|
260
|
-
|
|
304
|
+
output=output,
|
|
305
|
+
prompt=prompt,
|
|
306
|
+
extraction_preferences=extraction_preferences,
|
|
307
|
+
ignore_invalid_urls=ignore_invalid_urls,
|
|
261
308
|
)
|
|
309
|
+
)
|
|
262
310
|
|
|
311
|
+
@needs("playwright")
|
|
312
|
+
async def extract_async[T: BaseModel](
|
|
313
|
+
self,
|
|
314
|
+
browser: Browser,
|
|
315
|
+
urls: Sequence[str],
|
|
316
|
+
output: type[T],
|
|
317
|
+
prompt: str | None = None,
|
|
318
|
+
extraction_preferences: ExtractionPreferences | None = None,
|
|
319
|
+
ignore_invalid_urls: bool = True,
|
|
320
|
+
) -> ExtractionResult[T]:
|
|
321
|
+
_preferences = extraction_preferences or ExtractionPreferences()
|
|
263
322
|
|
|
264
|
-
|
|
323
|
+
html, markdown = await self.extract_markdown_async(
|
|
324
|
+
browser=browser,
|
|
325
|
+
urls=urls,
|
|
326
|
+
extraction_preferences=_preferences,
|
|
327
|
+
ignore_invalid_urls=ignore_invalid_urls,
|
|
328
|
+
)
|
|
329
|
+
|
|
330
|
+
# Prepare and send prompt
|
|
331
|
+
_prompt = _PROMPT.compile(
|
|
332
|
+
user_instructions=prompt or "Not provided.", markdown=markdown
|
|
333
|
+
)
|
|
334
|
+
|
|
335
|
+
if isinstance(self.llm, GenerationProvider):
|
|
336
|
+
response = await self.llm.generate_by_prompt_async(
|
|
337
|
+
prompt=_prompt,
|
|
338
|
+
model=self.model,
|
|
339
|
+
developer_prompt=_INSTRUCTIONS,
|
|
340
|
+
response_schema=output,
|
|
341
|
+
)
|
|
342
|
+
else:
|
|
343
|
+
response = await self.llm.respond_async(
|
|
344
|
+
input=_prompt,
|
|
345
|
+
model=self.model,
|
|
346
|
+
instructions=_INSTRUCTIONS,
|
|
347
|
+
reasoning=self.reasoning,
|
|
348
|
+
text_format=output,
|
|
349
|
+
)
|
|
350
|
+
|
|
351
|
+
output_parsed = (
|
|
352
|
+
response.parsed
|
|
353
|
+
if isinstance(response, Generation)
|
|
354
|
+
else response.output_parsed
|
|
355
|
+
)
|
|
356
|
+
|
|
357
|
+
await browser.close()
|
|
358
|
+
|
|
359
|
+
return ExtractionResult[T](
|
|
360
|
+
urls=urls,
|
|
361
|
+
html=html,
|
|
362
|
+
markdown=markdown,
|
|
363
|
+
extraction_preferences=_preferences,
|
|
364
|
+
output_parsed=output_parsed,
|
|
365
|
+
)
|
|
366
|
+
|
|
367
|
+
|
|
368
|
+
async def test() -> None:
|
|
265
369
|
from dotenv import load_dotenv
|
|
370
|
+
from playwright import async_api
|
|
266
371
|
|
|
267
372
|
load_dotenv()
|
|
268
373
|
|
|
@@ -272,8 +377,8 @@ if __name__ == "__main__":
|
|
|
272
377
|
possiveis_redirecionamentos: list[str]
|
|
273
378
|
|
|
274
379
|
extractor = Extractor(
|
|
275
|
-
llm=Responder.
|
|
276
|
-
model="
|
|
380
|
+
llm=Responder.openrouter(),
|
|
381
|
+
model="google/gemini-2.5-flash",
|
|
277
382
|
)
|
|
278
383
|
|
|
279
384
|
# Example with custom extraction preferences
|
|
@@ -285,12 +390,20 @@ if __name__ == "__main__":
|
|
|
285
390
|
timeout_ms=15000,
|
|
286
391
|
)
|
|
287
392
|
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
393
|
+
async with async_api.async_playwright() as p:
|
|
394
|
+
browser = await p.chromium.launch(headless=True)
|
|
395
|
+
|
|
396
|
+
result = await extractor.extract_async(
|
|
397
|
+
browser=browser,
|
|
398
|
+
urls=[site_uniube],
|
|
399
|
+
output=PossiveisRedirecionamentos,
|
|
400
|
+
prompt="Extract the possible redirects from the page.",
|
|
401
|
+
extraction_preferences=preferences,
|
|
402
|
+
)
|
|
403
|
+
|
|
404
|
+
for link in result.output_parsed.possiveis_redirecionamentos:
|
|
405
|
+
print(f"Link: {link}")
|
|
294
406
|
|
|
295
|
-
|
|
296
|
-
|
|
407
|
+
|
|
408
|
+
if __name__ == "__main__":
|
|
409
|
+
asyncio.run(test())
|
|
@@ -63,7 +63,7 @@ agentle/agents/a2a/tasks/managment/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQe
|
|
|
63
63
|
agentle/agents/a2a/tasks/managment/in_memory.py,sha256=_G5VuXqEPBMtE6XJg1d7WmqFr1qzd0-99FoqM_qMwAE,23841
|
|
64
64
|
agentle/agents/a2a/tasks/managment/task_manager.py,sha256=rBCuzu4DqIs55xDnwXY0w5Rs9ybv6OJpgpugAQLhtoU,3112
|
|
65
65
|
agentle/agents/apis/__init__.py,sha256=PX7oAe0hRGvyLB295DrBF1VBsqgp5ZmGI4BCZvLUozo,2811
|
|
66
|
-
agentle/agents/apis/api.py,sha256=
|
|
66
|
+
agentle/agents/apis/api.py,sha256=PyYcZJJR0f8c7VceQMLzUfnvj3belNbCJJNmVdXfdXo,26610
|
|
67
67
|
agentle/agents/apis/api_key_authentication.py,sha256=MtMA4qkCjM3ou42a1fDgKI4u3NQkB_Zr-gEA5_oysZ0,1311
|
|
68
68
|
agentle/agents/apis/api_key_location.py,sha256=0pj_8rTkd0pkUJ2eP_Kur3AvT3JD8JpFIxQsWDzeg_c,188
|
|
69
69
|
agentle/agents/apis/api_metrics.py,sha256=SyvJdvEMKp7rGij-tDZ6vxjwc26MIFrKrcckMc4Q1Zg,380
|
|
@@ -78,7 +78,7 @@ agentle/agents/apis/cache_strategy.py,sha256=uoAvmUm1EE8426anuLR4PoygLgO4SPN1Qm_
|
|
|
78
78
|
agentle/agents/apis/circuit_breaker.py,sha256=9yopLPoZ6WMlPMWPoacayORoGN1CZq2EGWB0gCu3GOY,2371
|
|
79
79
|
agentle/agents/apis/circuit_breaker_error.py,sha256=I5XyCWwFCXTDzhvS7CpMZwBRxMyCk96g7MfBLBqAvhg,127
|
|
80
80
|
agentle/agents/apis/circuit_breaker_state.py,sha256=6IwcWWKNmE0cnpogcfsnhpy_EVIk7crQ7WiDN9YkkbE,277
|
|
81
|
-
agentle/agents/apis/endpoint.py,sha256=
|
|
81
|
+
agentle/agents/apis/endpoint.py,sha256=4skIMje2oEbc6ONb-Ww66wh8VGP5tj1JZACE3p7U62E,22600
|
|
82
82
|
agentle/agents/apis/endpoint_parameter.py,sha256=A_SVje6AyNeeJNDxWL__uGc4ZNZ2se6GvawS--kUYEU,20230
|
|
83
83
|
agentle/agents/apis/endpoints_to_tools.py,sha256=5KzxRLjfUYx7MGKHe65NMPHyTKOcd8tdK0uYzfGnO5g,999
|
|
84
84
|
agentle/agents/apis/file_upload.py,sha256=PzJ1197EKLCdBHrLDztgifM3WWFQZ8K2CKkTeeYpJas,587
|
|
@@ -137,7 +137,7 @@ agentle/agents/ui/__init__.py,sha256=IjHRV0k2DNwvFrEHebmsXiBvmITE8nQUnsR07h9tVkU
|
|
|
137
137
|
agentle/agents/ui/streamlit.py,sha256=9afICL0cxtG1o2pWh6vH39-NdKiVfADKiXo405F2aB0,42829
|
|
138
138
|
agentle/agents/whatsapp/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
139
139
|
agentle/agents/whatsapp/human_delay_calculator.py,sha256=BGCDeoNTPsMn4d_QYmG0BWGCG8SiUJC6Fk295ulAsAk,18268
|
|
140
|
-
agentle/agents/whatsapp/whatsapp_bot.py,sha256=
|
|
140
|
+
agentle/agents/whatsapp/whatsapp_bot.py,sha256=7B1ZCQYMJ6qgRINFzSQVSshDr-4L1F5H3MHqJYI7w3g,161962
|
|
141
141
|
agentle/agents/whatsapp/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
142
142
|
agentle/agents/whatsapp/models/audio_message.py,sha256=kUqG1HdNW6DCYD-CqscJ6WHlAyv9ufmTSKMdjio9XWk,2705
|
|
143
143
|
agentle/agents/whatsapp/models/context_info.py,sha256=sk80KuNE36S6VRnLh7n6UXmzZCXIB4E4lNxnRyVizg8,563
|
|
@@ -161,6 +161,7 @@ agentle/agents/whatsapp/models/whatsapp_media_message.py,sha256=-FY15vezGILzGMiU
|
|
|
161
161
|
agentle/agents/whatsapp/models/whatsapp_message.py,sha256=QtGAJKOF1ykZycsNDld25gk-JUeg3uV7hNXx0ZXO0Rg,1217
|
|
162
162
|
agentle/agents/whatsapp/models/whatsapp_message_status.py,sha256=jDWShdvSve5EhkgtDkh1jZmpRVNoXCokv4M6at1eSIU,214
|
|
163
163
|
agentle/agents/whatsapp/models/whatsapp_message_type.py,sha256=GctIGOC1Bc_D_L0ehEmEwgxePFx0ioTEUoBlZEdxdG8,279
|
|
164
|
+
agentle/agents/whatsapp/models/whatsapp_response_base.py,sha256=IIDONx9Ipt593tAZvoc8dPDUISeNH-WOpRP1x_-Q6Gk,1145
|
|
164
165
|
agentle/agents/whatsapp/models/whatsapp_session.py,sha256=9G1HC-A2G9jTdpwYy3w9bnYkOGK2vvA7kdYAf32oWMU,15640
|
|
165
166
|
agentle/agents/whatsapp/models/whatsapp_text_message.py,sha256=GpSwFrPC4qpQlVCWKKgYjQJKNv0qvwgYfuoD3ttLzdQ,441
|
|
166
167
|
agentle/agents/whatsapp/models/whatsapp_video_message.py,sha256=-d-4hnkkxyLVNoje3a1pOEAvzWqoCLFcBn70wUpnyXY,346
|
|
@@ -1003,7 +1004,7 @@ agentle/voice_cloning/voice_cloner.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJW
|
|
|
1003
1004
|
agentle/web/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
1004
1005
|
agentle/web/extraction_preferences.py,sha256=Xb4X6ZgnbDuu4Pp7cI0sdPcv6LaR1Q94FPTNEoHVTGg,985
|
|
1005
1006
|
agentle/web/extraction_result.py,sha256=IsbRdT_wA9RVYGToCiz17XRoWMTtiFzxky96Zwqa4ZY,318
|
|
1006
|
-
agentle/web/extractor.py,sha256=
|
|
1007
|
+
agentle/web/extractor.py,sha256=ISNYVoofry47HtB0oDhmb2Eof15ZhTL-qdyes1mYSbQ,15585
|
|
1007
1008
|
agentle/web/location.py,sha256=RZgqb2rW7wUdcbw3PnmDtfr4FkTSSovW0j70ZOvoRiw,64
|
|
1008
1009
|
agentle/web/actions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
1009
1010
|
agentle/web/actions/action.py,sha256=krxW5vXaqB1_JfnPpuo5cVJyANrlElu9P0B0TrF_aZs,723
|
|
@@ -1017,7 +1018,7 @@ agentle/web/actions/scroll.py,sha256=WqVVAORNDK3BL1oASZBPmXJYeSVkPgAOmWA8ibYO82I
|
|
|
1017
1018
|
agentle/web/actions/viewport.py,sha256=KCwm88Pri19Qc6GLHC69HsRxmdJz1gEEAODfggC_fHo,287
|
|
1018
1019
|
agentle/web/actions/wait.py,sha256=IKEywjf-KC4ni9Gkkv4wgc7bY-hk7HwD4F-OFWlyf2w,571
|
|
1019
1020
|
agentle/web/actions/write_text.py,sha256=9mxfHcpKs_L7BsDnJvOYHQwG8M0GWe61SRJAsKk3xQ8,748
|
|
1020
|
-
agentle-0.9.
|
|
1021
|
-
agentle-0.9.
|
|
1022
|
-
agentle-0.9.
|
|
1023
|
-
agentle-0.9.
|
|
1021
|
+
agentle-0.9.27.dist-info/METADATA,sha256=6AlFskgDL86WD5HmWjtRGLwjygJgG2rkMsTbmFUQsJ0,86849
|
|
1022
|
+
agentle-0.9.27.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
1023
|
+
agentle-0.9.27.dist-info/licenses/LICENSE,sha256=T90S9vqRS6qP-voULxAcvwEs558wRRo6dHuZrjgcOUI,1085
|
|
1024
|
+
agentle-0.9.27.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|