agentle 0.9.4__py3-none-any.whl → 0.9.28__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
- agentle/agents/agent.py +175 -10
- agentle/agents/agent_run_output.py +8 -1
- agentle/agents/apis/__init__.py +79 -6
- agentle/agents/apis/api.py +342 -73
- agentle/agents/apis/api_key_authentication.py +43 -0
- agentle/agents/apis/api_key_location.py +11 -0
- agentle/agents/apis/api_metrics.py +16 -0
- agentle/agents/apis/auth_type.py +17 -0
- agentle/agents/apis/authentication.py +32 -0
- agentle/agents/apis/authentication_base.py +42 -0
- agentle/agents/apis/authentication_config.py +117 -0
- agentle/agents/apis/basic_authentication.py +34 -0
- agentle/agents/apis/bearer_authentication.py +52 -0
- agentle/agents/apis/cache_strategy.py +12 -0
- agentle/agents/apis/circuit_breaker.py +69 -0
- agentle/agents/apis/circuit_breaker_error.py +7 -0
- agentle/agents/apis/circuit_breaker_state.py +11 -0
- agentle/agents/apis/endpoint.py +413 -254
- agentle/agents/apis/file_upload.py +23 -0
- agentle/agents/apis/hmac_authentication.py +56 -0
- agentle/agents/apis/no_authentication.py +27 -0
- agentle/agents/apis/oauth2_authentication.py +111 -0
- agentle/agents/apis/oauth2_grant_type.py +12 -0
- agentle/agents/apis/object_schema.py +86 -1
- agentle/agents/apis/params/__init__.py +10 -1
- agentle/agents/apis/params/boolean_param.py +44 -0
- agentle/agents/apis/params/number_param.py +56 -0
- agentle/agents/apis/rate_limit_error.py +7 -0
- agentle/agents/apis/rate_limiter.py +57 -0
- agentle/agents/apis/request_config.py +126 -4
- agentle/agents/apis/request_hook.py +16 -0
- agentle/agents/apis/response_cache.py +49 -0
- agentle/agents/apis/retry_strategy.py +12 -0
- agentle/agents/whatsapp/human_delay_calculator.py +462 -0
- agentle/agents/whatsapp/models/audio_message.py +6 -4
- agentle/agents/whatsapp/models/key.py +2 -2
- agentle/agents/whatsapp/models/whatsapp_bot_config.py +375 -21
- agentle/agents/whatsapp/models/whatsapp_response_base.py +31 -0
- agentle/agents/whatsapp/models/whatsapp_webhook_payload.py +5 -1
- agentle/agents/whatsapp/providers/base/whatsapp_provider.py +51 -0
- agentle/agents/whatsapp/providers/evolution/evolution_api_provider.py +237 -10
- agentle/agents/whatsapp/providers/meta/meta_whatsapp_provider.py +126 -0
- agentle/agents/whatsapp/v2/batch_processor_manager.py +4 -0
- agentle/agents/whatsapp/v2/bot_config.py +188 -0
- agentle/agents/whatsapp/v2/message_limit.py +9 -0
- agentle/agents/whatsapp/v2/payload.py +0 -0
- agentle/agents/whatsapp/v2/whatsapp_bot.py +13 -0
- agentle/agents/whatsapp/v2/whatsapp_cloud_api_provider.py +0 -0
- agentle/agents/whatsapp/v2/whatsapp_provider.py +0 -0
- agentle/agents/whatsapp/whatsapp_bot.py +827 -45
- agentle/generations/providers/google/adapters/generate_generate_content_response_to_generation_adapter.py +13 -10
- agentle/generations/providers/google/google_generation_provider.py +35 -5
- agentle/generations/providers/openrouter/_adapters/openrouter_message_to_generated_assistant_message_adapter.py +35 -1
- agentle/mcp/servers/stdio_mcp_server.py +23 -4
- agentle/parsing/parsers/docx.py +8 -0
- agentle/parsing/parsers/file_parser.py +4 -0
- agentle/parsing/parsers/pdf.py +7 -1
- agentle/storage/__init__.py +11 -0
- agentle/storage/file_storage_manager.py +44 -0
- agentle/storage/local_file_storage_manager.py +122 -0
- agentle/storage/s3_file_storage_manager.py +124 -0
- agentle/tts/audio_format.py +6 -0
- agentle/tts/elevenlabs_tts_provider.py +108 -0
- agentle/tts/output_format_type.py +26 -0
- agentle/tts/speech_config.py +14 -0
- agentle/tts/speech_result.py +15 -0
- agentle/tts/tts_provider.py +16 -0
- agentle/tts/voice_settings.py +30 -0
- agentle/utils/parse_streaming_json.py +39 -13
- agentle/voice_cloning/__init__.py +0 -0
- agentle/voice_cloning/voice_cloner.py +0 -0
- agentle/web/extractor.py +282 -148
- {agentle-0.9.4.dist-info → agentle-0.9.28.dist-info}/METADATA +1 -1
- {agentle-0.9.4.dist-info → agentle-0.9.28.dist-info}/RECORD +78 -39
- agentle/tts/real_time/definitions/audio_data.py +0 -20
- agentle/tts/real_time/definitions/speech_config.py +0 -27
- agentle/tts/real_time/definitions/speech_result.py +0 -14
- agentle/tts/real_time/definitions/tts_stream_chunk.py +0 -15
- agentle/tts/real_time/definitions/voice_gender.py +0 -9
- agentle/tts/real_time/definitions/voice_info.py +0 -18
- agentle/tts/real_time/real_time_speech_to_text_provider.py +0 -66
- /agentle/{tts/real_time → agents/whatsapp/v2}/__init__.py +0 -0
- /agentle/{tts/real_time/definitions/__init__.py → agents/whatsapp/v2/in_memory_batch_processor_manager.py} +0 -0
- {agentle-0.9.4.dist-info → agentle-0.9.28.dist-info}/WHEEL +0 -0
- {agentle-0.9.4.dist-info → agentle-0.9.28.dist-info}/licenses/LICENSE +0 -0
agentle/web/extractor.py
CHANGED
Deleted lines whose text was elided in the rendered diff are marked with `…`.

```diff
@@ -1,12 +1,18 @@
-from …
+from __future__ import annotations
+
+import asyncio
 from collections.abc import Sequence
 from textwrap import dedent
+from typing import TYPE_CHECKING
 
 from html_to_markdown import convert
-from …
+from rsb.coroutines.run_sync import run_sync
 from rsb.models import Field
 from rsb.models.base_model import BaseModel
+from rsb.models.config_dict import ConfigDict
 
+from agentle.generations.models.generation.generation import Generation
+from agentle.generations.providers.base.generation_provider import GenerationProvider
 from agentle.prompts.models.prompt import Prompt
 from agentle.responses.definitions.reasoning import Reasoning
 from agentle.responses.responder import Responder
@@ -15,6 +21,10 @@ from agentle.web.actions.action import Action
 from agentle.web.extraction_preferences import ExtractionPreferences
 from agentle.web.extraction_result import ExtractionResult
 
+if TYPE_CHECKING:
+    from playwright.async_api import Browser, Geolocation, ViewportSize
+
+
 _INSTRUCTIONS = Prompt.from_text(
     dedent("""\
         <character>
@@ -52,36 +62,37 @@ _PROMPT = Prompt.from_text(
 
 # HTML -> MD -> LLM (Structured Output)
 class Extractor(BaseModel):
-    llm: Responder = Field(…
+    llm: Responder | GenerationProvider = Field(
+        ..., description="The responder to use for the extractor."
+    )
     reasoning: Reasoning | None = Field(default=None)
     model: str | None = Field(default=None)
     max_output_tokens: int | None = Field(default=None)
 
-…
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    def extract_markdown(
         self,
+        browser: Browser,
         urls: Sequence[str],
-        output: type[T],
-        prompt: str | None = None,
         extraction_preferences: ExtractionPreferences | None = None,
         ignore_invalid_urls: bool = True,
-    ) -> …
+    ) -> tuple[str, str]:
         return run_sync(
-            self.…
-…
-…
+            self.extract_markdown_async,
+            browser=browser,
+            urls=urls,
+            extraction_preferences=extraction_preferences,
+            ignore_invalid_urls=ignore_invalid_urls,
         )
 
-…
-    async def extract_async[T: BaseModel](
+    async def extract_markdown_async(
         self,
+        browser: Browser,
         urls: Sequence[str],
-        output: type[T],
-        prompt: str | None = None,
         extraction_preferences: ExtractionPreferences | None = None,
         ignore_invalid_urls: bool = True,
-    ) -> …
-        from playwright import async_api
-…
+    ) -> tuple[str, str]:
         _preferences = extraction_preferences or ExtractionPreferences()
         _actions: Sequence[Action] = _preferences.actions or []
 
@@ -91,136 +102,244 @@ class Extractor(BaseModel):
             # This is a placeholder for proxy configuration
             pass
 
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-        geolocation…
-…
-…
-            geolocation = Geolocation(
-                latitude=getattr(_preferences.location, "latitude", 0),
-                longitude=getattr(_preferences.location, "longitude", 0),
-            )
-            permissions = ["geolocation"]
-…
-        context = await browser.new_context(
-            viewport=viewport,
-            user_agent=user_agent,
-            is_mobile=is_mobile,
-            extra_http_headers=_preferences.headers,
-            ignore_https_errors=_preferences.skip_tls_verification,
-            geolocation=geolocation,
-            permissions=permissions,
+        # Build context options properly based on preferences
+        if _preferences.mobile:
+            viewport: ViewportSize | None = ViewportSize(width=375, height=667)
+            user_agent = "Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X) AppleWebKit/605.1.15"
+            is_mobile = True
+        else:
+            viewport = None
+            user_agent = None
+            is_mobile = None
+
+        # Handle geolocation
+        geolocation: Geolocation | None = None
+        permissions = None
+        if _preferences.location:
+            geolocation = Geolocation(
+                latitude=getattr(_preferences.location, "latitude", 0),
+                longitude=getattr(_preferences.location, "longitude", 0),
             )
+            permissions = ["geolocation"]
+
+        context = await browser.new_context(
+            viewport=viewport,
+            user_agent=user_agent,
+            is_mobile=is_mobile,
+            extra_http_headers=_preferences.headers,
+            ignore_https_errors=_preferences.skip_tls_verification,
+            geolocation=geolocation,
+            permissions=permissions,
+        )
 
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-                )
-                else route.continue_(),
+        # Block ads if specified
+        if _preferences.block_ads:
+            await context.route(
+                "**/*",
+                lambda route: route.abort()
+                if route.request.resource_type in ["image", "media", "font"]
+                and any(
+                    ad_domain in route.request.url
+                    for ad_domain in [
+                        "doubleclick.net",
+                        "googlesyndication.com",
+                        "adservice.google.com",
+                        "ads",
+                        "analytics",
+                        "tracking",
+                    ]
                 )
+                else route.continue_(),
+            )
+
+        page = await context.new_page()
+
+        for url in urls:
+            # Set timeout if specified
+            timeout = _preferences.timeout_ms if _preferences.timeout_ms else 30000
 
-…
+            try:
+                await page.goto(url, timeout=timeout)
 
-…
-…
-…
+                # Wait for specified time if configured
+                if _preferences.wait_for_ms:
+                    await page.wait_for_timeout(_preferences.wait_for_ms)
 
-…
-…
+                # Execute actions
+                for action in _actions:
+                    await action.execute(page)
 
-…
-…
-…
+            except Exception as e:
+                if ignore_invalid_urls:
+                    print(f"Warning: Failed to load {url}: {e}")
+                    continue
+                else:
+                    raise
 
-…
-        for action in _actions:
-            await action.execute(page)
+        html = await page.content()
 
-…
-…
-…
-…
-…
-…
+        # Process HTML based on preferences - consolidate all BeautifulSoup operations
+        if (
+            _preferences.remove_base_64_images
+            or _preferences.include_tags
+            or _preferences.exclude_tags
+            or _preferences.only_main_content
+        ):
+            from bs4 import BeautifulSoup
 
-…
+            soup = BeautifulSoup(html, "html.parser")
 
-            #…
+            # Remove base64 images first
             if _preferences.remove_base_64_images:
                 import re
 
-…
-…
-…
-…
-…
+                # Debug: Check what we have before processing
+                all_imgs = soup.find_all("img")
+                print(f"DEBUG: Found {len(all_imgs)} img tags total")
+                base64_count = 0
+                for img in all_imgs:
+                    src = img.attrs.get("src") if hasattr(img, "attrs") else None  # type: ignore[union-attr]
+                    if isinstance(src, str) and "data:image/" in src:
+                        base64_count += 1
+                        print(f"DEBUG: Found base64 img: {src[:100]}...")
+                print(f"DEBUG: {base64_count} images have base64 data")
+
+                # First, remove any anchor tags that contain img children with base64
+                # (must be done before removing img tags themselves)
+                removed_anchors = 0
+                for a_tag in soup.find_all("a"):
+                    imgs = a_tag.find_all("img")  # type: ignore[union-attr]
+                    for img in imgs:
+                        src = img.attrs.get("src") if hasattr(img, "attrs") else None  # type: ignore[union-attr]
+                        if isinstance(src, str) and src.startswith("data:image/"):
+                            # Remove the entire anchor tag if it contains base64 image
+                            a_tag.decompose()
+                            removed_anchors += 1
+                            break
+                print(
+                    f"DEBUG: Removed {removed_anchors} anchor tags with base64 images"
+                )
+
+                # Remove standalone img tags with base64 src
+                removed_imgs = 0
+                for img in soup.find_all("img"):
+                    src = img.attrs.get("src") if hasattr(img, "attrs") else None  # type: ignore[union-attr]
+                    if isinstance(src, str) and src.startswith("data:image/"):
+                        img.decompose()
+                        removed_imgs += 1
+                print(f"DEBUG: Removed {removed_imgs} standalone img tags")
+
+                # Remove any element with base64 in href (like anchor tags with image data)
+                for elem in soup.find_all(attrs={"href": True}):
+                    href = elem.attrs.get("href") if hasattr(elem, "attrs") else None  # type: ignore[union-attr]
+                    if isinstance(href, str) and href.startswith("data:image/"):
+                        elem.decompose()
+
+                # Remove any element with base64 in style attribute
+                for elem in soup.find_all(attrs={"style": True}):
+                    style = elem.attrs.get("style") if hasattr(elem, "attrs") else None  # type: ignore[union-attr]
+                    if isinstance(style, str) and "data:image/" in style:
+                        elem.decompose()
+
+                # Remove SVG tags (they often contain base64 or are converted to base64 by markdown)
+                for svg in soup.find_all("svg"):
+                    svg.decompose()
+
+                # Remove any anchor tags that contain SVG children
+                for a_tag in soup.find_all("a"):
+                    if a_tag.find("svg"):  # type: ignore[union-attr]
+                        a_tag.decompose()
+
+                # Final check: see if any base64 remains in the HTML string
+                html_str = str(soup)
+                remaining = len(re.findall(r'data:image/[^"\')\s]+', html_str))
+                print(
+                    f"DEBUG: After processing, {remaining} base64 data URIs remain in HTML"
+                )
+
+            # Extract main content if requested
+            if _preferences.only_main_content:
+                main_content = (
+                    soup.find("main")
+                    or soup.find("article")
+                    or soup.find("div", {"id": "content"})
+                    or soup.find("div", {"class": "content"})
                 )
+                if main_content:
+                    soup = main_content  # type: ignore[assignment]
 
-            #…
-            if _preferences.…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-            user_instructions=prompt or "Not provided.", markdown=markdown
+            # Exclude specific tags
+            if _preferences.exclude_tags:
+                for tag in _preferences.exclude_tags:
+                    for element in soup.find_all(tag):  # type: ignore[union-attr]
+                        element.decompose()
+
+            # Include only specific tags
+            if _preferences.include_tags:
+                new_soup = BeautifulSoup("", "html.parser")
+                for tag in _preferences.include_tags:
+                    for element in soup.find_all(tag):  # type: ignore[union-attr]
+                        new_soup.append(element)  # type: ignore[arg-type]
+                soup = new_soup
+
+            html = str(soup)
+
+        # Convert to markdown
+        markdown = convert(html)
+        return html, markdown
+
+    def extract[T: BaseModel](
+        self,
+        browser: Browser,
+        urls: Sequence[str],
+        output: type[T],
+        prompt: str | None = None,
+        extraction_preferences: ExtractionPreferences | None = None,
+        ignore_invalid_urls: bool = True,
+    ) -> ExtractionResult[T]:
+        return run_sync(
+            self.extract_async(
+                browser=browser,
+                urls=urls,
+                output=output,
+                prompt=prompt,
+                extraction_preferences=extraction_preferences,
+                ignore_invalid_urls=ignore_invalid_urls,
             )
+        )
+
+    @needs("playwright")
+    async def extract_async[T: BaseModel](
+        self,
+        browser: Browser,
+        urls: Sequence[str],
+        output: type[T],
+        prompt: str | None = None,
+        extraction_preferences: ExtractionPreferences | None = None,
+        ignore_invalid_urls: bool = True,
+    ) -> ExtractionResult[T]:
+        _preferences = extraction_preferences or ExtractionPreferences()
 
+        html, markdown = await self.extract_markdown_async(
+            browser=browser,
+            urls=urls,
+            extraction_preferences=_preferences,
+            ignore_invalid_urls=ignore_invalid_urls,
+        )
+
+        # Prepare and send prompt
+        _prompt = _PROMPT.compile(
+            user_instructions=prompt or "Not provided.", markdown=markdown
+        )
+
+        if isinstance(self.llm, GenerationProvider):
+            response = await self.llm.generate_by_prompt_async(
+                prompt=_prompt,
+                model=self.model,
+                developer_prompt=_INSTRUCTIONS,
+                response_schema=output,
+            )
+        else:
             response = await self.llm.respond_async(
                 input=_prompt,
                 model=self.model,
@@ -229,19 +348,26 @@ class Extractor(BaseModel):
                 text_format=output,
             )
 
-…
+        output_parsed = (
+            response.parsed
+            if isinstance(response, Generation)
+            else response.output_parsed
+        )
 
-…
-            urls=urls,
-            html=html,
-            markdown=markdown,
-            extraction_preferences=_preferences,
-            output_parsed=response.output_parsed,
-        )
+        await browser.close()
 
+        return ExtractionResult[T](
+            urls=urls,
+            html=html,
+            markdown=markdown,
+            extraction_preferences=_preferences,
+            output_parsed=output_parsed,
+        )
 
-…
+
+async def test() -> None:
     from dotenv import load_dotenv
+    from playwright import async_api
 
     load_dotenv()
 
@@ -251,8 +377,8 @@ if __name__ == "__main__":
     possiveis_redirecionamentos: list[str]
 
     extractor = Extractor(
-        llm=Responder.…
-        model="…
+        llm=Responder.openrouter(),
+        model="google/gemini-2.5-flash",
    )
 
     # Example with custom extraction preferences
@@ -264,12 +390,20 @@ if __name__ == "__main__":
         timeout_ms=15000,
     )
 
-…
-…
-        output=PossiveisRedirecionamentos,
-        prompt="Extract the possible redirects from the page.",
-        extraction_preferences=preferences,
-    )
+    async with async_api.async_playwright() as p:
+        browser = await p.chromium.launch(headless=True)
 
-…
-…
+        result = await extractor.extract_async(
+            browser=browser,
+            urls=[site_uniube],
+            output=PossiveisRedirecionamentos,
+            prompt="Extract the possible redirects from the page.",
+            extraction_preferences=preferences,
+        )
+
+        for link in result.output_parsed.possiveis_redirecionamentos:
+            print(f"Link: {link}")
+
+
+if __name__ == "__main__":
+    asyncio.run(test())
```