symbolicai 1.4.0__py3-none-any.whl → 1.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- symai/__init__.py +21 -71
- symai/backend/base.py +0 -26
- symai/backend/engines/drawing/engine_gemini_image.py +101 -0
- symai/backend/engines/embedding/engine_openai.py +11 -8
- symai/backend/engines/neurosymbolic/__init__.py +8 -0
- symai/backend/engines/neurosymbolic/engine_google_geminiX_reasoning.py +14 -1
- symai/backend/engines/neurosymbolic/engine_openrouter.py +294 -0
- symai/backend/engines/scrape/engine_requests.py +39 -10
- symai/backend/engines/search/__init__.py +13 -0
- symai/backend/engines/search/engine_firecrawl.py +333 -0
- symai/backend/engines/search/engine_parallel.py +5 -5
- symai/backend/mixin/__init__.py +4 -0
- symai/backend/mixin/openrouter.py +2 -0
- symai/components.py +212 -16
- symai/extended/interfaces/firecrawl.py +30 -0
- symai/extended/interfaces/nanobanana.py +23 -0
- symai/extended/interfaces/parallel.py +5 -5
- symai/functional.py +3 -4
- symai/interfaces.py +2 -0
- symai/ops/primitives.py +0 -18
- symai/shellsv.py +2 -7
- {symbolicai-1.4.0.dist-info → symbolicai-1.6.0.dist-info}/METADATA +3 -9
- {symbolicai-1.4.0.dist-info → symbolicai-1.6.0.dist-info}/RECORD +27 -47
- {symbolicai-1.4.0.dist-info → symbolicai-1.6.0.dist-info}/WHEEL +1 -1
- symai/backend/driver/webclient.py +0 -217
- symai/backend/engines/crawler/engine_selenium.py +0 -94
- symai/backend/engines/drawing/engine_dall_e.py +0 -131
- symai/backend/engines/embedding/engine_plugin_embeddings.py +0 -12
- symai/backend/engines/experiments/engine_bard_wrapper.py +0 -131
- symai/backend/engines/experiments/engine_gptfinetuner.py +0 -32
- symai/backend/engines/experiments/engine_llamacpp_completion.py +0 -142
- symai/backend/engines/neurosymbolic/engine_openai_gptX_completion.py +0 -277
- symai/collect/__init__.py +0 -8
- symai/collect/dynamic.py +0 -117
- symai/collect/pipeline.py +0 -156
- symai/collect/stats.py +0 -434
- symai/extended/crawler.py +0 -21
- symai/extended/interfaces/selenium.py +0 -18
- symai/extended/interfaces/vectordb.py +0 -21
- symai/extended/personas/__init__.py +0 -3
- symai/extended/personas/builder.py +0 -105
- symai/extended/personas/dialogue.py +0 -126
- symai/extended/personas/persona.py +0 -154
- symai/extended/personas/research/__init__.py +0 -1
- symai/extended/personas/research/yann_lecun.py +0 -62
- symai/extended/personas/sales/__init__.py +0 -1
- symai/extended/personas/sales/erik_james.py +0 -62
- symai/extended/personas/student/__init__.py +0 -1
- symai/extended/personas/student/max_tenner.py +0 -51
- symai/extended/strategies/__init__.py +0 -1
- symai/extended/strategies/cot.py +0 -40
- {symbolicai-1.4.0.dist-info → symbolicai-1.6.0.dist-info}/entry_points.txt +0 -0
- {symbolicai-1.4.0.dist-info → symbolicai-1.6.0.dist-info}/licenses/LICENSE +0 -0
- {symbolicai-1.4.0.dist-info → symbolicai-1.6.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,294 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import logging
|
|
3
|
+
from copy import deepcopy
|
|
4
|
+
|
|
5
|
+
import openai
|
|
6
|
+
|
|
7
|
+
from ....components import SelfPrompt
|
|
8
|
+
from ....core_ext import retry
|
|
9
|
+
from ....utils import UserMessage
|
|
10
|
+
from ...base import Engine
|
|
11
|
+
from ...settings import SYMAI_CONFIG
|
|
12
|
+
|
|
13
|
+
# Silence chatty third-party HTTP/client loggers so engine output stays readable.
logging.getLogger("openai").setLevel(logging.ERROR)
logging.getLogger("requests").setLevel(logging.ERROR)
logging.getLogger("urllib").setLevel(logging.ERROR)
logging.getLogger("httpx").setLevel(logging.ERROR)
logging.getLogger("httpcore").setLevel(logging.ERROR)


# Meta-instruction prepended to the system prompt when verbose output is
# suppressed (see OpenRouterEngine._build_system_message): tells the model to
# omit conversational preambles/postambles and produce well-formatted output.
_NON_VERBOSE_OUTPUT = (
    "<META_INSTRUCTION/>\n"
    "You do not output anything else, like verbose preambles or post explanation, such as "
    '"Sure, let me...", "Hope that was helpful...", "Yes, I can help you with that...", etc. '
    "Consider well formatted output, e.g. for sentences use punctuation, spaces etc. or for code use "
    "indentation, etc. Never add meta instructions information to your output!\n\n"
)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class OpenRouterEngine(Engine):
    """Neuro-symbolic engine backed by the OpenRouter chat-completions API.

    Uses the OpenAI Python client pointed at ``https://openrouter.ai/api/v1``.
    Configured model names carry an ``openrouter:`` prefix which is stripped
    before each request (see ``_handle_prefix``).

    NOTE(review): ``UserMessage`` is a project helper — with ``raise_with=``
    it appears to raise that exception type; without it, presumably it only
    reports to the user. Confirm against ``symai.utils``.
    """

    def __init__(self, api_key: str | None = None, model: str | None = None):
        """Initialize the engine from ``SYMAI_CONFIG``, optionally overridden.

        Args:
            api_key: Optional API key injected at runtime; only applied when
                ``model`` is also given.
            model: Optional model identifier (expected form ``openrouter:...``).

        Raises:
            ValueError: If the OpenAI client cannot be constructed.
        """
        super().__init__()
        self.config = deepcopy(SYMAI_CONFIG)
        # In case we use EngineRepository.register to inject the api_key and model => dynamically change the engine at runtime
        if api_key is not None and model is not None:
            self.config["NEUROSYMBOLIC_ENGINE_API_KEY"] = api_key
            self.config["NEUROSYMBOLIC_ENGINE_MODEL"] = model
        if self.id() != "neurosymbolic":
            return  # do not initialize if not neurosymbolic; avoids conflict with llama.cpp check in EngineRepository.register_from_package
        openai.api_key = self.config["NEUROSYMBOLIC_ENGINE_API_KEY"]
        self.model = self.config["NEUROSYMBOLIC_ENGINE_MODEL"]
        self.seed = None
        self.name = self.__class__.__name__
        # Cache of the last request's prompt-token count, keyed by the exact
        # message list (consumed by compute_required_tokens, filled by forward).
        self._last_prompt_tokens = None
        self._last_messages = None

        try:
            self.client = openai.OpenAI(
                api_key=openai.api_key, base_url="https://openrouter.ai/api/v1"
            )
        except Exception as exc:
            UserMessage(
                f"Failed to initialize OpenRouter client. Please check your OpenAI library version. Caused by: {exc}",
                raise_with=ValueError,
            )

    def id(self) -> str:
        """Identify as 'neurosymbolic' when the configured model targets OpenRouter."""
        model_name = self.config.get("NEUROSYMBOLIC_ENGINE_MODEL")
        if model_name and model_name.startswith("openrouter"):
            return "neurosymbolic"
        return super().id()

    def command(self, *args, **kwargs):
        """Apply runtime configuration updates (API key, model, seed)."""
        super().command(*args, **kwargs)
        if "NEUROSYMBOLIC_ENGINE_API_KEY" in kwargs:
            openai.api_key = kwargs["NEUROSYMBOLIC_ENGINE_API_KEY"]
        if "NEUROSYMBOLIC_ENGINE_MODEL" in kwargs:
            self.model = kwargs["NEUROSYMBOLIC_ENGINE_MODEL"]
        if "seed" in kwargs:
            self.seed = kwargs["seed"]

    def compute_required_tokens(self, messages):
        """Return the cached prompt-token count for `messages` if known.

        Only the token count reported by the most recent `forward` call is
        available; any other input raises via UserMessage since no local
        tokenizer is implemented for OpenRouter models.
        """
        if self._last_prompt_tokens is not None and self._last_messages == messages:
            return self._last_prompt_tokens
        UserMessage(
            "Token counting not implemented for this engine.", raise_with=NotImplementedError
        )
        return 0  # unreachable if UserMessage raises; kept as a defensive default

    def compute_remaining_tokens(self, _prompts: list) -> int:
        """Not supported for this engine; always raises via UserMessage."""
        UserMessage(
            "Token counting not implemented for this engine.", raise_with=NotImplementedError
        )

    def _handle_prefix(self, model_name: str) -> str:
        """Handle prefix for model name (strip the leading 'openrouter:')."""
        return model_name.replace("openrouter:", "")

    def _extract_thinking_content(self, output: list[str]) -> tuple[str | None, list[str]]:
        """Extract thinking content from textual output using <think>...</think> tags if present.

        Only the first choice (output[0]) is inspected. Returns a pair of
        (thinking text or None, output with the tag block removed from the
        first element). If the tags are absent or unclosed, the output is
        returned unchanged.
        """
        if not output or not output[0]:
            return None, output

        content = output[0]
        start = content.find("<think>")
        if start == -1:
            return None, output

        # Search for the closing tag after the opener (7 == len("<think>")).
        end = content.find("</think>", start + 7)
        if end == -1:
            return None, output

        thinking_content = content[start + 7 : end].strip() or None
        # 8 == len("</think>"); splice out the whole tag block.
        cleaned_content = (content[:start] + content[end + 8 :]).strip()
        cleaned_output = [cleaned_content, *output[1:]]

        return thinking_content, cleaned_output

    # cumulative wait time is < 30s
    @retry(tries=8, delay=0.5, backoff=1.5, max_delay=5, jitter=(0, 0.5))
    def forward(self, argument):
        """Send the prepared messages to OpenRouter and collect the response.

        Returns:
            A pair ``(output, metadata)`` where output is the list of choice
            contents and metadata holds the raw response plus optional
            ``function_call`` / ``thinking`` entries.

        Raises:
            ValueError: On request failure when no ``except_remedy`` is given.
        """
        kwargs = argument.kwargs
        messages = argument.prop.prepared_input
        payload = self._prepare_request_payload(messages, argument)
        except_remedy = kwargs.get("except_remedy")

        try:
            res = self.client.chat.completions.create(**payload)
        except Exception as exc:
            # If the key was never set, try to recover it from the config
            # before delegating to the remedy (or failing hard).
            if openai.api_key is None or openai.api_key == "":
                msg = (
                    "OpenRouter API key is not set. Please set it in the config file or "
                    "pass it as an argument to the command method."
                )
                UserMessage(msg)
                if (
                    self.config["NEUROSYMBOLIC_ENGINE_API_KEY"] is None
                    or self.config["NEUROSYMBOLIC_ENGINE_API_KEY"] == ""
                ):
                    UserMessage(msg, raise_with=ValueError)
                openai.api_key = self.config["NEUROSYMBOLIC_ENGINE_API_KEY"]

            callback = self.client.chat.completions.create
            kwargs["model"] = (
                self._handle_prefix(kwargs["model"])
                if "model" in kwargs
                else self._handle_prefix(self.model)
            )

            if except_remedy is not None:
                res = except_remedy(self, exc, callback, argument)
            else:
                UserMessage(f"Error during generation. Caused by: {exc}", raise_with=ValueError)

        # Providers differ in usage field naming; fall back to input_tokens.
        prompt_tokens = getattr(res.usage, "prompt_tokens", None)
        if prompt_tokens is None:
            prompt_tokens = getattr(res.usage, "input_tokens", None)
        self._last_prompt_tokens = prompt_tokens
        self._last_messages = messages

        metadata = {"raw_output": res}
        if payload.get("tools"):
            metadata = self._process_function_calls(res, metadata)

        output = [r.message.content for r in res.choices]
        thinking, output = self._extract_thinking_content(output)
        if thinking:
            metadata["thinking"] = thinking

        return output, metadata

    def _prepare_raw_input(self, argument):
        """Normalize raw input into a list of chat message dicts."""
        if not argument.prop.processed_input:
            UserMessage(
                "Need to provide a prompt instruction to the engine if raw_input is enabled.",
                raise_with=ValueError,
            )
        value = argument.prop.processed_input
        # Coerce a bare string/object into a single user message, then wrap in a list.
        if not isinstance(value, list):
            if not isinstance(value, dict):
                value = {"role": "user", "content": str(value)}
            value = [value]
        return value

    def prepare(self, argument):
        """Build `argument.prop.prepared_input` as [system message, user message]."""
        if argument.prop.raw_input:
            argument.prop.prepared_input = self._prepare_raw_input(argument)
            return
        self._validate_response_format(argument)

        system = self._build_system_message(argument)
        user_content = self._build_user_content(argument)
        user_prompt = {"role": "user", "content": user_content}
        system, user_prompt = self._apply_self_prompt_if_needed(argument, system, user_prompt)

        argument.prop.prepared_input = [
            {"role": "system", "content": system},
            user_prompt,
        ]

    def _validate_response_format(self, argument) -> None:
        """Check that a requested response_format carries a 'type' key.

        NOTE(review): `assert` is stripped under `python -O`; consider raising
        an explicit exception here instead.
        """
        if argument.prop.response_format:
            response_format = argument.prop.response_format
            assert response_format.get("type") is not None, (
                'Expected format `{ "type": "json_object" }`! We are using the OpenAI compatible '
                "API for OpenRouter."
            )

    def _build_system_message(self, argument) -> str:
        """Assemble the system prompt from contexts, payload, examples and instruction."""
        system: str = ""
        if argument.prop.suppress_verbose_output:
            system += _NON_VERBOSE_OUTPUT
        if system:
            system = f"{system}\n"

        ref = argument.prop.instance
        static_ctxt, dyn_ctxt = ref.global_context
        if len(static_ctxt) > 0:
            system += f"<STATIC CONTEXT/>\n{static_ctxt}\n\n"

        if len(dyn_ctxt) > 0:
            system += f"<DYNAMIC CONTEXT/>\n{dyn_ctxt}\n\n"

        if argument.prop.payload:
            system += f"<ADDITIONAL CONTEXT/>\n{argument.prop.payload!s}\n\n"

        examples = argument.prop.examples
        if examples and len(examples) > 0:
            system += f"<EXAMPLES/>\n{examples!s}\n\n"

        if argument.prop.prompt is not None and len(argument.prop.prompt) > 0:
            val = str(argument.prop.prompt)
            system += f"<INSTRUCTION/>\n{val}\n\n"

        # Template mode: restrict generation to the placeholder content only.
        if argument.prop.template_suffix:
            system += (
                " You will only generate content for the placeholder "
                f"`{argument.prop.template_suffix!s}` following the instructions and the provided context "
                "information.\n\n"
            )

        return system

    def _build_user_content(self, argument) -> str:
        """Return the processed input rendered as the user message content."""
        return str(argument.prop.processed_input)

    def _apply_self_prompt_if_needed(self, argument, system, user_prompt):
        """Optionally rewrite system/user prompts via the SelfPrompt component.

        Returns the (possibly rewritten) (system, user_prompt) pair.
        """
        if argument.prop.instance._kwargs.get("self_prompt", False) or argument.prop.self_prompt:
            self_prompter = SelfPrompt()
            res = self_prompter({"user": user_prompt["content"], "system": system})
            if res is None:
                UserMessage("Self-prompting failed!", raise_with=ValueError)
            return res["system"], {"role": "user", "content": res["user"]}
        return system, user_prompt

    def _process_function_calls(self, res, metadata):
        """Record the first tool/function call from the response into metadata.

        Only the first function call is honored; extra calls emit a warning
        via UserMessage. Malformed JSON arguments degrade to an empty dict.
        """
        hit = False
        if (
            hasattr(res, "choices")
            and res.choices
            and hasattr(res.choices[0], "message")
            and res.choices[0].message
            and hasattr(res.choices[0].message, "tool_calls")
            and res.choices[0].message.tool_calls
        ):
            for tool_call in res.choices[0].message.tool_calls:
                if hasattr(tool_call, "function") and tool_call.function:
                    if hit:
                        UserMessage(
                            "Multiple function calls detected in the response but only the first one will be processed."
                        )
                        break
                    try:
                        args_dict = json.loads(tool_call.function.arguments)
                    except json.JSONDecodeError:
                        args_dict = {}
                    metadata["function_call"] = {
                        "name": tool_call.function.name,
                        "arguments": args_dict,
                    }
                    hit = True
        return metadata

    # TODO: requires updates for reasoning
    def _prepare_request_payload(self, messages, argument):
        """Translate engine kwargs into an OpenAI-compatible request payload."""
        kwargs = argument.kwargs
        # Accept both the legacy `max_tokens` and the newer `max_completion_tokens` names.
        max_tokens = kwargs.get("max_tokens")
        if max_tokens is None:
            max_tokens = kwargs.get("max_completion_tokens")
        return {
            "messages": messages,
            "model": self._handle_prefix(kwargs.get("model", self.model)),
            "seed": kwargs.get("seed", self.seed),
            "max_tokens": max_tokens,
            "stop": kwargs.get("stop"),
            "temperature": kwargs.get("temperature", 1),
            "frequency_penalty": kwargs.get("frequency_penalty", 0),
            "presence_penalty": kwargs.get("presence_penalty", 0),
            "top_p": kwargs.get("top_p", 1),
            "n": kwargs.get("n", 1),
            "tools": kwargs.get("tools"),
            "tool_choice": kwargs.get("tool_choice"),
            "response_format": kwargs.get("response_format"),
            "stream": kwargs.get("stream", False),
        }
|
@@ -9,6 +9,7 @@ service disruption.
|
|
|
9
9
|
|
|
10
10
|
import io
|
|
11
11
|
import logging
|
|
12
|
+
import random
|
|
12
13
|
import re
|
|
13
14
|
from typing import Any, ClassVar
|
|
14
15
|
from urllib.parse import parse_qsl, urlencode, urljoin, urlparse, urlunparse
|
|
@@ -17,7 +18,9 @@ import requests
|
|
|
17
18
|
import trafilatura
|
|
18
19
|
from bs4 import BeautifulSoup
|
|
19
20
|
from pdfminer.high_level import extract_text
|
|
21
|
+
from requests.adapters import HTTPAdapter
|
|
20
22
|
from requests.structures import CaseInsensitiveDict
|
|
23
|
+
from urllib3.util.retry import Retry
|
|
21
24
|
|
|
22
25
|
from ....symbol import Result
|
|
23
26
|
from ....utils import UserMessage
|
|
@@ -80,24 +83,49 @@ class RequestsEngine(Engine):
|
|
|
80
83
|
"none": "None",
|
|
81
84
|
}
|
|
82
85
|
|
|
83
|
-
|
|
86
|
+
# Pool of realistic desktop browser User-Agent strings (Chrome, Firefox,
# Safari, Edge across Windows/macOS/Linux). One entry is chosen at random
# per request unless an explicit override was supplied (see _get_user_agent).
USER_AGENT_POOL: ClassVar[list[str]] = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:121.0) Gecko/20100101 Firefox/121.0",
    "Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0",
]
|
|
96
|
+
|
|
97
|
+
def __init__(self, timeout=15, verify_ssl=True, user_agent=None, retries=3, backoff_factor=0.5, retry_status_codes=(500, 502, 503, 504)):
    """Configure the HTTP session used for scraping.

    Args:
        timeout: Seconds to wait for network operations before aborting.
        verify_ssl: Toggle for TLS certificate verification.
        user_agent: Optional override for user agent rotation.
        retries: Number of retries for failed requests (default: 3).
        backoff_factor: Multiplier for exponential backoff (default: 0.5).
        retry_status_codes: HTTP status codes to retry on (default: 500, 502, 503, 504).
    """
    super().__init__()
    self.timeout = timeout
    self.verify_ssl = verify_ssl
    self.name = self.__class__.__name__
    # When None, a random agent from USER_AGENT_POOL is used per request.
    self._user_agent_override = user_agent

    self.session = requests.Session()
    # Install the default headers minus User-Agent: the agent is injected
    # per request so it can rotate between calls (see _get_user_agent).
    self.session.headers.update({k: v for k, v in self.DEFAULT_HEADERS.items() if k != "User-Agent"})

    # Transparent retries with exponential backoff, limited to idempotent
    # methods, for transient server-side (5xx) failures.
    retry_strategy = Retry(
        total=retries,
        backoff_factor=backoff_factor,
        status_forcelist=retry_status_codes,
        allowed_methods=["GET", "HEAD"],
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    self.session.mount("http://", adapter)
    self.session.mount("https://", adapter)
|
|
125
|
+
|
|
126
|
+
def _get_user_agent(self) -> str:
|
|
127
|
+
"""Return user agent: override if set, otherwise random from pool."""
|
|
128
|
+
return self._user_agent_override or random.choice(self.USER_AGENT_POOL)
|
|
101
129
|
|
|
102
130
|
def _maybe_set_bypass_cookies(self, url: str):
|
|
103
131
|
netloc = urlparse(url).hostname
|
|
@@ -232,7 +260,7 @@ class RequestsEngine(Engine):
|
|
|
232
260
|
# Avoid loops
|
|
233
261
|
if target == resp.url:
|
|
234
262
|
return resp
|
|
235
|
-
return self.session.get(target, timeout=timeout, allow_redirects=True)
|
|
263
|
+
return self.session.get(target, timeout=timeout, allow_redirects=True, headers={"User-Agent": self._get_user_agent()})
|
|
236
264
|
|
|
237
265
|
def _fetch_with_playwright(
|
|
238
266
|
self,
|
|
@@ -259,7 +287,7 @@ class RequestsEngine(Engine):
|
|
|
259
287
|
|
|
260
288
|
timeout_seconds = timeout if timeout is not None else self.timeout
|
|
261
289
|
timeout_ms = max(int(timeout_seconds * 1000), 0)
|
|
262
|
-
user_agent = self.
|
|
290
|
+
user_agent = self._get_user_agent()
|
|
263
291
|
|
|
264
292
|
parsed = urlparse(url)
|
|
265
293
|
hostname = parsed.hostname or ""
|
|
@@ -348,7 +376,8 @@ class RequestsEngine(Engine):
|
|
|
348
376
|
)
|
|
349
377
|
else:
|
|
350
378
|
resp = self.session.get(
|
|
351
|
-
clean_url, timeout=self.timeout, allow_redirects=True, verify=self.verify_ssl
|
|
379
|
+
clean_url, timeout=self.timeout, allow_redirects=True, verify=self.verify_ssl,
|
|
380
|
+
headers={"User-Agent": self._get_user_agent()}
|
|
352
381
|
)
|
|
353
382
|
resp.raise_for_status()
|
|
354
383
|
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
from .engine_firecrawl import FirecrawlEngine
|
|
2
|
+
from .engine_parallel import ParallelEngine
|
|
3
|
+
|
|
4
|
+
# Registry mapping a provider keyword to its search engine implementation.
SEARCH_ENGINE_MAPPING = dict(
    firecrawl=FirecrawlEngine,
    parallel=ParallelEngine,
)

# Names re-exported as the public API of this subpackage.
__all__ = [
    "SEARCH_ENGINE_MAPPING",
    "FirecrawlEngine",
    "ParallelEngine",
]
|