unique-search-proxy 2026.26.0.dev9__tar.gz → 2026.26.0.dev10__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/PKG-INFO +2 -2
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/pyproject.toml +2 -2
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/api/v1/crawl.py +46 -4
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/crawlers/basic/service.py +43 -14
- unique_search_proxy-2026.26.0.dev10/unique_search_proxy_client/web/core/crawlers/pinned_egress.py +22 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/provider_response.py +10 -0
- unique_search_proxy-2026.26.0.dev10/unique_search_proxy_client/web/core/url_safety/__init__.py +13 -0
- unique_search_proxy-2026.26.0.dev10/unique_search_proxy_client/web/core/url_safety/gate.py +76 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/README.md +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/__init__.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/__init__.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/api/__init__.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/api/health.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/api/v1/__init__.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/api/v1/agent_search.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/api/v1/configuration.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/api/v1/openapi_examples.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/api/v1/search.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/app.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/__init__.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/agent_engines/__init__.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/agent_engines/bing/client.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/agent_engines/bing/runner.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/agent_engines/bing/service.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/agent_engines/factory.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/agent_engines/serialization.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/agent_engines/service_base.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/agent_engines/structured_output.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/agent_engines/vertexai/client.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/agent_engines/vertexai/gemini.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/agent_engines/vertexai/service.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/client/__init__.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/client/service.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/crawlers/__init__.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/crawlers/basic/__init__.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/crawlers/basic/processing/__init__.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/crawlers/basic/processing/errors.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/crawlers/basic/processing/html_markdown.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/crawlers/basic/processing/processors/__init__.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/crawlers/basic/processing/processors/html.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/crawlers/basic/processing/processors/pdf.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/crawlers/basic/processing/processors/plain_text.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/crawlers/basic/processing/registry.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/crawlers/basic/settings.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/crawlers/basic/user_agent.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/crawlers/factory.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/crawlers/firecrawl/polling.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/crawlers/firecrawl/request_body.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/crawlers/firecrawl/service.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/crawlers/jina/request_body.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/crawlers/jina/service.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/crawlers/tavily/request_body.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/crawlers/tavily/service.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/providers.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/registry.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/search_engines/__init__.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/search_engines/brave/__init__.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/search_engines/brave/pagination.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/search_engines/brave/query_params.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/search_engines/brave/service.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/search_engines/descriptor.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/search_engines/factory.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/search_engines/google/__init__.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/search_engines/google/pagination.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/search_engines/google/query_params.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/search_engines/google/service.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/search_engines/pagination.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/search_engines/perplexity/__init__.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/search_engines/perplexity/request_body.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/search_engines/perplexity/service.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/search_engines/service_base.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/error_handlers.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/monitoring/__init__.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/monitoring/metrics.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/monitoring/setup.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/presets/__init__.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/presets/common.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/presets/crawl.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/presets/search.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/presets/types.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/settings/__init__.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/settings/base.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/settings/client.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/settings/monitoring.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/settings/providers/__init__.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/settings/providers/base.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/settings/providers/bing_agent.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/settings/providers/brave.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/settings/providers/firecrawl.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/settings/providers/google.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/settings/providers/jina.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/settings/providers/perplexity.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/settings/providers/tavily.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/settings/providers/vertexai_agent.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/utils/__init__.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/utils/url.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: unique-search-proxy
|
|
3
|
-
Version: 2026.26.0.dev9
|
|
3
|
+
Version: 2026.26.0.dev10
|
|
4
4
|
Summary: Web Search Proxy implementation
|
|
5
5
|
Author: ThePhilAz
|
|
6
6
|
Author-email: ThePhilAz <rami.azouz@philico.com>
|
|
@@ -19,7 +19,7 @@ Requires-Dist: certifi>=2025.11.12,<2027
|
|
|
19
19
|
Requires-Dist: google-genai>=1.73.0,<2
|
|
20
20
|
Requires-Dist: google-auth>=2.43.0,<3
|
|
21
21
|
Requires-Dist: unique-toolkit[monitoring]>=2026.26.0.dev11,<2026.26.0rc0
|
|
22
|
-
Requires-Dist: unique-search-proxy-core>=2026.26.0.
|
|
22
|
+
Requires-Dist: unique-search-proxy-core>=2026.26.0.dev6,<2026.26.0rc0
|
|
23
23
|
Requires-Python: >=3.12
|
|
24
24
|
Description-Content-Type: text/markdown
|
|
25
25
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "unique-search-proxy"
|
|
3
|
-
version = "2026.26.0.dev9"
|
|
3
|
+
version = "2026.26.0.dev10"
|
|
4
4
|
description = "Web Search Proxy implementation"
|
|
5
5
|
authors = [{ name = "ThePhilAz", email = "rami.azouz@philico.com" }]
|
|
6
6
|
readme = "README.md"
|
|
@@ -21,7 +21,7 @@ dependencies = [
|
|
|
21
21
|
"google-genai>=1.73.0,<2",
|
|
22
22
|
"google-auth>=2.43.0,<3",
|
|
23
23
|
"unique-toolkit[monitoring]>=2026.26.0.dev11,<2026.26.0rc0",
|
|
24
|
-
"unique-search-proxy-core>=2026.26.0.
|
|
24
|
+
"unique-search-proxy-core>=2026.26.0.dev6,<2026.26.0rc0",
|
|
25
25
|
]
|
|
26
26
|
|
|
27
27
|
[dependency-groups]
|
|
@@ -18,6 +18,13 @@ from unique_search_proxy_client.web.api.v1.openapi_examples import (
|
|
|
18
18
|
)
|
|
19
19
|
from unique_search_proxy_client.web.core.client import get_http_client_pool
|
|
20
20
|
from unique_search_proxy_client.web.core.crawlers.factory import get_crawler_service
|
|
21
|
+
from unique_search_proxy_client.web.core.crawlers.pinned_egress import (
|
|
22
|
+
PinnedEgressCrawler,
|
|
23
|
+
)
|
|
24
|
+
from unique_search_proxy_client.web.core.url_safety.gate import (
|
|
25
|
+
apply_url_safety_gate,
|
|
26
|
+
merge_crawl_results,
|
|
27
|
+
)
|
|
21
28
|
from unique_search_proxy_client.web.monitoring.metrics import (
|
|
22
29
|
record_crawl_error,
|
|
23
30
|
record_crawl_success,
|
|
@@ -49,10 +56,38 @@ async def crawl(
|
|
|
49
56
|
started = time.perf_counter()
|
|
50
57
|
|
|
51
58
|
try:
|
|
52
|
-
pool = get_http_client_pool(request.app)
|
|
53
|
-
crawler = get_crawler_service(crawler_id, http_client=pool.client)
|
|
54
59
|
async with asyncio.timeout(timeout):
|
|
55
|
-
|
|
60
|
+
gate = await apply_url_safety_gate(body.urls)
|
|
61
|
+
if not gate.allowed_targets:
|
|
62
|
+
record_crawl_success(
|
|
63
|
+
crawler_id,
|
|
64
|
+
len(body.urls),
|
|
65
|
+
time.perf_counter() - started,
|
|
66
|
+
)
|
|
67
|
+
return CrawlResponse(
|
|
68
|
+
crawler=crawler_id,
|
|
69
|
+
results=merge_crawl_results(
|
|
70
|
+
body.urls,
|
|
71
|
+
blocked_by_index=gate.blocked_by_index,
|
|
72
|
+
crawler_results=[],
|
|
73
|
+
),
|
|
74
|
+
)
|
|
75
|
+
|
|
76
|
+
crawl_body = body.model_copy(
|
|
77
|
+
update={
|
|
78
|
+
"urls": [target.display_url for target in gate.allowed_targets],
|
|
79
|
+
},
|
|
80
|
+
)
|
|
81
|
+
|
|
82
|
+
pool = get_http_client_pool(request.app)
|
|
83
|
+
crawler = get_crawler_service(crawler_id, http_client=pool.client)
|
|
84
|
+
if isinstance(crawler, PinnedEgressCrawler):
|
|
85
|
+
crawler_results = await crawler.crawl_pinned(
|
|
86
|
+
crawl_body,
|
|
87
|
+
gate.allowed_targets,
|
|
88
|
+
)
|
|
89
|
+
else:
|
|
90
|
+
crawler_results = await crawler.crawl(crawl_body)
|
|
56
91
|
except TimeoutError as exc:
|
|
57
92
|
record_crawl_error(
|
|
58
93
|
crawler_id,
|
|
@@ -76,4 +111,11 @@ async def crawl(
|
|
|
76
111
|
raise
|
|
77
112
|
|
|
78
113
|
record_crawl_success(crawler_id, len(body.urls), time.perf_counter() - started)
|
|
79
|
-
return CrawlResponse(
|
|
114
|
+
return CrawlResponse(
|
|
115
|
+
crawler=crawler_id,
|
|
116
|
+
results=merge_crawl_results(
|
|
117
|
+
body.urls,
|
|
118
|
+
blocked_by_index=gate.blocked_by_index,
|
|
119
|
+
crawler_results=crawler_results,
|
|
120
|
+
),
|
|
121
|
+
)
|
|
@@ -14,6 +14,11 @@ from unique_search_proxy_core.schema import (
|
|
|
14
14
|
CrawlUrlResult,
|
|
15
15
|
ProxyErrorCode,
|
|
16
16
|
)
|
|
17
|
+
from unique_search_proxy_core.url_safety import (
|
|
18
|
+
ResolvedCrawlTarget,
|
|
19
|
+
bypass_crawl_target,
|
|
20
|
+
pinned_httpx_get_args,
|
|
21
|
+
)
|
|
17
22
|
|
|
18
23
|
from unique_search_proxy_client.web.core.crawlers.basic.processing import (
|
|
19
24
|
ContentProcessingError,
|
|
@@ -27,6 +32,7 @@ from unique_search_proxy_client.web.core.provider_response import (
|
|
|
27
32
|
crawl_upstream_error,
|
|
28
33
|
transport_error_raw,
|
|
29
34
|
)
|
|
35
|
+
from unique_search_proxy_client.web.core.url_safety.gate import AllowedCrawlTarget
|
|
30
36
|
|
|
31
37
|
_LOGGER = logging.getLogger(__name__)
|
|
32
38
|
|
|
@@ -43,11 +49,30 @@ class BasicCrawlerService(BaseCrawler[BasicCrawlRequest]):
|
|
|
43
49
|
|
|
44
50
|
crawler_id = CrawlerType.BASIC.value
|
|
45
51
|
|
|
46
|
-
async def crawl(self, request: BasicCrawlRequest) -> list[CrawlUrlResult]: # type: ignore
|
|
52
|
+
async def crawl(self, request: BasicCrawlRequest) -> list[CrawlUrlResult]: # type: ignore[override]
|
|
53
|
+
bypass_targets = [
|
|
54
|
+
AllowedCrawlTarget(
|
|
55
|
+
display_url=url.strip(),
|
|
56
|
+
resolved=bypass_crawl_target(url),
|
|
57
|
+
)
|
|
58
|
+
for url in request.urls
|
|
59
|
+
]
|
|
60
|
+
return await self.crawl_pinned(request, bypass_targets)
|
|
61
|
+
|
|
62
|
+
async def crawl_pinned(
|
|
63
|
+
self,
|
|
64
|
+
request: BasicCrawlRequest, # type: ignore[valid-type]
|
|
65
|
+
allowed_targets: list[AllowedCrawlTarget],
|
|
66
|
+
) -> list[CrawlUrlResult]:
|
|
47
67
|
client = self._http_client
|
|
48
68
|
if client is None:
|
|
49
69
|
raise RuntimeError("HTTP client is required for Basic crawler")
|
|
50
70
|
|
|
71
|
+
display_urls = list(request.urls)
|
|
72
|
+
if len(allowed_targets) != len(display_urls):
|
|
73
|
+
msg = "allowed_targets length must match request.urls length"
|
|
74
|
+
raise ValueError(msg)
|
|
75
|
+
|
|
51
76
|
timeout = request.timeout
|
|
52
77
|
semaphore = asyncio.Semaphore(request.max_concurrent_requests)
|
|
53
78
|
return list(
|
|
@@ -55,12 +80,13 @@ class BasicCrawlerService(BaseCrawler[BasicCrawlRequest]):
|
|
|
55
80
|
*[
|
|
56
81
|
self._crawl_one(
|
|
57
82
|
client,
|
|
58
|
-
|
|
83
|
+
allowed_target.display_url,
|
|
84
|
+
resolved_target=allowed_target.resolved,
|
|
59
85
|
timeout=timeout,
|
|
60
86
|
semaphore=semaphore,
|
|
61
87
|
content_type_handlers=request.content_types.to_handlers(),
|
|
62
88
|
)
|
|
63
|
-
for
|
|
89
|
+
for allowed_target in allowed_targets
|
|
64
90
|
],
|
|
65
91
|
),
|
|
66
92
|
)
|
|
@@ -68,35 +94,38 @@ class BasicCrawlerService(BaseCrawler[BasicCrawlRequest]):
|
|
|
68
94
|
async def _crawl_one(
|
|
69
95
|
self,
|
|
70
96
|
client: AsyncClient,
|
|
71
|
-
|
|
97
|
+
display_url: str,
|
|
72
98
|
*,
|
|
99
|
+
resolved_target: ResolvedCrawlTarget,
|
|
73
100
|
timeout: int,
|
|
74
101
|
semaphore: asyncio.Semaphore,
|
|
75
102
|
content_type_handlers: dict[str, ContentTypeHandlerPolicy],
|
|
76
103
|
) -> CrawlUrlResult:
|
|
77
|
-
request_url =
|
|
104
|
+
request_url, pin_headers, extensions = pinned_httpx_get_args(resolved_target)
|
|
78
105
|
async with semaphore:
|
|
79
|
-
headers = {"User-Agent": random_user_agent()}
|
|
106
|
+
headers = {"User-Agent": random_user_agent(), **pin_headers}
|
|
107
|
+
|
|
80
108
|
try:
|
|
81
109
|
response = await client.get(
|
|
82
110
|
request_url,
|
|
83
111
|
headers=headers,
|
|
112
|
+
extensions=extensions or None,
|
|
84
113
|
timeout=Timeout(timeout),
|
|
85
114
|
follow_redirects=True,
|
|
86
115
|
)
|
|
87
116
|
except httpx.TimeoutException as exc:
|
|
88
|
-
_LOGGER.warning("Basic crawl timed out for %s: %s",
|
|
117
|
+
_LOGGER.warning("Basic crawl timed out for %s: %s", display_url, exc)
|
|
89
118
|
return crawl_upstream_error(
|
|
90
|
-
|
|
119
|
+
display_url,
|
|
91
120
|
f"Crawl timed out after {timeout}s",
|
|
92
121
|
content_type=None,
|
|
93
122
|
code=ProxyErrorCode.UPSTREAM_TIMEOUT.value,
|
|
94
123
|
raw=transport_error_raw(exc),
|
|
95
124
|
)
|
|
96
125
|
except httpx.HTTPError as exc:
|
|
97
|
-
_LOGGER.warning("Basic crawl failed for %s: %s",
|
|
126
|
+
_LOGGER.warning("Basic crawl failed for %s: %s", display_url, exc)
|
|
98
127
|
return crawl_upstream_error(
|
|
99
|
-
|
|
128
|
+
display_url,
|
|
100
129
|
str(exc),
|
|
101
130
|
content_type=None,
|
|
102
131
|
raw=transport_error_raw(exc),
|
|
@@ -107,11 +136,11 @@ class BasicCrawlerService(BaseCrawler[BasicCrawlRequest]):
|
|
|
107
136
|
if response.is_error:
|
|
108
137
|
_LOGGER.warning(
|
|
109
138
|
"Basic crawl HTTP error for %s: %s",
|
|
110
|
-
|
|
139
|
+
display_url,
|
|
111
140
|
response.status_code,
|
|
112
141
|
)
|
|
113
142
|
return crawl_upstream_error(
|
|
114
|
-
|
|
143
|
+
display_url,
|
|
115
144
|
f"HTTP {response.status_code} while fetching URL",
|
|
116
145
|
content_type=content_type,
|
|
117
146
|
raw=raw_body,
|
|
@@ -120,7 +149,7 @@ class BasicCrawlerService(BaseCrawler[BasicCrawlRequest]):
|
|
|
120
149
|
content = await self._maybe_process_content(
|
|
121
150
|
raw_body,
|
|
122
151
|
content_type,
|
|
123
|
-
request_url=
|
|
152
|
+
request_url=display_url,
|
|
124
153
|
timeout=timeout,
|
|
125
154
|
content_type_handlers=content_type_handlers,
|
|
126
155
|
)
|
|
@@ -128,7 +157,7 @@ class BasicCrawlerService(BaseCrawler[BasicCrawlRequest]):
|
|
|
128
157
|
return content
|
|
129
158
|
|
|
130
159
|
return CrawlUrlResult(
|
|
131
|
-
url=
|
|
160
|
+
url=display_url,
|
|
132
161
|
content=content,
|
|
133
162
|
raw=raw_body,
|
|
134
163
|
content_type=content_type,
|
unique_search_proxy-2026.26.0.dev10/unique_search_proxy_client/web/core/crawlers/pinned_egress.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Protocol, runtime_checkable
|
|
4
|
+
|
|
5
|
+
from pydantic import BaseModel
|
|
6
|
+
from unique_search_proxy_core.schema import CrawlUrlResult
|
|
7
|
+
|
|
8
|
+
from unique_search_proxy_client.web.core.url_safety.gate import AllowedCrawlTarget
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@runtime_checkable
|
|
12
|
+
class PinnedEgressCrawler(Protocol):
|
|
13
|
+
"""Crawlers that fetch directly and must reuse gate DNS resolution."""
|
|
14
|
+
|
|
15
|
+
async def crawl_pinned(
|
|
16
|
+
self,
|
|
17
|
+
request: BaseModel,
|
|
18
|
+
allowed_targets: list[AllowedCrawlTarget],
|
|
19
|
+
) -> list[CrawlUrlResult]: ...
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
__all__ = ["PinnedEgressCrawler"]
|
|
@@ -80,6 +80,16 @@ def crawl_upstream_error(
|
|
|
80
80
|
)
|
|
81
81
|
|
|
82
82
|
|
|
83
|
+
def crawl_forbidden_target(url: str, message: str) -> CrawlUrlResult:
|
|
84
|
+
"""Build a per-URL crawl failure for a URL safety policy violation."""
|
|
85
|
+
return crawl_upstream_error(
|
|
86
|
+
url,
|
|
87
|
+
message,
|
|
88
|
+
code=ProxyErrorCode.FORBIDDEN_TARGET.value,
|
|
89
|
+
content_type=None,
|
|
90
|
+
)
|
|
91
|
+
|
|
92
|
+
|
|
83
93
|
def raise_for_upstream_response(
|
|
84
94
|
response: httpx.Response,
|
|
85
95
|
*,
|
unique_search_proxy-2026.26.0.dev10/unique_search_proxy_client/web/core/url_safety/__init__.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
from unique_search_proxy_client.web.core.url_safety.gate import (
|
|
2
|
+
AllowedCrawlTarget,
|
|
3
|
+
UrlSafetyGateResult,
|
|
4
|
+
apply_url_safety_gate,
|
|
5
|
+
merge_crawl_results,
|
|
6
|
+
)
|
|
7
|
+
|
|
8
|
+
__all__ = [
|
|
9
|
+
"AllowedCrawlTarget",
|
|
10
|
+
"UrlSafetyGateResult",
|
|
11
|
+
"apply_url_safety_gate",
|
|
12
|
+
"merge_crawl_results",
|
|
13
|
+
]
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
|
|
5
|
+
from unique_search_proxy_core.schema import CrawlUrlResult
|
|
6
|
+
from unique_search_proxy_core.url_safety import ResolvedCrawlTarget, UrlSafetyService
|
|
7
|
+
|
|
8
|
+
from unique_search_proxy_client.web.core.provider_response import crawl_forbidden_target
|
|
9
|
+
from unique_search_proxy_client.web.monitoring.metrics import record_crawl_blocked
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass(frozen=True)
|
|
13
|
+
class AllowedCrawlTarget:
|
|
14
|
+
"""User-facing URL paired with the validated resolution for pinned egress."""
|
|
15
|
+
|
|
16
|
+
display_url: str
|
|
17
|
+
resolved: ResolvedCrawlTarget
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass(frozen=True)
|
|
21
|
+
class UrlSafetyGateResult:
|
|
22
|
+
allowed_targets: list[AllowedCrawlTarget]
|
|
23
|
+
blocked_by_index: dict[int, CrawlUrlResult]
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
async def apply_url_safety_gate(urls: list[str]) -> UrlSafetyGateResult:
|
|
27
|
+
"""Validate crawl URLs and partition them into allowed vs blocked targets."""
|
|
28
|
+
outcomes = await UrlSafetyService.validate_urls_individually(urls)
|
|
29
|
+
allowed_targets: list[AllowedCrawlTarget] = []
|
|
30
|
+
blocked_by_index: dict[int, CrawlUrlResult] = {}
|
|
31
|
+
|
|
32
|
+
for index, outcome in enumerate(outcomes):
|
|
33
|
+
if outcome.blocked is not None:
|
|
34
|
+
record_crawl_blocked(outcome.blocked.category)
|
|
35
|
+
blocked_by_index[index] = crawl_forbidden_target(
|
|
36
|
+
outcome.url.strip(),
|
|
37
|
+
outcome.blocked.reason,
|
|
38
|
+
)
|
|
39
|
+
continue
|
|
40
|
+
|
|
41
|
+
if outcome.resolved is None:
|
|
42
|
+
msg = "URL safety allowed a crawl target without resolved metadata"
|
|
43
|
+
raise RuntimeError(msg)
|
|
44
|
+
|
|
45
|
+
allowed_targets.append(
|
|
46
|
+
AllowedCrawlTarget(
|
|
47
|
+
display_url=outcome.url.strip(),
|
|
48
|
+
resolved=outcome.resolved,
|
|
49
|
+
),
|
|
50
|
+
)
|
|
51
|
+
|
|
52
|
+
return UrlSafetyGateResult(
|
|
53
|
+
allowed_targets=allowed_targets,
|
|
54
|
+
blocked_by_index=blocked_by_index,
|
|
55
|
+
)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def merge_crawl_results(
|
|
59
|
+
urls: list[str],
|
|
60
|
+
*,
|
|
61
|
+
blocked_by_index: dict[int, CrawlUrlResult],
|
|
62
|
+
crawler_results: list[CrawlUrlResult],
|
|
63
|
+
) -> list[CrawlUrlResult]:
|
|
64
|
+
"""Merge per-URL blocked results with crawler outcomes in request order."""
|
|
65
|
+
merged: list[CrawlUrlResult] = []
|
|
66
|
+
crawler_index = 0
|
|
67
|
+
for index, _url in enumerate(urls):
|
|
68
|
+
blocked = blocked_by_index.get(index)
|
|
69
|
+
if blocked is not None:
|
|
70
|
+
merged.append(blocked)
|
|
71
|
+
continue
|
|
72
|
+
|
|
73
|
+
merged.append(crawler_results[crawler_index])
|
|
74
|
+
crawler_index += 1
|
|
75
|
+
|
|
76
|
+
return merged
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|